From a3634069026664117e28b86bbffbdebeae61e11d Mon Sep 17 00:00:00 2001
From: Vijay Janapa Reddi
Date: Tue, 13 Jan 2026 10:02:32 -0500
Subject: [PATCH] fix(layers): remove requires_grad from Linear layer Tensor
 calls

Module 03 (Linear layer) was incorrectly passing requires_grad=True to
the Tensor constructor, which violates the progressive disclosure
design. The requires_grad parameter is introduced in Module 06 via
monkey patching of Tensor.__init__. Module 03 should work independently
without depending on autograd functionality.

Changes:
- Remove requires_grad=True from weight/bias Tensor initialization
- Update ABOUT.md to clarify that gradient tracking is enabled in
  Module 06

This fixes the reported issue where students working sequentially
through the modules would get errors in Module 03 before completing
Module 06.

Closes #1101
---
 tinytorch/src/03_layers/03_layers.py |  4 ++--
 tinytorch/src/03_layers/ABOUT.md     | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tinytorch/src/03_layers/03_layers.py b/tinytorch/src/03_layers/03_layers.py
index e0c2600e8..be54e7081 100644
--- a/tinytorch/src/03_layers/03_layers.py
+++ b/tinytorch/src/03_layers/03_layers.py
@@ -303,12 +303,12 @@ class Linear(Layer):
         # Xavier/Glorot initialization for stable gradients
         scale = np.sqrt(XAVIER_SCALE_FACTOR / in_features)
         weight_data = np.random.randn(in_features, out_features) * scale
-        self.weight = Tensor(weight_data, requires_grad=True)
+        self.weight = Tensor(weight_data)

         # Initialize bias to zeros or None
         if bias:
             bias_data = np.zeros(out_features)
-            self.bias = Tensor(bias_data, requires_grad=True)
+            self.bias = Tensor(bias_data)
         else:
             self.bias = None
         ### END SOLUTION
diff --git a/tinytorch/src/03_layers/ABOUT.md b/tinytorch/src/03_layers/ABOUT.md
index 04134dfd1..b9b95883d 100644
--- a/tinytorch/src/03_layers/ABOUT.md
+++ b/tinytorch/src/03_layers/ABOUT.md
@@ -152,8 +152,8 @@ Linear (fully connected) layer implementing `y = xW + b`.
 - `bias`: Whether to include bias term (default: True)

 **Attributes:**
-- `weight`: Tensor of shape `(in_features, out_features)` with `requires_grad=True`
-- `bias`: Tensor of shape `(out_features,)` with `requires_grad=True` (or None)
+- `weight`: Tensor of shape `(in_features, out_features)` (gradient tracking enabled in Module 06)
+- `bias`: Tensor of shape `(out_features,)` or None (gradient tracking enabled in Module 06)

 | Method | Signature | Description |
 |--------|-----------|-------------|
@@ -235,17 +235,17 @@
 def __init__(self, in_features, out_features, bias=True):
     # Xavier/Glorot initialization for stable gradients
     scale = np.sqrt(XAVIER_SCALE_FACTOR / in_features)
     weight_data = np.random.randn(in_features, out_features) * scale
-    self.weight = Tensor(weight_data, requires_grad=True)
+    self.weight = Tensor(weight_data)

     # Initialize bias to zeros or None
     if bias:
         bias_data = np.zeros(out_features)
-        self.bias = Tensor(bias_data, requires_grad=True)
+        self.bias = Tensor(bias_data)
     else:
         self.bias = None
 ```

-The `requires_grad=True` flag marks these tensors for gradient computation in Module 06. Even though you haven't built autograd yet, your layers are already prepared for it. Bias starts at zero because the weight initialization already handles the scale, and zero is a neutral starting point for per-class adjustments.
+Weights and biases are created as plain Tensors. In Module 06 (Autograd), you'll learn to enable gradient tracking via `enable_autograd()`, which monkey patches the Tensor class to support `requires_grad`. At that point, you can set `layer.weight.requires_grad = True` for parameters that need gradients. Bias starts at zero because the weight initialization already handles the scale, and zero is a neutral starting point for per-class adjustments.

 For Linear(1000, 10), the scale is `sqrt(1/1000) ≈ 0.032`. For Linear(10, 1000), the scale is `sqrt(1/10) ≈ 0.316`. Layers with more inputs get smaller initial weights because each input contributes to the output, and you want their combined effect to remain stable.
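
For context, a minimal sketch of how the two modules are meant to fit together after this change. The import paths below are assumptions for illustration only; `Linear`, `Tensor`, `enable_autograd()`, and the `requires_grad` attribute are the names used in the module docs above.

```python
# Sketch only: the import paths are assumed, not the verified package layout.
from tinytorch.layers import Linear              # assumed path for Module 03's Linear
from tinytorch.autograd import enable_autograd   # assumed path for Module 06's autograd switch

# Module 03: the layer constructs plain Tensors; no autograd machinery is involved.
layer = Linear(in_features=784, out_features=10)  # weights start at Xavier scale sqrt(1/784)

# Module 06: autograd is enabled explicitly, then parameters opt in to gradients.
enable_autograd()                       # monkey patches Tensor.__init__ to accept requires_grad
layer.weight.requires_grad = True       # mark the weight for gradient tracking
if layer.bias is not None:
    layer.bias.requires_grad = True     # bias is optional (bias=False leaves it as None)
```

This is the sequence the commit message describes: Module 03 stays self-contained, and gradient tracking becomes an explicit opt-in step once autograd exists in Module 06.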