fix(module-13): Rewrite LayerNorm to use Tensor operations

- Change from .data extraction to Tensor arithmetic (x - mean, diff * diff, (x - mean) / std)
- Preserve the computation graph through normalization
- std tensor now inherits requires_grad from variance
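
Why the Tensor arithmetic matters for gradients, as a minimal runnable sketch
(PyTorch stands in here for this module's own Tensor class; shapes are illustrative):

    import torch

    x = torch.randn(2, 4, requires_grad=True)
    mean = x.mean(dim=-1, keepdim=True)

    # Old pattern: rebuilding a tensor from raw arrays severs the graph
    broken = torch.tensor(x.detach().numpy() - mean.detach().numpy())
    print(broken.requires_grad)  # False -- backward() can never reach x

    # New pattern: tensor arithmetic records each op in the graph
    diff = x - mean
    print(diff.requires_grad)    # True -- gradients flow back to x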

LayerNorm is used before and after attention in transformer blocks.
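
For context, a schematic pre-norm block (ln1, ln2, attn, and mlp are hypothetical
callables standing in for this module's LayerNorm, attention, and MLP layers):

    def transformer_block(x, ln1, ln2, attn, mlp):
        x = x + attn(ln1(x))  # LayerNorm before attention
        x = x + mlp(ln2(x))   # LayerNorm after attention, before the MLP
        return x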
Author: Vijay Janapa Reddi
Date:   2025-10-27 20:30:21 -04:00
parent  c23946b20e
commit  8025c66a4b

3 changed files with 66 additions and 53 deletions

@@ -86,12 +86,15 @@ class LayerNorm:
         mean = x.mean(axis=-1, keepdims=True)
         # Compute variance: E[(x - μ)²]
-        diff = Tensor(x.data - mean.data)
-        variance = Tensor((diff.data ** 2).mean(axis=-1, keepdims=True))
+        # Use Tensor operations to preserve computation graph!
+        diff = x - mean
+        variance = (diff * diff).mean(axis=-1, keepdims=True)
-        # Normalize
-        std = Tensor(np.sqrt(variance.data + self.eps))
-        normalized = Tensor((x.data - mean.data) / std.data)
+        # Normalize - use Tensor operations to preserve gradients!
+        # Add eps as a Tensor for proper gradient flow
+        eps_tensor = Tensor(np.array(self.eps), requires_grad=False)
+        std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)
+        normalized = (x - mean) / std
         # Apply learnable transformation
         output = normalized * self.gamma + self.beta
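
A quick end-to-end check of the rewritten forward path (again with PyTorch as a
stand-in for the module's Tensor class; eps=1e-5 and the shapes are assumed values):

    import torch
    import torch.nn.functional as F

    x = torch.randn(3, 16, requires_grad=True)
    gamma = torch.ones(16, requires_grad=True)
    beta = torch.zeros(16, requires_grad=True)

    mean = x.mean(dim=-1, keepdim=True)
    diff = x - mean
    variance = (diff * diff).mean(dim=-1, keepdim=True)  # E[(x - μ)²]
    std = (variance + 1e-5).sqrt()
    out = (x - mean) / std * gamma + beta

    # Agrees with torch's reference LayerNorm, and gradients reach every input
    assert torch.allclose(out, F.layer_norm(x, (16,), gamma, beta, eps=1e-5), atol=1e-5)
    out.sum().backward()
    assert x.grad is not None and gamma.grad is not None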