mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-11 22:03:34 -05:00
fix(module-13): Rewrite LayerNorm to use Tensor operations
- Change from .data extraction to Tensor arithmetic (x - mean, diff * diff, x / std)
- Preserve the computation graph through normalization
- The std tensor now preserves requires_grad correctly

LayerNorm is used before and after attention in transformer blocks.
This commit is contained in:
13
tinytorch/models/transformer.py
generated
13
tinytorch/models/transformer.py
generated
@@ -86,12 +86,15 @@ class LayerNorm:
|
||||
mean = x.mean(axis=-1, keepdims=True)
|
||||
|
||||
# Compute variance: E[(x - μ)²]
|
||||
diff = Tensor(x.data - mean.data)
|
||||
variance = Tensor((diff.data ** 2).mean(axis=-1, keepdims=True))
|
||||
# Use Tensor operations to preserve computation graph!
|
||||
diff = x - mean
|
||||
variance = (diff * diff).mean(axis=-1, keepdims=True)
|
||||
|
||||
# Normalize
|
||||
std = Tensor(np.sqrt(variance.data + self.eps))
|
||||
normalized = Tensor((x.data - mean.data) / std.data)
|
||||
# Normalize - use Tensor operations to preserve gradients!
|
||||
# Add eps as a Tensor for proper gradient flow
|
||||
eps_tensor = Tensor(np.array(self.eps), requires_grad=False)
|
||||
std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)
|
||||
normalized = (x - mean) / std
|
||||
|
||||
# Apply learnable transformation
|
||||
output = normalized * self.gamma + self.beta
|
||||
|
||||
Reference in New Issue
Block a user