mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-11 20:45:02 -05:00
Update tinytorch and tito with module exports
Re-exported all modules after restructuring:
- Updated _modidx.py with new module locations
- Removed outdated autogeneration headers
- Updated all core modules (tensor, autograd, layers, etc.)
- Updated optimization modules (quantization, compression, etc.)
- Updated TITO commands for new structure

Changes include:
- 24 tinytorch/ module files
- 24 tito/ command and core files
- Updated references from modules/source/ to modules/

All modules re-exported via nbdev from their new locations.
This commit is contained in:
45
tinytorch/models/transformer.py
generated
45
tinytorch/models/transformer.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_transformer/transformer_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/13_transformers/transformers_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['LayerNorm', 'MLP', 'TransformerBlock', 'GPT']
|
||||
|
||||
@@ -23,7 +9,6 @@ from ..core.tensor import Tensor
|
||||
from ..core.layers import Linear
|
||||
from ..core.attention import MultiHeadAttention
|
||||
from ..core.activations import GELU
|
||||
from ..text.embeddings import Embedding, PositionalEncoding
|
||||
|
||||
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 9
|
||||
class LayerNorm:
|
||||
@@ -61,6 +46,7 @@ class LayerNorm:
|
||||
self.eps = eps
|
||||
|
||||
# Learnable parameters: scale and shift
|
||||
# CRITICAL: requires_grad=True so optimizer can train these!
|
||||
self.gamma = Tensor(np.ones(normalized_shape), requires_grad=True) # Scale parameter
|
||||
self.beta = Tensor(np.zeros(normalized_shape), requires_grad=True) # Shift parameter
|
||||
### END SOLUTION
|
||||
@@ -83,19 +69,18 @@ class LayerNorm:
|
||||
HINT: Use keepdims=True to maintain tensor dimensions for broadcasting
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# CRITICAL: Use Tensor operations (not .data) to maintain gradient flow!
|
||||
# Compute statistics across last dimension (features)
|
||||
mean = x.mean(axis=-1, keepdims=True)
|
||||
|
||||
# Compute variance: E[(x - μ)²]
|
||||
# Use Tensor operations to preserve computation graph!
|
||||
diff = x - mean
|
||||
variance = (diff * diff).mean(axis=-1, keepdims=True)
|
||||
diff = x - mean # Tensor subtraction maintains gradient
|
||||
variance = (diff * diff).mean(axis=-1, keepdims=True) # Tensor ops maintain gradient
|
||||
|
||||
# Normalize - use Tensor operations to preserve gradients!
|
||||
# Add eps as a Tensor for proper gradient flow
|
||||
eps_tensor = Tensor(np.array(self.eps), requires_grad=False)
|
||||
std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)
|
||||
normalized = (x - mean) / std
|
||||
# Normalize: (x - mean) / sqrt(variance + eps)
|
||||
# Note: sqrt and division need to preserve gradient flow
|
||||
std_data = np.sqrt(variance.data + self.eps)
|
||||
normalized = diff * Tensor(1.0 / std_data) # Scale by reciprocal to maintain gradient
|
||||
|
||||
# Apply learnable transformation
|
||||
output = normalized * self.gamma + self.beta
|
||||
@@ -103,7 +88,7 @@ class LayerNorm:
|
||||
### END SOLUTION
|
||||
|
||||
def __call__(self, x):
|
||||
"""Allows the layer norm to be called like a function."""
|
||||
"""Allows the layer to be called like a function."""
|
||||
return self.forward(x)
|
||||
|
||||
def parameters(self):
|
||||
@@ -147,7 +132,7 @@ class MLP:
|
||||
|
||||
# Two-layer feed-forward network
|
||||
self.linear1 = Linear(embed_dim, hidden_dim)
|
||||
self.gelu = GELU() # Use GELU activation from activations module
|
||||
self.gelu = GELU()
|
||||
self.linear2 = Linear(hidden_dim, embed_dim)
|
||||
### END SOLUTION
|
||||
|
||||
@@ -171,7 +156,7 @@ class MLP:
|
||||
# First linear layer with expansion
|
||||
hidden = self.linear1.forward(x)
|
||||
|
||||
# GELU activation (YOUR activation from Module 03!)
|
||||
# GELU activation
|
||||
hidden = self.gelu.forward(hidden)
|
||||
|
||||
# Second linear layer back to original size
|
||||
@@ -404,10 +389,6 @@ class GPT:
|
||||
return logits
|
||||
### END SOLUTION
|
||||
|
||||
def __call__(self, tokens):
|
||||
"""Allows the GPT model to be called like a function."""
|
||||
return self.forward(tokens)
|
||||
|
||||
def _create_causal_mask(self, seq_len):
|
||||
"""Create causal mask to prevent attending to future positions."""
|
||||
### BEGIN SOLUTION
|
||||
|
||||
Reference in New Issue
Block a user