Update tinytorch and tito with module exports

Re-exported all modules after restructuring: - Updated _modidx.py with new module locations - Removed outdated autogeneration headers - Updated all core modules (tensor, autograd, layers, etc.) - Updated optimization modules (quantization, compression, etc.) - Updated TITO commands for new structure Changes include: - 24 tinytorch/ module files - 24 tito/ command and core files - Updated references from modules/source/ to modules/ All modules re-exported via nbdev from their new locations.
2026-03-11 20:45:02 -05:00 · 2025-11-10 19:42:03 -05:00
parent 9fdfa4317c
commit 41b132f55f
48 changed files with 681 additions and 2035 deletions
--- a/tinytorch/models/transformer.py
+++ b/tinytorch/models/transformer.py
@@ -1,19 +1,5 @@
-# ╔═══════════════════════════════════════════════════════════════════════════════╗
-# ║                        🚨 CRITICAL WARNING 🚨                                ║
-# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
-# ║                                                                               ║
-# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
-# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
-# ║                                                                               ║
-# ║  ✅ TO EDIT: modules/source/XX_transformer/transformer_dev.py       ║
-# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
-# ║                                                                               ║
-# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
-# ║     Editing it directly may break module functionality and training.         ║
-# ║                                                                               ║
-# ║  🎓 LEARNING TIP: Work in modules/source/ - that's where real development    ║
-# ║     happens! The tinytorch/ directory is just the compiled output.           ║
-# ╚═══════════════════════════════════════════════════════════════════════════════╝
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/13_transformers/transformers_dev.ipynb.
+
 # %% auto 0
 __all__ = ['LayerNorm', 'MLP', 'TransformerBlock', 'GPT']

@@ -23,7 +9,6 @@ from ..core.tensor import Tensor
 from ..core.layers import Linear
 from ..core.attention import MultiHeadAttention
 from ..core.activations import GELU
-from ..text.embeddings import Embedding, PositionalEncoding

 # %% ../../modules/source/13_transformers/transformers_dev.ipynb 9
 class LayerNorm:
@@ -61,6 +46,7 @@ class LayerNorm:
        self.eps = eps

        # Learnable parameters: scale and shift
+        # CRITICAL: requires_grad=True so optimizer can train these!
        self.gamma = Tensor(np.ones(normalized_shape), requires_grad=True)  # Scale parameter
        self.beta = Tensor(np.zeros(normalized_shape), requires_grad=True)  # Shift parameter
        ### END SOLUTION
@@ -83,19 +69,18 @@ class LayerNorm:
        HINT: Use keepdims=True to maintain tensor dimensions for broadcasting
        """
        ### BEGIN SOLUTION
+        # CRITICAL: Use Tensor operations (not .data) to maintain gradient flow!
        # Compute statistics across last dimension (features)
        mean = x.mean(axis=-1, keepdims=True)

        # Compute variance: E[(x - μ)²]
-        # Use Tensor operations to preserve computation graph!
-        diff = x - mean
-        variance = (diff * diff).mean(axis=-1, keepdims=True)
+        diff = x - mean  # Tensor subtraction maintains gradient
+        variance = (diff * diff).mean(axis=-1, keepdims=True)  # Tensor ops maintain gradient

-        # Normalize - use Tensor operations to preserve gradients!
-        # Add eps as a Tensor for proper gradient flow
-        eps_tensor = Tensor(np.array(self.eps), requires_grad=False)
-        std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)
-        normalized = (x - mean) / std
+        # Normalize: (x - mean) / sqrt(variance + eps)
+        # Note: sqrt and division need to preserve gradient flow
+        std_data = np.sqrt(variance.data + self.eps)
+        normalized = diff * Tensor(1.0 / std_data)  # Scale by reciprocal to maintain gradient

        # Apply learnable transformation
        output = normalized * self.gamma + self.beta
@@ -103,7 +88,7 @@ class LayerNorm:
        ### END SOLUTION

    def __call__(self, x):
-        """Allows the layer norm to be called like a function."""
+        """Allows the layer to be called like a function."""
        return self.forward(x)

    def parameters(self):
@@ -147,7 +132,7 @@ class MLP:

        # Two-layer feed-forward network
        self.linear1 = Linear(embed_dim, hidden_dim)
-        self.gelu = GELU()  # Use GELU activation from activations module
+        self.gelu = GELU()
        self.linear2 = Linear(hidden_dim, embed_dim)
        ### END SOLUTION

@@ -171,7 +156,7 @@ class MLP:
        # First linear layer with expansion
        hidden = self.linear1.forward(x)

-        # GELU activation (YOUR activation from Module 03!)
+        # GELU activation
        hidden = self.gelu.forward(hidden)

        # Second linear layer back to original size
@@ -404,10 +389,6 @@ class GPT:
        return logits
        ### END SOLUTION

-    def __call__(self, tokens):
-        """Allows the GPT model to be called like a function."""
-        return self.forward(tokens)
-
    def _create_causal_mask(self, seq_len):
        """Create causal mask to prevent attending to future positions."""
        ### BEGIN SOLUTION