Mirror of https://github.com/MLSysBook/TinyTorch.git, synced 2026-03-11 22:25:29 -05:00
Reset package and export modules 01-07 only (skip broken spatial module)
tinytorch/models/transformer.py (generated): 148 lines removed
@@ -1,148 +0,0 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/13_transformers/transformers_dev.ipynb.

# %% auto 0
__all__ = ['Tensor', 'Linear', 'MultiHeadAttention', 'Embedding', 'gelu']

# %% ../../modules/source/13_transformers/transformers_dev.ipynb 1
import numpy as np
import math
from typing import Optional, List

# Minimal implementations for development - in practice these import from previous modules
class Tensor:
    """Minimal Tensor class for transformer development - imports from Module 01 in practice."""
    def __init__(self, data, requires_grad=False):
        self.data = np.array(data)
        self.shape = self.data.shape
        self.size = self.data.size
        self.requires_grad = requires_grad
        self.grad = None

    def __add__(self, other):
        if isinstance(other, Tensor):
            return Tensor(self.data + other.data)
        return Tensor(self.data + other)

    def __mul__(self, other):
        if isinstance(other, Tensor):
            return Tensor(self.data * other.data)
        return Tensor(self.data * other)

    def matmul(self, other):
        return Tensor(np.dot(self.data, other.data))

    def sum(self, axis=None, keepdims=False):
        return Tensor(self.data.sum(axis=axis, keepdims=keepdims))

    def mean(self, axis=None, keepdims=False):
        return Tensor(self.data.mean(axis=axis, keepdims=keepdims))

    def reshape(self, *shape):
        return Tensor(self.data.reshape(shape))

    def __repr__(self):
        return f"Tensor(data={self.data}, shape={self.shape})"

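# --- Usage sketch (illustrative, not part of the original file) ---
# Exercises the minimal Tensor wrapper above; the values are arbitrary and
# chosen only to make the shapes easy to follow.
a = Tensor([[1.0, 2.0], [3.0, 4.0]])
b = Tensor([[5.0, 6.0], [7.0, 8.0]])
print(a + b)            # elementwise add
print(a * 2.0)          # scalar multiply
print(a.matmul(b))      # 2x2 matrix product via np.dot
print(a.mean(axis=0))   # column means, shape (2,)
print(a.reshape(4, 1))  # same data viewed as a column vector
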
class Linear:
    """Minimal Linear layer - imports from Module 03 in practice."""
    def __init__(self, in_features, out_features, bias=True):
        # Xavier/Glorot initialization
        std = math.sqrt(2.0 / (in_features + out_features))
        self.weight = Tensor(np.random.normal(0, std, (in_features, out_features)))
        self.bias = Tensor(np.zeros(out_features)) if bias else None

    def forward(self, x):
        output = x.matmul(self.weight)
        if self.bias is not None:
            output = output + self.bias
        return output

    def parameters(self):
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params

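# --- Usage sketch (illustrative, not part of the original file) ---
# One Linear layer on a small batch; in_features/out_features are arbitrary.
np.random.seed(0)                  # reproducible weights for the sketch
layer = Linear(in_features=4, out_features=3)
x = Tensor(np.random.randn(2, 4))  # batch of 2 input vectors
y = layer.forward(x)               # (2, 4) @ (4, 3) + bias -> (2, 3)
print(y.shape)                     # (2, 3)
print(len(layer.parameters()))     # 2 (weight and bias)
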
class MultiHeadAttention:
    """Minimal MultiHeadAttention - imports from Module 12 in practice."""
    def __init__(self, embed_dim, num_heads):
        assert embed_dim % num_heads == 0
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.q_proj = Linear(embed_dim, embed_dim)
        self.k_proj = Linear(embed_dim, embed_dim)
        self.v_proj = Linear(embed_dim, embed_dim)
        self.out_proj = Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        batch_size, seq_len, embed_dim = x.shape

        # Linear projections
        Q = self.q_proj.forward(x)
        K = self.k_proj.forward(x)
        V = self.v_proj.forward(x)

        # Reshape for multi-head attention
        Q = Q.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
        K = K.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
        V = V.reshape(batch_size, seq_len, self.num_heads, self.head_dim)

        # Transpose to (batch_size, num_heads, seq_len, head_dim)
        Q = Tensor(np.transpose(Q.data, (0, 2, 1, 3)))
        K = Tensor(np.transpose(K.data, (0, 2, 1, 3)))
        V = Tensor(np.transpose(V.data, (0, 2, 1, 3)))

        # Scaled dot-product attention
        scores = Tensor(np.matmul(Q.data, np.transpose(K.data, (0, 1, 3, 2))))
        scores = scores * (1.0 / math.sqrt(self.head_dim))

        # Apply causal mask for autoregressive generation
        if mask is not None:
            scores = Tensor(scores.data + mask.data)

        # Softmax
        attention_weights = self._softmax(scores)

        # Apply attention to values
        out = Tensor(np.matmul(attention_weights.data, V.data))

        # Transpose back and reshape
        out = Tensor(np.transpose(out.data, (0, 2, 1, 3)))
        out = out.reshape(batch_size, seq_len, embed_dim)

        # Final linear projection
        return self.out_proj.forward(out)

    def _softmax(self, x):
        """Numerically stable softmax."""
        exp_x = Tensor(np.exp(x.data - np.max(x.data, axis=-1, keepdims=True)))
        return Tensor(exp_x.data / np.sum(exp_x.data, axis=-1, keepdims=True))

    def parameters(self):
        params = []
        params.extend(self.q_proj.parameters())
        params.extend(self.k_proj.parameters())
        params.extend(self.v_proj.parameters())
        params.extend(self.out_proj.parameters())
        return params

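# --- Usage sketch (illustrative, not part of the original file) ---
# One attention block over a toy sequence. forward() expects an additive mask:
# 0 where attention is allowed, a large negative number above the diagonal so
# softmax drives those weights to ~0. All dimensions here are arbitrary.
np.random.seed(0)
batch, seq, dim, heads = 2, 5, 8, 2
mha = MultiHeadAttention(embed_dim=dim, num_heads=heads)
x = Tensor(np.random.randn(batch, seq, dim))
causal_mask = Tensor(np.triu(np.full((seq, seq), -1e9), k=1))  # broadcasts over (batch, heads)
out = mha.forward(x, mask=causal_mask)
print(out.shape)  # (2, 5, 8) -- attention preserves the input shape
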
class Embedding:
    """Minimal Embedding layer - imports from Module 11 in practice."""
    def __init__(self, vocab_size, embed_dim):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        # Initialize with small random values
        self.weight = Tensor(np.random.normal(0, 0.02, (vocab_size, embed_dim)))

    def forward(self, indices):
        # Simple embedding lookup
        return Tensor(self.weight.data[indices.data])

    def parameters(self):
        return [self.weight]

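# --- Usage sketch (illustrative, not part of the original file) ---
# Embedding lookup for a small batch of integer token ids; sizes are arbitrary.
np.random.seed(0)
emb = Embedding(vocab_size=10, embed_dim=4)
ids = Tensor(np.array([[1, 2, 3], [4, 5, 6]]))  # (batch=2, seq=3) integer ids
vecs = emb.forward(ids)                          # row lookup -> (2, 3, 4)
print(vecs.shape)  # (2, 3, 4)
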
def gelu(x):
    """GELU activation function."""
    return Tensor(0.5 * x.data * (1 + np.tanh(np.sqrt(2 / np.pi) * (x.data + 0.044715 * x.data**3))))
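
# --- Usage sketch (illustrative, not part of the original file) ---
# gelu() above uses the tanh approximation:
# GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
z = Tensor(np.array([-2.0, 0.0, 2.0]))
print(gelu(z).data)  # roughly [-0.0454, 0.0, 1.9546]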