🤖 Fix transformer module exports and milestone 05 imports

Module export fixes:
- Add #|default_exp models.transformer directive to transformers module
- Add imports (MultiHeadAttention, GELU, etc.) to export block
- Export dataloader module (08_dataloader)
- All modules now properly exported to tinytorch package

Milestone 05 fixes:
- Correct import paths (text.embeddings, data.loader, models.transformer)
- Fix Linear.weight vs Linear.weights typo
- Fix indentation in training loop
- Call .forward() explicitly on transformer components

Status: Architecture test mode works, model builds successfully
TODO: Fix TransformerBlock/MultiHeadAttention signature mismatch in module 13
This commit is contained in:
Vijay Janapa Reddi
2025-10-27 16:17:55 -04:00
parent 170dde319a
commit 757e3bf7e1
10 changed files with 2575 additions and 1125 deletions

76
tinytorch/_modidx.py generated
View File

@@ -61,6 +61,16 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/core/activations.py'),
'tinytorch.core.activations.Tanh.forward': ( '02_activations/activations_dev.html#tanh.forward',
'tinytorch/core/activations.py')},
'tinytorch.core.attention': { 'tinytorch.core.attention.MultiHeadAttention': ( '12_attention/attention_dev.html#multiheadattention',
'tinytorch/core/attention.py'),
'tinytorch.core.attention.MultiHeadAttention.__init__': ( '12_attention/attention_dev.html#multiheadattention.__init__',
'tinytorch/core/attention.py'),
'tinytorch.core.attention.MultiHeadAttention.forward': ( '12_attention/attention_dev.html#multiheadattention.forward',
'tinytorch/core/attention.py'),
'tinytorch.core.attention.MultiHeadAttention.parameters': ( '12_attention/attention_dev.html#multiheadattention.parameters',
'tinytorch/core/attention.py'),
'tinytorch.core.attention.scaled_dot_product_attention': ( '12_attention/attention_dev.html#scaled_dot_product_attention',
'tinytorch/core/attention.py')},
'tinytorch.core.autograd': {},
'tinytorch.core.layers': { 'tinytorch.core.layers.Dropout': ('03_layers/layers_dev.html#dropout', 'tinytorch/core/layers.py'),
'tinytorch.core.layers.Dropout.__call__': ( '03_layers/layers_dev.html#dropout.__call__',
@@ -270,6 +280,72 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.TensorDataset.__len__': ( '08_dataloader/dataloader_dev.html#tensordataset.__len__',
'tinytorch/data/loader.py')},
'tinytorch.models.transformer': { 'tinytorch.models.transformer.GPT': ( '13_transformers/transformers_dev.html#gpt',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT.__init__': ( '13_transformers/transformers_dev.html#gpt.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT._create_causal_mask': ( '13_transformers/transformers_dev.html#gpt._create_causal_mask',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT.forward': ( '13_transformers/transformers_dev.html#gpt.forward',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT.generate': ( '13_transformers/transformers_dev.html#gpt.generate',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT.parameters': ( '13_transformers/transformers_dev.html#gpt.parameters',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm': ( '13_transformers/transformers_dev.html#layernorm',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm.__init__': ( '13_transformers/transformers_dev.html#layernorm.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm.forward': ( '13_transformers/transformers_dev.html#layernorm.forward',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm.parameters': ( '13_transformers/transformers_dev.html#layernorm.parameters',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP': ( '13_transformers/transformers_dev.html#mlp',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP.__init__': ( '13_transformers/transformers_dev.html#mlp.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP.forward': ( '13_transformers/transformers_dev.html#mlp.forward',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP.parameters': ( '13_transformers/transformers_dev.html#mlp.parameters',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock': ( '13_transformers/transformers_dev.html#transformerblock',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.__init__': ( '13_transformers/transformers_dev.html#transformerblock.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.forward': ( '13_transformers/transformers_dev.html#transformerblock.forward',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.parameters': ( '13_transformers/transformers_dev.html#transformerblock.parameters',
'tinytorch/models/transformer.py')},
'tinytorch.text.embeddings': { 'tinytorch.text.embeddings.Embedding': ( '11_embeddings/embeddings_dev.html#embedding',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.__init__': ( '11_embeddings/embeddings_dev.html#embedding.__init__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.__repr__': ( '11_embeddings/embeddings_dev.html#embedding.__repr__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.forward': ( '11_embeddings/embeddings_dev.html#embedding.forward',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.parameters': ( '11_embeddings/embeddings_dev.html#embedding.parameters',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer': ( '11_embeddings/embeddings_dev.html#embeddinglayer',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.__init__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__init__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.__repr__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__repr__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.forward': ( '11_embeddings/embeddings_dev.html#embeddinglayer.forward',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.parameters': ( '11_embeddings/embeddings_dev.html#embeddinglayer.parameters',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding': ( '11_embeddings/embeddings_dev.html#positionalencoding',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.__init__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__init__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.__repr__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__repr__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.forward': ( '11_embeddings/embeddings_dev.html#positionalencoding.forward',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.parameters': ( '11_embeddings/embeddings_dev.html#positionalencoding.parameters',
'tinytorch/text/embeddings.py')},
'tinytorch.text.tokenization': { 'tinytorch.text.tokenization.BPETokenizer': ( '10_tokenization/tokenization_dev.html#bpetokenizer',
'tinytorch/text/tokenization.py'),
'tinytorch.text.tokenization.BPETokenizer.__init__': ( '10_tokenization/tokenization_dev.html#bpetokenizer.__init__',

291
tinytorch/core/attention.py generated Normal file
View File

@@ -0,0 +1,291 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/07_attention/attention_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['scaled_dot_product_attention', 'MultiHeadAttention']
# %% ../../modules/source/12_attention/attention_dev.ipynb 0
#| default_exp core.attention
#| export
# %% ../../modules/source/12_attention/attention_dev.ipynb 2
import numpy as np
import math
import time
from typing import Optional, Tuple, List
# Import dependencies from previous modules - following TinyTorch dependency chain
from .tensor import Tensor
from .layers import Linear
# %% ../../modules/source/12_attention/attention_dev.ipynb 6
def scaled_dot_product_attention(Q: Tensor, K: Tensor, V: Tensor, mask: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
    """Scaled dot-product attention, written with explicit loops.

    Implements softmax(Q @ K^T / sqrt(d_k)) @ V one scalar at a time so the
    O(n^2 * d) cost of attention is visible directly in the code.

    Args:
        Q: Queries, shape (batch_size, seq_len, d_model).
        K: Keys, same shape as Q.
        V: Values, same shape as Q.
        mask: Optional mask of shape (batch_size, seq_len, seq_len); truthy
            entries allow attention, falsy entries block it.

    Returns:
        Tuple of (output, attention_weights): output has shape
        (batch_size, seq_len, d_model); attention_weights has shape
        (batch_size, seq_len, seq_len) with each row summing to ~1.0.

    Raises:
        AssertionError: If K or V does not match Q's shape.
    """
    ### BEGIN SOLUTION
    # Validate that all three inputs agree on shape.
    batch_size, seq_len, d_model = Q.shape
    assert K.shape == (batch_size, seq_len, d_model), f"K shape {K.shape} doesn't match Q shape {Q.shape}"
    assert V.shape == (batch_size, seq_len, d_model), f"V shape {V.shape} doesn't match Q shape {Q.shape}"

    # Raw scores: dot product of every query with every key. Scalar loops are
    # deliberate here -- they expose the quadratic pairwise structure.
    raw_scores = np.zeros((batch_size, seq_len, seq_len))
    for bi in range(batch_size):
        for qi in range(seq_len):
            for kj in range(seq_len):
                acc = 0.0
                for dim in range(d_model):
                    acc += Q.data[bi, qi, dim] * K.data[bi, kj, dim]
                raw_scores[bi, qi, kj] = acc

    # Scale by 1/sqrt(d_k) so the softmax stays in a well-conditioned range.
    raw_scores = raw_scores * (1.0 / math.sqrt(d_model))

    # Blocked positions get a large negative score, which softmax maps to ~0.
    if mask is not None:
        raw_scores = np.where(mask.data, raw_scores, -1e9)

    # Row-wise softmax over the key axis, with max-subtraction for stability.
    attention_weights = np.zeros_like(raw_scores)
    for bi in range(batch_size):
        for qi in range(seq_len):
            logits = raw_scores[bi, qi, :]
            shifted = np.exp(logits - np.max(logits))
            attention_weights[bi, qi, :] = shifted / np.sum(shifted)

    # Weighted sum of value vectors -- the second O(n^2) pass.
    output = np.zeros((batch_size, seq_len, d_model))
    for bi in range(batch_size):
        for qi in range(seq_len):
            for kj in range(seq_len):
                output[bi, qi, :] += attention_weights[bi, qi, kj] * V.data[bi, kj, :]
    return Tensor(output), Tensor(attention_weights)
    ### END SOLUTION
# %% ../../modules/source/12_attention/attention_dev.ipynb 10
class MultiHeadAttention:
    """
    Multi-head self-attention.

    Projects the input into num_heads independent query/key/value subspaces,
    runs scaled dot-product attention in each, and mixes the concatenated
    head outputs through a final linear projection.
    """

    def __init__(self, embed_dim: int, num_heads: int):
        """
        Configure the four linear projections used by the attention layer.

        Args:
            embed_dim: Model embedding dimension (d_model).
            num_heads: Number of parallel heads; must divide embed_dim evenly.

        Raises:
            AssertionError: If embed_dim is not divisible by num_heads.
        """
        ### BEGIN SOLUTION
        assert embed_dim % num_heads == 0, f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        # Each head operates on a slice of size embed_dim / num_heads.
        self.head_dim = embed_dim // num_heads
        # Separate projections for queries, keys and values ...
        self.q_proj = Linear(embed_dim, embed_dim)
        self.k_proj = Linear(embed_dim, embed_dim)
        self.v_proj = Linear(embed_dim, embed_dim)
        # ... plus one output projection that mixes information across heads.
        self.out_proj = Linear(embed_dim, embed_dim)
        ### END SOLUTION

    def forward(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor:
        """
        Attend over the sequence with all heads, in head-by-head passes.

        Args:
            x: Input of shape (batch_size, seq_len, embed_dim).
            mask: Optional attention mask (batch_size, seq_len, seq_len),
                forwarded unchanged to every head.

        Returns:
            Tensor with the same shape as x.

        Raises:
            AssertionError: If x's last dimension differs from embed_dim.
        """
        ### BEGIN SOLUTION
        batch_size, seq_len, embed_dim = x.shape
        assert embed_dim == self.embed_dim, f"Input dim {embed_dim} doesn't match expected {self.embed_dim}"

        # Project, then split the feature axis into per-head slices:
        # (batch, seq, embed) -> (batch, seq, heads, head_dim)
        #                     -> (batch, heads, seq, head_dim)
        def split_heads(projected: Tensor) -> np.ndarray:
            cube = projected.data.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
            return cube.transpose(0, 2, 1, 3)

        q_split = split_heads(self.q_proj.forward(x))
        k_split = split_heads(self.k_proj.forward(x))
        v_split = split_heads(self.v_proj.forward(x))

        # Run scaled dot-product attention independently for each head.
        per_head = []
        for head in range(self.num_heads):
            attended, _ = scaled_dot_product_attention(
                Tensor(q_split[:, head, :, :]),
                Tensor(k_split[:, head, :, :]),
                Tensor(v_split[:, head, :, :]),
                mask,
            )
            per_head.append(attended.data)

        # Merge heads back: stack -> (batch, heads, seq, head_dim),
        # transpose -> (batch, seq, heads, head_dim), reshape -> (batch, seq, embed).
        merged = np.stack(per_head, axis=1).transpose(0, 2, 1, 3)
        merged = merged.reshape(batch_size, seq_len, self.embed_dim)

        # Final projection mixes information across the heads.
        return self.out_proj.forward(Tensor(merged))
        ### END SOLUTION

    def parameters(self) -> List[Tensor]:
        """
        Return all trainable parameters.

        Returns:
            Parameters of the q, k, v and output projections, in that order.
        """
        ### BEGIN SOLUTION
        params: List[Tensor] = []
        for proj in (self.q_proj, self.k_proj, self.v_proj, self.out_proj):
            params.extend(proj.parameters())
        return params
        ### END SOLUTION

462
tinytorch/models/transformer.py generated Normal file
View File

@@ -0,0 +1,462 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_transformer/transformer_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['LayerNorm', 'MLP', 'TransformerBlock', 'GPT']
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 2
import numpy as np
from ..core.tensor import Tensor
from ..core.layers import Linear
from ..core.attention import MultiHeadAttention
from ..core.activations import GELU
# Embedding is used by GPT.__init__ but was missing from the imports.
from ..text.embeddings import Embedding
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 9
class LayerNorm:
    """
    Layer normalization over the last (feature) axis.

    Each sample is normalized independently of the rest of the batch, then
    rescaled by a learnable gain (gamma) and shifted by a learnable bias
    (beta) -- unlike batch norm, which normalizes across the batch.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        """
        Create the learnable scale/shift parameters.

        Args:
            normalized_shape: Size of the trailing feature dimension(s),
                typically the embedding dimension.
            eps: Small constant added to the variance so the division is
                numerically safe.
        """
        ### BEGIN SOLUTION
        self.normalized_shape = normalized_shape
        self.eps = eps
        # gamma starts as the identity scale, beta as a zero shift.
        self.gamma = Tensor(np.ones(normalized_shape))
        self.beta = Tensor(np.zeros(normalized_shape))
        ### END SOLUTION

    def forward(self, x):
        """
        Normalize x across its last dimension.

        Computes gamma * (x - mu) / sqrt(var + eps) + beta, where mu and var
        are taken per sample over the feature axis.
        """
        ### BEGIN SOLUTION
        # Per-sample statistics over the feature axis (keepdims for broadcast).
        mu = x.mean(axis=-1, keepdims=True)
        centered = x.data - mu.data
        var = (centered ** 2).mean(axis=-1, keepdims=True)
        # Standardize, then apply the learnable affine transform.
        standardized = Tensor(centered / np.sqrt(var + self.eps))
        return standardized * self.gamma + self.beta
        ### END SOLUTION

    def parameters(self):
        """Return the learnable scale (gamma) and shift (beta) tensors."""
        return [self.gamma, self.beta]
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 13
class MLP:
    """
    Feed-forward network for transformer blocks: Linear -> GELU -> Linear.

    Expands from embed_dim to hidden_dim (4x by default) and projects back,
    providing the per-position non-linear transformation of each block.
    """

    def __init__(self, embed_dim, hidden_dim=None, dropout_prob=0.1):
        """
        Set up the two linear layers and the activation.

        Args:
            embed_dim: Input/output feature size.
            hidden_dim: Inner expansion size; defaults to 4 * embed_dim.
            dropout_prob: Accepted for interface compatibility; dropout is
                not applied in this implementation.
        """
        ### BEGIN SOLUTION
        if hidden_dim is None:
            hidden_dim = 4 * embed_dim  # Standard 4x expansion
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        # Two-layer feed-forward network
        self.linear1 = Linear(embed_dim, hidden_dim)
        self.linear2 = Linear(hidden_dim, embed_dim)
        # BUG FIX: this module imports only the GELU class (no bare `gelu`
        # function exists here), so keep one instance for forward() to call.
        self.activation = GELU()
        ### END SOLUTION

    def forward(self, x):
        """
        Apply Linear -> GELU -> Linear to x.

        Args:
            x: Tensor of shape (..., embed_dim).

        Returns:
            Tensor of the same shape as x.
        """
        ### BEGIN SOLUTION
        # First linear layer with expansion
        hidden = self.linear1.forward(x)
        # BUG FIX: the original called an undefined `gelu(...)` (NameError at
        # runtime); use the imported GELU class instance instead.
        hidden = self.activation.forward(hidden)
        # Second linear layer back to original size
        output = self.linear2.forward(hidden)
        return output
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters of both linear layers."""
        params = []
        params.extend(self.linear1.parameters())
        params.extend(self.linear2.parameters())
        return params
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 17
class TransformerBlock:
    """
    One pre-norm transformer block: self-attention + MLP with residuals.

    Data flow:
        x -> LayerNorm -> MultiHeadAttention -> + x ->
             LayerNorm -> MLP -> + -> output
    """

    def __init__(self, embed_dim, num_heads, mlp_ratio=4, dropout_prob=0.1):
        """
        Build the attention, normalization, and feed-forward sub-layers.

        Args:
            embed_dim: Embedding dimension of the sequence.
            num_heads: Number of attention heads.
            mlp_ratio: Expansion ratio for the MLP hidden layer.
            dropout_prob: Accepted for interface compatibility; not applied.
        """
        ### BEGIN SOLUTION
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        # Multi-head self-attention
        self.attention = MultiHeadAttention(embed_dim, num_heads)
        # Layer normalizations (pre-norm architecture)
        self.ln1 = LayerNorm(embed_dim)  # Before attention
        self.ln2 = LayerNorm(embed_dim)  # Before MLP
        # Feed-forward network
        hidden_dim = int(embed_dim * mlp_ratio)
        self.mlp = MLP(embed_dim, hidden_dim)
        ### END SOLUTION

    def forward(self, x, mask=None):
        """
        Apply the block to a sequence.

        Residual connections around both sub-layers let gradients flow
        directly through deep stacks of blocks.

        Args:
            x: Tensor of shape (batch, seq, embed_dim).
            mask: Optional attention mask, passed through to self-attention.

        Returns:
            Tensor of shape (batch, seq, embed_dim).
        """
        ### BEGIN SOLUTION
        # First sub-layer: pre-norm self-attention with a residual connection.
        normed1 = self.ln1.forward(x)
        # BUG FIX: MultiHeadAttention.forward takes (x, mask) and derives
        # Q, K, V from x internally; the original passed normed1 three times
        # plus the mask, which does not match that signature (TypeError).
        attention_out = self.attention.forward(normed1, mask)
        x = x + attention_out
        # Second sub-layer: pre-norm MLP with a residual connection.
        normed2 = self.ln2.forward(x)
        mlp_out = self.mlp.forward(normed2)
        output = x + mlp_out
        return output
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters of the block's sub-layers."""
        params = []
        params.extend(self.attention.parameters())
        params.extend(self.ln1.parameters())
        params.extend(self.ln2.parameters())
        params.extend(self.mlp.parameters())
        return params
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 21
class GPT:
    """
    Minimal GPT (decoder-only transformer) language model.

    Pipeline:
        tokens -> token embedding + positional embedding ->
        N transformer blocks (with causal masking) ->
        final LayerNorm -> linear head -> vocabulary logits
    """

    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, max_seq_len=1024):
        """
        Build embeddings, transformer blocks, and the language-model head.

        Args:
            vocab_size: Number of tokens in the vocabulary.
            embed_dim: Embedding dimension used throughout the model.
            num_layers: Number of stacked TransformerBlocks.
            num_heads: Attention heads per block.
            max_seq_len: Maximum sequence length for positional embeddings.
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len
        # Token and learned positional embeddings.
        self.token_embedding = Embedding(vocab_size, embed_dim)
        self.position_embedding = Embedding(max_seq_len, embed_dim)
        # Stack of transformer blocks
        self.blocks = []
        for _ in range(num_layers):
            block = TransformerBlock(embed_dim, num_heads)
            self.blocks.append(block)
        # Final layer normalization stabilizes the pre-head activations.
        self.ln_f = LayerNorm(embed_dim)
        # Language modeling head (projects to vocabulary)
        self.lm_head = Linear(embed_dim, vocab_size, bias=False)
        ### END SOLUTION

    def forward(self, tokens):
        """
        Compute next-token logits for a batch of token sequences.

        Args:
            tokens: Integer Tensor of shape (batch_size, seq_len).

        Returns:
            Logits Tensor of shape (batch_size, seq_len, vocab_size).
        """
        ### BEGIN SOLUTION
        batch_size, seq_len = tokens.shape
        # Token embeddings
        token_emb = self.token_embedding.forward(tokens)
        # Positional embeddings (shape (1, seq, embed); broadcast over batch).
        positions = Tensor(np.arange(seq_len).reshape(1, seq_len))
        pos_emb = self.position_embedding.forward(positions)
        # Combine embeddings
        x = token_emb + pos_emb
        # Causal mask so position i can only attend to positions <= i.
        mask = self._create_causal_mask(seq_len, batch_size)
        # Pass through transformer blocks
        for block in self.blocks:
            x = block.forward(x, mask)
        # Final layer normalization
        x = self.ln_f.forward(x)
        # Language modeling head
        logits = self.lm_head.forward(x)
        return logits
        ### END SOLUTION

    def _create_causal_mask(self, seq_len, batch_size=1):
        """
        Build a boolean causal mask of shape (batch_size, seq_len, seq_len).

        True means "may attend", matching scaled_dot_product_attention, which
        blocks exactly the positions where the mask is falsy and indexes the
        mask as mask.data[b, i, j].

        BUG FIX: the original returned a 2-D additive (-inf) mask; the
        attention kernel expects a 3-D truthy-allow mask, so the old mask had
        both the wrong rank and inverted semantics.
        """
        ### BEGIN SOLUTION
        # Lower-triangular True: row i may attend to columns 0..i.
        allow = np.tril(np.ones((seq_len, seq_len), dtype=bool))
        tiled = np.broadcast_to(allow, (batch_size, seq_len, seq_len)).copy()
        return Tensor(tiled)
        ### END SOLUTION

    def generate(self, prompt_tokens, max_new_tokens=50, temperature=1.0):
        """
        Autoregressively extend a prompt by sampling one token at a time.

        Args:
            prompt_tokens: Tensor of shape (1, prompt_len) with token ids.
            max_new_tokens: Number of tokens to sample.
            temperature: Softmax temperature; higher values flatten the
                sampling distribution.

        Returns:
            Tensor of shape (1, prompt_len + max_new_tokens).
        """
        ### BEGIN SOLUTION
        current_tokens = Tensor(prompt_tokens.data.copy())
        for _ in range(max_new_tokens):
            # Get logits for current sequence
            logits = self.forward(current_tokens)
            # Only the last position predicts the next token.
            last_logits = logits.data[:, -1, :]  # (batch_size, vocab_size)
            # Apply temperature scaling
            scaled_logits = last_logits / temperature
            # Softmax with max-subtraction for numerical stability.
            exp_logits = np.exp(scaled_logits - np.max(scaled_logits, axis=-1, keepdims=True))
            probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
            # Sample the next token and append it to the running sequence.
            next_token = np.array([[np.random.choice(self.vocab_size, p=probs[0])]])
            current_tokens = Tensor(np.concatenate([current_tokens.data, next_token], axis=1))
        return current_tokens
        ### END SOLUTION

    def parameters(self):
        """Return all learnable parameters of the full model."""
        params = []
        params.extend(self.token_embedding.parameters())
        params.extend(self.position_embedding.parameters())
        for block in self.blocks:
            params.extend(block.parameters())
        params.extend(self.ln_f.parameters())
        params.extend(self.lm_head.parameters())
        return params

333
tinytorch/text/embeddings.py generated Normal file
View File

@@ -0,0 +1,333 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_embeddings/embeddings_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['Embedding', 'PositionalEncoding', 'EmbeddingLayer']
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 2
import numpy as np
import math
from typing import List, Optional, Tuple
# Import from previous modules - following dependency chain
from ..core.tensor import Tensor
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 6
class Embedding:
    """
    Learnable embedding layer that maps token indices to dense vectors.

    This is the fundamental building block for converting discrete tokens
    into continuous representations that neural networks can process.

    EXAMPLE:
    >>> embed = Embedding(vocab_size=100, embed_dim=64)
    >>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])  # batch_size=2, seq_len=3
    >>> output = embed.forward(tokens)
    >>> print(output.shape)
    (2, 3, 64)
    """
    ### BEGIN SOLUTION
    def __init__(self, vocab_size: int, embed_dim: int):
        """
        Build the (vocab_size, embed_dim) lookup table.

        Args:
            vocab_size: Number of unique tokens in the vocabulary.
            embed_dim: Dimension of each embedding vector.
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        # Xavier/Glorot uniform keeps initial activation scales well-behaved.
        bound = math.sqrt(6.0 / (vocab_size + embed_dim))
        self.weight = Tensor(
            np.random.uniform(-bound, bound, (vocab_size, embed_dim)),
            requires_grad=True,
        )

    def forward(self, indices: Tensor) -> Tensor:
        """
        Look up embedding rows for the given token indices.

        Args:
            indices: Token indices of shape (batch_size, seq_len) or (seq_len,).

        Returns:
            Embedded vectors of shape (*indices.shape, embed_dim).

        Raises:
            ValueError: If any index falls outside [0, vocab_size).
        """
        ids = indices.data
        if np.any(ids < 0) or np.any(ids >= self.vocab_size):
            raise ValueError(
                f"Index out of range. Expected 0 <= indices < {self.vocab_size}, "
                f"got min={np.min(indices.data)}, max={np.max(indices.data)}"
            )
        # Advanced indexing is the efficient equivalent of one-hot @ weight.
        return Tensor(self.weight.data[ids.astype(int)])

    def parameters(self) -> List[Tensor]:
        """Return trainable parameters."""
        return [self.weight]

    def __repr__(self):
        return f"Embedding(vocab_size={self.vocab_size}, embed_dim={self.embed_dim})"
    ### END SOLUTION
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 10
class PositionalEncoding:
    """
    Learnable positional encoding layer.

    Adds trainable position-specific vectors to token embeddings,
    allowing the model to learn positional patterns specific to the task.

    EXAMPLE:
    >>> pos_enc = PositionalEncoding(max_seq_len=512, embed_dim=64)
    >>> embeddings = Tensor(np.random.randn(2, 10, 64))  # (batch, seq, embed)
    >>> output = pos_enc.forward(embeddings)
    >>> print(output.shape)
    (2, 10, 64)  # Same shape, but now position-aware
    """
    ### BEGIN SOLUTION
    def __init__(self, max_seq_len: int, embed_dim: int):
        """
        Create the learnable position table.

        Args:
            max_seq_len: Maximum sequence length to support.
            embed_dim: Embedding dimension (must match token embeddings).
        """
        self.max_seq_len = max_seq_len
        self.embed_dim = embed_dim
        # Smaller init than token embeddings since these vectors are additive.
        bound = math.sqrt(2.0 / embed_dim)
        self.position_embeddings = Tensor(
            np.random.uniform(-bound, bound, (max_seq_len, embed_dim)),
            requires_grad=True,
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Add positional encodings to input embeddings.

        Args:
            x: Input embeddings of shape (batch_size, seq_len, embed_dim).

        Returns:
            Position-encoded embeddings of the same shape.

        Raises:
            ValueError: On non-3D input, overlong sequences, or a dim mismatch.
        """
        if len(x.shape) != 3:
            raise ValueError(f"Expected 3D input (batch, seq, embed), got shape {x.shape}")
        _, seq_len, embed_dim = x.shape
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds maximum {self.max_seq_len}"
            )
        if embed_dim != self.embed_dim:
            raise ValueError(
                f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}"
            )
        # Slice the table to this length, then broadcast over the batch axis.
        offsets = self.position_embeddings.data[np.newaxis, :seq_len, :]
        return Tensor(x.data + offsets)

    def parameters(self) -> List[Tensor]:
        """Return trainable parameters."""
        return [self.position_embeddings]

    def __repr__(self):
        return f"PositionalEncoding(max_seq_len={self.max_seq_len}, embed_dim={self.embed_dim})"
    ### END SOLUTION
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 18
class EmbeddingLayer:
    """
    Complete embedding system combining token and positional embeddings.

    This is the production-ready component that handles the full embedding
    pipeline used in transformers and other sequence models.

    EXAMPLE:
    >>> embed_layer = EmbeddingLayer(
    ...     vocab_size=50000,
    ...     embed_dim=512,
    ...     max_seq_len=2048,
    ...     pos_encoding='learned'
    ... )
    >>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])
    >>> output = embed_layer.forward(tokens)
    >>> print(output.shape)
    (2, 3, 512)
    """
    ### BEGIN SOLUTION
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        max_seq_len: int = 512,
        pos_encoding: str = 'learned',
        scale_embeddings: bool = False
    ):
        """
        Initialize the complete embedding system.

        Args:
            vocab_size: Size of vocabulary.
            embed_dim: Embedding dimension.
            max_seq_len: Maximum sequence length for positional encoding.
            pos_encoding: 'learned', 'sinusoidal', or None.
            scale_embeddings: Whether to scale embeddings by sqrt(embed_dim).

        Raises:
            ValueError: If pos_encoding is not one of the supported options.
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len
        self.pos_encoding_type = pos_encoding
        self.scale_embeddings = scale_embeddings

        self.token_embedding = Embedding(vocab_size, embed_dim)

        if pos_encoding == 'learned':
            # Trainable position table.
            self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim)
        elif pos_encoding == 'sinusoidal':
            # Fixed sinusoidal table — contributes no parameters.
            self.pos_encoding = create_sinusoidal_embeddings(max_seq_len, embed_dim)
        elif pos_encoding is None:
            self.pos_encoding = None
        else:
            raise ValueError(f"Unknown pos_encoding: {pos_encoding}. Use 'learned', 'sinusoidal', or None")

    def forward(self, tokens: Tensor) -> Tensor:
        """
        Embed token indices and mix in positional information.

        Args:
            tokens: Token indices of shape (batch_size, seq_len) or (seq_len,).

        Returns:
            Embedded tokens with positional information; a 1D input yields a
            2D (seq_len, embed_dim) result.
        """
        # Promote 1D (seq,) input to a batch of one so the pipeline is uniform.
        added_batch = len(tokens.shape) == 1
        if added_batch:
            tokens = Tensor(tokens.data[np.newaxis, :])

        embeds = self.token_embedding.forward(tokens)  # (batch, seq, embed)

        # Optional sqrt(d) scaling (transformer convention).
        if self.scale_embeddings:
            embeds = Tensor(embeds.data * math.sqrt(self.embed_dim))

        if self.pos_encoding_type == 'learned':
            output = self.pos_encoding.forward(embeds)
        elif self.pos_encoding_type == 'sinusoidal':
            seq_len = embeds.shape[1]
            # Slice the fixed table to this length and broadcast over the batch.
            table = self.pos_encoding.data[np.newaxis, :seq_len, :]
            output = Tensor(embeds.data + table)
        else:
            # No positional information requested.
            output = embeds

        # Undo the temporary batch axis for 1D callers.
        return Tensor(output.data[0]) if added_batch else output

    def parameters(self) -> List[Tensor]:
        """Return all trainable parameters."""
        params = self.token_embedding.parameters()
        if self.pos_encoding_type == 'learned':
            params.extend(self.pos_encoding.parameters())
        return params

    def __repr__(self):
        return (f"EmbeddingLayer(vocab_size={self.vocab_size}, "
                f"embed_dim={self.embed_dim}, "
                f"pos_encoding='{self.pos_encoding_type}')")
    ### END SOLUTION