# ╔═══════════════════════════════════════════════════════════════════════════╗
# ║                           🚨 CRITICAL WARNING 🚨                           ║
# ║                        AUTOGENERATED! DO NOT EDIT!                         ║
# ║                                                                            ║
# ║ This file is AUTOMATICALLY GENERATED from source modules.                  ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!           ║
# ║                                                                            ║
# ║ ✅ TO EDIT: modules/source/11_embeddings/embeddings_dev.py                 ║
# ║ ✅ TO EXPORT: Run 'tito module complete '                                   ║
# ║                                                                            ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations.       ║
# ║    Editing it directly may break module functionality and training.        ║
# ║                                                                            ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development   ║
# ║    happens! The tinytorch/ directory is just the compiled output.          ║
# ╚═══════════════════════════════════════════════════════════════════════════╝

# %% auto 0
__all__ = ['Embedding', 'PositionalEncoding', 'EmbeddingLayer']

# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 2
import numpy as np
import math
from typing import List, Optional, Tuple

# Import from previous modules - following the dependency chain
from ..core.tensor import Tensor

# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 6
class Embedding:
    """
    Learnable embedding layer that maps token indices to dense vectors.

    This is the fundamental building block for converting discrete tokens
    into continuous representations that neural networks can process.

    TODO: Implement the Embedding class

    APPROACH:
    1. Initialize the embedding matrix with random weights (vocab_size, embed_dim)
    2. Implement the forward pass as a matrix lookup using numpy indexing
    3. Handle batch dimensions correctly
    4. Return parameters for optimization

    EXAMPLE:
    >>> embed = Embedding(vocab_size=100, embed_dim=64)
    >>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])  # batch_size=2, seq_len=3
    >>> output = embed.forward(tokens)
    >>> print(output.shape)
    (2, 3, 64)

    HINTS:
    - Use numpy advanced indexing for the lookup: weight[indices]
    - Embedding matrix shape: (vocab_size, embed_dim)
    - Initialize with Xavier/Glorot uniform for stable gradients
    - Handle multi-dimensional indices correctly
    """
    ### BEGIN SOLUTION
    def __init__(self, vocab_size: int, embed_dim: int):
        """
        Initialize embedding layer.

        Args:
            vocab_size: Size of vocabulary (number of unique tokens)
            embed_dim: Dimension of embedding vectors
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        # Xavier initialization for better gradient flow
        limit = math.sqrt(6.0 / (vocab_size + embed_dim))
        self.weight = Tensor(
            np.random.uniform(-limit, limit, (vocab_size, embed_dim)),
            requires_grad=True
        )

    def forward(self, indices: Tensor) -> Tensor:
        """
        Forward pass: look up embeddings for the given indices.

        Args:
            indices: Token indices of shape (batch_size, seq_len) or (seq_len,)

        Returns:
            Embedded vectors of shape (*indices.shape, embed_dim)
        """
        # Input validation: every index must fall inside the vocabulary
        if np.any(indices.data >= self.vocab_size) or np.any(indices.data < 0):
            raise ValueError(
                f"Index out of range. Expected 0 <= indices < {self.vocab_size}, "
                f"got min={np.min(indices.data)}, max={np.max(indices.data)}"
            )

        # Perform embedding lookup using advanced indexing.
        # This is equivalent to one-hot multiplication but much more efficient.
        embedded = self.weight.data[indices.data.astype(int)]

        # Create result tensor
        result = Tensor(embedded, requires_grad=self.weight.requires_grad)

        # Attach gradient function (students learned this in Module 05!)
        if self.weight.requires_grad:
            from tinytorch.core.autograd import EmbeddingBackward
            result._grad_fn = EmbeddingBackward(self.weight, indices)

        return result

    def parameters(self) -> List[Tensor]:
        """Return trainable parameters."""
        return [self.weight]

    def __repr__(self):
        return f"Embedding(vocab_size={self.vocab_size}, embed_dim={self.embed_dim})"
    ### END SOLUTION
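
# Illustrative sketch (not part of the exported API): Embedding.forward notes that the
# advanced-indexing lookup is equivalent to a one-hot matrix multiplication. The helper
# below is a minimal, hedged demonstration of that equivalence in plain NumPy; the name
# `_demo_embedding_lookup_equivalence` is introduced here and does not exist elsewhere
# in TinyTorch.
def _demo_embedding_lookup_equivalence():
    weight = np.random.randn(5, 4)       # embedding matrix: (vocab_size=5, embed_dim=4)
    indices = np.array([1, 3, 3])        # token ids: (seq_len=3)

    lookup = weight[indices]             # advanced indexing: gather of 3 rows -> (3, 4)
    one_hot = np.eye(5)[indices]         # one-hot encoding of the ids -> (3, 5)
    matmul = one_hot @ weight            # dense (3, 5) @ (5, 4) matmul -> (3, 4)

    assert np.allclose(lookup, matmul)   # both paths produce identical embeddings
    return lookup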

# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 10
class PositionalEncoding:
    """
    Learnable positional encoding layer.

    Adds trainable position-specific vectors to token embeddings, allowing the
    model to learn positional patterns specific to the task.

    TODO: Implement learnable positional encoding

    APPROACH:
    1. Create an embedding matrix for positions: (max_seq_len, embed_dim)
    2. Forward pass: look up position embeddings and add them to the input
    3. Handle different sequence lengths gracefully
    4. Return parameters for training

    EXAMPLE:
    >>> pos_enc = PositionalEncoding(max_seq_len=512, embed_dim=64)
    >>> embeddings = Tensor(np.random.randn(2, 10, 64))  # (batch, seq, embed)
    >>> output = pos_enc.forward(embeddings)
    >>> print(output.shape)
    (2, 10, 64)  # Same shape, but now position-aware

    HINTS:
    - Position embeddings shape: (max_seq_len, embed_dim)
    - Use the slice [:seq_len] to handle variable lengths
    - Add position encodings to input embeddings element-wise
    - Initialize with smaller values than token embeddings (they're additive)
    """
    ### BEGIN SOLUTION
    def __init__(self, max_seq_len: int, embed_dim: int):
        """
        Initialize learnable positional encoding.

        Args:
            max_seq_len: Maximum sequence length to support
            embed_dim: Embedding dimension (must match token embeddings)
        """
        self.max_seq_len = max_seq_len
        self.embed_dim = embed_dim

        # Initialize the position embedding matrix.
        # Smaller initialization than token embeddings since these are additive.
        limit = math.sqrt(2.0 / embed_dim)
        self.position_embeddings = Tensor(
            np.random.uniform(-limit, limit, (max_seq_len, embed_dim)),
            requires_grad=True
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Add positional encodings to input embeddings.

        Args:
            x: Input embeddings of shape (batch_size, seq_len, embed_dim)

        Returns:
            Position-encoded embeddings of the same shape
        """
        if len(x.shape) != 3:
            raise ValueError(f"Expected 3D input (batch, seq, embed), got shape {x.shape}")

        batch_size, seq_len, embed_dim = x.shape

        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds maximum {self.max_seq_len}"
            )
        if embed_dim != self.embed_dim:
            raise ValueError(
                f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}"
            )

        # Get position embeddings for this sequence length (slice using .data for efficiency)
        pos_embeddings_data = self.position_embeddings.data[:seq_len]  # (seq_len, embed_dim)

        # Broadcast to match the batch dimension: (1, seq_len, embed_dim)
        pos_embeddings_data = pos_embeddings_data[np.newaxis, :, :]

        # Wrap in a Tensor to preserve requires_grad
        pos_embeddings = Tensor(
            pos_embeddings_data,
            requires_grad=self.position_embeddings.requires_grad
        )

        # Add positional information using a Tensor operation to preserve gradients!
        result = x + pos_embeddings

        return result

    def parameters(self) -> List[Tensor]:
        """Return trainable parameters."""
        return [self.position_embeddings]

    def __repr__(self):
        return f"PositionalEncoding(max_seq_len={self.max_seq_len}, embed_dim={self.embed_dim})"
    ### END SOLUTION
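
# Illustrative sketch (assumption, not part of the exported API): EmbeddingLayer below
# calls `create_sinusoidal_embeddings(max_seq_len, embed_dim)` for its 'sinusoidal' mode,
# but that helper is neither defined nor imported in this file. The definition below is a
# minimal sketch assuming the standard fixed sin/cos formulation (Vaswani et al., 2017)
# and a Tensor return value, so that the 'sinusoidal' branch resolves; the real TinyTorch
# helper may differ.
def create_sinusoidal_embeddings(max_seq_len: int, embed_dim: int) -> Tensor:
    """Fixed (non-learnable) sinusoidal position encodings of shape (max_seq_len, embed_dim)."""
    positions = np.arange(max_seq_len)[:, np.newaxis]                    # (max_seq_len, 1)
    # Frequencies decay geometrically across the embedding dimensions
    div_term = np.exp(np.arange(0, embed_dim, 2) * -(math.log(10000.0) / embed_dim))
    encodings = np.zeros((max_seq_len, embed_dim))
    encodings[:, 0::2] = np.sin(positions * div_term)                    # even dims: sine
    encodings[:, 1::2] = np.cos(positions * div_term[:embed_dim // 2])   # odd dims: cosine
    return Tensor(encodings, requires_grad=False)                        # fixed, no gradients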

# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 18
class EmbeddingLayer:
    """
    Complete embedding system combining token and positional embeddings.

    This is the production-ready component that handles the full embedding
    pipeline used in transformers and other sequence models.

    TODO: Implement complete embedding system

    APPROACH:
    1. Combine token embedding + positional encoding
    2. Support both learned and sinusoidal position encodings
    3. Handle variable sequence lengths gracefully
    4. Add optional embedding scaling (Transformer convention)

    EXAMPLE:
    >>> embed_layer = EmbeddingLayer(
    ...     vocab_size=50000,
    ...     embed_dim=512,
    ...     max_seq_len=2048,
    ...     pos_encoding='learned'
    ... )
    >>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])
    >>> output = embed_layer.forward(tokens)
    >>> print(output.shape)
    (2, 3, 512)

    HINTS:
    - First apply the token embedding, then add the positional encoding
    - Support 'learned', 'sinusoidal', or None for pos_encoding
    - Handle both 2D (batch, seq) and 1D (seq) inputs gracefully
    - Scale embeddings by sqrt(embed_dim) if requested (transformer convention)
    """
    ### BEGIN SOLUTION
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        max_seq_len: int = 512,
        pos_encoding: Optional[str] = 'learned',
        scale_embeddings: bool = False
    ):
        """
        Initialize complete embedding system.

        Args:
            vocab_size: Size of vocabulary
            embed_dim: Embedding dimension
            max_seq_len: Maximum sequence length for positional encoding
            pos_encoding: Type of positional encoding ('learned', 'sinusoidal', or None)
            scale_embeddings: Whether to scale embeddings by sqrt(embed_dim)
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len
        self.pos_encoding_type = pos_encoding
        self.scale_embeddings = scale_embeddings

        # Token embedding layer
        self.token_embedding = Embedding(vocab_size, embed_dim)

        # Positional encoding
        if pos_encoding == 'learned':
            self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim)
        elif pos_encoding == 'sinusoidal':
            # Create fixed sinusoidal encodings (no parameters)
            self.pos_encoding = create_sinusoidal_embeddings(max_seq_len, embed_dim)
        elif pos_encoding is None:
            self.pos_encoding = None
        else:
            raise ValueError(
                f"Unknown pos_encoding: {pos_encoding}. Use 'learned', 'sinusoidal', or None"
            )
    def forward(self, tokens: Tensor) -> Tensor:
        """
        Forward pass through the complete embedding system.

        Args:
            tokens: Token indices of shape (batch_size, seq_len) or (seq_len,)

        Returns:
            Embedded tokens with positional information
        """
        # Handle 1D input by adding a batch dimension
        if len(tokens.shape) == 1:
            tokens = Tensor(tokens.data[np.newaxis, :])  # (1, seq_len)
            squeeze_batch = True
        else:
            squeeze_batch = False

        # Get token embeddings
        token_embeds = self.token_embedding.forward(tokens)  # (batch, seq, embed)

        # Scale embeddings if requested (transformer convention).
        # Note: wrapping .data in a fresh Tensor detaches the result from the autograd graph.
        if self.scale_embeddings:
            token_embeds = Tensor(token_embeds.data * math.sqrt(self.embed_dim))

        # Add positional encoding
        if self.pos_encoding_type == 'learned':
            # Use learnable positional encoding
            output = self.pos_encoding.forward(token_embeds)
        elif self.pos_encoding_type == 'sinusoidal':
            # Use fixed sinusoidal encoding
            batch_size, seq_len, embed_dim = token_embeds.shape
            pos_embeddings = self.pos_encoding.data[:seq_len]  # (seq_len, embed_dim)
            pos_embeddings = pos_embeddings[np.newaxis, :, :]  # (1, seq_len, embed_dim)
            output = Tensor(token_embeds.data + pos_embeddings)
        else:
            # No positional encoding
            output = token_embeds

        # Remove the batch dimension if it was added
        if squeeze_batch:
            output = Tensor(output.data[0])  # (seq_len, embed_dim)

        return output

    def parameters(self) -> List[Tensor]:
        """Return all trainable parameters."""
        params = self.token_embedding.parameters()
        if self.pos_encoding_type == 'learned':
            params.extend(self.pos_encoding.parameters())
        return params

    def __repr__(self):
        return (f"EmbeddingLayer(vocab_size={self.vocab_size}, "
                f"embed_dim={self.embed_dim}, "
                f"pos_encoding='{self.pos_encoding_type}')")
    ### END SOLUTION
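
# Usage sketch (illustrative only, not part of the exported API): a small end-to-end
# check mirroring the EmbeddingLayer docstring example with a tiny vocabulary. The
# function name is introduced here and the sizes are arbitrary.
def _demo_embedding_layer():
    embed_layer = EmbeddingLayer(
        vocab_size=100,
        embed_dim=64,
        max_seq_len=32,
        pos_encoding='learned',
        scale_embeddings=True,   # multiplies token embeddings by sqrt(64) = 8.0
    )
    tokens = Tensor([[1, 2, 3], [4, 5, 6]])   # (batch_size=2, seq_len=3)
    output = embed_layer.forward(tokens)
    assert output.shape == (2, 3, 64)         # batch and sequence dims preserved

    # Only the learned components contribute parameters: the token embedding matrix
    # (100 x 64) and the positional embedding matrix (32 x 64).
    assert len(embed_layer.parameters()) == 2
    return output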