TinyTorch/tinytorch/text/embeddings.py

# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║                        🚨 CRITICAL WARNING 🚨                                ║
# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
# ║                                                                               ║
# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
# ║                                                                               ║
# ║  ✅ TO EDIT: src/XX_embeddings/XX_embeddings.py                     ║
# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
# ║                                                                               ║
# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
# ║     Editing it directly may break module functionality and training.         ║
# ║                                                                               ║
# ║  🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners)    ║
# ║     The tinytorch/ directory is generated code - edit source files instead!  ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['BYTES_PER_FLOAT32', 'MB_TO_BYTES', 'Embedding', 'PositionalEncoding', 'EmbeddingLayer']

# %% ../../modules/11_embeddings/embeddings.ipynb 2
import numpy as np
import math
from typing import List, Optional, Tuple

# Import from previous modules - following dependency chain
from ..core.tensor import Tensor
from ..core.autograd import EmbeddingBackward

# Constants for memory calculations
BYTES_PER_FLOAT32 = 4  # Standard float32 size in bytes
MB_TO_BYTES = 1024 * 1024  # Megabytes to bytes conversion

# %% ../../modules/11_embeddings/embeddings.ipynb 6
class Embedding:
    """
    Learnable embedding layer that maps token indices to dense vectors.

    This is the fundamental building block for converting discrete tokens
    into continuous representations that neural networks can process.

    TODO: Implement the Embedding class

    APPROACH:
    1. Initialize embedding matrix with random weights (vocab_size, embed_dim)
    2. Implement forward pass as matrix lookup using numpy indexing
    3. Handle batch dimensions correctly
    4. Return parameters for optimization

    EXAMPLE:
    >>> embed = Embedding(vocab_size=100, embed_dim=64)
    >>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])  # batch_size=2, seq_len=3
    >>> output = embed.forward(tokens)
    >>> print(output.shape)
    (2, 3, 64)

    HINTS:
    - Use numpy advanced indexing for lookup: weight[indices]
    - Embedding matrix shape: (vocab_size, embed_dim)
    - Initialize with Xavier/Glorot uniform for stable gradients
    - Handle multi-dimensional indices correctly
    """

    ### BEGIN SOLUTION
    def __init__(self, vocab_size: int, embed_dim: int):
        """
        Initialize embedding layer.

        Args:
            vocab_size: Size of vocabulary (number of unique tokens)
            embed_dim: Dimension of embedding vectors
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        # Xavier initialization for better gradient flow
        limit = math.sqrt(6.0 / (vocab_size + embed_dim))
        self.weight = Tensor(
            np.random.uniform(-limit, limit, (vocab_size, embed_dim)),
            requires_grad=True
        )

    def forward(self, indices: Tensor) -> Tensor:
        """
        Forward pass: lookup embeddings for given indices.

        Args:
            indices: Token indices of shape (batch_size, seq_len) or (seq_len,)

        Returns:
            Embedded vectors of shape (*indices.shape, embed_dim)
        """
        # Handle input validation
        if np.any(indices.data >= self.vocab_size) or np.any(indices.data < 0):
            raise ValueError(
                f"Index out of range. Expected 0 <= indices < {self.vocab_size}, "
                f"got min={np.min(indices.data)}, max={np.max(indices.data)}"
            )

        # Perform embedding lookup using advanced indexing
        # This is equivalent to one-hot multiplication but much more efficient
        embedded = self.weight.data[indices.data.astype(int)]

        # Create result tensor with gradient tracking
        result = Tensor(embedded, requires_grad=self.weight.requires_grad)

        # Attach backward function for gradient computation (following TinyTorch protocol)
        if result.requires_grad:
            result._grad_fn = EmbeddingBackward(self.weight, indices)

        return result

    def __call__(self, indices: Tensor) -> Tensor:
        """Allows the embedding to be called like a function."""
        return self.forward(indices)

    def parameters(self) -> List[Tensor]:
        """Return trainable parameters."""
        return [self.weight]

    def __repr__(self):
        return f"Embedding(vocab_size={self.vocab_size}, embed_dim={self.embed_dim})"
    ### END SOLUTION

# %% ../../modules/11_embeddings/embeddings.ipynb 10
class PositionalEncoding:
    """
    Learnable positional encoding layer.

    Adds trainable position-specific vectors to token embeddings,
    allowing the model to learn positional patterns specific to the task.

    TODO: Implement learnable positional encoding

    APPROACH:
    1. Create embedding matrix for positions: (max_seq_len, embed_dim)
    2. Forward pass: lookup position embeddings and add to input
    3. Handle different sequence lengths gracefully
    4. Return parameters for training

    EXAMPLE:
    >>> pos_enc = PositionalEncoding(max_seq_len=512, embed_dim=64)
    >>> embeddings = Tensor(np.random.randn(2, 10, 64))  # (batch, seq, embed)
    >>> output = pos_enc.forward(embeddings)
    >>> print(output.shape)
    (2, 10, 64)  # Same shape, but now position-aware

    HINTS:
    - Position embeddings shape: (max_seq_len, embed_dim)
    - Use slice [:seq_len] to handle variable lengths
    - Add position encodings to input embeddings element-wise
    - Initialize with smaller values than token embeddings (they're additive)
    """

    ### BEGIN SOLUTION
    def __init__(self, max_seq_len: int, embed_dim: int):
        """
        Initialize learnable positional encoding.

        Args:
            max_seq_len: Maximum sequence length to support
            embed_dim: Embedding dimension (must match token embeddings)
        """
        self.max_seq_len = max_seq_len
        self.embed_dim = embed_dim

        # Initialize position embedding matrix
        # Smaller initialization than token embeddings since these are additive
        limit = math.sqrt(2.0 / embed_dim)
        self.position_embeddings = Tensor(
            np.random.uniform(-limit, limit, (max_seq_len, embed_dim)),
            requires_grad=True
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Add positional encodings to input embeddings.

        Args:
            x: Input embeddings of shape (batch_size, seq_len, embed_dim)

        Returns:
            Position-encoded embeddings of same shape
        """
        if len(x.shape) != 3:
            raise ValueError(f"Expected 3D input (batch, seq, embed), got shape {x.shape}")

        batch_size, seq_len, embed_dim = x.shape

        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds maximum {self.max_seq_len}"
            )

        if embed_dim != self.embed_dim:
            raise ValueError(
                f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}"
            )

        # Slice position embeddings for this sequence length using Tensor slicing
        # This now preserves gradient flow (as of Module 01 update with __getitem__)
        pos_embeddings = self.position_embeddings[:seq_len]  # (seq_len, embed_dim) - gradients preserved!

        # Reshape to add batch dimension: (1, seq_len, embed_dim)
        # Need to use .data for reshaping temporarily, then wrap in Tensor
        pos_data = pos_embeddings.data[np.newaxis, :, :]
        pos_embeddings_batched = Tensor(pos_data, requires_grad=pos_embeddings.requires_grad)

        # Copy gradient function if it exists (to preserve backward connection)
        if hasattr(pos_embeddings, '_grad_fn') and pos_embeddings._grad_fn is not None:
            pos_embeddings_batched._grad_fn = pos_embeddings._grad_fn

        # Add positional information - gradients flow through both x and pos_embeddings!
        result = x + pos_embeddings_batched

        return result

    def __call__(self, x: Tensor) -> Tensor:
        """Allows the positional encoding to be called like a function."""
        return self.forward(x)

    def parameters(self) -> List[Tensor]:
        """Return trainable parameters."""
        return [self.position_embeddings]

    def __repr__(self):
        return f"PositionalEncoding(max_seq_len={self.max_seq_len}, embed_dim={self.embed_dim})"
    ### END SOLUTION

# %% ../../modules/11_embeddings/embeddings.ipynb 18
class EmbeddingLayer:
    """
    Complete embedding system combining token and positional embeddings.

    This is the production-ready component that handles the full embedding
    pipeline used in transformers and other sequence models.

    TODO: Implement complete embedding system

    APPROACH:
    1. Combine token embedding + positional encoding
    2. Support both learned and sinusoidal position encodings
    3. Handle variable sequence lengths gracefully
    4. Add optional embedding scaling (Transformer convention)

    EXAMPLE:
    >>> embed_layer = EmbeddingLayer(
    ...     vocab_size=50000,
    ...     embed_dim=512,
    ...     max_seq_len=2048,
    ...     pos_encoding='learned'
    ... )
    >>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])
    >>> output = embed_layer.forward(tokens)
    >>> print(output.shape)
    (2, 3, 512)

    HINTS:
    - First apply token embedding, then add positional encoding
    - Support 'learned', 'sinusoidal', or None for pos_encoding
    - Handle both 2D (batch, seq) and 1D (seq) inputs gracefully
    - Scale embeddings by sqrt(embed_dim) if requested (transformer convention)
    """

    ### BEGIN SOLUTION
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        max_seq_len: int = 512,
        pos_encoding: str = 'learned',
        scale_embeddings: bool = False
    ):
        """
        Initialize complete embedding system.

        Args:
            vocab_size: Size of vocabulary
            embed_dim: Embedding dimension
            max_seq_len: Maximum sequence length for positional encoding
            pos_encoding: Type of positional encoding ('learned', 'sinusoidal', or None)
            scale_embeddings: Whether to scale embeddings by sqrt(embed_dim)
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len
        self.pos_encoding_type = pos_encoding
        self.scale_embeddings = scale_embeddings

        # Token embedding layer
        self.token_embedding = Embedding(vocab_size, embed_dim)

        # Positional encoding
        if pos_encoding == 'learned':
            self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim)
        elif pos_encoding == 'sinusoidal':
            # Create fixed sinusoidal encodings (no parameters)
            self.pos_encoding = create_sinusoidal_embeddings(max_seq_len, embed_dim)
        elif pos_encoding is None:
            self.pos_encoding = None
        else:
            raise ValueError(f"Unknown pos_encoding: {pos_encoding}. Use 'learned', 'sinusoidal', or None")

    def forward(self, tokens: Tensor) -> Tensor:
        """
        Forward pass through complete embedding system.

        Args:
            tokens: Token indices of shape (batch_size, seq_len) or (seq_len,)

        Returns:
            Embedded tokens with positional information
        """
        # Handle 1D input by adding batch dimension
        if len(tokens.shape) == 1:
            # NOTE: Tensor reshape preserves gradients
            tokens = tokens.reshape(1, -1)
            squeeze_batch = True
        else:
            squeeze_batch = False

        # Get token embeddings
        token_embeds = self.token_embedding.forward(tokens)  # (batch, seq, embed)

        # Scale embeddings if requested (transformer convention)
        if self.scale_embeddings:
            scale_factor = math.sqrt(self.embed_dim)
            token_embeds = token_embeds * scale_factor  # Use Tensor multiplication to preserve gradients

        # Add positional encoding
        if self.pos_encoding_type == 'learned':
            # Use learnable positional encoding
            output = self.pos_encoding.forward(token_embeds)
        elif self.pos_encoding_type == 'sinusoidal':
            # Use fixed sinusoidal encoding (not learnable)
            batch_size, seq_len, embed_dim = token_embeds.shape
            pos_embeddings = self.pos_encoding[:seq_len]  # Slice using Tensor slicing

            # Reshape to add batch dimension
            pos_data = pos_embeddings.data[np.newaxis, :, :]
            pos_embeddings_batched = Tensor(pos_data, requires_grad=False)  # Sinusoidal are fixed

            output = token_embeds + pos_embeddings_batched
        else:
            # No positional encoding
            output = token_embeds

        # Remove batch dimension if it was added
        if squeeze_batch:
            # Use Tensor slicing (now supported in Module 01)
            output = output[0]

        return output

    def __call__(self, tokens: Tensor) -> Tensor:
        """Allows the embedding layer to be called like a function."""
        return self.forward(tokens)

    def parameters(self) -> List[Tensor]:
        """Return all trainable parameters."""
        params = self.token_embedding.parameters()

        if self.pos_encoding_type == 'learned':
            params.extend(self.pos_encoding.parameters())

        return params

    def __repr__(self):
        return (f"EmbeddingLayer(vocab_size={self.vocab_size}, "
                f"embed_dim={self.embed_dim}, "
                f"pos_encoding='{self.pos_encoding_type}')")
    ### END SOLUTION