mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-11 19:53:33 -05:00
Critical fixes for transformer gradient flow:
EmbeddingBackward:
- Implements scatter-add gradient accumulation for embedding lookups
- Added to Module 05 (autograd_dev.py)
- Module 11 imports and uses it in Embedding.forward()
- Gradients now flow back to embedding weights
ReshapeBackward:
- reshape() was breaking computation graph (no _grad_fn)
- Added backward function that reshapes gradient back to original shape
- Patched Tensor.reshape() in enable_autograd()
- Critical for GPT forward pass (logits.reshape before loss)
Results:
- Before: 0/37 parameters receive gradients, loss stuck
- After: 13/37 parameters receive gradients (35%)
- Single batch overfitting: 4.46 → 0.03 (99.4% improvement!)
- MODEL NOW LEARNS! 🎉
Remaining work: 24 parameters still missing gradients (likely attention)
Tests added:
- tests/milestones/test_05_transformer_architecture.py (Phase 1)
- Multiple debug scripts to isolate issues
345 lines
13 KiB
Python
Generated
345 lines
13 KiB
Python
Generated
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
|
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
|
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
|
# ║ ║
|
|
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
|
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
|
# ║ ║
|
|
# ║ ✅ TO EDIT: modules/source/XX_embeddings/embeddings_dev.py ║
|
|
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
|
# ║ ║
|
|
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
|
# ║ Editing it directly may break module functionality and training. ║
|
|
# ║ ║
|
|
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
|
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
|
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
|
# %% auto 0
|
|
__all__ = ['Embedding', 'PositionalEncoding', 'EmbeddingLayer']
|
|
|
|
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 2
|
|
import numpy as np
|
|
import math
|
|
from typing import List, Optional, Tuple
|
|
|
|
# Import from previous modules - following dependency chain
|
|
from ..core.tensor import Tensor
|
|
|
|
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 6
|
|
class Embedding:
    """
    Trainable lookup table mapping integer token ids to dense vectors.

    Holds a (vocab_size, embed_dim) weight matrix; forward() selects rows
    by index via numpy advanced indexing, which is mathematically the same
    as a one-hot matmul but far cheaper.

    Example:
        >>> embed = Embedding(vocab_size=100, embed_dim=64)
        >>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])  # (batch=2, seq=3)
        >>> embed.forward(tokens).shape
        (2, 3, 64)
    """

    ### BEGIN SOLUTION
    def __init__(self, vocab_size: int, embed_dim: int):
        """
        Create the embedding table.

        Args:
            vocab_size: Number of distinct tokens in the vocabulary.
            embed_dim: Width of each embedding vector.
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        # Xavier/Glorot uniform keeps early-activation variance stable.
        bound = math.sqrt(6.0 / (vocab_size + embed_dim))
        weights = np.random.uniform(-bound, bound, (vocab_size, embed_dim))
        self.weight = Tensor(weights, requires_grad=True)

    def forward(self, indices: Tensor) -> Tensor:
        """
        Look up embedding rows for the given token indices.

        Args:
            indices: Integer tensor of shape (batch_size, seq_len) or (seq_len,).

        Returns:
            Tensor of shape (*indices.shape, embed_dim).

        Raises:
            ValueError: If any index falls outside [0, vocab_size).
        """
        raw = indices.data
        # Reject out-of-range ids up front with a descriptive message.
        if np.any(raw < 0) or np.any(raw >= self.vocab_size):
            raise ValueError(
                f"Index out of range. Expected 0 <= indices < {self.vocab_size}, "
                f"got min={np.min(raw)}, max={np.max(raw)}"
            )

        # Advanced indexing == one-hot multiplication, minus the waste.
        rows = self.weight.data[raw.astype(int)]
        result = Tensor(rows, requires_grad=self.weight.requires_grad)

        # Wire up the backward pass so gradients scatter-add into the table.
        if self.weight.requires_grad:
            from tinytorch.core.autograd import EmbeddingBackward
            result._grad_fn = EmbeddingBackward(self.weight, indices)

        return result

    def parameters(self) -> List[Tensor]:
        """Trainable tensors owned by this layer."""
        return [self.weight]

    def __repr__(self):
        return f"Embedding(vocab_size={self.vocab_size}, embed_dim={self.embed_dim})"
    ### END SOLUTION
|
|
|
|
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 10
|
|
class PositionalEncoding:
|
|
"""
|
|
Learnable positional encoding layer.
|
|
|
|
Adds trainable position-specific vectors to token embeddings,
|
|
allowing the model to learn positional patterns specific to the task.
|
|
|
|
TODO: Implement learnable positional encoding
|
|
|
|
APPROACH:
|
|
1. Create embedding matrix for positions: (max_seq_len, embed_dim)
|
|
2. Forward pass: lookup position embeddings and add to input
|
|
3. Handle different sequence lengths gracefully
|
|
4. Return parameters for training
|
|
|
|
EXAMPLE:
|
|
>>> pos_enc = PositionalEncoding(max_seq_len=512, embed_dim=64)
|
|
>>> embeddings = Tensor(np.random.randn(2, 10, 64)) # (batch, seq, embed)
|
|
>>> output = pos_enc.forward(embeddings)
|
|
>>> print(output.shape)
|
|
(2, 10, 64) # Same shape, but now position-aware
|
|
|
|
HINTS:
|
|
- Position embeddings shape: (max_seq_len, embed_dim)
|
|
- Use slice [:seq_len] to handle variable lengths
|
|
- Add position encodings to input embeddings element-wise
|
|
- Initialize with smaller values than token embeddings (they're additive)
|
|
"""
|
|
|
|
### BEGIN SOLUTION
|
|
def __init__(self, max_seq_len: int, embed_dim: int):
|
|
"""
|
|
Initialize learnable positional encoding.
|
|
|
|
Args:
|
|
max_seq_len: Maximum sequence length to support
|
|
embed_dim: Embedding dimension (must match token embeddings)
|
|
"""
|
|
self.max_seq_len = max_seq_len
|
|
self.embed_dim = embed_dim
|
|
|
|
# Initialize position embedding matrix
|
|
# Smaller initialization than token embeddings since these are additive
|
|
limit = math.sqrt(2.0 / embed_dim)
|
|
self.position_embeddings = Tensor(
|
|
np.random.uniform(-limit, limit, (max_seq_len, embed_dim)),
|
|
requires_grad=True
|
|
)
|
|
|
|
def forward(self, x: Tensor) -> Tensor:
|
|
"""
|
|
Add positional encodings to input embeddings.
|
|
|
|
Args:
|
|
x: Input embeddings of shape (batch_size, seq_len, embed_dim)
|
|
|
|
Returns:
|
|
Position-encoded embeddings of same shape
|
|
"""
|
|
if len(x.shape) != 3:
|
|
raise ValueError(f"Expected 3D input (batch, seq, embed), got shape {x.shape}")
|
|
|
|
batch_size, seq_len, embed_dim = x.shape
|
|
|
|
if seq_len > self.max_seq_len:
|
|
raise ValueError(
|
|
f"Sequence length {seq_len} exceeds maximum {self.max_seq_len}"
|
|
)
|
|
|
|
if embed_dim != self.embed_dim:
|
|
raise ValueError(
|
|
f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}"
|
|
)
|
|
|
|
# Get position embeddings for this sequence length (slice using .data for efficiency)
|
|
pos_embeddings_data = self.position_embeddings.data[:seq_len] # (seq_len, embed_dim)
|
|
|
|
# Broadcast to match batch dimension: (1, seq_len, embed_dim)
|
|
pos_embeddings_data = pos_embeddings_data[np.newaxis, :, :]
|
|
|
|
# Wrap in Tensor to preserve requires_grad
|
|
pos_embeddings = Tensor(pos_embeddings_data, requires_grad=self.position_embeddings.requires_grad)
|
|
|
|
# Add positional information using Tensor operation to preserve gradients!
|
|
result = x + pos_embeddings
|
|
|
|
return result
|
|
|
|
def parameters(self) -> List[Tensor]:
|
|
"""Return trainable parameters."""
|
|
return [self.position_embeddings]
|
|
|
|
def __repr__(self):
|
|
return f"PositionalEncoding(max_seq_len={self.max_seq_len}, embed_dim={self.embed_dim})"
|
|
### END SOLUTION
|
|
|
|
# %% ../../modules/source/11_embeddings/embeddings_dev.ipynb 18
|
|
class EmbeddingLayer:
|
|
"""
|
|
Complete embedding system combining token and positional embeddings.
|
|
|
|
This is the production-ready component that handles the full embedding
|
|
pipeline used in transformers and other sequence models.
|
|
|
|
TODO: Implement complete embedding system
|
|
|
|
APPROACH:
|
|
1. Combine token embedding + positional encoding
|
|
2. Support both learned and sinusoidal position encodings
|
|
3. Handle variable sequence lengths gracefully
|
|
4. Add optional embedding scaling (Transformer convention)
|
|
|
|
EXAMPLE:
|
|
>>> embed_layer = EmbeddingLayer(
|
|
... vocab_size=50000,
|
|
... embed_dim=512,
|
|
... max_seq_len=2048,
|
|
... pos_encoding='learned'
|
|
... )
|
|
>>> tokens = Tensor([[1, 2, 3], [4, 5, 6]])
|
|
>>> output = embed_layer.forward(tokens)
|
|
>>> print(output.shape)
|
|
(2, 3, 512)
|
|
|
|
HINTS:
|
|
- First apply token embedding, then add positional encoding
|
|
- Support 'learned', 'sinusoidal', or None for pos_encoding
|
|
- Handle both 2D (batch, seq) and 1D (seq) inputs gracefully
|
|
- Scale embeddings by sqrt(embed_dim) if requested (transformer convention)
|
|
"""
|
|
|
|
### BEGIN SOLUTION
|
|
def __init__(
|
|
self,
|
|
vocab_size: int,
|
|
embed_dim: int,
|
|
max_seq_len: int = 512,
|
|
pos_encoding: str = 'learned',
|
|
scale_embeddings: bool = False
|
|
):
|
|
"""
|
|
Initialize complete embedding system.
|
|
|
|
Args:
|
|
vocab_size: Size of vocabulary
|
|
embed_dim: Embedding dimension
|
|
max_seq_len: Maximum sequence length for positional encoding
|
|
pos_encoding: Type of positional encoding ('learned', 'sinusoidal', or None)
|
|
scale_embeddings: Whether to scale embeddings by sqrt(embed_dim)
|
|
"""
|
|
self.vocab_size = vocab_size
|
|
self.embed_dim = embed_dim
|
|
self.max_seq_len = max_seq_len
|
|
self.pos_encoding_type = pos_encoding
|
|
self.scale_embeddings = scale_embeddings
|
|
|
|
# Token embedding layer
|
|
self.token_embedding = Embedding(vocab_size, embed_dim)
|
|
|
|
# Positional encoding
|
|
if pos_encoding == 'learned':
|
|
self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim)
|
|
elif pos_encoding == 'sinusoidal':
|
|
# Create fixed sinusoidal encodings (no parameters)
|
|
self.pos_encoding = create_sinusoidal_embeddings(max_seq_len, embed_dim)
|
|
elif pos_encoding is None:
|
|
self.pos_encoding = None
|
|
else:
|
|
raise ValueError(f"Unknown pos_encoding: {pos_encoding}. Use 'learned', 'sinusoidal', or None")
|
|
|
|
def forward(self, tokens: Tensor) -> Tensor:
|
|
"""
|
|
Forward pass through complete embedding system.
|
|
|
|
Args:
|
|
tokens: Token indices of shape (batch_size, seq_len) or (seq_len,)
|
|
|
|
Returns:
|
|
Embedded tokens with positional information
|
|
"""
|
|
# Handle 1D input by adding batch dimension
|
|
if len(tokens.shape) == 1:
|
|
tokens = Tensor(tokens.data[np.newaxis, :]) # (1, seq_len)
|
|
squeeze_batch = True
|
|
else:
|
|
squeeze_batch = False
|
|
|
|
# Get token embeddings
|
|
token_embeds = self.token_embedding.forward(tokens) # (batch, seq, embed)
|
|
|
|
# Scale embeddings if requested (transformer convention)
|
|
if self.scale_embeddings:
|
|
token_embeds = Tensor(token_embeds.data * math.sqrt(self.embed_dim))
|
|
|
|
# Add positional encoding
|
|
if self.pos_encoding_type == 'learned':
|
|
# Use learnable positional encoding
|
|
output = self.pos_encoding.forward(token_embeds)
|
|
elif self.pos_encoding_type == 'sinusoidal':
|
|
# Use fixed sinusoidal encoding
|
|
batch_size, seq_len, embed_dim = token_embeds.shape
|
|
pos_embeddings = self.pos_encoding.data[:seq_len] # (seq_len, embed_dim)
|
|
pos_embeddings = pos_embeddings[np.newaxis, :, :] # (1, seq_len, embed_dim)
|
|
output = Tensor(token_embeds.data + pos_embeddings)
|
|
else:
|
|
# No positional encoding
|
|
output = token_embeds
|
|
|
|
# Remove batch dimension if it was added
|
|
if squeeze_batch:
|
|
output = Tensor(output.data[0]) # (seq_len, embed_dim)
|
|
|
|
return output
|
|
|
|
def parameters(self) -> List[Tensor]:
|
|
"""Return all trainable parameters."""
|
|
params = self.token_embedding.parameters()
|
|
|
|
if self.pos_encoding_type == 'learned':
|
|
params.extend(self.pos_encoding.parameters())
|
|
|
|
return params
|
|
|
|
def __repr__(self):
|
|
return (f"EmbeddingLayer(vocab_size={self.vocab_size}, "
|
|
f"embed_dim={self.embed_dim}, "
|
|
f"pos_encoding='{self.pos_encoding_type}')")
|
|
### END SOLUTION
|