# TinyTorch/tinytorch/core/embeddings.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/12_embeddings/embeddings_dev.ipynb.

# %% auto 0
__all__ = ['Embedding', 'PositionalEncoding', 'LearnedPositionalEmbedding', 'EmbeddingProfiler',
           'analyze_embedding_system_design']

# %% ../../modules/12_embeddings/embeddings_dev.ipynb 1
import math
import numpy as np
import os
import sys
from typing import Union, List, Optional, Tuple

# Import our Tensor class - try from package first, then from local module
try:
    from tinytorch.core.tensor import Tensor
except ImportError:
    # For development, import from local tensor module
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor'))
    from tensor_dev import Tensor

# Resolve the tokenizer classes in three steps, from most to least preferred:
#   1. the installed tinytorch package,
#   2. the local development module under ../11_tokenization,
#   3. minimal stand-ins defined right here.
try:
    from tinytorch.core.tokenization import CharTokenizer, BPETokenizer
except ImportError:
    # Package import failed: make the sibling development directory importable.
    # Note this appends to sys.path as an import-time side effect.
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '11_tokenization'))
    try:
        from tokenization_dev import CharTokenizer, BPETokenizer
    except ImportError:
        # Neither source is available. These stand-ins expose only a
        # `vocab_size` attribute and provide no encode/decode behaviour.
        class CharTokenizer:
            def __init__(self):
                # Fixed vocabulary size used by the stand-in.
                self.vocab_size = 256
        class BPETokenizer:
            def __init__(self, vocab_size=1000):
                # The stand-in simply records the requested vocabulary size.
                self.vocab_size = vocab_size

# %% ../../modules/12_embeddings/embeddings_dev.ipynb 6
class Embedding:
    """
    Learnable lookup table that maps integer token ids to dense vectors.

    The table lives in ``self.weight`` with shape (vocab_size, embedding_dim);
    row ``i`` is the vector returned for token id ``i``.
    """

    def __init__(self, vocab_size: int, embedding_dim: int,
                 padding_idx: Optional[int] = None,
                 init_type: str = 'uniform'):
        """
        Create the embedding table and register it as a parameter.

        Initialization strategies:
        - 'uniform': U(-b, b) with b = 1/sqrt(embedding_dim)
        - 'normal':  N(0, s)  with s = 1/sqrt(embedding_dim)
        - 'xavier':  U(-b, b) with b = sqrt(6 / (vocab_size + embedding_dim))

        Args:
            vocab_size: Number of rows in the table (one per token id)
            embedding_dim: Length of each embedding vector; must be positive
            padding_idx: Optional row that is zeroed at construction time.
                Nothing in this class keeps that row at zero afterwards.
            init_type: Initialization strategy ('uniform', 'normal', 'xavier')

        Raises:
            ValueError: If embedding_dim is not positive or init_type is unknown
        """
        ### BEGIN SOLUTION
        # Fail early with a clear message; otherwise the 1/sqrt(dim) scale
        # below surfaces as a ZeroDivisionError or "math domain error".
        if embedding_dim <= 0:
            raise ValueError(f"embedding_dim must be positive, got {embedding_dim}")

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.init_type = init_type

        table_shape = (vocab_size, embedding_dim)
        if init_type == 'uniform':
            bound = 1.0 / math.sqrt(embedding_dim)
            table = np.random.uniform(-bound, bound, table_shape)
        elif init_type == 'normal':
            std = 1.0 / math.sqrt(embedding_dim)
            table = np.random.normal(0, std, table_shape)
        elif init_type == 'xavier':
            bound = math.sqrt(6.0 / (vocab_size + embedding_dim))
            table = np.random.uniform(-bound, bound, table_shape)
        else:
            raise ValueError(f"Unknown init_type: {init_type}")
        self.weight = Tensor(table)

        # The padding row starts out as the zero vector
        if padding_idx is not None:
            self.weight.data[padding_idx] = 0.0

        # Parameters exposed to an optimizer
        self.parameters = [self.weight]
        ### END SOLUTION

    def forward(self, input_ids: Union[Tensor, List[int], np.ndarray]) -> Tensor:
        """
        Look up the embedding vector for every token id in ``input_ids``.

        EXAMPLE:
        embed = Embedding(vocab_size=100, embedding_dim=64)
        tokens = Tensor([[1, 2, 3], [4, 5, 6]])  # Shape: (2, 3)
        embeddings = embed.forward(tokens)  # Shape: (2, 3, 64)

        Args:
            input_ids: Token ids as a Tensor, or anything ``np.asarray`` can
                convert (list, tuple, ndarray, scalar). Values are cast to
                int before the lookup.

        Returns:
            New Tensor with shape (*input_shape, embedding_dim)

        Raises:
            ValueError: If any id is outside [0, vocab_size)
        """
        ### BEGIN SOLUTION
        raw_ids = input_ids.data if isinstance(input_ids, Tensor) else input_ids

        # np.asarray accepts tuples and scalars too, not just lists/ndarrays
        indices = np.asarray(raw_ids).astype(int)

        # Reject negatives explicitly: numpy would silently wrap them around
        if np.any(indices < 0) or np.any(indices >= self.vocab_size):
            raise ValueError(f"Token indices must be in range [0, {self.vocab_size})")

        # Advanced indexing: indices of shape (...) select rows of the
        # (vocab_size, embedding_dim) table, giving (..., embedding_dim)
        embeddings = self.weight.data[indices]

        return Tensor(embeddings)
        ### END SOLUTION

    def __call__(self, input_ids: Union[Tensor, List[int], np.ndarray]) -> Tensor:
        """Make the layer callable; equivalent to ``forward(input_ids)``."""
        return self.forward(input_ids)

    def get_memory_usage(self):
        """
        Report the memory footprint of the embedding table.

        Returns:
            Dict with 'total_memory_mb', 'memory_per_token_kb',
            'total_parameters', 'vocab_size' and 'embedding_dim'.
        """
        weights = self.weight.data

        # Whole-table footprint, taken from the array's real byte count
        weight_memory_mb = weights.nbytes / (1024 * 1024)

        # Per-row footprint from the array's actual element size, so this
        # figure stays consistent with total_memory_mb for any dtype
        memory_per_token_kb = (self.embedding_dim * weights.itemsize) / 1024

        return {
            'total_memory_mb': weight_memory_mb,
            'memory_per_token_kb': memory_per_token_kb,
            'total_parameters': self.vocab_size * self.embedding_dim,
            'vocab_size': self.vocab_size,
            'embedding_dim': self.embedding_dim
        }

# %% ../../modules/12_embeddings/embeddings_dev.ipynb 10
class PositionalEncoding:
    """
    Sinusoidal positional encoding that adds position information to embeddings.

    A fixed table of sines and cosines at geometrically spaced frequencies is
    precomputed once and added to the input embeddings in ``forward``.
    """

    def __init__(self, embedding_dim: int, max_seq_length: int = 5000,
                 dropout: float = 0.0):
        """
        Precompute the encoding table of shape (max_seq_length, embedding_dim).

        MATHEMATICAL FOUNDATION:
        PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

        Where:
        - pos = position in sequence
        - i = index of the sine/cosine pair
        - d_model = embedding_dim

        Args:
            embedding_dim: Dimension of embeddings. Odd values are supported:
                the last sine column then has no cosine partner.
            max_seq_length: Number of positions to precompute
            dropout: Stored on the instance; not applied anywhere in this class
        """
        ### BEGIN SOLUTION
        self.embedding_dim = embedding_dim
        self.max_seq_length = max_seq_length
        self.dropout = dropout

        pe = np.zeros((max_seq_length, embedding_dim))

        # Column vector of positions 0 .. max_seq_length-1
        position = np.arange(0, max_seq_length).reshape(-1, 1)

        # div_term[i] = 10000^(-2i/d_model), i.e. the reciprocal of the
        # denominator in the formula, computed via exp/log
        div_term = np.exp(np.arange(0, embedding_dim, 2) *
                         -(math.log(10000.0) / embedding_dim))

        # angles[pos, i] = pos / 10000^(2i/d_model)
        angles = position * div_term

        # Sine fills the even columns (0, 2, 4, ...)
        pe[:, 0::2] = np.sin(angles)

        # Cosine fills the odd columns (1, 3, 5, ...). There are
        # embedding_dim // 2 of them, one fewer than the sine columns when
        # embedding_dim is odd.
        pe[:, 1::2] = np.cos(angles[:, :embedding_dim // 2])

        self.pe = Tensor(pe)
        ### END SOLUTION

    def forward(self, embeddings: Tensor) -> Tensor:
        """
        Add the precomputed positional encoding to ``embeddings``.

        EXAMPLE:
        pos_enc = PositionalEncoding(embedding_dim=64)
        embeddings = Tensor(np.random.randn(2, 10, 64))  # (batch, seq, dim)
        pos_embeddings = pos_enc.forward(embeddings)

        Args:
            embeddings: Input of shape (batch_size, seq_len, embedding_dim)
                or (seq_len, embedding_dim)

        Returns:
            New Tensor with the same shape as the input

        Raises:
            ValueError: If the input is not 2D/3D, its last dimension differs
                from embedding_dim, or seq_len exceeds max_seq_length
        """
        ### BEGIN SOLUTION
        if len(embeddings.shape) == 3:
            batch_size, seq_length, embed_dim = embeddings.shape
        elif len(embeddings.shape) == 2:
            seq_length, embed_dim = embeddings.shape
            batch_size = None
        else:
            raise ValueError(f"Expected 2D or 3D embeddings, got shape {embeddings.shape}")

        if embed_dim != self.embedding_dim:
            raise ValueError(f"Embedding dim mismatch: expected {self.embedding_dim}, got {embed_dim}")

        if seq_length > self.max_seq_length:
            raise ValueError(f"Sequence length {seq_length} exceeds max {self.max_seq_length}")

        # Only the first seq_length rows of the table are needed
        position_encodings = self.pe.data[:seq_length, :]

        if batch_size is not None:
            # Leading axis of size 1 broadcasts over the batch dimension:
            # (batch, seq, dim) + (1, seq, dim)
            result = embeddings.data + position_encodings[np.newaxis, :, :]
        else:
            # (seq, dim) + (seq, dim)
            result = embeddings.data + position_encodings

        return Tensor(result)
        ### END SOLUTION

    def __call__(self, embeddings: Tensor) -> Tensor:
        """Make the class callable; equivalent to ``forward(embeddings)``."""
        return self.forward(embeddings)

    def visualize_encoding(self, seq_length: int = 100, dims_to_show: int = 10) -> None:
        """
        Print a small table of encoding values plus a frequency summary.

        At most the first 10 positions and first 10 dimensions are printed,
        and never more than the precomputed table actually contains.

        Args:
            seq_length: Number of positions to take from the table
            dims_to_show: Number of dimensions to take from the table
        """
        print(f"📊 POSITIONAL ENCODING VISUALIZATION")
        print(f"Sequence length: {seq_length}, Dimensions shown: {dims_to_show}")
        print("=" * 60)

        # Slicing clamps to the table size, so pe_subset may be smaller than
        # (seq_length, dims_to_show)
        pe_subset = self.pe.data[:seq_length, :dims_to_show]

        # Bound the loops by what the slice really holds; using only the
        # requested sizes raised IndexError for tables with fewer than 10
        # rows or columns.
        rows_shown = min(seq_length, pe_subset.shape[0], 10)
        dims_shown = min(dims_to_show, pe_subset.shape[1], 10)

        print("First 10 positions, first 10 dimensions:")
        print("Pos", end="")
        for d in range(dims_shown):
            print(f"    Dim{d:2d}", end="")
        print()

        for pos in range(rows_shown):
            print(f"{pos:3d}", end="")
            for d in range(dims_shown):
                print(f"{pe_subset[pos, d]:8.3f}", end="")
            print()

        # Frequency analysis. Pair i oscillates at 10000^(-2i/d_model), so the
        # earliest dimensions carry the HIGHEST frequencies.
        print(f"\n📈 FREQUENCY ANALYSIS:")
        print("Even dimensions (sine): Higher frequencies for early dimensions")
        print("Odd dimensions (cosine): Same frequencies, phase-shifted")

        # Nominal bounds of the schedule: pair 0 has frequency 1, and the last
        # pair approaches 1/10000 as embedding_dim grows.
        min_freq = 1.0 / 10000
        max_freq = 1.0
        print(f"Frequency range: {min_freq:.6f} to {max_freq:.6f}")

# %% ../../modules/12_embeddings/embeddings_dev.ipynb 14
class LearnedPositionalEmbedding:
    """
    Positional information stored as a trainable lookup table.

    Each position 0 .. max_seq_length-1 owns one row of an Embedding table;
    the rows for the positions present in a sequence are added to the input.
    """

    def __init__(self, max_seq_length: int, embedding_dim: int):
        """
        Create the position table and expose its parameters.

        The table is an Embedding whose "vocabulary" is the set of positions,
        initialized with the 'normal' strategy.

        Args:
            max_seq_length: Maximum sequence length supported
            embedding_dim: Dimension of position embeddings
        """
        ### BEGIN SOLUTION
        self.max_seq_length = max_seq_length
        self.embedding_dim = embedding_dim

        # One table row per position
        table = Embedding(
            vocab_size=max_seq_length,
            embedding_dim=embedding_dim,
            init_type='normal'
        )
        self.position_embedding = table

        # Share the table's parameter list so an optimizer sees the same objects
        self.parameters = table.parameters
        ### END SOLUTION

    def forward(self, embeddings: Tensor) -> Tensor:
        """
        Add the learned position vectors to ``embeddings``.

        EXAMPLE:
        learned_pos = LearnedPositionalEmbedding(max_seq_length=100, embedding_dim=64)
        embeddings = Tensor(np.random.randn(2, 10, 64))  # (batch, seq, dim)
        pos_embeddings = learned_pos.forward(embeddings)

        Args:
            embeddings: Input of shape (batch_size, seq_len, embedding_dim)
                or (seq_len, embedding_dim)

        Returns:
            New Tensor with the same shape as the input

        Raises:
            ValueError: If the input is not 2D/3D, its last dimension differs
                from embedding_dim, or seq_len exceeds max_seq_length
        """
        ### BEGIN SOLUTION
        rank = len(embeddings.shape)
        if rank not in (2, 3):
            raise ValueError(f"Expected 2D or 3D embeddings, got shape {embeddings.shape}")

        # The last two axes are (seq, dim) for both accepted layouts
        seq_length, embed_dim = embeddings.shape[-2], embeddings.shape[-1]
        has_batch_axis = rank == 3

        if embed_dim != self.embedding_dim:
            raise ValueError(f"Embedding dim mismatch: expected {self.embedding_dim}, got {embed_dim}")

        if seq_length > self.max_seq_length:
            raise ValueError(f"Sequence length {seq_length} exceeds max {self.max_seq_length}")

        # Rows for positions 0 .. seq_length-1
        position_ids = list(range(seq_length))
        looked_up = self.position_embedding.forward(position_ids)

        offsets = looked_up.data
        if has_batch_axis:
            # Leading axis of size 1 broadcasts across the batch
            offsets = offsets[np.newaxis, :, :]

        return Tensor(embeddings.data + offsets)
        ### END SOLUTION

    def __call__(self, embeddings: Tensor) -> Tensor:
        """Make the class callable; equivalent to ``forward(embeddings)``."""
        return self.forward(embeddings)

# %% ../../modules/12_embeddings/embeddings_dev.ipynb 18
import time

class EmbeddingProfiler:
    """
    Performance profiling toolkit for embedding systems.

    Measures lookup latency/throughput, memory scaling of embedding tables,
    and compares sinusoidal against learned positional encodings.

    All timings use ``time.perf_counter``, a monotonic high-resolution clock
    meant for measuring short intervals; wall-clock ``time.time`` can be too
    coarse for sub-millisecond lookups and may jump when the system clock is
    adjusted.
    """

    def __init__(self):
        # The profiling methods return their results; none of them writes here.
        self.results = {}

    def measure_lookup_performance(self, embedding_layer: Embedding,
                                  batch_sizes: List[int], seq_lengths: List[int]):
        """
        Time one embedding lookup for every (batch_size, seq_length) pair.

        Random token ids in [0, vocab_size) are generated for each
        configuration and passed to ``embedding_layer.forward`` once.

        Args:
            embedding_layer: Layer to test; must expose ``vocab_size`` and a
                ``forward`` whose result has a ``.data`` array
            batch_sizes: List of batch sizes to test
            seq_lengths: List of sequence lengths to test

        Returns:
            Dict keyed by "batch_{B}_seq_{S}" with, per configuration:
            batch_size, seq_length, total_tokens, lookup_time_ms,
            tokens_per_second, input_memory_mb, output_memory_mb,
            memory_bandwidth_mb_s and time_per_token_us. Rate metrics are 0
            when the measured time is zero or there are no tokens.
        """
        ### BEGIN SOLUTION
        results = {}
        vocab_size = embedding_layer.vocab_size
        bytes_per_mb = 1024 * 1024

        for batch_size in batch_sizes:
            for seq_length in seq_lengths:
                token_indices = np.random.randint(0, vocab_size, (batch_size, seq_length))

                # Time only the lookup itself
                start = time.perf_counter()
                embeddings = embedding_layer.forward(token_indices)
                elapsed = time.perf_counter() - start

                lookup_time_ms = elapsed * 1000
                total_tokens = batch_size * seq_length
                tokens_per_second = total_tokens / elapsed if elapsed > 0 else 0

                # Bytes read (ids) plus bytes written (vectors) per second
                input_memory_mb = token_indices.nbytes / bytes_per_mb
                output_memory_mb = embeddings.data.nbytes / bytes_per_mb
                memory_bandwidth_mb_s = (
                    (input_memory_mb + output_memory_mb) / elapsed if elapsed > 0 else 0
                )

                config_key = f"batch_{batch_size}_seq_{seq_length}"
                results[config_key] = {
                    'batch_size': batch_size,
                    'seq_length': seq_length,
                    'total_tokens': total_tokens,
                    'lookup_time_ms': lookup_time_ms,
                    'tokens_per_second': tokens_per_second,
                    'input_memory_mb': input_memory_mb,
                    'output_memory_mb': output_memory_mb,
                    'memory_bandwidth_mb_s': memory_bandwidth_mb_s,
                    'time_per_token_us': lookup_time_ms * 1000 / total_tokens if total_tokens > 0 else 0
                }

        return results
        ### END SOLUTION

    @staticmethod
    def _memory_ratio(scaling_results, small_key, large_key):
        """Return memory(large) / memory(small), or None if a config is missing or the baseline is empty."""
        if small_key not in scaling_results or large_key not in scaling_results:
            return None
        baseline_mb = scaling_results[small_key]['memory_mb']
        if baseline_mb <= 0:
            return None
        return scaling_results[large_key]['memory_mb'] / baseline_mb

    def analyze_memory_scaling(self, vocab_sizes: List[int], embedding_dims: List[int]):
        """
        Print and return how table memory scales with vocab size and dimension.

        An Embedding is built for every (vocab_size, embedding_dim) pair and a
        single lookup of a (32, 64) batch of random ids is timed. The vocabulary
        insight is printed whenever more than one vocab size is given, and the
        dimension insight whenever more than one dimension is given.

        Args:
            vocab_sizes: Vocabulary sizes to test
            embedding_dims: Embedding dimensions to test

        Returns:
            Dict keyed by "vocab_{V}_dim_{D}" with vocab_size, embedding_dim,
            total_parameters, memory_mb and lookup_time_ms
        """
        print("📊 EMBEDDING MEMORY SCALING ANALYSIS")
        print("=" * 60)

        scaling_results = {}

        print(f"{'Vocab Size':<12} {'Embed Dim':<10} {'Parameters':<12} {'Memory (MB)':<12} {'Lookup Time':<12}")
        print("-" * 70)

        for vocab_size in vocab_sizes:
            for embed_dim in embedding_dims:
                embed = Embedding(vocab_size=vocab_size, embedding_dim=embed_dim)

                memory_stats = embed.get_memory_usage()
                total_memory_mb = memory_stats['total_memory_mb']
                total_params = memory_stats['total_parameters']

                # Time one lookup of a fixed-size batch so rows are comparable
                test_tokens = np.random.randint(0, vocab_size, (32, 64))
                start = time.perf_counter()
                embed.forward(test_tokens)
                lookup_time_ms = (time.perf_counter() - start) * 1000

                config_key = f"vocab_{vocab_size}_dim_{embed_dim}"
                scaling_results[config_key] = {
                    'vocab_size': vocab_size,
                    'embedding_dim': embed_dim,
                    'total_parameters': total_params,
                    'memory_mb': total_memory_mb,
                    'lookup_time_ms': lookup_time_ms
                }

                print(f"{vocab_size:<12,} {embed_dim:<10} {total_params:<12,} {total_memory_mb:<12.2f} {lookup_time_ms:<12.2f}")

        print(f"\n📈 SCALING INSIGHTS:")

        # Vocabulary scaling needs several vocab sizes but only one dimension
        if len(vocab_sizes) > 1 and len(embedding_dims) > 0:
            fixed_dim = embedding_dims[0]
            small_vocab = min(vocab_sizes)
            large_vocab = max(vocab_sizes)
            memory_ratio = self._memory_ratio(
                scaling_results,
                f"vocab_{small_vocab}_dim_{fixed_dim}",
                f"vocab_{large_vocab}_dim_{fixed_dim}",
            )
            # A non-empty baseline table implies small_vocab > 0
            if memory_ratio is not None:
                vocab_ratio = large_vocab / small_vocab
                print(f"   Vocabulary scaling: {vocab_ratio:.1f}x vocab → {memory_ratio:.1f}x memory (Linear)")

        # Dimension scaling needs several dimensions but only one vocab size
        if len(embedding_dims) > 1 and len(vocab_sizes) > 0:
            fixed_vocab = vocab_sizes[0]
            small_dim = min(embedding_dims)
            large_dim = max(embedding_dims)
            memory_ratio = self._memory_ratio(
                scaling_results,
                f"vocab_{fixed_vocab}_dim_{small_dim}",
                f"vocab_{fixed_vocab}_dim_{large_dim}",
            )
            # A non-empty baseline table implies small_dim > 0
            if memory_ratio is not None:
                dim_ratio = large_dim / small_dim
                print(f"   Dimension scaling: {dim_ratio:.1f}x dim → {memory_ratio:.1f}x memory (Linear)")

        return scaling_results

    def compare_positional_encodings(self, seq_length: int = 100, embedding_dim: int = 256):
        """
        Compare sinusoidal and learned positional encodings on one random batch.

        Both encoders are built with max_seq_length = 2 * seq_length and applied
        once to a (16, seq_length, embedding_dim) batch; only the forward call
        is timed.

        Args:
            seq_length: Sequence length of the test batch
            embedding_dim: Embedding dimension of the test batch

        Returns:
            Dict with 'sinusoidal' and 'learned' entries, each holding
            computation_time_ms, memory_usage_mb, parameters, deterministic
            and extrapolation
        """
        print(f"\n🔍 POSITIONAL ENCODING COMPARISON")
        print("=" * 50)

        batch_size = 16
        embeddings = Tensor(np.random.randn(batch_size, seq_length, embedding_dim))

        # Sinusoidal: construction (table precompute) is outside the timed region
        sinusoidal_pe = PositionalEncoding(embedding_dim=embedding_dim, max_seq_length=seq_length*2)
        start = time.perf_counter()
        sinusoidal_pe.forward(embeddings)
        sin_time = (time.perf_counter() - start) * 1000

        # Learned: same protocol
        learned_pe = LearnedPositionalEmbedding(max_seq_length=seq_length*2, embedding_dim=embedding_dim)
        start = time.perf_counter()
        learned_pe.forward(embeddings)
        learned_time = (time.perf_counter() - start) * 1000

        # Only learnable parameters are counted as memory here
        sin_memory = 0
        learned_memory = learned_pe.position_embedding.get_memory_usage()['total_memory_mb']

        results = {
            'sinusoidal': {
                'computation_time_ms': sin_time,
                'memory_usage_mb': sin_memory,
                'parameters': 0,
                'deterministic': True,
                'extrapolation': 'Good (can handle longer sequences)'
            },
            'learned': {
                'computation_time_ms': learned_time,
                'memory_usage_mb': learned_memory,
                'parameters': seq_length * 2 * embedding_dim,
                'deterministic': False,
                'extrapolation': 'Limited (fixed max sequence length)'
            }
        }

        print(f"📊 COMPARISON RESULTS:")
        print(f"{'Method':<12} {'Time (ms)':<10} {'Memory (MB)':<12} {'Parameters':<12} {'Extrapolation'}")
        print("-" * 70)
        print(f"{'Sinusoidal':<12} {sin_time:<10.2f} {sin_memory:<12.2f} {0:<12,} {'Good'}")
        print(f"{'Learned':<12} {learned_time:<10.2f} {learned_memory:<12.2f} {results['learned']['parameters']:<12,} {'Limited'}")

        print(f"\n💡 INSIGHTS:")
        print(f"   - Sinusoidal: Zero parameters, deterministic, good extrapolation")
        print(f"   - Learned: Requires parameters, model-specific, limited extrapolation")
        print(f"   - Choice depends on: model capacity, sequence length requirements, extrapolation needs")

        return results

def analyze_embedding_system_design():
    """
    Print a report on embedding design choices for three example models.

    For each preset the embedding-table parameter count
    (vocab_size * embed_dim) and its size in MB at 4 bytes per parameter are
    tabulated, followed by fixed lists of design trade-offs and production
    considerations. Nothing is returned.
    """
    bytes_per_param = 4  # float32
    bytes_per_mb = 1024 * 1024

    # (name, vocab_size, embed_dim, seq_length)
    presets = (
        ('Small GPT', 10000, 256, 512),
        ('Medium GPT', 50000, 512, 1024),
        ('Large GPT', 50000, 1024, 2048),
    )

    trade_off_lines = (
        "   1. Vocabulary Size:",
        "      - Larger vocab: Better text coverage, more parameters",
        "      - Smaller vocab: Longer sequences, more compute",
        "   2. Embedding Dimension:",
        "      - Higher dim: More model capacity, more memory",
        "      - Lower dim: Faster computation, potential bottleneck",
        "   3. Position Encoding:",
        "      - Sinusoidal: No parameters, good extrapolation",
        "      - Learned: Model-specific, limited to training length",
        "   4. Memory Scaling:",
        "      - Embedding table: O(vocab_size × embed_dim)",
        "      - Sequence processing: O(batch_size × seq_length × embed_dim)",
        "      - Total memory dominated by model size, not embedding table",
    )

    production_lines = (
        "   - GPU memory limits affect maximum embedding table size",
        "   - Embedding lookup is memory-bandwidth bound",
        "   - Vocabulary size affects tokenization and model download size",
        "   - Position encoding choice affects sequence length flexibility",
    )

    print("🏗️ EMBEDDING SYSTEM DESIGN ANALYSIS")
    print("=" * 60)

    # Per-model table
    print("📋 MODEL CONFIGURATION COMPARISON:")
    print(f"{'Model':<12} {'Vocab Size':<10} {'Embed Dim':<10} {'Seq Len':<8} {'Embed Params':<12} {'Memory (MB)'}")
    print("-" * 80)

    for label, vocab, dim, seq_len in presets:
        table_params = vocab * dim
        table_mb = table_params * bytes_per_param / bytes_per_mb
        row = (
            f"{label:<12} {vocab:<10,} {dim:<10} "
            f"{seq_len:<8} {table_params:<12,} {table_mb:<10.1f}"
        )
        print(row)

    # Fixed commentary sections
    print("\n🎯 DESIGN TRADE-OFFS:")
    for line in trade_off_lines:
        print(line)

    print("\n🏭 PRODUCTION CONSIDERATIONS:")
    for line in production_lines:
        print(line)