Files
TinyTorch/tinytorch/core/embeddings.py
Vijay Janapa Reddi b808346cf8 Clean up repository: remove temp files, organize modules, prepare for PyPI publication
- Removed temporary test files and audit reports
- Deleted backup and temp_holding directories
- Reorganized module structure (07->09 spatial, 09->07 dataloader)
- Added new modules: 11-14 (tokenization, embeddings, attention, transformers)
- Updated examples with historical ML milestones
- Cleaned up documentation structure
2025-09-24 10:13:37 -04:00

701 lines
29 KiB
Python
Generated
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/12_embeddings/embeddings_dev.ipynb.
# %% auto 0
__all__ = ['Embedding', 'PositionalEncoding', 'LearnedPositionalEmbedding', 'EmbeddingProfiler',
'analyze_embedding_system_design']
# %% ../../modules/12_embeddings/embeddings_dev.ipynb 1
import math
import numpy as np
import os
import sys
from typing import Union, List, Optional, Tuple
# Import our Tensor class - try from package first, then from local module
try:
from tinytorch.core.tensor import Tensor
except ImportError:
# For development, import from local tensor module
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor'))
from tensor_dev import Tensor
# Try to import tokenization classes
try:
from tinytorch.core.tokenization import CharTokenizer, BPETokenizer
except ImportError:
# For development, import from local module
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '11_tokenization'))
try:
from tokenization_dev import CharTokenizer, BPETokenizer
except ImportError:
# Create minimal mock classes if not available
class CharTokenizer:
def __init__(self):
self.vocab_size = 256
class BPETokenizer:
def __init__(self, vocab_size=1000):
self.vocab_size = vocab_size
# %% ../../modules/12_embeddings/embeddings_dev.ipynb 6
class Embedding:
    """
    Embedding layer that converts token indices to dense vector representations.

    The layer is a lookup table of shape (vocab_size, embedding_dim): row ``i``
    holds the vector returned for token id ``i``.
    """

    def __init__(self, vocab_size: int, embedding_dim: int,
                 padding_idx: Optional[int] = None,
                 init_type: str = 'uniform'):
        """
        Build the embedding table with the requested random initialization.

        Args:
            vocab_size: Number of rows (tokens) in the table.
            embedding_dim: Length of the vector stored for each token.
            padding_idx: Optional row index that is zeroed after initialization.
                Only the initial value is set here; nothing in this class
                keeps the row at zero afterwards.
            init_type: One of 'uniform', 'normal' or 'xavier'.

        Raises:
            ValueError: If ``init_type`` is not one of the supported names.
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.init_type = init_type

        table_shape = (vocab_size, embedding_dim)
        if init_type == 'uniform':
            # Uniform in [-1/sqrt(dim), 1/sqrt(dim)]
            bound = 1.0 / math.sqrt(embedding_dim)
            table = np.random.uniform(-bound, bound, table_shape)
        elif init_type == 'normal':
            # Zero-mean normal with std = 1/sqrt(dim)
            std = 1.0 / math.sqrt(embedding_dim)
            table = np.random.normal(0, std, table_shape)
        elif init_type == 'xavier':
            # Xavier/Glorot uniform bound based on both table dimensions
            bound = math.sqrt(6.0 / (vocab_size + embedding_dim))
            table = np.random.uniform(-bound, bound, table_shape)
        else:
            raise ValueError(f"Unknown init_type: {init_type}")
        self.weight = Tensor(table)

        # Zero the padding row if one was requested
        if padding_idx is not None:
            self.weight.data[padding_idx] = 0.0

        # Parameter list exposed for optimizers
        self.parameters = [self.weight]
        ### END SOLUTION

    def forward(self, input_ids: Union["Tensor", List[int], np.ndarray]) -> "Tensor":
        """
        Look up embeddings for the given token indices.

        EXAMPLE:
            embed = Embedding(vocab_size=100, embedding_dim=64)
            tokens = Tensor([[1, 2, 3], [4, 5, 6]])  # Shape: (2, 3)
            embeddings = embed.forward(tokens)       # Shape: (2, 3, 64)

        Args:
            input_ids: Token indices as a Tensor, a (nested) list, a NumPy
                array, or any other array-like accepted by ``np.asarray``
                (tuples and scalar ints included). Values are cast to int.

        Returns:
            Tensor of shape (*input_shape, embedding_dim).

        Raises:
            ValueError: If any index is outside [0, vocab_size).
        """
        ### BEGIN SOLUTION
        raw = input_ids.data if isinstance(input_ids, Tensor) else input_ids
        # np.asarray accepts lists, tuples, scalars and arrays alike, so the
        # integer cast below cannot fail for lack of an ``astype`` method.
        indices = np.asarray(raw).astype(int)

        if np.any(indices < 0) or np.any(indices >= self.vocab_size):
            raise ValueError(f"Token indices must be in range [0, {self.vocab_size})")

        # Advanced indexing: indices (...) -> result (..., embedding_dim)
        return Tensor(self.weight.data[indices])
        ### END SOLUTION

    def __call__(self, input_ids: Union["Tensor", List[int], np.ndarray]) -> "Tensor":
        """Make the layer callable; equivalent to ``forward(input_ids)``."""
        return self.forward(input_ids)

    def get_memory_usage(self):
        """
        Report the memory footprint of the embedding table.

        Returns:
            Dict with 'total_memory_mb', 'memory_per_token_kb',
            'total_parameters', 'vocab_size' and 'embedding_dim'.
        """
        table = self.weight.data
        weight_memory_mb = table.nbytes / (1024 * 1024)
        # Use the stored element size rather than assuming 4-byte floats so
        # the per-token figure stays consistent with total_memory_mb.
        memory_per_token_kb = (self.embedding_dim * table.itemsize) / 1024
        return {
            'total_memory_mb': weight_memory_mb,
            'memory_per_token_kb': memory_per_token_kb,
            'total_parameters': self.vocab_size * self.embedding_dim,
            'vocab_size': self.vocab_size,
            'embedding_dim': self.embedding_dim
        }
# %% ../../modules/12_embeddings/embeddings_dev.ipynb 10
class PositionalEncoding:
    """
    Sinusoidal positional encoding that adds position information to embeddings.

    A (max_seq_length, embedding_dim) table of sine/cosine values is
    precomputed once and added to incoming embeddings in ``forward``.
    """

    def __init__(self, embedding_dim: int, max_seq_length: int = 5000,
                 dropout: float = 0.0):
        """
        Precompute the sinusoidal table.

        MATHEMATICAL FOUNDATION:
            PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
            PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

        Args:
            embedding_dim: Dimension of embeddings (d_model). Odd values are
                supported; the last sine column then has no cosine partner.
            max_seq_length: Number of positions to precompute.
            dropout: Stored on the instance; not used by this class.
        """
        ### BEGIN SOLUTION
        self.embedding_dim = embedding_dim
        self.max_seq_length = max_seq_length
        self.dropout = dropout

        pe = np.zeros((max_seq_length, embedding_dim))
        # Column vector of positions 0 .. max_seq_length-1
        position = np.arange(0, max_seq_length).reshape(-1, 1)
        # div_term[i] = 1 / 10000^(2i/d_model): starts at 1 and decays
        div_term = np.exp(np.arange(0, embedding_dim, 2) *
                          -(math.log(10000.0) / embedding_dim))
        angles = position * div_term  # (max_seq_length, ceil(dim / 2))

        # Sine on even columns, cosine on odd columns. There are dim // 2 odd
        # columns, one fewer than the even ones when dim is odd.
        pe[:, 0::2] = np.sin(angles)
        pe[:, 1::2] = np.cos(angles[:, :embedding_dim // 2])

        self.pe = Tensor(pe)
        ### END SOLUTION

    def forward(self, embeddings: "Tensor") -> "Tensor":
        """
        Add positional encoding to embeddings.

        EXAMPLE:
            pos_enc = PositionalEncoding(embedding_dim=64)
            embeddings = Tensor(np.random.randn(2, 10, 64))  # (batch, seq, dim)
            pos_embeddings = pos_enc.forward(embeddings)

        Args:
            embeddings: Tensor of shape (batch_size, seq_len, embedding_dim)
                or (seq_len, embedding_dim).

        Returns:
            New Tensor with the same shape as the input.

        Raises:
            ValueError: If the input is not 2D/3D, its last dimension differs
                from ``embedding_dim``, or seq_len exceeds ``max_seq_length``.
        """
        ### BEGIN SOLUTION
        shape = embeddings.shape
        if len(shape) not in (2, 3):
            raise ValueError(f"Expected 2D or 3D embeddings, got shape {embeddings.shape}")
        seq_length, embed_dim = shape[-2], shape[-1]

        if embed_dim != self.embedding_dim:
            raise ValueError(f"Embedding dim mismatch: expected {self.embedding_dim}, got {embed_dim}")
        if seq_length > self.max_seq_length:
            raise ValueError(f"Sequence length {seq_length} exceeds max {self.max_seq_length}")

        # (seq, dim) slice broadcasts over a leading batch axis when present
        position_encodings = self.pe.data[:seq_length, :]
        return Tensor(embeddings.data + position_encodings)
        ### END SOLUTION

    def __call__(self, embeddings: "Tensor") -> "Tensor":
        """Make the class callable; equivalent to ``forward(embeddings)``."""
        return self.forward(embeddings)

    def visualize_encoding(self, seq_length: int = 100, dims_to_show: int = 10) -> None:
        """
        Print a small table of the precomputed encoding values.

        At most the first 10 positions and first 10 dimensions are printed,
        and never more than the table actually contains.

        Args:
            seq_length: Number of positions to slice from the table.
            dims_to_show: Number of dimensions to slice from the table.
        """
        print(f"📊 POSITIONAL ENCODING VISUALIZATION")
        print(f"Sequence length: {seq_length}, Dimensions shown: {dims_to_show}")
        print("=" * 60)

        pe_subset = self.pe.data[:seq_length, :dims_to_show]
        # Bound the loops by the slice's real extent: the table may have fewer
        # rows/columns than requested (small embedding_dim or max_seq_length).
        rows_shown = min(pe_subset.shape[0], 10)
        cols_shown = min(pe_subset.shape[1], 10)

        print("First 10 positions, first 10 dimensions:")
        print("Pos", end="")
        for d in range(cols_shown):
            print(f" Dim{d:2d}", end="")
        print()
        for pos in range(rows_shown):
            print(f"{pos:3d}", end="")
            for d in range(cols_shown):
                print(f"{pe_subset[pos, d]:8.3f}", end="")
            print()

        print(f"\n📈 FREQUENCY ANALYSIS:")
        # div_term is largest (1.0) at dimension 0 and decays with the index,
        # so the early dimensions oscillate fastest.
        print("Even dimensions (sine): Higher frequencies for early dimensions")
        print("Odd dimensions (cosine): Same frequencies, phase-shifted")
        min_freq = 1.0 / 10000
        max_freq = 1.0
        print(f"Frequency range: {min_freq:.6f} to {max_freq:.6f}")
# %% ../../modules/12_embeddings/embeddings_dev.ipynb 14
class LearnedPositionalEmbedding:
    """
    Learned positional embeddings: an embedding table indexed by position.

    Instead of a fixed sinusoidal pattern, each position 0..max_seq_length-1
    owns a row of an ``Embedding`` table whose values are added to the input.
    """

    def __init__(self, max_seq_length: int, embedding_dim: int):
        """
        Create the position table.

        Args:
            max_seq_length: Number of positions supported (rows in the table).
            embedding_dim: Dimension of each position vector.
        """
        ### BEGIN SOLUTION
        self.max_seq_length = max_seq_length
        self.embedding_dim = embedding_dim

        # The "vocabulary" of this embedding is the set of positions
        table = Embedding(
            vocab_size=max_seq_length,
            embedding_dim=embedding_dim,
            init_type='normal'
        )
        self.position_embedding = table

        # Expose the table's parameter list (same list object) for optimizers
        self.parameters = table.parameters
        ### END SOLUTION

    def forward(self, embeddings: Tensor) -> Tensor:
        """
        Add the learned position vectors to the input embeddings.

        EXAMPLE:
            learned_pos = LearnedPositionalEmbedding(max_seq_length=100, embedding_dim=64)
            embeddings = Tensor(np.random.randn(2, 10, 64))  # (batch, seq, dim)
            pos_embeddings = learned_pos.forward(embeddings)

        Args:
            embeddings: Tensor of shape (batch_size, seq_len, embedding_dim)
                or (seq_len, embedding_dim).

        Returns:
            New Tensor with the same shape as the input.

        Raises:
            ValueError: If the input is not 2D/3D, its last dimension differs
                from ``embedding_dim``, or seq_len exceeds ``max_seq_length``.
        """
        ### BEGIN SOLUTION
        rank = len(embeddings.shape)
        if rank == 3:
            _, seq_length, embed_dim = embeddings.shape
            has_batch = True
        elif rank == 2:
            seq_length, embed_dim = embeddings.shape
            has_batch = False
        else:
            raise ValueError(f"Expected 2D or 3D embeddings, got shape {embeddings.shape}")

        if embed_dim != self.embedding_dim:
            raise ValueError(f"Embedding dim mismatch: expected {self.embedding_dim}, got {embed_dim}")
        if seq_length > self.max_seq_length:
            raise ValueError(f"Sequence length {seq_length} exceeds max {self.max_seq_length}")

        # Rows 0 .. seq_length-1 of the position table
        looked_up = self.position_embedding.forward([*range(seq_length)])

        offsets = looked_up.data
        if has_batch:
            # Add a leading axis so the offsets broadcast across the batch
            offsets = offsets[np.newaxis, :, :]
        return Tensor(embeddings.data + offsets)
        ### END SOLUTION

    def __call__(self, embeddings: Tensor) -> Tensor:
        """Make the class callable; equivalent to ``forward(embeddings)``."""
        return self.forward(embeddings)
# %% ../../modules/12_embeddings/embeddings_dev.ipynb 18
import time
class EmbeddingProfiler:
    """
    Performance profiling toolkit for embedding systems.

    Measures lookup latency/throughput, table memory scaling, and compares
    the two positional-encoding classes in this module.
    """

    def __init__(self):
        self.results = {}

    def measure_lookup_performance(self, embedding_layer: "Embedding",
                                   batch_sizes: List[int], seq_lengths: List[int]):
        """
        Time one embedding lookup for every (batch_size, seq_length) pair.

        Args:
            embedding_layer: Layer to test; its ``vocab_size`` bounds the
                random token ids and its ``forward`` is the call being timed.
            batch_sizes: Batch sizes to test.
            seq_lengths: Sequence lengths to test.

        Returns:
            Dict keyed by "batch_{B}_seq_{S}" whose values hold the timing,
            throughput and memory figures for that configuration.
        """
        ### BEGIN SOLUTION
        results = {}
        vocab_size = embedding_layer.vocab_size

        for batch_size in batch_sizes:
            for seq_length in seq_lengths:
                token_indices = np.random.randint(0, vocab_size, (batch_size, seq_length))

                # perf_counter is monotonic and high resolution, unlike the
                # wall clock, so sub-millisecond lookups are measurable.
                start = time.perf_counter()
                embeddings = embedding_layer.forward(token_indices)
                elapsed = time.perf_counter() - start

                lookup_time_ms = elapsed * 1000
                total_tokens = batch_size * seq_length
                input_memory_mb = token_indices.nbytes / (1024 * 1024)
                output_memory_mb = embeddings.data.nbytes / (1024 * 1024)

                # Rates are reported as 0 when the timer saw no elapsed time
                if elapsed > 0:
                    tokens_per_second = total_tokens / elapsed
                    memory_bandwidth_mb_s = (input_memory_mb + output_memory_mb) / elapsed
                else:
                    tokens_per_second = 0
                    memory_bandwidth_mb_s = 0

                config_key = f"batch_{batch_size}_seq_{seq_length}"
                results[config_key] = {
                    'batch_size': batch_size,
                    'seq_length': seq_length,
                    'total_tokens': total_tokens,
                    'lookup_time_ms': lookup_time_ms,
                    'tokens_per_second': tokens_per_second,
                    'input_memory_mb': input_memory_mb,
                    'output_memory_mb': output_memory_mb,
                    'memory_bandwidth_mb_s': memory_bandwidth_mb_s,
                    'time_per_token_us': lookup_time_ms * 1000 / total_tokens if total_tokens > 0 else 0
                }
        return results
        ### END SOLUTION

    def analyze_memory_scaling(self, vocab_sizes: List[int], embedding_dims: List[int]):
        """
        Print and return how table memory scales with vocab size and dimension.

        A fresh ``Embedding`` is built for every (vocab_size, embedding_dim)
        pair, and one lookup on a (32, 64) batch of random ids is timed.

        Args:
            vocab_sizes: Vocabulary sizes to test.
            embedding_dims: Embedding dimensions to test.

        Returns:
            Dict keyed by "vocab_{V}_dim_{D}" with parameter count, memory in
            MB and lookup time in ms.
        """
        print("📊 EMBEDDING MEMORY SCALING ANALYSIS")
        print("=" * 60)
        scaling_results = {}
        print(f"{'Vocab Size':<12} {'Embed Dim':<10} {'Parameters':<12} {'Memory (MB)':<12} {'Lookup Time':<12}")
        print("-" * 70)

        for vocab_size in vocab_sizes:
            for embed_dim in embedding_dims:
                embed = Embedding(vocab_size=vocab_size, embedding_dim=embed_dim)

                memory_stats = embed.get_memory_usage()
                total_memory_mb = memory_stats['total_memory_mb']
                total_params = memory_stats['total_parameters']

                test_tokens = np.random.randint(0, vocab_size, (32, 64))  # Standard batch
                start = time.perf_counter()
                embed.forward(test_tokens)
                lookup_time_ms = (time.perf_counter() - start) * 1000

                config_key = f"vocab_{vocab_size}_dim_{embed_dim}"
                scaling_results[config_key] = {
                    'vocab_size': vocab_size,
                    'embedding_dim': embed_dim,
                    'total_parameters': total_params,
                    'memory_mb': total_memory_mb,
                    'lookup_time_ms': lookup_time_ms
                }
                print(f"{vocab_size:<12,} {embed_dim:<10} {total_params:<12,} {total_memory_mb:<12.2f} {lookup_time_ms:<12.2f}")

        print(f"\n📈 SCALING INSIGHTS:")
        # Each insight only needs two or more values on its own axis (and one
        # on the other), so the two comparisons are gated independently.
        if len(vocab_sizes) > 1 and len(embedding_dims) > 0:
            fixed_dim = embedding_dims[0]
            small_vocab = min(vocab_sizes)
            large_vocab = max(vocab_sizes)
            small_key = f"vocab_{small_vocab}_dim_{fixed_dim}"
            large_key = f"vocab_{large_vocab}_dim_{fixed_dim}"
            if small_key in scaling_results and large_key in scaling_results:
                vocab_ratio = large_vocab / small_vocab
                memory_ratio = scaling_results[large_key]['memory_mb'] / scaling_results[small_key]['memory_mb']
                print(f" Vocabulary scaling: {vocab_ratio:.1f}x vocab → {memory_ratio:.1f}x memory (Linear)")

        if len(embedding_dims) > 1 and len(vocab_sizes) > 0:
            fixed_vocab = vocab_sizes[0]
            small_dim = min(embedding_dims)
            large_dim = max(embedding_dims)
            small_key = f"vocab_{fixed_vocab}_dim_{small_dim}"
            large_key = f"vocab_{fixed_vocab}_dim_{large_dim}"
            if small_key in scaling_results and large_key in scaling_results:
                dim_ratio = large_dim / small_dim
                memory_ratio = scaling_results[large_key]['memory_mb'] / scaling_results[small_key]['memory_mb']
                print(f" Dimension scaling: {dim_ratio:.1f}x dim → {memory_ratio:.1f}x memory (Linear)")
        return scaling_results

    def compare_positional_encodings(self, seq_length: int = 100, embedding_dim: int = 256):
        """
        Time and compare sinusoidal vs. learned positional encodings.

        Both encoders are built with ``max_seq_length = seq_length * 2`` and
        applied once to a random (16, seq_length, embedding_dim) batch.

        Args:
            seq_length: Sequence length of the test batch.
            embedding_dim: Embedding dimension of the test batch.

        Returns:
            Dict with 'sinusoidal' and 'learned' entries describing time,
            memory, parameter count and qualitative properties.
        """
        print(f"\n🔍 POSITIONAL ENCODING COMPARISON")
        print("=" * 50)

        batch_size = 16
        embeddings = Tensor(np.random.randn(batch_size, seq_length, embedding_dim))

        # Sinusoidal positional encoding
        sinusoidal_pe = PositionalEncoding(embedding_dim=embedding_dim, max_seq_length=seq_length*2)
        start = time.perf_counter()
        sinusoidal_pe.forward(embeddings)
        sin_time = (time.perf_counter() - start) * 1000

        # Learned positional embedding
        learned_pe = LearnedPositionalEmbedding(max_seq_length=seq_length*2, embedding_dim=embedding_dim)
        start = time.perf_counter()
        learned_pe.forward(embeddings)
        learned_time = (time.perf_counter() - start) * 1000

        # Reported as 0 because there are no learnable parameters; the
        # precomputed sinusoidal table itself is not counted here.
        sin_memory = 0
        learned_memory = learned_pe.position_embedding.get_memory_usage()['total_memory_mb']

        results = {
            'sinusoidal': {
                'computation_time_ms': sin_time,
                'memory_usage_mb': sin_memory,
                'parameters': 0,
                'deterministic': True,
                'extrapolation': 'Good (can handle longer sequences)'
            },
            'learned': {
                'computation_time_ms': learned_time,
                'memory_usage_mb': learned_memory,
                'parameters': seq_length * 2 * embedding_dim,
                'deterministic': False,
                'extrapolation': 'Limited (fixed max sequence length)'
            }
        }

        print(f"📊 COMPARISON RESULTS:")
        print(f"{'Method':<12} {'Time (ms)':<10} {'Memory (MB)':<12} {'Parameters':<12} {'Extrapolation'}")
        print("-" * 70)
        print(f"{'Sinusoidal':<12} {sin_time:<10.2f} {sin_memory:<12.2f} {0:<12,} {'Good'}")
        print(f"{'Learned':<12} {learned_time:<10.2f} {learned_memory:<12.2f} {results['learned']['parameters']:<12,} {'Limited'}")
        print(f"\n💡 INSIGHTS:")
        print(f" - Sinusoidal: Zero parameters, deterministic, good extrapolation")
        print(f" - Learned: Requires parameters, model-specific, limited extrapolation")
        print(f" - Choice depends on: model capacity, sequence length requirements, extrapolation needs")
        return results
def analyze_embedding_system_design():
    """
    Print a comparison of three example embedding configurations, followed
    by a fixed list of design trade-offs and production considerations.

    For each configuration the parameter count is vocab_size * embed_dim and
    the memory estimate assumes 4 bytes per parameter. Returns None.
    """
    bytes_per_mb = 1024 * 1024

    print("🏗️ EMBEDDING SYSTEM DESIGN ANALYSIS")
    print("=" * 60)

    # (name, vocab_size, embed_dim, seq_length) for each example model
    presets = (
        ('Small GPT', 10000, 256, 512),
        ('Medium GPT', 50000, 512, 1024),
        ('Large GPT', 50000, 1024, 2048),
    )

    print("📋 MODEL CONFIGURATION COMPARISON:")
    print(f"{'Model':<12} {'Vocab Size':<10} {'Embed Dim':<10} {'Seq Len':<8} {'Embed Params':<12} {'Memory (MB)'}")
    print("-" * 80)
    for name, vocab, dim, seq_len in presets:
        param_count = vocab * dim
        table_mb = param_count * 4 / bytes_per_mb  # 4 bytes per float32
        print(f"{name:<12} {vocab:<10,} {dim:<10} "
              f"{seq_len:<8} {param_count:<12,} {table_mb:<10.1f}")

    # Static commentary, emitted one line per entry
    commentary = (
        "\n🎯 DESIGN TRADE-OFFS:",
        " 1. Vocabulary Size:",
        " - Larger vocab: Better text coverage, more parameters",
        " - Smaller vocab: Longer sequences, more compute",
        " 2. Embedding Dimension:",
        " - Higher dim: More model capacity, more memory",
        " - Lower dim: Faster computation, potential bottleneck",
        " 3. Position Encoding:",
        " - Sinusoidal: No parameters, good extrapolation",
        " - Learned: Model-specific, limited to training length",
        " 4. Memory Scaling:",
        " - Embedding table: O(vocab_size × embed_dim)",
        " - Sequence processing: O(batch_size × seq_length × embed_dim)",
        " - Total memory dominated by model size, not embedding table",
        "\n🏭 PRODUCTION CONSIDERATIONS:",
        " - GPU memory limits affect maximum embedding table size",
        " - Embedding lookup is memory-bandwidth bound",
        " - Vocabulary size affects tokenization and model download size",
        " - Position encoding choice affects sequence length flexibility",
    )
    for text in commentary:
        print(text)