mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-06-02 03:45:50 -05:00
- Removed temporary test files and audit reports - Deleted backup and temp_holding directories - Reorganized module structure (07->09 spatial, 09->07 dataloader) - Added new modules: 11-14 (tokenization, embeddings, attention, transformers) - Updated examples with historical ML milestones - Cleaned up documentation structure
701 lines
29 KiB
Python
Generated
701 lines
29 KiB
Python
Generated
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/12_embeddings/embeddings_dev.ipynb.
|
||
|
||
# %% auto 0
|
||
# Names exported by `from <this module> import *`: the three embedding
# layers, the profiler, and the stand-alone design-analysis report.
__all__ = ['Embedding', 'PositionalEncoding', 'LearnedPositionalEmbedding', 'EmbeddingProfiler',
           'analyze_embedding_system_design']
|
||
|
||
# %% ../../modules/12_embeddings/embeddings_dev.ipynb 1
|
||
import math
|
||
import numpy as np
|
||
import os
|
||
import sys
|
||
from typing import Union, List, Optional, Tuple
|
||
|
||
# Import our Tensor class - try from package first, then from local module
|
||
try:
|
||
from tinytorch.core.tensor import Tensor
|
||
except ImportError:
|
||
# For development, import from local tensor module
|
||
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor'))
|
||
from tensor_dev import Tensor
|
||
|
||
# Try to import tokenization classes.
# Resolution order:
#   1. the installed tinytorch package,
#   2. the local 11_tokenization development module (added to sys.path),
#   3. minimal stand-in classes defined inline below.
try:
    from tinytorch.core.tokenization import CharTokenizer, BPETokenizer
except ImportError:
    # For development, import from local module
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '11_tokenization'))
    try:
        from tokenization_dev import CharTokenizer, BPETokenizer
    except ImportError:
        # Create minimal mock classes if not available
        class CharTokenizer:
            """Stand-in tokenizer that only exposes a fixed ``vocab_size`` of 256."""

            def __init__(self):
                self.vocab_size = 256

        class BPETokenizer:
            """Stand-in tokenizer that only records the requested ``vocab_size``."""

            def __init__(self, vocab_size=1000):
                self.vocab_size = vocab_size
|
||
|
||
# %% ../../modules/12_embeddings/embeddings_dev.ipynb 6
|
||
class Embedding:
    """
    Embedding layer that converts token indices to dense vector representations.

    The layer is a lookup table of shape (vocab_size, embedding_dim) stored in
    ``self.weight``; ``forward`` selects one row per token index.
    """

    def __init__(self, vocab_size: int, embedding_dim: int,
                 padding_idx: Optional[int] = None,
                 init_type: str = 'uniform'):
        """
        Create the embedding table and register it as a parameter.

        Args:
            vocab_size: Number of tokens in vocabulary (rows of the table).
            embedding_dim: Size of dense vector for each token (columns).
            padding_idx: Optional row index that is set to all zeros right
                after initialization. Nothing in this class re-zeros the row
                afterwards.
            init_type: Initialization strategy:
                'uniform' - U(-1/sqrt(dim), 1/sqrt(dim))
                'normal'  - N(0, std=1/sqrt(dim))
                'xavier'  - U(-b, b) with b = sqrt(6 / (vocab_size + embedding_dim))

        Raises:
            ValueError: If ``init_type`` is not one of the strategies above.
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.init_type = init_type

        table_shape = (vocab_size, embedding_dim)

        # Initialize embedding table based on strategy
        if init_type == 'uniform':
            bound = 1.0 / math.sqrt(embedding_dim)
            self.weight = Tensor(np.random.uniform(-bound, bound, table_shape))
        elif init_type == 'normal':
            std = 1.0 / math.sqrt(embedding_dim)
            self.weight = Tensor(np.random.normal(0, std, table_shape))
        elif init_type == 'xavier':
            # Xavier/Glorot: bound depends on both table dimensions
            bound = math.sqrt(6.0 / (vocab_size + embedding_dim))
            self.weight = Tensor(np.random.uniform(-bound, bound, table_shape))
        else:
            raise ValueError(f"Unknown init_type: {init_type}")

        # Set padding token to zero if specified
        if padding_idx is not None:
            self.weight.data[padding_idx] = 0.0

        # Track parameters for optimization
        self.parameters = [self.weight]
        ### END SOLUTION

    def forward(self, input_ids: Union[Tensor, List[int], np.ndarray]) -> Tensor:
        """
        Look up embeddings for input token indices.

        EXAMPLE:
            embed = Embedding(vocab_size=100, embedding_dim=64)
            tokens = Tensor([[1, 2, 3], [4, 5, 6]])   # Shape: (2, 3)
            embeddings = embed.forward(tokens)        # Shape: (2, 3, 64)

        Args:
            input_ids: Token indices as a Tensor, or anything
                ``np.asarray`` accepts (list, tuple, ndarray, scalar).
                Values are cast to ``int`` before the lookup.

        Returns:
            Tensor of embeddings with shape (*input_shape, embedding_dim).

        Raises:
            ValueError: If any index lies outside [0, vocab_size).
        """
        ### BEGIN SOLUTION
        raw = input_ids.data if isinstance(input_ids, Tensor) else input_ids

        # np.asarray normalises lists, tuples and scalars alike; previously a
        # tuple or plain int fell through to `.astype` and raised AttributeError.
        indices = np.asarray(raw).astype(int)

        # Validate indices
        if np.any(indices < 0) or np.any(indices >= self.vocab_size):
            raise ValueError(f"Token indices must be in range [0, {self.vocab_size})")

        # Advanced indexing: table is (vocab_size, embedding_dim), indices is
        # (...), so the result is (..., embedding_dim).
        return Tensor(self.weight.data[indices])
        ### END SOLUTION

    def __call__(self, input_ids: Union[Tensor, List[int], np.ndarray]) -> Tensor:
        """Make the layer callable."""
        return self.forward(input_ids)

    def get_memory_usage(self):
        """
        Calculate memory usage of embedding table.

        Returns:
            Dict with keys 'total_memory_mb', 'memory_per_token_kb',
            'total_parameters', 'vocab_size' and 'embedding_dim'.
        """
        table = self.weight.data

        # Embedding table memory
        weight_memory_mb = table.nbytes / (1024 * 1024)

        # Memory per token, using the table's real element size so this
        # figure stays consistent with total_memory_mb (the random
        # initialisers produce float64, not float32).
        memory_per_token_kb = (self.embedding_dim * table.itemsize) / 1024

        return {
            'total_memory_mb': weight_memory_mb,
            'memory_per_token_kb': memory_per_token_kb,
            'total_parameters': self.vocab_size * self.embedding_dim,
            'vocab_size': self.vocab_size,
            'embedding_dim': self.embedding_dim
        }
|
||
|
||
# %% ../../modules/12_embeddings/embeddings_dev.ipynb 10
|
||
class PositionalEncoding:
    """
    Sinusoidal positional encoding that adds position information to embeddings.

    A (max_seq_length, embedding_dim) table of sine/cosine values is computed
    once in ``__init__`` and sliced and added to the input in ``forward``.
    """

    def __init__(self, embedding_dim: int, max_seq_length: int = 5000,
                 dropout: float = 0.0):
        """
        Precompute the sinusoidal table.

        MATHEMATICAL FOUNDATION:
            PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
            PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

        Args:
            embedding_dim: Dimension of embeddings. Odd values are supported;
                the cosine columns then get one frequency fewer than the sine
                columns.
            max_seq_length: Maximum sequence length to precompute.
            dropout: Stored on the instance but not applied anywhere in this class.
        """
        ### BEGIN SOLUTION
        self.embedding_dim = embedding_dim
        self.max_seq_length = max_seq_length
        self.dropout = dropout

        # Create positional encoding matrix
        pe = np.zeros((max_seq_length, embedding_dim))

        # Column vector of positions 0..max_seq_length-1, shape (max_seq_length, 1)
        position = np.arange(0, max_seq_length).reshape(-1, 1)

        # div_term[k] = 10000^(-2k/d_model), i.e. the reciprocal of the
        # denominator in the formula above, so it is multiplied, not divided.
        div_term = np.exp(np.arange(0, embedding_dim, 2) *
                          -(math.log(10000.0) / embedding_dim))

        # Sine on even columns (0, 2, 4, ...)
        pe[:, 0::2] = np.sin(position * div_term)

        # Cosine on odd columns (1, 3, 5, ...). There are embedding_dim // 2 of
        # them, which is one fewer than len(div_term) when embedding_dim is odd.
        pe[:, 1::2] = np.cos(position * div_term[:embedding_dim // 2])

        # Store as tensor
        self.pe = Tensor(pe)
        ### END SOLUTION

    def forward(self, embeddings: Tensor) -> Tensor:
        """
        Add positional encoding to embeddings.

        EXAMPLE:
            pos_enc = PositionalEncoding(embedding_dim=64)
            embeddings = Tensor(np.random.randn(2, 10, 64))  # (batch, seq, dim)
            pos_embeddings = pos_enc.forward(embeddings)

        Args:
            embeddings: Input with shape (batch_size, seq_len, embedding_dim)
                or (seq_len, embedding_dim).

        Returns:
            New Tensor with the same shape as the input.

        Raises:
            ValueError: If the input is not 2D or 3D, if its last dimension
                differs from ``embedding_dim``, or if the sequence is longer
                than ``max_seq_length``.
        """
        ### BEGIN SOLUTION
        rank = len(embeddings.shape)
        if rank == 3:
            _, seq_length, embed_dim = embeddings.shape
        elif rank == 2:
            seq_length, embed_dim = embeddings.shape
        else:
            raise ValueError(f"Expected 2D or 3D embeddings, got shape {embeddings.shape}")

        if embed_dim != self.embedding_dim:
            raise ValueError(f"Embedding dim mismatch: expected {self.embedding_dim}, got {embed_dim}")

        if seq_length > self.max_seq_length:
            raise ValueError(f"Sequence length {seq_length} exceeds max {self.max_seq_length}")

        # Extract positional encodings for this sequence length
        position_encodings = self.pe.data[:seq_length, :]

        # (seq, dim) broadcasts against both (seq, dim) and (batch, seq, dim),
        # so one expression covers the batched and unbatched cases.
        return Tensor(embeddings.data + position_encodings)
        ### END SOLUTION

    def __call__(self, embeddings: Tensor) -> Tensor:
        """Make the class callable."""
        return self.forward(embeddings)

    def visualize_encoding(self, seq_length: int = 100, dims_to_show: int = 10) -> None:
        """
        Print a small table of encoding values plus a frequency summary.

        At most the first 10 positions and first 10 dimensions are printed,
        and never more than the precomputed table actually contains.

        Args:
            seq_length: Number of positions to take from the table.
            dims_to_show: Number of dimensions to take from the table.
        """
        print(f"📊 POSITIONAL ENCODING VISUALIZATION")
        print(f"Sequence length: {seq_length}, Dimensions shown: {dims_to_show}")
        print("=" * 60)

        # Get subset of positional encodings
        pe_subset = self.pe.data[:seq_length, :dims_to_show]

        # Clamp to what the slice really holds; otherwise a table with fewer
        # than 10 rows/columns (e.g. embedding_dim=4) raises IndexError below.
        rows_shown = min(pe_subset.shape[0], 10)
        cols_shown = min(pe_subset.shape[1], 10)

        # Show patterns for first few positions
        print("First 10 positions, first 10 dimensions:")
        print("Pos", end="")
        for d in range(cols_shown):
            print(f" Dim{d:2d}", end="")
        print()

        for pos in range(rows_shown):
            print(f"{pos:3d}", end="")
            for d in range(cols_shown):
                print(f"{pe_subset[pos, d]:8.3f}", end="")
            print()

        # Show frequency analysis. Column 0 uses frequency 1.0 and later
        # columns decay towards 1/10000, so early dimensions are the
        # HIGH-frequency ones.
        print(f"\n📈 FREQUENCY ANALYSIS:")
        print("Even dimensions (sine): Higher frequencies for early dimensions")
        print("Odd dimensions (cosine): Same frequencies, phase-shifted")

        # Nominal frequency bounds implied by the 10000 base
        min_freq = 1.0 / 10000
        max_freq = 1.0
        print(f"Frequency range: {min_freq:.6f} to {max_freq:.6f}")
|
||
|
||
# %% ../../modules/12_embeddings/embeddings_dev.ipynb 14
|
||
class LearnedPositionalEmbedding:
    """
    Position information stored as a trainable table.

    Wraps an ``Embedding`` whose "vocabulary" is the set of positions
    0..max_seq_length-1; the looked-up rows are added to the input.
    """

    def __init__(self, max_seq_length: int, embedding_dim: int):
        """
        Build the position table.

        Args:
            max_seq_length: Maximum sequence length supported (table rows).
            embedding_dim: Dimension of position embeddings (table columns).
        """
        ### BEGIN SOLUTION
        self.max_seq_length = max_seq_length
        self.embedding_dim = embedding_dim

        # One row per position, drawn with the 'normal' initialiser.
        table = Embedding(vocab_size=max_seq_length,
                          embedding_dim=embedding_dim,
                          init_type='normal')
        self.position_embedding = table

        # Expose the wrapped layer's parameter list (same list object).
        self.parameters = table.parameters
        ### END SOLUTION

    def forward(self, embeddings: Tensor) -> Tensor:
        """
        Add the learned position rows to ``embeddings``.

        EXAMPLE:
            learned_pos = LearnedPositionalEmbedding(max_seq_length=100, embedding_dim=64)
            embeddings = Tensor(np.random.randn(2, 10, 64))  # (batch, seq, dim)
            pos_embeddings = learned_pos.forward(embeddings)

        Args:
            embeddings: Input with shape (batch_size, seq_len, embedding_dim)
                or (seq_len, embedding_dim).

        Returns:
            New Tensor with the same shape as the input.

        Raises:
            ValueError: On a non-2D/3D input, an embedding-dimension mismatch,
                or a sequence longer than ``max_seq_length``.
        """
        ### BEGIN SOLUTION
        rank = len(embeddings.shape)
        if rank not in (2, 3):
            raise ValueError(f"Expected 2D or 3D embeddings, got shape {embeddings.shape}")

        # The last two axes are (seq, dim) in both accepted layouts.
        if rank == 3:
            _, seq_length, embed_dim = embeddings.shape
        else:
            seq_length, embed_dim = embeddings.shape
        batched = rank == 3

        if embed_dim != self.embedding_dim:
            raise ValueError(f"Embedding dim mismatch: expected {self.embedding_dim}, got {embed_dim}")

        if seq_length > self.max_seq_length:
            raise ValueError(f"Sequence length {seq_length} exceeds max {self.max_seq_length}")

        # Rows 0..seq_length-1 of the position table.
        looked_up = self.position_embedding.forward(list(range(seq_length)))

        offsets = looked_up.data
        if batched:
            # Prepend a length-1 axis so the rows broadcast over the batch.
            offsets = offsets[np.newaxis, :, :]

        return Tensor(embeddings.data + offsets)
        ### END SOLUTION

    def __call__(self, embeddings: Tensor) -> Tensor:
        """Make the class callable."""
        return self.forward(embeddings)
|
||
|
||
# %% ../../modules/12_embeddings/embeddings_dev.ipynb 18
|
||
import time
|
||
|
||
class EmbeddingProfiler:
    """
    Performance profiling toolkit for embedding systems.

    Measures lookup latency/throughput, table memory as a function of
    vocabulary size and embedding dimension, and compares the two positional
    encoding layers. All intervals are taken with ``time.perf_counter`` (a
    monotonic, high-resolution clock) because single lookups can finish
    faster than the resolution of ``time.time`` on some platforms.
    """

    def __init__(self):
        # Not written to by any method in this class.
        self.results = {}

    def measure_lookup_performance(self, embedding_layer: Embedding,
                                   batch_sizes: List[int], seq_lengths: List[int]):
        """
        Measure embedding lookup performance across batch sizes and sequence lengths.

        One random (batch_size, seq_length) index array is generated and
        looked up once for every combination.

        Args:
            embedding_layer: Layer to test; must expose ``vocab_size`` and
                ``forward(indices)`` returning an object with a ``.data`` array.
            batch_sizes: List of batch sizes to test.
            seq_lengths: List of sequence lengths to test.

        Returns:
            Dict keyed by "batch_{B}_seq_{S}". Each value holds
            'batch_size', 'seq_length', 'total_tokens', 'lookup_time_ms',
            'tokens_per_second', 'input_memory_mb', 'output_memory_mb',
            'memory_bandwidth_mb_s' and 'time_per_token_us'. Rate metrics are
            0 when the measured interval is zero.
        """
        ### BEGIN SOLUTION
        results = {}
        vocab_size = embedding_layer.vocab_size
        mb = 1024 * 1024

        for batch_size in batch_sizes:
            for seq_length in seq_lengths:
                # Create random token indices
                token_indices = np.random.randint(0, vocab_size, (batch_size, seq_length))

                # Time only the lookup itself
                start = time.perf_counter()
                embeddings = embedding_layer.forward(token_indices)
                elapsed = time.perf_counter() - start

                # Calculate metrics
                lookup_time_ms = elapsed * 1000
                total_tokens = batch_size * seq_length
                tokens_per_second = total_tokens / elapsed if elapsed > 0 else 0

                # Memory calculations
                input_memory_mb = token_indices.nbytes / mb
                output_memory_mb = embeddings.data.nbytes / mb
                memory_bandwidth_mb_s = (
                    (input_memory_mb + output_memory_mb) / elapsed if elapsed > 0 else 0
                )

                config_key = f"batch_{batch_size}_seq_{seq_length}"
                results[config_key] = {
                    'batch_size': batch_size,
                    'seq_length': seq_length,
                    'total_tokens': total_tokens,
                    'lookup_time_ms': lookup_time_ms,
                    'tokens_per_second': tokens_per_second,
                    'input_memory_mb': input_memory_mb,
                    'output_memory_mb': output_memory_mb,
                    'memory_bandwidth_mb_s': memory_bandwidth_mb_s,
                    'time_per_token_us': lookup_time_ms * 1000 / total_tokens if total_tokens > 0 else 0
                }

        return results
        ### END SOLUTION

    def analyze_memory_scaling(self, vocab_sizes: List[int], embedding_dims: List[int]):
        """
        Analyze how embedding memory usage scales with vocabulary size and embedding dimension.

        Builds an ``Embedding`` for every (vocab_size, embedding_dim) pair,
        prints one table row per pair, and, when both lists have more than one
        entry, prints the memory ratio between the smallest and largest value
        along each axis.

        Args:
            vocab_sizes: Vocabulary sizes to test.
            embedding_dims: Embedding dimensions to test.

        Returns:
            Dict keyed by "vocab_{V}_dim_{D}" with 'vocab_size',
            'embedding_dim', 'total_parameters', 'memory_mb' and
            'lookup_time_ms'.
        """
        print("📊 EMBEDDING MEMORY SCALING ANALYSIS")
        print("=" * 60)

        scaling_results = {}

        print(f"{'Vocab Size':<12} {'Embed Dim':<10} {'Parameters':<12} {'Memory (MB)':<12} {'Lookup Time':<12}")
        print("-" * 70)

        for vocab_size in vocab_sizes:
            for embed_dim in embedding_dims:
                # Create embedding layer
                embed = Embedding(vocab_size=vocab_size, embedding_dim=embed_dim)

                # Calculate memory usage
                memory_stats = embed.get_memory_usage()
                total_memory_mb = memory_stats['total_memory_mb']
                total_params = memory_stats['total_parameters']

                # Measure lookup time on a fixed 32 x 64 batch
                test_tokens = np.random.randint(0, vocab_size, (32, 64))
                start = time.perf_counter()
                embed.forward(test_tokens)
                lookup_time_ms = (time.perf_counter() - start) * 1000

                # Store results
                config_key = f"vocab_{vocab_size}_dim_{embed_dim}"
                scaling_results[config_key] = {
                    'vocab_size': vocab_size,
                    'embedding_dim': embed_dim,
                    'total_parameters': total_params,
                    'memory_mb': total_memory_mb,
                    'lookup_time_ms': lookup_time_ms
                }

                print(f"{vocab_size:<12,} {embed_dim:<10} {total_params:<12,} {total_memory_mb:<12.2f} {lookup_time_ms:<12.2f}")

        # Analyze scaling patterns
        print(f"\n📈 SCALING INSIGHTS:")
        if len(vocab_sizes) > 1 and len(embedding_dims) > 1:
            # Compare scaling with vocab size (fixed embedding dim)
            fixed_dim = embedding_dims[0]
            small_vocab = min(vocab_sizes)
            large_vocab = max(vocab_sizes)

            small_key = f"vocab_{small_vocab}_dim_{fixed_dim}"
            large_key = f"vocab_{large_vocab}_dim_{fixed_dim}"

            if small_key in scaling_results and large_key in scaling_results:
                vocab_ratio = large_vocab / small_vocab
                memory_ratio = scaling_results[large_key]['memory_mb'] / scaling_results[small_key]['memory_mb']
                print(f" Vocabulary scaling: {vocab_ratio:.1f}x vocab → {memory_ratio:.1f}x memory (Linear)")

            # Compare scaling with embedding dim (fixed vocab)
            fixed_vocab = vocab_sizes[0]
            small_dim = min(embedding_dims)
            large_dim = max(embedding_dims)

            small_key = f"vocab_{fixed_vocab}_dim_{small_dim}"
            large_key = f"vocab_{fixed_vocab}_dim_{large_dim}"

            if small_key in scaling_results and large_key in scaling_results:
                dim_ratio = large_dim / small_dim
                memory_ratio = scaling_results[large_key]['memory_mb'] / scaling_results[small_key]['memory_mb']
                print(f" Dimension scaling: {dim_ratio:.1f}x dim → {memory_ratio:.1f}x memory (Linear)")

        return scaling_results

    def compare_positional_encodings(self, seq_length: int = 100, embedding_dim: int = 256):
        """
        Compare performance and characteristics of different positional encoding approaches.

        Runs one forward pass of ``PositionalEncoding`` and one of
        ``LearnedPositionalEmbedding`` (both sized for ``seq_length * 2``
        positions) on the same random batch of 16 sequences and prints a
        comparison table.

        Args:
            seq_length: Length of the test sequences.
            embedding_dim: Embedding dimension of the test batch.

        Returns:
            Dict with 'sinusoidal' and 'learned' entries, each holding
            'computation_time_ms', 'memory_usage_mb', 'parameters',
            'deterministic' and 'extrapolation'.
        """
        print(f"\n🔍 POSITIONAL ENCODING COMPARISON")
        print("=" * 50)

        # Create test embeddings
        batch_size = 16
        embeddings = Tensor(np.random.randn(batch_size, seq_length, embedding_dim))

        # Test sinusoidal positional encoding (construction is not timed)
        sinusoidal_pe = PositionalEncoding(embedding_dim=embedding_dim, max_seq_length=seq_length*2)
        start = time.perf_counter()
        sinusoidal_pe.forward(embeddings)
        sin_time = (time.perf_counter() - start) * 1000

        # Test learned positional embedding (construction is not timed)
        learned_pe = LearnedPositionalEmbedding(max_seq_length=seq_length*2, embedding_dim=embedding_dim)
        start = time.perf_counter()
        learned_pe.forward(embeddings)
        learned_time = (time.perf_counter() - start) * 1000

        # Calculate memory usage
        sin_memory = 0  # No learnable parameters
        learned_memory = learned_pe.position_embedding.get_memory_usage()['total_memory_mb']

        results = {
            'sinusoidal': {
                'computation_time_ms': sin_time,
                'memory_usage_mb': sin_memory,
                'parameters': 0,
                'deterministic': True,
                'extrapolation': 'Good (can handle longer sequences)'
            },
            'learned': {
                'computation_time_ms': learned_time,
                'memory_usage_mb': learned_memory,
                'parameters': seq_length * 2 * embedding_dim,
                'deterministic': False,
                'extrapolation': 'Limited (fixed max sequence length)'
            }
        }

        print(f"📊 COMPARISON RESULTS:")
        print(f"{'Method':<12} {'Time (ms)':<10} {'Memory (MB)':<12} {'Parameters':<12} {'Extrapolation'}")
        print("-" * 70)
        print(f"{'Sinusoidal':<12} {sin_time:<10.2f} {sin_memory:<12.2f} {0:<12,} {'Good'}")
        print(f"{'Learned':<12} {learned_time:<10.2f} {learned_memory:<12.2f} {results['learned']['parameters']:<12,} {'Limited'}")

        print(f"\n💡 INSIGHTS:")
        print(f" - Sinusoidal: Zero parameters, deterministic, good extrapolation")
        print(f" - Learned: Requires parameters, model-specific, limited extrapolation")
        print(f" - Choice depends on: model capacity, sequence length requirements, extrapolation needs")

        return results
|
||
|
||
def analyze_embedding_system_design():
    """
    Print a fixed report on embedding-system design trade-offs.

    The report contains a parameter/memory table for three example model
    configurations (memory assumes 4 bytes per parameter), followed by
    static lists of design trade-offs and production considerations.
    Nothing is returned.
    """
    print("🏗️ EMBEDDING SYSTEM DESIGN ANALYSIS")
    print("=" * 60)

    # (name, vocab_size, embed_dim, seq_length) for each example model
    presets = (
        ('Small GPT', 10000, 256, 512),
        ('Medium GPT', 50000, 512, 1024),
        ('Large GPT', 50000, 1024, 2048),
    )

    print("📋 MODEL CONFIGURATION COMPARISON:")
    print(f"{'Model':<12} {'Vocab Size':<10} {'Embed Dim':<10} {'Seq Len':<8} {'Embed Params':<12} {'Memory (MB)'}")
    print("-" * 80)

    for label, vocab, dim, seq_len in presets:
        param_count = vocab * dim
        size_mb = param_count * 4 / (1024 * 1024)  # 4 bytes per float32
        row_head = f"{label:<12} {vocab:<10,} {dim:<10} "
        row_tail = f"{seq_len:<8} {param_count:<12,} {size_mb:<10.1f}"
        print(row_head + row_tail)

    # Static commentary, emitted one line per print call in order.
    trade_offs = (
        "\n🎯 DESIGN TRADE-OFFS:",
        " 1. Vocabulary Size:",
        " - Larger vocab: Better text coverage, more parameters",
        " - Smaller vocab: Longer sequences, more compute",
        " 2. Embedding Dimension:",
        " - Higher dim: More model capacity, more memory",
        " - Lower dim: Faster computation, potential bottleneck",
        " 3. Position Encoding:",
        " - Sinusoidal: No parameters, good extrapolation",
        " - Learned: Model-specific, limited to training length",
        " 4. Memory Scaling:",
        " - Embedding table: O(vocab_size × embed_dim)",
        " - Sequence processing: O(batch_size × seq_length × embed_dim)",
        " - Total memory dominated by model size, not embedding table",
    )
    production_notes = (
        "\n🏭 PRODUCTION CONSIDERATIONS:",
        " - GPU memory limits affect maximum embedding table size",
        " - Embedding lookup is memory-bandwidth bound",
        " - Vocabulary size affects tokenization and model download size",
        " - Position encoding choice affects sequence length flexibility",
    )
    for text in trade_offs + production_notes:
        print(text)
|