mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-05-23 12:25:50 -05:00
Major directory restructure to support both developer and learner workflows: Structure Changes: - NEW: src/ directory for Python source files (version controlled) - Files renamed: tensor.py → 01_tensor.py (matches directory naming) - All 20 modules moved from modules/ to src/ - CHANGED: modules/ now holds generated notebooks (gitignored) - Generated from src/*.py using jupytext - Learners work in notebooks, developers work in Python source - UNCHANGED: tinytorch/ package (still auto-generated from notebooks) Workflow: src/*.py → modules/*.ipynb → tinytorch/*.py Command Updates: - Updated export command to read from src/ and generate to modules/ - Export flow: discovers modules in src/, converts to notebooks in modules/, exports to tinytorch/ - All 20 modules tested and working Configuration: - Updated .gitignore to ignore modules/ directory - Updated README.md with new three-layer architecture explanation - Updated export.py source mappings and paths Benefits: - Clean separation: developers edit Python, learners use notebooks - Better version control: only Python source committed, notebooks generated - Flexible learning: can work in notebooks OR Python source - Maintains backward compatibility: tinytorch package unchanged Tested: - Single module export: tito export 01_tensor ✅ - All modules export: tito export --all ✅ - Package imports: from tinytorch.core.tensor import Tensor ✅ - 20/20 modules successfully converted and exported
684 lines
26 KiB
Python
Generated
684 lines
26 KiB
Python
Generated
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||
# ║ ║
|
||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||
# ║ ║
|
||
# ║ ✅ TO EDIT: src/20_capstone/20_capstone.py ║
|
||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||
# ║ ║
|
||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||
# ║ Editing it directly may break module functionality and training. ║
|
||
# ║ ║
|
||
# ║ 🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners) ║
|
||
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
|
||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||
# %% auto 0
# Public names exported by this module: the three capstone classes and the
# unit-test function that accompanies each of them.
__all__ = ['TinyGPT', 'test_unit_tinygpt_init', 'TinyGPTTrainer', 'test_unit_training_pipeline', 'CompleteTinyGPTPipeline',
           'test_unit_complete_pipeline']
|
||
|
||
# %% ../../modules/20_capstone/20_capstone.ipynb 2
|
||
#| default_exp applications.tinygpt
|
||
#| export
|
||
|
||
# %% ../../modules/20_capstone/20_capstone.ipynb 7
|
||
class TinyGPT:
    """
    GPT-style language model assembled from TinyTorch building blocks.

    The class shows how individual framework components compose into a real
    application. Modules 01, 02, 03, 11, 12 and 13 form the core architecture.

    Architecture:
    - Token Embeddings (Module 11)
    - Positional Encoding (Module 11)
    - Transformer Blocks (Module 13)
    - Output Linear Layer (Module 03)
    - Language Modeling Head (Module 04)

    NOTE(review): only ``__init__`` is defined in this class body, yet it calls
    ``self.count_parameters()``, and other code in this file calls
    ``model.forward(...)`` and ``model.generate(...)``. Those methods must be
    attached elsewhere - confirm before relying on this module standalone.
    """

    def __init__(self, vocab_size: int, embed_dim: int = 128, num_layers: int = 4,
                 num_heads: int = 4, max_seq_len: int = 256, dropout: float = 0.1):
        """
        Create every layer of the model and print a short size report.

        TODO: Build a complete GPT model using TinyTorch components

        APPROACH:
        1. Create token embeddings (vocab_size × embed_dim)
        2. Create positional encoding (max_seq_len × embed_dim)
        3. Build transformer layers using TransformerBlock
        4. Add output projection layer
        5. Calculate and report parameter count

        ARCHITECTURE DECISIONS:
        - embed_dim=128: Small enough for fast training, large enough for learning
        - num_layers=4: Sufficient depth without excessive memory
        - num_heads=4: Multi-head attention without head_dim being too small
        - max_seq_len=256: Reasonable context length for character-level modeling

        Args:
            vocab_size: Number of distinct token ids the model can embed and predict.
            embed_dim: Width of the embedding / hidden representation.
            num_layers: How many TransformerBlock instances are stacked.
            num_heads: Attention heads handed to each TransformerBlock.
            max_seq_len: Length handed to PositionalEncoding.
            dropout: Probability handed to the Dropout layer.

        EXAMPLE (the printed count is illustrative):
        >>> model = TinyGPT(vocab_size=50, embed_dim=128, num_layers=4)
        >>> print(f"Parameters: {model.count_parameters():,}")
        Parameters: 1,234,567

        HINTS:
        - Use Embedding class for token embeddings
        - Use PositionalEncoding for position information
        - Stack TransformerBlock instances in a list
        - Final Linear layer maps embed_dim → vocab_size
        """
        ### BEGIN SOLUTION
        # Keep the hyperparameters around for later inspection
        self.vocab_size, self.embed_dim = vocab_size, embed_dim
        self.num_layers, self.num_heads = num_layers, num_heads
        self.max_seq_len, self.dropout = max_seq_len, dropout

        # Layers are created in data-flow order: ids -> vectors -> positions
        self.token_embedding = Embedding(vocab_size, embed_dim)
        self.positional_encoding = PositionalEncoding(max_seq_len, embed_dim)

        # The transformer stack does the core processing
        self.transformer_blocks = [
            TransformerBlock(embed_dim, num_heads, mlp_ratio=4.0)
            for _ in range(num_layers)
        ]

        # Projection from hidden width back to one score per vocabulary entry
        self.output_projection = Linear(embed_dim, vocab_size)

        # Regularization layer
        self.dropout_layer = Dropout(dropout)

        # Cache the parameter count and report it for systems analysis
        total_params = self.count_parameters()
        self._param_count = total_params
        print(f"🏗️ TinyGPT initialized: {total_params:,} parameters")
        print(f"📐 Architecture: {num_layers}L/{num_heads}H/{embed_dim}D")
        print(f"💾 Estimated memory: {total_params * BYTES_PER_FLOAT32 / MB_TO_BYTES:.1f}MB")
        ### END SOLUTION
|
||
|
||
def test_unit_tinygpt_init():
    """🔬 Check that TinyGPT builds all of its components and reports a sane size."""
    print("🔬 Unit Test: TinyGPT Initialization...")

    # A deliberately tiny configuration keeps the test fast
    small_config = dict(vocab_size=50, embed_dim=64, num_layers=2, num_heads=2, max_seq_len=128)
    model = TinyGPT(**small_config)

    # Every architectural component must be present on the instance
    for component in ('token_embedding', 'positional_encoding',
                      'transformer_blocks', 'output_projection'):
        assert hasattr(model, component)
    assert len(model.transformer_blocks) == 2

    # The parameter count must be positive, and small for this configuration
    n_params = model.count_parameters()
    assert n_params > 0
    assert n_params < 1000000  # Sanity check for small model

    print(f"✅ Model created with {n_params:,} parameters")
    print("✅ TinyGPT initialization works correctly!")
|
||
|
||
# Run the initialization test right away when this file is executed as a
# script (it is skipped when the module is imported).
if __name__ == "__main__":
    test_unit_tinygpt_init()
|
||
|
||
# %% ../../modules/20_capstone/20_capstone.ipynb 10
|
||
class TinyGPTTrainer:
    """
    Training harness that wires a TinyGPT model to an optimizer, a loss and a scheduler.

    Uses modules 05 (autograd), 06 (optimizers), 07 (training) for end-to-end training.
    """

    def __init__(self, model: TinyGPT, tokenizer: CharTokenizer,
                 learning_rate: float = 3e-4, weight_decay: float = 0.01):
        """
        Initialize trainer with model and optimization components.

        TODO: Set up complete training infrastructure

        APPROACH:
        1. Store model and tokenizer references
        2. Initialize AdamW optimizer (standard for transformers)
        3. Initialize loss function (CrossEntropyLoss for language modeling)
        4. Set up learning rate scheduler (cosine schedule)
        5. Initialize training metrics tracking

        PRODUCTION CHOICES:
        - AdamW: Better generalization than Adam (weight decay)
        - learning_rate=3e-4: Standard for small transformers
        - Cosine schedule: Smooth learning rate decay
        - CrossEntropy: Standard for classification/language modeling

        Args:
            model: Model to optimize. Its ``token_embedding``,
                ``transformer_blocks`` and ``output_projection`` attributes
                are read here.
            tokenizer: Object whose ``encode(text)`` is used by ``prepare_batch``.
            learning_rate: Learning rate given to AdamW; the scheduler's
                ``min_lr`` is 10% of this value.
            weight_decay: Weight decay given to AdamW.

        EXAMPLE:
        >>> model = TinyGPT(vocab_size=100)
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> trainer = TinyGPTTrainer(model, tokenizer)
        >>> print("Trainer ready for training")
        Trainer ready for training

        HINTS:
        - AdamW with weight_decay gives better generalization
        - CrossEntropyLoss handles the language modeling objective
        """
        ### BEGIN SOLUTION
        self.model = model
        self.tokenizer = tokenizer

        # Collect all trainable parameters
        # NOTE(review): model.positional_encoding is not included here; if it
        # holds learnable parameters they are never updated - confirm intent.
        all_params = []
        all_params.extend(model.token_embedding.parameters())
        for block in model.transformer_blocks:
            all_params.extend(block.parameters())
        all_params.extend(model.output_projection.parameters())

        # Initialize optimizer (AdamW for transformers)
        self.optimizer = AdamW(
            params=all_params,
            lr=learning_rate,
            weight_decay=weight_decay,
            betas=(0.9, 0.95)  # Standard for language models
        )

        # Loss function for next token prediction
        self.loss_fn = CrossEntropyLoss()

        # Learning rate scheduler
        self.scheduler = CosineSchedule(
            optimizer=self.optimizer,
            max_epochs=100,  # Will adjust based on actual training
            min_lr=learning_rate * 0.1
        )

        # Training metrics
        self.training_history = {
            'losses': [],
            'perplexities': [],
            'learning_rates': [],
            'epoch': 0
        }

        print("🚀 Trainer initialized:")
        print(f" Optimizer: AdamW (lr={learning_rate}, wd={weight_decay})")
        print(f" Parameters: {len(all_params):,} tensors")
        print(" Loss: CrossEntropyLoss")
        ### END SOLUTION

    def prepare_batch(self, text_batch: List[str], max_length: int = 128) -> Tuple[Tensor, Tensor]:
        """
        Convert text batch to input/target tensors for language modeling.

        TODO: Implement text-to-tensor conversion with proper targets

        APPROACH:
        1. Tokenize each text in the batch
        2. Pad/truncate to a window of max_length + 1 tokens
        3. input_ids = window without its last token,
           target_ids = window without its first token
        4. Convert to Tensor format

        LANGUAGE MODELING OBJECTIVE:
        - Input:  [token1, token2, token3, token4]
        - Target: [token2, token3, token4, token5]
        - Model predicts next token at each position

        Args:
            text_batch: Texts to encode with ``self.tokenizer.encode``.
            max_length: Number of positions in each returned sequence.

        Returns:
            ``(input_ids, target_ids)``, both built from integer arrays of
            shape ``(len(text_batch), max_length)``. ``target_ids[b, i]`` is
            the token that follows ``input_ids[b, i]`` in the text, or the
            pad id 0 when the text has no further token.

        EXAMPLE:
        >>> trainer = TinyGPTTrainer(model, tokenizer)
        >>> texts = ["hello world", "ai is fun"]
        >>> inputs, targets = trainer.prepare_batch(texts)
        >>> print(inputs.shape, targets.shape)
        (2, 128) (2, 128)

        HINTS:
        - Use tokenizer.encode() for text → token conversion
        - Pad shorter sequences with the pad id (0 is used here)
        - Target sequence is the input sequence shifted LEFT by 1
        """
        ### BEGIN SOLUTION
        pad_id = 0  # Pad with special token (use 0 as pad)

        # One extra column lets the final input position keep a real target
        # whenever the text is long enough. The previous np.roll approach
        # wrapped the FIRST token of each row into the LAST target slot.
        window = np.full((len(text_batch), max_length + 1), pad_id, dtype=np.int64)
        for row, text in enumerate(text_batch):
            # Slicing copies, so the tokenizer's own list is never mutated
            tokens = self.tokenizer.encode(text)[:max_length + 1]
            window[row, :len(tokens)] = tokens

        # Inputs drop the last column, targets drop the first (next-token shift)
        input_ids = Tensor(window[:, :-1].copy())   # (batch_size, seq_len)
        target_ids = Tensor(window[:, 1:].copy())   # (batch_size, seq_len)

        return input_ids, target_ids
        ### END SOLUTION

    def train_step(self, input_ids: Tensor, target_ids: Tensor) -> float:
        """
        Single training step with forward, backward, and optimization.

        TODO: Implement complete training step

        APPROACH:
        1. Zero gradients from previous step
        2. Forward pass to get logits
        3. Compute loss between logits and targets
        4. Backward pass to compute gradients
        5. Optimizer step to update parameters
        6. Return loss value for monitoring

        MEMORY MANAGEMENT:
        During training, memory use is a multiple of the model size:
        - 1× for parameters
        - 1× for gradients
        - optimizer states on top of that (NOTE(review): Adam-style
          optimizers usually keep two moment buffers, i.e. 2× - verify
          against the AdamW implementation in Module 06)

        Args:
            input_ids: Token ids passed to ``self.model.forward``.
            target_ids: Next-token ids. Must hold exactly
                ``batch * seq_len`` elements because it is flattened to
                match the flattened logits.

        Returns:
            The loss of this step as a Python float.

        EXAMPLE:
        >>> loss = trainer.train_step(input_ids, target_ids)
        >>> print(f"Training loss: {loss:.4f}")
        Training loss: 2.3456

        HINTS:
        - Always zero_grad() before forward pass
        - Loss should be computed on flattened logits and targets
        - Call backward() on the loss tensor
        """
        ### BEGIN SOLUTION
        # Zero gradients from previous step
        self.optimizer.zero_grad()

        # Forward pass
        logits = self.model.forward(input_ids)  # (batch, seq_len, vocab_size)

        # Reshape for loss computation: one row of logits per position
        batch_size, seq_len, vocab_size = logits.shape
        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
        targets_flat = target_ids.reshape(batch_size * seq_len)

        # Compute loss
        loss = self.loss_fn.forward(logits_flat, targets_flat)

        # Backward pass
        loss.backward()

        # Optimizer step
        self.optimizer.step()

        # Return scalar loss for monitoring
        # loss.data is numpy array - float() handles conversion automatically
        return float(loss.data)
        ### END SOLUTION
|
||
|
||
def test_unit_training_pipeline():
    """🔬 Exercise batch preparation and two optimization steps of the trainer."""
    print("🔬 Unit Test: Training Pipeline...")

    # Small model plus a six-character tokenizer keep the test quick
    model = TinyGPT(vocab_size=50, embed_dim=32, num_layers=2, num_heads=2)
    alphabet = ['a', 'b', 'c', 'd', 'e', ' ']
    tokenizer = CharTokenizer(alphabet)
    trainer = TinyGPTTrainer(model, tokenizer, learning_rate=1e-3)

    # Batch preparation must give two rows of eight positions each
    input_ids, target_ids = trainer.prepare_batch(["hello", "world"], max_length=8)
    for ids in (input_ids, target_ids):
        assert ids.shape == (2, 8), f"Expected (2, 8), got {ids.shape}"

    # First optimization step
    initial_loss = trainer.train_step(input_ids, target_ids)
    assert initial_loss > 0, "Loss should be positive"

    # Second step should work (gradients computed and applied)
    second_loss = trainer.train_step(input_ids, target_ids)
    assert second_loss > 0, "Second loss should also be positive"

    print(f"✅ Batch preparation shape: {input_ids.shape}")
    print(f"✅ Initial loss: {initial_loss:.4f}")
    print(f"✅ Second loss: {second_loss:.4f}")
    print("✅ Training pipeline works correctly!")
|
||
|
||
# Run the training-pipeline test right away when this file is executed as a
# script (it is skipped when the module is imported).
if __name__ == "__main__":
    test_unit_training_pipeline()
|
||
|
||
# %% ../../modules/20_capstone/20_capstone.ipynb 14
|
||
class CompleteTinyGPTPipeline:
    """
    End-to-end ML pipeline demonstrating integration of all 19 modules.

    Pipeline stages:
    1. Data preparation (Module 10: Tokenization)
    2. Model creation (Modules 01-04, 11-13: Architecture)
    3. Training setup (Modules 05-07: Optimization)
    4. Training loop (Module 08: DataLoader)
    5. Optimization (Modules 17-18: Quantization, Pruning)
    6. Evaluation (Module 19: Benchmarking)
    7. Generation (Module 14: KV Caching)
    """

    def __init__(self, vocab_size: int = 100, embed_dim: int = 128,
                 num_layers: int = 4, num_heads: int = 4):
        """
        Initialize complete end-to-end TinyGPT pipeline integrating all 19 modules.

        TODO: Set up a complete ML pipeline with tokenization, model, training,
        profiling, and benchmarking components

        APPROACH:
        1. Store model architecture parameters (vocab_size, embed_dim, num_layers, num_heads)
        2. Initialize tokenizer using CharTokenizer from Module 10 with printable ASCII (32-126)
        3. Create TinyGPT model instance with stored parameters and max_seq_len=256
        4. Setup TinyGPTTrainer for training orchestration with learning_rate=3e-4
        5. Initialize Profiler (Module 15) and Benchmark (Module 19) for performance analysis
        6. Initialize pipeline state tracking (is_trained flag, training_history)
        7. Print pipeline initialization summary with parameter count and memory usage

        Args:
            vocab_size: Vocabulary size handed to TinyGPT.
            embed_dim: Embedding width handed to TinyGPT.
            num_layers: Number of transformer blocks handed to TinyGPT.
            num_heads: Attention heads handed to TinyGPT.

        NOTE(review): the tokenizer is always built from the 95 printable
        ASCII characters, independent of ``vocab_size``. If the tokenizer can
        emit ids >= vocab_size (e.g. vocab_size=50), embedding lookups may go
        out of range - confirm against CharTokenizer.

        EXAMPLE (the printed numbers are illustrative):
        >>> pipeline = CompleteTinyGPTPipeline(vocab_size=100, embed_dim=128,
        ...                                    num_layers=4, num_heads=4)
        🏗️ Complete TinyGPT Pipeline Initialized
         Model: 419,300 parameters
         Memory: 1.6MB
        >>> pipeline.is_trained
        False
        >>> len(pipeline.training_history)
        0

        HINTS:
        - CharTokenizer needs list of characters: [chr(i) for i in range(32, 127)]
        - TinyGPT requires vocab_size, embed_dim, num_layers, num_heads, max_seq_len
        - TinyGPTTrainer takes model, tokenizer, and learning_rate as arguments
        - Benchmark expects (models_list, datasets_list, metrics_list) format
        - Memory calculation: parameters * 4 bytes / 1024 / 1024 for MB
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads

        # Stage 1: Initialize tokenizer (Module 10)
        self.tokenizer = CharTokenizer([chr(i) for i in range(32, 127)])  # Printable ASCII

        # Stage 2: Create model (Modules 01-04, 11-13)
        self.model = TinyGPT(
            vocab_size=vocab_size,
            embed_dim=embed_dim,
            num_layers=num_layers,
            num_heads=num_heads,
            max_seq_len=256
        )

        # Stage 3: Setup training (Modules 05-07)
        self.trainer = TinyGPTTrainer(self.model, self.tokenizer, learning_rate=3e-4)

        # Stage 4: Initialize profiler and benchmark (Modules 15, 19)
        self.profiler = Profiler()
        self.benchmark = Benchmark([self.model], [], ["perplexity", "latency"])

        # Pipeline state (training_history becomes the dict returned by train())
        self.is_trained = False
        self.training_history = []

        # Count parameters once and reuse the value in both report lines
        param_count = self.model.count_parameters()
        print("🏗️ Complete TinyGPT Pipeline Initialized")
        print(f" Model: {param_count:,} parameters")
        print(f" Memory: {param_count * 4 / 1024 / 1024:.1f}MB")
        ### END SOLUTION

    def prepare_training_data(self, text_corpus: List[str], batch_size: int = 8) -> DataLoader:
        """
        Prepare training data using DataLoader (Module 08).

        TODO: Create DataLoader for training text data

        APPROACH:
        1. Tokenize all texts in corpus
        2. Create sliding-window input/target pairs for language modeling
        3. Package into TensorDataset
        4. Create DataLoader with batching and shuffling

        Every prefix of a text (limited to its last 32 tokens) becomes one
        example, left-padded with id 0 to 32 positions. The target row has
        the same shape and holds the next token for every input position;
        its last column is the token that follows the prefix. Targets have
        to be per-position because ``TinyGPTTrainer.train_step`` flattens
        them to ``batch * seq_len`` entries.

        Args:
            text_corpus: Texts to tokenize; texts with fewer than 2 tokens are skipped.
            batch_size: Batch size handed to the DataLoader.

        Returns:
            A shuffling DataLoader over ``(inputs, targets)``, both built from
            integer arrays of shape ``(num_examples, 32)``.

        EXAMPLE (assumes the tokenizer emits one token per character - TODO confirm):
        >>> pipeline = CompleteTinyGPTPipeline()
        >>> corpus = ["hello world", "ai is amazing"]
        >>> dataloader = pipeline.prepare_training_data(corpus, batch_size=2)
        >>> print(f"Batches: {len(dataloader)}")
        Batches: 11
        """
        ### BEGIN SOLUTION
        max_len = 32  # Reasonable context window
        pad_id = 0    # Id used for left padding

        # Tokenize and prepare training pairs
        input_sequences = []
        target_sequences = []

        for text in text_corpus:
            tokens = self.tokenizer.encode(text)
            if len(tokens) < 2:
                continue  # Skip very short texts

            # Sliding window: `end` is the index of the token to predict last
            for end in range(1, len(tokens)):
                start = max(0, end - max_len)
                context = list(tokens[start:end])
                following = list(tokens[start + 1:end + 1])

                # Left-pad both rows identically so positions stay aligned
                padding = [pad_id] * (max_len - len(context))
                input_sequences.append(padding + context)
                target_sequences.append(padding + following)

        # Convert to tensors; reshape keeps a (0, max_len) shape for an empty corpus
        inputs = Tensor(np.array(input_sequences, dtype=np.int64).reshape(-1, max_len))
        targets = Tensor(np.array(target_sequences, dtype=np.int64).reshape(-1, max_len))

        # Create dataset and dataloader
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        print(f"📚 Training data prepared: {len(dataset)} examples, {len(dataloader)} batches")
        return dataloader
        ### END SOLUTION

    def train(self, dataloader: DataLoader, epochs: int = 10) -> Dict[str, List[float]]:
        """
        Complete training loop with monitoring.

        TODO: Implement full training with progress tracking

        APPROACH:
        1. Loop through epochs
        2. For each batch: forward, backward, optimize (via trainer.train_step)
        3. Track loss and perplexity
        4. Update learning rate schedule
        5. Return training history

        Args:
            dataloader: Iterable yielding ``(inputs, targets)`` batches.
            epochs: Number of passes over ``dataloader``. With ``epochs <= 0``
                nothing is trained, the pipeline state is left untouched and
                an empty history is returned.

        Returns:
            Dict with per-epoch lists under ``'losses'``, ``'perplexities'``
            and ``'epochs'``. An epoch that saw no batches records ``nan``.

        EXAMPLE:
        >>> history = pipeline.train(dataloader, epochs=5)
        >>> print(f"Final loss: {history['losses'][-1]:.4f}")
        Final loss: 1.2345
        """
        ### BEGIN SOLUTION
        history = {'losses': [], 'perplexities': [], 'epochs': []}

        # Guard: the final summary indexes history[...][-1], which would
        # raise IndexError if no epoch ran.
        if epochs <= 0:
            print("⚠️ No epochs requested; skipping training.")
            return history

        print(f"🚀 Starting training for {epochs} epochs...")

        for epoch in range(epochs):
            epoch_losses = []

            for batch_idx, (inputs, targets) in enumerate(dataloader):
                # Training step
                loss = self.trainer.train_step(inputs, targets)
                epoch_losses.append(loss)

                # Log progress
                if batch_idx % 10 == 0:
                    perplexity = np.exp(loss)
                    print(f" Epoch {epoch+1}/{epochs}, Batch {batch_idx}: "
                          f"Loss={loss:.4f}, PPL={perplexity:.2f}")

            # Epoch summary (plain floats; nan when the loader yielded nothing)
            avg_loss = float(np.mean(epoch_losses)) if epoch_losses else float('nan')
            avg_perplexity = float(np.exp(avg_loss))

            history['losses'].append(avg_loss)
            history['perplexities'].append(avg_perplexity)
            history['epochs'].append(epoch + 1)

            # Update learning rate
            self.trainer.scheduler.step()

            print(f"✅ Epoch {epoch+1} complete: Loss={avg_loss:.4f}, PPL={avg_perplexity:.2f}")

        self.is_trained = True
        self.training_history = history
        print(f"🎉 Training complete! Final perplexity: {history['perplexities'][-1]:.2f}")

        return history
        ### END SOLUTION

    def optimize_model(self, quantize: bool = True, prune_sparsity: float = 0.0):
        """
        Apply optimization techniques (Modules 17-18).

        The optimizations are simulated: the model is not modified, only the
        expected memory footprint is computed and printed.

        TODO: Apply quantization and pruning optimizations

        APPROACH:
        1. Optionally apply quantization to reduce precision
        2. Optionally apply pruning to remove weights
        3. Measure size reduction

        Args:
            quantize: Account for INT8 quantization (4× smaller than FP32).
            prune_sparsity: Fraction of weights removed, in ``[0, 1]``.
                ``0`` disables pruning.

        Raises:
            ValueError: If ``prune_sparsity`` is outside ``[0, 1]``.

        EXAMPLE (memory figures are illustrative):
        >>> pipeline.optimize_model(quantize=True, prune_sparsity=0.5)
         Applied INT8 quantization
         Applied 50% magnitude pruning
        🔧 Model optimization complete:
         Original: 1.6MB
         Optimized: 0.2MB
         Reduction: 8.0× smaller
         Applied: INT8 quantization (4× memory reduction), 50% pruning (50% weights remain)
        """
        ### BEGIN SOLUTION
        # Also rejects NaN, for which both comparisons are False
        if not 0.0 <= prune_sparsity <= 1.0:
            raise ValueError(f"prune_sparsity must be in [0, 1], got {prune_sparsity}")

        original_params = self.model.count_parameters()
        original_memory = original_params * 4 / (1024 * 1024)

        optimizations_applied = []
        size_reduction = 1.0  # Fraction of the original footprint that remains

        if quantize:
            # Apply quantization (simulated)
            # In real implementation, would use quantize_model()
            size_reduction *= 0.25  # INT8 vs FP32: 4× smaller
            optimizations_applied.append("INT8 quantization (4× memory reduction)")
            print(" Applied INT8 quantization")

        if prune_sparsity > 0:
            # Apply pruning (simulated)
            # In real implementation, would use magnitude_prune()
            remaining_weights = 1 - prune_sparsity
            size_reduction *= remaining_weights
            optimizations_applied.append(f"{prune_sparsity:.0%} pruning ({remaining_weights:.0%} weights remain)")
            print(f" Applied {prune_sparsity:.0%} magnitude pruning")

        # Calculate final size. Full sparsity leaves nothing, so the
        # reduction is reported as infinite instead of dividing by zero.
        final_memory = original_memory * size_reduction
        reduction_factor = 1.0 / size_reduction if size_reduction > 0 else float('inf')

        print("🔧 Model optimization complete:")
        print(f" Original: {original_memory:.1f}MB")
        print(f" Optimized: {final_memory:.1f}MB")
        print(f" Reduction: {reduction_factor:.1f}× smaller")
        print(f" Applied: {', '.join(optimizations_applied) or 'none'}")
        ### END SOLUTION

    def generate_text(self, prompt: str, max_tokens: int = 50) -> str:
        """
        Generate text using the trained model.

        TODO: Implement text generation with proper encoding/decoding

        APPROACH:
        1. Encode prompt to token IDs
        2. Use model.generate() for autoregressive generation
        3. Decode generated tokens back to text
        4. Return generated text

        Args:
            prompt: Text that is encoded and used as the start of generation.
            max_tokens: Passed to ``model.generate`` as ``max_new_tokens``.

        Returns:
            The decoded first row of the tensor returned by ``model.generate``.
            (Presumably prompt plus continuation - verify against generate().)

        EXAMPLE:
        >>> text = pipeline.generate_text("Hello", max_tokens=10)
        >>> print(f"Generated: {text}")
        Generated: Hello world this is AI
        """
        ### BEGIN SOLUTION
        if not self.is_trained:
            print("⚠️ Model not trained yet. Generating with random weights.")

        # Encode prompt as a batch holding a single sequence
        prompt_tokens = self.tokenizer.encode(prompt)
        prompt_tensor = Tensor([prompt_tokens])

        # Generate tokens
        generated_tokens = self.model.generate(
            prompt_tensor,
            max_new_tokens=max_tokens,
            temperature=0.8,
            use_cache=True
        )

        # Decode to text
        all_tokens = generated_tokens.data[0].tolist()
        generated_text = self.tokenizer.decode(all_tokens)

        return generated_text
        ### END SOLUTION
|
||
|
||
def test_unit_complete_pipeline():
    """🔬 Walk the whole pipeline once: data, training, optimization, generation."""
    print("🔬 Unit Test: Complete Pipeline Integration...")

    # Build a small pipeline
    # NOTE(review): vocab_size=50 is smaller than the 95-character alphabet the
    # pipeline gives its tokenizer - confirm token ids stay below 50.
    pipeline = CompleteTinyGPTPipeline(vocab_size=50, embed_dim=32, num_layers=2)

    # Stage: data preparation
    sample_corpus = ["hello world", "ai is fun", "machine learning"]
    dataloader = pipeline.prepare_training_data(sample_corpus, batch_size=2)
    assert len(dataloader) > 0, "DataLoader should have batches"

    # Stage: training (one epoch is enough here)
    history = pipeline.train(dataloader, epochs=1)
    assert 'losses' in history, "History should contain losses"
    assert len(history['losses']) == 1, "Should have one epoch of losses"

    # Stage: optimization
    pipeline.optimize_model(quantize=True, prune_sparsity=0.5)

    # Stage: generation
    generated = pipeline.generate_text("hello", max_tokens=5)
    assert isinstance(generated, str), "Generated output should be string"
    assert len(generated) > 0, "Generated text should not be empty"

    print("✅ Pipeline stages completed successfully")
    print(f"✅ Training history: {len(history['losses'])} epochs")
    print(f"✅ Generated text: '{generated[:20]}...'")
    print("✅ Complete pipeline integration works!")
|
||
|
||
# Run the end-to-end pipeline test right away when this file is executed as a
# script (it is skipped when the module is imported).
if __name__ == "__main__":
    test_unit_complete_pipeline()
|