# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║                        🚨 CRITICAL WARNING 🚨                                 ║
# ║                     AUTOGENERATED! DO NOT EDIT!                               ║
# ║                                                                               ║
# ║  This file is AUTOMATICALLY GENERATED from source modules.                    ║
# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!             ║
# ║                                                                               ║
# ║  ✅ TO EDIT: src/XX_tinygpt/XX_tinygpt.py                                     ║
# ║  ✅ TO EXPORT: Run 'tito module complete '                                    ║
# ║                                                                               ║
# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.         ║
# ║     Editing it directly may break module functionality and training.          ║
# ║                                                                               ║
# ║  🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners)            ║
# ║     The tinytorch/ directory is generated code - edit source files instead!   ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['TinyGPT', 'test_unit_tinygpt_init', 'TinyGPTTrainer', 'test_unit_training_pipeline',
           'CompleteTinyGPTPipeline', 'test_unit_complete_pipeline']

# %% ../../modules/20_capstone/20_capstone.ipynb 2
#| default_exp applications.tinygpt
#| export
from typing import Dict, List, Tuple

import numpy as np

# NOTE(review): this export carries no imports for the TinyTorch building blocks
# used below (Tensor, Embedding, PositionalEncoding, TransformerBlock, Linear,
# Dropout, CharTokenizer, AdamW, CrossEntropyLoss, CosineSchedule, TensorDataset,
# DataLoader, Profiler, Benchmark). Their module paths are not visible from this
# file, so they are not guessed here - add them to the notebook's export cell.
# Annotations that name those types are quoted so that importing this module
# does not itself require them to be bound yet.

# Size bookkeeping shared by every memory estimate in this module.
BYTES_PER_FLOAT32 = 4
MB_TO_BYTES = 1024 * 1024


# %% ../../modules/20_capstone/20_capstone.ipynb 7
class TinyGPT:
    """
    Complete GPT implementation integrating all TinyTorch modules.

    This class demonstrates how framework components compose into real
    applications. Built using modules 01,02,03,11,12,13 as core architecture.

    Architecture:
    - Token Embeddings (Module 11)
    - Positional Encoding (Module 11)
    - Transformer Blocks (Module 13)
    - Output Linear Layer (Module 03)
    - Language Modeling Head (Module 04)

    NOTE(review): TinyGPTTrainer calls ``model.forward(...)`` and
    CompleteTinyGPTPipeline calls ``model.generate(...)``, but neither method is
    defined in this export. They must be supplied by the source module.
    """

    def __init__(self, vocab_size: int, embed_dim: int = 128, num_layers: int = 4,
                 num_heads: int = 4, max_seq_len: int = 256, dropout: float = 0.1):
        """
        Initialize TinyGPT with production-inspired architecture.

        TODO: Build a complete GPT model using TinyTorch components

        APPROACH:
        1. Create token embeddings (vocab_size × embed_dim)
        2. Create positional encoding (max_seq_len × embed_dim)
        3. Build transformer layers using TransformerBlock
        4. Add output projection layer
        5. Calculate and report parameter count

        ARCHITECTURE DECISIONS:
        - embed_dim=128: Small enough for fast training, large enough for learning
        - num_layers=4: Sufficient depth without excessive memory
        - num_heads=4: Multi-head attention without head_dim being too small
        - max_seq_len=256: Reasonable context length for character-level modeling

        EXAMPLE:
        >>> model = TinyGPT(vocab_size=50, embed_dim=128, num_layers=4)
        >>> print(f"Parameters: {model.count_parameters():,}")  # value is illustrative
        Parameters: 1,234,567

        HINTS:
        - Use Embedding class for token embeddings
        - Use PositionalEncoding for position information
        - Stack TransformerBlock instances in a list
        - Final Linear layer maps embed_dim → vocab_size
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len
        self.dropout = dropout

        # Token embeddings: convert token IDs to dense vectors
        self.token_embedding = Embedding(vocab_size, embed_dim)

        # Positional encoding: add position information
        self.positional_encoding = PositionalEncoding(max_seq_len, embed_dim)

        # Transformer layers: core processing
        self.transformer_blocks = [
            TransformerBlock(embed_dim, num_heads, mlp_ratio=4.0)
            for _ in range(num_layers)
        ]

        # Output projection: map back to vocabulary
        self.output_projection = Linear(embed_dim, vocab_size)

        # Dropout for regularization
        self.dropout_layer = Dropout(dropout)

        # Calculate parameter count for systems analysis
        self._param_count = self.count_parameters()
        print(f"🏗️ TinyGPT initialized: {self._param_count:,} parameters")
        print(f"📐 Architecture: {num_layers}L/{num_heads}H/{embed_dim}D")
        print(f"💾 Estimated memory: {self._param_count * BYTES_PER_FLOAT32 / MB_TO_BYTES:.1f}MB")
        ### END SOLUTION

    def parameters(self) -> list:
        """
        Return the trainable tensors of the model as a flat list.

        Collects ``parameters()`` from the token embedding, every transformer
        block and the output projection, in that order.

        NOTE(review): ``positional_encoding`` is not included because this file
        does not show whether it exposes ``parameters()``; if it is a learned
        encoding it should be added here so that it gets optimized.
        """
        collected = []
        collected.extend(self.token_embedding.parameters())
        for block in self.transformer_blocks:
            collected.extend(block.parameters())
        collected.extend(self.output_projection.parameters())
        return collected

    def count_parameters(self) -> int:
        """
        Return the total number of scalar values across ``parameters()``.

        Each parameter is assumed to expose its values as ``.data`` (the same
        attribute this module reads on Tensor objects elsewhere).
        """
        return sum(int(np.size(param.data)) for param in self.parameters())


def test_unit_tinygpt_init():
    """🔬 Test TinyGPT initialization and parameter counting."""
    print("🔬 Unit Test: TinyGPT Initialization...")

    # Create a small model for testing
    model = TinyGPT(vocab_size=50, embed_dim=64, num_layers=2, num_heads=2, max_seq_len=128)

    # Verify architecture components exist
    assert hasattr(model, 'token_embedding')
    assert hasattr(model, 'positional_encoding')
    assert hasattr(model, 'transformer_blocks')
    assert hasattr(model, 'output_projection')
    assert len(model.transformer_blocks) == 2

    # Verify parameter count is reasonable
    param_count = model.count_parameters()
    assert param_count > 0
    assert param_count < 1000000  # Sanity check for small model

    print(f"✅ Model created with {param_count:,} parameters")
    print("✅ TinyGPT initialization works correctly!")


# Run immediate test when developing this module
if __name__ == "__main__":
    test_unit_tinygpt_init()


# %% ../../modules/20_capstone/20_capstone.ipynb 10
class TinyGPTTrainer:
    """
    Complete training pipeline integrating optimizers, schedulers, and monitoring.

    Uses modules 05 (autograd), 06 (optimizers), 07 (training) for end-to-end training.
    """

    # Token id used to fill short sequences (and the target of the final position).
    PAD_TOKEN_ID = 0

    def __init__(self, model: TinyGPT, tokenizer: "CharTokenizer",
                 learning_rate: float = 3e-4, weight_decay: float = 0.01):
        """
        Initialize trainer with model and optimization components.

        TODO: Set up complete training infrastructure

        APPROACH:
        1. Store model and tokenizer references
        2. Initialize AdamW optimizer (standard for transformers)
        3. Initialize loss function (CrossEntropyLoss for language modeling)
        4. Set up learning rate scheduler (cosine schedule)
        5. Initialize training metrics tracking

        PRODUCTION CHOICES:
        - AdamW: Better generalization than Adam (weight decay)
        - learning_rate=3e-4: Standard for small transformers
        - Cosine schedule: Smooth learning rate decay
        - CrossEntropy: Standard for classification/language modeling

        EXAMPLE:
        >>> model = TinyGPT(vocab_size=100)
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> trainer = TinyGPTTrainer(model, tokenizer)
        >>> print("Trainer ready for training")
        Trainer ready for training

        HINTS:
        - Get all model parameters with model.parameters()
        - Use AdamW with weight_decay for better generalization
        - CrossEntropyLoss handles the language modeling objective
        """
        ### BEGIN SOLUTION
        self.model = model
        self.tokenizer = tokenizer

        # Single source of truth for what gets optimized: the model itself.
        all_params = list(model.parameters())

        # Initialize optimizer (AdamW for transformers)
        self.optimizer = AdamW(
            params=all_params,
            lr=learning_rate,
            weight_decay=weight_decay,
            betas=(0.9, 0.95)  # Standard for language models
        )

        # Loss function for next token prediction
        self.loss_fn = CrossEntropyLoss()

        # Learning rate scheduler
        self.scheduler = CosineSchedule(
            optimizer=self.optimizer,
            max_epochs=100,  # Will adjust based on actual training
            min_lr=learning_rate * 0.1
        )

        # Training metrics
        self.training_history = {
            'losses': [],
            'perplexities': [],
            'learning_rates': [],
            'epoch': 0
        }

        print("🚀 Trainer initialized:")
        print(f" Optimizer: AdamW (lr={learning_rate}, wd={weight_decay})")
        print(f" Parameters: {len(all_params):,} tensors")
        print(" Loss: CrossEntropyLoss")
        ### END SOLUTION

    @staticmethod
    def _build_lm_arrays(token_lists, max_length: int, pad_id: int = 0):
        """
        Turn token-id sequences into aligned next-token input/target arrays.

        For every sequence, ``inputs[row, t]`` is token ``t`` and
        ``targets[row, t]`` is token ``t + 1``. Positions past the end of the
        sequence are filled with ``pad_id`` in both arrays, so the target of the
        last real token is ``pad_id`` - never the first token of the sequence.

        Args:
            token_lists: iterable of sequences of integer token ids.
            max_length: number of columns in both returned arrays (>= 1).
            pad_id: fill value for unused positions.

        Returns:
            ``(inputs, targets)``: two int64 arrays of shape
            ``(len(token_lists), max_length)``.

        Raises:
            ValueError: if ``max_length`` is smaller than 1.
        """
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")

        token_lists = list(token_lists)
        # Pre-filled with padding so that an empty batch still has a 2-D shape.
        inputs = np.full((len(token_lists), max_length), pad_id, dtype=np.int64)
        targets = np.full((len(token_lists), max_length), pad_id, dtype=np.int64)

        for row, tokens in enumerate(token_lists):
            # One token beyond the window is kept (when available) so that the
            # last input position of a truncated text still has a real target.
            window = list(tokens)[:max_length + 1]
            source = window[:max_length]
            shifted = window[1:]
            inputs[row, :len(source)] = source
            targets[row, :len(shifted)] = shifted

        return inputs, targets

    def prepare_batch(self, text_batch: List[str], max_length: int = 128) -> "Tuple[Tensor, Tensor]":
        """
        Convert text batch to input/target tensors for language modeling.

        TODO: Implement text-to-tensor conversion with proper targets

        APPROACH:
        1. Tokenize each text in the batch
        2. Pad/truncate to consistent length
        3. Create input_ids (text) and target_ids (text shifted by 1)
        4. Convert to Tensor format

        LANGUAGE MODELING OBJECTIVE:
        - Input:  [token1, token2, token3, token4]
        - Target: [token2, token3, token4, token5]
        - Model predicts next token at each position

        EXAMPLE:
        >>> trainer = TinyGPTTrainer(model, tokenizer)
        >>> texts = ["hello world", "ai is fun"]
        >>> inputs, targets = trainer.prepare_batch(texts)
        >>> print(inputs.shape, targets.shape)
        (2, 128) (2, 128)

        HINTS:
        - Use tokenizer.encode() for text → token conversion
        - Pad shorter sequences with the pad token (PAD_TOKEN_ID)
        - Target sequence is the input sequence shifted LEFT by 1; the vacated
          last position is padding (the sequence must not wrap around)

        Returns:
            ``(input_ids, target_ids)`` Tensors, each ``(len(text_batch), max_length)``.

        Raises:
            ValueError: if ``max_length`` is smaller than 1.
        """
        ### BEGIN SOLUTION
        # Tokenize all texts
        tokenized_batch = [self.tokenizer.encode(text) for text in text_batch]

        # Pad/truncate and build the shifted targets in one place
        inputs, targets = self._build_lm_arrays(
            tokenized_batch, max_length, pad_id=self.PAD_TOKEN_ID
        )

        input_ids = Tensor(inputs)    # (batch_size, seq_len)
        target_ids = Tensor(targets)  # (batch_size, seq_len), next token per position
        return input_ids, target_ids
        ### END SOLUTION

    def train_step(self, input_ids: "Tensor", target_ids: "Tensor") -> float:
        """
        Single training step with forward, backward, and optimization.

        TODO: Implement complete training step

        APPROACH:
        1. Zero gradients from previous step
        2. Forward pass to get logits
        3. Compute loss between logits and targets
        4. Backward pass to compute gradients
        5. Optimizer step to update parameters
        6. Return loss value for monitoring

        MEMORY MANAGEMENT:
        During training, memory usage = 3× model size:
        - 1× for parameters
        - 1× for gradients
        - 1× for optimizer states (Adam moments)

        EXAMPLE:
        >>> loss = trainer.train_step(input_ids, target_ids)
        >>> print(f"Training loss: {loss:.4f}")
        Training loss: 2.3456

        HINTS:
        - Always zero_grad() before forward pass
        - Loss should be computed on flattened logits and targets
        - Call backward() on the loss tensor

        Args:
            input_ids: token ids; the forward pass is expected to return logits
                that unpack as (batch, seq_len, vocab_size).
            target_ids: next-token ids holding batch * seq_len values.

        Returns:
            The loss of this step as a Python float.
        """
        ### BEGIN SOLUTION
        # Zero gradients from previous step
        self.optimizer.zero_grad()

        # Forward pass
        logits = self.model.forward(input_ids)  # (batch, seq_len, vocab_size)

        # Flatten so that every position is one classification example
        batch_size, seq_len, vocab_size = logits.shape
        flat_count = batch_size * seq_len
        logits_flat = logits.reshape(flat_count, vocab_size)
        targets_flat = target_ids.reshape(flat_count)

        # Compute loss
        loss = self.loss_fn.forward(logits_flat, targets_flat)

        # Backward pass
        loss.backward()

        # Optimizer step
        self.optimizer.step()

        # Return scalar loss for monitoring
        # loss.data is numpy array - float() handles conversion automatically
        return float(loss.data)
        ### END SOLUTION


def test_unit_training_pipeline():
    """🔬 Test training pipeline components."""
    print("🔬 Unit Test: Training Pipeline...")

    # Create small model and trainer
    model = TinyGPT(vocab_size=50, embed_dim=32, num_layers=2, num_heads=2)
    tokenizer = CharTokenizer(['a', 'b', 'c', 'd', 'e', ' '])
    trainer = TinyGPTTrainer(model, tokenizer, learning_rate=1e-3)

    # Test batch preparation
    texts = ["hello", "world"]
    input_ids, target_ids = trainer.prepare_batch(texts, max_length=8)

    assert input_ids.shape == (2, 8), f"Expected (2, 8), got {input_ids.shape}"
    assert target_ids.shape == (2, 8), f"Expected (2, 8), got {target_ids.shape}"

    # Targets are the inputs shifted left by one, padded (not wrapped) at the end
    assert np.array_equal(target_ids.data[:, :-1], input_ids.data[:, 1:]), \
        "Targets should be inputs shifted left by one"
    assert np.all(target_ids.data[:, -1] == TinyGPTTrainer.PAD_TOKEN_ID), \
        "Last target of a short text should be padding"

    # Test training step
    initial_loss = trainer.train_step(input_ids, target_ids)
    assert initial_loss > 0, "Loss should be positive"

    # Second step should work (gradients computed and applied)
    second_loss = trainer.train_step(input_ids, target_ids)
    assert second_loss > 0, "Second loss should also be positive"

    print(f"✅ Batch preparation shape: {input_ids.shape}")
    print(f"✅ Initial loss: {initial_loss:.4f}")
    print(f"✅ Second loss: {second_loss:.4f}")
    print("✅ Training pipeline works correctly!")


# Run immediate test when developing this module
if __name__ == "__main__":
    test_unit_training_pipeline()


# %% ../../modules/20_capstone/20_capstone.ipynb 14
class CompleteTinyGPTPipeline:
    """
    End-to-end ML pipeline demonstrating integration of all 19 modules.

    Pipeline stages:
    1. Data preparation (Module 10: Tokenization)
    2. Model creation (Modules 01-04, 11-13: Architecture)
    3. Training setup (Modules 05-07: Optimization)
    4. Training loop (Module 08: DataLoader)
    5. Optimization (Modules 17-18: Quantization, Pruning)
    6. Evaluation (Module 19: Benchmarking)
    7. Generation (Module 14: KV Caching)
    """

    def __init__(self, vocab_size: int = 100, embed_dim: int = 128,
                 num_layers: int = 4, num_heads: int = 4):
        """
        Initialize complete end-to-end TinyGPT pipeline integrating all 19 modules.

        TODO: Set up a complete ML pipeline with tokenization, model, training,
        profiling, and benchmarking components

        APPROACH:
        1. Store model architecture parameters (vocab_size, embed_dim, num_layers, num_heads)
        2. Initialize tokenizer using CharTokenizer from Module 10 with printable ASCII (32-126)
        3. Create TinyGPT model instance with stored parameters and max_seq_len=256
        4. Setup TinyGPTTrainer for training orchestration with learning_rate=3e-4
        5. Initialize Profiler (Module 15) and Benchmark (Module 19) for performance analysis
        6. Initialize pipeline state tracking (is_trained flag, training_history list)
        7. Print pipeline initialization summary with parameter count and memory usage

        EXAMPLE:
        >>> pipeline = CompleteTinyGPTPipeline(vocab_size=100, embed_dim=128,
        ...                                    num_layers=4, num_heads=4)
        >>> pipeline.is_trained
        False
        >>> len(pipeline.training_history)
        0

        HINTS:
        - CharTokenizer needs list of characters: [chr(i) for i in range(32, 127)]
        - TinyGPT requires vocab_size, embed_dim, num_layers, num_heads, max_seq_len
        - TinyGPTTrainer takes model, tokenizer, and learning_rate as arguments
        - Benchmark expects (models_list, datasets_list, metrics_list) format
        - Memory calculation: parameters * 4 bytes / 1024 / 1024 for MB

        NOTE: the tokenizer is built from 95 characters, so ``vocab_size`` must be
        large enough to cover every id it can emit; texts that encode to ids
        outside ``[0, vocab_size)`` are rejected with a ValueError before they
        reach the model.
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads

        # Stage 1: Initialize tokenizer (Module 10)
        self.tokenizer = CharTokenizer([chr(i) for i in range(32, 127)])  # Printable ASCII

        # Stage 2: Create model (Modules 01-04, 11-13)
        self.model = TinyGPT(
            vocab_size=vocab_size,
            embed_dim=embed_dim,
            num_layers=num_layers,
            num_heads=num_heads,
            max_seq_len=256
        )

        # Stage 3: Setup training (Modules 05-07)
        self.trainer = TinyGPTTrainer(self.model, self.tokenizer, learning_rate=3e-4)

        # Stage 4: Initialize profiler and benchmark (Modules 15, 19)
        self.profiler = Profiler()
        self.benchmark = Benchmark([self.model], [], ["perplexity", "latency"])

        # Pipeline state
        self.is_trained = False
        self.training_history = []

        total_params = self.model.count_parameters()
        print("🏗️ Complete TinyGPT Pipeline Initialized")
        print(f" Model: {total_params:,} parameters")
        print(f" Memory: {total_params * BYTES_PER_FLOAT32 / MB_TO_BYTES:.1f}MB")
        ### END SOLUTION

    def _validate_token_ids(self, tokens) -> None:
        """Raise ValueError if any id in ``tokens`` lies outside ``[0, vocab_size)``."""
        if len(tokens) == 0:
            return
        lowest, highest = min(tokens), max(tokens)
        if lowest < 0 or highest >= self.vocab_size:
            raise ValueError(
                f"Token ids span [{lowest}, {highest}] but the model was built with "
                f"vocab_size={self.vocab_size}; increase vocab_size to cover the tokenizer."
            )

    def prepare_training_data(self, text_corpus: List[str], batch_size: int = 8,
                              max_length: int = 32) -> "DataLoader":
        """
        Prepare training data using DataLoader (Module 08).

        TODO: Create DataLoader for training text data

        APPROACH:
        1. Tokenize all texts in corpus
        2. Cut each text into windows of at most ``max_length`` input tokens
        3. Create aligned input/target pairs (target = input shifted left by 1)
        4. Package into TensorDataset
        5. Create DataLoader with batching and shuffling

        Inputs and targets both have shape (num_examples, max_length), which is
        the layout ``TinyGPTTrainer.train_step`` flattens for the loss. Short
        windows are right-padded with the trainer's pad token.

        EXAMPLE:
        >>> pipeline = CompleteTinyGPTPipeline()
        >>> corpus = ["hello world", "ai is amazing"]
        >>> dataloader = pipeline.prepare_training_data(corpus, batch_size=2)
        >>> print(f"Batches: {len(dataloader)}")
        Batches: 1

        Args:
            text_corpus: texts to train on; texts shorter than 2 tokens are skipped.
            batch_size: examples per batch.
            max_length: context window, i.e. tokens per training example.

        Raises:
            ValueError: if ``max_length`` < 1 or a text encodes to an id outside
                the model vocabulary.
        """
        ### BEGIN SOLUTION
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")

        windows = []
        for text in text_corpus:
            tokens = self.tokenizer.encode(text)
            if len(tokens) < 2:
                continue  # Skip very short texts: no (input, next-token) pair exists

            self._validate_token_ids(tokens)

            # Non-overlapping windows; each keeps one extra token so that its
            # last input position has a real target.
            for start in range(0, len(tokens) - 1, max_length):
                windows.append(tokens[start:start + max_length + 1])

        input_array, target_array = TinyGPTTrainer._build_lm_arrays(
            windows, max_length, pad_id=TinyGPTTrainer.PAD_TOKEN_ID
        )

        # Convert to tensors
        inputs = Tensor(input_array)
        targets = Tensor(target_array)

        # Create dataset and dataloader
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        print(f"📚 Training data prepared: {len(dataset)} examples, {len(dataloader)} batches")
        return dataloader
        ### END SOLUTION

    def train(self, dataloader: "DataLoader", epochs: int = 10) -> Dict[str, List[float]]:
        """
        Complete training loop with monitoring.

        TODO: Implement full training with progress tracking

        APPROACH:
        1. Loop through epochs
        2. For each batch: forward, backward, optimize
        3. Track loss and perplexity
        4. Update learning rate schedule
        5. Return training history

        EXAMPLE:
        >>> history = pipeline.train(dataloader, epochs=5)
        >>> print(f"Final loss: {history['losses'][-1]:.4f}")
        Final loss: 1.2345

        Returns:
            Dict with per-epoch 'losses', 'perplexities' and 'epochs' lists. An
            epoch in which the dataloader yields no batch is recorded as NaN.
            ``is_trained`` is set only if at least one training step ran.
        """
        ### BEGIN SOLUTION
        history = {'losses': [], 'perplexities': [], 'epochs': []}
        steps_run = 0

        print(f"🚀 Starting training for {epochs} epochs...")

        for epoch in range(epochs):
            epoch_losses = []

            for batch_idx, (inputs, targets) in enumerate(dataloader):
                # Training step
                loss = self.trainer.train_step(inputs, targets)
                epoch_losses.append(loss)
                steps_run += 1

                # Log progress
                if batch_idx % 10 == 0:
                    perplexity = np.exp(loss)
                    print(f" Epoch {epoch+1}/{epochs}, Batch {batch_idx}: "
                          f"Loss={loss:.4f}, PPL={perplexity:.2f}")

            # Epoch summary (plain floats so the history matches its annotation)
            avg_loss = float(np.mean(epoch_losses)) if epoch_losses else float("nan")
            avg_perplexity = float(np.exp(avg_loss))

            history['losses'].append(avg_loss)
            history['perplexities'].append(avg_perplexity)
            history['epochs'].append(epoch + 1)

            # Update learning rate
            self.trainer.scheduler.step()

            print(f"✅ Epoch {epoch+1} complete: Loss={avg_loss:.4f}, PPL={avg_perplexity:.2f}")

        self.is_trained = steps_run > 0
        self.training_history = history

        if history['perplexities']:
            print(f"🎉 Training complete! Final perplexity: {history['perplexities'][-1]:.2f}")
        else:
            print("⚠️ No epochs were run; the model is unchanged.")
        return history
        ### END SOLUTION

    def optimize_model(self, quantize: bool = True, prune_sparsity: float = 0.0):
        """
        Apply optimization techniques (Modules 17-18).

        TODO: Apply quantization and pruning optimizations

        APPROACH:
        1. Optionally apply quantization to reduce precision
        2. Optionally apply pruning to remove weights
        3. Measure size reduction
        4. Validate model still works

        Both optimizations are SIMULATED: the model weights are not modified,
        only the projected memory footprint is computed and printed.

        EXAMPLE:
        >>> pipeline.optimize_model(quantize=True, prune_sparsity=0.5)

        Args:
            quantize: account for INT8 quantization (4× smaller than FP32).
            prune_sparsity: fraction of weights removed, in [0, 1).

        Raises:
            ValueError: if ``prune_sparsity`` is outside [0, 1); a value of 1 or
                more would leave no weights and make the reduction undefined.
        """
        ### BEGIN SOLUTION
        # Validate before touching the model so bad input fails fast and clearly.
        if not 0.0 <= prune_sparsity < 1.0:
            raise ValueError(f"prune_sparsity must be in [0, 1), got {prune_sparsity}")

        original_params = self.model.count_parameters()
        original_memory = original_params * BYTES_PER_FLOAT32 / MB_TO_BYTES

        optimizations_applied = []
        size_reduction = 1.0  # fraction of the original footprint that remains

        if quantize:
            # Apply quantization (simulated)
            # In real implementation, would use quantize_model()
            size_reduction *= 0.25  # INT8 vs FP32: 4× smaller
            optimizations_applied.append("INT8 quantization (4× memory reduction)")
            print(" Applied INT8 quantization")

        if prune_sparsity > 0:
            # Apply pruning (simulated)
            # In real implementation, would use magnitude_prune()
            remaining_weights = 1 - prune_sparsity
            size_reduction *= remaining_weights
            optimizations_applied.append(
                f"{prune_sparsity:.0%} pruning ({remaining_weights:.0%} weights remain)"
            )
            print(f" Applied {prune_sparsity:.0%} magnitude pruning")

        # Calculate final size. The factor is derived from size_reduction rather
        # than from the two memory figures so a zero-parameter model cannot
        # cause a division by zero.
        final_memory = original_memory * size_reduction
        reduction_factor = 1.0 / size_reduction

        print("🔧 Model optimization complete:")
        print(f" Original: {original_memory:.1f}MB")
        print(f" Optimized: {final_memory:.1f}MB")
        print(f" Reduction: {reduction_factor:.1f}× smaller")
        print(f" Applied: {', '.join(optimizations_applied) or 'none'}")
        ### END SOLUTION

    def generate_text(self, prompt: str, max_tokens: int = 50) -> str:
        """
        Generate text using the trained model.

        TODO: Implement text generation with proper encoding/decoding

        APPROACH:
        1. Encode prompt to token IDs
        2. Use model.generate() for autoregressive generation
        3. Decode generated tokens back to text
        4. Return generated text

        EXAMPLE:
        >>> text = pipeline.generate_text("Hello", max_tokens=10)
        >>> print(f"Generated: {text}")
        Generated: Hello world this is AI

        Raises:
            ValueError: if the prompt encodes to an id outside the model vocabulary.
        """
        ### BEGIN SOLUTION
        if not self.is_trained:
            print("⚠️ Model not trained yet. Generating with random weights.")

        # Encode prompt
        prompt_tokens = self.tokenizer.encode(prompt)
        self._validate_token_ids(prompt_tokens)
        prompt_tensor = Tensor([prompt_tokens])  # leading batch dimension of 1

        # Generate tokens
        generated_tokens = self.model.generate(
            prompt_tensor,
            max_new_tokens=max_tokens,
            temperature=0.8,
            use_cache=True
        )

        # Decode to text (first and only row of the batch)
        all_tokens = generated_tokens.data[0].tolist()
        return self.tokenizer.decode(all_tokens)
        ### END SOLUTION


def test_unit_complete_pipeline():
    """🔬 Test complete pipeline integration."""
    print("🔬 Unit Test: Complete Pipeline Integration...")

    # Create pipeline. vocab_size must cover the 95-character tokenizer the
    # pipeline builds, so the class default (100) is used instead of 50.
    pipeline = CompleteTinyGPTPipeline(vocab_size=100, embed_dim=32, num_layers=2)

    # Test data preparation
    corpus = ["hello world", "ai is fun", "machine learning"]
    dataloader = pipeline.prepare_training_data(corpus, batch_size=2)
    assert len(dataloader) > 0, "DataLoader should have batches"

    # Test training (minimal)
    history = pipeline.train(dataloader, epochs=1)
    assert 'losses' in history, "History should contain losses"
    assert len(history['losses']) == 1, "Should have one epoch of losses"

    # Test optimization
    pipeline.optimize_model(quantize=True, prune_sparsity=0.5)

    # Test generation
    generated = pipeline.generate_text("hello", max_tokens=5)
    assert isinstance(generated, str), "Generated output should be string"
    assert len(generated) > 0, "Generated text should not be empty"

    print("✅ Pipeline stages completed successfully")
    print(f"✅ Training history: {len(history['losses'])} epochs")
    print(f"✅ Generated text: '{generated[:20]}...'")
    print("✅ Complete pipeline integration works!")


# Run immediate test when developing this module
if __name__ == "__main__":
    test_unit_complete_pipeline()