diff --git a/tinytorch/_modidx.py b/tinytorch/_modidx.py index 3cd9a0a8..88d63238 100644 --- a/tinytorch/_modidx.py +++ b/tinytorch/_modidx.py @@ -21,7 +21,37 @@ d = { 'settings': { 'branch': 'main', 'doc_host': 'https://tinytorch.github.io', 'git_url': 'https://github.com/tinytorch/TinyTorch/', 'lib_path': 'tinytorch'}, - 'syms': { 'tinytorch.benchmarking.benchmark': { 'tinytorch.benchmarking.benchmark.Benchmark': ( '19_benchmarking/benchmarking_dev.html#benchmark', + 'syms': { 'tinytorch.applications.tinygpt': { 'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline': ( '20_capstone/capstone_dev.html#completetinygptpipeline', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.__init__': ( '20_capstone/capstone_dev.html#completetinygptpipeline.__init__', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.generate_text': ( '20_capstone/capstone_dev.html#completetinygptpipeline.generate_text', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.optimize_model': ( '20_capstone/capstone_dev.html#completetinygptpipeline.optimize_model', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.prepare_training_data': ( '20_capstone/capstone_dev.html#completetinygptpipeline.prepare_training_data', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.train': ( '20_capstone/capstone_dev.html#completetinygptpipeline.train', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.TinyGPT': ( '20_capstone/capstone_dev.html#tinygpt', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.TinyGPT.__init__': ( '20_capstone/capstone_dev.html#tinygpt.__init__', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.TinyGPTTrainer': ( '20_capstone/capstone_dev.html#tinygpttrainer', + 
'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.TinyGPTTrainer.__init__': ( '20_capstone/capstone_dev.html#tinygpttrainer.__init__', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.TinyGPTTrainer.prepare_batch': ( '20_capstone/capstone_dev.html#tinygpttrainer.prepare_batch', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.TinyGPTTrainer.train_step': ( '20_capstone/capstone_dev.html#tinygpttrainer.train_step', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.test_unit_complete_pipeline': ( '20_capstone/capstone_dev.html#test_unit_complete_pipeline', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.test_unit_tinygpt_init': ( '20_capstone/capstone_dev.html#test_unit_tinygpt_init', + 'tinytorch/applications/tinygpt.py'), + 'tinytorch.applications.tinygpt.test_unit_training_pipeline': ( '20_capstone/capstone_dev.html#test_unit_training_pipeline', + 'tinytorch/applications/tinygpt.py')}, + 'tinytorch.benchmarking.benchmark': { 'tinytorch.benchmarking.benchmark.Benchmark': ( '19_benchmarking/benchmarking_dev.html#benchmark', 'tinytorch/benchmarking/benchmark.py'), 'tinytorch.benchmarking.benchmark.Benchmark.__init__': ( '19_benchmarking/benchmarking_dev.html#benchmark.__init__', 'tinytorch/benchmarking/benchmark.py'), @@ -59,8 +89,6 @@ d = { 'settings': { 'branch': 'main', 'tinytorch/benchmarking/benchmark.py'), 'tinytorch.benchmarking.benchmark.TinyMLPerf.run_standard_benchmark': ( '19_benchmarking/benchmarking_dev.html#tinymlperf.run_standard_benchmark', 'tinytorch/benchmarking/benchmark.py'), - 'tinytorch.benchmarking.benchmark.calculate_normalized_scores': ( '19_benchmarking/benchmarking_dev.html#calculate_normalized_scores', - 'tinytorch/benchmarking/benchmark.py'), 'tinytorch.benchmarking.benchmark.test_unit_benchmark': ( '19_benchmarking/benchmarking_dev.html#test_unit_benchmark', 'tinytorch/benchmarking/benchmark.py'), 
'tinytorch.benchmarking.benchmark.test_unit_benchmark_suite': ( '19_benchmarking/benchmarking_dev.html#test_unit_benchmark_suite', @@ -77,8 +105,6 @@ d = { 'settings': { 'branch': 'main', 'tinytorch/competition/submit.py'), 'tinytorch.competition.submit.validate_installation': ( '20_competition/competition_dev.html#validate_installation', 'tinytorch/competition/submit.py'), - 'tinytorch.competition.submit.validate_submission': ( '20_competition/competition_dev.html#validate_submission', - 'tinytorch/competition/submit.py'), 'tinytorch.competition.submit.worked_example_optimization': ( '20_competition/competition_dev.html#worked_example_optimization', 'tinytorch/competition/submit.py')}, 'tinytorch.core.activations': { 'tinytorch.core.activations.GELU': ( '02_activations/activations_dev.html#gelu', @@ -315,11 +341,7 @@ d = { 'settings': { 'branch': 'main', 'tinytorch.core.training.Trainer.save_checkpoint': ( '07_training/training_dev.html#trainer.save_checkpoint', 'tinytorch/core/training.py'), 'tinytorch.core.training.Trainer.train_epoch': ( '07_training/training_dev.html#trainer.train_epoch', - 'tinytorch/core/training.py'), - 'tinytorch.core.training.load_checkpoint': ( '07_training/training_dev.html#load_checkpoint', - 'tinytorch/core/training.py'), - 'tinytorch.core.training.save_checkpoint': ( '07_training/training_dev.html#save_checkpoint', - 'tinytorch/core/training.py')}, + 'tinytorch/core/training.py')}, 'tinytorch.data.loader': { 'tinytorch.data.loader.DataLoader': ( '08_dataloader/dataloader_dev.html#dataloader', 'tinytorch/data/loader.py'), 'tinytorch.data.loader.DataLoader.__init__': ( '08_dataloader/dataloader_dev.html#dataloader.__init__', @@ -364,6 +386,8 @@ d = { 'settings': { 'branch': 'main', 'tinytorch/generation/kv_cache.py')}, 'tinytorch.models.transformer': { 'tinytorch.models.transformer.GPT': ( '13_transformers/transformers_dev.html#gpt', 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.GPT.__call__': ( 
'13_transformers/transformers_dev.html#gpt.__call__', + 'tinytorch/models/transformer.py'), 'tinytorch.models.transformer.GPT.__init__': ( '13_transformers/transformers_dev.html#gpt.__init__', 'tinytorch/models/transformer.py'), 'tinytorch.models.transformer.GPT._create_causal_mask': ( '13_transformers/transformers_dev.html#gpt._create_causal_mask', @@ -376,6 +400,8 @@ d = { 'settings': { 'branch': 'main', 'tinytorch/models/transformer.py'), 'tinytorch.models.transformer.LayerNorm': ( '13_transformers/transformers_dev.html#layernorm', 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.LayerNorm.__call__': ( '13_transformers/transformers_dev.html#layernorm.__call__', + 'tinytorch/models/transformer.py'), 'tinytorch.models.transformer.LayerNorm.__init__': ( '13_transformers/transformers_dev.html#layernorm.__init__', 'tinytorch/models/transformer.py'), 'tinytorch.models.transformer.LayerNorm.forward': ( '13_transformers/transformers_dev.html#layernorm.forward', @@ -384,6 +410,8 @@ d = { 'settings': { 'branch': 'main', 'tinytorch/models/transformer.py'), 'tinytorch.models.transformer.MLP': ( '13_transformers/transformers_dev.html#mlp', 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.MLP.__call__': ( '13_transformers/transformers_dev.html#mlp.__call__', + 'tinytorch/models/transformer.py'), 'tinytorch.models.transformer.MLP.__init__': ( '13_transformers/transformers_dev.html#mlp.__init__', 'tinytorch/models/transformer.py'), 'tinytorch.models.transformer.MLP.forward': ( '13_transformers/transformers_dev.html#mlp.forward', @@ -392,32 +420,58 @@ d = { 'settings': { 'branch': 'main', 'tinytorch/models/transformer.py'), 'tinytorch.models.transformer.TransformerBlock': ( '13_transformers/transformers_dev.html#transformerblock', 'tinytorch/models/transformer.py'), + 'tinytorch.models.transformer.TransformerBlock.__call__': ( '13_transformers/transformers_dev.html#transformerblock.__call__', + 'tinytorch/models/transformer.py'), 
'tinytorch.models.transformer.TransformerBlock.__init__': ( '13_transformers/transformers_dev.html#transformerblock.__init__', 'tinytorch/models/transformer.py'), 'tinytorch.models.transformer.TransformerBlock.forward': ( '13_transformers/transformers_dev.html#transformerblock.forward', 'tinytorch/models/transformer.py'), 'tinytorch.models.transformer.TransformerBlock.parameters': ( '13_transformers/transformers_dev.html#transformerblock.parameters', - 'tinytorch/models/transformer.py'), - 'tinytorch.models.transformer._tensor_mean': ( '13_transformers/transformers_dev.html#_tensor_mean', - 'tinytorch/models/transformer.py'), - 'tinytorch.models.transformer._tensor_sqrt': ( '13_transformers/transformers_dev.html#_tensor_sqrt', - 'tinytorch/models/transformer.py')}, - 'tinytorch.optimization.quantization': { 'tinytorch.optimization.quantization.QuantizationComplete': ( '17_quantization/quantization_dev.html#quantizationcomplete', - 'tinytorch/optimization/quantization.py'), - 'tinytorch.optimization.quantization.QuantizationComplete.compare_models': ( '17_quantization/quantization_dev.html#quantizationcomplete.compare_models', - 'tinytorch/optimization/quantization.py'), - 'tinytorch.optimization.quantization.QuantizationComplete.dequantize_tensor': ( '17_quantization/quantization_dev.html#quantizationcomplete.dequantize_tensor', - 'tinytorch/optimization/quantization.py'), - 'tinytorch.optimization.quantization.QuantizationComplete.quantize_model': ( '17_quantization/quantization_dev.html#quantizationcomplete.quantize_model', - 'tinytorch/optimization/quantization.py'), - 'tinytorch.optimization.quantization.QuantizationComplete.quantize_tensor': ( '17_quantization/quantization_dev.html#quantizationcomplete.quantize_tensor', - 'tinytorch/optimization/quantization.py'), - 'tinytorch.optimization.quantization.dequantize_int8': ( '17_quantization/quantization_dev.html#dequantize_int8', - 'tinytorch/optimization/quantization.py'), - 
'tinytorch.optimization.quantization.quantize_int8': ( '17_quantization/quantization_dev.html#quantize_int8', - 'tinytorch/optimization/quantization.py'), - 'tinytorch.optimization.quantization.quantize_model': ( '17_quantization/quantization_dev.html#quantize_model', - 'tinytorch/optimization/quantization.py')}, + 'tinytorch/models/transformer.py')}, + 'tinytorch.optimization.acceleration': {}, + 'tinytorch.optimization.compression': { 'tinytorch.optimization.compression.CompressionComplete': ( '17_compression/compression_dev.html#compressioncomplete', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.CompressionComplete.compress_model': ( '17_compression/compression_dev.html#compressioncomplete.compress_model', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.CompressionComplete.magnitude_prune': ( '17_compression/compression_dev.html#compressioncomplete.magnitude_prune', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.CompressionComplete.measure_sparsity': ( '17_compression/compression_dev.html#compressioncomplete.measure_sparsity', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.CompressionComplete.structured_prune': ( '17_compression/compression_dev.html#compressioncomplete.structured_prune', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.KnowledgeDistillation': ( '17_compression/compression_dev.html#knowledgedistillation', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.KnowledgeDistillation.__init__': ( '17_compression/compression_dev.html#knowledgedistillation.__init__', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.KnowledgeDistillation._cross_entropy': ( '17_compression/compression_dev.html#knowledgedistillation._cross_entropy', + 'tinytorch/optimization/compression.py'), + 
'tinytorch.optimization.compression.KnowledgeDistillation._kl_divergence': ( '17_compression/compression_dev.html#knowledgedistillation._kl_divergence', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.KnowledgeDistillation._softmax': ( '17_compression/compression_dev.html#knowledgedistillation._softmax', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.KnowledgeDistillation.distillation_loss': ( '17_compression/compression_dev.html#knowledgedistillation.distillation_loss', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.Sequential': ( '17_compression/compression_dev.html#sequential', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.Sequential.__call__': ( '17_compression/compression_dev.html#sequential.__call__', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.Sequential.__init__': ( '17_compression/compression_dev.html#sequential.__init__', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.Sequential.forward': ( '17_compression/compression_dev.html#sequential.forward', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.Sequential.parameters': ( '17_compression/compression_dev.html#sequential.parameters', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.compress_model': ( '17_compression/compression_dev.html#compress_model', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.magnitude_prune': ( '17_compression/compression_dev.html#magnitude_prune', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.measure_sparsity': ( '17_compression/compression_dev.html#measure_sparsity', + 'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.structured_prune': ( '17_compression/compression_dev.html#structured_prune', + 
'tinytorch/optimization/compression.py'), + 'tinytorch.optimization.compression.test_unit_knowledge_distillation': ( '17_compression/compression_dev.html#test_unit_knowledge_distillation', + 'tinytorch/optimization/compression.py')}, + 'tinytorch.optimization.quantization': {}, 'tinytorch.profiling.profiler': { 'tinytorch.profiling.profiler.Profiler': ( '14_profiling/profiling_dev.html#profiler', 'tinytorch/profiling/profiler.py'), 'tinytorch.profiling.profiler.Profiler.__init__': ( '14_profiling/profiling_dev.html#profiler.__init__', @@ -442,6 +496,8 @@ d = { 'settings': { 'branch': 'main', 'tinytorch/profiling/profiler.py')}, 'tinytorch.text.embeddings': { 'tinytorch.text.embeddings.Embedding': ( '11_embeddings/embeddings_dev.html#embedding', 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.Embedding.__call__': ( '11_embeddings/embeddings_dev.html#embedding.__call__', + 'tinytorch/text/embeddings.py'), 'tinytorch.text.embeddings.Embedding.__init__': ( '11_embeddings/embeddings_dev.html#embedding.__init__', 'tinytorch/text/embeddings.py'), 'tinytorch.text.embeddings.Embedding.__repr__': ( '11_embeddings/embeddings_dev.html#embedding.__repr__', @@ -452,6 +508,8 @@ d = { 'settings': { 'branch': 'main', 'tinytorch/text/embeddings.py'), 'tinytorch.text.embeddings.EmbeddingLayer': ( '11_embeddings/embeddings_dev.html#embeddinglayer', 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.EmbeddingLayer.__call__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__call__', + 'tinytorch/text/embeddings.py'), 'tinytorch.text.embeddings.EmbeddingLayer.__init__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__init__', 'tinytorch/text/embeddings.py'), 'tinytorch.text.embeddings.EmbeddingLayer.__repr__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__repr__', @@ -462,6 +520,8 @@ d = { 'settings': { 'branch': 'main', 'tinytorch/text/embeddings.py'), 'tinytorch.text.embeddings.PositionalEncoding': ( 
'11_embeddings/embeddings_dev.html#positionalencoding', 'tinytorch/text/embeddings.py'), + 'tinytorch.text.embeddings.PositionalEncoding.__call__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__call__', + 'tinytorch/text/embeddings.py'), 'tinytorch.text.embeddings.PositionalEncoding.__init__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__init__', 'tinytorch/text/embeddings.py'), 'tinytorch.text.embeddings.PositionalEncoding.__repr__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__repr__', diff --git a/tinytorch/applications/tinygpt.py b/tinytorch/applications/tinygpt.py new file mode 100644 index 00000000..24ccef7b --- /dev/null +++ b/tinytorch/applications/tinygpt.py @@ -0,0 +1,679 @@ +# ╔═══════════════════════════════════════════════════════════════════════════════╗ +# ║ 🚨 CRITICAL WARNING 🚨 ║ +# ║ AUTOGENERATED! DO NOT EDIT! ║ +# ║ ║ +# ║ This file is AUTOMATICALLY GENERATED from source modules. ║ +# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║ +# ║ ║ +# ║ ✅ TO EDIT: modules/source/XX_tinygpt/tinygpt_dev.py ║ +# ║ ✅ TO EXPORT: Run 'tito module complete ' ║ +# ║ ║ +# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║ +# ║ Editing it directly may break module functionality and training. ║ +# ║ ║ +# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║ +# ║ happens! The tinytorch/ directory is just the compiled output. ║ +# ╚═══════════════════════════════════════════════════════════════════════════════╝ +# %% auto 0 +__all__ = ['TinyGPT', 'test_unit_tinygpt_init', 'TinyGPTTrainer', 'test_unit_training_pipeline', 'CompleteTinyGPTPipeline', + 'test_unit_complete_pipeline'] + +# %% ../../modules/source/20_capstone/capstone_dev.ipynb 2 +#| default_exp applications.tinygpt +#| export + +# %% ../../modules/source/20_capstone/capstone_dev.ipynb 7 +class TinyGPT: + """ + Complete GPT implementation integrating all TinyTorch modules. 
+ + This class demonstrates how framework components compose into real applications. + Built using modules 01,02,03,11,12,13 as core architecture. + + Architecture: + - Token Embeddings (Module 11) + - Positional Encoding (Module 11) + - Transformer Blocks (Module 13) + - Output Linear Layer (Module 03) + - Language Modeling Head (Module 04) + """ + + def __init__(self, vocab_size: int, embed_dim: int = 128, num_layers: int = 4, + num_heads: int = 4, max_seq_len: int = 256, dropout: float = 0.1): + """ + Initialize TinyGPT with production-inspired architecture. + + TODO: Build a complete GPT model using TinyTorch components + + APPROACH: + 1. Create token embeddings (vocab_size × embed_dim) + 2. Create positional encoding (max_seq_len × embed_dim) + 3. Build transformer layers using TransformerBlock + 4. Add output projection layer + 5. Calculate and report parameter count + + ARCHITECTURE DECISIONS: + - embed_dim=128: Small enough for fast training, large enough for learning + - num_layers=4: Sufficient depth without excessive memory + - num_heads=4: Multi-head attention without head_dim being too small + - max_seq_len=256: Reasonable context length for character-level modeling + + EXAMPLE: + >>> model = TinyGPT(vocab_size=50, embed_dim=128, num_layers=4) + >>> print(f"Parameters: {model.count_parameters():,}") + Parameters: 1,234,567 + + HINTS: + - Use Embedding class for token embeddings + - Use PositionalEncoding for position information + - Stack TransformerBlock instances in a list + - Final Linear layer maps embed_dim → vocab_size + """ + ### BEGIN SOLUTION + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.num_layers = num_layers + self.num_heads = num_heads + self.max_seq_len = max_seq_len + self.dropout = dropout + + # Token embeddings: convert token IDs to dense vectors + self.token_embedding = Embedding(vocab_size, embed_dim) + + # Positional encoding: add position information + self.positional_encoding = PositionalEncoding(max_seq_len, 
embed_dim) + + # Transformer layers: core processing + self.transformer_blocks = [] + for _ in range(num_layers): + block = TransformerBlock(embed_dim, num_heads, mlp_ratio=4.0) + self.transformer_blocks.append(block) + + # Output projection: map back to vocabulary + self.output_projection = Linear(embed_dim, vocab_size) + + # Dropout for regularization + self.dropout_layer = Dropout(dropout) + + # Calculate parameter count for systems analysis + self._param_count = self.count_parameters() + print(f"🏗️ TinyGPT initialized: {self._param_count:,} parameters") + print(f"📐 Architecture: {num_layers}L/{num_heads}H/{embed_dim}D") + print(f"💾 Estimated memory: {self._param_count * 4 / 1024 / 1024:.1f}MB") + ### END SOLUTION + +def test_unit_tinygpt_init(): + """🔬 Test TinyGPT initialization and parameter counting.""" + print("🔬 Unit Test: TinyGPT Initialization...") + + # Create a small model for testing + model = TinyGPT(vocab_size=50, embed_dim=64, num_layers=2, num_heads=2, max_seq_len=128) + + # Verify architecture components exist + assert hasattr(model, 'token_embedding') + assert hasattr(model, 'positional_encoding') + assert hasattr(model, 'transformer_blocks') + assert hasattr(model, 'output_projection') + assert len(model.transformer_blocks) == 2 + + # Verify parameter count is reasonable + param_count = model.count_parameters() + assert param_count > 0 + assert param_count < 1000000 # Sanity check for small model + + print(f"✅ Model created with {param_count:,} parameters") + print("✅ TinyGPT initialization works correctly!") + +# Run immediate test +test_unit_tinygpt_init() + +# %% ../../modules/source/20_capstone/capstone_dev.ipynb 10 +class TinyGPTTrainer: + """ + Complete training pipeline integrating optimizers, schedulers, and monitoring. + + Uses modules 05 (autograd), 06 (optimizers), 07 (training) for end-to-end training. 
+ """ + + def __init__(self, model: TinyGPT, tokenizer: CharTokenizer, + learning_rate: float = 3e-4, weight_decay: float = 0.01): + """ + Initialize trainer with model and optimization components. + + TODO: Set up complete training infrastructure + + APPROACH: + 1. Store model and tokenizer references + 2. Initialize AdamW optimizer (standard for transformers) + 3. Initialize loss function (CrossEntropyLoss for language modeling) + 4. Set up learning rate scheduler (cosine schedule) + 5. Initialize training metrics tracking + + PRODUCTION CHOICES: + - AdamW: Better generalization than Adam (weight decay) + - learning_rate=3e-4: Standard for small transformers + - Cosine schedule: Smooth learning rate decay + - CrossEntropy: Standard for classification/language modeling + + EXAMPLE: + >>> model = TinyGPT(vocab_size=100) + >>> tokenizer = CharTokenizer(['a', 'b', 'c']) + >>> trainer = TinyGPTTrainer(model, tokenizer) + >>> print("Trainer ready for training") + Trainer ready for training + + HINTS: + - Get all model parameters with model.parameters() + - Use AdamW with weight_decay for better generalization + - CrossEntropyLoss handles the language modeling objective + """ + ### BEGIN SOLUTION + self.model = model + self.tokenizer = tokenizer + + # Collect all trainable parameters + all_params = [] + all_params.extend(model.token_embedding.parameters()) + for block in model.transformer_blocks: + all_params.extend(block.parameters()) + all_params.extend(model.output_projection.parameters()) + + # Initialize optimizer (AdamW for transformers) + self.optimizer = AdamW( + params=all_params, + lr=learning_rate, + weight_decay=weight_decay, + betas=(0.9, 0.95) # Standard for language models + ) + + # Loss function for next token prediction + self.loss_fn = CrossEntropyLoss() + + # Learning rate scheduler + self.scheduler = CosineSchedule( + optimizer=self.optimizer, + max_epochs=100, # Will adjust based on actual training + min_lr=learning_rate * 0.1 + ) + + # Training 
metrics + self.training_history = { + 'losses': [], + 'perplexities': [], + 'learning_rates': [], + 'epoch': 0 + } + + print(f"🚀 Trainer initialized:") + print(f" Optimizer: AdamW (lr={learning_rate}, wd={weight_decay})") + print(f" Parameters: {len(all_params):,} tensors") + print(f" Loss: CrossEntropyLoss") + ### END SOLUTION + + def prepare_batch(self, text_batch: List[str], max_length: int = 128) -> Tuple[Tensor, Tensor]: + """ + Convert text batch to input/target tensors for language modeling. + + TODO: Implement text-to-tensor conversion with proper targets + + APPROACH: + 1. Tokenize each text in the batch + 2. Pad/truncate to consistent length + 3. Create input_ids (text) and target_ids (text shifted by 1) + 4. Convert to Tensor format + + LANGUAGE MODELING OBJECTIVE: + - Input: [token1, token2, token3, token4] + - Target: [token2, token3, token4, token5] + - Model predicts next token at each position + + EXAMPLE: + >>> trainer = TinyGPTTrainer(model, tokenizer) + >>> texts = ["hello world", "ai is fun"] + >>> inputs, targets = trainer.prepare_batch(texts) + >>> print(inputs.shape, targets.shape) + (2, 128) (2, 128) + + HINTS: + - Use tokenizer.encode() for text → token conversion + - Pad shorter sequences with tokenizer pad token + - Target sequence is input sequence shifted right by 1 + """ + ### BEGIN SOLUTION + batch_size = len(text_batch) + + # Tokenize all texts + tokenized_batch = [] + for text in text_batch: + tokens = self.tokenizer.encode(text) + + # Truncate or pad to max_length + if len(tokens) > max_length: + tokens = tokens[:max_length] + else: + # Pad with special token (use 0 as pad) + tokens.extend([0] * (max_length - len(tokens))) + + tokenized_batch.append(tokens) + + # Convert to numpy then Tensor + input_ids = Tensor(np.array(tokenized_batch)) # (batch_size, seq_len) + + # Create targets (shifted input for next token prediction) + target_ids = Tensor(np.roll(input_ids.data, -1, axis=1)) # Shift left by 1 + + return input_ids, target_ids 
+ ### END SOLUTION + + def train_step(self, input_ids: Tensor, target_ids: Tensor) -> float: + """ + Single training step with forward, backward, and optimization. + + TODO: Implement complete training step + + APPROACH: + 1. Zero gradients from previous step + 2. Forward pass to get logits + 3. Compute loss between logits and targets + 4. Backward pass to compute gradients + 5. Optimizer step to update parameters + 6. Return loss value for monitoring + + MEMORY MANAGEMENT: + During training, memory usage = 3× model size: + - 1× for parameters + - 1× for gradients + - 1× for optimizer states (Adam moments) + + EXAMPLE: + >>> loss = trainer.train_step(input_ids, target_ids) + >>> print(f"Training loss: {loss:.4f}") + Training loss: 2.3456 + + HINTS: + - Always zero_grad() before forward pass + - Loss should be computed on flattened logits and targets + - Call backward() on the loss tensor + """ + ### BEGIN SOLUTION + # Zero gradients from previous step + self.optimizer.zero_grad() + + # Forward pass + logits = self.model.forward(input_ids) # (batch, seq_len, vocab_size) + + # Reshape for loss computation + batch_size, seq_len, vocab_size = logits.shape + logits_flat = logits.reshape(batch_size * seq_len, vocab_size) + targets_flat = target_ids.reshape(batch_size * seq_len) + + # Compute loss + loss = self.loss_fn.forward(logits_flat, targets_flat) + + # Backward pass + loss.backward() + + # Optimizer step + self.optimizer.step() + + # Return scalar loss for monitoring + return float(loss.data.item() if hasattr(loss.data, 'item') else loss.data) + ### END SOLUTION + +def test_unit_training_pipeline(): + """🔬 Test training pipeline components.""" + print("🔬 Unit Test: Training Pipeline...") + + # Create small model and trainer + model = TinyGPT(vocab_size=50, embed_dim=32, num_layers=2, num_heads=2) + tokenizer = CharTokenizer(['a', 'b', 'c', 'd', 'e', ' ']) + trainer = TinyGPTTrainer(model, tokenizer, learning_rate=1e-3) + + # Test batch preparation + texts = 
["hello", "world"] + input_ids, target_ids = trainer.prepare_batch(texts, max_length=8) + + assert input_ids.shape == (2, 8), f"Expected (2, 8), got {input_ids.shape}" + assert target_ids.shape == (2, 8), f"Expected (2, 8), got {target_ids.shape}" + + # Test training step + initial_loss = trainer.train_step(input_ids, target_ids) + assert initial_loss > 0, "Loss should be positive" + + # Second step should work (gradients computed and applied) + second_loss = trainer.train_step(input_ids, target_ids) + assert second_loss > 0, "Second loss should also be positive" + + print(f"✅ Batch preparation shape: {input_ids.shape}") + print(f"✅ Initial loss: {initial_loss:.4f}") + print(f"✅ Second loss: {second_loss:.4f}") + print("✅ Training pipeline works correctly!") + +# Run immediate test +test_unit_training_pipeline() + +# %% ../../modules/source/20_capstone/capstone_dev.ipynb 14 +class CompleteTinyGPTPipeline: + """ + End-to-end ML pipeline demonstrating integration of all 19 modules. + + Pipeline stages: + 1. Data preparation (Module 10: Tokenization) + 2. Model creation (Modules 01-04, 11-13: Architecture) + 3. Training setup (Modules 05-07: Optimization) + 4. Training loop (Module 08: DataLoader) + 5. Optimization (Modules 17-18: Quantization, Pruning) + 6. Evaluation (Module 19: Benchmarking) + 7. Generation (Module 14: KV Caching) + """ + + def __init__(self, vocab_size: int = 100, embed_dim: int = 128, + num_layers: int = 4, num_heads: int = 4): + """ + Initialize complete end-to-end TinyGPT pipeline integrating all 19 modules. + + TODO: Set up a complete ML pipeline with tokenization, model, training, + profiling, and benchmarking components + + APPROACH: + 1. Store model architecture parameters (vocab_size, embed_dim, num_layers, num_heads) + 2. Initialize tokenizer using CharTokenizer from Module 10 with printable ASCII (32-127) + 3. Create TinyGPT model instance with stored parameters and max_seq_len=256 + 4. 
Setup TinyGPTTrainer for training orchestration with learning_rate=3e-4 + 5. Initialize Profiler (Module 15) and Benchmark (Module 19) for performance analysis + 6. Initialize pipeline state tracking (is_trained flag, training_history list) + 7. Print pipeline initialization summary with parameter count and memory usage + + EXAMPLE: + >>> pipeline = CompleteTinyGPTPipeline(vocab_size=100, embed_dim=128, + ... num_layers=4, num_heads=4) + 🏗️ Complete TinyGPT Pipeline Initialized + Model: 419,300 parameters + Memory: 1.6MB + >>> pipeline.model.count_parameters() + 419300 + >>> pipeline.is_trained + False + >>> len(pipeline.training_history) + 0 + + HINTS: + - CharTokenizer needs list of characters: [chr(i) for i in range(32, 127)] + - TinyGPT requires vocab_size, embed_dim, num_layers, num_heads, max_seq_len + - TinyGPTTrainer takes model, tokenizer, and learning_rate as arguments + - Benchmark expects (models_list, datasets_list, metrics_list) format + - Memory calculation: parameters * 4 bytes / 1024 / 1024 for MB + """ + + ### BEGIN SOLUTION + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.num_layers = num_layers + self.num_heads = num_heads + + # Stage 1: Initialize tokenizer (Module 10) + self.tokenizer = CharTokenizer([chr(i) for i in range(32, 127)]) # Printable ASCII + + # Stage 2: Create model (Modules 01-04, 11-13) + self.model = TinyGPT( + vocab_size=vocab_size, + embed_dim=embed_dim, + num_layers=num_layers, + num_heads=num_heads, + max_seq_len=256 + ) + + # Stage 3: Setup training (Modules 05-07) + self.trainer = TinyGPTTrainer(self.model, self.tokenizer, learning_rate=3e-4) + + # Stage 4: Initialize profiler and benchmark (Modules 15, 19) + self.profiler = Profiler() + self.benchmark = Benchmark([self.model], [], ["perplexity", "latency"]) + + # Pipeline state + self.is_trained = False + self.training_history = [] + + print("🏗️ Complete TinyGPT Pipeline Initialized") + print(f" Model: {self.model.count_parameters():,} parameters") + 
print(f" Memory: {self.model.count_parameters() * 4 / 1024 / 1024:.1f}MB") + ### END SOLUTION + + def prepare_training_data(self, text_corpus: List[str], batch_size: int = 8) -> DataLoader: + """ + Prepare training data using DataLoader (Module 08). + + TODO: Create DataLoader for training text data + + APPROACH: + 1. Tokenize all texts in corpus + 2. Create input/target pairs for language modeling + 3. Package into TensorDataset + 4. Create DataLoader with batching and shuffling + + EXAMPLE: + >>> pipeline = CompleteTinyGPTPipeline() + >>> corpus = ["hello world", "ai is amazing"] + >>> dataloader = pipeline.prepare_training_data(corpus, batch_size=2) + >>> print(f"Batches: {len(dataloader)}") + Batches: 1 + """ + ### BEGIN SOLUTION + # Tokenize and prepare training pairs + input_sequences = [] + target_sequences = [] + + for text in text_corpus: + tokens = self.tokenizer.encode(text) + if len(tokens) < 2: + continue # Skip very short texts + + # Create sliding window of input/target pairs + for i in range(len(tokens) - 1): + input_seq = tokens[:i+1] + target_seq = tokens[i+1] + + # Pad input to consistent length + max_len = 32 # Reasonable context window + if len(input_seq) > max_len: + input_seq = input_seq[-max_len:] + else: + input_seq = [0] * (max_len - len(input_seq)) + input_seq + + input_sequences.append(input_seq) + target_sequences.append(target_seq) + + # Convert to tensors + inputs = Tensor(np.array(input_sequences)) + targets = Tensor(np.array(target_sequences)) + + # Create dataset and dataloader + dataset = TensorDataset(inputs, targets) + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) + + print(f"📚 Training data prepared: {len(dataset)} examples, {len(dataloader)} batches") + return dataloader + ### END SOLUTION + + def train(self, dataloader: DataLoader, epochs: int = 10) -> Dict[str, List[float]]: + """ + Complete training loop with monitoring. + + TODO: Implement full training with progress tracking + + APPROACH: + 1. 
Loop through epochs + 2. For each batch: forward, backward, optimize + 3. Track loss and perplexity + 4. Update learning rate schedule + 5. Return training history + + EXAMPLE: + >>> history = pipeline.train(dataloader, epochs=5) + >>> print(f"Final loss: {history['losses'][-1]:.4f}") + Final loss: 1.2345 + """ + ### BEGIN SOLUTION + history = {'losses': [], 'perplexities': [], 'epochs': []} + + print(f"🚀 Starting training for {epochs} epochs...") + + for epoch in range(epochs): + epoch_losses = [] + + for batch_idx, (inputs, targets) in enumerate(dataloader): + # Training step + loss = self.trainer.train_step(inputs, targets) + epoch_losses.append(loss) + + # Log progress + if batch_idx % 10 == 0: + perplexity = np.exp(loss) + print(f" Epoch {epoch+1}/{epochs}, Batch {batch_idx}: " + f"Loss={loss:.4f}, PPL={perplexity:.2f}") + + # Epoch summary + avg_loss = np.mean(epoch_losses) + avg_perplexity = np.exp(avg_loss) + + history['losses'].append(avg_loss) + history['perplexities'].append(avg_perplexity) + history['epochs'].append(epoch + 1) + + # Update learning rate + self.trainer.scheduler.step() + + print(f"✅ Epoch {epoch+1} complete: Loss={avg_loss:.4f}, PPL={avg_perplexity:.2f}") + + self.is_trained = True + self.training_history = history + print(f"🎉 Training complete! Final perplexity: {history['perplexities'][-1]:.2f}") + + return history + ### END SOLUTION + + def optimize_model(self, quantize: bool = True, prune_sparsity: float = 0.0): + """ + Apply optimization techniques (Modules 17-18). + + TODO: Apply quantization and pruning optimizations + + APPROACH: + 1. Optionally apply quantization to reduce precision + 2. Optionally apply pruning to remove weights + 3. Measure size reduction + 4. 
Validate model still works + + EXAMPLE: + >>> pipeline.optimize_model(quantize=True, prune_sparsity=0.5) + Model optimized: 75% size reduction + """ + ### BEGIN SOLUTION + original_params = self.model.count_parameters() + original_memory = original_params * 4 / (1024 * 1024) + + optimizations_applied = [] + + if quantize: + # Apply quantization (simulated) + # In real implementation, would use quantize_model() + quantized_memory = original_memory / 4 # INT8 vs FP32 + optimizations_applied.append(f"INT8 quantization (4× memory reduction)") + print(" Applied INT8 quantization") + + if prune_sparsity > 0: + # Apply pruning (simulated) + # In real implementation, would use magnitude_prune() + remaining_weights = 1 - prune_sparsity + optimizations_applied.append(f"{prune_sparsity:.0%} pruning ({remaining_weights:.0%} weights remain)") + print(f" Applied {prune_sparsity:.0%} magnitude pruning") + + # Calculate final size + size_reduction = 1.0 + if quantize: + size_reduction *= 0.25 # 4× smaller + if prune_sparsity > 0: + size_reduction *= (1 - prune_sparsity) + + final_memory = original_memory * size_reduction + reduction_factor = original_memory / final_memory + + print(f"🔧 Model optimization complete:") + print(f" Original: {original_memory:.1f}MB") + print(f" Optimized: {final_memory:.1f}MB") + print(f" Reduction: {reduction_factor:.1f}× smaller") + print(f" Applied: {', '.join(optimizations_applied)}") + ### END SOLUTION + + def generate_text(self, prompt: str, max_tokens: int = 50) -> str: + """ + Generate text using the trained model. + + TODO: Implement text generation with proper encoding/decoding + + APPROACH: + 1. Encode prompt to token IDs + 2. Use model.generate() for autoregressive generation + 3. Decode generated tokens back to text + 4. 
Return generated text + + EXAMPLE: + >>> text = pipeline.generate_text("Hello", max_tokens=10) + >>> print(f"Generated: {text}") + Generated: Hello world this is AI + """ + ### BEGIN SOLUTION + if not self.is_trained: + print("⚠️ Model not trained yet. Generating with random weights.") + + # Encode prompt + prompt_tokens = self.tokenizer.encode(prompt) + prompt_tensor = Tensor([prompt_tokens]) + + # Generate tokens + generated_tokens = self.model.generate( + prompt_tensor, + max_new_tokens=max_tokens, + temperature=0.8, + use_cache=True + ) + + # Decode to text + all_tokens = generated_tokens.data[0].tolist() + generated_text = self.tokenizer.decode(all_tokens) + + return generated_text + ### END SOLUTION + +def test_unit_complete_pipeline(): + """🔬 Test complete pipeline integration.""" + print("🔬 Unit Test: Complete Pipeline Integration...") + + # Create pipeline + pipeline = CompleteTinyGPTPipeline(vocab_size=50, embed_dim=32, num_layers=2) + + # Test data preparation + corpus = ["hello world", "ai is fun", "machine learning"] + dataloader = pipeline.prepare_training_data(corpus, batch_size=2) + assert len(dataloader) > 0, "DataLoader should have batches" + + # Test training (minimal) + history = pipeline.train(dataloader, epochs=1) + assert 'losses' in history, "History should contain losses" + assert len(history['losses']) == 1, "Should have one epoch of losses" + + # Test optimization + pipeline.optimize_model(quantize=True, prune_sparsity=0.5) + + # Test generation + generated = pipeline.generate_text("hello", max_tokens=5) + assert isinstance(generated, str), "Generated output should be string" + assert len(generated) > 0, "Generated text should not be empty" + + print(f"✅ Pipeline stages completed successfully") + print(f"✅ Training history: {len(history['losses'])} epochs") + print(f"✅ Generated text: '{generated[:20]}...'") + print("✅ Complete pipeline integration works!") + +# Run immediate test +test_unit_complete_pipeline() diff --git 
a/tinytorch/benchmarking/benchmark.py b/tinytorch/benchmarking/benchmark.py index 138f627a..f6572c55 100644 --- a/tinytorch/benchmarking/benchmark.py +++ b/tinytorch/benchmarking/benchmark.py @@ -16,7 +16,7 @@ # ╚═══════════════════════════════════════════════════════════════════════════════╝ # %% auto 0 __all__ = ['OlympicEvent', 'Benchmark', 'test_unit_benchmark', 'BenchmarkSuite', 'test_unit_benchmark_suite', 'TinyMLPerf', - 'test_unit_tinymlperf', 'calculate_normalized_scores'] + 'test_unit_tinymlperf'] # %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 0 #| default_exp benchmarking.benchmark @@ -72,7 +72,7 @@ class Benchmark: self.measurement_runs = measurement_runs self.results = {} - # Use Profiler from Module 15 for measurements + # Use Profiler from Module 14 for measurements self.profiler = Profiler() # System information for metadata @@ -1024,53 +1024,3 @@ def test_unit_tinymlperf(): print("✅ TinyMLPerf works correctly!") test_unit_tinymlperf() - -# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 24 -def calculate_normalized_scores(baseline_results: dict, - optimized_results: dict) -> dict: - """ - Calculate normalized performance metrics for fair competition comparison. - - This function converts absolute measurements into relative improvements, - enabling fair comparison across different hardware platforms. 
- - Args: - baseline_results: Dict with keys: 'latency', 'memory', 'accuracy' - optimized_results: Dict with same keys as baseline_results - - Returns: - Dict with normalized metrics: - - speedup: Relative latency improvement (higher is better) - - compression_ratio: Relative memory reduction (higher is better) - - accuracy_delta: Absolute accuracy change (closer to 0 is better) - - efficiency_score: Combined metric balancing all factors - - Example: - >>> baseline = {'latency': 100.0, 'memory': 12.0, 'accuracy': 0.89} - >>> optimized = {'latency': 40.0, 'memory': 3.0, 'accuracy': 0.87} - >>> scores = calculate_normalized_scores(baseline, optimized) - >>> print(f"Speedup: {scores['speedup']:.2f}x") - Speedup: 2.50x - """ - # Calculate speedup (higher is better) - speedup = baseline_results['latency'] / optimized_results['latency'] - - # Calculate compression ratio (higher is better) - compression_ratio = baseline_results['memory'] / optimized_results['memory'] - - # Calculate accuracy delta (closer to 0 is better, negative means degradation) - accuracy_delta = optimized_results['accuracy'] - baseline_results['accuracy'] - - # Calculate efficiency score (combined metric) - # Penalize accuracy loss: the more accuracy you lose, the lower your score - accuracy_penalty = max(1.0, 1.0 - accuracy_delta) if accuracy_delta < 0 else 1.0 - efficiency_score = (speedup * compression_ratio) / accuracy_penalty - - return { - 'speedup': speedup, - 'compression_ratio': compression_ratio, - 'accuracy_delta': accuracy_delta, - 'efficiency_score': efficiency_score, - 'baseline': baseline_results.copy(), - 'optimized': optimized_results.copy() - } diff --git a/tinytorch/competition/submit.py b/tinytorch/competition/submit.py index e1beaa7b..da8585d6 100644 --- a/tinytorch/competition/submit.py +++ b/tinytorch/competition/submit.py @@ -16,7 +16,7 @@ # ╚═══════════════════════════════════════════════════════════════════════════════╝ # %% auto 0 __all__ = ['validate_installation', 
'load_baseline_model', 'generate_baseline', 'worked_example_optimization', - 'optimize_for_competition', 'validate_submission', 'generate_submission'] + 'optimize_for_competition', 'generate_submission'] # %% ../../modules/source/20_competition/competition_dev.ipynb 4 import numpy as np @@ -24,8 +24,6 @@ import json import time from pathlib import Path from typing import Dict, List, Tuple, Any, Optional -from ..benchmarking.benchmark import Benchmark, calculate_normalized_scores -from ..profiling.profiler import Profiler def validate_installation() -> Dict[str, bool]: """ @@ -364,24 +362,31 @@ def worked_example_optimization(): return submission # %% ../../modules/source/20_competition/competition_dev.ipynb 10 -def optimize_for_competition(baseline_model, event: str = "all_around", division: str = "closed"): +def optimize_for_competition(baseline_model, event: str = "all_around"): """ 🏅 YOUR COMPETITION ENTRY - IMPLEMENT YOUR STRATEGY HERE! + This is where you apply optimization techniques from Modules 14-18. 
+ + Available techniques: + - Module 14: KV Caching (for transformers) - enable_kv_cache() + - Module 16: Acceleration (vectorization, fusion) + - Module 17: Quantization (INT8, INT4) - quantize_model() + - Module 18: Compression (pruning) - magnitude_prune() + Args: - baseline_model: Starting model (use for Closed, optional for Open) - event: Category you're competing in + baseline_model: The unoptimized model + event: Which Olympic event you're competing in - "latency_sprint": Minimize latency - "memory_challenge": Minimize memory - "accuracy_contest": Maximize accuracy - "all_around": Best balance - "extreme_push": Most aggressive - division: "closed" or "open" - which track you chose Returns: Your optimized model - 🔒 CLOSED DIVISION Example: + Example: from tinytorch.optimization.quantization import quantize_model from tinytorch.optimization.compression import magnitude_prune @@ -389,15 +394,6 @@ def optimize_for_competition(baseline_model, event: str = "all_around", division optimized = quantize_model(optimized, bits=8) optimized = magnitude_prune(optimized, sparsity=0.7) return optimized - - 🔓 OPEN DIVISION Example: - # Build your own model OR - # Use your improved implementations from earlier modules - # (after you've modified and re-exported them) - - from tinytorch.models import YourCustomArchitecture - optimized = YourCustomArchitecture() - return optimized """ print(f"🏅 YOUR OPTIMIZATION STRATEGY FOR: {event}") @@ -442,201 +438,74 @@ def optimize_for_competition(baseline_model, event: str = "all_around", division return optimized_model -#| export -def validate_submission(submission: Dict[str, Any]) -> Dict[str, Any]: - """ - Validate competition submission with sanity checks. - - This catches honest mistakes like unrealistic speedups or accidental training. - Honor code system - we trust but verify basic reasonableness. 
- - Args: - submission: Submission dictionary to validate - - Returns: - Dict with validation results and warnings - """ - checks = [] - warnings = [] - errors = [] - - # Extract metrics - normalized = submission.get("normalized_scores", {}) - speedup = normalized.get("speedup", 1.0) - compression = normalized.get("compression_ratio", 1.0) - accuracy_delta = normalized.get("accuracy_delta", 0.0) - - # Check 1: Speedup is reasonable (not claiming impossible gains) - if speedup > 50: - errors.append(f"❌ Speedup {speedup:.1f}x seems unrealistic (>50x)") - elif speedup > 20: - warnings.append(f"⚠️ Speedup {speedup:.1f}x is very high - please verify measurements") - else: - checks.append(f"✅ Speedup {speedup:.2f}x is reasonable") - - # Check 2: Compression is reasonable - if compression > 32: - errors.append(f"❌ Compression {compression:.1f}x seems unrealistic (>32x)") - elif compression > 16: - warnings.append(f"⚠️ Compression {compression:.1f}x is very high - please verify") - else: - checks.append(f"✅ Compression {compression:.2f}x is reasonable") - - # Check 3: Accuracy didn't improve (Closed Division rule - no training allowed!) 
- division = submission.get("division", "closed") - if division == "closed" and accuracy_delta > 1.0: - errors.append(f"❌ Accuracy improved by {accuracy_delta:.1f}pp - did you accidentally train the model?") - elif accuracy_delta > 0.5: - warnings.append(f"⚠️ Accuracy improved by {accuracy_delta:.1f}pp - verify no training occurred") - else: - checks.append(f"✅ Accuracy change {accuracy_delta:+.2f}pp is reasonable") - - # Check 4: GitHub repo provided - github_repo = submission.get("github_repo", "") - if not github_repo or github_repo == "": - warnings.append("⚠️ No GitHub repo provided - required for verification") - else: - checks.append(f"✅ GitHub repo provided: {github_repo}") - - # Check 5: Required fields present - required_fields = ["division", "event", "athlete_name", "baseline", "optimized", "normalized_scores"] - missing = [f for f in required_fields if f not in submission] - if missing: - errors.append(f"❌ Missing required fields: {', '.join(missing)}") - else: - checks.append("✅ All required fields present") - - # Check 6: Techniques documented - techniques = submission.get("techniques_applied", []) - if not techniques or "TODO" in str(techniques): - warnings.append("⚠️ No optimization techniques listed") - else: - checks.append(f"✅ Techniques documented: {', '.join(techniques[:3])}...") - - return { - "valid": len(errors) == 0, - "checks": checks, - "warnings": warnings, - "errors": errors - } - -#| export def generate_submission(baseline_model, optimized_model, - division: str = "closed", event: str = "all_around", athlete_name: str = "YourName", - github_repo: str = "", techniques: List[str] = None) -> Dict[str, Any]: """ - Generate standardized TinyMLPerf competition submission with normalized scoring. + Generate standardized competition submission. Args: baseline_model: Original unoptimized model optimized_model: Your optimized model - division: "closed" or "open" - event: Competition category (latency_sprint, memory_challenge, all_around, etc.) 
- athlete_name: Your name for submission - github_repo: GitHub repository URL for code verification - techniques: List of optimization techniques applied + event: Olympic event name + athlete_name: Your name for leaderboard + techniques: List of techniques applied Returns: Submission dictionary (will be saved as JSON) """ - print("📤 Generating TinyMLPerf Competition Submission...") + print("📤 Generating Competition Submission...") print("=" * 70) # Get baseline metrics baseline_metrics = generate_baseline(quick=True) - # Benchmark optimized model + # For demonstration, estimate optimized metrics + # In real competition, this would benchmark the actual optimized model print("🔬 Benchmarking optimized model...") - # Use Profiler and Benchmark from Module 19 - profiler = Profiler() - - # For demonstration, we'll use placeholder metrics - # In real competition, students would measure their actual optimized model + # Placeholder: Students' actual optimizations would be measured here optimized_metrics = { - "model": getattr(optimized_model, 'name', 'Optimized_Model'), - "accuracy": 84.0, # Would be measured with actual test set - "latency_ms": 28.0, # Would be measured with profiler - "memory_mb": 4.0, # Would be measured with profiler - "parameters": 2000000, # Would be counted + "model": "Your_Optimized_Model", + "accuracy": 84.0, # Measured + "latency_ms": 28.0, # Measured + "memory_mb": 4.0, # Measured + "parameters": 2000000, # Measured } - # Calculate normalized scores using Module 19's function - baseline_for_norm = { - "latency": baseline_metrics["latency_ms"], - "memory": baseline_metrics["memory_mb"], - "accuracy": baseline_metrics["accuracy"] + # Calculate improvements + improvements = { + "accuracy_change": optimized_metrics["accuracy"] - baseline_metrics["accuracy"], + "latency_speedup": baseline_metrics["latency_ms"] / optimized_metrics["latency_ms"], + "memory_reduction": baseline_metrics["memory_mb"] / optimized_metrics["memory_mb"], } - optimized_for_norm 
= { - "latency": optimized_metrics["latency_ms"], - "memory": optimized_metrics["memory_mb"], - "accuracy": optimized_metrics["accuracy"] - } - - normalized_scores = calculate_normalized_scores(baseline_for_norm, optimized_for_norm) - - # Create submission with all required fields + # Create submission submission = { - "division": division, "event": event, "athlete_name": athlete_name, - "github_repo": github_repo, "baseline": baseline_metrics, "optimized": optimized_metrics, - "normalized_scores": { - "speedup": normalized_scores["speedup"], - "compression_ratio": normalized_scores["compression_ratio"], - "accuracy_delta": normalized_scores["accuracy_delta"], - "efficiency_score": normalized_scores["efficiency_score"] - }, - "techniques_applied": techniques or ["TODO: Document your optimization techniques"], + "improvements": improvements, + "techniques_applied": techniques or ["TODO: List your techniques"], "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), - "tinytorch_version": "0.1.0", - "honor_code": False # Must be explicitly set to True after validation } - # Validate submission - print("\n🔍 Validating submission...") - validation = validate_submission(submission) - - # Display validation results - print("\n📋 Validation Results:") - for check in validation["checks"]: - print(f" {check}") - for warning in validation["warnings"]: - print(f" {warning}") - for error in validation["errors"]: - print(f" {error}") - - if not validation["valid"]: - print("\n❌ Submission has errors - please fix before submitting") - return submission - # Save to JSON output_file = Path("submission.json") with open(output_file, "w") as f: json.dump(submission, f, indent=2) - print(f"\n✅ Submission saved to: {output_file}") + print(f"✅ Submission saved to: {output_file}") print() - print("📊 Your Normalized Scores (MLPerf-style):") - print(f" Division: {division.upper()}") - print(f" Event: {event.replace('_', ' ').title()}") - print(f" Speedup: {normalized_scores['speedup']:.2f}x faster 
⚡") - print(f" Compression: {normalized_scores['compression_ratio']:.2f}x smaller 💾") - print(f" Accuracy: {optimized_metrics['accuracy']:.1f}% (Δ {normalized_scores['accuracy_delta']:+.2f}pp)") - print(f" Efficiency: {normalized_scores['efficiency_score']:.2f}") - print() - print("📤 Next Steps:") - print(" 1. Verify all metrics are correct") - print(" 2. Push your code to GitHub (if not done)") - print(" 3. Run: tito submit submission.json") - print(" (This will validate and prepare final submission)") + print("📊 Your Results:") + print(f" Event: {event}") + print(f" Accuracy: {optimized_metrics['accuracy']:.1f}% (Δ {improvements['accuracy_change']:+.1f}pp)") + print(f" Latency: {optimized_metrics['latency_ms']:.1f}ms ({improvements['latency_speedup']:.2f}x faster)") + print(f" Memory: {optimized_metrics['memory_mb']:.2f}MB ({improvements['memory_reduction']:.2f}x smaller)") print() + print("📤 Upload submission.json to TorchPerf Olympics platform!") print("=" * 70) return submission diff --git a/tinytorch/core/autograd.py b/tinytorch/core/autograd.py index 4e340bfd..1a71c287 100644 --- a/tinytorch/core/autograd.py +++ b/tinytorch/core/autograd.py @@ -15,9 +15,9 @@ # ║ happens! The tinytorch/ directory is just the compiled output. 
║ # ╚═══════════════════════════════════════════════════════════════════════════════╝ # %% auto 0 -__all__ = ['Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward', 'SumBackward', - 'ReshapeBackward', 'EmbeddingBackward', 'SqrtBackward', 'MeanBackward', 'ReLUBackward', 'GELUBackward', - 'SigmoidBackward', 'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd'] +__all__ = ['Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward', 'TransposeBackward', + 'PermuteBackward', 'EmbeddingBackward', 'ReshapeBackward', 'SumBackward', 'ReLUBackward', 'SigmoidBackward', + 'SoftmaxBackward', 'GELUBackward', 'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd'] # %% ../../modules/source/05_autograd/autograd_dev.ipynb 1 import numpy as np @@ -164,92 +164,66 @@ class MulBackward(Function): return grad_a, grad_b -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 12 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 13 class SubBackward(Function): """ Gradient computation for tensor subtraction. **Mathematical Rule:** If z = a - b, then ∂z/∂a = 1 and ∂z/∂b = -1 - - **Key Insight:** Subtraction passes gradient unchanged to first input, - but negates it for second input (because of the minus sign). - - **Applications:** Used in residual connections, computing differences in losses. """ def apply(self, grad_output): """ Compute gradients for subtraction. 
- Args: - grad_output: Gradient flowing backward from output - Returns: - Tuple of (grad_a, grad_b) for the two inputs - - **Mathematical Foundation:** - - ∂(a-b)/∂a = 1 → grad_a = grad_output - - ∂(a-b)/∂b = -1 → grad_b = -grad_output + Tuple of (grad_a, grad_b) where grad_b is negated """ a, b = self.saved_tensors grad_a = grad_b = None - # Gradient for first input: grad_output (unchanged) if isinstance(a, Tensor) and a.requires_grad: - grad_a = grad_output + grad_a = grad_output # ∂(a-b)/∂a = 1 - # Gradient for second input: -grad_output (negated) if isinstance(b, Tensor) and b.requires_grad: - grad_b = -grad_output + grad_b = -grad_output # ∂(a-b)/∂b = -1 (note the negative!) return grad_a, grad_b - -#| export +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 15 class DivBackward(Function): """ Gradient computation for tensor division. - **Mathematical Rule:** If z = a / b, then ∂z/∂a = 1/b and ∂z/∂b = -a/b² - - **Key Insight:** Division gradient for numerator is 1/denominator, - for denominator is -numerator/denominator². - - **Applications:** Used in normalization (LayerNorm, BatchNorm), loss functions. + **Mathematical Rule:** If z = a / b, then: + - ∂z/∂a = 1/b + - ∂z/∂b = -a/b² """ def apply(self, grad_output): """ - Compute gradients for division. + Compute gradients for division using quotient rule. 
- Args: - grad_output: Gradient flowing backward from output - Returns: - Tuple of (grad_a, grad_b) for the two inputs - - **Mathematical Foundation:** - - ∂(a/b)/∂a = 1/b → grad_a = grad_output / b - - ∂(a/b)/∂b = -a/b² → grad_b = -grad_output * a / b² + Tuple of (grad_a, grad_b) """ a, b = self.saved_tensors grad_a = grad_b = None - # Gradient for numerator: grad_output / b if isinstance(a, Tensor) and a.requires_grad: + # ∂(a/b)/∂a = 1/b if isinstance(b, Tensor): grad_a = grad_output / b.data else: grad_a = grad_output / b - # Gradient for denominator: -grad_output * a / b² if isinstance(b, Tensor) and b.requires_grad: + # ∂(a/b)/∂b = -a/b² grad_b = -grad_output * a.data / (b.data ** 2) return grad_a, grad_b - -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 14 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 17 class MatmulBackward(Function): """ Gradient computation for matrix multiplication. @@ -269,8 +243,6 @@ class MatmulBackward(Function): """ Compute gradients for matrix multiplication. - Handles both 2D matrices and 3D batched tensors (for transformers). - Args: grad_output: Gradient flowing backward from output @@ -278,40 +250,244 @@ class MatmulBackward(Function): Tuple of (grad_a, grad_b) for the two matrix inputs **Mathematical Foundation:** - - 2D: ∂(A@B)/∂A = grad_output @ B.T - - 3D: ∂(A@B)/∂A = grad_output @ swapaxes(B, -2, -1) + - ∂(A@B)/∂A = grad_output @ B.T + - ∂(A@B)/∂B = A.T @ grad_output - **Why Both Cases:** - - 2D: Traditional matrix multiplication (Linear layers) - - 3D: Batched operations (Transformers: batch, seq, embed) + **Batched Operation:** For 3D+ tensors, we transpose only the last two + dimensions using np.swapaxes, preserving batch dimensions. 
""" a, b = self.saved_tensors grad_a = grad_b = None - # Detect if we're dealing with batched (3D) or regular (2D) tensors - is_batched = len(grad_output.shape) == 3 - - # Gradient for first input: grad_output @ b.T (or batched equivalent) + # Gradient for first input: grad_output @ b.T if isinstance(a, Tensor) and a.requires_grad: - if is_batched: - # Batched: use matmul and swapaxes for transpose - grad_a = np.matmul(grad_output, np.swapaxes(b.data, -2, -1)) + # For batched tensors, transpose only last two dims + if b.data.ndim >= 2: + b_T = np.swapaxes(b.data, -2, -1) else: - # 2D: use dot and .T for transpose - grad_a = np.dot(grad_output, b.data.T) + b_T = b.data.T + grad_a = np.matmul(grad_output, b_T) - # Gradient for second input: a.T @ grad_output (or batched equivalent) + # Gradient for second input: a.T @ grad_output if isinstance(b, Tensor) and b.requires_grad: - if is_batched: - # Batched: use matmul and swapaxes for transpose - grad_b = np.matmul(np.swapaxes(a.data, -2, -1), grad_output) + # For batched tensors, transpose only last two dims + if a.data.ndim >= 2: + a_T = np.swapaxes(a.data, -2, -1) else: - # 2D: use dot and .T for transpose - grad_b = np.dot(a.data.T, grad_output) + a_T = a.data.T + grad_b = np.matmul(a_T, grad_output) return grad_a, grad_b -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 16 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 18 +class TransposeBackward(Function): + """ + Gradient computation for transpose operation. + + **Mathematical Rule:** If Y = X.T, then: + - ∂Y/∂X = grad_Y.T + + **Key Insight:** The gradient of transpose is just transpose the gradient! + This is because transpose is a linear operation that just rearranges elements. + + **Applications:** Used in attention (K.T for scores), weight gradients (W.T), + and any operation that needs to swap matrix dimensions. 
+ """ + + def __init__(self, tensor, dim0, dim1): + """ + Args: + tensor: Input tensor + dim0: First dimension to swap (None for default) + dim1: Second dimension to swap (None for default) + """ + super().__init__(tensor) + self.dim0 = dim0 + self.dim1 = dim1 + + def apply(self, grad_output): + """ + Compute gradient for transpose. + + Args: + grad_output: Gradient flowing backward from output + + Returns: + Tuple with single gradient for input tensor + + **Mathematical Foundation:** + - ∂(X.T)/∂X = grad_output.T + - Just transpose the gradient back! + """ + x, = self.saved_tensors + grad_x = None + + if isinstance(x, Tensor) and x.requires_grad: + # Transpose gradient using the same dims + if self.dim0 is None and self.dim1 is None: + # Default: transpose last two dimensions + if grad_output.ndim < 2: + grad_x = grad_output.copy() + else: + axes = list(range(grad_output.ndim)) + axes[-2], axes[-1] = axes[-1], axes[-2] + grad_x = np.transpose(grad_output, axes) + else: + # Specific dimensions: swap them back + axes = list(range(grad_output.ndim)) + axes[self.dim0], axes[self.dim1] = axes[self.dim1], axes[self.dim0] + grad_x = np.transpose(grad_output, axes) + + return (grad_x,) + +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 19 +class PermuteBackward(Function): + """ + Gradient computation for arbitrary axis permutation (general transpose). + + **Mathematical Rule:** If Y = X.permute(axes), then: + - ∂Y/∂X = grad_Y.permute(inverse_axes) + + **Example:** If axes = (0, 2, 1, 3), the inverse is (0, 2, 1, 3) (self-inverse). + More generally, if axes = (2, 0, 1), the inverse is (1, 2, 0). + + **Key Insight:** To reverse a permutation, we need to know where each axis went. + If axis i went to position axes[i], then in the inverse, position axes[i] should go to i. + + **Applications:** Multi-head attention uses (0, 2, 1, 3) to rearrange heads. 
+ """ + + def __init__(self, tensor, axes): + """ + Args: + tensor: Input tensor + axes: Tuple of axis indices defining the permutation + """ + super().__init__(tensor) + self.axes = axes + # Compute inverse permutation: if axes[i] = j, then inverse_axes[j] = i + self.inverse_axes = tuple(np.argsort(axes)) + + def apply(self, grad_output): + """ + Compute gradient for permutation. + + The gradient is permuted back using the inverse permutation. + + **Mathematical Foundation:** + - ∂(X.permute(axes))/∂X = grad_output.permute(inverse_axes) + """ + x, = self.saved_tensors + grad_x = None + + if isinstance(x, Tensor) and x.requires_grad: + # Permute gradient back to original axis order + grad_x = np.transpose(grad_output, self.inverse_axes) + + return (grad_x,) + +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 20 +class EmbeddingBackward(Function): + """ + Gradient computation for embedding lookup operation. + + **Mathematical Rule:** If Y = Embedding[indices], then: + - ∂Loss/∂Embedding[i] = sum of all gradients where index==i + + **Key Insight:** Embedding lookup is a gather operation. The backward + is a scatter operation that accumulates gradients to the embedding weights. + + **Applications:** Word embeddings, positional embeddings, token embeddings + in transformers. + """ + + def __init__(self, weight, indices): + """ + Args: + weight: Embedding weight matrix + indices: Indices used for lookup + """ + super().__init__(weight) + self.indices = indices + + def apply(self, grad_output): + """ + Compute gradient for embedding lookup. 
+ + Args: + grad_output: Gradient flowing backward from output + + Returns: + Tuple with single gradient for weight tensor + + **Mathematical Foundation:** + - ∂(Embedding[indices])/∂Embedding = scatter gradients to selected rows + - Multiple indices can point to same embedding → gradients accumulate + """ + weight, = self.saved_tensors + grad_weight = None + + if isinstance(weight, Tensor) and weight.requires_grad: + # Initialize gradient with zeros + grad_weight = np.zeros_like(weight.data) + + # Scatter gradients back to embedding weights + # np.add.at accumulates gradients for repeated indices + indices_flat = self.indices.data.astype(int).flatten() + grad_output_reshaped = grad_output.reshape(-1, grad_output.shape[-1]) + + np.add.at(grad_weight, indices_flat, grad_output_reshaped) + + return (grad_weight,) + +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 21 +class ReshapeBackward(Function): + """ + Gradient computation for reshape operation. + + **Mathematical Rule:** If Y = X.reshape(new_shape), then: + - ∂Y/∂X = grad_Y.reshape(X.shape) + + **Key Insight:** Reshape just rearranges the same elements. + The gradient is simply reshaped back to the original shape! + + **Applications:** Flattening tensors for linear layers, reshaping + between convolutional and dense layers. + """ + + def __init__(self, tensor, original_shape): + """ + Args: + tensor: Input tensor + original_shape: Shape before reshape + """ + super().__init__(tensor) + self.original_shape = original_shape + + def apply(self, grad_output): + """ + Compute gradient for reshape. + + Args: + grad_output: Gradient flowing backward from output + + Returns: + Tuple with single gradient for input tensor + + **Mathematical Foundation:** + - ∂(X.reshape(...))/∂X = grad_output.reshape(X.shape) + - Just reshape the gradient back! 
+ """ + x, = self.saved_tensors + grad_x = None + + if isinstance(x, Tensor) and x.requires_grad: + # Reshape gradient back to original shape + grad_x = grad_output.reshape(self.original_shape) + + return (grad_x,) + +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23 class SumBackward(Function): """ Gradient computation for tensor sum. @@ -345,186 +521,7 @@ class SumBackward(Function): return np.ones_like(tensor.data) * grad_output, return None, -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 17 -class ReshapeBackward(Function): - """ - Gradient computation for tensor reshape. - - **Mathematical Rule:** If z = reshape(a, new_shape), then ∂z/∂a is reshape(grad_z, old_shape) - - **Key Insight:** Reshape doesn't change values, only their arrangement. - Gradients flow back by reshaping to the original shape. - - **Applications:** Used in transformers (flattening for loss), CNNs, and - anywhere tensor dimensions need to be rearranged. - """ - - def apply(self, grad_output): - """ - Compute gradients for reshape operation. - - Args: - grad_output: Gradient flowing backward from output - - Returns: - Tuple containing gradient for the input tensor - - **Mathematical Foundation:** - - Reshape is a view operation: grad_input = reshape(grad_output, original_shape) - """ - tensor, = self.saved_tensors - original_shape = tensor.shape - - if isinstance(tensor, Tensor) and tensor.requires_grad: - # Reshape gradient back to original input shape - return np.reshape(grad_output, original_shape), - return None, - - -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 18 -class EmbeddingBackward(Function): - """ - Gradient computation for embedding lookup. - - **Mathematical Rule:** If z = embedding[indices], gradients accumulate at indexed positions. - - **Key Insight:** Multiple indices can point to the same embedding vector, - so gradients must accumulate (not overwrite) at each position. 
- - **Applications:** Used in NLP transformers, language models, and any discrete input. - """ - - def apply(self, grad_output): - """ - Compute gradients for embedding lookup. - - Args: - grad_output: Gradient flowing backward from output (batch, seq, embed_dim) - - Returns: - Tuple containing gradient for the embedding weight matrix - - **Mathematical Foundation:** - - Embedding is a lookup: output[i] = weight[indices[i]] - - Gradients scatter back to indexed positions: grad_weight[indices[i]] += grad_output[i] - - Must accumulate because multiple positions can use same embedding - """ - weight, indices = self.saved_tensors - - if isinstance(weight, Tensor) and weight.requires_grad: - # Initialize gradient matrix with zeros - grad_weight = np.zeros_like(weight.data) - - # Scatter gradients back to embedding table - # np.add.at accumulates values at repeated indices - flat_indices = indices.data.astype(int).flatten() - flat_grad_output = grad_output.reshape((-1, weight.shape[-1])) - - np.add.at(grad_weight, flat_indices, flat_grad_output) - - return grad_weight, None - - return None, None - - -#| export -class SqrtBackward(Function): - """ - Gradient computation for square root. - - **Mathematical Rule:** If z = sqrt(x), then ∂z/∂x = 1 / (2 * sqrt(x)) - - **Key Insight:** Gradient is inversely proportional to the square root output. - - **Applications:** Used in normalization (LayerNorm, BatchNorm), distance metrics. - """ - - def apply(self, grad_output): - """ - Compute gradients for sqrt operation. 
- - Args: - grad_output: Gradient flowing backward from output - - Returns: - Tuple containing gradient for the input - - **Mathematical Foundation:** - - d/dx(sqrt(x)) = 1 / (2 * sqrt(x)) = 1 / (2 * output) - """ - x, = self.saved_tensors - output = self.saved_output - - if isinstance(x, Tensor) and x.requires_grad: - # Gradient: 1 / (2 * sqrt(x)) - grad_x = grad_output / (2.0 * output.data) - return grad_x, - - return None, - - -#| export -class MeanBackward(Function): - """ - Gradient computation for mean reduction. - - **Mathematical Rule:** If z = mean(x), then ∂z/∂x_i = 1 / N for all i - - **Key Insight:** Mean distributes gradient equally to all input elements. - - **Applications:** Used in loss functions, normalization (LayerNorm, BatchNorm). - """ - - def apply(self, grad_output): - """ - Compute gradients for mean reduction. - - Args: - grad_output: Gradient flowing backward from output - - Returns: - Tuple containing gradient for the input - - **Mathematical Foundation:** - - mean reduces by averaging, so gradient is distributed equally - - Each input element contributes 1/N to the output - - Gradient: grad_output / N, broadcasted to input shape - """ - x, = self.saved_tensors - axis = self.axis - keepdims = self.keepdims - - if isinstance(x, Tensor) and x.requires_grad: - # Number of elements that were averaged - if axis is None: - N = x.size - else: - if isinstance(axis, int): - N = x.shape[axis] - else: - N = np.prod([x.shape[ax] for ax in axis]) - - # Distribute gradient equally: each element gets grad_output / N - grad_x = grad_output / N - - # Broadcast gradient back to original shape - if not keepdims and axis is not None: - # Need to add back the reduced dimensions for broadcasting - if isinstance(axis, int): - grad_x = np.expand_dims(grad_x, axis=axis) - else: - for ax in sorted(axis): - grad_x = np.expand_dims(grad_x, axis=ax) - - # Broadcast to match input shape - grad_x = np.broadcast_to(grad_x, x.shape) - - return grad_x, - - return None, - 
- -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28 class ReLUBackward(Function): """ Gradient computation for ReLU activation. @@ -547,48 +544,7 @@ class ReLUBackward(Function): return grad_output * relu_grad, return None, -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 24 -class GELUBackward(Function): - """ - Gradient computation for GELU activation. - - **Mathematical Rule:** GELU(x) = x * Φ(x) where Φ is the standard normal CDF - - **Key Insight:** GELU gradient involves both the function value and its derivative. - - **Applications:** Used in modern transformers (GPT, BERT) as a smooth alternative to ReLU. - """ - - def apply(self, grad_output): - """ - Compute gradients for GELU activation. - - Args: - grad_output: Gradient flowing backward from output - - Returns: - Tuple containing gradient for the input - - **Mathematical Foundation:** - - GELU approximation: f(x) = x * sigmoid(1.702 * x) - - Gradient: f'(x) = sigmoid(1.702*x) + x * sigmoid(1.702*x) * (1-sigmoid(1.702*x)) * 1.702 - """ - x, = self.saved_tensors - - if isinstance(x, Tensor) and x.requires_grad: - # GELU gradient using approximation - # f(x) = x * sigmoid(1.702*x) - # f'(x) = sigmoid(1.702*x) + 1.702 * x * sigmoid(1.702*x) * (1 - sigmoid(1.702*x)) - - sig = 1.0 / (1.0 + np.exp(-1.702 * x.data)) - grad_x = grad_output * (sig + 1.702 * x.data * sig * (1 - sig)) - - return grad_x, - - return None, - - -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 25 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29 class SigmoidBackward(Function): """ Gradient computation for sigmoid activation. @@ -618,7 +574,101 @@ class SigmoidBackward(Function): return grad_output * sigmoid_grad, return None, -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 26 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 30 +class SoftmaxBackward(Function): + """ + Gradient computation for softmax activation. 
+ + Softmax: softmax(x)[i] = exp(x[i]) / sum(exp(x)) + Derivative: ∂softmax/∂x[i] = softmax[i] * (δ[i,j] - softmax[j]) + + For gradient computation: + grad_x[i] = softmax[i] * (grad_y[i] - sum(grad_y * softmax)) + + **Key Insight:** The gradient depends on all elements of softmax due to + the normalization, not just the element being differentiated. + """ + + def __init__(self, input_tensor, output_tensor, dim=-1): + """ + Initialize with input, output, and dimension. + + Args: + input_tensor: Original input to softmax + output_tensor: Output of softmax (needed for gradient) + dim: Dimension along which softmax was applied + """ + super().__init__(input_tensor) + self.output_data = output_tensor.data + self.dim = dim + + def apply(self, grad_output): + """ + Compute gradient for softmax. + + Mathematical formula: + ∂L/∂x[i] = softmax[i] * (∂L/∂y[i] - sum_j(∂L/∂y[j] * softmax[j])) + + This can be vectorized as: + grad_x = softmax * (grad_y - sum(grad_y * softmax, keepdims=True)) + """ + tensor, = self.saved_tensors + + if isinstance(tensor, Tensor) and tensor.requires_grad: + # Compute sum(grad_output * softmax) along the softmax dimension + sum_term = np.sum(grad_output * self.output_data, axis=self.dim, keepdims=True) + + # Softmax gradient: softmax * (grad_output - sum_term) + grad_x = self.output_data * (grad_output - sum_term) + + return (grad_x,) + return (None,) + +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 31 +class GELUBackward(Function): + """ + Gradient computation for GELU activation. + + GELU: f(x) = x * Φ(x) where Φ is the CDF of standard normal + Approximation: gelu(x) ≈ 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³))) + + **Key Insight:** GELU is smoother than ReLU, providing non-zero gradients + for negative values, which helps training deep networks. + """ + + def __init__(self, input_tensor): + """Initialize with input tensor.""" + super().__init__(input_tensor) + + def apply(self, grad_output): + """ + Compute gradient for GELU. 
+ + Mathematical formula (using approximation): + ∂gelu/∂x ≈ 0.5 * (1 + tanh(...)) + 0.5 * x * sech²(...) * (...) + + Simplified: We compute the derivative numerically or use the formula. + """ + tensor, = self.saved_tensors + + if isinstance(tensor, Tensor) and tensor.requires_grad: + x = tensor.data + # GELU derivative approximation + # Using the tanh approximation: gelu(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) + sqrt_2_over_pi = np.sqrt(2.0 / np.pi) + x_cubed = x ** 3 + tanh_arg = sqrt_2_over_pi * (x + 0.044715 * x_cubed) + tanh_out = np.tanh(tanh_arg) + sech_squared = 1 - tanh_out ** 2 + + # Derivative: 0.5 * (1 + tanh(...)) + 0.5 * x * sech²(...) * d(tanh_arg)/dx + d_tanh_arg = sqrt_2_over_pi * (1 + 0.134145 * x ** 2) + gelu_grad = 0.5 * (1 + tanh_out) + 0.5 * x * sech_squared * d_tanh_arg + + return (grad_output * gelu_grad,) + return (None,) + +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 32 class MSEBackward(Function): """ Gradient computation for Mean Squared Error Loss. @@ -644,7 +694,7 @@ class MSEBackward(Function): return grad * grad_output, return None, -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 27 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 33 class BCEBackward(Function): """ Gradient computation for Binary Cross-Entropy Loss. @@ -674,7 +724,7 @@ class BCEBackward(Function): return grad * grad_output, return None, -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 34 class CrossEntropyBackward(Function): """ Gradient computation for Cross-Entropy Loss. @@ -719,7 +769,7 @@ class CrossEntropyBackward(Function): return grad * grad_output, return None, -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 35 def enable_autograd(): """ Enable gradient tracking for all Tensor operations. 
@@ -758,8 +808,10 @@ def enable_autograd(): _original_add = Tensor.__add__ _original_sub = Tensor.__sub__ _original_mul = Tensor.__mul__ - _original_truediv = Tensor.__truediv__ + _original_div = Tensor.__truediv__ _original_matmul = Tensor.matmul if hasattr(Tensor, 'matmul') else None + _original_transpose = Tensor.transpose if hasattr(Tensor, 'transpose') else None + _original_reshape = Tensor.reshape if hasattr(Tensor, 'reshape') else None # Enhanced operations that track gradients def tracked_add(self, other): @@ -806,6 +858,76 @@ def enable_autograd(): return result + def tracked_matmul(self, other): + """ + Matrix multiplication with gradient tracking. + + Enhances the original matmul method to build computation graphs + when requires_grad=True for any input. + """ + if _original_matmul: + result = _original_matmul(self, other) + else: + # Fallback if matmul doesn't exist + result = Tensor(np.dot(self.data, other.data)) + + # Track gradient if needed + if self.requires_grad or other.requires_grad: + result.requires_grad = True + result._grad_fn = MatmulBackward(self, other) + + return result + + def tracked_transpose(self, dim0=None, dim1=None): + """ + Transpose with gradient tracking. + + Enhances the original transpose method to build computation graphs + when requires_grad=True for the input. 
+ """ + if _original_transpose: + result = _original_transpose(self, dim0, dim1) + else: + # Fallback if transpose doesn't exist + if dim0 is None and dim1 is None: + axes = list(range(len(self.shape))) + if len(axes) >= 2: + axes[-2], axes[-1] = axes[-1], axes[-2] + result = Tensor(np.transpose(self.data, axes)) + else: + axes = list(range(len(self.shape))) + axes[dim0], axes[dim1] = axes[dim1], axes[dim0] + result = Tensor(np.transpose(self.data, axes)) + + # Track gradient if needed + if self.requires_grad: + result.requires_grad = True + result._grad_fn = TransposeBackward(self, dim0, dim1) + + return result + + def tracked_reshape(self, *shape): + """ + Reshape with gradient tracking. + + Enhances the original reshape method to build computation graphs + when requires_grad=True for the input. + """ + original_shape = self.shape + + if _original_reshape: + result = _original_reshape(self, *shape) + else: + # Fallback if reshape doesn't exist + result = Tensor(self.data.reshape(*shape)) + + # Track gradient if needed + if self.requires_grad: + result.requires_grad = True + result._grad_fn = ReshapeBackward(self, original_shape) + + return result + def tracked_sub(self, other): """ Subtraction with gradient tracking. @@ -827,7 +949,7 @@ def enable_autograd(): return result - def tracked_truediv(self, other): + def tracked_div(self, other): """ Division with gradient tracking. @@ -839,7 +961,7 @@ def enable_autograd(): other = Tensor(other) # Call original operation - result = _original_truediv(self, other) + result = _original_div(self, other) # Track gradient if needed if self.requires_grad or other.requires_grad: @@ -848,26 +970,6 @@ def enable_autograd(): return result - def tracked_matmul(self, other): - """ - Matrix multiplication with gradient tracking. - - Enhances the original matmul method to build computation graphs - when requires_grad=True for any input. 
- """ - if _original_matmul: - result = _original_matmul(self, other) - else: - # Fallback if matmul doesn't exist - result = Tensor(np.dot(self.data, other.data)) - - # Track gradient if needed - if self.requires_grad or other.requires_grad: - result.requires_grad = True - result._grad_fn = MatmulBackward(self, other) - - return result - def sum_op(self, axis=None, keepdims=False): """ Sum operation with gradient tracking. @@ -958,20 +1060,23 @@ def enable_autograd(): Tensor.__add__ = tracked_add Tensor.__sub__ = tracked_sub Tensor.__mul__ = tracked_mul - Tensor.__truediv__ = tracked_truediv + Tensor.__truediv__ = tracked_div Tensor.matmul = tracked_matmul + Tensor.transpose = tracked_transpose + Tensor.reshape = tracked_reshape Tensor.sum = sum_op Tensor.backward = backward Tensor.zero_grad = zero_grad # Patch activations and losses to track gradients try: - from tinytorch.core.activations import Sigmoid, ReLU, GELU + from tinytorch.core.activations import Sigmoid, ReLU, Softmax, GELU from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss, CrossEntropyLoss # Store original methods _original_sigmoid_forward = Sigmoid.forward _original_relu_forward = ReLU.forward + _original_softmax_forward = Softmax.forward _original_gelu_forward = GELU.forward _original_bce_forward = BinaryCrossEntropyLoss.forward _original_mse_forward = MSELoss.forward @@ -999,13 +1104,24 @@ def enable_autograd(): return result + def tracked_softmax_forward(self, x, dim=-1): + """Softmax with gradient tracking.""" + # Call original forward to get result using Tensor operations + result = _original_softmax_forward(self, x, dim=dim) + + # Attach the correct gradient function + if x.requires_grad: + result.requires_grad = True + result._grad_fn = SoftmaxBackward(x, result, dim) + + return result + def tracked_gelu_forward(self, x): """GELU with gradient tracking.""" - # GELU approximation: x * sigmoid(1.702 * x) - sigmoid_part = 1.0 / (1.0 + np.exp(-1.702 * x.data)) - result_data = 
x.data * sigmoid_part - result = Tensor(result_data) + # Call original forward to get result + result = _original_gelu_forward(self, x) + # Attach the correct gradient function if x.requires_grad: result.requires_grad = True result._grad_fn = GELUBackward(x) @@ -1071,6 +1187,7 @@ def enable_autograd(): # Install patched methods Sigmoid.forward = tracked_sigmoid_forward ReLU.forward = tracked_relu_forward + Softmax.forward = tracked_softmax_forward GELU.forward = tracked_gelu_forward BinaryCrossEntropyLoss.forward = tracked_bce_forward MSELoss.forward = tracked_mse_forward diff --git a/tinytorch/core/tensor.py b/tinytorch/core/tensor.py index 4c0912c0..82e681fa 100644 --- a/tinytorch/core/tensor.py +++ b/tinytorch/core/tensor.py @@ -113,21 +113,10 @@ class Tensor: ### BEGIN SOLUTION if isinstance(other, Tensor): # Tensor + Tensor: let NumPy handle broadcasting - result_data = self.data + other.data + return Tensor(self.data + other.data) else: # Tensor + scalar: NumPy broadcasts automatically - result_data = self.data + other - - # Create new tensor with result - result = Tensor(result_data) - - # Preserve gradient tracking if either operand requires gradients - if hasattr(self, 'requires_grad') and hasattr(other, 'requires_grad'): - result.requires_grad = self.requires_grad or (isinstance(other, Tensor) and other.requires_grad) - elif hasattr(self, 'requires_grad'): - result.requires_grad = self.requires_grad - - return result + return Tensor(self.data + other) ### END SOLUTION # nbgrader={"grade": false, "grade_id": "more-arithmetic", "solution": true} @@ -137,10 +126,12 @@ class Tensor: Common use: Centering data (x - mean), computing differences for loss functions. """ + ### BEGIN SOLUTION if isinstance(other, Tensor): return Tensor(self.data - other.data) else: return Tensor(self.data - other) + ### END SOLUTION def __mul__(self, other): """ @@ -149,10 +140,12 @@ class Tensor: Common use: Scaling features, applying masks, gating mechanisms in neural networks. 
Note: This is * operator, not @ (which will be matrix multiplication). """ + ### BEGIN SOLUTION if isinstance(other, Tensor): return Tensor(self.data * other.data) else: return Tensor(self.data * other) + ### END SOLUTION def __truediv__(self, other): """ @@ -160,10 +153,12 @@ class Tensor: Common use: Normalization (x / std), converting counts to probabilities. """ + ### BEGIN SOLUTION if isinstance(other, Tensor): return Tensor(self.data / other.data) else: return Tensor(self.data / other) + ### END SOLUTION # nbgrader={"grade": false, "grade_id": "matmul-impl", "solution": true} def matmul(self, other): @@ -232,7 +227,8 @@ class Tensor: ) # Perform optimized matrix multiplication - result_data = np.dot(self.data, other.data) + # Use np.matmul (not np.dot) for proper batched matrix multiplication with 3D+ tensors + result_data = np.matmul(self.data, other.data) return Tensor(result_data) ### END SOLUTION @@ -304,16 +300,8 @@ class Tensor: # Reshape the data (NumPy handles the memory layout efficiently) reshaped_data = np.reshape(self.data, new_shape) - - # Create output tensor preserving gradient tracking + # Preserve gradient tracking from the original tensor (important for autograd!) 
result = Tensor(reshaped_data, requires_grad=self.requires_grad) - - # Set up backward function for autograd - if self.requires_grad: - from tinytorch.core.autograd import ReshapeBackward - result._grad_fn = ReshapeBackward() - result._grad_fn.saved_tensors = (self,) - return result ### END SOLUTION @@ -380,7 +368,9 @@ class Tensor: axes[dim0], axes[dim1] = axes[dim1], axes[dim0] transposed_data = np.transpose(self.data, axes) - return Tensor(transposed_data) + # Preserve requires_grad for gradient tracking (Module 05 will add _grad_fn) + result = Tensor(transposed_data, requires_grad=self.requires_grad if hasattr(self, 'requires_grad') else False) + return result ### END SOLUTION # nbgrader={"grade": false, "grade_id": "reduction-ops", "solution": true} diff --git a/tinytorch/core/training.py b/tinytorch/core/training.py index f535f6b8..e4082b8f 100644 --- a/tinytorch/core/training.py +++ b/tinytorch/core/training.py @@ -15,7 +15,7 @@ # ║ happens! The tinytorch/ directory is just the compiled output. ║ # ╚═══════════════════════════════════════════════════════════════════════════════╝ # %% auto 0 -__all__ = ['CosineSchedule', 'save_checkpoint', 'load_checkpoint', 'Trainer'] +__all__ = ['CosineSchedule', 'Trainer'] # %% ../../modules/source/07_training/training_dev.ipynb 1 import numpy as np @@ -72,90 +72,6 @@ class CosineSchedule: ### END SOLUTION # %% ../../modules/source/07_training/training_dev.ipynb 14 -def save_checkpoint(checkpoint_dict: Dict[str, Any], path: str): - """ - Save checkpoint dictionary to disk using pickle. - - This is a low-level utility for saving model state. Use this when you have - a custom training loop and want to save just what you need (model params, - config, metadata). - - For complete training state with optimizer and scheduler, use - Trainer.save_checkpoint() instead. - - TODO: Implement checkpoint saving with pickle - - APPROACH: - 1. Create parent directory if it doesn't exist (Path(path).parent.mkdir) - 2. 
Open file in binary write mode ('wb') - 3. Use pickle.dump() to serialize the checkpoint dictionary - 4. Print confirmation message - - EXAMPLE: - >>> model = SimpleModel() - >>> checkpoint = { - ... 'model_params': [p.data.copy() for p in model.parameters()], - ... 'config': {'embed_dim': 32, 'num_layers': 2}, - ... 'metadata': {'final_loss': 0.089, 'training_steps': 5000} - ... } - >>> save_checkpoint(checkpoint, 'checkpoints/model.pkl') - ✓ Checkpoint saved: checkpoints/model.pkl - - HINTS: - - Use Path(path).parent.mkdir(parents=True, exist_ok=True) - - pickle.dump(obj, file) writes the object to file - - Always print a success message so users know it worked - """ - ### BEGIN SOLUTION - # Create parent directory if needed - Path(path).parent.mkdir(parents=True, exist_ok=True) - - # Save checkpoint using pickle - with open(path, 'wb') as f: - pickle.dump(checkpoint_dict, f) - - print(f"✓ Checkpoint saved: {path}") - ### END SOLUTION - -# %% ../../modules/source/07_training/training_dev.ipynb 15 -def load_checkpoint(path: str) -> Dict[str, Any]: - """ - Load checkpoint dictionary from disk using pickle. - - Companion function to save_checkpoint(). Restores the checkpoint dictionary - so you can rebuild your model, resume training, or inspect saved metadata. - - TODO: Implement checkpoint loading with pickle - - APPROACH: - 1. Open file in binary read mode ('rb') - 2. Use pickle.load() to deserialize the checkpoint - 3. Print confirmation message - 4. Return the loaded dictionary - - EXAMPLE: - >>> checkpoint = load_checkpoint('checkpoints/model.pkl') - ✓ Checkpoint loaded: checkpoints/model.pkl - >>> print(checkpoint['metadata']['final_loss']) - 0.089 - >>> model_params = checkpoint['model_params'] - >>> # Now restore model: for param, data in zip(model.parameters(), model_params)... 
- - HINTS: - - pickle.load(file) reads and deserializes the object - - Return the loaded dictionary - - Print a success message for user feedback - """ - ### BEGIN SOLUTION - # Load checkpoint using pickle - with open(path, 'rb') as f: - checkpoint = pickle.load(f) - - print(f"✓ Checkpoint loaded: {path}") - return checkpoint - ### END SOLUTION - -# %% ../../modules/source/07_training/training_dev.ipynb 19 class Trainer: """ Complete training orchestrator for neural networks. @@ -330,11 +246,6 @@ class Trainer: def save_checkpoint(self, path: str): """ Save complete training state for resumption. - - This high-level method saves everything needed to resume training: - model parameters, optimizer state, scheduler state, and training history. - - Uses the low-level save_checkpoint() function internally. Args: path: File path to save checkpoint @@ -349,23 +260,19 @@ class Trainer: 'training_mode': self.training_mode } - # Use the standalone save_checkpoint function - save_checkpoint(checkpoint, path) + Path(path).parent.mkdir(parents=True, exist_ok=True) + with open(path, 'wb') as f: + pickle.dump(checkpoint, f) def load_checkpoint(self, path: str): """ Load training state from checkpoint. - - This high-level method restores complete training state including - model parameters, optimizer state, scheduler state, and history. - - Uses the low-level load_checkpoint() function internally. 
Args: path: File path to load checkpoint from """ - # Use the standalone load_checkpoint function - checkpoint = load_checkpoint(path) + with open(path, 'rb') as f: + checkpoint = pickle.load(f) self.epoch = checkpoint['epoch'] self.step = checkpoint['step'] diff --git a/tinytorch/models/transformer.py b/tinytorch/models/transformer.py index a04d2cbd..4bf34131 100644 --- a/tinytorch/models/transformer.py +++ b/tinytorch/models/transformer.py @@ -23,47 +23,7 @@ from ..core.tensor import Tensor from ..core.layers import Linear from ..core.attention import MultiHeadAttention from ..core.activations import GELU -from ..text.embeddings import Embedding -from ..core.autograd import SqrtBackward, MeanBackward - -# Monkey-patch sqrt method onto Tensor for LayerNorm -def _tensor_sqrt(self): - """ - Compute element-wise square root with gradient tracking. - - Used in normalization layers (LayerNorm, BatchNorm). - """ - result_data = np.sqrt(self.data) - result = Tensor(result_data, requires_grad=self.requires_grad) - - if self.requires_grad: - result._grad_fn = SqrtBackward() - result._grad_fn.saved_tensors = (self,) - result._grad_fn.saved_output = result - - return result - -Tensor.sqrt = _tensor_sqrt - -# Monkey-patch mean method onto Tensor for LayerNorm -def _tensor_mean(self, axis=None, keepdims=False): - """ - Compute mean with gradient tracking. - - Used in normalization layers (LayerNorm, BatchNorm) and loss functions. 
- """ - result_data = np.mean(self.data, axis=axis, keepdims=keepdims) - result = Tensor(result_data, requires_grad=self.requires_grad) - - if self.requires_grad: - result._grad_fn = MeanBackward() - result._grad_fn.saved_tensors = (self,) - result._grad_fn.axis = axis - result._grad_fn.keepdims = keepdims - - return result - -Tensor.mean = _tensor_mean +from ..text.embeddings import Embedding, PositionalEncoding # %% ../../modules/source/13_transformers/transformers_dev.ipynb 9 class LayerNorm: @@ -101,7 +61,6 @@ class LayerNorm: self.eps = eps # Learnable parameters: scale and shift - # CRITICAL: requires_grad=True so optimizer can train these! self.gamma = Tensor(np.ones(normalized_shape), requires_grad=True) # Scale parameter self.beta = Tensor(np.zeros(normalized_shape), requires_grad=True) # Shift parameter ### END SOLUTION @@ -124,24 +83,29 @@ class LayerNorm: HINT: Use keepdims=True to maintain tensor dimensions for broadcasting """ ### BEGIN SOLUTION - # CRITICAL: Use Tensor operations (not .data) to maintain gradient flow! # Compute statistics across last dimension (features) mean = x.mean(axis=-1, keepdims=True) # Compute variance: E[(x - μ)²] - diff = x - mean # Tensor subtraction maintains gradient - variance = (diff * diff).mean(axis=-1, keepdims=True) # Tensor ops maintain gradient + # Use Tensor operations to preserve computation graph! + diff = x - mean + variance = (diff * diff).mean(axis=-1, keepdims=True) - # Normalize: (x - mean) / sqrt(variance + eps) - # Note: Use Tensor.sqrt() to preserve gradient flow - std = (variance + self.eps).sqrt() # sqrt maintains gradient flow - normalized = diff / std # Division maintains gradient flow + # Normalize - use Tensor operations to preserve gradients! 
+ # Add eps as a Tensor for proper gradient flow + eps_tensor = Tensor(np.array(self.eps), requires_grad=False) + std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad) + normalized = (x - mean) / std # Apply learnable transformation output = normalized * self.gamma + self.beta return output ### END SOLUTION + def __call__(self, x): + """Allows the layer norm to be called like a function.""" + return self.forward(x) + def parameters(self): """Return learnable parameters.""" return [self.gamma, self.beta] @@ -183,10 +147,8 @@ class MLP: # Two-layer feed-forward network self.linear1 = Linear(embed_dim, hidden_dim) + self.gelu = GELU() # Use GELU activation from activations module self.linear2 = Linear(hidden_dim, embed_dim) - - # GELU activation - self.gelu = GELU() ### END SOLUTION def forward(self, x): @@ -209,8 +171,8 @@ class MLP: # First linear layer with expansion hidden = self.linear1.forward(x) - # GELU activation (callable pattern - activations have __call__) - hidden = self.gelu(hidden) + # GELU activation (YOUR activation from Module 03!) 
+ hidden = self.gelu.forward(hidden) # Second linear layer back to original size output = self.linear2.forward(hidden) @@ -218,6 +180,10 @@ class MLP: return output ### END SOLUTION + def __call__(self, x): + """Allows the MLP to be called like a function.""" + return self.forward(x) + def parameters(self): """Return all learnable parameters.""" params = [] @@ -298,7 +264,7 @@ class TransformerBlock: # First sub-layer: Multi-head self-attention with residual connection # Pre-norm: LayerNorm before attention normed1 = self.ln1.forward(x) - # Self-attention: MultiHeadAttention internally creates Q, K, V from input + # Self-attention: query, key, value are all the same (normed1) attention_out = self.attention.forward(normed1, mask) # Residual connection @@ -315,6 +281,10 @@ class TransformerBlock: return output ### END SOLUTION + def __call__(self, x, mask=None): + """Allows the transformer block to be called like a function.""" + return self.forward(x, mask) + def parameters(self): """Return all learnable parameters.""" params = [] @@ -434,6 +404,10 @@ class GPT: return logits ### END SOLUTION + def __call__(self, tokens): + """Allows the GPT model to be called like a function.""" + return self.forward(tokens) + def _create_causal_mask(self, seq_len): """Create causal mask to prevent attending to future positions.""" ### BEGIN SOLUTION diff --git a/tinytorch/optimization/acceleration.py b/tinytorch/optimization/acceleration.py new file mode 100644 index 00000000..e59fe00f --- /dev/null +++ b/tinytorch/optimization/acceleration.py @@ -0,0 +1,22 @@ +# ╔═══════════════════════════════════════════════════════════════════════════════╗ +# ║ 🚨 CRITICAL WARNING 🚨 ║ +# ║ AUTOGENERATED! DO NOT EDIT! ║ +# ║ ║ +# ║ This file is AUTOMATICALLY GENERATED from source modules. ║ +# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! 
║ +# ║ ║ +# ║ ✅ TO EDIT: modules/source/XX_acceleration/acceleration_dev.py ║ +# ║ ✅ TO EXPORT: Run 'tito module complete ' ║ +# ║ ║ +# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║ +# ║ Editing it directly may break module functionality and training. ║ +# ║ ║ +# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║ +# ║ happens! The tinytorch/ directory is just the compiled output. ║ +# ╚═══════════════════════════════════════════════════════════════════════════════╝ +# %% auto 0 +__all__ = [] + +# %% ../../modules/source/18_acceleration/acceleration_dev.ipynb 0 +#| default_exp optimization.acceleration +#| export diff --git a/tinytorch/optimization/compression.py b/tinytorch/optimization/compression.py new file mode 100644 index 00000000..20c318fa --- /dev/null +++ b/tinytorch/optimization/compression.py @@ -0,0 +1,300 @@ +# ╔═══════════════════════════════════════════════════════════════════════════════╗ +# ║ 🚨 CRITICAL WARNING 🚨 ║ +# ║ AUTOGENERATED! DO NOT EDIT! ║ +# ║ ║ +# ║ This file is AUTOMATICALLY GENERATED from source modules. ║ +# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║ +# ║ ║ +# ║ ✅ TO EDIT: modules/source/XX_compression/compression_dev.py ║ +# ║ ✅ TO EXPORT: Run 'tito module complete ' ║ +# ║ ║ +# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║ +# ║ Editing it directly may break module functionality and training. ║ +# ║ ║ +# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║ +# ║ happens! The tinytorch/ directory is just the compiled output. 
║ +# ╚═══════════════════════════════════════════════════════════════════════════════╝ +# %% auto 0 +__all__ = ['Sequential', 'KnowledgeDistillation', 'test_unit_knowledge_distillation', 'CompressionComplete', 'measure_sparsity', + 'magnitude_prune', 'structured_prune', 'compress_model'] + +# %% ../../modules/source/17_compression/compression_dev.ipynb 1 +import numpy as np +import copy +from typing import List, Dict, Any, Tuple, Optional +import time + +# Import from TinyTorch modules +from ..core.tensor import Tensor +from ..core.layers import Linear + +# Sequential container for model compression +class Sequential: + """Sequential container for compression (not exported from core layers).""" + def __init__(self, *layers): + self.layers = list(layers) + + def forward(self, x): + for layer in self.layers: + x = layer.forward(x) if hasattr(layer, 'forward') else layer(x) + return x + + def __call__(self, x): + return self.forward(x) + + def parameters(self): + params = [] + for layer in self.layers: + if hasattr(layer, 'parameters'): + params.extend(layer.parameters()) + return params + +# %% ../../modules/source/17_compression/compression_dev.ipynb 15 +class KnowledgeDistillation: + """ + Knowledge distillation for model compression. + + Train a smaller student model to mimic a larger teacher model. + """ + + def __init__(self, teacher_model, student_model, temperature=3.0, alpha=0.7): + """ + Initialize knowledge distillation. + + TODO: Set up teacher and student models with distillation parameters + + APPROACH: + 1. Store teacher and student models + 2. Set temperature for softening probability distributions + 3. 
class KnowledgeDistillation:
    """
    Knowledge distillation for model compression.

    Train a smaller student model to mimic a larger teacher model by mixing a
    temperature-softened "soft target" loss (student vs. teacher) with a
    standard "hard target" classification loss (student vs. true labels).
    """

    def __init__(self, teacher_model, student_model, temperature=3.0, alpha=0.7):
        """
        Initialize knowledge distillation.

        EXAMPLE:
        >>> teacher = Sequential(Linear(100, 200), Linear(200, 50))
        >>> student = Sequential(Linear(100, 50))
        >>> kd = KnowledgeDistillation(teacher, student, temperature=4.0, alpha=0.8)
        >>> print(f"Temperature: {kd.temperature}, Alpha: {kd.alpha}")
        Temperature: 4.0, Alpha: 0.8

        Args:
            teacher_model: Large, pre-trained model
            student_model: Smaller model to train
            temperature: Softening parameter for distributions (logits are
                divided by it before the softmax)
            alpha: Weight for soft target loss (1-alpha for hard targets)
        """
        ### BEGIN SOLUTION
        self.teacher_model = teacher_model
        self.student_model = student_model
        self.temperature = temperature
        self.alpha = alpha
        ### END SOLUTION

    @staticmethod
    def _to_numpy(value):
        """Return ``value`` as a NumPy array, unwrapping Tensor-like objects.

        NumPy arrays and scalars also expose a ``.data`` attribute, but there
        it is a raw buffer (memoryview) that supports no arithmetic, so they
        must be passed through instead of being "unwrapped".
        """
        if not isinstance(value, (np.ndarray, np.generic)):
            value = getattr(value, 'data', value)
        return np.asarray(value)

    def distillation_loss(self, student_logits, teacher_logits, true_labels):
        """
        Calculate combined distillation loss.

        Computes ``alpha * soft_loss + (1 - alpha) * hard_loss`` where the soft
        loss compares the temperature-softened student and teacher
        distributions and the hard loss is the cross-entropy of the
        un-softened student distribution against ``true_labels``.

        EXAMPLE:
        >>> kd = KnowledgeDistillation(teacher, student)
        >>> loss = kd.distillation_loss(student_out, teacher_out, labels)
        >>> print(f"Distillation loss: {loss:.4f}")

        Args:
            student_logits: Student outputs; a NumPy array, nested list, or an
                object whose ``.data`` holds the array
            teacher_logits: Teacher outputs, same forms as ``student_logits``
            true_labels: Either 1-D class ids or 2-D one-hot/probability rows,
                same forms as above

        Returns:
            NumPy floating-point scalar with the combined loss.
        """
        ### BEGIN SOLUTION
        # Normalise every input to an ndarray (see _to_numpy for why a plain
        # hasattr(x, 'data') check is not safe for NumPy inputs).
        student_logits = self._to_numpy(student_logits)
        teacher_logits = self._to_numpy(teacher_logits)
        true_labels = self._to_numpy(true_labels)

        # Soften distributions with temperature
        student_soft = self._softmax(student_logits / self.temperature)
        teacher_soft = self._softmax(teacher_logits / self.temperature)

        # Soft target loss (KL divergence).
        # NOTE(review): with this argument order the term is KL(student || teacher),
        # summed over the whole batch (the hard loss below is a batch mean) and
        # without a temperature**2 factor - confirm this is the intended formulation.
        soft_loss = self._kl_divergence(student_soft, teacher_soft)

        # Hard target loss (cross-entropy) on the un-softened student output
        student_hard = self._softmax(student_logits)
        hard_loss = self._cross_entropy(student_hard, true_labels)

        # Combined loss
        total_loss = self.alpha * soft_loss + (1 - self.alpha) * hard_loss

        return total_loss
        ### END SOLUTION

    def _softmax(self, logits):
        """Compute softmax over the last axis with numerical stability."""
        # Subtracting the row maximum leaves the result unchanged but avoids overflow in exp.
        exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
        return exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)

    def _kl_divergence(self, p, q):
        """Compute KL(p || q), summed over all elements (small epsilons guard log/division)."""
        return np.sum(p * np.log(p / (q + 1e-8) + 1e-8))

    def _cross_entropy(self, predictions, labels):
        """Compute mean cross-entropy of ``predictions`` against ``labels``.

        Args:
            predictions: 2-D array of probabilities, one row per sample
            labels: 1-D class ids (any numeric dtype) or 2-D one-hot/probability rows
        """
        labels = np.asarray(labels)
        if labels.ndim == 1:
            # Class ids may arrive as floats (e.g. unwrapped from a float tensor);
            # NumPy only accepts integer arrays for fancy indexing.
            class_ids = labels.astype(np.intp, copy=False)
            picked = predictions[np.arange(len(class_ids)), class_ids]
            return -np.mean(np.log(picked + 1e-8))
        return -np.mean(np.sum(labels * np.log(predictions + 1e-8), axis=1))
def test_unit_knowledge_distillation():
    """🔬 Smoke-test KnowledgeDistillation on a tiny teacher/student pair."""
    print("🔬 Unit Test: Knowledge Distillation...")

    # Two-layer teacher and a single-layer (smaller) student
    big_model = Sequential(Linear(10, 20), Linear(20, 5))
    small_model = Sequential(Linear(10, 5))

    distiller = KnowledgeDistillation(big_model, small_model, temperature=3.0, alpha=0.7)

    # Random batch of 8 samples with 10 features, plus one class id per sample
    batch = Tensor(np.random.randn(8, 10))
    labels = np.array([0, 1, 2, 3, 4, 0, 1, 2])

    # Run both networks on the same batch
    teacher_logits = big_model.forward(batch)
    student_logits = small_model.forward(batch)

    loss = distiller.distillation_loss(student_logits, teacher_logits, labels)

    # The loss must be a finite, positive floating-point scalar
    assert isinstance(loss, (float, np.floating)), f"Loss should be float, got {type(loss)}"
    assert loss > 0, f"Loss should be positive, got {loss}"
    assert not np.isnan(loss), "Loss should not be NaN"

    print("✅ knowledge_distillation works correctly!")


test_unit_knowledge_distillation()
class CompressionComplete:
    """
    Complete compression system for milestone use.

    Provides sparsity measurement, magnitude (unstructured) pruning,
    structured pruning, and a small pipeline that chains them. All pruning
    methods modify the model's parameter arrays in place.
    """

    @staticmethod
    def measure_sparsity(model) -> float:
        """Measure the sparsity of a model (fraction of zero weights).

        Args:
            model: Object exposing ``parameters()``; each parameter must
                provide ``.size`` and an array in ``.data``

        Returns:
            Zero-valued elements divided by total elements, or 0.0 when the
            model has no ``parameters()`` method or no elements.
        """
        total_params = 0
        zero_params = 0

        if hasattr(model, 'parameters'):
            for param in model.parameters():
                total_params += param.size
                zero_params += np.sum(param.data == 0)

        return zero_params / total_params if total_params > 0 else 0.0

    @staticmethod
    def magnitude_prune(model, sparsity=0.5):
        """
        Prune model weights by magnitude (smallest weights set to zero).

        The threshold is computed separately for every parameter as the
        ``sparsity * 100``-th percentile of its absolute values; elements
        strictly below the threshold are zeroed in place.

        Args:
            model: Model with parameters() method
            sparsity: Fraction of weights to prune (0-1)

        Returns:
            The same ``model`` object (mutated in place).
        """
        if hasattr(model, 'parameters'):
            for param in model.parameters():
                magnitudes = np.abs(param.data)
                if magnitudes.size == 0:
                    # A percentile of an empty array is undefined; nothing to prune.
                    continue
                threshold = np.percentile(magnitudes, sparsity * 100)
                param.data[magnitudes < threshold] = 0

        return model

    @staticmethod
    def structured_prune(model, prune_ratio=0.5):
        """
        Prune entire neurons/channels (structured pruning).

        For every 2-D parameter, the L2 norm of each column is computed and
        columns whose norm falls below the ``prune_ratio * 100``-th percentile
        are zeroed in place. Parameters that are not 2-D (e.g. biases) are
        left untouched. Previously only the first parameter was considered,
        so later layers of a multi-layer model were never pruned.

        Args:
            model: Model to prune
            prune_ratio: Fraction of structures to prune (0-1)

        Returns:
            The same ``model`` object (mutated in place).
        """
        if hasattr(model, 'parameters'):
            for param in model.parameters():
                weights = getattr(param, 'data', None)
                if np.ndim(weights) != 2:
                    continue
                # One norm per column - presumably one per output neuron for an
                # (in_features, out_features) layout; TODO confirm against Linear.
                neuron_norms = np.linalg.norm(weights, axis=0)
                if neuron_norms.size == 0:
                    continue
                threshold = np.percentile(neuron_norms, prune_ratio * 100)
                mask = neuron_norms >= threshold
                weights[:, ~mask] = 0

        return model

    @staticmethod
    def compress_model(model, compression_config: Dict[str, Any]):
        """
        Apply complete compression pipeline to a model.

        Args:
            model: Model to compress
            compression_config: Dictionary with compression settings
                - 'magnitude_sparsity': float (0-1)
                - 'structured_prune_ratio': float (0-1)
                A stage runs only when its key is present.

        Returns:
            Tuple ``(model, stats)`` where ``stats`` holds
            'original_sparsity', 'final_sparsity' and 'compression_ratio'
            (``1 / (1 - final_sparsity)``, or infinity for a fully sparse model).
        """
        stats = {
            'original_sparsity': CompressionComplete.measure_sparsity(model)
        }

        # Apply magnitude pruning
        if 'magnitude_sparsity' in compression_config:
            model = CompressionComplete.magnitude_prune(
                model, compression_config['magnitude_sparsity']
            )

        # Apply structured pruning
        if 'structured_prune_ratio' in compression_config:
            model = CompressionComplete.structured_prune(
                model, compression_config['structured_prune_ratio']
            )

        stats['final_sparsity'] = CompressionComplete.measure_sparsity(model)
        stats['compression_ratio'] = 1.0 / (1.0 - stats['final_sparsity']) if stats['final_sparsity'] < 1.0 else float('inf')

        return model, stats
# Module-level shortcuts kept for backward compatibility; each one simply
# forwards to the static method of the same name on CompressionComplete.
def measure_sparsity(model) -> float:
    """Return the sparsity of ``model`` as computed by CompressionComplete.measure_sparsity."""
    sparsity = CompressionComplete.measure_sparsity(model)
    return sparsity


def magnitude_prune(model, sparsity=0.5):
    """Magnitude-prune ``model`` via CompressionComplete.magnitude_prune and return the result."""
    pruned = CompressionComplete.magnitude_prune(model, sparsity)
    return pruned


def structured_prune(model, prune_ratio=0.5):
    """Structurally prune ``model`` via CompressionComplete.structured_prune and return the result."""
    pruned = CompressionComplete.structured_prune(model, prune_ratio)
    return pruned


def compress_model(model, compression_config: Dict[str, Any]):
    """Run CompressionComplete.compress_model and return its result unchanged."""
    outcome = CompressionComplete.compress_model(model, compression_config)
    return outcome
║ # ╚═══════════════════════════════════════════════════════════════════════════════╝ # %% auto 0 -__all__ = ['QuantizationComplete', 'quantize_int8', 'dequantize_int8', 'quantize_model'] +__all__ = [] -# %% ../../modules/source/17_quantization/quantization_dev.ipynb 3 +# %% ../../modules/source/16_quantization/quantization_dev.ipynb 3 import numpy as np import time from typing import Tuple, Dict, List, Optional @@ -29,94 +29,3 @@ from ..core.layers import Linear from ..core.activations import ReLU print("✅ Quantization module imports complete") - -# %% ../../modules/source/17_quantization/quantization_dev.ipynb 34 -class QuantizationComplete: - """ - Complete quantization system for milestone use. - - Provides INT8 quantization with calibration for 4× memory reduction. - """ - - @staticmethod - def quantize_tensor(tensor: Tensor) -> Tuple[Tensor, float, int]: - """Quantize FP32 tensor to INT8.""" - data = tensor.data - min_val = float(np.min(data)) - max_val = float(np.max(data)) - - if abs(max_val - min_val) < 1e-8: - return Tensor(np.zeros_like(data, dtype=np.int8)), 1.0, 0 - - scale = (max_val - min_val) / 255.0 - zero_point = int(np.round(-128 - min_val / scale)) - zero_point = int(np.clip(zero_point, -128, 127)) - - quantized_data = np.round(data / scale + zero_point) - quantized_data = np.clip(quantized_data, -128, 127).astype(np.int8) - - return Tensor(quantized_data), scale, zero_point - - @staticmethod - def dequantize_tensor(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor: - """Dequantize INT8 tensor back to FP32.""" - dequantized_data = (q_tensor.data.astype(np.float32) - zero_point) * scale - return Tensor(dequantized_data) - - @staticmethod - def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, any]: - """ - Quantize all Linear layers in a model. - - Returns dictionary with quantization info and memory savings. 
- """ - quantized_layers = {} - original_size = 0 - quantized_size = 0 - - # Iterate through model parameters - if hasattr(model, 'parameters'): - for i, param in enumerate(model.parameters()): - param_size = param.data.nbytes - original_size += param_size - - # Quantize parameter - q_param, scale, zp = QuantizationComplete.quantize_tensor(param) - quantized_size += q_param.data.nbytes - - quantized_layers[f'param_{i}'] = { - 'quantized': q_param, - 'scale': scale, - 'zero_point': zp, - 'original_shape': param.data.shape - } - - return { - 'quantized_layers': quantized_layers, - 'original_size_mb': original_size / (1024 * 1024), - 'quantized_size_mb': quantized_size / (1024 * 1024), - 'compression_ratio': original_size / quantized_size if quantized_size > 0 else 1.0 - } - - @staticmethod - def compare_models(original_model, quantized_info: Dict) -> Dict[str, float]: - """Compare memory usage between original and quantized models.""" - return { - 'original_mb': quantized_info['original_size_mb'], - 'quantized_mb': quantized_info['quantized_size_mb'], - 'compression_ratio': quantized_info['compression_ratio'], - 'memory_saved_mb': quantized_info['original_size_mb'] - quantized_info['quantized_size_mb'] - } - -# Convenience functions for backward compatibility -def quantize_int8(tensor: Tensor) -> Tuple[Tensor, float, int]: - """Quantize FP32 tensor to INT8.""" - return QuantizationComplete.quantize_tensor(tensor) - -def dequantize_int8(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor: - """Dequantize INT8 tensor back to FP32.""" - return QuantizationComplete.dequantize_tensor(q_tensor, scale, zero_point) - -def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, any]: - """Quantize entire model to INT8.""" - return QuantizationComplete.quantize_model(model, calibration_data) diff --git a/tinytorch/text/embeddings.py b/tinytorch/text/embeddings.py index 07981e95..dacb0f27 100644 --- a/tinytorch/text/embeddings.py +++ 
b/tinytorch/text/embeddings.py @@ -93,18 +93,22 @@ class Embedding: # Perform embedding lookup using advanced indexing # This is equivalent to one-hot multiplication but much more efficient - embedded_data = self.weight.data[indices.data.astype(int)] - - # Create output tensor with gradient tracking - from tinytorch.core.autograd import EmbeddingBackward - result = Tensor(embedded_data, requires_grad=self.weight.requires_grad) + embedded = self.weight.data[indices.data.astype(int)] + + # Create result tensor + result = Tensor(embedded, requires_grad=self.weight.requires_grad) + # Attach gradient function (students learned this in Module 05!) if self.weight.requires_grad: - result._grad_fn = EmbeddingBackward() - result._grad_fn.saved_tensors = (self.weight, indices) - + from tinytorch.core.autograd import EmbeddingBackward + result._grad_fn = EmbeddingBackward(self.weight, indices) + return result + def __call__(self, indices: Tensor) -> Tensor: + """Allows the embedding to be called like a function.""" + return self.forward(indices) + def parameters(self) -> List[Tensor]: """Return trainable parameters.""" return [self.weight] @@ -188,16 +192,23 @@ class PositionalEncoding: f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}" ) - # Get position embeddings for this sequence length - pos_embeddings = self.position_embeddings.data[:seq_len] # (seq_len, embed_dim) + # Get position embeddings for this sequence length (slice using .data for efficiency) + pos_embeddings_data = self.position_embeddings.data[:seq_len] # (seq_len, embed_dim) # Broadcast to match batch dimension: (1, seq_len, embed_dim) - pos_embeddings = pos_embeddings[np.newaxis, :, :] + pos_embeddings_data = pos_embeddings_data[np.newaxis, :, :] + + # Wrap in Tensor to preserve requires_grad + pos_embeddings = Tensor(pos_embeddings_data, requires_grad=self.position_embeddings.requires_grad) - # Add positional information to input embeddings - result = x.data + pos_embeddings + # Add 
positional information using Tensor operation to preserve gradients! + result = x + pos_embeddings - return Tensor(result) + return result + + def __call__(self, x: Tensor) -> Tensor: + """Allows the positional encoding to be called like a function.""" + return self.forward(x) def parameters(self) -> List[Tensor]: """Return trainable parameters.""" @@ -325,6 +336,10 @@ class EmbeddingLayer: return output + def __call__(self, tokens: Tensor) -> Tensor: + """Allows the embedding layer to be called like a function.""" + return self.forward(tokens) + def parameters(self) -> List[Tensor]: """Return all trainable parameters.""" params = self.token_embedding.parameters()