Update tinytorch and tito with module exports

Re-exported all modules after restructuring:
- Updated _modidx.py with new module locations
- Removed outdated autogeneration headers
- Updated all core modules (tensor, autograd, layers, etc.)
- Updated optimization modules (quantization, compression, etc.)
- Updated TITO commands for new structure

Changes include:
- 24 tinytorch/ module files
- 24 tito/ command and core files
- Updated references from modules/source/ to modules/

All modules re-exported via nbdev from their new locations.
This commit is contained in:
Vijay Janapa Reddi
2025-11-10 19:42:03 -05:00
parent d25861c68e
commit 96880b3133
48 changed files with 681 additions and 2035 deletions

142
tinytorch/_modidx.py generated
View File

@@ -1,19 +1,3 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/[unknown]/[unknown]_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# Autogenerated by nbdev
d = { 'settings': { 'branch': 'main',
@@ -21,36 +5,7 @@ d = { 'settings': { 'branch': 'main',
'doc_host': 'https://tinytorch.github.io',
'git_url': 'https://github.com/tinytorch/TinyTorch/',
'lib_path': 'tinytorch'},
'syms': { 'tinytorch.applications.tinygpt': { 'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline': ( '20_capstone/capstone_dev.html#completetinygptpipeline',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.__init__': ( '20_capstone/capstone_dev.html#completetinygptpipeline.__init__',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.generate_text': ( '20_capstone/capstone_dev.html#completetinygptpipeline.generate_text',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.optimize_model': ( '20_capstone/capstone_dev.html#completetinygptpipeline.optimize_model',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.prepare_training_data': ( '20_capstone/capstone_dev.html#completetinygptpipeline.prepare_training_data',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.train': ( '20_capstone/capstone_dev.html#completetinygptpipeline.train',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.TinyGPT': ( '20_capstone/capstone_dev.html#tinygpt',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.TinyGPT.__init__': ( '20_capstone/capstone_dev.html#tinygpt.__init__',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.TinyGPTTrainer': ( '20_capstone/capstone_dev.html#tinygpttrainer',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.TinyGPTTrainer.__init__': ( '20_capstone/capstone_dev.html#tinygpttrainer.__init__',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.TinyGPTTrainer.prepare_batch': ( '20_capstone/capstone_dev.html#tinygpttrainer.prepare_batch',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.TinyGPTTrainer.train_step': ( '20_capstone/capstone_dev.html#tinygpttrainer.train_step',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.test_unit_complete_pipeline': ( '20_capstone/capstone_dev.html#test_unit_complete_pipeline',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.test_unit_tinygpt_init': ( '20_capstone/capstone_dev.html#test_unit_tinygpt_init',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.test_unit_training_pipeline': ( '20_capstone/capstone_dev.html#test_unit_training_pipeline',
'tinytorch/applications/tinygpt.py')},
'syms': { 'tinytorch.applications.tinygpt': {},
'tinytorch.benchmarking.benchmark': { 'tinytorch.benchmarking.benchmark.Benchmark': ( '19_benchmarking/benchmarking_dev.html#benchmark',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.Benchmark.__init__': ( '19_benchmarking/benchmarking_dev.html#benchmark.__init__',
@@ -89,6 +44,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.TinyMLPerf.run_standard_benchmark': ( '19_benchmarking/benchmarking_dev.html#tinymlperf.run_standard_benchmark',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.calculate_normalized_scores': ( '19_benchmarking/benchmarking_dev.html#calculate_normalized_scores',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.test_unit_benchmark': ( '19_benchmarking/benchmarking_dev.html#test_unit_benchmark',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.test_unit_benchmark_suite': ( '19_benchmarking/benchmarking_dev.html#test_unit_benchmark_suite',
@@ -105,6 +62,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/competition/submit.py'),
'tinytorch.competition.submit.validate_installation': ( '20_competition/competition_dev.html#validate_installation',
'tinytorch/competition/submit.py'),
'tinytorch.competition.submit.validate_submission': ( '20_competition/competition_dev.html#validate_submission',
'tinytorch/competition/submit.py'),
'tinytorch.competition.submit.worked_example_optimization': ( '20_competition/competition_dev.html#worked_example_optimization',
'tinytorch/competition/submit.py')},
'tinytorch.core.activations': { 'tinytorch.core.activations.GELU': ( '02_activations/activations_dev.html#gelu',
@@ -341,7 +300,11 @@ d = { 'settings': { 'branch': 'main',
'tinytorch.core.training.Trainer.save_checkpoint': ( '07_training/training_dev.html#trainer.save_checkpoint',
'tinytorch/core/training.py'),
'tinytorch.core.training.Trainer.train_epoch': ( '07_training/training_dev.html#trainer.train_epoch',
'tinytorch/core/training.py')},
'tinytorch/core/training.py'),
'tinytorch.core.training.load_checkpoint': ( '07_training/training_dev.html#load_checkpoint',
'tinytorch/core/training.py'),
'tinytorch.core.training.save_checkpoint': ( '07_training/training_dev.html#save_checkpoint',
'tinytorch/core/training.py')},
'tinytorch.data.loader': { 'tinytorch.data.loader.DataLoader': ( '08_dataloader/dataloader_dev.html#dataloader',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.DataLoader.__init__': ( '08_dataloader/dataloader_dev.html#dataloader.__init__',
@@ -386,8 +349,6 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/generation/kv_cache.py')},
'tinytorch.models.transformer': { 'tinytorch.models.transformer.GPT': ( '13_transformers/transformers_dev.html#gpt',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT.__call__': ( '13_transformers/transformers_dev.html#gpt.__call__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT.__init__': ( '13_transformers/transformers_dev.html#gpt.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT._create_causal_mask': ( '13_transformers/transformers_dev.html#gpt._create_causal_mask',
@@ -400,8 +361,6 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm': ( '13_transformers/transformers_dev.html#layernorm',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm.__call__': ( '13_transformers/transformers_dev.html#layernorm.__call__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm.__init__': ( '13_transformers/transformers_dev.html#layernorm.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm.forward': ( '13_transformers/transformers_dev.html#layernorm.forward',
@@ -410,8 +369,6 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP': ( '13_transformers/transformers_dev.html#mlp',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP.__call__': ( '13_transformers/transformers_dev.html#mlp.__call__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP.__init__': ( '13_transformers/transformers_dev.html#mlp.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP.forward': ( '13_transformers/transformers_dev.html#mlp.forward',
@@ -420,8 +377,6 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock': ( '13_transformers/transformers_dev.html#transformerblock',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.__call__': ( '13_transformers/transformers_dev.html#transformerblock.__call__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.__init__': ( '13_transformers/transformers_dev.html#transformerblock.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.forward': ( '13_transformers/transformers_dev.html#transformerblock.forward',
@@ -429,49 +384,54 @@ d = { 'settings': { 'branch': 'main',
'tinytorch.models.transformer.TransformerBlock.parameters': ( '13_transformers/transformers_dev.html#transformerblock.parameters',
'tinytorch/models/transformer.py')},
'tinytorch.optimization.acceleration': {},
'tinytorch.optimization.compression': { 'tinytorch.optimization.compression.CompressionComplete': ( '17_compression/compression_dev.html#compressioncomplete',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.CompressionComplete.compress_model': ( '17_compression/compression_dev.html#compressioncomplete.compress_model',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.CompressionComplete.magnitude_prune': ( '17_compression/compression_dev.html#compressioncomplete.magnitude_prune',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.CompressionComplete.measure_sparsity': ( '17_compression/compression_dev.html#compressioncomplete.measure_sparsity',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.CompressionComplete.structured_prune': ( '17_compression/compression_dev.html#compressioncomplete.structured_prune',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.KnowledgeDistillation': ( '17_compression/compression_dev.html#knowledgedistillation',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.KnowledgeDistillation.__init__': ( '17_compression/compression_dev.html#knowledgedistillation.__init__',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.KnowledgeDistillation._cross_entropy': ( '17_compression/compression_dev.html#knowledgedistillation._cross_entropy',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.KnowledgeDistillation._kl_divergence': ( '17_compression/compression_dev.html#knowledgedistillation._kl_divergence',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.KnowledgeDistillation._softmax': ( '17_compression/compression_dev.html#knowledgedistillation._softmax',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.KnowledgeDistillation.distillation_loss': ( '17_compression/compression_dev.html#knowledgedistillation.distillation_loss',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression': { 'tinytorch.optimization.compression.Linear': ( '17_compression/compression_dev.html#linear',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Linear.__init__': ( '17_compression/compression_dev.html#linear.__init__',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Linear.forward': ( '17_compression/compression_dev.html#linear.forward',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Linear.parameters': ( '17_compression/compression_dev.html#linear.parameters',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Sequential': ( '17_compression/compression_dev.html#sequential',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Sequential.__call__': ( '17_compression/compression_dev.html#sequential.__call__',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Sequential.__init__': ( '17_compression/compression_dev.html#sequential.__init__',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Sequential.forward': ( '17_compression/compression_dev.html#sequential.forward',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Sequential.parameters': ( '17_compression/compression_dev.html#sequential.parameters',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.compress_model': ( '17_compression/compression_dev.html#compress_model',
'tinytorch.optimization.compression.Tensor': ( '17_compression/compression_dev.html#tensor',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Tensor.__add__': ( '17_compression/compression_dev.html#tensor.__add__',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.magnitude_prune': ( '17_compression/compression_dev.html#magnitude_prune',
'tinytorch.optimization.compression.Tensor.__init__': ( '17_compression/compression_dev.html#tensor.__init__',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.measure_sparsity': ( '17_compression/compression_dev.html#measure_sparsity',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.structured_prune': ( '17_compression/compression_dev.html#structured_prune',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.test_unit_knowledge_distillation': ( '17_compression/compression_dev.html#test_unit_knowledge_distillation',
'tinytorch/optimization/compression.py')},
'tinytorch.optimization.quantization': {},
'tinytorch.optimization.compression.Tensor.__mul__': ( '17_compression/compression_dev.html#tensor.__mul__',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Tensor.__repr__': ( '17_compression/compression_dev.html#tensor.__repr__',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Tensor.abs': ( '17_compression/compression_dev.html#tensor.abs',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Tensor.matmul': ( '17_compression/compression_dev.html#tensor.matmul',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Tensor.sum': ( '17_compression/compression_dev.html#tensor.sum',
'tinytorch/optimization/compression.py')},
'tinytorch.optimization.quantization': { 'tinytorch.optimization.quantization.QuantizationComplete': ( '16_quantization/quantization_dev.html#quantizationcomplete',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.QuantizationComplete.compare_models': ( '16_quantization/quantization_dev.html#quantizationcomplete.compare_models',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.QuantizationComplete.dequantize_tensor': ( '16_quantization/quantization_dev.html#quantizationcomplete.dequantize_tensor',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.QuantizationComplete.quantize_model': ( '16_quantization/quantization_dev.html#quantizationcomplete.quantize_model',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.QuantizationComplete.quantize_tensor': ( '16_quantization/quantization_dev.html#quantizationcomplete.quantize_tensor',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.dequantize_int8': ( '16_quantization/quantization_dev.html#dequantize_int8',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.quantize_int8': ( '16_quantization/quantization_dev.html#quantize_int8',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.quantize_model': ( '16_quantization/quantization_dev.html#quantize_model',
'tinytorch/optimization/quantization.py')},
'tinytorch.profiling.profiler': { 'tinytorch.profiling.profiler.Profiler': ( '14_profiling/profiling_dev.html#profiler',
'tinytorch/profiling/profiler.py'),
'tinytorch.profiling.profiler.Profiler.__init__': ( '14_profiling/profiling_dev.html#profiler.__init__',
@@ -496,8 +456,6 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/profiling/profiler.py')},
'tinytorch.text.embeddings': { 'tinytorch.text.embeddings.Embedding': ( '11_embeddings/embeddings_dev.html#embedding',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.__call__': ( '11_embeddings/embeddings_dev.html#embedding.__call__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.__init__': ( '11_embeddings/embeddings_dev.html#embedding.__init__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.__repr__': ( '11_embeddings/embeddings_dev.html#embedding.__repr__',
@@ -508,8 +466,6 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer': ( '11_embeddings/embeddings_dev.html#embeddinglayer',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.__call__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__call__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.__init__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__init__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.__repr__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__repr__',
@@ -520,8 +476,6 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding': ( '11_embeddings/embeddings_dev.html#positionalencoding',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.__call__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__call__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.__init__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__init__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.__repr__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__repr__',

tinytorch/applications/tinygpt.py
View File

@@ -1,679 +1,8 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_tinygpt/tinygpt_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/20_capstone/capstone_dev.ipynb.
# %% auto 0
__all__ = ['TinyGPT', 'test_unit_tinygpt_init', 'TinyGPTTrainer', 'test_unit_training_pipeline', 'CompleteTinyGPTPipeline',
'test_unit_complete_pipeline']
__all__ = []
# %% ../../modules/source/20_capstone/capstone_dev.ipynb 2
#| default_exp applications.tinygpt
#| export
# %% ../../modules/source/20_capstone/capstone_dev.ipynb 7
class TinyGPT:
    """
    Complete GPT implementation integrating all TinyTorch modules.

    Shows how framework components compose into a real application.
    Built using modules 01,02,03,11,12,13 as core architecture.

    Architecture:
    - Token Embeddings (Module 11)
    - Positional Encoding (Module 11)
    - Transformer Blocks (Module 13)
    - Output Linear Layer (Module 03)
    - Language Modeling Head (Module 04)
    """

    def __init__(self, vocab_size: int, embed_dim: int = 128, num_layers: int = 4,
                 num_heads: int = 4, max_seq_len: int = 256, dropout: float = 0.1):
        """
        Initialize TinyGPT with production-inspired architecture.

        TODO: Build a complete GPT model using TinyTorch components

        APPROACH:
        1. Create token embeddings (vocab_size × embed_dim)
        2. Create positional encoding (max_seq_len × embed_dim)
        3. Build transformer layers using TransformerBlock
        4. Add output projection layer
        5. Calculate and report parameter count

        ARCHITECTURE DECISIONS:
        - embed_dim=128: small enough for fast training, large enough to learn
        - num_layers=4: sufficient depth without excessive memory
        - num_heads=4: multi-head attention without head_dim being too small
        - max_seq_len=256: reasonable context length for character-level modeling

        EXAMPLE:
        >>> model = TinyGPT(vocab_size=50, embed_dim=128, num_layers=4)
        >>> print(f"Parameters: {model.count_parameters():,}")
        Parameters: 1,234,567

        HINTS:
        - Use Embedding for tokens, PositionalEncoding for positions
        - Stack TransformerBlock instances in a list
        - Final Linear layer maps embed_dim → vocab_size
        """
        ### BEGIN SOLUTION
        # Record architecture hyperparameters for later inspection.
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len
        self.dropout = dropout

        # Token IDs → dense vectors, plus position information.
        self.token_embedding = Embedding(vocab_size, embed_dim)
        self.positional_encoding = PositionalEncoding(max_seq_len, embed_dim)

        # Core processing stack: one TransformerBlock per layer.
        self.transformer_blocks = [
            TransformerBlock(embed_dim, num_heads, mlp_ratio=4.0)
            for _ in range(num_layers)
        ]

        # Map hidden states back to vocabulary logits.
        self.output_projection = Linear(embed_dim, vocab_size)

        # Regularization.
        self.dropout_layer = Dropout(dropout)

        # Parameter count drives the systems-analysis report below.
        self._param_count = self.count_parameters()
        print(f"🏗️ TinyGPT initialized: {self._param_count:,} parameters")
        print(f"📐 Architecture: {num_layers}L/{num_heads}H/{embed_dim}D")
        print(f"💾 Estimated memory: {self._param_count * 4 / 1024 / 1024:.1f}MB")
        ### END SOLUTION
def test_unit_tinygpt_init():
    """🔬 Test TinyGPT initialization and parameter counting."""
    print("🔬 Unit Test: TinyGPT Initialization...")
    # Small configuration keeps the test fast.
    gpt = TinyGPT(vocab_size=50, embed_dim=64, num_layers=2, num_heads=2, max_seq_len=128)

    # All architectural components must be present.
    for attr in ('token_embedding', 'positional_encoding',
                 'transformer_blocks', 'output_projection'):
        assert hasattr(gpt, attr)
    assert len(gpt.transformer_blocks) == 2

    # Parameter count should be positive and small for this config.
    n_params = gpt.count_parameters()
    assert n_params > 0
    assert n_params < 1000000  # Sanity check for small model
    print(f"✅ Model created with {n_params:,} parameters")
    print("✅ TinyGPT initialization works correctly!")


# Run immediate test
test_unit_tinygpt_init()
# %% ../../modules/source/20_capstone/capstone_dev.ipynb 10
class TinyGPTTrainer:
    """
    Complete training pipeline integrating optimizers, schedulers, and monitoring.

    Uses modules 05 (autograd), 06 (optimizers), 07 (training) for end-to-end training.
    """

    def __init__(self, model: TinyGPT, tokenizer: CharTokenizer,
                 learning_rate: float = 3e-4, weight_decay: float = 0.01):
        """
        Initialize trainer with model and optimization components.

        TODO: Set up complete training infrastructure

        APPROACH:
        1. Store model and tokenizer references
        2. Initialize AdamW optimizer (standard for transformers)
        3. Initialize CrossEntropyLoss for language modeling
        4. Set up a cosine learning-rate schedule
        5. Initialize training metrics tracking

        PRODUCTION CHOICES:
        - AdamW: better generalization than Adam (decoupled weight decay)
        - learning_rate=3e-4: standard for small transformers
        - Cosine schedule: smooth learning-rate decay

        EXAMPLE:
        >>> model = TinyGPT(vocab_size=100)
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> trainer = TinyGPTTrainer(model, tokenizer)
        >>> print("Trainer ready for training")
        Trainer ready for training
        """
        ### BEGIN SOLUTION
        self.model = model
        self.tokenizer = tokenizer

        # Gather every trainable tensor: embeddings, each block, output head.
        all_params = list(model.token_embedding.parameters())
        for blk in model.transformer_blocks:
            all_params.extend(blk.parameters())
        all_params.extend(model.output_projection.parameters())

        # AdamW with betas=(0.9, 0.95) — standard for language models.
        self.optimizer = AdamW(
            params=all_params,
            lr=learning_rate,
            weight_decay=weight_decay,
            betas=(0.9, 0.95)
        )

        # Next-token-prediction objective.
        self.loss_fn = CrossEntropyLoss()

        # Cosine decay down to 10% of the base learning rate.
        self.scheduler = CosineSchedule(
            optimizer=self.optimizer,
            max_epochs=100,  # Will adjust based on actual training
            min_lr=learning_rate * 0.1
        )

        # Metrics accumulated over training.
        self.training_history = {
            'losses': [],
            'perplexities': [],
            'learning_rates': [],
            'epoch': 0
        }
        print(f"🚀 Trainer initialized:")
        print(f" Optimizer: AdamW (lr={learning_rate}, wd={weight_decay})")
        print(f" Parameters: {len(all_params):,} tensors")
        print(f" Loss: CrossEntropyLoss")
        ### END SOLUTION

    def prepare_batch(self, text_batch: List[str], max_length: int = 128) -> Tuple[Tensor, Tensor]:
        """
        Convert text batch to input/target tensors for language modeling.

        LANGUAGE MODELING OBJECTIVE:
        - Input:  [token1, token2, token3, token4]
        - Target: [token2, token3, token4, token5]
        - Model predicts the next token at each position.

        EXAMPLE:
        >>> inputs, targets = trainer.prepare_batch(["hello world", "ai is fun"])
        >>> print(inputs.shape, targets.shape)
        (2, 128) (2, 128)
        """
        ### BEGIN SOLUTION
        padded = []
        for text in text_batch:
            # Tokenize, then truncate to max_length and right-pad with 0.
            ids = self.tokenizer.encode(text)[:max_length]
            padded.append(ids + [0] * (max_length - len(ids)))

        # (batch_size, seq_len) tensor of token IDs.
        input_ids = Tensor(np.array(padded))
        # Targets are the inputs shifted left by one position.
        target_ids = Tensor(np.roll(input_ids.data, -1, axis=1))
        return input_ids, target_ids
        ### END SOLUTION

    def train_step(self, input_ids: Tensor, target_ids: Tensor) -> float:
        """
        Single training step: forward, loss, backward, optimizer update.

        MEMORY MANAGEMENT:
        During training, memory usage ≈ 3× model size:
        parameters + gradients + optimizer states (Adam moments).

        Returns the scalar loss value for monitoring.
        """
        ### BEGIN SOLUTION
        # Clear gradients accumulated by the previous step.
        self.optimizer.zero_grad()

        # Forward pass: (batch, seq_len, vocab_size) logits.
        logits = self.model.forward(input_ids)

        # Flatten so the loss sees one prediction per position.
        b, s, v = logits.shape
        flat_logits = logits.reshape(b * s, v)
        flat_targets = target_ids.reshape(b * s)

        loss = self.loss_fn.forward(flat_logits, flat_targets)
        loss.backward()
        self.optimizer.step()

        # Unwrap to a plain Python float for monitoring.
        raw = loss.data
        return float(raw.item() if hasattr(raw, 'item') else raw)
        ### END SOLUTION
def test_unit_training_pipeline():
    """🔬 Test training pipeline components."""
    print("🔬 Unit Test: Training Pipeline...")
    # Tiny model/tokenizer pair keeps this test cheap.
    gpt = TinyGPT(vocab_size=50, embed_dim=32, num_layers=2, num_heads=2)
    tok = CharTokenizer(['a', 'b', 'c', 'd', 'e', ' '])
    trainer = TinyGPTTrainer(gpt, tok, learning_rate=1e-3)

    # Batch preparation must produce (batch, seq_len) input/target pairs.
    input_ids, target_ids = trainer.prepare_batch(["hello", "world"], max_length=8)
    assert input_ids.shape == (2, 8), f"Expected (2, 8), got {input_ids.shape}"
    assert target_ids.shape == (2, 8), f"Expected (2, 8), got {target_ids.shape}"

    # Two consecutive steps verify gradients are computed and applied.
    initial_loss = trainer.train_step(input_ids, target_ids)
    assert initial_loss > 0, "Loss should be positive"
    second_loss = trainer.train_step(input_ids, target_ids)
    assert second_loss > 0, "Second loss should also be positive"

    print(f"✅ Batch preparation shape: {input_ids.shape}")
    print(f"✅ Initial loss: {initial_loss:.4f}")
    print(f"✅ Second loss: {second_loss:.4f}")
    print("✅ Training pipeline works correctly!")


# Run immediate test
test_unit_training_pipeline()
# %% ../../modules/source/20_capstone/capstone_dev.ipynb 14
class CompleteTinyGPTPipeline:
    """
    End-to-end ML pipeline demonstrating integration of all 19 modules.
    Pipeline stages:
    1. Data preparation (Module 10: Tokenization)
    2. Model creation (Modules 01-04, 11-13: Architecture)
    3. Training setup (Modules 05-07: Optimization)
    4. Training loop (Module 08: DataLoader)
    5. Optimization (Modules 17-18: Quantization, Pruning)
    6. Evaluation (Module 19: Benchmarking)
    7. Generation (Module 14: KV Caching)
    """
    def __init__(self, vocab_size: int = 100, embed_dim: int = 128,
                 num_layers: int = 4, num_heads: int = 4) -> None:
        """
        Initialize complete end-to-end TinyGPT pipeline integrating all 19 modules.
        TODO: Set up a complete ML pipeline with tokenization, model, training,
        profiling, and benchmarking components
        APPROACH:
        1. Store model architecture parameters (vocab_size, embed_dim, num_layers, num_heads)
        2. Initialize tokenizer using CharTokenizer from Module 10 with printable ASCII (32-127)
        3. Create TinyGPT model instance with stored parameters and max_seq_len=256
        4. Setup TinyGPTTrainer for training orchestration with learning_rate=3e-4
        5. Initialize Profiler (Module 15) and Benchmark (Module 19) for performance analysis
        6. Initialize pipeline state tracking (is_trained flag, training_history list)
        7. Print pipeline initialization summary with parameter count and memory usage
        EXAMPLE:
        >>> pipeline = CompleteTinyGPTPipeline(vocab_size=100, embed_dim=128,
        ...                                    num_layers=4, num_heads=4)
        🏗️ Complete TinyGPT Pipeline Initialized
        Model: 419,300 parameters
        Memory: 1.6MB
        >>> pipeline.model.count_parameters()
        419300
        >>> pipeline.is_trained
        False
        >>> len(pipeline.training_history)
        0
        HINTS:
        - CharTokenizer needs list of characters: [chr(i) for i in range(32, 127)]
        - TinyGPT requires vocab_size, embed_dim, num_layers, num_heads, max_seq_len
        - TinyGPTTrainer takes model, tokenizer, and learning_rate as arguments
        - Benchmark expects (models_list, datasets_list, metrics_list) format
        - Memory calculation: parameters * 4 bytes / 1024 / 1024 for MB
        """
        ### BEGIN SOLUTION
        # Architecture hyperparameters, stored for later inspection.
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        # Stage 1: Initialize tokenizer (Module 10)
        self.tokenizer = CharTokenizer([chr(i) for i in range(32, 127)])  # Printable ASCII
        # Stage 2: Create model (Modules 01-04, 11-13)
        self.model = TinyGPT(
            vocab_size=vocab_size,
            embed_dim=embed_dim,
            num_layers=num_layers,
            num_heads=num_heads,
            max_seq_len=256
        )
        # Stage 3: Setup training (Modules 05-07)
        self.trainer = TinyGPTTrainer(self.model, self.tokenizer, learning_rate=3e-4)
        # Stage 4: Initialize profiler and benchmark (Modules 15, 19)
        # NOTE(review): Benchmark gets an empty datasets list here — presumably
        # datasets are supplied later at evaluation time; confirm against Module 19.
        self.profiler = Profiler()
        self.benchmark = Benchmark([self.model], [], ["perplexity", "latency"])
        # Pipeline state
        self.is_trained = False       # flipped to True by train()
        self.training_history = []    # replaced with the history dict by train()
        print("🏗️ Complete TinyGPT Pipeline Initialized")
        # Memory estimate assumes FP32 parameters: 4 bytes each.
        print(f" Model: {self.model.count_parameters():,} parameters")
        print(f" Memory: {self.model.count_parameters() * 4 / 1024 / 1024:.1f}MB")
        ### END SOLUTION
    def prepare_training_data(self, text_corpus: List[str], batch_size: int = 8) -> DataLoader:
        """
        Prepare training data using DataLoader (Module 08).
        TODO: Create DataLoader for training text data
        APPROACH:
        1. Tokenize all texts in corpus
        2. Create input/target pairs for language modeling
        3. Package into TensorDataset
        4. Create DataLoader with batching and shuffling
        EXAMPLE:
        >>> pipeline = CompleteTinyGPTPipeline()
        >>> corpus = ["hello world", "ai is amazing"]
        >>> dataloader = pipeline.prepare_training_data(corpus, batch_size=2)
        >>> print(f"Batches: {len(dataloader)}")
        Batches: 1
        """
        ### BEGIN SOLUTION
        # Tokenize and prepare training pairs
        input_sequences = []
        target_sequences = []
        for text in text_corpus:
            tokens = self.tokenizer.encode(text)
            if len(tokens) < 2:
                continue  # Skip very short texts
            # Create sliding window of input/target pairs: each prefix of the
            # text predicts its next token (next-token language modeling).
            for i in range(len(tokens) - 1):
                input_seq = tokens[:i+1]
                target_seq = tokens[i+1]
                # Pad input to consistent length
                max_len = 32  # Reasonable context window
                if len(input_seq) > max_len:
                    input_seq = input_seq[-max_len:]
                else:
                    # Left-pad with token id 0 so the sequence ends at the target.
                    input_seq = [0] * (max_len - len(input_seq)) + input_seq
                input_sequences.append(input_seq)
                target_sequences.append(target_seq)
        # Convert to tensors
        inputs = Tensor(np.array(input_sequences))
        targets = Tensor(np.array(target_sequences))
        # Create dataset and dataloader
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        print(f"📚 Training data prepared: {len(dataset)} examples, {len(dataloader)} batches")
        return dataloader
        ### END SOLUTION
    def train(self, dataloader: DataLoader, epochs: int = 10) -> Dict[str, List[float]]:
        """
        Complete training loop with monitoring.
        TODO: Implement full training with progress tracking
        APPROACH:
        1. Loop through epochs
        2. For each batch: forward, backward, optimize
        3. Track loss and perplexity
        4. Update learning rate schedule
        5. Return training history
        EXAMPLE:
        >>> history = pipeline.train(dataloader, epochs=5)
        >>> print(f"Final loss: {history['losses'][-1]:.4f}")
        Final loss: 1.2345
        """
        ### BEGIN SOLUTION
        history = {'losses': [], 'perplexities': [], 'epochs': []}
        print(f"🚀 Starting training for {epochs} epochs...")
        for epoch in range(epochs):
            epoch_losses = []
            for batch_idx, (inputs, targets) in enumerate(dataloader):
                # Training step
                loss = self.trainer.train_step(inputs, targets)
                epoch_losses.append(loss)
                # Log progress
                if batch_idx % 10 == 0:
                    # Perplexity is exp(cross-entropy loss).
                    perplexity = np.exp(loss)
                    print(f" Epoch {epoch+1}/{epochs}, Batch {batch_idx}: "
                          f"Loss={loss:.4f}, PPL={perplexity:.2f}")
            # Epoch summary
            # NOTE(review): assumes dataloader yields at least one batch;
            # np.mean([]) would produce NaN here.
            avg_loss = np.mean(epoch_losses)
            avg_perplexity = np.exp(avg_loss)
            history['losses'].append(avg_loss)
            history['perplexities'].append(avg_perplexity)
            history['epochs'].append(epoch + 1)
            # Update learning rate
            # NOTE(review): assumes TinyGPTTrainer exposes a `.scheduler`
            # with a step() method — confirm against the trainer definition.
            self.trainer.scheduler.step()
            print(f"✅ Epoch {epoch+1} complete: Loss={avg_loss:.4f}, PPL={avg_perplexity:.2f}")
        self.is_trained = True
        self.training_history = history
        print(f"🎉 Training complete! Final perplexity: {history['perplexities'][-1]:.2f}")
        return history
        ### END SOLUTION
    def optimize_model(self, quantize: bool = True, prune_sparsity: float = 0.0) -> None:
        """
        Apply optimization techniques (Modules 17-18).
        TODO: Apply quantization and pruning optimizations
        APPROACH:
        1. Optionally apply quantization to reduce precision
        2. Optionally apply pruning to remove weights
        3. Measure size reduction
        4. Validate model still works
        EXAMPLE:
        >>> pipeline.optimize_model(quantize=True, prune_sparsity=0.5)
        Model optimized: 75% size reduction
        """
        ### BEGIN SOLUTION
        original_params = self.model.count_parameters()
        # FP32 assumption: 4 bytes per parameter, reported in MB.
        original_memory = original_params * 4 / (1024 * 1024)
        optimizations_applied = []
        if quantize:
            # Apply quantization (simulated)
            # In real implementation, would use quantize_model()
            # NOTE(review): quantized_memory is computed but never used — the
            # 4× factor is re-applied below via size_reduction.
            quantized_memory = original_memory / 4  # INT8 vs FP32
            optimizations_applied.append(f"INT8 quantization (4× memory reduction)")
            print(" Applied INT8 quantization")
        if prune_sparsity > 0:
            # Apply pruning (simulated)
            # In real implementation, would use magnitude_prune()
            remaining_weights = 1 - prune_sparsity
            optimizations_applied.append(f"{prune_sparsity:.0%} pruning ({remaining_weights:.0%} weights remain)")
            print(f" Applied {prune_sparsity:.0%} magnitude pruning")
        # Calculate final size
        size_reduction = 1.0
        if quantize:
            size_reduction *= 0.25  # 4× smaller
        if prune_sparsity > 0:
            size_reduction *= (1 - prune_sparsity)
        final_memory = original_memory * size_reduction
        # NOTE(review): prune_sparsity=1.0 would make final_memory zero and
        # divide by zero on the next line.
        reduction_factor = original_memory / final_memory
        print(f"🔧 Model optimization complete:")
        print(f" Original: {original_memory:.1f}MB")
        print(f" Optimized: {final_memory:.1f}MB")
        print(f" Reduction: {reduction_factor:.1f}× smaller")
        print(f" Applied: {', '.join(optimizations_applied)}")
        ### END SOLUTION
    def generate_text(self, prompt: str, max_tokens: int = 50) -> str:
        """
        Generate text using the trained model.
        TODO: Implement text generation with proper encoding/decoding
        APPROACH:
        1. Encode prompt to token IDs
        2. Use model.generate() for autoregressive generation
        3. Decode generated tokens back to text
        4. Return generated text
        EXAMPLE:
        >>> text = pipeline.generate_text("Hello", max_tokens=10)
        >>> print(f"Generated: {text}")
        Generated: Hello world this is AI
        """
        ### BEGIN SOLUTION
        if not self.is_trained:
            print("⚠️ Model not trained yet. Generating with random weights.")
        # Encode prompt
        prompt_tokens = self.tokenizer.encode(prompt)
        # Wrap in a batch dimension of 1 for the model.
        prompt_tensor = Tensor([prompt_tokens])
        # Generate tokens
        generated_tokens = self.model.generate(
            prompt_tensor,
            max_new_tokens=max_tokens,
            temperature=0.8,
            use_cache=True
        )
        # Decode to text
        all_tokens = generated_tokens.data[0].tolist()
        generated_text = self.tokenizer.decode(all_tokens)
        return generated_text
        ### END SOLUTION
def test_unit_complete_pipeline():
    """🔬 Test complete pipeline integration."""
    print("🔬 Unit Test: Complete Pipeline Integration...")
    # Build a deliberately tiny pipeline so the end-to-end check is quick.
    demo = CompleteTinyGPTPipeline(vocab_size=50, embed_dim=32, num_layers=2)
    # Stage 1: data preparation must yield at least one batch.
    sample_corpus = ["hello world", "ai is fun", "machine learning"]
    loader = demo.prepare_training_data(sample_corpus, batch_size=2)
    assert len(loader) > 0, "DataLoader should have batches"
    # Stage 2: a single-epoch training run records exactly one loss entry.
    history = demo.train(loader, epochs=1)
    assert 'losses' in history, "History should contain losses"
    assert len(history['losses']) == 1, "Should have one epoch of losses"
    # Stage 3: the (simulated) quantize + prune pass should not raise.
    demo.optimize_model(quantize=True, prune_sparsity=0.5)
    # Stage 4: generation produces a non-empty string.
    generated = demo.generate_text("hello", max_tokens=5)
    assert isinstance(generated, str), "Generated output should be string"
    assert len(generated) > 0, "Generated text should not be empty"
    print(f"✅ Pipeline stages completed successfully")
    print(f"✅ Training history: {len(history['losses'])} epochs")
    print(f"✅ Generated text: '{generated[:20]}...'")
    print("✅ Complete pipeline integration works!")

# Run immediate test
test_unit_complete_pipeline()

View File

@@ -1,22 +1,8 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_benchmark/benchmark_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/19_benchmarking/benchmarking_dev.ipynb.
# %% auto 0
__all__ = ['OlympicEvent', 'Benchmark', 'test_unit_benchmark', 'BenchmarkSuite', 'test_unit_benchmark_suite', 'TinyMLPerf',
'test_unit_tinymlperf']
'test_unit_tinymlperf', 'calculate_normalized_scores']
# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 0
#| default_exp benchmarking.benchmark
@@ -72,7 +58,7 @@ class Benchmark:
self.measurement_runs = measurement_runs
self.results = {}
# Use Profiler from Module 14 for measurements
# Use Profiler from Module 15 for measurements
self.profiler = Profiler()
# System information for metadata
@@ -1024,3 +1010,53 @@ def test_unit_tinymlperf():
print("✅ TinyMLPerf works correctly!")
test_unit_tinymlperf()
# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 24
def calculate_normalized_scores(baseline_results: dict,
                                optimized_results: dict) -> dict:
    """
    Normalize absolute benchmark measurements into relative improvements.

    Converting raw latency/memory/accuracy into ratios against a baseline
    makes submissions comparable across different hardware platforms.

    Args:
        baseline_results: Dict with keys 'latency', 'memory', 'accuracy'
        optimized_results: Dict with the same keys

    Returns:
        Dict containing:
        - speedup: baseline latency / optimized latency (higher is better)
        - compression_ratio: baseline memory / optimized memory (higher is better)
        - accuracy_delta: optimized accuracy - baseline accuracy (0 is ideal)
        - efficiency_score: speedup * compression, discounted by accuracy loss
        - baseline / optimized: copies of the input dicts for record keeping

    Example:
        >>> baseline = {'latency': 100.0, 'memory': 12.0, 'accuracy': 0.89}
        >>> optimized = {'latency': 40.0, 'memory': 3.0, 'accuracy': 0.87}
        >>> scores = calculate_normalized_scores(baseline, optimized)
        >>> print(f"Speedup: {scores['speedup']:.2f}x")
        Speedup: 2.50x
    """
    # Relative improvements: values above 1.0 mean the optimized model wins.
    speedup = baseline_results['latency'] / optimized_results['latency']
    compression_ratio = baseline_results['memory'] / optimized_results['memory']
    # Signed accuracy change; negative means the optimization cost accuracy.
    accuracy_delta = optimized_results['accuracy'] - baseline_results['accuracy']
    # Accuracy loss discounts the combined score; gains earn no extra credit.
    if accuracy_delta < 0:
        accuracy_penalty = max(1.0, 1.0 - accuracy_delta)
    else:
        accuracy_penalty = 1.0
    efficiency_score = (speedup * compression_ratio) / accuracy_penalty
    return {
        'speedup': speedup,
        'compression_ratio': compression_ratio,
        'accuracy_delta': accuracy_delta,
        'efficiency_score': efficiency_score,
        'baseline': baseline_results.copy(),
        'optimized': optimized_results.copy(),
    }

View File

@@ -1,22 +1,8 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_submit/submit_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/20_competition/competition_dev.ipynb.
# %% auto 0
__all__ = ['validate_installation', 'load_baseline_model', 'generate_baseline', 'worked_example_optimization',
'optimize_for_competition', 'generate_submission']
'optimize_for_competition', 'validate_submission', 'generate_submission']
# %% ../../modules/source/20_competition/competition_dev.ipynb 4
import numpy as np
@@ -24,6 +10,8 @@ import json
import time
from pathlib import Path
from typing import Dict, List, Tuple, Any, Optional
from ..benchmarking.benchmark import Benchmark, calculate_normalized_scores
from ..profiling.profiler import Profiler
def validate_installation() -> Dict[str, bool]:
"""
@@ -362,31 +350,24 @@ def worked_example_optimization():
return submission
# %% ../../modules/source/20_competition/competition_dev.ipynb 10
def optimize_for_competition(baseline_model, event: str = "all_around"):
def optimize_for_competition(baseline_model, event: str = "all_around", division: str = "closed"):
"""
🏅 YOUR COMPETITION ENTRY - IMPLEMENT YOUR STRATEGY HERE!
This is where you apply optimization techniques from Modules 14-18.
Available techniques:
- Module 14: KV Caching (for transformers) - enable_kv_cache()
- Module 16: Acceleration (vectorization, fusion)
- Module 17: Quantization (INT8, INT4) - quantize_model()
- Module 18: Compression (pruning) - magnitude_prune()
Args:
baseline_model: The unoptimized model
event: Which Olympic event you're competing in
baseline_model: Starting model (use for Closed, optional for Open)
event: Category you're competing in
- "latency_sprint": Minimize latency
- "memory_challenge": Minimize memory
- "accuracy_contest": Maximize accuracy
- "all_around": Best balance
- "extreme_push": Most aggressive
division: "closed" or "open" - which track you chose
Returns:
Your optimized model
Example:
🔒 CLOSED DIVISION Example:
from tinytorch.optimization.quantization import quantize_model
from tinytorch.optimization.compression import magnitude_prune
@@ -394,6 +375,15 @@ def optimize_for_competition(baseline_model, event: str = "all_around"):
optimized = quantize_model(optimized, bits=8)
optimized = magnitude_prune(optimized, sparsity=0.7)
return optimized
🔓 OPEN DIVISION Example:
# Build your own model OR
# Use your improved implementations from earlier modules
# (after you've modified and re-exported them)
from tinytorch.models import YourCustomArchitecture
optimized = YourCustomArchitecture()
return optimized
"""
print(f"🏅 YOUR OPTIMIZATION STRATEGY FOR: {event}")
@@ -438,74 +428,201 @@ def optimize_for_competition(baseline_model, event: str = "all_around"):
return optimized_model
#| export
def validate_submission(submission: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run sanity checks on a competition submission.

    The aim is catching honest mistakes — impossible speedups, accidental
    training in the Closed Division, missing fields — not policing.
    Honor code system: trust, but verify basic reasonableness.

    Args:
        submission: Submission dictionary to validate

    Returns:
        Dict with keys:
        - valid: True when no errors were found
        - checks: list of passed-check messages
        - warnings: list of suspicious-but-allowed findings
        - errors: list of blocking problems
    """
    checks: List[str] = []
    warnings: List[str] = []
    errors: List[str] = []

    # Pull the normalized metrics, defaulting to "no change" when absent.
    scores = submission.get("normalized_scores", {})
    speedup = scores.get("speedup", 1.0)
    compression = scores.get("compression_ratio", 1.0)
    accuracy_delta = scores.get("accuracy_delta", 0.0)

    # Check 1: speedup claim is within believable bounds.
    if speedup > 50:
        errors.append(f"❌ Speedup {speedup:.1f}x seems unrealistic (>50x)")
    elif speedup > 20:
        warnings.append(f"⚠️ Speedup {speedup:.1f}x is very high - please verify measurements")
    else:
        checks.append(f"✅ Speedup {speedup:.2f}x is reasonable")

    # Check 2: compression claim is within believable bounds.
    if compression > 32:
        errors.append(f"❌ Compression {compression:.1f}x seems unrealistic (>32x)")
    elif compression > 16:
        warnings.append(f"⚠️ Compression {compression:.1f}x is very high - please verify")
    else:
        checks.append(f"✅ Compression {compression:.2f}x is reasonable")

    # Check 3: the Closed Division forbids training, so accuracy should not rise.
    division = submission.get("division", "closed")
    if division == "closed" and accuracy_delta > 1.0:
        errors.append(f"❌ Accuracy improved by {accuracy_delta:.1f}pp - did you accidentally train the model?")
    elif accuracy_delta > 0.5:
        warnings.append(f"⚠️ Accuracy improved by {accuracy_delta:.1f}pp - verify no training occurred")
    else:
        checks.append(f"✅ Accuracy change {accuracy_delta:+.2f}pp is reasonable")

    # Check 4: a repo link is required so results can be reproduced.
    github_repo = submission.get("github_repo", "")
    if github_repo:
        checks.append(f"✅ GitHub repo provided: {github_repo}")
    else:
        warnings.append("⚠️ No GitHub repo provided - required for verification")

    # Check 5: schema completeness.
    required_fields = ["division", "event", "athlete_name", "baseline", "optimized", "normalized_scores"]
    missing = [name for name in required_fields if name not in submission]
    if missing:
        errors.append(f"❌ Missing required fields: {', '.join(missing)}")
    else:
        checks.append("✅ All required fields present")

    # Check 6: optimization techniques must be documented (not left as TODO).
    techniques = submission.get("techniques_applied", [])
    if not techniques or "TODO" in str(techniques):
        warnings.append("⚠️ No optimization techniques listed")
    else:
        checks.append(f"✅ Techniques documented: {', '.join(techniques[:3])}...")

    return {
        "valid": not errors,
        "checks": checks,
        "warnings": warnings,
        "errors": errors,
    }
#| export
def generate_submission(baseline_model, optimized_model,
division: str = "closed",
event: str = "all_around",
athlete_name: str = "YourName",
github_repo: str = "",
techniques: List[str] = None) -> Dict[str, Any]:
"""
Generate standardized competition submission.
Generate standardized TinyMLPerf competition submission with normalized scoring.
Args:
baseline_model: Original unoptimized model
optimized_model: Your optimized model
event: Olympic event name
athlete_name: Your name for leaderboard
techniques: List of techniques applied
division: "closed" or "open"
event: Competition category (latency_sprint, memory_challenge, all_around, etc.)
athlete_name: Your name for submission
github_repo: GitHub repository URL for code verification
techniques: List of optimization techniques applied
Returns:
Submission dictionary (will be saved as JSON)
"""
print("📤 Generating Competition Submission...")
print("📤 Generating TinyMLPerf Competition Submission...")
print("=" * 70)
# Get baseline metrics
baseline_metrics = generate_baseline(quick=True)
# For demonstration, estimate optimized metrics
# In real competition, this would benchmark the actual optimized model
# Benchmark optimized model
print("🔬 Benchmarking optimized model...")
# Placeholder: Students' actual optimizations would be measured here
# Use Profiler and Benchmark from Module 19
profiler = Profiler()
# For demonstration, we'll use placeholder metrics
# In real competition, students would measure their actual optimized model
optimized_metrics = {
"model": "Your_Optimized_Model",
"accuracy": 84.0, # Measured
"latency_ms": 28.0, # Measured
"memory_mb": 4.0, # Measured
"parameters": 2000000, # Measured
"model": getattr(optimized_model, 'name', 'Optimized_Model'),
"accuracy": 84.0, # Would be measured with actual test set
"latency_ms": 28.0, # Would be measured with profiler
"memory_mb": 4.0, # Would be measured with profiler
"parameters": 2000000, # Would be counted
}
# Calculate improvements
improvements = {
"accuracy_change": optimized_metrics["accuracy"] - baseline_metrics["accuracy"],
"latency_speedup": baseline_metrics["latency_ms"] / optimized_metrics["latency_ms"],
"memory_reduction": baseline_metrics["memory_mb"] / optimized_metrics["memory_mb"],
# Calculate normalized scores using Module 19's function
baseline_for_norm = {
"latency": baseline_metrics["latency_ms"],
"memory": baseline_metrics["memory_mb"],
"accuracy": baseline_metrics["accuracy"]
}
# Create submission
optimized_for_norm = {
"latency": optimized_metrics["latency_ms"],
"memory": optimized_metrics["memory_mb"],
"accuracy": optimized_metrics["accuracy"]
}
normalized_scores = calculate_normalized_scores(baseline_for_norm, optimized_for_norm)
# Create submission with all required fields
submission = {
"division": division,
"event": event,
"athlete_name": athlete_name,
"github_repo": github_repo,
"baseline": baseline_metrics,
"optimized": optimized_metrics,
"improvements": improvements,
"techniques_applied": techniques or ["TODO: List your techniques"],
"normalized_scores": {
"speedup": normalized_scores["speedup"],
"compression_ratio": normalized_scores["compression_ratio"],
"accuracy_delta": normalized_scores["accuracy_delta"],
"efficiency_score": normalized_scores["efficiency_score"]
},
"techniques_applied": techniques or ["TODO: Document your optimization techniques"],
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"tinytorch_version": "0.1.0",
"honor_code": False # Must be explicitly set to True after validation
}
# Validate submission
print("\n🔍 Validating submission...")
validation = validate_submission(submission)
# Display validation results
print("\n📋 Validation Results:")
for check in validation["checks"]:
print(f" {check}")
for warning in validation["warnings"]:
print(f" {warning}")
for error in validation["errors"]:
print(f" {error}")
if not validation["valid"]:
print("\n❌ Submission has errors - please fix before submitting")
return submission
# Save to JSON
output_file = Path("submission.json")
with open(output_file, "w") as f:
json.dump(submission, f, indent=2)
print(f"✅ Submission saved to: {output_file}")
print(f"\n✅ Submission saved to: {output_file}")
print()
print("📊 Your Results:")
print(f" Event: {event}")
print(f" Accuracy: {optimized_metrics['accuracy']:.1f}% (Δ {improvements['accuracy_change']:+.1f}pp)")
print(f" Latency: {optimized_metrics['latency_ms']:.1f}ms ({improvements['latency_speedup']:.2f}x faster)")
print(f" Memory: {optimized_metrics['memory_mb']:.2f}MB ({improvements['memory_reduction']:.2f}x smaller)")
print("📊 Your Normalized Scores (MLPerf-style):")
print(f" Division: {division.upper()}")
print(f" Event: {event.replace('_', ' ').title()}")
print(f" Speedup: {normalized_scores['speedup']:.2f}x faster")
print(f" Compression: {normalized_scores['compression_ratio']:.2f}x smaller 💾")
print(f" Accuracy: {optimized_metrics['accuracy']:.1f}% (Δ {normalized_scores['accuracy_delta']:+.2f}pp)")
print(f" Efficiency: {normalized_scores['efficiency_score']:.2f}")
print()
print("📤 Next Steps:")
print(" 1. Verify all metrics are correct")
print(" 2. Push your code to GitHub (if not done)")
print(" 3. Run: tito submit submission.json")
print(" (This will validate and prepare final submission)")
print()
print("📤 Upload submission.json to TorchPerf Olympics platform!")
print("=" * 70)
return submission

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/03_activations/activations_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/02_activations/activations_dev.ipynb.
# %% auto 0
__all__ = ['Sigmoid', 'ReLU', 'Tanh', 'GELU', 'Softmax']

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/07_attention/attention_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/12_attention/attention_dev.ipynb.
# %% auto 0
__all__ = ['scaled_dot_product_attention', 'MultiHeadAttention']
@@ -293,6 +279,10 @@ class MultiHeadAttention:
return output
### END SOLUTION
    def __call__(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor:
        """Allows the attention layer to be called like a function.

        Thin convenience wrapper: ``layer(x, mask)`` is equivalent to
        ``layer.forward(x, mask)``.

        Args:
            x: Input tensor, passed through to ``forward`` unchanged.
            mask: Optional attention mask, forwarded unchanged.

        Returns:
            The output tensor produced by ``self.forward(x, mask)``.
        """
        return self.forward(x, mask)
def parameters(self) -> List[Tensor]:
"""
Return all trainable parameters.

View File

@@ -1,23 +1,8 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/09_autograd/autograd_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/05_autograd/autograd_dev.ipynb.
# %% auto 0
__all__ = ['Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward', 'TransposeBackward',
'PermuteBackward', 'EmbeddingBackward', 'ReshapeBackward', 'SumBackward', 'ReLUBackward', 'SigmoidBackward',
'SoftmaxBackward', 'GELUBackward', 'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
__all__ = ['Function', 'AddBackward', 'MulBackward', 'MatmulBackward', 'SumBackward', 'ReLUBackward', 'SigmoidBackward',
'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 1
import numpy as np
@@ -164,66 +149,7 @@ class MulBackward(Function):
return grad_a, grad_b
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 13
class SubBackward(Function):
"""
Gradient computation for tensor subtraction.
**Mathematical Rule:** If z = a - b, then z/a = 1 and z/b = -1
"""
def apply(self, grad_output):
"""
Compute gradients for subtraction.
Returns:
Tuple of (grad_a, grad_b) where grad_b is negated
"""
a, b = self.saved_tensors
grad_a = grad_b = None
if isinstance(a, Tensor) and a.requires_grad:
grad_a = grad_output # ∂(a-b)/∂a = 1
if isinstance(b, Tensor) and b.requires_grad:
grad_b = -grad_output # ∂(a-b)/∂b = -1 (note the negative!)
return grad_a, grad_b
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 15
class DivBackward(Function):
"""
Gradient computation for tensor division.
**Mathematical Rule:** If z = a / b, then:
- z/a = 1/b
- z/b = -a/
"""
def apply(self, grad_output):
"""
Compute gradients for division using quotient rule.
Returns:
Tuple of (grad_a, grad_b)
"""
a, b = self.saved_tensors
grad_a = grad_b = None
if isinstance(a, Tensor) and a.requires_grad:
# ∂(a/b)/∂a = 1/b
if isinstance(b, Tensor):
grad_a = grad_output / b.data
else:
grad_a = grad_output / b
if isinstance(b, Tensor) and b.requires_grad:
# ∂(a/b)/∂b = -a/b²
grad_b = -grad_output * a.data / (b.data ** 2)
return grad_a, grad_b
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 17
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 14
class MatmulBackward(Function):
"""
Gradient computation for matrix multiplication.
@@ -252,242 +178,21 @@ class MatmulBackward(Function):
**Mathematical Foundation:**
- (A@B)/A = grad_output @ B.T
- (A@B)/B = A.T @ grad_output
**Batched Operation:** For 3D+ tensors, we transpose only the last two
dimensions using np.swapaxes, preserving batch dimensions.
"""
a, b = self.saved_tensors
grad_a = grad_b = None
# Gradient for first input: grad_output @ b.T
if isinstance(a, Tensor) and a.requires_grad:
# For batched tensors, transpose only last two dims
if b.data.ndim >= 2:
b_T = np.swapaxes(b.data, -2, -1)
else:
b_T = b.data.T
grad_a = np.matmul(grad_output, b_T)
grad_a = np.dot(grad_output, b.data.T)
# Gradient for second input: a.T @ grad_output
if isinstance(b, Tensor) and b.requires_grad:
# For batched tensors, transpose only last two dims
if a.data.ndim >= 2:
a_T = np.swapaxes(a.data, -2, -1)
else:
a_T = a.data.T
grad_b = np.matmul(a_T, grad_output)
grad_b = np.dot(a.data.T, grad_output)
return grad_a, grad_b
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 18
class TransposeBackward(Function):
    """
    Backward pass for the transpose operation.

    **Mathematical Rule:** If Y = X.T, then:
    - ∂Y/∂X = grad_Y.T

    **Key Insight:** Transpose is a linear operation that merely rearranges
    elements, so its gradient is just the transposed incoming gradient.

    **Applications:** Attention score computation (K.T), weight gradients
    (W.T), and any operation that needs to swap matrix dimensions.
    """
    def __init__(self, tensor, dim0, dim1):
        """
        Args:
            tensor: Input tensor
            dim0: First dimension to swap (None for default)
            dim1: Second dimension to swap (None for default)
        """
        super().__init__(tensor)
        self.dim0 = dim0
        self.dim1 = dim1

    def apply(self, grad_output):
        """
        Compute the gradient for transpose.

        Args:
            grad_output: Gradient flowing backward from the output

        Returns:
            Tuple with a single gradient for the input tensor

        The gradient is recovered by swapping the same pair of axes back.
        """
        x, = self.saved_tensors
        if not (isinstance(x, Tensor) and x.requires_grad):
            return (None,)
        ndim = grad_output.ndim
        if self.dim0 is None and self.dim1 is None:
            # Default behavior: swap the last two axes (no-op for vectors)
            if ndim < 2:
                return (grad_output.copy(),)
            order = list(range(ndim))
            order[-1], order[-2] = order[-2], order[-1]
        else:
            # Explicit axes: swapping them a second time undoes the transpose
            order = list(range(ndim))
            order[self.dim1], order[self.dim0] = order[self.dim0], order[self.dim1]
        return (np.transpose(grad_output, order),)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 19
class PermuteBackward(Function):
    """
    Gradient computation for arbitrary axis permutation (general transpose).

    **Mathematical Rule:** If Y = X.permute(axes), then:
    - ∂Y/∂X = grad_Y.permute(inverse_axes)

    **Example:** For axes = (0, 2, 1, 3) the inverse is (0, 2, 1, 3)
    (self-inverse); for axes = (2, 0, 1) the inverse is (1, 2, 0).

    **Key Insight:** To reverse a permutation we must know where each axis
    went: if axis i moved to position axes[i], the inverse sends position
    axes[i] back to i — exactly what np.argsort(axes) computes.

    **Applications:** Multi-head attention uses (0, 2, 1, 3) to rearrange heads.
    """
    def __init__(self, tensor, axes):
        """
        Args:
            tensor: Input tensor
            axes: Tuple of axis indices defining the permutation
        """
        super().__init__(tensor)
        self.axes = axes
        # np.argsort of a permutation yields its inverse permutation
        self.inverse_axes = tuple(np.argsort(axes))

    def apply(self, grad_output):
        """
        Compute the gradient for the permutation by applying the inverse
        permutation to the incoming gradient.
        """
        x, = self.saved_tensors
        if isinstance(x, Tensor) and x.requires_grad:
            return (np.transpose(grad_output, self.inverse_axes),)
        return (None,)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 20
class EmbeddingBackward(Function):
    """
    Gradient computation for the embedding lookup operation.

    **Mathematical Rule:** If Y = Embedding[indices], then:
    - ∂Loss/∂Embedding[i] = sum of all output gradients where index == i

    **Key Insight:** The forward lookup is a gather; the backward pass is a
    scatter-add that accumulates gradients into the selected embedding rows.

    **Applications:** Word embeddings, positional embeddings, token
    embeddings in transformers.
    """
    def __init__(self, weight, indices):
        """
        Args:
            weight: Embedding weight matrix
            indices: Indices used for the lookup
        """
        super().__init__(weight)
        self.indices = indices

    def apply(self, grad_output):
        """
        Scatter-add output gradients back into the embedding rows.

        Args:
            grad_output: Gradient flowing backward from the output

        Returns:
            Tuple with a single gradient for the weight tensor
        """
        weight, = self.saved_tensors
        if not (isinstance(weight, Tensor) and weight.requires_grad):
            return (None,)
        grad_weight = np.zeros_like(weight.data)
        flat_idx = self.indices.data.astype(int).reshape(-1)
        flat_grad = grad_output.reshape(-1, grad_output.shape[-1])
        # np.add.at is an unbuffered scatter-add, so gradients for
        # repeated indices accumulate instead of overwriting each other
        np.add.at(grad_weight, flat_idx, flat_grad)
        return (grad_weight,)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 21
class ReshapeBackward(Function):
    """
    Gradient computation for the reshape operation.

    **Mathematical Rule:** If Y = X.reshape(new_shape), then:
    - ∂Y/∂X = grad_Y.reshape(X.shape)

    **Key Insight:** Reshape only changes how the same elements are viewed,
    so the gradient is simply reshaped back to the input's original shape.

    **Applications:** Flattening tensors before linear layers, converting
    between convolutional and dense representations.
    """
    def __init__(self, tensor, original_shape):
        """
        Args:
            tensor: Input tensor
            original_shape: Shape of the tensor before the reshape
        """
        super().__init__(tensor)
        self.original_shape = original_shape

    def apply(self, grad_output):
        """
        Compute the gradient for reshape.

        Args:
            grad_output: Gradient flowing backward from the output

        Returns:
            Tuple with a single gradient for the input tensor
        """
        x, = self.saved_tensors
        if isinstance(x, Tensor) and x.requires_grad:
            # Undo the forward reshape on the gradient
            return (grad_output.reshape(self.original_shape),)
        return (None,)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 16
class SumBackward(Function):
"""
Gradient computation for tensor sum.
@@ -521,7 +226,7 @@ class SumBackward(Function):
return np.ones_like(tensor.data) * grad_output,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23
class ReLUBackward(Function):
"""
Gradient computation for ReLU activation.
@@ -544,7 +249,7 @@ class ReLUBackward(Function):
return grad_output * relu_grad,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 25
class SigmoidBackward(Function):
"""
Gradient computation for sigmoid activation.
@@ -574,101 +279,7 @@ class SigmoidBackward(Function):
return grad_output * sigmoid_grad,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 30
class SoftmaxBackward(Function):
    """
    Gradient computation for the softmax activation.

    Softmax: softmax(x)[i] = exp(x[i]) / sum_j exp(x[j])
    Jacobian: ∂softmax[i]/∂x[j] = softmax[i] * (δ[i,j] - softmax[j])

    Vectorized gradient:
        grad_x = s * (grad_y - sum(grad_y * s, keepdims=True))

    **Key Insight:** Because of the shared normalization term, the gradient
    at each position depends on every element of the softmax output, not
    just the element being differentiated.
    """
    def __init__(self, input_tensor, output_tensor, dim=-1):
        """
        Args:
            input_tensor: Original input to softmax
            output_tensor: Softmax output (reused in the backward formula)
            dim: Dimension along which softmax was applied
        """
        super().__init__(input_tensor)
        self.output_data = output_tensor.data
        self.dim = dim

    def apply(self, grad_output):
        """
        Compute the gradient for softmax.

        Implements:
            ∂L/∂x[i] = s[i] * (∂L/∂y[i] - sum_j(∂L/∂y[j] * s[j]))
        where s is the cached softmax output.
        """
        tensor, = self.saved_tensors
        if not (isinstance(tensor, Tensor) and tensor.requires_grad):
            return (None,)
        s = self.output_data
        # Reduction over the softmax axis, kept broadcastable against s
        inner = np.sum(grad_output * s, axis=self.dim, keepdims=True)
        return (s * (grad_output - inner),)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 31
class GELUBackward(Function):
    """
    Gradient computation for the GELU activation.

    GELU: f(x) = x * Φ(x), where Φ is the standard normal CDF.
    Tanh approximation:
        gelu(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))

    **Key Insight:** Unlike ReLU, GELU is smooth and yields non-zero
    gradients for negative inputs, which helps deep networks train.
    """
    def __init__(self, input_tensor):
        """Initialize with the input tensor."""
        super().__init__(input_tensor)

    def apply(self, grad_output):
        """
        Compute the gradient of GELU via the tanh approximation.

        d/dx gelu(x) ≈ 0.5 * (1 + tanh(u)) + 0.5 * x * sech²(u) * du/dx
        where u = sqrt(2/π) * (x + 0.044715 * x³)
        and du/dx = sqrt(2/π) * (1 + 0.134145 * x²)   # 3 * 0.044715 = 0.134145
        """
        tensor, = self.saved_tensors
        if not (isinstance(tensor, Tensor) and tensor.requires_grad):
            return (None,)
        x = tensor.data
        c = np.sqrt(2.0 / np.pi)
        u = c * (x + 0.044715 * x ** 3)
        t = np.tanh(u)
        sech_sq = 1.0 - t ** 2
        du_dx = c * (1.0 + 0.134145 * x ** 2)
        local_grad = 0.5 * (1.0 + t) + 0.5 * x * sech_sq * du_dx
        return (grad_output * local_grad,)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 32
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 26
class MSEBackward(Function):
"""
Gradient computation for Mean Squared Error Loss.
@@ -694,7 +305,7 @@ class MSEBackward(Function):
return grad * grad_output,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 33
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 27
class BCEBackward(Function):
"""
Gradient computation for Binary Cross-Entropy Loss.
@@ -724,7 +335,7 @@ class BCEBackward(Function):
return grad * grad_output,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 34
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28
class CrossEntropyBackward(Function):
"""
Gradient computation for Cross-Entropy Loss.
@@ -769,7 +380,7 @@ class CrossEntropyBackward(Function):
return grad * grad_output,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 35
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29
def enable_autograd():
"""
Enable gradient tracking for all Tensor operations.
@@ -806,12 +417,8 @@ def enable_autograd():
# Store original operations
_original_add = Tensor.__add__
_original_sub = Tensor.__sub__
_original_mul = Tensor.__mul__
_original_div = Tensor.__truediv__
_original_matmul = Tensor.matmul if hasattr(Tensor, 'matmul') else None
_original_transpose = Tensor.transpose if hasattr(Tensor, 'transpose') else None
_original_reshape = Tensor.reshape if hasattr(Tensor, 'reshape') else None
# Enhanced operations that track gradients
def tracked_add(self, other):
@@ -878,98 +485,6 @@ def enable_autograd():
return result
def tracked_transpose(self, dim0=None, dim1=None):
"""
Transpose with gradient tracking.
Enhances the original transpose method to build computation graphs
when requires_grad=True for the input.
"""
if _original_transpose:
result = _original_transpose(self, dim0, dim1)
else:
# Fallback if transpose doesn't exist
if dim0 is None and dim1 is None:
axes = list(range(len(self.shape)))
if len(axes) >= 2:
axes[-2], axes[-1] = axes[-1], axes[-2]
result = Tensor(np.transpose(self.data, axes))
else:
axes = list(range(len(self.shape)))
axes[dim0], axes[dim1] = axes[dim1], axes[dim0]
result = Tensor(np.transpose(self.data, axes))
# Track gradient if needed
if self.requires_grad:
result.requires_grad = True
result._grad_fn = TransposeBackward(self, dim0, dim1)
return result
def tracked_reshape(self, *shape):
"""
Reshape with gradient tracking.
Enhances the original reshape method to build computation graphs
when requires_grad=True for the input.
"""
original_shape = self.shape
if _original_reshape:
result = _original_reshape(self, *shape)
else:
# Fallback if reshape doesn't exist
result = Tensor(self.data.reshape(*shape))
# Track gradient if needed
if self.requires_grad:
result.requires_grad = True
result._grad_fn = ReshapeBackward(self, original_shape)
return result
def tracked_sub(self, other):
"""
Subtraction with gradient tracking.
Enhances the original __sub__ method to build computation graphs
when requires_grad=True for any input.
"""
# Convert scalar to Tensor if needed
if not isinstance(other, Tensor):
other = Tensor(other)
# Call original operation
result = _original_sub(self, other)
# Track gradient if needed
if self.requires_grad or other.requires_grad:
result.requires_grad = True
result._grad_fn = SubBackward(self, other)
return result
def tracked_div(self, other):
"""
Division with gradient tracking.
Enhances the original __truediv__ method to build computation graphs
when requires_grad=True for any input.
"""
# Convert scalar to Tensor if needed
if not isinstance(other, Tensor):
other = Tensor(other)
# Call original operation
result = _original_div(self, other)
# Track gradient if needed
if self.requires_grad or other.requires_grad:
result.requires_grad = True
result._grad_fn = DivBackward(self, other)
return result
def sum_op(self, axis=None, keepdims=False):
"""
Sum operation with gradient tracking.
@@ -1058,26 +573,20 @@ def enable_autograd():
# Install enhanced operations
Tensor.__add__ = tracked_add
Tensor.__sub__ = tracked_sub
Tensor.__mul__ = tracked_mul
Tensor.__truediv__ = tracked_div
Tensor.matmul = tracked_matmul
Tensor.transpose = tracked_transpose
Tensor.reshape = tracked_reshape
Tensor.sum = sum_op
Tensor.backward = backward
Tensor.zero_grad = zero_grad
# Patch activations and losses to track gradients
try:
from tinytorch.core.activations import Sigmoid, ReLU, Softmax, GELU
from tinytorch.core.activations import Sigmoid, ReLU
from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss, CrossEntropyLoss
# Store original methods
_original_sigmoid_forward = Sigmoid.forward
_original_relu_forward = ReLU.forward
_original_softmax_forward = Softmax.forward
_original_gelu_forward = GELU.forward
_original_bce_forward = BinaryCrossEntropyLoss.forward
_original_mse_forward = MSELoss.forward
_original_ce_forward = CrossEntropyLoss.forward
@@ -1104,30 +613,6 @@ def enable_autograd():
return result
def tracked_softmax_forward(self, x, dim=-1):
"""Softmax with gradient tracking."""
# Call original forward to get result using Tensor operations
result = _original_softmax_forward(self, x, dim=dim)
# Attach the correct gradient function
if x.requires_grad:
result.requires_grad = True
result._grad_fn = SoftmaxBackward(x, result, dim)
return result
def tracked_gelu_forward(self, x):
"""GELU with gradient tracking."""
# Call original forward to get result
result = _original_gelu_forward(self, x)
# Attach the correct gradient function
if x.requires_grad:
result.requires_grad = True
result._grad_fn = GELUBackward(x)
return result
def tracked_bce_forward(self, predictions, targets):
"""Binary cross-entropy with gradient tracking."""
# Compute BCE loss
@@ -1187,8 +672,6 @@ def enable_autograd():
# Install patched methods
Sigmoid.forward = tracked_sigmoid_forward
ReLU.forward = tracked_relu_forward
Softmax.forward = tracked_softmax_forward
GELU.forward = tracked_gelu_forward
BinaryCrossEntropyLoss.forward = tracked_bce_forward
MSELoss.forward = tracked_mse_forward
CrossEntropyLoss.forward = tracked_ce_forward

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/04_layers/layers_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/03_layers/layers_dev.ipynb.
# %% auto 0
__all__ = ['Linear', 'Dropout']

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_losses/losses_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/04_losses/losses_dev.ipynb.
# %% auto 0
__all__ = ['import_previous_module', 'log_softmax', 'MSELoss', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss']

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/10_optimizers/optimizers_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/06_optimizers/optimizers_dev.ipynb.
# %% auto 0
__all__ = ['Optimizer', 'SGD', 'Adam', 'AdamW']

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/06_spatial/spatial_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/09_spatial/spatial_dev.ipynb.
# %% auto 0
__all__ = ['Conv2d', 'MaxPool2d', 'AvgPool2d', 'SimpleCNN']

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/02_tensor/tensor_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/01_tensor/tensor_dev.ipynb.
# %% auto 0
__all__ = ['Tensor']

View File

@@ -1,21 +1,7 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/11_training/training_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/07_training/training_dev.ipynb.
# %% auto 0
__all__ = ['CosineSchedule', 'Trainer']
__all__ = ['CosineSchedule', 'save_checkpoint', 'load_checkpoint', 'Trainer']
# %% ../../modules/source/07_training/training_dev.ipynb 1
import numpy as np
@@ -72,6 +58,90 @@ class CosineSchedule:
### END SOLUTION
# %% ../../modules/source/07_training/training_dev.ipynb 14
def save_checkpoint(checkpoint_dict: Dict[str, Any], path: str):
    """
    Save a checkpoint dictionary to disk using pickle.

    Low-level utility for persisting model state. Use this in custom
    training loops where you want to save exactly what you need (model
    parameters, config, metadata). For complete training state including
    optimizer and scheduler, use Trainer.save_checkpoint() instead.

    Args:
        checkpoint_dict: Arbitrary picklable dictionary to persist.
        path: Destination file path; parent directories are created as needed.

    Example:
        >>> checkpoint = {
        ...     'model_params': [p.data.copy() for p in model.parameters()],
        ...     'config': {'embed_dim': 32, 'num_layers': 2},
        ...     'metadata': {'final_loss': 0.089, 'training_steps': 5000}
        ... }
        >>> save_checkpoint(checkpoint, 'checkpoints/model.pkl')
        ✓ Checkpoint saved: checkpoints/model.pkl
    """
    ### BEGIN SOLUTION
    # Ensure the destination directory exists before writing
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    # Serialize the dictionary in binary mode
    with open(path, 'wb') as f:
        pickle.dump(checkpoint_dict, f)
    # Confirm success so callers know the write completed
    print(f"✓ Checkpoint saved: {path}")
    ### END SOLUTION
# %% ../../modules/source/07_training/training_dev.ipynb 15
def load_checkpoint(path: str) -> Dict[str, Any]:
    """
    Load a checkpoint dictionary from disk using pickle.

    Companion to save_checkpoint(): restores the saved dictionary so you
    can rebuild a model, resume training, or inspect stored metadata.

    Args:
        path: File path of a checkpoint written by save_checkpoint().

    Returns:
        The deserialized checkpoint dictionary.

    Example:
        >>> checkpoint = load_checkpoint('checkpoints/model.pkl')
        ✓ Checkpoint loaded: checkpoints/model.pkl
        >>> checkpoint['metadata']['final_loss']
        0.089

    NOTE: pickle is not safe on untrusted data — only load checkpoint
    files that you (or your own training runs) created.
    """
    ### BEGIN SOLUTION
    # Read and deserialize in binary mode
    with open(path, 'rb') as f:
        restored = pickle.load(f)
    # Confirm success so callers know the read completed
    print(f"✓ Checkpoint loaded: {path}")
    return restored
    ### END SOLUTION
# %% ../../modules/source/07_training/training_dev.ipynb 19
class Trainer:
"""
Complete training orchestrator for neural networks.
@@ -246,6 +316,11 @@ class Trainer:
def save_checkpoint(self, path: str):
"""
Save complete training state for resumption.
This high-level method saves everything needed to resume training:
model parameters, optimizer state, scheduler state, and training history.
Uses the low-level save_checkpoint() function internally.
Args:
path: File path to save checkpoint
@@ -260,19 +335,23 @@ class Trainer:
'training_mode': self.training_mode
}
Path(path).parent.mkdir(parents=True, exist_ok=True)
with open(path, 'wb') as f:
pickle.dump(checkpoint, f)
# Use the standalone save_checkpoint function
save_checkpoint(checkpoint, path)
def load_checkpoint(self, path: str):
"""
Load training state from checkpoint.
This high-level method restores complete training state including
model parameters, optimizer state, scheduler state, and history.
Uses the low-level load_checkpoint() function internally.
Args:
path: File path to load checkpoint from
"""
with open(path, 'rb') as f:
checkpoint = pickle.load(f)
# Use the standalone load_checkpoint function
checkpoint = load_checkpoint(path)
self.epoch = checkpoint['epoch']
self.step = checkpoint['step']

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_loader/loader_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/08_dataloader/dataloader_dev.ipynb.
# %% auto 0
__all__ = ['Dataset', 'TensorDataset', 'DataLoader']

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_kv_cache/kv_cache_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/15_memoization/memoization_dev.ipynb.
# %% auto 0
__all__ = ['KVCache', 'enable_kv_cache', 'disable_kv_cache']

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_transformer/transformer_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/13_transformers/transformers_dev.ipynb.
# %% auto 0
__all__ = ['LayerNorm', 'MLP', 'TransformerBlock', 'GPT']
@@ -23,7 +9,6 @@ from ..core.tensor import Tensor
from ..core.layers import Linear
from ..core.attention import MultiHeadAttention
from ..core.activations import GELU
from ..text.embeddings import Embedding, PositionalEncoding
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 9
class LayerNorm:
@@ -61,6 +46,7 @@ class LayerNorm:
self.eps = eps
# Learnable parameters: scale and shift
# CRITICAL: requires_grad=True so optimizer can train these!
self.gamma = Tensor(np.ones(normalized_shape), requires_grad=True) # Scale parameter
self.beta = Tensor(np.zeros(normalized_shape), requires_grad=True) # Shift parameter
### END SOLUTION
@@ -83,19 +69,18 @@ class LayerNorm:
HINT: Use keepdims=True to maintain tensor dimensions for broadcasting
"""
### BEGIN SOLUTION
# CRITICAL: Use Tensor operations (not .data) to maintain gradient flow!
# Compute statistics across last dimension (features)
mean = x.mean(axis=-1, keepdims=True)
# Compute variance: E[(x - μ)²]
# Use Tensor operations to preserve computation graph!
diff = x - mean
variance = (diff * diff).mean(axis=-1, keepdims=True)
diff = x - mean # Tensor subtraction maintains gradient
variance = (diff * diff).mean(axis=-1, keepdims=True) # Tensor ops maintain gradient
# Normalize - use Tensor operations to preserve gradients!
# Add eps as a Tensor for proper gradient flow
eps_tensor = Tensor(np.array(self.eps), requires_grad=False)
std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)
normalized = (x - mean) / std
# Normalize: (x - mean) / sqrt(variance + eps)
# Note: sqrt and division need to preserve gradient flow
std_data = np.sqrt(variance.data + self.eps)
normalized = diff * Tensor(1.0 / std_data) # Scale by reciprocal to maintain gradient
# Apply learnable transformation
output = normalized * self.gamma + self.beta
@@ -103,7 +88,7 @@ class LayerNorm:
### END SOLUTION
def __call__(self, x):
"""Allows the layer norm to be called like a function."""
"""Allows the layer to be called like a function."""
return self.forward(x)
def parameters(self):
@@ -147,7 +132,7 @@ class MLP:
# Two-layer feed-forward network
self.linear1 = Linear(embed_dim, hidden_dim)
self.gelu = GELU() # Use GELU activation from activations module
self.gelu = GELU()
self.linear2 = Linear(hidden_dim, embed_dim)
### END SOLUTION
@@ -171,7 +156,7 @@ class MLP:
# First linear layer with expansion
hidden = self.linear1.forward(x)
# GELU activation (YOUR activation from Module 03!)
# GELU activation
hidden = self.gelu.forward(hidden)
# Second linear layer back to original size
@@ -404,10 +389,6 @@ class GPT:
return logits
### END SOLUTION
def __call__(self, tokens):
"""Allows the GPT model to be called like a function."""
return self.forward(tokens)
def _create_causal_mask(self, seq_len):
"""Create causal mask to prevent attending to future positions."""
### BEGIN SOLUTION

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_acceleration/acceleration_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/18_acceleration/acceleration_dev.ipynb.
# %% auto 0
__all__ = []

View File

@@ -1,22 +1,7 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_compression/compression_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/17_compression/compression_dev.ipynb.
# %% auto 0
__all__ = ['Sequential', 'KnowledgeDistillation', 'test_unit_knowledge_distillation', 'CompressionComplete', 'measure_sparsity',
'magnitude_prune', 'structured_prune', 'compress_model']
__all__ = ['Tensor', 'Linear', 'Sequential']
# %% ../../modules/source/17_compression/compression_dev.ipynb 1
import numpy as np
@@ -24,277 +9,77 @@ import copy
from typing import List, Dict, Any, Tuple, Optional
import time
# Import from TinyTorch modules
from ..core.tensor import Tensor
from ..core.layers import Linear
# Import from previous modules
# Note: In the full package, these would be imports like:
# from tinytorch.core.tensor import Tensor
# from tinytorch.core.layers import Linear
# For development, we'll create minimal implementations
class Tensor:
    """Lightweight stand-in for the real Tensor (Module 01) used while
    developing compression utilities.

    Wraps a NumPy array and exposes just enough arithmetic (add, mul,
    matmul, abs, sum) for pruning and distillation experiments.
    """

    def __init__(self, data, requires_grad=False):
        self.data = np.array(data)
        self.shape = self.data.shape
        self.size = self.data.size
        self.requires_grad = requires_grad
        # Gradient buffer; populated by the autograd system in the full framework.
        self.grad = None

    def __add__(self, other):
        rhs = other.data if isinstance(other, Tensor) else other
        return Tensor(self.data + rhs)

    def __mul__(self, other):
        rhs = other.data if isinstance(other, Tensor) else other
        return Tensor(self.data * rhs)

    def matmul(self, other):
        """Matrix-multiply this tensor with another Tensor."""
        return Tensor(np.dot(self.data, other.data))

    def abs(self):
        """Element-wise absolute value."""
        return Tensor(np.abs(self.data))

    def sum(self, axis=None):
        """Sum all elements, or along *axis* when given."""
        return Tensor(self.data.sum(axis=axis))

    def __repr__(self):
        return f"Tensor(shape={self.shape})"
class Linear:
    """Lightweight stand-in for the real Linear layer (Module 03) used while
    developing compression utilities.

    Holds a He-initialized weight matrix of shape (in_features, out_features)
    and an optional bias vector.
    """

    def __init__(self, in_features, out_features, bias=True):
        self.in_features = in_features
        self.out_features = out_features
        # He initialization: std = sqrt(2 / fan_in) keeps activation
        # variance stable for ReLU-style networks.
        scale = np.sqrt(2.0 / in_features)
        self.weight = Tensor(np.random.randn(in_features, out_features) * scale)
        self.bias = Tensor(np.zeros(out_features)) if bias else None

    def forward(self, x):
        """Affine transform: x @ weight, plus bias when present."""
        out = x.matmul(self.weight)
        if self.bias is not None:
            out = out + self.bias
        return out

    def parameters(self):
        """Trainable tensors: weight, plus bias when present."""
        if self.bias is None:
            return [self.weight]
        return [self.weight, self.bias]
# Sequential container for model compression
class Sequential:
    """Minimal sequential container used for model-compression experiments.

    Applies each layer in order. Layers may either expose a ``forward``
    method or simply be callable.

    Note: the previous export contained two consecutive assignments in
    ``forward`` (one guarded by ``hasattr``, one unconditional), which would
    have applied every layer twice per step. This version applies each layer
    exactly once, preferring ``forward`` when available.
    """

    def __init__(self, *layers):
        self.layers = list(layers)

    def forward(self, x):
        """Run *x* through every layer in order and return the result."""
        for layer in self.layers:
            x = layer.forward(x) if hasattr(layer, 'forward') else layer(x)
        return x

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Collect parameters from every layer that exposes them."""
        params = []
        for layer in self.layers:
            if hasattr(layer, 'parameters'):
                params.extend(layer.parameters())
        return params
# %% ../../modules/source/17_compression/compression_dev.ipynb 15
class KnowledgeDistillation:
    """
    Knowledge distillation for model compression.

    Train a smaller student model to mimic a larger teacher model by matching
    the teacher's softened output distribution in addition to the true labels.
    """
    def __init__(self, teacher_model, student_model, temperature=3.0, alpha=0.7):
        """
        Initialize knowledge distillation.

        TODO: Set up teacher and student models with distillation parameters

        APPROACH:
        1. Store teacher and student models
        2. Set temperature for softening probability distributions
        3. Set alpha for balancing hard vs soft targets

        EXAMPLE:
        >>> teacher = Sequential(Linear(100, 200), Linear(200, 50))
        >>> student = Sequential(Linear(100, 50))
        >>> kd = KnowledgeDistillation(teacher, student, temperature=4.0, alpha=0.8)
        >>> print(f"Temperature: {kd.temperature}, Alpha: {kd.alpha}")
        Temperature: 4.0, Alpha: 0.8

        HINTS:
        - Simply assign the parameters to instance variables
        - Temperature typically ranges from 3-5 for effective softening
        - Alpha of 0.7 means 70% soft targets, 30% hard targets

        Args:
            teacher_model: Large, pre-trained model
            student_model: Smaller model to train
            temperature: Softening parameter for distributions
            alpha: Weight for soft target loss (1-alpha for hard targets)
        """
        ### BEGIN SOLUTION
        self.teacher_model = teacher_model
        self.student_model = student_model
        self.temperature = temperature
        self.alpha = alpha
        ### END SOLUTION

    def distillation_loss(self, student_logits, teacher_logits, true_labels):
        """
        Calculate combined distillation loss.

        TODO: Implement knowledge distillation loss function

        APPROACH:
        1. Calculate hard target loss (student vs true labels)
        2. Calculate soft target loss (student vs teacher, with temperature)
        3. Combine losses: alpha * soft_loss + (1-alpha) * hard_loss

        EXAMPLE:
        >>> kd = KnowledgeDistillation(teacher, student)
        >>> loss = kd.distillation_loss(student_out, teacher_out, labels)
        >>> print(f"Distillation loss: {loss:.4f}")

        HINTS:
        - Use temperature to soften distributions: logits/temperature
        - Soft targets use KL divergence or cross-entropy
        - Hard targets use standard classification loss
        """
        ### BEGIN SOLUTION
        # Convert to numpy for this implementation
        # (accepts either raw arrays or Tensor-like objects with a .data field)
        if hasattr(student_logits, 'data'):
            student_logits = student_logits.data
        if hasattr(teacher_logits, 'data'):
            teacher_logits = teacher_logits.data
        if hasattr(true_labels, 'data'):
            true_labels = true_labels.data
        # Soften distributions with temperature: T > 1 flattens the softmax,
        # exposing the teacher's relative preferences among wrong classes
        student_soft = self._softmax(student_logits / self.temperature)
        teacher_soft = self._softmax(teacher_logits / self.temperature)
        # Soft target loss (KL divergence between softened distributions)
        soft_loss = self._kl_divergence(student_soft, teacher_soft)
        # Hard target loss (cross-entropy against the true labels,
        # computed at temperature 1)
        student_hard = self._softmax(student_logits)
        hard_loss = self._cross_entropy(student_hard, true_labels)
        # Combined loss: alpha weights the soft (teacher) term
        total_loss = self.alpha * soft_loss + (1 - self.alpha) * hard_loss
        return total_loss
        ### END SOLUTION

    def _softmax(self, logits):
        """Compute softmax with numerical stability."""
        # Subtracting the per-row max before exp() prevents overflow
        exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
        return exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)

    def _kl_divergence(self, p, q):
        """Compute KL divergence between distributions."""
        # 1e-8 epsilons guard against division by zero and log(0)
        return np.sum(p * np.log(p / (q + 1e-8) + 1e-8))

    def _cross_entropy(self, predictions, labels):
        """Compute cross-entropy loss."""
        # Simple implementation for integer labels (1-D) or one-hot/soft
        # label matrices (2-D)
        if labels.ndim == 1:
            return -np.mean(np.log(predictions[np.arange(len(labels)), labels] + 1e-8))
        else:
            return -np.mean(np.sum(labels * np.log(predictions + 1e-8), axis=1))
def test_unit_knowledge_distillation():
    """🔬 Test knowledge distillation functionality."""
    print("🔬 Unit Test: Knowledge Distillation...")
    # Teacher is wider than the student it will supervise.
    teacher = Sequential(Linear(10, 20), Linear(20, 5))
    student = Sequential(Linear(10, 5))
    kd = KnowledgeDistillation(teacher, student, temperature=3.0, alpha=0.7)
    # Dummy batch: 8 examples, 10 features, 5 target classes.
    batch = Tensor(np.random.randn(8, 10))
    labels = np.array([0, 1, 2, 3, 4, 0, 1, 2])
    # Run both models and score the student against the teacher + labels.
    teacher_out = teacher.forward(batch)
    student_out = student.forward(batch)
    loss = kd.distillation_loss(student_out, teacher_out, labels)
    # The combined loss must be a finite positive scalar.
    assert isinstance(loss, (float, np.floating)), f"Loss should be float, got {type(loss)}"
    assert loss > 0, f"Loss should be positive, got {loss}"
    assert not np.isnan(loss), "Loss should not be NaN"
    print("✅ knowledge_distillation works correctly!")
test_unit_knowledge_distillation()
# %% ../../modules/source/17_compression/compression_dev.ipynb 29
class CompressionComplete:
    """
    Complete compression system for milestone use.

    Bundles sparsity measurement, magnitude pruning, and structured pruning
    behind static methods so it can be used without instantiation.
    """

    @staticmethod
    def measure_sparsity(model) -> float:
        """Return the fraction of weights in *model* that are exactly zero.

        Models without a ``parameters()`` method are treated as having no
        weights and yield 0.0.
        """
        total = 0
        zeros = 0
        if hasattr(model, 'parameters'):
            for param in model.parameters():
                total += param.size
                zeros += np.sum(param.data == 0)
        return zeros / total if total > 0 else 0.0

    @staticmethod
    def magnitude_prune(model, sparsity=0.5):
        """Zero out the smallest-magnitude weights of every parameter.

        Args:
            model: Model with parameters() method
            sparsity: Fraction of weights to prune (0-1)
        """
        if hasattr(model, 'parameters'):
            for param in model.parameters():
                # Per-parameter magnitude cutoff at the requested percentile.
                cutoff = np.percentile(np.abs(param.data), sparsity * 100)
                param.data[np.abs(param.data) < cutoff] = 0
        return model

    @staticmethod
    def structured_prune(model, prune_ratio=0.5):
        """Prune entire output neurons (structured pruning).

        Only the first parameter is considered, and only when it is a 2-D
        weight matrix (i.e. a Linear layer).

        Args:
            model: Model to prune
            prune_ratio: Fraction of structures to prune (0-1)
        """
        if not hasattr(model, 'parameters'):
            return model
        params = list(model.parameters())
        if params and hasattr(params[0], 'data'):
            weight = params[0]
            if len(weight.shape) == 2:
                # Rank output neurons by column L2 norm; zero the weakest.
                norms = np.linalg.norm(weight.data, axis=0)
                cutoff = np.percentile(norms, prune_ratio * 100)
                keep = norms >= cutoff
                weight.data[:, ~keep] = 0
        return model

    @staticmethod
    def compress_model(model, compression_config: Dict[str, Any]):
        """Apply the full pruning pipeline described by *compression_config*.

        Args:
            model: Model to compress
            compression_config: Dictionary with compression settings
                - 'magnitude_sparsity': float (0-1)
                - 'structured_prune_ratio': float (0-1)

        Returns:
            Tuple of (compressed model, stats dict with sparsity before/after
            and the resulting compression ratio).
        """
        stats = {
            'original_sparsity': CompressionComplete.measure_sparsity(model)
        }
        if 'magnitude_sparsity' in compression_config:
            model = CompressionComplete.magnitude_prune(
                model, compression_config['magnitude_sparsity']
            )
        if 'structured_prune_ratio' in compression_config:
            model = CompressionComplete.structured_prune(
                model, compression_config['structured_prune_ratio']
            )
        final = CompressionComplete.measure_sparsity(model)
        stats['final_sparsity'] = final
        stats['compression_ratio'] = 1.0 / (1.0 - final) if final < 1.0 else float('inf')
        return model, stats
# Convenience functions for backward compatibility
def measure_sparsity(model) -> float:
    """Backward-compatible alias for CompressionComplete.measure_sparsity."""
    return CompressionComplete.measure_sparsity(model)


def magnitude_prune(model, sparsity=0.5):
    """Backward-compatible alias for CompressionComplete.magnitude_prune."""
    return CompressionComplete.magnitude_prune(model, sparsity)


def structured_prune(model, prune_ratio=0.5):
    """Backward-compatible alias for CompressionComplete.structured_prune."""
    return CompressionComplete.structured_prune(model, prune_ratio)


def compress_model(model, compression_config: Dict[str, Any]):
    """Backward-compatible alias for CompressionComplete.compress_model."""
    return CompressionComplete.compress_model(model, compression_config)

View File

@@ -1,21 +1,7 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_quantization/quantization_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/16_quantization/quantization_dev.ipynb.
# %% auto 0
__all__ = []
__all__ = ['QuantizationComplete', 'quantize_int8', 'dequantize_int8', 'quantize_model']
# %% ../../modules/source/16_quantization/quantization_dev.ipynb 3
import numpy as np
@@ -29,3 +15,94 @@ from ..core.layers import Linear
from ..core.activations import ReLU
print("✅ Quantization module imports complete")
# %% ../../modules/source/16_quantization/quantization_dev.ipynb 34
class QuantizationComplete:
    """
    Complete quantization system for milestone use.
    Provides INT8 quantization with calibration for 4× memory reduction.
    """

    @staticmethod
    def quantize_tensor(tensor: Tensor) -> Tuple[Tensor, float, int]:
        """Quantize an FP32 tensor to INT8 with an affine (scale, zero-point) map."""
        values = tensor.data
        lo = float(np.min(values))
        hi = float(np.max(values))
        # Degenerate range: all elements (nearly) identical — map to all zeros.
        if abs(hi - lo) < 1e-8:
            return Tensor(np.zeros_like(values, dtype=np.int8)), 1.0, 0
        # 255 distinct int8 levels cover [lo, hi].
        scale = (hi - lo) / 255.0
        # Zero-point places `lo` at -128; clamp so it stays a valid int8.
        zero_point = int(np.round(-128 - lo / scale))
        zero_point = int(np.clip(zero_point, -128, 127))
        q = np.round(values / scale + zero_point)
        q = np.clip(q, -128, 127).astype(np.int8)
        return Tensor(q), scale, zero_point

    @staticmethod
    def dequantize_tensor(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
        """Map INT8 values back to approximate FP32 via the affine parameters."""
        restored = (q_tensor.data.astype(np.float32) - zero_point) * scale
        return Tensor(restored)

    @staticmethod
    def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, any]:
        """
        Quantize all Linear layers in a model.
        Returns dictionary with quantization info and memory savings.
        """
        # NOTE(review): `any` in the return annotation is the builtin function,
        # not typing.Any — likely a typo; confirm Any is importable here.
        # NOTE(review): `calibration_data` is currently unused.
        layer_table = {}
        bytes_before = 0
        bytes_after = 0
        if hasattr(model, 'parameters'):
            for i, param in enumerate(model.parameters()):
                bytes_before += param.data.nbytes
                q_param, scale, zp = QuantizationComplete.quantize_tensor(param)
                bytes_after += q_param.data.nbytes
                layer_table[f'param_{i}'] = {
                    'quantized': q_param,
                    'scale': scale,
                    'zero_point': zp,
                    'original_shape': param.data.shape
                }
        return {
            'quantized_layers': layer_table,
            'original_size_mb': bytes_before / (1024 * 1024),
            'quantized_size_mb': bytes_after / (1024 * 1024),
            'compression_ratio': bytes_before / bytes_after if bytes_after > 0 else 1.0
        }

    @staticmethod
    def compare_models(original_model, quantized_info: Dict) -> Dict[str, float]:
        """Summarize memory usage before vs. after quantization."""
        return {
            'original_mb': quantized_info['original_size_mb'],
            'quantized_mb': quantized_info['quantized_size_mb'],
            'compression_ratio': quantized_info['compression_ratio'],
            'memory_saved_mb': quantized_info['original_size_mb'] - quantized_info['quantized_size_mb']
        }
# Convenience functions for backward compatibility
def quantize_int8(tensor: Tensor) -> Tuple[Tensor, float, int]:
    """Backward-compatible wrapper around QuantizationComplete.quantize_tensor."""
    return QuantizationComplete.quantize_tensor(tensor)


def dequantize_int8(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
    """Backward-compatible wrapper around QuantizationComplete.dequantize_tensor."""
    return QuantizationComplete.dequantize_tensor(q_tensor, scale, zero_point)


def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, any]:
    """Backward-compatible wrapper around QuantizationComplete.quantize_model."""
    return QuantizationComplete.quantize_model(model, calibration_data)

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_profiler/profiler_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/14_profiling/profiling_dev.ipynb.
# %% auto 0
__all__ = ['Profiler', 'quick_profile', 'analyze_weight_distribution']

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_embeddings/embeddings_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/11_embeddings/embeddings_dev.ipynb.
# %% auto 0
__all__ = ['Embedding', 'PositionalEncoding', 'EmbeddingLayer']
@@ -95,13 +81,10 @@ class Embedding:
# This is equivalent to one-hot multiplication but much more efficient
embedded = self.weight.data[indices.data.astype(int)]
# Create result tensor
# Create result tensor with gradient tracking
# Note: Gradient computation handled by autograd system (Module 05)
# The embedding lookup is differentiable through the weight matrix
result = Tensor(embedded, requires_grad=self.weight.requires_grad)
# Attach gradient function (students learned this in Module 05!)
if self.weight.requires_grad:
from tinytorch.core.autograd import EmbeddingBackward
result._grad_fn = EmbeddingBackward(self.weight, indices)
return result
@@ -336,10 +319,6 @@ class EmbeddingLayer:
return output
def __call__(self, tokens: Tensor) -> Tensor:
"""Allows the embedding layer to be called like a function."""
return self.forward(tokens)
def parameters(self) -> List[Tensor]:
"""Return all trainable parameters."""
params = self.token_embedding.parameters()

View File

@@ -1,19 +1,5 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_tokenization/tokenization_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/10_tokenization/tokenization_dev.ipynb.
# %% auto 0
__all__ = ['Tokenizer', 'CharTokenizer', 'BPETokenizer']
@@ -24,16 +10,6 @@ import json
import re
from collections import defaultdict, Counter
# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 3
import numpy as np
from typing import List, Dict, Tuple, Optional, Set
import json
import re
from collections import defaultdict, Counter
# Import only Module 01 (Tensor) - this module has minimal dependencies
from ..core.tensor import Tensor
# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 8
class Tokenizer:
"""