Regenerate tinytorch package from all module exports

- Run tito export --all to update all exported code
- Fix file permissions (chmod u+w) to allow export writes
- Update 12 modified files with latest module code
- Add 3 new files (tinygpt, acceleration, compression)
- All 21 modules successfully exported
This commit is contained in:
Vijay Janapa Reddi
2025-11-10 06:23:47 -05:00
parent fce6a6a01e
commit 059db9c88c
12 changed files with 1652 additions and 860 deletions

122
tinytorch/_modidx.py generated
View File

@@ -21,7 +21,37 @@ d = { 'settings': { 'branch': 'main',
'doc_host': 'https://tinytorch.github.io',
'git_url': 'https://github.com/tinytorch/TinyTorch/',
'lib_path': 'tinytorch'},
'syms': { 'tinytorch.benchmarking.benchmark': { 'tinytorch.benchmarking.benchmark.Benchmark': ( '19_benchmarking/benchmarking_dev.html#benchmark',
'syms': { 'tinytorch.applications.tinygpt': { 'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline': ( '20_capstone/capstone_dev.html#completetinygptpipeline',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.__init__': ( '20_capstone/capstone_dev.html#completetinygptpipeline.__init__',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.generate_text': ( '20_capstone/capstone_dev.html#completetinygptpipeline.generate_text',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.optimize_model': ( '20_capstone/capstone_dev.html#completetinygptpipeline.optimize_model',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.prepare_training_data': ( '20_capstone/capstone_dev.html#completetinygptpipeline.prepare_training_data',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.train': ( '20_capstone/capstone_dev.html#completetinygptpipeline.train',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.TinyGPT': ( '20_capstone/capstone_dev.html#tinygpt',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.TinyGPT.__init__': ( '20_capstone/capstone_dev.html#tinygpt.__init__',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.TinyGPTTrainer': ( '20_capstone/capstone_dev.html#tinygpttrainer',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.TinyGPTTrainer.__init__': ( '20_capstone/capstone_dev.html#tinygpttrainer.__init__',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.TinyGPTTrainer.prepare_batch': ( '20_capstone/capstone_dev.html#tinygpttrainer.prepare_batch',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.TinyGPTTrainer.train_step': ( '20_capstone/capstone_dev.html#tinygpttrainer.train_step',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.test_unit_complete_pipeline': ( '20_capstone/capstone_dev.html#test_unit_complete_pipeline',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.test_unit_tinygpt_init': ( '20_capstone/capstone_dev.html#test_unit_tinygpt_init',
'tinytorch/applications/tinygpt.py'),
'tinytorch.applications.tinygpt.test_unit_training_pipeline': ( '20_capstone/capstone_dev.html#test_unit_training_pipeline',
'tinytorch/applications/tinygpt.py')},
'tinytorch.benchmarking.benchmark': { 'tinytorch.benchmarking.benchmark.Benchmark': ( '19_benchmarking/benchmarking_dev.html#benchmark',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.Benchmark.__init__': ( '19_benchmarking/benchmarking_dev.html#benchmark.__init__',
'tinytorch/benchmarking/benchmark.py'),
@@ -59,8 +89,6 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.TinyMLPerf.run_standard_benchmark': ( '19_benchmarking/benchmarking_dev.html#tinymlperf.run_standard_benchmark',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.calculate_normalized_scores': ( '19_benchmarking/benchmarking_dev.html#calculate_normalized_scores',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.test_unit_benchmark': ( '19_benchmarking/benchmarking_dev.html#test_unit_benchmark',
'tinytorch/benchmarking/benchmark.py'),
'tinytorch.benchmarking.benchmark.test_unit_benchmark_suite': ( '19_benchmarking/benchmarking_dev.html#test_unit_benchmark_suite',
@@ -77,8 +105,6 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/competition/submit.py'),
'tinytorch.competition.submit.validate_installation': ( '20_competition/competition_dev.html#validate_installation',
'tinytorch/competition/submit.py'),
'tinytorch.competition.submit.validate_submission': ( '20_competition/competition_dev.html#validate_submission',
'tinytorch/competition/submit.py'),
'tinytorch.competition.submit.worked_example_optimization': ( '20_competition/competition_dev.html#worked_example_optimization',
'tinytorch/competition/submit.py')},
'tinytorch.core.activations': { 'tinytorch.core.activations.GELU': ( '02_activations/activations_dev.html#gelu',
@@ -315,11 +341,7 @@ d = { 'settings': { 'branch': 'main',
'tinytorch.core.training.Trainer.save_checkpoint': ( '07_training/training_dev.html#trainer.save_checkpoint',
'tinytorch/core/training.py'),
'tinytorch.core.training.Trainer.train_epoch': ( '07_training/training_dev.html#trainer.train_epoch',
'tinytorch/core/training.py'),
'tinytorch.core.training.load_checkpoint': ( '07_training/training_dev.html#load_checkpoint',
'tinytorch/core/training.py'),
'tinytorch.core.training.save_checkpoint': ( '07_training/training_dev.html#save_checkpoint',
'tinytorch/core/training.py')},
'tinytorch/core/training.py')},
'tinytorch.data.loader': { 'tinytorch.data.loader.DataLoader': ( '08_dataloader/dataloader_dev.html#dataloader',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.DataLoader.__init__': ( '08_dataloader/dataloader_dev.html#dataloader.__init__',
@@ -364,6 +386,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/generation/kv_cache.py')},
'tinytorch.models.transformer': { 'tinytorch.models.transformer.GPT': ( '13_transformers/transformers_dev.html#gpt',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT.__call__': ( '13_transformers/transformers_dev.html#gpt.__call__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT.__init__': ( '13_transformers/transformers_dev.html#gpt.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.GPT._create_causal_mask': ( '13_transformers/transformers_dev.html#gpt._create_causal_mask',
@@ -376,6 +400,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm': ( '13_transformers/transformers_dev.html#layernorm',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm.__call__': ( '13_transformers/transformers_dev.html#layernorm.__call__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm.__init__': ( '13_transformers/transformers_dev.html#layernorm.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.LayerNorm.forward': ( '13_transformers/transformers_dev.html#layernorm.forward',
@@ -384,6 +410,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP': ( '13_transformers/transformers_dev.html#mlp',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP.__call__': ( '13_transformers/transformers_dev.html#mlp.__call__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP.__init__': ( '13_transformers/transformers_dev.html#mlp.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.MLP.forward': ( '13_transformers/transformers_dev.html#mlp.forward',
@@ -392,32 +420,58 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock': ( '13_transformers/transformers_dev.html#transformerblock',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.__call__': ( '13_transformers/transformers_dev.html#transformerblock.__call__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.__init__': ( '13_transformers/transformers_dev.html#transformerblock.__init__',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.forward': ( '13_transformers/transformers_dev.html#transformerblock.forward',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer.TransformerBlock.parameters': ( '13_transformers/transformers_dev.html#transformerblock.parameters',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer._tensor_mean': ( '13_transformers/transformers_dev.html#_tensor_mean',
'tinytorch/models/transformer.py'),
'tinytorch.models.transformer._tensor_sqrt': ( '13_transformers/transformers_dev.html#_tensor_sqrt',
'tinytorch/models/transformer.py')},
'tinytorch.optimization.quantization': { 'tinytorch.optimization.quantization.QuantizationComplete': ( '17_quantization/quantization_dev.html#quantizationcomplete',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.QuantizationComplete.compare_models': ( '17_quantization/quantization_dev.html#quantizationcomplete.compare_models',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.QuantizationComplete.dequantize_tensor': ( '17_quantization/quantization_dev.html#quantizationcomplete.dequantize_tensor',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.QuantizationComplete.quantize_model': ( '17_quantization/quantization_dev.html#quantizationcomplete.quantize_model',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.QuantizationComplete.quantize_tensor': ( '17_quantization/quantization_dev.html#quantizationcomplete.quantize_tensor',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.dequantize_int8': ( '17_quantization/quantization_dev.html#dequantize_int8',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.quantize_int8': ( '17_quantization/quantization_dev.html#quantize_int8',
'tinytorch/optimization/quantization.py'),
'tinytorch.optimization.quantization.quantize_model': ( '17_quantization/quantization_dev.html#quantize_model',
'tinytorch/optimization/quantization.py')},
'tinytorch/models/transformer.py')},
'tinytorch.optimization.acceleration': {},
'tinytorch.optimization.compression': { 'tinytorch.optimization.compression.CompressionComplete': ( '17_compression/compression_dev.html#compressioncomplete',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.CompressionComplete.compress_model': ( '17_compression/compression_dev.html#compressioncomplete.compress_model',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.CompressionComplete.magnitude_prune': ( '17_compression/compression_dev.html#compressioncomplete.magnitude_prune',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.CompressionComplete.measure_sparsity': ( '17_compression/compression_dev.html#compressioncomplete.measure_sparsity',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.CompressionComplete.structured_prune': ( '17_compression/compression_dev.html#compressioncomplete.structured_prune',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.KnowledgeDistillation': ( '17_compression/compression_dev.html#knowledgedistillation',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.KnowledgeDistillation.__init__': ( '17_compression/compression_dev.html#knowledgedistillation.__init__',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.KnowledgeDistillation._cross_entropy': ( '17_compression/compression_dev.html#knowledgedistillation._cross_entropy',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.KnowledgeDistillation._kl_divergence': ( '17_compression/compression_dev.html#knowledgedistillation._kl_divergence',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.KnowledgeDistillation._softmax': ( '17_compression/compression_dev.html#knowledgedistillation._softmax',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.KnowledgeDistillation.distillation_loss': ( '17_compression/compression_dev.html#knowledgedistillation.distillation_loss',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Sequential': ( '17_compression/compression_dev.html#sequential',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Sequential.__call__': ( '17_compression/compression_dev.html#sequential.__call__',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Sequential.__init__': ( '17_compression/compression_dev.html#sequential.__init__',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Sequential.forward': ( '17_compression/compression_dev.html#sequential.forward',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.Sequential.parameters': ( '17_compression/compression_dev.html#sequential.parameters',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.compress_model': ( '17_compression/compression_dev.html#compress_model',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.magnitude_prune': ( '17_compression/compression_dev.html#magnitude_prune',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.measure_sparsity': ( '17_compression/compression_dev.html#measure_sparsity',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.structured_prune': ( '17_compression/compression_dev.html#structured_prune',
'tinytorch/optimization/compression.py'),
'tinytorch.optimization.compression.test_unit_knowledge_distillation': ( '17_compression/compression_dev.html#test_unit_knowledge_distillation',
'tinytorch/optimization/compression.py')},
'tinytorch.optimization.quantization': {},
'tinytorch.profiling.profiler': { 'tinytorch.profiling.profiler.Profiler': ( '14_profiling/profiling_dev.html#profiler',
'tinytorch/profiling/profiler.py'),
'tinytorch.profiling.profiler.Profiler.__init__': ( '14_profiling/profiling_dev.html#profiler.__init__',
@@ -442,6 +496,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/profiling/profiler.py')},
'tinytorch.text.embeddings': { 'tinytorch.text.embeddings.Embedding': ( '11_embeddings/embeddings_dev.html#embedding',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.__call__': ( '11_embeddings/embeddings_dev.html#embedding.__call__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.__init__': ( '11_embeddings/embeddings_dev.html#embedding.__init__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.Embedding.__repr__': ( '11_embeddings/embeddings_dev.html#embedding.__repr__',
@@ -452,6 +508,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer': ( '11_embeddings/embeddings_dev.html#embeddinglayer',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.__call__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__call__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.__init__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__init__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.EmbeddingLayer.__repr__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__repr__',
@@ -462,6 +520,8 @@ d = { 'settings': { 'branch': 'main',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding': ( '11_embeddings/embeddings_dev.html#positionalencoding',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.__call__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__call__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.__init__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__init__',
'tinytorch/text/embeddings.py'),
'tinytorch.text.embeddings.PositionalEncoding.__repr__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__repr__',

679
tinytorch/applications/tinygpt.py generated Normal file
View File

@@ -0,0 +1,679 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/20_capstone/capstone_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['TinyGPT', 'test_unit_tinygpt_init', 'TinyGPTTrainer', 'test_unit_training_pipeline', 'CompleteTinyGPTPipeline',
'test_unit_complete_pipeline']
# %% ../../modules/source/20_capstone/capstone_dev.ipynb 2
#| default_exp applications.tinygpt
#| export
# %% ../../modules/source/20_capstone/capstone_dev.ipynb 7
class TinyGPT:
    """
    Complete GPT implementation integrating all TinyTorch modules.

    This class demonstrates how framework components compose into real
    applications.

    Architecture:
    - Token Embeddings (Module 11)
    - Positional Encoding (Module 11)
    - Transformer Blocks (Module 13)
    - Output Linear Layer (embed_dim -> vocab_size)

    NOTE(review): other code in this file calls ``model.forward(...)``, but no
    ``forward`` method is defined on this class in the exported module.
    Confirm that the notebook cell defining it is marked for export.
    """

    def __init__(self, vocab_size: int, embed_dim: int = 128, num_layers: int = 4,
                 num_heads: int = 4, max_seq_len: int = 256, dropout: float = 0.1):
        """
        Initialize TinyGPT with production-inspired architecture.

        APPROACH:
        1. Create token embeddings (vocab_size x embed_dim)
        2. Create positional encoding (max_seq_len x embed_dim)
        3. Build transformer layers using TransformerBlock
        4. Add output projection layer
        5. Calculate and report parameter count

        Args:
            vocab_size: Number of distinct token ids.
            embed_dim: Width of the embeddings and transformer blocks.
            num_layers: Number of TransformerBlock instances to stack.
            num_heads: Attention heads per block.
            max_seq_len: Maximum sequence length given to the positional encoding.
            dropout: Rate handed to the Dropout layer.

        EXAMPLE:
        >>> model = TinyGPT(vocab_size=50, embed_dim=128, num_layers=4)
        >>> print(f"Parameters: {model.count_parameters():,}")
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len
        self.dropout = dropout

        # Token embeddings: convert token IDs to dense vectors
        self.token_embedding = Embedding(vocab_size, embed_dim)

        # Positional encoding: add position information
        self.positional_encoding = PositionalEncoding(max_seq_len, embed_dim)

        # Transformer layers: core processing
        self.transformer_blocks = [
            TransformerBlock(embed_dim, num_heads, mlp_ratio=4.0)
            for _ in range(num_layers)
        ]

        # Output projection: map back to vocabulary
        self.output_projection = Linear(embed_dim, vocab_size)

        # Dropout for regularization
        self.dropout_layer = Dropout(dropout)

        # Calculate parameter count for systems analysis
        self._param_count = self.count_parameters()
        print(f"🏗️ TinyGPT initialized: {self._param_count:,} parameters")
        print(f"📐 Architecture: {num_layers}L/{num_heads}H/{embed_dim}D")
        print(f"💾 Estimated memory: {self._param_count * 4 / 1024 / 1024:.1f}MB")
        ### END SOLUTION

    def parameters(self):
        """
        Return the parameter tensors of every sub-module as one flat list.

        The token embedding, each transformer block and the output projection
        are required to expose ``parameters()``. The positional encoding is
        included only when it exposes one.
        """
        params = []
        params.extend(self.token_embedding.parameters())

        # NOTE(review): presumably the positional encoding is learnable;
        # guarded because its interface is not visible from this module.
        positional_params = getattr(self.positional_encoding, 'parameters', None)
        if callable(positional_params):
            params.extend(positional_params())

        for block in self.transformer_blocks:
            params.extend(block.parameters())
        params.extend(self.output_projection.parameters())
        return params

    def count_parameters(self) -> int:
        """
        Return the total number of scalar values across all parameters.

        __init__ calls this method, so it must exist for construction to
        succeed.
        """
        total = 0
        for param in self.parameters():
            # assumes every parameter exposes a ``.shape`` tuple — TODO confirm
            size = 1
            for dim in param.shape:
                size *= dim
            total += size
        return int(total)
def test_unit_tinygpt_init():
    """🔬 Test TinyGPT initialization and parameter counting."""
    print("🔬 Unit Test: TinyGPT Initialization...")

    # A deliberately tiny configuration keeps the check fast.
    expected_layers = 2
    model = TinyGPT(vocab_size=50, embed_dim=64, num_layers=expected_layers,
                    num_heads=2, max_seq_len=128)

    # Every architectural component must be attached to the model.
    for component in ('token_embedding', 'positional_encoding',
                      'transformer_blocks', 'output_projection'):
        assert hasattr(model, component)
    assert len(model.transformer_blocks) == expected_layers

    # The parameter count must be positive and stay under the sanity bound
    # chosen for a small model.
    param_count = model.count_parameters()
    assert 0 < param_count < 1000000

    print(f"✅ Model created with {param_count:,} parameters")
    print("✅ TinyGPT initialization works correctly!")
# Run immediate test
# NOTE(review): this is a module-level call, so it executes (building a model
# and printing) every time this module is run or imported.
test_unit_tinygpt_init()
# %% ../../modules/source/20_capstone/capstone_dev.ipynb 10
class TinyGPTTrainer:
    """
    Complete training pipeline integrating optimizers, schedulers, and monitoring.

    Couples a TinyGPT model with an AdamW optimizer, a CrossEntropyLoss
    objective and a CosineSchedule, and offers helpers to turn raw text into
    (input, target) tensors and to run a single optimization step.
    """

    def __init__(self, model: TinyGPT, tokenizer: CharTokenizer,
                 learning_rate: float = 3e-4, weight_decay: float = 0.01):
        """
        Initialize trainer with model and optimization components.

        Args:
            model: Model whose sub-module parameters are optimized.
            tokenizer: Tokenizer used by prepare_batch() to encode text.
            learning_rate: Learning rate handed to AdamW; the schedule's
                minimum learning rate is one tenth of this value.
            weight_decay: Weight decay handed to AdamW.

        EXAMPLE:
        >>> model = TinyGPT(vocab_size=100)
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> trainer = TinyGPTTrainer(model, tokenizer)
        """
        ### BEGIN SOLUTION
        self.model = model
        self.tokenizer = tokenizer

        # Collect all trainable parameters
        all_params = []
        all_params.extend(model.token_embedding.parameters())

        # NOTE(review): the positional encoding was previously left out of the
        # optimizer. It is presumably learnable, so its parameters are added
        # when the module exposes parameters() — confirm against that module.
        positional = getattr(model, 'positional_encoding', None)
        positional_params = getattr(positional, 'parameters', None)
        if callable(positional_params):
            all_params.extend(positional_params())

        for block in model.transformer_blocks:
            all_params.extend(block.parameters())
        all_params.extend(model.output_projection.parameters())

        # Initialize optimizer (AdamW for transformers)
        self.optimizer = AdamW(
            params=all_params,
            lr=learning_rate,
            weight_decay=weight_decay,
            betas=(0.9, 0.95)  # Standard for language models
        )

        # Loss function for next token prediction
        self.loss_fn = CrossEntropyLoss()

        # Learning rate scheduler
        self.scheduler = CosineSchedule(
            optimizer=self.optimizer,
            max_epochs=100,  # Will adjust based on actual training
            min_lr=learning_rate * 0.1
        )

        # Training metrics
        self.training_history = {
            'losses': [],
            'perplexities': [],
            'learning_rates': [],
            'epoch': 0
        }

        print("🚀 Trainer initialized:")
        print(f" Optimizer: AdamW (lr={learning_rate}, wd={weight_decay})")
        print(f" Parameters: {len(all_params):,} tensors")
        print(f" Loss: CrossEntropyLoss")
        ### END SOLUTION

    def prepare_batch(self, text_batch: List[str], max_length: int = 128) -> Tuple[Tensor, Tensor]:
        """
        Convert text batch to input/target tensors for language modeling.

        LANGUAGE MODELING OBJECTIVE:
        - Input:  [token1, token2, token3, token4]
        - Target: [token2, token3, token4, token5]
        - Model predicts next token at each position

        Each text is encoded, truncated or right-padded with id 0, and paired
        with a target row holding the following token for every position. The
        final target is the real next token when the text is longer than
        ``max_length`` and the pad id otherwise. It never wraps around to the
        first token of the row.

        Args:
            text_batch: Texts to encode.
            max_length: Number of positions in every returned row.

        Returns:
            ``(input_ids, target_ids)``, both built from arrays of shape
            ``(len(text_batch), max_length)``.

        EXAMPLE:
        >>> inputs, targets = trainer.prepare_batch(["hello world", "ai is fun"])
        >>> print(inputs.shape, targets.shape)
        (2, 128) (2, 128)
        """
        ### BEGIN SOLUTION
        pad_id = 0  # id 0 doubles as the padding token
        input_rows = []
        target_rows = []
        for text in text_batch:
            # Copy the encoding so the tokenizer's own list is never mutated,
            # and keep one extra token so the last position can see its true
            # successor when the text is long enough.
            tokens = list(self.tokenizer.encode(text))[:max_length + 1]
            tokens += [pad_id] * (max_length + 1 - len(tokens))
            input_rows.append(tokens[:-1])
            target_rows.append(tokens[1:])

        # The explicit reshape keeps a 2-D layout even for an empty batch.
        batch_shape = (len(text_batch), max_length)
        input_ids = Tensor(np.array(input_rows).reshape(batch_shape))
        target_ids = Tensor(np.array(target_rows).reshape(batch_shape))
        return input_ids, target_ids
        ### END SOLUTION

    def train_step(self, input_ids: Tensor, target_ids: Tensor) -> float:
        """
        Single training step with forward, backward, and optimization.

        APPROACH:
        1. Zero gradients from previous step
        2. Forward pass to get logits
        3. Compute loss between flattened logits and targets
        4. Backward pass to compute gradients
        5. Optimizer step to update parameters
        6. Return loss value for monitoring

        MEMORY MANAGEMENT:
        With an Adam-style optimizer, training memory is roughly 4x model size:
        - 1x for parameters
        - 1x for gradients
        - 2x for optimizer state (first and second moment estimates)

        Args:
            input_ids: Token ids; the logits are unpacked as
                (batch, seq_len, vocab_size).
            target_ids: Next-token ids with one entry per input position,
                since it is reshaped to ``batch * seq_len``.

        Returns:
            The loss of this step as a Python float.

        EXAMPLE:
        >>> loss = trainer.train_step(input_ids, target_ids)
        >>> print(f"Training loss: {loss:.4f}")
        """
        ### BEGIN SOLUTION
        # Zero gradients from previous step
        self.optimizer.zero_grad()

        # Forward pass
        logits = self.model.forward(input_ids)  # (batch, seq_len, vocab_size)

        # Reshape for loss computation
        batch_size, seq_len, vocab_size = logits.shape
        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
        targets_flat = target_ids.reshape(batch_size * seq_len)

        # Compute loss
        loss = self.loss_fn.forward(logits_flat, targets_flat)

        # Backward pass
        loss.backward()

        # Optimizer step
        self.optimizer.step()

        # Return scalar loss for monitoring; loss.data may be an array-like
        # exposing item() or already a plain number.
        return float(loss.data.item() if hasattr(loss.data, 'item') else loss.data)
        ### END SOLUTION
def test_unit_training_pipeline():
    """🔬 Test training pipeline components."""
    print("🔬 Unit Test: Training Pipeline...")

    # Small model plus a trainer wired to a six-character tokenizer.
    alphabet = ['a', 'b', 'c', 'd', 'e', ' ']
    trainer = TinyGPTTrainer(
        TinyGPT(vocab_size=50, embed_dim=32, num_layers=2, num_heads=2),
        CharTokenizer(alphabet),
        learning_rate=1e-3,
    )

    # Batch preparation: both tensors must come back as (2, 8).
    input_ids, target_ids = trainer.prepare_batch(["hello", "world"], max_length=8)
    for batch in (input_ids, target_ids):
        assert batch.shape == (2, 8), f"Expected (2, 8), got {batch.shape}"

    # Two consecutive steps: the second one proves that gradients were
    # computed and applied without breaking the pipeline.
    initial_loss = trainer.train_step(input_ids, target_ids)
    assert initial_loss > 0, "Loss should be positive"
    second_loss = trainer.train_step(input_ids, target_ids)
    assert second_loss > 0, "Second loss should also be positive"

    print(f"✅ Batch preparation shape: {input_ids.shape}")
    print(f"✅ Initial loss: {initial_loss:.4f}")
    print(f"✅ Second loss: {second_loss:.4f}")
    print("✅ Training pipeline works correctly!")
# Run immediate test
# NOTE(review): this is a module-level call, so it executes (building a model
# and running two training steps) every time this module is run or imported.
test_unit_training_pipeline()
# %% ../../modules/source/20_capstone/capstone_dev.ipynb 14
class CompleteTinyGPTPipeline:
"""
End-to-end ML pipeline demonstrating integration of all 19 modules.
Pipeline stages:
1. Data preparation (Module 10: Tokenization)
2. Model creation (Modules 01-04, 11-13: Architecture)
3. Training setup (Modules 05-07: Optimization)
4. Training loop (Module 08: DataLoader)
5. Optimization (Modules 17-18: Quantization, Pruning)
6. Evaluation (Module 19: Benchmarking)
7. Generation (Module 14: KV Caching)
"""
def __init__(self, vocab_size: int = 100, embed_dim: int = 128,
             num_layers: int = 4, num_heads: int = 4):
    """
    Assemble the end-to-end pipeline: tokenizer, model, trainer, profiler
    and benchmark, plus the bookkeeping for training state.

    Steps performed:
    1. Record the architecture hyper-parameters on the instance
    2. Build a CharTokenizer over printable ASCII (codes 32 to 126)
    3. Build a TinyGPT with the given sizes and max_seq_len=256
    4. Build a TinyGPTTrainer with learning_rate=3e-4
    5. Build a Profiler and a Benchmark over the model
    6. Start with is_trained=False and an empty training_history
    7. Print a summary with the parameter count and a memory estimate
       (parameters * 4 bytes, reported in MB)

    EXAMPLE:
    >>> pipeline = CompleteTinyGPTPipeline(vocab_size=100, embed_dim=128,
    ...                                    num_layers=4, num_heads=4)
    >>> pipeline.is_trained
    False
    >>> len(pipeline.training_history)
    0
    """
    ### BEGIN SOLUTION
    # Remember the architecture hyper-parameters.
    self.vocab_size, self.embed_dim = vocab_size, embed_dim
    self.num_layers, self.num_heads = num_layers, num_heads

    # Stage 1: character-level tokenizer over printable ASCII.
    printable_ascii = list(map(chr, range(32, 127)))
    self.tokenizer = CharTokenizer(printable_ascii)

    # Stage 2: the model itself, with a fixed 256-token context.
    architecture = {
        'vocab_size': vocab_size,
        'embed_dim': embed_dim,
        'num_layers': num_layers,
        'num_heads': num_heads,
    }
    self.model = TinyGPT(max_seq_len=256, **architecture)

    # Stage 3: training orchestration.
    self.trainer = TinyGPTTrainer(self.model, self.tokenizer, learning_rate=3e-4)

    # Stage 4: performance tooling. The benchmark starts with no datasets.
    self.profiler = Profiler()
    tracked_metrics = ["perplexity", "latency"]
    self.benchmark = Benchmark([self.model], [], tracked_metrics)

    # Pipeline state.
    self.is_trained = False
    self.training_history = []

    print("🏗️ Complete TinyGPT Pipeline Initialized")
    print(f" Model: {self.model.count_parameters():,} parameters")
    print(f" Memory: {self.model.count_parameters() * 4 / 1024 / 1024:.1f}MB")
    ### END SOLUTION
def prepare_training_data(self, text_corpus: List[str], batch_size: int = 8,
                          max_len: int = 32) -> DataLoader:
    """
    Prepare training data using DataLoader (Module 08).

    Every text is tokenized and expanded into one example per prefix: the
    input row is the prefix, left-padded with id 0 (or cut to its last
    ``max_len`` tokens). The target row is the same window shifted by one
    token, so its last entry is the token that follows the prefix.

    Targets have one entry per input position because
    TinyGPTTrainer.train_step flattens them to ``batch * seq_len``. A single
    scalar target per example cannot be reshaped that way.

    Args:
        text_corpus: Texts to learn from; texts with fewer than two tokens
            are skipped.
        batch_size: Batch size handed to the DataLoader.
        max_len: Context window, i.e. the length of every input/target row.

    Returns:
        A shuffling DataLoader over a TensorDataset of (inputs, targets).

    Raises:
        ValueError: If ``max_len`` is smaller than 1.

    EXAMPLE:
    >>> pipeline = CompleteTinyGPTPipeline()
    >>> corpus = ["hello world", "ai is amazing"]
    >>> dataloader = pipeline.prepare_training_data(corpus, batch_size=2)
    """
    ### BEGIN SOLUTION
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    pad_id = 0  # id 0 doubles as the padding token
    left_padding = [pad_id] * max_len

    # Tokenize and prepare training pairs
    input_sequences = []
    target_sequences = []
    for text in text_corpus:
        tokens = list(self.tokenizer.encode(text))
        if len(tokens) < 2:
            continue  # Skip very short texts

        # One example per prefix tokens[:end]; tokens[end] is what follows.
        for end in range(1, len(tokens)):
            padded = left_padding + tokens[:end + 1]
            # Input: last max_len entries before the next token.
            input_sequences.append(padded[-max_len - 1:-1])
            # Target: same window moved one step forward.
            target_sequences.append(padded[-max_len:])

    # Convert to tensors; the reshape keeps a 2-D layout with zero examples.
    example_shape = (len(input_sequences), max_len)
    inputs = Tensor(np.array(input_sequences).reshape(example_shape))
    targets = Tensor(np.array(target_sequences).reshape(example_shape))

    # Create dataset and dataloader
    dataset = TensorDataset(inputs, targets)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    print(f"📚 Training data prepared: {len(dataset)} examples, {len(dataloader)} batches")
    return dataloader
    ### END SOLUTION
def train(self, dataloader: DataLoader, epochs: int = 10) -> Dict[str, List[float]]:
    """
    Complete training loop with monitoring.

    For every epoch, each batch from ``dataloader`` is passed to
    ``self.trainer.train_step``; the returned loss is collected and every
    tenth batch is logged. After each epoch the mean loss and its
    exponential (perplexity) are recorded and
    ``self.trainer.scheduler.step()`` is called.

    Args:
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        epochs: Number of passes over ``dataloader``.

    Returns:
        History dict with keys ``'losses'``, ``'perplexities'`` and
        ``'epochs'``, one entry per completed epoch. The dict is empty
        (all lists empty) when ``epochs`` is 0 or negative.

    Side effects:
        When at least one epoch ran, sets ``self.is_trained = True`` and
        stores the history in ``self.training_history``. When no epoch ran,
        the pipeline state is left untouched.

    EXAMPLE:
    >>> history = pipeline.train(dataloader, epochs=5)
    >>> print(f"Final loss: {history['losses'][-1]:.4f}")
    Final loss: 1.2345
    """
    ### BEGIN SOLUTION
    history = {'losses': [], 'perplexities': [], 'epochs': []}
    print(f"🚀 Starting training for {epochs} epochs...")

    for epoch in range(epochs):
        epoch_losses = []
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            # Training step
            loss = self.trainer.train_step(inputs, targets)
            epoch_losses.append(loss)

            # Log progress
            if batch_idx % 10 == 0:
                perplexity = np.exp(loss)
                print(f" Epoch {epoch+1}/{epochs}, Batch {batch_idx}: "
                      f"Loss={loss:.4f}, PPL={perplexity:.2f}")

        # Epoch summary
        avg_loss = np.mean(epoch_losses)
        avg_perplexity = np.exp(avg_loss)
        history['losses'].append(avg_loss)
        history['perplexities'].append(avg_perplexity)
        history['epochs'].append(epoch + 1)

        # Update learning rate
        self.trainer.scheduler.step()
        print(f"✅ Epoch {epoch+1} complete: Loss={avg_loss:.4f}, PPL={avg_perplexity:.2f}")

    # With zero epochs there is no final perplexity to report; indexing
    # history['perplexities'][-1] would raise IndexError, and the model must
    # not be flagged as trained.
    if not history['epochs']:
        print("⚠️ No epochs were run; model was not trained.")
        return history

    self.is_trained = True
    self.training_history = history
    print(f"🎉 Training complete! Final perplexity: {history['perplexities'][-1]:.2f}")
    return history
    ### END SOLUTION
def optimize_model(self, quantize: bool = True, prune_sparsity: float = 0.0):
    """
    Apply optimization techniques (Modules 17-18).

    NOTE: the optimizations are *simulated*. The model itself is not
    modified; this method only estimates and prints the memory footprint
    that INT8 quantization and magnitude pruning would produce, based on
    ``self.model.count_parameters()`` and 4 bytes per FP32 parameter.

    Args:
        quantize: If True, account for INT8 quantization (4× smaller).
        prune_sparsity: Fraction of weights removed by pruning. Values
            <= 0 disable pruning; 1.0 means every weight is removed.

    Returns:
        None. Results are reported via ``print``.

    Raises:
        ValueError: If ``prune_sparsity`` is greater than 1.0 (it would
            yield a negative model size).

    EXAMPLE:
    >>> pipeline.optimize_model(quantize=True, prune_sparsity=0.5)
    Model optimized: 75% size reduction
    """
    ### BEGIN SOLUTION
    if prune_sparsity > 1.0:
        raise ValueError(
            f"prune_sparsity must be at most 1.0, got {prune_sparsity}"
        )

    original_params = self.model.count_parameters()
    original_memory = original_params * 4 / (1024 * 1024)  # FP32 bytes -> MB
    optimizations_applied = []
    size_reduction = 1.0  # fraction of the original size that remains

    if quantize:
        # Apply quantization (simulated)
        # In real implementation, would use quantize_model()
        size_reduction *= 0.25  # INT8 vs FP32: 4× smaller
        optimizations_applied.append("INT8 quantization (4× memory reduction)")
        print(" Applied INT8 quantization")

    if prune_sparsity > 0:
        # Apply pruning (simulated)
        # In real implementation, would use magnitude_prune()
        remaining_weights = 1 - prune_sparsity
        size_reduction *= remaining_weights
        optimizations_applied.append(
            f"{prune_sparsity:.0%} pruning ({remaining_weights:.0%} weights remain)"
        )
        print(f" Applied {prune_sparsity:.0%} magnitude pruning")

    final_memory = original_memory * size_reduction
    # Derive the factor from the remaining fraction rather than from the
    # memory figures: 100% pruning or an empty model would otherwise
    # divide by zero.
    reduction_factor = 1.0 / size_reduction if size_reduction > 0 else float("inf")

    print("🔧 Model optimization complete:")
    print(f" Original: {original_memory:.1f}MB")
    print(f" Optimized: {final_memory:.1f}MB")
    print(f" Reduction: {reduction_factor:.1f}× smaller")
    print(f" Applied: {', '.join(optimizations_applied)}")
    ### END SOLUTION
def generate_text(self, prompt: str, max_tokens: int = 50) -> str:
    """
    Generate text using the trained model.

    The prompt is encoded with ``self.tokenizer``, wrapped in a batch of
    one, extended by ``self.model.generate`` (temperature 0.8, KV cache
    enabled), and the first row of the result is decoded back to text.
    A warning is printed when the pipeline has not been trained yet, but
    generation still proceeds.

    Args:
        prompt: Text to continue.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        The decoded text for all tokens in the generated sequence.

    EXAMPLE:
    >>> text = pipeline.generate_text("Hello", max_tokens=10)
    >>> print(f"Generated: {text}")
    Generated: Hello world this is AI
    """
    ### BEGIN SOLUTION
    if not self.is_trained:
        print("⚠️ Model not trained yet. Generating with random weights.")

    # Batch of one: the model is called with a 2-D [[token ids]] tensor
    batched_prompt = Tensor([self.tokenizer.encode(prompt)])

    output = self.model.generate(
        batched_prompt,
        max_new_tokens=max_tokens,
        temperature=0.8,
        use_cache=True
    )

    # Only the first (and only) sequence of the batch is decoded
    token_ids = output.data[0].tolist()
    return self.tokenizer.decode(token_ids)
    ### END SOLUTION
def test_unit_complete_pipeline():
    """🔬 Test complete pipeline integration."""
    print("🔬 Unit Test: Complete Pipeline Integration...")

    # A deliberately tiny configuration keeps the end-to-end run short
    pipeline = CompleteTinyGPTPipeline(vocab_size=50, embed_dim=32, num_layers=2)

    # Stage 1: data preparation
    sample_texts = ["hello world", "ai is fun", "machine learning"]
    loader = pipeline.prepare_training_data(sample_texts, batch_size=2)
    assert len(loader) > 0, "DataLoader should have batches"

    # Stage 2: a single training epoch
    run_log = pipeline.train(loader, epochs=1)
    assert 'losses' in run_log, "History should contain losses"
    epoch_count = len(run_log['losses'])
    assert epoch_count == 1, "Should have one epoch of losses"

    # Stage 3: optimization report
    pipeline.optimize_model(quantize=True, prune_sparsity=0.5)

    # Stage 4: generation
    sample = pipeline.generate_text("hello", max_tokens=5)
    assert isinstance(sample, str), "Generated output should be string"
    assert len(sample) > 0, "Generated text should not be empty"

    print("✅ Pipeline stages completed successfully")
    print(f"✅ Training history: {epoch_count} epochs")
    print(f"✅ Generated text: '{sample[:20]}...'")
    print("✅ Complete pipeline integration works!")
# Run immediate test
test_unit_complete_pipeline()

View File

@@ -16,7 +16,7 @@
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['OlympicEvent', 'Benchmark', 'test_unit_benchmark', 'BenchmarkSuite', 'test_unit_benchmark_suite', 'TinyMLPerf',
'test_unit_tinymlperf', 'calculate_normalized_scores']
'test_unit_tinymlperf']
# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 0
#| default_exp benchmarking.benchmark
@@ -72,7 +72,7 @@ class Benchmark:
self.measurement_runs = measurement_runs
self.results = {}
# Use Profiler from Module 15 for measurements
# Use Profiler from Module 14 for measurements
self.profiler = Profiler()
# System information for metadata
@@ -1024,53 +1024,3 @@ def test_unit_tinymlperf():
print("✅ TinyMLPerf works correctly!")
test_unit_tinymlperf()
# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 24
def calculate_normalized_scores(baseline_results: dict,
                                optimized_results: dict) -> dict:
    """
    Calculate normalized performance metrics for fair competition comparison.

    This function converts absolute measurements into relative improvements,
    enabling fair comparison across different hardware platforms.

    Args:
        baseline_results: Dict with keys: 'latency', 'memory', 'accuracy'
        optimized_results: Dict with same keys as baseline_results

    Returns:
        Dict with normalized metrics:
        - speedup: baseline latency / optimized latency (higher is better)
        - compression_ratio: baseline memory / optimized memory (higher is better)
        - accuracy_delta: optimized accuracy - baseline accuracy
          (negative means degradation)
        - efficiency_score: speedup * compression_ratio, divided by
          (1 + accuracy lost) when accuracy degraded
        - baseline / optimized: shallow copies of the input dicts

    Raises:
        KeyError: If a required key is missing from either dict.
        ValueError: If the optimized latency or memory is not positive, since
            the ratios would be undefined or meaningless.

    Example:
        >>> baseline = {'latency': 100.0, 'memory': 12.0, 'accuracy': 0.89}
        >>> optimized = {'latency': 40.0, 'memory': 3.0, 'accuracy': 0.87}
        >>> scores = calculate_normalized_scores(baseline, optimized)
        >>> print(f"Speedup: {scores['speedup']:.2f}x")
        Speedup: 2.50x
    """
    # Both optimized measurements are used as divisors below.
    for key in ('latency', 'memory'):
        if optimized_results[key] <= 0:
            raise ValueError(
                f"optimized_results['{key}'] must be positive, "
                f"got {optimized_results[key]!r}"
            )

    # Calculate speedup (higher is better)
    speedup = baseline_results['latency'] / optimized_results['latency']

    # Calculate compression ratio (higher is better)
    compression_ratio = baseline_results['memory'] / optimized_results['memory']

    # Calculate accuracy delta (closer to 0 is better, negative means degradation)
    accuracy_delta = optimized_results['accuracy'] - baseline_results['accuracy']

    # Penalize accuracy loss only: the divisor grows by the amount of accuracy
    # lost and stays at 1.0 when accuracy is unchanged or improved.
    accuracy_penalty = 1.0 - accuracy_delta if accuracy_delta < 0 else 1.0
    efficiency_score = (speedup * compression_ratio) / accuracy_penalty

    return {
        'speedup': speedup,
        'compression_ratio': compression_ratio,
        'accuracy_delta': accuracy_delta,
        'efficiency_score': efficiency_score,
        'baseline': baseline_results.copy(),
        'optimized': optimized_results.copy()
    }

View File

@@ -16,7 +16,7 @@
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['validate_installation', 'load_baseline_model', 'generate_baseline', 'worked_example_optimization',
'optimize_for_competition', 'validate_submission', 'generate_submission']
'optimize_for_competition', 'generate_submission']
# %% ../../modules/source/20_competition/competition_dev.ipynb 4
import numpy as np
@@ -24,8 +24,6 @@ import json
import time
from pathlib import Path
from typing import Dict, List, Tuple, Any, Optional
from ..benchmarking.benchmark import Benchmark, calculate_normalized_scores
from ..profiling.profiler import Profiler
def validate_installation() -> Dict[str, bool]:
"""
@@ -364,24 +362,31 @@ def worked_example_optimization():
return submission
# %% ../../modules/source/20_competition/competition_dev.ipynb 10
def optimize_for_competition(baseline_model, event: str = "all_around", division: str = "closed"):
def optimize_for_competition(baseline_model, event: str = "all_around"):
"""
🏅 YOUR COMPETITION ENTRY - IMPLEMENT YOUR STRATEGY HERE!
This is where you apply optimization techniques from Modules 14-18.
Available techniques:
- Module 14: KV Caching (for transformers) - enable_kv_cache()
- Module 16: Acceleration (vectorization, fusion)
- Module 17: Quantization (INT8, INT4) - quantize_model()
- Module 18: Compression (pruning) - magnitude_prune()
Args:
baseline_model: Starting model (use for Closed, optional for Open)
event: Category you're competing in
baseline_model: The unoptimized model
event: Which Olympic event you're competing in
- "latency_sprint": Minimize latency
- "memory_challenge": Minimize memory
- "accuracy_contest": Maximize accuracy
- "all_around": Best balance
- "extreme_push": Most aggressive
division: "closed" or "open" - which track you chose
Returns:
Your optimized model
🔒 CLOSED DIVISION Example:
Example:
from tinytorch.optimization.quantization import quantize_model
from tinytorch.optimization.compression import magnitude_prune
@@ -389,15 +394,6 @@ def optimize_for_competition(baseline_model, event: str = "all_around", division
optimized = quantize_model(optimized, bits=8)
optimized = magnitude_prune(optimized, sparsity=0.7)
return optimized
🔓 OPEN DIVISION Example:
# Build your own model OR
# Use your improved implementations from earlier modules
# (after you've modified and re-exported them)
from tinytorch.models import YourCustomArchitecture
optimized = YourCustomArchitecture()
return optimized
"""
print(f"🏅 YOUR OPTIMIZATION STRATEGY FOR: {event}")
@@ -442,201 +438,74 @@ def optimize_for_competition(baseline_model, event: str = "all_around", division
return optimized_model
#| export
def validate_submission(submission: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate competition submission with sanity checks.

    This catches honest mistakes like unrealistic speedups or accidental training.
    Honor code system - we trust but verify basic reasonableness.

    Six checks are run and each one contributes exactly one message to
    ``checks`` (passed), ``warnings`` (suspicious) or ``errors`` (invalid):
    speedup, compression ratio, accuracy change, GitHub repo, required
    fields, and documented techniques. Missing ``normalized_scores`` values
    default to neutral ones (speedup/compression 1.0, accuracy delta 0.0).

    Args:
        submission: Submission dictionary to validate

    Returns:
        Dict with keys ``valid`` (True when there are no errors),
        ``checks``, ``warnings`` and ``errors`` (lists of messages).
    """
    checks = []
    warnings = []
    errors = []

    # Extract metrics
    normalized = submission.get("normalized_scores", {})
    speedup = normalized.get("speedup", 1.0)
    compression = normalized.get("compression_ratio", 1.0)
    accuracy_delta = normalized.get("accuracy_delta", 0.0)

    # Check 1: Speedup is reasonable (not claiming impossible gains)
    if speedup > 50:
        errors.append(f"❌ Speedup {speedup:.1f}x seems unrealistic (>50x)")
    elif speedup > 20:
        warnings.append(f"⚠️ Speedup {speedup:.1f}x is very high - please verify measurements")
    else:
        checks.append(f"✅ Speedup {speedup:.2f}x is reasonable")

    # Check 2: Compression is reasonable
    if compression > 32:
        errors.append(f"❌ Compression {compression:.1f}x seems unrealistic (>32x)")
    elif compression > 16:
        warnings.append(f"⚠️ Compression {compression:.1f}x is very high - please verify")
    else:
        checks.append(f"✅ Compression {compression:.2f}x is reasonable")

    # Check 3: Accuracy didn't improve (Closed Division rule - no training allowed!)
    division = submission.get("division", "closed")
    if division == "closed" and accuracy_delta > 1.0:
        errors.append(f"❌ Accuracy improved by {accuracy_delta:.1f}pp - did you accidentally train the model?")
    elif accuracy_delta > 0.5:
        warnings.append(f"⚠️ Accuracy improved by {accuracy_delta:.1f}pp - verify no training occurred")
    else:
        checks.append(f"✅ Accuracy change {accuracy_delta:+.2f}pp is reasonable")

    # Check 4: GitHub repo provided (empty string and None both count as missing)
    github_repo = submission.get("github_repo", "")
    if not github_repo:
        warnings.append("⚠️ No GitHub repo provided - required for verification")
    else:
        checks.append(f"✅ GitHub repo provided: {github_repo}")

    # Check 5: Required fields present
    required_fields = ("division", "event", "athlete_name", "baseline", "optimized", "normalized_scores")
    missing = [field for field in required_fields if field not in submission]
    if missing:
        errors.append(f"❌ Missing required fields: {', '.join(missing)}")
    else:
        checks.append("✅ All required fields present")

    # Check 6: Techniques documented
    techniques = submission.get("techniques_applied", [])
    if isinstance(techniques, str):
        # A bare string would otherwise be joined character by character
        techniques = [techniques]
    if not techniques or "TODO" in str(techniques):
        warnings.append("⚠️ No optimization techniques listed")
    else:
        techniques = list(techniques)
        preview = ", ".join(str(item) for item in techniques[:3])
        # Only hint at truncation when entries were actually left out
        suffix = "..." if len(techniques) > 3 else ""
        checks.append(f"✅ Techniques documented: {preview}{suffix}")

    return {
        "valid": len(errors) == 0,
        "checks": checks,
        "warnings": warnings,
        "errors": errors
    }
#| export
def generate_submission(baseline_model, optimized_model,
division: str = "closed",
event: str = "all_around",
athlete_name: str = "YourName",
github_repo: str = "",
techniques: List[str] = None) -> Dict[str, Any]:
"""
Generate standardized TinyMLPerf competition submission with normalized scoring.
Generate standardized competition submission.
Args:
baseline_model: Original unoptimized model
optimized_model: Your optimized model
division: "closed" or "open"
event: Competition category (latency_sprint, memory_challenge, all_around, etc.)
athlete_name: Your name for submission
github_repo: GitHub repository URL for code verification
techniques: List of optimization techniques applied
event: Olympic event name
athlete_name: Your name for leaderboard
techniques: List of techniques applied
Returns:
Submission dictionary (will be saved as JSON)
"""
print("📤 Generating TinyMLPerf Competition Submission...")
print("📤 Generating Competition Submission...")
print("=" * 70)
# Get baseline metrics
baseline_metrics = generate_baseline(quick=True)
# Benchmark optimized model
# For demonstration, estimate optimized metrics
# In real competition, this would benchmark the actual optimized model
print("🔬 Benchmarking optimized model...")
# Use Profiler and Benchmark from Module 19
profiler = Profiler()
# For demonstration, we'll use placeholder metrics
# In real competition, students would measure their actual optimized model
# Placeholder: Students' actual optimizations would be measured here
optimized_metrics = {
"model": getattr(optimized_model, 'name', 'Optimized_Model'),
"accuracy": 84.0, # Would be measured with actual test set
"latency_ms": 28.0, # Would be measured with profiler
"memory_mb": 4.0, # Would be measured with profiler
"parameters": 2000000, # Would be counted
"model": "Your_Optimized_Model",
"accuracy": 84.0, # Measured
"latency_ms": 28.0, # Measured
"memory_mb": 4.0, # Measured
"parameters": 2000000, # Measured
}
# Calculate normalized scores using Module 19's function
baseline_for_norm = {
"latency": baseline_metrics["latency_ms"],
"memory": baseline_metrics["memory_mb"],
"accuracy": baseline_metrics["accuracy"]
# Calculate improvements
improvements = {
"accuracy_change": optimized_metrics["accuracy"] - baseline_metrics["accuracy"],
"latency_speedup": baseline_metrics["latency_ms"] / optimized_metrics["latency_ms"],
"memory_reduction": baseline_metrics["memory_mb"] / optimized_metrics["memory_mb"],
}
optimized_for_norm = {
"latency": optimized_metrics["latency_ms"],
"memory": optimized_metrics["memory_mb"],
"accuracy": optimized_metrics["accuracy"]
}
normalized_scores = calculate_normalized_scores(baseline_for_norm, optimized_for_norm)
# Create submission with all required fields
# Create submission
submission = {
"division": division,
"event": event,
"athlete_name": athlete_name,
"github_repo": github_repo,
"baseline": baseline_metrics,
"optimized": optimized_metrics,
"normalized_scores": {
"speedup": normalized_scores["speedup"],
"compression_ratio": normalized_scores["compression_ratio"],
"accuracy_delta": normalized_scores["accuracy_delta"],
"efficiency_score": normalized_scores["efficiency_score"]
},
"techniques_applied": techniques or ["TODO: Document your optimization techniques"],
"improvements": improvements,
"techniques_applied": techniques or ["TODO: List your techniques"],
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"tinytorch_version": "0.1.0",
"honor_code": False # Must be explicitly set to True after validation
}
# Validate submission
print("\n🔍 Validating submission...")
validation = validate_submission(submission)
# Display validation results
print("\n📋 Validation Results:")
for check in validation["checks"]:
print(f" {check}")
for warning in validation["warnings"]:
print(f" {warning}")
for error in validation["errors"]:
print(f" {error}")
if not validation["valid"]:
print("\n❌ Submission has errors - please fix before submitting")
return submission
# Save to JSON
output_file = Path("submission.json")
with open(output_file, "w") as f:
json.dump(submission, f, indent=2)
print(f"\n✅ Submission saved to: {output_file}")
print(f"✅ Submission saved to: {output_file}")
print()
print("📊 Your Normalized Scores (MLPerf-style):")
print(f" Division: {division.upper()}")
print(f" Event: {event.replace('_', ' ').title()}")
print(f" Speedup: {normalized_scores['speedup']:.2f}x faster")
print(f" Compression: {normalized_scores['compression_ratio']:.2f}x smaller 💾")
print(f" Accuracy: {optimized_metrics['accuracy']:.1f}% (Δ {normalized_scores['accuracy_delta']:+.2f}pp)")
print(f" Efficiency: {normalized_scores['efficiency_score']:.2f}")
print()
print("📤 Next Steps:")
print(" 1. Verify all metrics are correct")
print(" 2. Push your code to GitHub (if not done)")
print(" 3. Run: tito submit submission.json")
print(" (This will validate and prepare final submission)")
print("📊 Your Results:")
print(f" Event: {event}")
print(f" Accuracy: {optimized_metrics['accuracy']:.1f}% (Δ {improvements['accuracy_change']:+.1f}pp)")
print(f" Latency: {optimized_metrics['latency_ms']:.1f}ms ({improvements['latency_speedup']:.2f}x faster)")
print(f" Memory: {optimized_metrics['memory_mb']:.2f}MB ({improvements['memory_reduction']:.2f}x smaller)")
print()
print("📤 Upload submission.json to TorchPerf Olympics platform!")
print("=" * 70)
return submission

View File

@@ -15,9 +15,9 @@
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward', 'SumBackward',
'ReshapeBackward', 'EmbeddingBackward', 'SqrtBackward', 'MeanBackward', 'ReLUBackward', 'GELUBackward',
'SigmoidBackward', 'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
__all__ = ['Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward', 'TransposeBackward',
'PermuteBackward', 'EmbeddingBackward', 'ReshapeBackward', 'SumBackward', 'ReLUBackward', 'SigmoidBackward',
'SoftmaxBackward', 'GELUBackward', 'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 1
import numpy as np
@@ -164,92 +164,66 @@ class MulBackward(Function):
return grad_a, grad_b
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 12
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 13
class SubBackward(Function):
    """
    Gradient computation for tensor subtraction.

    **Mathematical Rule:** If z = a - b, then ∂z/∂a = 1 and ∂z/∂b = -1

    **Key Insight:** Subtraction passes gradient unchanged to first input,
    but negates it for second input (because of the minus sign).

    **Applications:** Used in residual connections, computing differences in losses.
    """

    def apply(self, grad_output):
        """
        Compute gradients for subtraction.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple of (grad_a, grad_b) where grad_b is negated. An entry is
            None when the corresponding input is not a Tensor or does not
            require gradients.

        **Mathematical Foundation:**
        - ∂(a-b)/∂a = 1 → grad_a = grad_output
        - ∂(a-b)/∂b = -1 → grad_b = -grad_output
        """
        a, b = self.saved_tensors
        grad_a = grad_b = None

        # Gradient for first input: grad_output (unchanged)
        if isinstance(a, Tensor) and a.requires_grad:
            grad_a = grad_output  # ∂(a-b)/∂a = 1

        # Gradient for second input: -grad_output (negated)
        if isinstance(b, Tensor) and b.requires_grad:
            grad_b = -grad_output  # ∂(a-b)/∂b = -1 (note the negative!)

        return grad_a, grad_b
#| export
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 15
class DivBackward(Function):
    """
    Gradient computation for tensor division.

    **Mathematical Rule:** If z = a / b, then:
    - ∂z/∂a = 1/b
    - ∂z/∂b = -a/b²

    **Key Insight:** Division gradient for numerator is 1/denominator,
    for denominator is -numerator/denominator².

    **Applications:** Used in normalization (LayerNorm, BatchNorm), loss functions.
    """

    def apply(self, grad_output):
        """
        Compute gradients for division using quotient rule.

        Either operand may be a plain (non-Tensor) value; its raw value is
        then used directly and no gradient is produced for it.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple of (grad_a, grad_b). An entry is None when the
            corresponding input is not a Tensor or does not require gradients.

        **Mathematical Foundation:**
        - ∂(a/b)/∂a = 1/b → grad_a = grad_output / b
        - ∂(a/b)/∂b = -a/b² → grad_b = -grad_output * a / b²
        """
        a, b = self.saved_tensors
        grad_a = grad_b = None

        # Unwrap operands once so both branches treat Tensor and plain
        # operands the same way (the numerator may be a scalar, e.g. 1 / x).
        a_data = a.data if isinstance(a, Tensor) else a
        b_data = b.data if isinstance(b, Tensor) else b

        # Gradient for numerator: grad_output / b
        if isinstance(a, Tensor) and a.requires_grad:
            grad_a = grad_output / b_data  # ∂(a/b)/∂a = 1/b

        # Gradient for denominator: -grad_output * a / b²
        if isinstance(b, Tensor) and b.requires_grad:
            grad_b = -grad_output * a_data / (b_data ** 2)  # ∂(a/b)/∂b = -a/b²

        return grad_a, grad_b
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 14
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 17
class MatmulBackward(Function):
"""
Gradient computation for matrix multiplication.
@@ -269,8 +243,6 @@ class MatmulBackward(Function):
"""
Compute gradients for matrix multiplication.
Handles both 2D matrices and 3D batched tensors (for transformers).
Args:
grad_output: Gradient flowing backward from output
@@ -278,40 +250,244 @@ class MatmulBackward(Function):
Tuple of (grad_a, grad_b) for the two matrix inputs
**Mathematical Foundation:**
- 2D: ∂(A@B)/∂A = grad_output @ B.T
- 3D: ∂(A@B)/∂A = grad_output @ swapaxes(B, -2, -1)
- ∂(A@B)/∂A = grad_output @ B.T
- ∂(A@B)/∂B = A.T @ grad_output
**Why Both Cases:**
- 2D: Traditional matrix multiplication (Linear layers)
- 3D: Batched operations (Transformers: batch, seq, embed)
**Batched Operation:** For 3D+ tensors, we transpose only the last two
dimensions using np.swapaxes, preserving batch dimensions.
"""
a, b = self.saved_tensors
grad_a = grad_b = None
# Detect if we're dealing with batched (3D) or regular (2D) tensors
is_batched = len(grad_output.shape) == 3
# Gradient for first input: grad_output @ b.T (or batched equivalent)
# Gradient for first input: grad_output @ b.T
if isinstance(a, Tensor) and a.requires_grad:
if is_batched:
# Batched: use matmul and swapaxes for transpose
grad_a = np.matmul(grad_output, np.swapaxes(b.data, -2, -1))
# For batched tensors, transpose only last two dims
if b.data.ndim >= 2:
b_T = np.swapaxes(b.data, -2, -1)
else:
# 2D: use dot and .T for transpose
grad_a = np.dot(grad_output, b.data.T)
b_T = b.data.T
grad_a = np.matmul(grad_output, b_T)
# Gradient for second input: a.T @ grad_output (or batched equivalent)
# Gradient for second input: a.T @ grad_output
if isinstance(b, Tensor) and b.requires_grad:
if is_batched:
# Batched: use matmul and swapaxes for transpose
grad_b = np.matmul(np.swapaxes(a.data, -2, -1), grad_output)
# For batched tensors, transpose only last two dims
if a.data.ndim >= 2:
a_T = np.swapaxes(a.data, -2, -1)
else:
# 2D: use dot and .T for transpose
grad_b = np.dot(a.data.T, grad_output)
a_T = a.data.T
grad_b = np.matmul(a_T, grad_output)
return grad_a, grad_b
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 16
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 18
class TransposeBackward(Function):
    """
    Gradient computation for transpose operation.

    **Mathematical Rule:** If Y = X.T, then:
    - ∂L/∂X = (∂L/∂Y).T

    **Key Insight:** Transposing only rearranges elements, so the backward
    pass rearranges the incoming gradient with the very same axis swap
    (a swap of two axes is its own inverse).

    **Applications:** Used in attention (K.T for scores), weight gradients (W.T),
    and any operation that needs to swap matrix dimensions.
    """

    def __init__(self, tensor, dim0, dim1):
        """
        Args:
            tensor: Input tensor
            dim0: First dimension to swap (None for default)
            dim1: Second dimension to swap (None for default)
        """
        super().__init__(tensor)
        self.dim0 = dim0
        self.dim1 = dim1

    def apply(self, grad_output):
        """
        Compute gradient for transpose.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with the gradient for the input tensor, or
            ``(None,)`` when the input does not require gradients.

        **Mathematical Foundation:**
        - ∂L/∂X = grad_output with the same two axes swapped back
        """
        (source,) = self.saved_tensors
        if not (isinstance(source, Tensor) and source.requires_grad):
            return (None,)

        default_swap = self.dim0 is None and self.dim1 is None

        # Default transpose of a 0-D/1-D tensor is the identity
        if default_swap and grad_output.ndim < 2:
            return (grad_output.copy(),)

        # Default means "last two dimensions"; otherwise use the stored pair
        first, second = (-2, -1) if default_swap else (self.dim0, self.dim1)
        order = list(range(grad_output.ndim))
        order[first], order[second] = order[second], order[first]
        return (np.transpose(grad_output, order),)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 19
class PermuteBackward(Function):
    """
    Gradient computation for arbitrary axis permutation (general transpose).

    **Mathematical Rule:** If Y = X.permute(axes), then:
    - ∂L/∂X = (∂L/∂Y).permute(inverse_axes)

    **Example:** If axes = (0, 2, 1, 3), the inverse is (0, 2, 1, 3) (self-inverse).
    More generally, if axes = (2, 0, 1), the inverse is (1, 2, 0).

    **Key Insight:** Output axis i is taken from input axis axes[i]. To undo
    that, the inverse must send axis axes[i] back to position i, i.e.
    inverse_axes[axes[i]] = i.

    **Applications:** Multi-head attention uses (0, 2, 1, 3) to rearrange heads.
    """

    def __init__(self, tensor, axes):
        """
        Args:
            tensor: Input tensor
            axes: Tuple of axis indices defining the permutation. Negative
                indices count from the last axis.
        """
        super().__init__(tensor)
        self.axes = axes
        # Negative indices must be mapped to their non-negative equivalents
        # first: argsort on raw values would order e.g. (0, -1, 1) by value
        # and produce the wrong inverse.
        rank = len(axes)
        normalized = [int(axis) + rank if int(axis) < 0 else int(axis) for axis in axes]
        # Compute inverse permutation: if axes[i] = j, then inverse_axes[j] = i
        self.inverse_axes = tuple(int(i) for i in np.argsort(normalized))

    def apply(self, grad_output):
        """
        Compute gradient for permutation.

        The gradient is permuted back using the inverse permutation.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with the gradient for the input tensor, or
            ``(None,)`` when the input does not require gradients.

        **Mathematical Foundation:**
        - ∂L/∂X = grad_output.permute(inverse_axes)
        """
        x, = self.saved_tensors
        grad_x = None
        if isinstance(x, Tensor) and x.requires_grad:
            # Permute gradient back to original axis order
            grad_x = np.transpose(grad_output, self.inverse_axes)
        return (grad_x,)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 20
class EmbeddingBackward(Function):
    """
    Gradient computation for embedding lookup operation.

    **Mathematical Rule:** If Y = Embedding[indices], then:
    - ∂L/∂Embedding[i] = sum of all output gradients whose index == i

    **Key Insight:** The forward lookup gathers rows; the backward pass
    scatters gradients back onto those rows, summing where a row was
    looked up more than once.

    **Applications:** Word embeddings, positional embeddings, token embeddings
    in transformers.
    """

    def __init__(self, weight, indices):
        """
        Args:
            weight: Embedding weight matrix
            indices: Indices used for lookup (its ``.data`` array is read
                during the backward pass)
        """
        super().__init__(weight)
        self.indices = indices

    def apply(self, grad_output):
        """
        Compute gradient for embedding lookup.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with the gradient for the weight tensor, or
            ``(None,)`` when the weight does not require gradients.

        **Mathematical Foundation:**
        - Gradients are scattered to the rows selected by ``indices``
        - Multiple indices can point to the same embedding → gradients accumulate
        """
        (table,) = self.saved_tensors
        if not (isinstance(table, Tensor) and table.requires_grad):
            return (None,)

        # One row index per looked-up position, one gradient row per index
        row_ids = self.indices.data.astype(int).flatten()
        row_grads = grad_output.reshape(-1, grad_output.shape[-1])

        # np.add.at is unbuffered, so repeated row ids are summed rather
        # than overwritten
        accumulated = np.zeros_like(table.data)
        np.add.at(accumulated, row_ids, row_grads)
        return (accumulated,)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 21
class ReshapeBackward(Function):
    """
    Gradient computation for reshape operation.

    **Mathematical Rule:** If Y = X.reshape(new_shape), then:
    - ∂L/∂X = (∂L/∂Y).reshape(X.shape)

    **Key Insight:** Reshape keeps every element and only changes the
    layout, so the gradient is simply laid out in the original shape again.

    **Applications:** Flattening tensors for linear layers, reshaping
    between convolutional and dense layers.
    """

    def __init__(self, tensor, original_shape):
        """
        Args:
            tensor: Input tensor
            original_shape: Shape before reshape
        """
        super().__init__(tensor)
        self.original_shape = original_shape

    def apply(self, grad_output):
        """
        Compute gradient for reshape.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with the gradient for the input tensor, or
            ``(None,)`` when the input does not require gradients.

        **Mathematical Foundation:**
        - ∂L/∂X = grad_output.reshape(original_shape)
        """
        (source,) = self.saved_tensors
        if not (isinstance(source, Tensor) and source.requires_grad):
            return (None,)
        # Undo the forward reshape using the shape recorded at construction
        return (grad_output.reshape(self.original_shape),)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23
class SumBackward(Function):
"""
Gradient computation for tensor sum.
@@ -345,186 +521,7 @@ class SumBackward(Function):
return np.ones_like(tensor.data) * grad_output,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 17
class ReshapeBackward(Function):
    """
    Gradient computation for tensor reshape.

    **Mathematical Rule:** If z = reshape(a, new_shape), then the gradient
    for a is reshape(grad_z, old_shape)

    **Key Insight:** Reshape leaves values untouched and only changes their
    arrangement, so gradients flow back by restoring the original shape.

    **Applications:** Used in transformers (flattening for loss), CNNs, and
    anywhere tensor dimensions need to be rearranged.
    """

    def apply(self, grad_output):
        """
        Compute gradients for reshape operation.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple containing the gradient for the input tensor,
            or ``(None,)`` when the input does not require gradients.

        **Mathematical Foundation:**
        - grad_input = reshape(grad_output, original_shape)
        """
        (source,) = self.saved_tensors
        # The target shape is read from the saved input itself
        target_shape = source.shape
        if not (isinstance(source, Tensor) and source.requires_grad):
            return (None,)
        return (np.reshape(grad_output, target_shape),)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 18
class EmbeddingBackward(Function):
    """
    Gradient computation for embedding lookup.

    **Mathematical Rule:** If z = embedding[indices], gradients accumulate at indexed positions.

    **Key Insight:** Several indices may select the same embedding vector,
    so gradients must be summed (not overwritten) at each position.

    **Applications:** Used in NLP transformers, language models, and any discrete input.
    """

    def apply(self, grad_output):
        """
        Compute gradients for embedding lookup.

        Args:
            grad_output: Gradient flowing backward from output; it is
                flattened to rows of width ``weight.shape[-1]``

        Returns:
            Two-element tuple ``(grad_weight, None)``: the gradient for the
            embedding weight matrix and no gradient for the indices. Both
            entries are None when the weight does not require gradients.

        **Mathematical Foundation:**
        - Embedding is a lookup: output[i] = weight[indices[i]]
        - Gradients scatter back to indexed positions: grad_weight[indices[i]] += grad_output[i]
        - Must accumulate because multiple positions can use same embedding
        """
        table, lookup = self.saved_tensors
        if not (isinstance(table, Tensor) and table.requires_grad):
            return None, None

        row_ids = lookup.data.astype(int).flatten()
        row_grads = grad_output.reshape((-1, table.shape[-1]))

        # np.add.at accumulates values at repeated indices
        accumulated = np.zeros_like(table.data)
        np.add.at(accumulated, row_ids, row_grads)
        return accumulated, None
#| export
class SqrtBackward(Function):
    """
    Backward rule for element-wise square root.

    **Mathematical Rule:** If z = sqrt(x), then dz/dx = 1 / (2 * sqrt(x)),
    which equals 1 / (2 * z).

    **Key Insight:** The forward output is reused, so sqrt is not
    recomputed during the backward pass.
    """

    def apply(self, grad_output):
        """
        Scale the upstream gradient by 1 / (2 * output).

        Args:
            grad_output: Gradient flowing backward from the sqrt output

        Returns:
            One-element tuple with the input gradient, or (None,) when the
            saved input is not a Tensor that requires gradients.
        """
        (source,) = self.saved_tensors
        # `saved_output` is attached by the forward op (the sqrt result).
        root = self.saved_output

        if not (isinstance(source, Tensor) and source.requires_grad):
            return (None,)

        return (grad_output / (2.0 * root.data),)
#| export
class MeanBackward(Function):
    """
    Gradient computation for mean reduction.

    **Mathematical Rule:** If z = mean(x), then ∂z/∂x_i = 1 / N for all i

    **Key Insight:** Mean distributes gradient equally to all input elements.

    **Applications:** Used in loss functions, normalization (LayerNorm, BatchNorm).
    """

    def apply(self, grad_output):
        """
        Compute gradients for mean reduction.

        Reads `self.axis` and `self.keepdims`, which the forward op attaches
        to this node, to work out how many elements were averaged and which
        dimensions were removed.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with the gradient broadcast to the input's
            shape, or (None,) when the saved input is not a Tensor that
            requires gradients.

        **Mathematical Foundation:**
        - mean reduces by averaging, so gradient is distributed equally
        - Each input element contributes 1/N to the output
        - Gradient: grad_output / N, broadcasted to input shape
        """
        x, = self.saved_tensors
        axis = self.axis
        keepdims = self.keepdims

        if not (isinstance(x, Tensor) and x.requires_grad):
            return None,

        # Number of elements that were averaged, and which axes were reduced.
        if axis is None:
            N = x.size
            reduced_axes = []
        elif isinstance(axis, (int, np.integer)):
            N = x.shape[axis]
            reduced_axes = [axis]
        else:
            N = np.prod([x.shape[ax] for ax in axis])
            reduced_axes = list(axis)

        # Distribute gradient equally: each element gets grad_output / N
        grad_x = grad_output / N

        if not keepdims and reduced_axes:
            # Re-insert the reduced dimensions so the result can broadcast.
            # Axes are converted to non-negative positions first: inserting
            # negative axes one at a time in sorted order would place them
            # relative to the still-too-small array and end up in the
            # wrong spot (e.g. axis=(-2, -1)).
            ndim = len(x.shape)
            positions = sorted(ax + ndim if ax < 0 else ax for ax in reduced_axes)
            for position in positions:
                grad_x = np.expand_dims(grad_x, axis=position)

        # Broadcast to match input shape
        grad_x = np.broadcast_to(grad_x, x.shape)

        return grad_x,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28
class ReLUBackward(Function):
"""
Gradient computation for ReLU activation.
@@ -547,48 +544,7 @@ class ReLUBackward(Function):
return grad_output * relu_grad,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 24
class GELUBackward(Function):
    """
    Backward rule for GELU using the sigmoid approximation.

    **Approximation:** f(x) = x * sigmoid(1.702 * x)

    **Derivative:** f'(x) = s + 1.702 * x * s * (1 - s), where
    s = sigmoid(1.702 * x).
    """

    def apply(self, grad_output):
        """
        Multiply the upstream gradient by the approximate GELU derivative.

        Args:
            grad_output: Gradient flowing backward from the GELU output

        Returns:
            One-element tuple with the input gradient, or (None,) when the
            saved input is not a Tensor that requires gradients.
        """
        (source,) = self.saved_tensors

        if not (isinstance(source, Tensor) and source.requires_grad):
            return (None,)

        values = source.data
        # s = sigmoid(1.702 * x)
        gate = 1.0 / (1.0 + np.exp(-1.702 * values))
        # f'(x) = s + 1.702 * x * s * (1 - s)
        local_slope = gate + 1.702 * values * gate * (1 - gate)

        return (grad_output * local_slope,)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 25
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29
class SigmoidBackward(Function):
"""
Gradient computation for sigmoid activation.
@@ -618,7 +574,101 @@ class SigmoidBackward(Function):
return grad_output * sigmoid_grad,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 26
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 30
class SoftmaxBackward(Function):
    """
    Backward rule for softmax.

    Softmax: softmax(x)[i] = exp(x[i]) / sum(exp(x))

    Jacobian: ∂softmax[i]/∂x[j] = softmax[i] * (δ[i,j] - softmax[j])

    Vector-Jacobian product used here:
        grad_x = softmax * (grad_y - sum(grad_y * softmax))

    **Key Insight:** Because of the normalization, every input's gradient
    depends on all softmax outputs along the softmax dimension.
    """

    def __init__(self, input_tensor, output_tensor, dim=-1):
        """
        Record the input, the forward result and the softmax dimension.

        Args:
            input_tensor: Original input to softmax
            output_tensor: Output of softmax; its `.data` is kept for the
                backward pass
            dim: Dimension along which softmax was applied
        """
        super().__init__(input_tensor)
        self.dim = dim
        self.output_data = output_tensor.data

    def apply(self, grad_output):
        """
        Compute the softmax input gradient.

        Args:
            grad_output: Gradient flowing backward from the softmax output

        Returns:
            One-element tuple with
            softmax * (grad_output - sum(grad_output * softmax)), the sum
            taken along `self.dim` with keepdims=True; (None,) when the
            saved input is not a Tensor that requires gradients.
        """
        (source,) = self.saved_tensors

        if not (isinstance(source, Tensor) and source.requires_grad):
            return (None,)

        probs = self.output_data
        # Inner product of the upstream gradient with the probabilities,
        # kept as a broadcastable column along the softmax dimension.
        weighted_total = np.sum(grad_output * probs, axis=self.dim, keepdims=True)

        return (probs * (grad_output - weighted_total),)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 31
class GELUBackward(Function):
    """
    Backward rule for GELU using the tanh approximation.

    GELU: f(x) = x * Φ(x), where Φ is the CDF of the standard normal.
    Approximation:
        gelu(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))

    **Key Insight:** GELU is smooth and has non-zero gradients for
    negative inputs.
    """

    def __init__(self, input_tensor):
        """Save the GELU input for the backward pass."""
        super().__init__(input_tensor)

    def apply(self, grad_output):
        """
        Multiply the upstream gradient by the derivative of the tanh
        approximation.

        With u = sqrt(2/π) * (x + 0.044715 * x³):
            d gelu/dx ≈ 0.5 * (1 + tanh(u)) + 0.5 * x * sech²(u) * du/dx
            du/dx = sqrt(2/π) * (1 + 0.134145 * x²)   (0.134145 = 3 * 0.044715)

        Args:
            grad_output: Gradient flowing backward from the GELU output

        Returns:
            One-element tuple with the input gradient, or (None,) when the
            saved input is not a Tensor that requires gradients.
        """
        (source,) = self.saved_tensors

        if not (isinstance(source, Tensor) and source.requires_grad):
            return (None,)

        x = source.data
        scale = np.sqrt(2.0 / np.pi)

        inner = scale * (x + 0.044715 * x ** 3)
        tanh_inner = np.tanh(inner)
        # sech²(u) = 1 - tanh²(u)
        sech_sq = 1 - tanh_inner ** 2
        inner_slope = scale * (1 + 0.134145 * x ** 2)

        local_slope = 0.5 * (1 + tanh_inner) + 0.5 * x * sech_sq * inner_slope
        return (grad_output * local_slope,)
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 32
class MSEBackward(Function):
"""
Gradient computation for Mean Squared Error Loss.
@@ -644,7 +694,7 @@ class MSEBackward(Function):
return grad * grad_output,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 27
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 33
class BCEBackward(Function):
"""
Gradient computation for Binary Cross-Entropy Loss.
@@ -674,7 +724,7 @@ class BCEBackward(Function):
return grad * grad_output,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 34
class CrossEntropyBackward(Function):
"""
Gradient computation for Cross-Entropy Loss.
@@ -719,7 +769,7 @@ class CrossEntropyBackward(Function):
return grad * grad_output,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 35
def enable_autograd():
"""
Enable gradient tracking for all Tensor operations.
@@ -758,8 +808,10 @@ def enable_autograd():
_original_add = Tensor.__add__
_original_sub = Tensor.__sub__
_original_mul = Tensor.__mul__
_original_truediv = Tensor.__truediv__
_original_div = Tensor.__truediv__
_original_matmul = Tensor.matmul if hasattr(Tensor, 'matmul') else None
_original_transpose = Tensor.transpose if hasattr(Tensor, 'transpose') else None
_original_reshape = Tensor.reshape if hasattr(Tensor, 'reshape') else None
# Enhanced operations that track gradients
def tracked_add(self, other):
@@ -806,6 +858,76 @@ def enable_autograd():
return result
def tracked_matmul(self, other):
    """
    Matrix multiplication with gradient tracking.

    Calls the original `Tensor.matmul` captured by the enclosing function
    when one exists; otherwise multiplies the raw arrays directly. When
    either operand requires gradients, the result is marked as requiring
    gradients and a MatmulBackward node is attached.

    Args:
        other: Right-hand operand; must expose `.data` and `.requires_grad`

    Returns:
        The product tensor.
    """
    if _original_matmul:
        result = _original_matmul(self, other)
    else:
        # Fallback if matmul doesn't exist. np.matmul (not np.dot) is used
        # so 3D+ operands are multiplied batch-wise, consistent with
        # Tensor.matmul.
        result = Tensor(np.matmul(self.data, other.data))

    # Track gradient if needed
    if self.requires_grad or other.requires_grad:
        result.requires_grad = True
        result._grad_fn = MatmulBackward(self, other)

    return result
def tracked_transpose(self, dim0=None, dim1=None):
    """
    Transpose with gradient tracking.

    Delegates to the original `Tensor.transpose` captured by the enclosing
    function when one exists. Otherwise it builds the axis permutation
    itself: with no dims given the last two axes are swapped (tensors with
    fewer than two axes are left unchanged); with dims given, `dim0` and
    `dim1` are swapped.

    Args:
        dim0: First axis to swap, or None
        dim1: Second axis to swap, or None

    Returns:
        The transposed tensor, with a TransposeBackward node attached when
        the input requires gradients.
    """
    if _original_transpose:
        result = _original_transpose(self, dim0, dim1)
    else:
        # Fallback if transpose doesn't exist: start from the identity
        # permutation and swap the requested pair of axes.
        order = list(range(len(self.shape)))
        if dim0 is None and dim1 is None:
            if len(order) >= 2:
                order[-2], order[-1] = order[-1], order[-2]
        else:
            order[dim0], order[dim1] = order[dim1], order[dim0]
        result = Tensor(np.transpose(self.data, order))

    if not self.requires_grad:
        return result

    result.requires_grad = True
    result._grad_fn = TransposeBackward(self, dim0, dim1)
    return result
def tracked_reshape(self, *shape):
    """
    Reshape with gradient tracking.

    Delegates to the original `Tensor.reshape` captured by the enclosing
    function when one exists, otherwise reshapes the raw array directly.

    Args:
        *shape: Target shape, forwarded unchanged to the reshape call

    Returns:
        The reshaped tensor. When the input requires gradients, a
        ReshapeBackward node holding the input and its pre-reshape shape
        is attached.
    """
    # Captured up front so the backward node knows the shape to restore.
    shape_before = self.shape

    if _original_reshape:
        reshaped = _original_reshape(self, *shape)
    else:
        # Fallback if reshape doesn't exist
        reshaped = Tensor(self.data.reshape(*shape))

    if not self.requires_grad:
        return reshaped

    reshaped.requires_grad = True
    reshaped._grad_fn = ReshapeBackward(self, shape_before)
    return reshaped
def tracked_sub(self, other):
"""
Subtraction with gradient tracking.
@@ -827,7 +949,7 @@ def enable_autograd():
return result
def tracked_truediv(self, other):
def tracked_div(self, other):
"""
Division with gradient tracking.
@@ -839,7 +961,7 @@ def enable_autograd():
other = Tensor(other)
# Call original operation
result = _original_truediv(self, other)
result = _original_div(self, other)
# Track gradient if needed
if self.requires_grad or other.requires_grad:
@@ -848,26 +970,6 @@ def enable_autograd():
return result
def tracked_matmul(self, other):
    """
    Matrix multiplication with gradient tracking.

    Calls the original `Tensor.matmul` captured by the enclosing function
    when one exists; otherwise multiplies the raw arrays directly. When
    either operand requires gradients, the result is marked as requiring
    gradients and a MatmulBackward node is attached.

    Args:
        other: Right-hand operand; must expose `.data` and `.requires_grad`

    Returns:
        The product tensor.
    """
    if _original_matmul:
        result = _original_matmul(self, other)
    else:
        # Fallback if matmul doesn't exist. np.matmul (not np.dot) is used
        # so 3D+ operands are multiplied batch-wise, consistent with
        # Tensor.matmul.
        result = Tensor(np.matmul(self.data, other.data))

    # Track gradient if needed
    if self.requires_grad or other.requires_grad:
        result.requires_grad = True
        result._grad_fn = MatmulBackward(self, other)

    return result
def sum_op(self, axis=None, keepdims=False):
"""
Sum operation with gradient tracking.
@@ -958,20 +1060,23 @@ def enable_autograd():
Tensor.__add__ = tracked_add
Tensor.__sub__ = tracked_sub
Tensor.__mul__ = tracked_mul
Tensor.__truediv__ = tracked_truediv
Tensor.__truediv__ = tracked_div
Tensor.matmul = tracked_matmul
Tensor.transpose = tracked_transpose
Tensor.reshape = tracked_reshape
Tensor.sum = sum_op
Tensor.backward = backward
Tensor.zero_grad = zero_grad
# Patch activations and losses to track gradients
try:
from tinytorch.core.activations import Sigmoid, ReLU, GELU
from tinytorch.core.activations import Sigmoid, ReLU, Softmax, GELU
from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss, CrossEntropyLoss
# Store original methods
_original_sigmoid_forward = Sigmoid.forward
_original_relu_forward = ReLU.forward
_original_softmax_forward = Softmax.forward
_original_gelu_forward = GELU.forward
_original_bce_forward = BinaryCrossEntropyLoss.forward
_original_mse_forward = MSELoss.forward
@@ -999,13 +1104,24 @@ def enable_autograd():
return result
def tracked_softmax_forward(self, x, dim=-1):
    """
    Softmax forward pass with gradient tracking.

    Runs the original `Softmax.forward` captured by the enclosing function
    and, when the input requires gradients, attaches a SoftmaxBackward
    node built from the input, the result and `dim`.
    """
    probs = _original_softmax_forward(self, x, dim=dim)

    if not x.requires_grad:
        return probs

    probs.requires_grad = True
    probs._grad_fn = SoftmaxBackward(x, probs, dim)
    return probs
def tracked_gelu_forward(self, x):
"""GELU with gradient tracking."""
# GELU approximation: x * sigmoid(1.702 * x)
sigmoid_part = 1.0 / (1.0 + np.exp(-1.702 * x.data))
result_data = x.data * sigmoid_part
result = Tensor(result_data)
# Call original forward to get result
result = _original_gelu_forward(self, x)
# Attach the correct gradient function
if x.requires_grad:
result.requires_grad = True
result._grad_fn = GELUBackward(x)
@@ -1071,6 +1187,7 @@ def enable_autograd():
# Install patched methods
Sigmoid.forward = tracked_sigmoid_forward
ReLU.forward = tracked_relu_forward
Softmax.forward = tracked_softmax_forward
GELU.forward = tracked_gelu_forward
BinaryCrossEntropyLoss.forward = tracked_bce_forward
MSELoss.forward = tracked_mse_forward

View File

@@ -113,21 +113,10 @@ class Tensor:
### BEGIN SOLUTION
if isinstance(other, Tensor):
# Tensor + Tensor: let NumPy handle broadcasting
result_data = self.data + other.data
return Tensor(self.data + other.data)
else:
# Tensor + scalar: NumPy broadcasts automatically
result_data = self.data + other
# Create new tensor with result
result = Tensor(result_data)
# Preserve gradient tracking if either operand requires gradients
if hasattr(self, 'requires_grad') and hasattr(other, 'requires_grad'):
result.requires_grad = self.requires_grad or (isinstance(other, Tensor) and other.requires_grad)
elif hasattr(self, 'requires_grad'):
result.requires_grad = self.requires_grad
return result
return Tensor(self.data + other)
### END SOLUTION
# nbgrader={"grade": false, "grade_id": "more-arithmetic", "solution": true}
@@ -137,10 +126,12 @@ class Tensor:
Common use: Centering data (x - mean), computing differences for loss functions.
"""
### BEGIN SOLUTION
if isinstance(other, Tensor):
return Tensor(self.data - other.data)
else:
return Tensor(self.data - other)
### END SOLUTION
def __mul__(self, other):
"""
@@ -149,10 +140,12 @@ class Tensor:
Common use: Scaling features, applying masks, gating mechanisms in neural networks.
Note: This is * operator, not @ (which will be matrix multiplication).
"""
### BEGIN SOLUTION
if isinstance(other, Tensor):
return Tensor(self.data * other.data)
else:
return Tensor(self.data * other)
### END SOLUTION
def __truediv__(self, other):
"""
@@ -160,10 +153,12 @@ class Tensor:
Common use: Normalization (x / std), converting counts to probabilities.
"""
### BEGIN SOLUTION
if isinstance(other, Tensor):
return Tensor(self.data / other.data)
else:
return Tensor(self.data / other)
### END SOLUTION
# nbgrader={"grade": false, "grade_id": "matmul-impl", "solution": true}
def matmul(self, other):
@@ -232,7 +227,8 @@ class Tensor:
)
# Perform optimized matrix multiplication
result_data = np.dot(self.data, other.data)
# Use np.matmul (not np.dot) for proper batched matrix multiplication with 3D+ tensors
result_data = np.matmul(self.data, other.data)
return Tensor(result_data)
### END SOLUTION
@@ -304,16 +300,8 @@ class Tensor:
# Reshape the data (NumPy handles the memory layout efficiently)
reshaped_data = np.reshape(self.data, new_shape)
# Create output tensor preserving gradient tracking
# Preserve gradient tracking from the original tensor (important for autograd!)
result = Tensor(reshaped_data, requires_grad=self.requires_grad)
# Set up backward function for autograd
if self.requires_grad:
from tinytorch.core.autograd import ReshapeBackward
result._grad_fn = ReshapeBackward()
result._grad_fn.saved_tensors = (self,)
return result
### END SOLUTION
@@ -380,7 +368,9 @@ class Tensor:
axes[dim0], axes[dim1] = axes[dim1], axes[dim0]
transposed_data = np.transpose(self.data, axes)
return Tensor(transposed_data)
# Preserve requires_grad for gradient tracking (Module 05 will add _grad_fn)
result = Tensor(transposed_data, requires_grad=self.requires_grad if hasattr(self, 'requires_grad') else False)
return result
### END SOLUTION
# nbgrader={"grade": false, "grade_id": "reduction-ops", "solution": true}

View File

@@ -15,7 +15,7 @@
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['CosineSchedule', 'save_checkpoint', 'load_checkpoint', 'Trainer']
__all__ = ['CosineSchedule', 'Trainer']
# %% ../../modules/source/07_training/training_dev.ipynb 1
import numpy as np
@@ -72,90 +72,6 @@ class CosineSchedule:
### END SOLUTION
# %% ../../modules/source/07_training/training_dev.ipynb 14
def save_checkpoint(checkpoint_dict: Dict[str, Any], path: str):
    """
    Pickle a checkpoint dictionary to disk.

    Low-level helper for custom training loops that want to persist only
    what they need (model parameters, config, metadata). Missing parent
    directories of `path` are created first, and a confirmation line is
    printed once the file has been written.

    Args:
        checkpoint_dict: Dictionary to serialize
        path: Destination file path

    EXAMPLE:
    >>> checkpoint = {
    ...     'config': {'embed_dim': 32, 'num_layers': 2},
    ...     'metadata': {'final_loss': 0.089, 'training_steps': 5000}
    ... }
    >>> save_checkpoint(checkpoint, 'checkpoints/model.pkl')
    ✓ Checkpoint saved: checkpoints/model.pkl
    """
    ### BEGIN SOLUTION
    # Make sure the target directory exists before opening the file.
    folder = Path(path).parent
    folder.mkdir(parents=True, exist_ok=True)

    with open(path, 'wb') as handle:
        pickle.dump(checkpoint_dict, handle)

    print(f"✓ Checkpoint saved: {path}")
    ### END SOLUTION
# %% ../../modules/source/07_training/training_dev.ipynb 15
def load_checkpoint(path: str) -> Dict[str, Any]:
    """
    Read back a checkpoint dictionary written with pickle.

    Companion to save_checkpoint(). Prints a confirmation line after the
    file has been deserialized.

    NOTE: pickle.load can execute arbitrary code embedded in the file, so
    only load checkpoints from sources you trust.

    Args:
        path: File path of the pickled checkpoint

    Returns:
        The deserialized checkpoint object.

    EXAMPLE:
    >>> checkpoint = load_checkpoint('checkpoints/model.pkl')
    ✓ Checkpoint loaded: checkpoints/model.pkl
    >>> print(checkpoint['metadata']['final_loss'])
    0.089
    """
    ### BEGIN SOLUTION
    with open(path, 'rb') as handle:
        restored = pickle.load(handle)

    print(f"✓ Checkpoint loaded: {path}")
    return restored
    ### END SOLUTION
# %% ../../modules/source/07_training/training_dev.ipynb 19
class Trainer:
"""
Complete training orchestrator for neural networks.
@@ -330,11 +246,6 @@ class Trainer:
def save_checkpoint(self, path: str):
"""
Save complete training state for resumption.
This high-level method saves everything needed to resume training:
model parameters, optimizer state, scheduler state, and training history.
Uses the low-level save_checkpoint() function internally.
Args:
path: File path to save checkpoint
@@ -349,23 +260,19 @@ class Trainer:
'training_mode': self.training_mode
}
# Use the standalone save_checkpoint function
save_checkpoint(checkpoint, path)
Path(path).parent.mkdir(parents=True, exist_ok=True)
with open(path, 'wb') as f:
pickle.dump(checkpoint, f)
def load_checkpoint(self, path: str):
"""
Load training state from checkpoint.
This high-level method restores complete training state including
model parameters, optimizer state, scheduler state, and history.
Uses the low-level load_checkpoint() function internally.
Args:
path: File path to load checkpoint from
"""
# Use the standalone load_checkpoint function
checkpoint = load_checkpoint(path)
with open(path, 'rb') as f:
checkpoint = pickle.load(f)
self.epoch = checkpoint['epoch']
self.step = checkpoint['step']

View File

@@ -23,47 +23,7 @@ from ..core.tensor import Tensor
from ..core.layers import Linear
from ..core.attention import MultiHeadAttention
from ..core.activations import GELU
from ..text.embeddings import Embedding
from ..core.autograd import SqrtBackward, MeanBackward
# Monkey-patch sqrt method onto Tensor for LayerNorm
def _tensor_sqrt(self):
    """
    Element-wise square root with gradient tracking.

    Returns a new Tensor holding sqrt(self.data). When the input requires
    gradients, a SqrtBackward node is attached that records the input and
    the output (the backward rule reuses the forward result).
    """
    rooted = Tensor(np.sqrt(self.data), requires_grad=self.requires_grad)

    if self.requires_grad:
        node = SqrtBackward()
        node.saved_tensors = (self,)
        node.saved_output = rooted
        rooted._grad_fn = node

    return rooted
Tensor.sqrt = _tensor_sqrt
# Monkey-patch mean method onto Tensor for LayerNorm
def _tensor_mean(self, axis=None, keepdims=False):
    """
    Mean reduction with gradient tracking.

    Args:
        axis: Axis or axes to average over, forwarded to np.mean
        keepdims: Whether reduced axes are kept with size one

    Returns:
        A new Tensor with the averaged data. When the input requires
        gradients, a MeanBackward node is attached that records the input
        together with `axis` and `keepdims`.
    """
    averaged = Tensor(
        np.mean(self.data, axis=axis, keepdims=keepdims),
        requires_grad=self.requires_grad,
    )

    if self.requires_grad:
        node = MeanBackward()
        node.saved_tensors = (self,)
        node.axis = axis
        node.keepdims = keepdims
        averaged._grad_fn = node

    return averaged
Tensor.mean = _tensor_mean
from ..text.embeddings import Embedding, PositionalEncoding
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 9
class LayerNorm:
@@ -101,7 +61,6 @@ class LayerNorm:
self.eps = eps
# Learnable parameters: scale and shift
# CRITICAL: requires_grad=True so optimizer can train these!
self.gamma = Tensor(np.ones(normalized_shape), requires_grad=True) # Scale parameter
self.beta = Tensor(np.zeros(normalized_shape), requires_grad=True) # Shift parameter
### END SOLUTION
@@ -124,24 +83,29 @@ class LayerNorm:
HINT: Use keepdims=True to maintain tensor dimensions for broadcasting
"""
### BEGIN SOLUTION
# CRITICAL: Use Tensor operations (not .data) to maintain gradient flow!
# Compute statistics across last dimension (features)
mean = x.mean(axis=-1, keepdims=True)
# Compute variance: E[(x - μ)²]
diff = x - mean # Tensor subtraction maintains gradient
variance = (diff * diff).mean(axis=-1, keepdims=True) # Tensor ops maintain gradient
# Use Tensor operations to preserve computation graph!
diff = x - mean
variance = (diff * diff).mean(axis=-1, keepdims=True)
# Normalize: (x - mean) / sqrt(variance + eps)
# Note: Use Tensor.sqrt() to preserve gradient flow
std = (variance + self.eps).sqrt() # sqrt maintains gradient flow
normalized = diff / std # Division maintains gradient flow
# Normalize - use Tensor operations to preserve gradients!
# Add eps as a Tensor for proper gradient flow
eps_tensor = Tensor(np.array(self.eps), requires_grad=False)
std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)
normalized = (x - mean) / std
# Apply learnable transformation
output = normalized * self.gamma + self.beta
return output
### END SOLUTION
def __call__(self, x):
"""Allows the layer norm to be called like a function."""
return self.forward(x)
def parameters(self):
"""Return learnable parameters."""
return [self.gamma, self.beta]
@@ -183,10 +147,8 @@ class MLP:
# Two-layer feed-forward network
self.linear1 = Linear(embed_dim, hidden_dim)
self.gelu = GELU() # Use GELU activation from activations module
self.linear2 = Linear(hidden_dim, embed_dim)
# GELU activation
self.gelu = GELU()
### END SOLUTION
def forward(self, x):
@@ -209,8 +171,8 @@ class MLP:
# First linear layer with expansion
hidden = self.linear1.forward(x)
# GELU activation (callable pattern - activations have __call__)
hidden = self.gelu(hidden)
# GELU activation (YOUR activation from Module 03!)
hidden = self.gelu.forward(hidden)
# Second linear layer back to original size
output = self.linear2.forward(hidden)
@@ -218,6 +180,10 @@ class MLP:
return output
### END SOLUTION
def __call__(self, x):
"""Allows the MLP to be called like a function."""
return self.forward(x)
def parameters(self):
"""Return all learnable parameters."""
params = []
@@ -298,7 +264,7 @@ class TransformerBlock:
# First sub-layer: Multi-head self-attention with residual connection
# Pre-norm: LayerNorm before attention
normed1 = self.ln1.forward(x)
# Self-attention: MultiHeadAttention internally creates Q, K, V from input
# Self-attention: query, key, value are all the same (normed1)
attention_out = self.attention.forward(normed1, mask)
# Residual connection
@@ -315,6 +281,10 @@ class TransformerBlock:
return output
### END SOLUTION
def __call__(self, x, mask=None):
"""Allows the transformer block to be called like a function."""
return self.forward(x, mask)
def parameters(self):
"""Return all learnable parameters."""
params = []
@@ -434,6 +404,10 @@ class GPT:
return logits
### END SOLUTION
def __call__(self, tokens):
"""Allows the GPT model to be called like a function."""
return self.forward(tokens)
def _create_causal_mask(self, seq_len):
"""Create causal mask to prevent attending to future positions."""
### BEGIN SOLUTION

22
tinytorch/optimization/acceleration.py generated Normal file
View File

@@ -0,0 +1,22 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_acceleration/acceleration_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = []
# %% ../../modules/source/18_acceleration/acceleration_dev.ipynb 0
#| default_exp optimization.acceleration
#| export

300
tinytorch/optimization/compression.py generated Normal file
View File

@@ -0,0 +1,300 @@
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/XX_compression/compression_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['Sequential', 'KnowledgeDistillation', 'test_unit_knowledge_distillation', 'CompressionComplete', 'measure_sparsity',
'magnitude_prune', 'structured_prune', 'compress_model']
# %% ../../modules/source/17_compression/compression_dev.ipynb 1
import numpy as np
import copy
from typing import List, Dict, Any, Tuple, Optional
import time
# Import from TinyTorch modules
from ..core.tensor import Tensor
from ..core.layers import Linear
# Sequential container for model compression
class Sequential:
    """Minimal container that applies its layers one after another."""

    def __init__(self, *layers):
        """Store the given layers, in call order."""
        self.layers = [*layers]

    def forward(self, x):
        """
        Feed `x` through every layer in order and return the final value.

        A layer with a `forward` attribute is run through it; anything
        else is called directly.
        """
        activation = x
        for stage in self.layers:
            run = stage.forward if hasattr(stage, 'forward') else stage
            activation = run(activation)
        return activation

    def __call__(self, x):
        """Make the container callable; same as forward(x)."""
        return self.forward(x)

    def parameters(self):
        """Collect the parameters of every layer that defines `parameters()`."""
        return [
            param
            for stage in self.layers
            if hasattr(stage, 'parameters')
            for param in stage.parameters()
        ]
# %% ../../modules/source/17_compression/compression_dev.ipynb 15
class KnowledgeDistillation:
"""
Knowledge distillation for model compression.
Train a smaller student model to mimic a larger teacher model.
"""
def __init__(self, teacher_model, student_model, temperature=3.0, alpha=0.7):
"""
Initialize knowledge distillation.
TODO: Set up teacher and student models with distillation parameters
APPROACH:
1. Store teacher and student models
2. Set temperature for softening probability distributions
3. Set alpha for balancing hard vs soft targets
EXAMPLE:
>>> teacher = Sequential(Linear(100, 200), Linear(200, 50))
>>> student = Sequential(Linear(100, 50))
>>> kd = KnowledgeDistillation(teacher, student, temperature=4.0, alpha=0.8)
>>> print(f"Temperature: {kd.temperature}, Alpha: {kd.alpha}")
Temperature: 4.0, Alpha: 0.8
HINTS:
- Simply assign the parameters to instance variables
- Temperature typically ranges from 3-5 for effective softening
- Alpha of 0.7 means 70% soft targets, 30% hard targets
Args:
teacher_model: Large, pre-trained model
student_model: Smaller model to train
temperature: Softening parameter for distributions
alpha: Weight for soft target loss (1-alpha for hard targets)
"""
### BEGIN SOLUTION
self.teacher_model = teacher_model
self.student_model = student_model
self.temperature = temperature
self.alpha = alpha
### END SOLUTION
def distillation_loss(self, student_logits, teacher_logits, true_labels):
"""
Calculate combined distillation loss.
TODO: Implement knowledge distillation loss function
APPROACH:
1. Calculate hard target loss (student vs true labels)
2. Calculate soft target loss (student vs teacher, with temperature)
3. Combine losses: alpha * soft_loss + (1-alpha) * hard_loss
EXAMPLE:
>>> kd = KnowledgeDistillation(teacher, student)
>>> loss = kd.distillation_loss(student_out, teacher_out, labels)
>>> print(f"Distillation loss: {loss:.4f}")
HINTS:
- Use temperature to soften distributions: logits/temperature
- Soft targets use KL divergence or cross-entropy
- Hard targets use standard classification loss
"""
### BEGIN SOLUTION
# Convert to numpy for this implementation
if hasattr(student_logits, 'data'):
student_logits = student_logits.data
if hasattr(teacher_logits, 'data'):
teacher_logits = teacher_logits.data
if hasattr(true_labels, 'data'):
true_labels = true_labels.data
# Soften distributions with temperature
student_soft = self._softmax(student_logits / self.temperature)
teacher_soft = self._softmax(teacher_logits / self.temperature)
# Soft target loss (KL divergence)
soft_loss = self._kl_divergence(student_soft, teacher_soft)
# Hard target loss (cross-entropy)
student_hard = self._softmax(student_logits)
hard_loss = self._cross_entropy(student_hard, true_labels)
# Combined loss
total_loss = self.alpha * soft_loss + (1 - self.alpha) * hard_loss
return total_loss
### END SOLUTION
def _softmax(self, logits):
    """Return softmax over the last axis, shifted by the row maximum for stability."""
    # Subtracting the per-row maximum keeps np.exp from overflowing without
    # changing the normalised result.
    row_peak = np.max(logits, axis=-1, keepdims=True)
    unnormalised = np.exp(logits - row_peak)
    normaliser = np.sum(unnormalised, axis=-1, keepdims=True)
    return unnormalised / normaliser
def _kl_divergence(self, p, q):
    """Return sum(p * log(p / q)) over all elements, with 1e-8 guards.

    One epsilon protects the division by ``q`` and a second keeps the
    logarithm's argument strictly positive.
    """
    ratio = p / (q + 1e-8)
    pointwise = p * np.log(ratio + 1e-8)
    return np.sum(pointwise)
def _cross_entropy(self, predictions, labels):
    """Compute the mean cross-entropy of predicted probabilities.

    Args:
        predictions: Array of probabilities indexed as [sample, class].
        labels: Either a 1-D collection of class indices (one per sample)
            or an array of per-class target weights with the same shape
            as ``predictions`` (e.g. one-hot rows).

    Returns:
        The mean negative log-likelihood as a NumPy scalar. A 1e-8 offset
        inside the logarithm guards against log(0).
    """
    # Accept lists/tuples as well as arrays; `.ndim` only exists on arrays.
    labels = np.asarray(labels)
    if labels.ndim == 1:
        # Class indices can arrive with a float dtype (presumably when they
        # were unwrapped from a float-backed tensor - verify against caller);
        # NumPy integer-array indexing rejects non-integer index arrays.
        if not np.issubdtype(labels.dtype, np.integer):
            labels = labels.astype(np.int64)
        picked = predictions[np.arange(labels.shape[0]), labels]
        return -np.mean(np.log(picked + 1e-8))
    return -np.mean(np.sum(labels * np.log(predictions + 1e-8), axis=1))
def test_unit_knowledge_distillation():
    """🔬 Check that KnowledgeDistillation yields a positive, non-NaN float loss."""
    print("🔬 Unit Test: Knowledge Distillation...")

    # Two-layer teacher and a single-layer student with matching input/output widths
    big_net = Sequential(Linear(10, 20), Linear(20, 5))
    small_net = Sequential(Linear(10, 5))
    distiller = KnowledgeDistillation(big_net, small_net, temperature=3.0, alpha=0.7)

    # Random batch of 8 samples and one integer class label per sample
    batch = Tensor(np.random.randn(8, 10))
    targets = np.array([0, 1, 2, 3, 4, 0, 1, 2])

    # Run both networks on the same batch, then score the student
    teacher_logits = big_net.forward(batch)
    student_logits = small_net.forward(batch)
    loss = distiller.distillation_loss(student_logits, teacher_logits, targets)

    # Sanity checks only: type, sign and finiteness of the loss
    assert isinstance(loss, (float, np.floating)), f"Loss should be float, got {type(loss)}"
    assert loss > 0, f"Loss should be positive, got {loss}"
    assert not np.isnan(loss), "Loss should not be NaN"

    print("✅ knowledge_distillation works correctly!")


# Executed when the module body runs, exactly as in the original.
test_unit_knowledge_distillation()
# %% ../../modules/source/17_compression/compression_dev.ipynb 29
class CompressionComplete:
    """
    Stateless collection of model-compression helpers.

    Every helper treats ``model`` as optional-capability: if it has no
    ``parameters`` attribute the helper is a no-op (or reports 0.0 sparsity).
    Parameters are expected to expose a NumPy array as ``.data``.
    """

    @staticmethod
    def measure_sparsity(model) -> float:
        """Return the fraction of parameter entries that are exactly zero.

        Returns 0.0 when the model exposes no parameters at all.
        """
        if not hasattr(model, 'parameters'):
            return 0.0

        element_count = 0
        zero_count = 0
        for tensor in model.parameters():
            element_count += tensor.size
            zero_count += np.sum(tensor.data == 0)

        if element_count > 0:
            return zero_count / element_count
        return 0.0

    @staticmethod
    def magnitude_prune(model, sparsity=0.5):
        """
        Zero the smallest-magnitude entries of every parameter, in place.

        The cut-off is computed separately for each parameter as the
        ``sparsity * 100`` percentile of its absolute values; entries
        strictly below the cut-off are set to zero.

        Args:
            model: Model with parameters() method
            sparsity: Fraction of weights to prune (0-1)

        Returns:
            The same model object that was passed in.
        """
        if not hasattr(model, 'parameters'):
            return model

        for tensor in model.parameters():
            magnitudes = np.abs(tensor.data)
            cutoff = np.percentile(magnitudes, sparsity * 100)
            tensor.data[magnitudes < cutoff] = 0
        return model

    @staticmethod
    def structured_prune(model, prune_ratio=0.5):
        """
        Zero whole columns of the model's first parameter, in place.

        Only the first entry returned by ``model.parameters()`` is examined,
        and only when it is two-dimensional. Columns whose L2 norm falls
        below the ``prune_ratio * 100`` percentile of all column norms are
        zeroed.

        Args:
            model: Model to prune
            prune_ratio: Fraction of structures to prune (0-1)

        Returns:
            The same model object that was passed in.
        """
        if not hasattr(model, 'parameters'):
            return model

        tensors = list(model.parameters())
        if not tensors or not hasattr(tensors[0], 'data'):
            return model

        first = tensors[0]
        if len(first.shape) != 2:
            return model

        # One norm per column (axis=0 collapses the rows)
        column_norms = np.linalg.norm(first.data, axis=0)
        cutoff = np.percentile(column_norms, prune_ratio * 100)
        keep = column_norms >= cutoff
        first.data[:, ~keep] = 0
        return model

    @staticmethod
    def compress_model(model, compression_config: Dict[str, Any]):
        """
        Run the configured pruning stages and report sparsity before/after.

        Args:
            model: Model to compress
            compression_config: Dictionary with compression settings
                - 'magnitude_sparsity': float (0-1)
                - 'structured_prune_ratio': float (0-1)
              A stage runs only when its key is present.

        Returns:
            ``(model, stats)`` where ``stats`` holds 'original_sparsity',
            'final_sparsity' and 'compression_ratio' (``inf`` when the
            final sparsity reaches 1.0).
        """
        sparsity_before = CompressionComplete.measure_sparsity(model)
        stats = {'original_sparsity': sparsity_before}

        # Magnitude pruning runs first, then structured pruning
        if 'magnitude_sparsity' in compression_config:
            target = compression_config['magnitude_sparsity']
            model = CompressionComplete.magnitude_prune(model, target)

        if 'structured_prune_ratio' in compression_config:
            ratio = compression_config['structured_prune_ratio']
            model = CompressionComplete.structured_prune(model, ratio)

        stats['final_sparsity'] = CompressionComplete.measure_sparsity(model)
        if stats['final_sparsity'] < 1.0:
            stats['compression_ratio'] = 1.0 / (1.0 - stats['final_sparsity'])
        else:
            stats['compression_ratio'] = float('inf')
        return model, stats
# Convenience functions for backward compatibility
def measure_sparsity(model) -> float:
    """Module-level alias for ``CompressionComplete.measure_sparsity``."""
    fraction_zero = CompressionComplete.measure_sparsity(model)
    return fraction_zero
def magnitude_prune(model, sparsity=0.5):
    """Module-level alias for ``CompressionComplete.magnitude_prune``."""
    pruned = CompressionComplete.magnitude_prune(model, sparsity)
    return pruned
def structured_prune(model, prune_ratio=0.5):
    """Module-level alias for ``CompressionComplete.structured_prune``."""
    pruned = CompressionComplete.structured_prune(model, prune_ratio)
    return pruned
def compress_model(model, compression_config: Dict[str, Any]):
    """Module-level alias for ``CompressionComplete.compress_model``."""
    outcome = CompressionComplete.compress_model(model, compression_config)
    return outcome

View File

@@ -15,9 +15,9 @@
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['QuantizationComplete', 'quantize_int8', 'dequantize_int8', 'quantize_model']
__all__ = []
# %% ../../modules/source/17_quantization/quantization_dev.ipynb 3
# %% ../../modules/source/16_quantization/quantization_dev.ipynb 3
import numpy as np
import time
from typing import Tuple, Dict, List, Optional
@@ -29,94 +29,3 @@ from ..core.layers import Linear
from ..core.activations import ReLU
print("✅ Quantization module imports complete")
# %% ../../modules/source/17_quantization/quantization_dev.ipynb 34
class QuantizationComplete:
    """
    Stateless helpers for affine INT8 quantization of tensors and models.

    Values are mapped onto the signed 8-bit range [-128, 127] using a
    per-tensor ``scale`` and ``zero_point``.
    """

    @staticmethod
    def quantize_tensor(tensor: Tensor) -> Tuple[Tensor, float, int]:
        """Quantize a tensor to INT8.

        Returns:
            ``(quantized, scale, zero_point)``. When the value range is
            narrower than 1e-8 the result is all zeros with scale 1.0 and
            zero point 0.
        """
        values = tensor.data
        lowest = float(np.min(values))
        highest = float(np.max(values))
        span = highest - lowest

        # Near-constant input: no usable range to spread over 256 levels
        if abs(span) < 1e-8:
            return Tensor(np.zeros_like(values, dtype=np.int8)), 1.0, 0

        scale = span / 255.0
        raw_zero_point = int(np.round(-128 - lowest / scale))
        zero_point = int(np.clip(raw_zero_point, -128, 127))

        levels = np.round(values / scale + zero_point)
        levels = np.clip(levels, -128, 127).astype(np.int8)
        return Tensor(levels), scale, zero_point

    @staticmethod
    def dequantize_tensor(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
        """Map INT8 levels back to float32 via ``(q - zero_point) * scale``."""
        as_float = q_tensor.data.astype(np.float32)
        restored = (as_float - zero_point) * scale
        return Tensor(restored)

    # NOTE(review): `any` in the return annotation below is the builtin
    # function; `typing.Any` was presumably intended - confirm before changing.
    @staticmethod
    def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, any]:
        """
        Quantize every parameter returned by ``model.parameters()``.

        ``calibration_data`` is accepted but not read by this implementation.

        Returns:
            Dictionary with 'quantized_layers' (one entry per parameter,
            keyed ``param_<index>``), 'original_size_mb',
            'quantized_size_mb' and 'compression_ratio' (1.0 when nothing
            was quantized).
        """
        per_param = {}
        bytes_before = 0
        bytes_after = 0

        params = model.parameters() if hasattr(model, 'parameters') else ()
        for index, param in enumerate(params):
            bytes_before += param.data.nbytes

            q_param, scale, zp = QuantizationComplete.quantize_tensor(param)
            bytes_after += q_param.data.nbytes

            per_param[f'param_{index}'] = {
                'quantized': q_param,
                'scale': scale,
                'zero_point': zp,
                'original_shape': param.data.shape
            }

        bytes_per_mb = 1024 * 1024
        if bytes_after > 0:
            ratio = bytes_before / bytes_after
        else:
            ratio = 1.0
        return {
            'quantized_layers': per_param,
            'original_size_mb': bytes_before / bytes_per_mb,
            'quantized_size_mb': bytes_after / bytes_per_mb,
            'compression_ratio': ratio
        }

    @staticmethod
    def compare_models(original_model, quantized_info: Dict) -> Dict[str, float]:
        """Summarise memory figures from a ``quantize_model`` result.

        ``original_model`` is not read; every figure comes from
        ``quantized_info``.
        """
        before_mb = quantized_info['original_size_mb']
        after_mb = quantized_info['quantized_size_mb']
        return {
            'original_mb': before_mb,
            'quantized_mb': after_mb,
            'compression_ratio': quantized_info['compression_ratio'],
            'memory_saved_mb': before_mb - after_mb
        }
# Convenience functions for backward compatibility
def quantize_int8(tensor: Tensor) -> Tuple[Tensor, float, int]:
    """Module-level alias for ``QuantizationComplete.quantize_tensor``."""
    quantized_triple = QuantizationComplete.quantize_tensor(tensor)
    return quantized_triple
def dequantize_int8(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
    """Module-level alias for ``QuantizationComplete.dequantize_tensor``."""
    restored = QuantizationComplete.dequantize_tensor(q_tensor, scale, zero_point)
    return restored
def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, any]:
    """Module-level alias for ``QuantizationComplete.quantize_model``."""
    report = QuantizationComplete.quantize_model(model, calibration_data)
    return report

View File

@@ -93,18 +93,22 @@ class Embedding:
# Perform embedding lookup using advanced indexing
# This is equivalent to one-hot multiplication but much more efficient
embedded_data = self.weight.data[indices.data.astype(int)]
# Create output tensor with gradient tracking
from tinytorch.core.autograd import EmbeddingBackward
result = Tensor(embedded_data, requires_grad=self.weight.requires_grad)
embedded = self.weight.data[indices.data.astype(int)]
# Create result tensor
result = Tensor(embedded, requires_grad=self.weight.requires_grad)
# Attach gradient function (students learned this in Module 05!)
if self.weight.requires_grad:
result._grad_fn = EmbeddingBackward()
result._grad_fn.saved_tensors = (self.weight, indices)
from tinytorch.core.autograd import EmbeddingBackward
result._grad_fn = EmbeddingBackward(self.weight, indices)
return result
def __call__(self, indices: Tensor) -> Tensor:
    """Make the instance callable; ``obj(indices)`` forwards to ``obj.forward(indices)``."""
    embedded = self.forward(indices)
    return embedded
def parameters(self) -> List[Tensor]:
    """Return the trainable parameters: a new one-element list holding ``self.weight``."""
    trainable = [self.weight]
    return trainable
@@ -188,16 +192,23 @@ class PositionalEncoding:
f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}"
)
# Get position embeddings for this sequence length
pos_embeddings = self.position_embeddings.data[:seq_len] # (seq_len, embed_dim)
# Get position embeddings for this sequence length (slice using .data for efficiency)
pos_embeddings_data = self.position_embeddings.data[:seq_len] # (seq_len, embed_dim)
# Broadcast to match batch dimension: (1, seq_len, embed_dim)
pos_embeddings = pos_embeddings[np.newaxis, :, :]
pos_embeddings_data = pos_embeddings_data[np.newaxis, :, :]
# Wrap in Tensor to preserve requires_grad
pos_embeddings = Tensor(pos_embeddings_data, requires_grad=self.position_embeddings.requires_grad)
# Add positional information to input embeddings
result = x.data + pos_embeddings
# Add positional information using Tensor operation to preserve gradients!
result = x + pos_embeddings
return Tensor(result)
return result
def __call__(self, x: Tensor) -> Tensor:
    """Make the instance callable; ``obj(x)`` forwards to ``obj.forward(x)``."""
    encoded = self.forward(x)
    return encoded
def parameters(self) -> List[Tensor]:
"""Return trainable parameters."""
@@ -325,6 +336,10 @@ class EmbeddingLayer:
return output
def __call__(self, tokens: Tensor) -> Tensor:
    """Make the instance callable; ``obj(tokens)`` forwards to ``obj.forward(tokens)``."""
    output = self.forward(tokens)
    return output
def parameters(self) -> List[Tensor]:
"""Return all trainable parameters."""
params = self.token_embedding.parameters()