mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-11 20:25:00 -05:00
Update tinytorch and tito with module exports
Re-exported all modules after restructuring: - Updated _modidx.py with new module locations - Removed outdated autogeneration headers - Updated all core modules (tensor, autograd, layers, etc.) - Updated optimization modules (quantization, compression, etc.) - Updated TITO commands for new structure Changes include: - 24 tinytorch/ module files - 24 tito/ command and core files - Updated references from modules/source/ to modules/ All modules re-exported via nbdev from their new locations.
This commit is contained in:
142
tinytorch/_modidx.py
generated
142
tinytorch/_modidx.py
generated
@@ -1,19 +1,3 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/[unknown]/[unknown]_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# Autogenerated by nbdev
|
||||
|
||||
d = { 'settings': { 'branch': 'main',
|
||||
@@ -21,36 +5,7 @@ d = { 'settings': { 'branch': 'main',
|
||||
'doc_host': 'https://tinytorch.github.io',
|
||||
'git_url': 'https://github.com/tinytorch/TinyTorch/',
|
||||
'lib_path': 'tinytorch'},
|
||||
'syms': { 'tinytorch.applications.tinygpt': { 'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline': ( '20_capstone/capstone_dev.html#completetinygptpipeline',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.__init__': ( '20_capstone/capstone_dev.html#completetinygptpipeline.__init__',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.generate_text': ( '20_capstone/capstone_dev.html#completetinygptpipeline.generate_text',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.optimize_model': ( '20_capstone/capstone_dev.html#completetinygptpipeline.optimize_model',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.prepare_training_data': ( '20_capstone/capstone_dev.html#completetinygptpipeline.prepare_training_data',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.train': ( '20_capstone/capstone_dev.html#completetinygptpipeline.train',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.TinyGPT': ( '20_capstone/capstone_dev.html#tinygpt',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.TinyGPT.__init__': ( '20_capstone/capstone_dev.html#tinygpt.__init__',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.TinyGPTTrainer': ( '20_capstone/capstone_dev.html#tinygpttrainer',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.TinyGPTTrainer.__init__': ( '20_capstone/capstone_dev.html#tinygpttrainer.__init__',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.TinyGPTTrainer.prepare_batch': ( '20_capstone/capstone_dev.html#tinygpttrainer.prepare_batch',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.TinyGPTTrainer.train_step': ( '20_capstone/capstone_dev.html#tinygpttrainer.train_step',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.test_unit_complete_pipeline': ( '20_capstone/capstone_dev.html#test_unit_complete_pipeline',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.test_unit_tinygpt_init': ( '20_capstone/capstone_dev.html#test_unit_tinygpt_init',
|
||||
'tinytorch/applications/tinygpt.py'),
|
||||
'tinytorch.applications.tinygpt.test_unit_training_pipeline': ( '20_capstone/capstone_dev.html#test_unit_training_pipeline',
|
||||
'tinytorch/applications/tinygpt.py')},
|
||||
'syms': { 'tinytorch.applications.tinygpt': {},
|
||||
'tinytorch.benchmarking.benchmark': { 'tinytorch.benchmarking.benchmark.Benchmark': ( '19_benchmarking/benchmarking_dev.html#benchmark',
|
||||
'tinytorch/benchmarking/benchmark.py'),
|
||||
'tinytorch.benchmarking.benchmark.Benchmark.__init__': ( '19_benchmarking/benchmarking_dev.html#benchmark.__init__',
|
||||
@@ -89,6 +44,8 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/benchmarking/benchmark.py'),
|
||||
'tinytorch.benchmarking.benchmark.TinyMLPerf.run_standard_benchmark': ( '19_benchmarking/benchmarking_dev.html#tinymlperf.run_standard_benchmark',
|
||||
'tinytorch/benchmarking/benchmark.py'),
|
||||
'tinytorch.benchmarking.benchmark.calculate_normalized_scores': ( '19_benchmarking/benchmarking_dev.html#calculate_normalized_scores',
|
||||
'tinytorch/benchmarking/benchmark.py'),
|
||||
'tinytorch.benchmarking.benchmark.test_unit_benchmark': ( '19_benchmarking/benchmarking_dev.html#test_unit_benchmark',
|
||||
'tinytorch/benchmarking/benchmark.py'),
|
||||
'tinytorch.benchmarking.benchmark.test_unit_benchmark_suite': ( '19_benchmarking/benchmarking_dev.html#test_unit_benchmark_suite',
|
||||
@@ -105,6 +62,8 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/competition/submit.py'),
|
||||
'tinytorch.competition.submit.validate_installation': ( '20_competition/competition_dev.html#validate_installation',
|
||||
'tinytorch/competition/submit.py'),
|
||||
'tinytorch.competition.submit.validate_submission': ( '20_competition/competition_dev.html#validate_submission',
|
||||
'tinytorch/competition/submit.py'),
|
||||
'tinytorch.competition.submit.worked_example_optimization': ( '20_competition/competition_dev.html#worked_example_optimization',
|
||||
'tinytorch/competition/submit.py')},
|
||||
'tinytorch.core.activations': { 'tinytorch.core.activations.GELU': ( '02_activations/activations_dev.html#gelu',
|
||||
@@ -341,7 +300,11 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch.core.training.Trainer.save_checkpoint': ( '07_training/training_dev.html#trainer.save_checkpoint',
|
||||
'tinytorch/core/training.py'),
|
||||
'tinytorch.core.training.Trainer.train_epoch': ( '07_training/training_dev.html#trainer.train_epoch',
|
||||
'tinytorch/core/training.py')},
|
||||
'tinytorch/core/training.py'),
|
||||
'tinytorch.core.training.load_checkpoint': ( '07_training/training_dev.html#load_checkpoint',
|
||||
'tinytorch/core/training.py'),
|
||||
'tinytorch.core.training.save_checkpoint': ( '07_training/training_dev.html#save_checkpoint',
|
||||
'tinytorch/core/training.py')},
|
||||
'tinytorch.data.loader': { 'tinytorch.data.loader.DataLoader': ( '08_dataloader/dataloader_dev.html#dataloader',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.DataLoader.__init__': ( '08_dataloader/dataloader_dev.html#dataloader.__init__',
|
||||
@@ -386,8 +349,6 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/generation/kv_cache.py')},
|
||||
'tinytorch.models.transformer': { 'tinytorch.models.transformer.GPT': ( '13_transformers/transformers_dev.html#gpt',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.GPT.__call__': ( '13_transformers/transformers_dev.html#gpt.__call__',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.GPT.__init__': ( '13_transformers/transformers_dev.html#gpt.__init__',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.GPT._create_causal_mask': ( '13_transformers/transformers_dev.html#gpt._create_causal_mask',
|
||||
@@ -400,8 +361,6 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.LayerNorm': ( '13_transformers/transformers_dev.html#layernorm',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.LayerNorm.__call__': ( '13_transformers/transformers_dev.html#layernorm.__call__',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.LayerNorm.__init__': ( '13_transformers/transformers_dev.html#layernorm.__init__',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.LayerNorm.forward': ( '13_transformers/transformers_dev.html#layernorm.forward',
|
||||
@@ -410,8 +369,6 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.MLP': ( '13_transformers/transformers_dev.html#mlp',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.MLP.__call__': ( '13_transformers/transformers_dev.html#mlp.__call__',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.MLP.__init__': ( '13_transformers/transformers_dev.html#mlp.__init__',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.MLP.forward': ( '13_transformers/transformers_dev.html#mlp.forward',
|
||||
@@ -420,8 +377,6 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.TransformerBlock': ( '13_transformers/transformers_dev.html#transformerblock',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.TransformerBlock.__call__': ( '13_transformers/transformers_dev.html#transformerblock.__call__',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.TransformerBlock.__init__': ( '13_transformers/transformers_dev.html#transformerblock.__init__',
|
||||
'tinytorch/models/transformer.py'),
|
||||
'tinytorch.models.transformer.TransformerBlock.forward': ( '13_transformers/transformers_dev.html#transformerblock.forward',
|
||||
@@ -429,49 +384,54 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch.models.transformer.TransformerBlock.parameters': ( '13_transformers/transformers_dev.html#transformerblock.parameters',
|
||||
'tinytorch/models/transformer.py')},
|
||||
'tinytorch.optimization.acceleration': {},
|
||||
'tinytorch.optimization.compression': { 'tinytorch.optimization.compression.CompressionComplete': ( '17_compression/compression_dev.html#compressioncomplete',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.CompressionComplete.compress_model': ( '17_compression/compression_dev.html#compressioncomplete.compress_model',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.CompressionComplete.magnitude_prune': ( '17_compression/compression_dev.html#compressioncomplete.magnitude_prune',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.CompressionComplete.measure_sparsity': ( '17_compression/compression_dev.html#compressioncomplete.measure_sparsity',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.CompressionComplete.structured_prune': ( '17_compression/compression_dev.html#compressioncomplete.structured_prune',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.KnowledgeDistillation': ( '17_compression/compression_dev.html#knowledgedistillation',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.KnowledgeDistillation.__init__': ( '17_compression/compression_dev.html#knowledgedistillation.__init__',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.KnowledgeDistillation._cross_entropy': ( '17_compression/compression_dev.html#knowledgedistillation._cross_entropy',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.KnowledgeDistillation._kl_divergence': ( '17_compression/compression_dev.html#knowledgedistillation._kl_divergence',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.KnowledgeDistillation._softmax': ( '17_compression/compression_dev.html#knowledgedistillation._softmax',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.KnowledgeDistillation.distillation_loss': ( '17_compression/compression_dev.html#knowledgedistillation.distillation_loss',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression': { 'tinytorch.optimization.compression.Linear': ( '17_compression/compression_dev.html#linear',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Linear.__init__': ( '17_compression/compression_dev.html#linear.__init__',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Linear.forward': ( '17_compression/compression_dev.html#linear.forward',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Linear.parameters': ( '17_compression/compression_dev.html#linear.parameters',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Sequential': ( '17_compression/compression_dev.html#sequential',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Sequential.__call__': ( '17_compression/compression_dev.html#sequential.__call__',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Sequential.__init__': ( '17_compression/compression_dev.html#sequential.__init__',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Sequential.forward': ( '17_compression/compression_dev.html#sequential.forward',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Sequential.parameters': ( '17_compression/compression_dev.html#sequential.parameters',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.compress_model': ( '17_compression/compression_dev.html#compress_model',
|
||||
'tinytorch.optimization.compression.Tensor': ( '17_compression/compression_dev.html#tensor',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Tensor.__add__': ( '17_compression/compression_dev.html#tensor.__add__',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.magnitude_prune': ( '17_compression/compression_dev.html#magnitude_prune',
|
||||
'tinytorch.optimization.compression.Tensor.__init__': ( '17_compression/compression_dev.html#tensor.__init__',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.measure_sparsity': ( '17_compression/compression_dev.html#measure_sparsity',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.structured_prune': ( '17_compression/compression_dev.html#structured_prune',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.test_unit_knowledge_distillation': ( '17_compression/compression_dev.html#test_unit_knowledge_distillation',
|
||||
'tinytorch/optimization/compression.py')},
|
||||
'tinytorch.optimization.quantization': {},
|
||||
'tinytorch.optimization.compression.Tensor.__mul__': ( '17_compression/compression_dev.html#tensor.__mul__',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Tensor.__repr__': ( '17_compression/compression_dev.html#tensor.__repr__',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Tensor.abs': ( '17_compression/compression_dev.html#tensor.abs',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Tensor.matmul': ( '17_compression/compression_dev.html#tensor.matmul',
|
||||
'tinytorch/optimization/compression.py'),
|
||||
'tinytorch.optimization.compression.Tensor.sum': ( '17_compression/compression_dev.html#tensor.sum',
|
||||
'tinytorch/optimization/compression.py')},
|
||||
'tinytorch.optimization.quantization': { 'tinytorch.optimization.quantization.QuantizationComplete': ( '16_quantization/quantization_dev.html#quantizationcomplete',
|
||||
'tinytorch/optimization/quantization.py'),
|
||||
'tinytorch.optimization.quantization.QuantizationComplete.compare_models': ( '16_quantization/quantization_dev.html#quantizationcomplete.compare_models',
|
||||
'tinytorch/optimization/quantization.py'),
|
||||
'tinytorch.optimization.quantization.QuantizationComplete.dequantize_tensor': ( '16_quantization/quantization_dev.html#quantizationcomplete.dequantize_tensor',
|
||||
'tinytorch/optimization/quantization.py'),
|
||||
'tinytorch.optimization.quantization.QuantizationComplete.quantize_model': ( '16_quantization/quantization_dev.html#quantizationcomplete.quantize_model',
|
||||
'tinytorch/optimization/quantization.py'),
|
||||
'tinytorch.optimization.quantization.QuantizationComplete.quantize_tensor': ( '16_quantization/quantization_dev.html#quantizationcomplete.quantize_tensor',
|
||||
'tinytorch/optimization/quantization.py'),
|
||||
'tinytorch.optimization.quantization.dequantize_int8': ( '16_quantization/quantization_dev.html#dequantize_int8',
|
||||
'tinytorch/optimization/quantization.py'),
|
||||
'tinytorch.optimization.quantization.quantize_int8': ( '16_quantization/quantization_dev.html#quantize_int8',
|
||||
'tinytorch/optimization/quantization.py'),
|
||||
'tinytorch.optimization.quantization.quantize_model': ( '16_quantization/quantization_dev.html#quantize_model',
|
||||
'tinytorch/optimization/quantization.py')},
|
||||
'tinytorch.profiling.profiler': { 'tinytorch.profiling.profiler.Profiler': ( '14_profiling/profiling_dev.html#profiler',
|
||||
'tinytorch/profiling/profiler.py'),
|
||||
'tinytorch.profiling.profiler.Profiler.__init__': ( '14_profiling/profiling_dev.html#profiler.__init__',
|
||||
@@ -496,8 +456,6 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/profiling/profiler.py')},
|
||||
'tinytorch.text.embeddings': { 'tinytorch.text.embeddings.Embedding': ( '11_embeddings/embeddings_dev.html#embedding',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.Embedding.__call__': ( '11_embeddings/embeddings_dev.html#embedding.__call__',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.Embedding.__init__': ( '11_embeddings/embeddings_dev.html#embedding.__init__',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.Embedding.__repr__': ( '11_embeddings/embeddings_dev.html#embedding.__repr__',
|
||||
@@ -508,8 +466,6 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.EmbeddingLayer': ( '11_embeddings/embeddings_dev.html#embeddinglayer',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.EmbeddingLayer.__call__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__call__',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.EmbeddingLayer.__init__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__init__',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.EmbeddingLayer.__repr__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__repr__',
|
||||
@@ -520,8 +476,6 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.PositionalEncoding': ( '11_embeddings/embeddings_dev.html#positionalencoding',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.PositionalEncoding.__call__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__call__',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.PositionalEncoding.__init__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__init__',
|
||||
'tinytorch/text/embeddings.py'),
|
||||
'tinytorch.text.embeddings.PositionalEncoding.__repr__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__repr__',
|
||||
|
||||
677
tinytorch/applications/tinygpt.py
generated
677
tinytorch/applications/tinygpt.py
generated
@@ -1,679 +1,8 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_tinygpt/tinygpt_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/20_capstone/capstone_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['TinyGPT', 'test_unit_tinygpt_init', 'TinyGPTTrainer', 'test_unit_training_pipeline', 'CompleteTinyGPTPipeline',
|
||||
'test_unit_complete_pipeline']
|
||||
__all__ = []
|
||||
|
||||
# %% ../../modules/source/20_capstone/capstone_dev.ipynb 2
|
||||
#| default_exp applications.tinygpt
|
||||
#| export
|
||||
|
||||
# %% ../../modules/source/20_capstone/capstone_dev.ipynb 7
|
||||
class TinyGPT:
|
||||
"""
|
||||
Complete GPT implementation integrating all TinyTorch modules.
|
||||
|
||||
This class demonstrates how framework components compose into real applications.
|
||||
Built using modules 01,02,03,11,12,13 as core architecture.
|
||||
|
||||
Architecture:
|
||||
- Token Embeddings (Module 11)
|
||||
- Positional Encoding (Module 11)
|
||||
- Transformer Blocks (Module 13)
|
||||
- Output Linear Layer (Module 03)
|
||||
- Language Modeling Head (Module 04)
|
||||
"""
|
||||
|
||||
def __init__(self, vocab_size: int, embed_dim: int = 128, num_layers: int = 4,
|
||||
num_heads: int = 4, max_seq_len: int = 256, dropout: float = 0.1):
|
||||
"""
|
||||
Initialize TinyGPT with production-inspired architecture.
|
||||
|
||||
TODO: Build a complete GPT model using TinyTorch components
|
||||
|
||||
APPROACH:
|
||||
1. Create token embeddings (vocab_size × embed_dim)
|
||||
2. Create positional encoding (max_seq_len × embed_dim)
|
||||
3. Build transformer layers using TransformerBlock
|
||||
4. Add output projection layer
|
||||
5. Calculate and report parameter count
|
||||
|
||||
ARCHITECTURE DECISIONS:
|
||||
- embed_dim=128: Small enough for fast training, large enough for learning
|
||||
- num_layers=4: Sufficient depth without excessive memory
|
||||
- num_heads=4: Multi-head attention without head_dim being too small
|
||||
- max_seq_len=256: Reasonable context length for character-level modeling
|
||||
|
||||
EXAMPLE:
|
||||
>>> model = TinyGPT(vocab_size=50, embed_dim=128, num_layers=4)
|
||||
>>> print(f"Parameters: {model.count_parameters():,}")
|
||||
Parameters: 1,234,567
|
||||
|
||||
HINTS:
|
||||
- Use Embedding class for token embeddings
|
||||
- Use PositionalEncoding for position information
|
||||
- Stack TransformerBlock instances in a list
|
||||
- Final Linear layer maps embed_dim → vocab_size
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
self.vocab_size = vocab_size
|
||||
self.embed_dim = embed_dim
|
||||
self.num_layers = num_layers
|
||||
self.num_heads = num_heads
|
||||
self.max_seq_len = max_seq_len
|
||||
self.dropout = dropout
|
||||
|
||||
# Token embeddings: convert token IDs to dense vectors
|
||||
self.token_embedding = Embedding(vocab_size, embed_dim)
|
||||
|
||||
# Positional encoding: add position information
|
||||
self.positional_encoding = PositionalEncoding(max_seq_len, embed_dim)
|
||||
|
||||
# Transformer layers: core processing
|
||||
self.transformer_blocks = []
|
||||
for _ in range(num_layers):
|
||||
block = TransformerBlock(embed_dim, num_heads, mlp_ratio=4.0)
|
||||
self.transformer_blocks.append(block)
|
||||
|
||||
# Output projection: map back to vocabulary
|
||||
self.output_projection = Linear(embed_dim, vocab_size)
|
||||
|
||||
# Dropout for regularization
|
||||
self.dropout_layer = Dropout(dropout)
|
||||
|
||||
# Calculate parameter count for systems analysis
|
||||
self._param_count = self.count_parameters()
|
||||
print(f"🏗️ TinyGPT initialized: {self._param_count:,} parameters")
|
||||
print(f"📐 Architecture: {num_layers}L/{num_heads}H/{embed_dim}D")
|
||||
print(f"💾 Estimated memory: {self._param_count * 4 / 1024 / 1024:.1f}MB")
|
||||
### END SOLUTION
|
||||
|
||||
def test_unit_tinygpt_init():
|
||||
"""🔬 Test TinyGPT initialization and parameter counting."""
|
||||
print("🔬 Unit Test: TinyGPT Initialization...")
|
||||
|
||||
# Create a small model for testing
|
||||
model = TinyGPT(vocab_size=50, embed_dim=64, num_layers=2, num_heads=2, max_seq_len=128)
|
||||
|
||||
# Verify architecture components exist
|
||||
assert hasattr(model, 'token_embedding')
|
||||
assert hasattr(model, 'positional_encoding')
|
||||
assert hasattr(model, 'transformer_blocks')
|
||||
assert hasattr(model, 'output_projection')
|
||||
assert len(model.transformer_blocks) == 2
|
||||
|
||||
# Verify parameter count is reasonable
|
||||
param_count = model.count_parameters()
|
||||
assert param_count > 0
|
||||
assert param_count < 1000000 # Sanity check for small model
|
||||
|
||||
print(f"✅ Model created with {param_count:,} parameters")
|
||||
print("✅ TinyGPT initialization works correctly!")
|
||||
|
||||
# Run immediate test
|
||||
test_unit_tinygpt_init()
|
||||
|
||||
# %% ../../modules/source/20_capstone/capstone_dev.ipynb 10
|
||||
class TinyGPTTrainer:
|
||||
"""
|
||||
Complete training pipeline integrating optimizers, schedulers, and monitoring.
|
||||
|
||||
Uses modules 05 (autograd), 06 (optimizers), 07 (training) for end-to-end training.
|
||||
"""
|
||||
|
||||
def __init__(self, model: TinyGPT, tokenizer: CharTokenizer,
|
||||
learning_rate: float = 3e-4, weight_decay: float = 0.01):
|
||||
"""
|
||||
Initialize trainer with model and optimization components.
|
||||
|
||||
TODO: Set up complete training infrastructure
|
||||
|
||||
APPROACH:
|
||||
1. Store model and tokenizer references
|
||||
2. Initialize AdamW optimizer (standard for transformers)
|
||||
3. Initialize loss function (CrossEntropyLoss for language modeling)
|
||||
4. Set up learning rate scheduler (cosine schedule)
|
||||
5. Initialize training metrics tracking
|
||||
|
||||
PRODUCTION CHOICES:
|
||||
- AdamW: Better generalization than Adam (weight decay)
|
||||
- learning_rate=3e-4: Standard for small transformers
|
||||
- Cosine schedule: Smooth learning rate decay
|
||||
- CrossEntropy: Standard for classification/language modeling
|
||||
|
||||
EXAMPLE:
|
||||
>>> model = TinyGPT(vocab_size=100)
|
||||
>>> tokenizer = CharTokenizer(['a', 'b', 'c'])
|
||||
>>> trainer = TinyGPTTrainer(model, tokenizer)
|
||||
>>> print("Trainer ready for training")
|
||||
Trainer ready for training
|
||||
|
||||
HINTS:
|
||||
- Get all model parameters with model.parameters()
|
||||
- Use AdamW with weight_decay for better generalization
|
||||
- CrossEntropyLoss handles the language modeling objective
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
self.model = model
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
# Collect all trainable parameters
|
||||
all_params = []
|
||||
all_params.extend(model.token_embedding.parameters())
|
||||
for block in model.transformer_blocks:
|
||||
all_params.extend(block.parameters())
|
||||
all_params.extend(model.output_projection.parameters())
|
||||
|
||||
# Initialize optimizer (AdamW for transformers)
|
||||
self.optimizer = AdamW(
|
||||
params=all_params,
|
||||
lr=learning_rate,
|
||||
weight_decay=weight_decay,
|
||||
betas=(0.9, 0.95) # Standard for language models
|
||||
)
|
||||
|
||||
# Loss function for next token prediction
|
||||
self.loss_fn = CrossEntropyLoss()
|
||||
|
||||
# Learning rate scheduler
|
||||
self.scheduler = CosineSchedule(
|
||||
optimizer=self.optimizer,
|
||||
max_epochs=100, # Will adjust based on actual training
|
||||
min_lr=learning_rate * 0.1
|
||||
)
|
||||
|
||||
# Training metrics
|
||||
self.training_history = {
|
||||
'losses': [],
|
||||
'perplexities': [],
|
||||
'learning_rates': [],
|
||||
'epoch': 0
|
||||
}
|
||||
|
||||
print(f"🚀 Trainer initialized:")
|
||||
print(f" Optimizer: AdamW (lr={learning_rate}, wd={weight_decay})")
|
||||
print(f" Parameters: {len(all_params):,} tensors")
|
||||
print(f" Loss: CrossEntropyLoss")
|
||||
### END SOLUTION
|
||||
|
||||
def prepare_batch(self, text_batch: List[str], max_length: int = 128) -> Tuple[Tensor, Tensor]:
|
||||
"""
|
||||
Convert text batch to input/target tensors for language modeling.
|
||||
|
||||
TODO: Implement text-to-tensor conversion with proper targets
|
||||
|
||||
APPROACH:
|
||||
1. Tokenize each text in the batch
|
||||
2. Pad/truncate to consistent length
|
||||
3. Create input_ids (text) and target_ids (text shifted by 1)
|
||||
4. Convert to Tensor format
|
||||
|
||||
LANGUAGE MODELING OBJECTIVE:
|
||||
- Input: [token1, token2, token3, token4]
|
||||
- Target: [token2, token3, token4, token5]
|
||||
- Model predicts next token at each position
|
||||
|
||||
EXAMPLE:
|
||||
>>> trainer = TinyGPTTrainer(model, tokenizer)
|
||||
>>> texts = ["hello world", "ai is fun"]
|
||||
>>> inputs, targets = trainer.prepare_batch(texts)
|
||||
>>> print(inputs.shape, targets.shape)
|
||||
(2, 128) (2, 128)
|
||||
|
||||
HINTS:
|
||||
- Use tokenizer.encode() for text → token conversion
|
||||
- Pad shorter sequences with tokenizer pad token
|
||||
- Target sequence is input sequence shifted right by 1
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
batch_size = len(text_batch)
|
||||
|
||||
# Tokenize all texts
|
||||
tokenized_batch = []
|
||||
for text in text_batch:
|
||||
tokens = self.tokenizer.encode(text)
|
||||
|
||||
# Truncate or pad to max_length
|
||||
if len(tokens) > max_length:
|
||||
tokens = tokens[:max_length]
|
||||
else:
|
||||
# Pad with special token (use 0 as pad)
|
||||
tokens.extend([0] * (max_length - len(tokens)))
|
||||
|
||||
tokenized_batch.append(tokens)
|
||||
|
||||
# Convert to numpy then Tensor
|
||||
input_ids = Tensor(np.array(tokenized_batch)) # (batch_size, seq_len)
|
||||
|
||||
# Create targets (shifted input for next token prediction)
|
||||
target_ids = Tensor(np.roll(input_ids.data, -1, axis=1)) # Shift left by 1
|
||||
|
||||
return input_ids, target_ids
|
||||
### END SOLUTION
|
||||
|
||||
def train_step(self, input_ids: Tensor, target_ids: Tensor) -> float:
|
||||
"""
|
||||
Single training step with forward, backward, and optimization.
|
||||
|
||||
TODO: Implement complete training step
|
||||
|
||||
APPROACH:
|
||||
1. Zero gradients from previous step
|
||||
2. Forward pass to get logits
|
||||
3. Compute loss between logits and targets
|
||||
4. Backward pass to compute gradients
|
||||
5. Optimizer step to update parameters
|
||||
6. Return loss value for monitoring
|
||||
|
||||
MEMORY MANAGEMENT:
|
||||
During training, memory usage = 3× model size:
|
||||
- 1× for parameters
|
||||
- 1× for gradients
|
||||
- 1× for optimizer states (Adam moments)
|
||||
|
||||
EXAMPLE:
|
||||
>>> loss = trainer.train_step(input_ids, target_ids)
|
||||
>>> print(f"Training loss: {loss:.4f}")
|
||||
Training loss: 2.3456
|
||||
|
||||
HINTS:
|
||||
- Always zero_grad() before forward pass
|
||||
- Loss should be computed on flattened logits and targets
|
||||
- Call backward() on the loss tensor
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Zero gradients from previous step
|
||||
self.optimizer.zero_grad()
|
||||
|
||||
# Forward pass
|
||||
logits = self.model.forward(input_ids) # (batch, seq_len, vocab_size)
|
||||
|
||||
# Reshape for loss computation
|
||||
batch_size, seq_len, vocab_size = logits.shape
|
||||
logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
|
||||
targets_flat = target_ids.reshape(batch_size * seq_len)
|
||||
|
||||
# Compute loss
|
||||
loss = self.loss_fn.forward(logits_flat, targets_flat)
|
||||
|
||||
# Backward pass
|
||||
loss.backward()
|
||||
|
||||
# Optimizer step
|
||||
self.optimizer.step()
|
||||
|
||||
# Return scalar loss for monitoring
|
||||
return float(loss.data.item() if hasattr(loss.data, 'item') else loss.data)
|
||||
### END SOLUTION
|
||||
|
||||
def test_unit_training_pipeline():
|
||||
"""🔬 Test training pipeline components."""
|
||||
print("🔬 Unit Test: Training Pipeline...")
|
||||
|
||||
# Create small model and trainer
|
||||
model = TinyGPT(vocab_size=50, embed_dim=32, num_layers=2, num_heads=2)
|
||||
tokenizer = CharTokenizer(['a', 'b', 'c', 'd', 'e', ' '])
|
||||
trainer = TinyGPTTrainer(model, tokenizer, learning_rate=1e-3)
|
||||
|
||||
# Test batch preparation
|
||||
texts = ["hello", "world"]
|
||||
input_ids, target_ids = trainer.prepare_batch(texts, max_length=8)
|
||||
|
||||
assert input_ids.shape == (2, 8), f"Expected (2, 8), got {input_ids.shape}"
|
||||
assert target_ids.shape == (2, 8), f"Expected (2, 8), got {target_ids.shape}"
|
||||
|
||||
# Test training step
|
||||
initial_loss = trainer.train_step(input_ids, target_ids)
|
||||
assert initial_loss > 0, "Loss should be positive"
|
||||
|
||||
# Second step should work (gradients computed and applied)
|
||||
second_loss = trainer.train_step(input_ids, target_ids)
|
||||
assert second_loss > 0, "Second loss should also be positive"
|
||||
|
||||
print(f"✅ Batch preparation shape: {input_ids.shape}")
|
||||
print(f"✅ Initial loss: {initial_loss:.4f}")
|
||||
print(f"✅ Second loss: {second_loss:.4f}")
|
||||
print("✅ Training pipeline works correctly!")
|
||||
|
||||
# Run immediate test
|
||||
test_unit_training_pipeline()
|
||||
|
||||
# %% ../../modules/source/20_capstone/capstone_dev.ipynb 14
|
||||
class CompleteTinyGPTPipeline:
|
||||
"""
|
||||
End-to-end ML pipeline demonstrating integration of all 19 modules.
|
||||
|
||||
Pipeline stages:
|
||||
1. Data preparation (Module 10: Tokenization)
|
||||
2. Model creation (Modules 01-04, 11-13: Architecture)
|
||||
3. Training setup (Modules 05-07: Optimization)
|
||||
4. Training loop (Module 08: DataLoader)
|
||||
5. Optimization (Modules 17-18: Quantization, Pruning)
|
||||
6. Evaluation (Module 19: Benchmarking)
|
||||
7. Generation (Module 14: KV Caching)
|
||||
"""
|
||||
|
||||
def __init__(self, vocab_size: int = 100, embed_dim: int = 128,
|
||||
num_layers: int = 4, num_heads: int = 4):
|
||||
"""
|
||||
Initialize complete end-to-end TinyGPT pipeline integrating all 19 modules.
|
||||
|
||||
TODO: Set up a complete ML pipeline with tokenization, model, training,
|
||||
profiling, and benchmarking components
|
||||
|
||||
APPROACH:
|
||||
1. Store model architecture parameters (vocab_size, embed_dim, num_layers, num_heads)
|
||||
2. Initialize tokenizer using CharTokenizer from Module 10 with printable ASCII (32-127)
|
||||
3. Create TinyGPT model instance with stored parameters and max_seq_len=256
|
||||
4. Setup TinyGPTTrainer for training orchestration with learning_rate=3e-4
|
||||
5. Initialize Profiler (Module 15) and Benchmark (Module 19) for performance analysis
|
||||
6. Initialize pipeline state tracking (is_trained flag, training_history list)
|
||||
7. Print pipeline initialization summary with parameter count and memory usage
|
||||
|
||||
EXAMPLE:
|
||||
>>> pipeline = CompleteTinyGPTPipeline(vocab_size=100, embed_dim=128,
|
||||
... num_layers=4, num_heads=4)
|
||||
🏗️ Complete TinyGPT Pipeline Initialized
|
||||
Model: 419,300 parameters
|
||||
Memory: 1.6MB
|
||||
>>> pipeline.model.count_parameters()
|
||||
419300
|
||||
>>> pipeline.is_trained
|
||||
False
|
||||
>>> len(pipeline.training_history)
|
||||
0
|
||||
|
||||
HINTS:
|
||||
- CharTokenizer needs list of characters: [chr(i) for i in range(32, 127)]
|
||||
- TinyGPT requires vocab_size, embed_dim, num_layers, num_heads, max_seq_len
|
||||
- TinyGPTTrainer takes model, tokenizer, and learning_rate as arguments
|
||||
- Benchmark expects (models_list, datasets_list, metrics_list) format
|
||||
- Memory calculation: parameters * 4 bytes / 1024 / 1024 for MB
|
||||
"""
|
||||
|
||||
### BEGIN SOLUTION
|
||||
self.vocab_size = vocab_size
|
||||
self.embed_dim = embed_dim
|
||||
self.num_layers = num_layers
|
||||
self.num_heads = num_heads
|
||||
|
||||
# Stage 1: Initialize tokenizer (Module 10)
|
||||
self.tokenizer = CharTokenizer([chr(i) for i in range(32, 127)]) # Printable ASCII
|
||||
|
||||
# Stage 2: Create model (Modules 01-04, 11-13)
|
||||
self.model = TinyGPT(
|
||||
vocab_size=vocab_size,
|
||||
embed_dim=embed_dim,
|
||||
num_layers=num_layers,
|
||||
num_heads=num_heads,
|
||||
max_seq_len=256
|
||||
)
|
||||
|
||||
# Stage 3: Setup training (Modules 05-07)
|
||||
self.trainer = TinyGPTTrainer(self.model, self.tokenizer, learning_rate=3e-4)
|
||||
|
||||
# Stage 4: Initialize profiler and benchmark (Modules 15, 19)
|
||||
self.profiler = Profiler()
|
||||
self.benchmark = Benchmark([self.model], [], ["perplexity", "latency"])
|
||||
|
||||
# Pipeline state
|
||||
self.is_trained = False
|
||||
self.training_history = []
|
||||
|
||||
print("🏗️ Complete TinyGPT Pipeline Initialized")
|
||||
print(f" Model: {self.model.count_parameters():,} parameters")
|
||||
print(f" Memory: {self.model.count_parameters() * 4 / 1024 / 1024:.1f}MB")
|
||||
### END SOLUTION
|
||||
|
||||
def prepare_training_data(self, text_corpus: List[str], batch_size: int = 8) -> DataLoader:
|
||||
"""
|
||||
Prepare training data using DataLoader (Module 08).
|
||||
|
||||
TODO: Create DataLoader for training text data
|
||||
|
||||
APPROACH:
|
||||
1. Tokenize all texts in corpus
|
||||
2. Create input/target pairs for language modeling
|
||||
3. Package into TensorDataset
|
||||
4. Create DataLoader with batching and shuffling
|
||||
|
||||
EXAMPLE:
|
||||
>>> pipeline = CompleteTinyGPTPipeline()
|
||||
>>> corpus = ["hello world", "ai is amazing"]
|
||||
>>> dataloader = pipeline.prepare_training_data(corpus, batch_size=2)
|
||||
>>> print(f"Batches: {len(dataloader)}")
|
||||
Batches: 1
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Tokenize and prepare training pairs
|
||||
input_sequences = []
|
||||
target_sequences = []
|
||||
|
||||
for text in text_corpus:
|
||||
tokens = self.tokenizer.encode(text)
|
||||
if len(tokens) < 2:
|
||||
continue # Skip very short texts
|
||||
|
||||
# Create sliding window of input/target pairs
|
||||
for i in range(len(tokens) - 1):
|
||||
input_seq = tokens[:i+1]
|
||||
target_seq = tokens[i+1]
|
||||
|
||||
# Pad input to consistent length
|
||||
max_len = 32 # Reasonable context window
|
||||
if len(input_seq) > max_len:
|
||||
input_seq = input_seq[-max_len:]
|
||||
else:
|
||||
input_seq = [0] * (max_len - len(input_seq)) + input_seq
|
||||
|
||||
input_sequences.append(input_seq)
|
||||
target_sequences.append(target_seq)
|
||||
|
||||
# Convert to tensors
|
||||
inputs = Tensor(np.array(input_sequences))
|
||||
targets = Tensor(np.array(target_sequences))
|
||||
|
||||
# Create dataset and dataloader
|
||||
dataset = TensorDataset(inputs, targets)
|
||||
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
||||
|
||||
print(f"📚 Training data prepared: {len(dataset)} examples, {len(dataloader)} batches")
|
||||
return dataloader
|
||||
### END SOLUTION
|
||||
|
||||
def train(self, dataloader: DataLoader, epochs: int = 10) -> Dict[str, List[float]]:
|
||||
"""
|
||||
Complete training loop with monitoring.
|
||||
|
||||
TODO: Implement full training with progress tracking
|
||||
|
||||
APPROACH:
|
||||
1. Loop through epochs
|
||||
2. For each batch: forward, backward, optimize
|
||||
3. Track loss and perplexity
|
||||
4. Update learning rate schedule
|
||||
5. Return training history
|
||||
|
||||
EXAMPLE:
|
||||
>>> history = pipeline.train(dataloader, epochs=5)
|
||||
>>> print(f"Final loss: {history['losses'][-1]:.4f}")
|
||||
Final loss: 1.2345
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
history = {'losses': [], 'perplexities': [], 'epochs': []}
|
||||
|
||||
print(f"🚀 Starting training for {epochs} epochs...")
|
||||
|
||||
for epoch in range(epochs):
|
||||
epoch_losses = []
|
||||
|
||||
for batch_idx, (inputs, targets) in enumerate(dataloader):
|
||||
# Training step
|
||||
loss = self.trainer.train_step(inputs, targets)
|
||||
epoch_losses.append(loss)
|
||||
|
||||
# Log progress
|
||||
if batch_idx % 10 == 0:
|
||||
perplexity = np.exp(loss)
|
||||
print(f" Epoch {epoch+1}/{epochs}, Batch {batch_idx}: "
|
||||
f"Loss={loss:.4f}, PPL={perplexity:.2f}")
|
||||
|
||||
# Epoch summary
|
||||
avg_loss = np.mean(epoch_losses)
|
||||
avg_perplexity = np.exp(avg_loss)
|
||||
|
||||
history['losses'].append(avg_loss)
|
||||
history['perplexities'].append(avg_perplexity)
|
||||
history['epochs'].append(epoch + 1)
|
||||
|
||||
# Update learning rate
|
||||
self.trainer.scheduler.step()
|
||||
|
||||
print(f"✅ Epoch {epoch+1} complete: Loss={avg_loss:.4f}, PPL={avg_perplexity:.2f}")
|
||||
|
||||
self.is_trained = True
|
||||
self.training_history = history
|
||||
print(f"🎉 Training complete! Final perplexity: {history['perplexities'][-1]:.2f}")
|
||||
|
||||
return history
|
||||
### END SOLUTION
|
||||
|
||||
def optimize_model(self, quantize: bool = True, prune_sparsity: float = 0.0):
|
||||
"""
|
||||
Apply optimization techniques (Modules 17-18).
|
||||
|
||||
TODO: Apply quantization and pruning optimizations
|
||||
|
||||
APPROACH:
|
||||
1. Optionally apply quantization to reduce precision
|
||||
2. Optionally apply pruning to remove weights
|
||||
3. Measure size reduction
|
||||
4. Validate model still works
|
||||
|
||||
EXAMPLE:
|
||||
>>> pipeline.optimize_model(quantize=True, prune_sparsity=0.5)
|
||||
Model optimized: 75% size reduction
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
original_params = self.model.count_parameters()
|
||||
original_memory = original_params * 4 / (1024 * 1024)
|
||||
|
||||
optimizations_applied = []
|
||||
|
||||
if quantize:
|
||||
# Apply quantization (simulated)
|
||||
# In real implementation, would use quantize_model()
|
||||
quantized_memory = original_memory / 4 # INT8 vs FP32
|
||||
optimizations_applied.append(f"INT8 quantization (4× memory reduction)")
|
||||
print(" Applied INT8 quantization")
|
||||
|
||||
if prune_sparsity > 0:
|
||||
# Apply pruning (simulated)
|
||||
# In real implementation, would use magnitude_prune()
|
||||
remaining_weights = 1 - prune_sparsity
|
||||
optimizations_applied.append(f"{prune_sparsity:.0%} pruning ({remaining_weights:.0%} weights remain)")
|
||||
print(f" Applied {prune_sparsity:.0%} magnitude pruning")
|
||||
|
||||
# Calculate final size
|
||||
size_reduction = 1.0
|
||||
if quantize:
|
||||
size_reduction *= 0.25 # 4× smaller
|
||||
if prune_sparsity > 0:
|
||||
size_reduction *= (1 - prune_sparsity)
|
||||
|
||||
final_memory = original_memory * size_reduction
|
||||
reduction_factor = original_memory / final_memory
|
||||
|
||||
print(f"🔧 Model optimization complete:")
|
||||
print(f" Original: {original_memory:.1f}MB")
|
||||
print(f" Optimized: {final_memory:.1f}MB")
|
||||
print(f" Reduction: {reduction_factor:.1f}× smaller")
|
||||
print(f" Applied: {', '.join(optimizations_applied)}")
|
||||
### END SOLUTION
|
||||
|
||||
def generate_text(self, prompt: str, max_tokens: int = 50) -> str:
|
||||
"""
|
||||
Generate text using the trained model.
|
||||
|
||||
TODO: Implement text generation with proper encoding/decoding
|
||||
|
||||
APPROACH:
|
||||
1. Encode prompt to token IDs
|
||||
2. Use model.generate() for autoregressive generation
|
||||
3. Decode generated tokens back to text
|
||||
4. Return generated text
|
||||
|
||||
EXAMPLE:
|
||||
>>> text = pipeline.generate_text("Hello", max_tokens=10)
|
||||
>>> print(f"Generated: {text}")
|
||||
Generated: Hello world this is AI
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
if not self.is_trained:
|
||||
print("⚠️ Model not trained yet. Generating with random weights.")
|
||||
|
||||
# Encode prompt
|
||||
prompt_tokens = self.tokenizer.encode(prompt)
|
||||
prompt_tensor = Tensor([prompt_tokens])
|
||||
|
||||
# Generate tokens
|
||||
generated_tokens = self.model.generate(
|
||||
prompt_tensor,
|
||||
max_new_tokens=max_tokens,
|
||||
temperature=0.8,
|
||||
use_cache=True
|
||||
)
|
||||
|
||||
# Decode to text
|
||||
all_tokens = generated_tokens.data[0].tolist()
|
||||
generated_text = self.tokenizer.decode(all_tokens)
|
||||
|
||||
return generated_text
|
||||
### END SOLUTION
|
||||
|
||||
def test_unit_complete_pipeline():
|
||||
"""🔬 Test complete pipeline integration."""
|
||||
print("🔬 Unit Test: Complete Pipeline Integration...")
|
||||
|
||||
# Create pipeline
|
||||
pipeline = CompleteTinyGPTPipeline(vocab_size=50, embed_dim=32, num_layers=2)
|
||||
|
||||
# Test data preparation
|
||||
corpus = ["hello world", "ai is fun", "machine learning"]
|
||||
dataloader = pipeline.prepare_training_data(corpus, batch_size=2)
|
||||
assert len(dataloader) > 0, "DataLoader should have batches"
|
||||
|
||||
# Test training (minimal)
|
||||
history = pipeline.train(dataloader, epochs=1)
|
||||
assert 'losses' in history, "History should contain losses"
|
||||
assert len(history['losses']) == 1, "Should have one epoch of losses"
|
||||
|
||||
# Test optimization
|
||||
pipeline.optimize_model(quantize=True, prune_sparsity=0.5)
|
||||
|
||||
# Test generation
|
||||
generated = pipeline.generate_text("hello", max_tokens=5)
|
||||
assert isinstance(generated, str), "Generated output should be string"
|
||||
assert len(generated) > 0, "Generated text should not be empty"
|
||||
|
||||
print(f"✅ Pipeline stages completed successfully")
|
||||
print(f"✅ Training history: {len(history['losses'])} epochs")
|
||||
print(f"✅ Generated text: '{generated[:20]}...'")
|
||||
print("✅ Complete pipeline integration works!")
|
||||
|
||||
# Run immediate test
|
||||
test_unit_complete_pipeline()
|
||||
|
||||
72
tinytorch/benchmarking/benchmark.py
generated
72
tinytorch/benchmarking/benchmark.py
generated
@@ -1,22 +1,8 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_benchmark/benchmark_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/19_benchmarking/benchmarking_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['OlympicEvent', 'Benchmark', 'test_unit_benchmark', 'BenchmarkSuite', 'test_unit_benchmark_suite', 'TinyMLPerf',
|
||||
'test_unit_tinymlperf']
|
||||
'test_unit_tinymlperf', 'calculate_normalized_scores']
|
||||
|
||||
# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 0
|
||||
#| default_exp benchmarking.benchmark
|
||||
@@ -72,7 +58,7 @@ class Benchmark:
|
||||
self.measurement_runs = measurement_runs
|
||||
self.results = {}
|
||||
|
||||
# Use Profiler from Module 14 for measurements
|
||||
# Use Profiler from Module 15 for measurements
|
||||
self.profiler = Profiler()
|
||||
|
||||
# System information for metadata
|
||||
@@ -1024,3 +1010,53 @@ def test_unit_tinymlperf():
|
||||
print("✅ TinyMLPerf works correctly!")
|
||||
|
||||
test_unit_tinymlperf()
|
||||
|
||||
# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 24
|
||||
def calculate_normalized_scores(baseline_results: dict,
|
||||
optimized_results: dict) -> dict:
|
||||
"""
|
||||
Calculate normalized performance metrics for fair competition comparison.
|
||||
|
||||
This function converts absolute measurements into relative improvements,
|
||||
enabling fair comparison across different hardware platforms.
|
||||
|
||||
Args:
|
||||
baseline_results: Dict with keys: 'latency', 'memory', 'accuracy'
|
||||
optimized_results: Dict with same keys as baseline_results
|
||||
|
||||
Returns:
|
||||
Dict with normalized metrics:
|
||||
- speedup: Relative latency improvement (higher is better)
|
||||
- compression_ratio: Relative memory reduction (higher is better)
|
||||
- accuracy_delta: Absolute accuracy change (closer to 0 is better)
|
||||
- efficiency_score: Combined metric balancing all factors
|
||||
|
||||
Example:
|
||||
>>> baseline = {'latency': 100.0, 'memory': 12.0, 'accuracy': 0.89}
|
||||
>>> optimized = {'latency': 40.0, 'memory': 3.0, 'accuracy': 0.87}
|
||||
>>> scores = calculate_normalized_scores(baseline, optimized)
|
||||
>>> print(f"Speedup: {scores['speedup']:.2f}x")
|
||||
Speedup: 2.50x
|
||||
"""
|
||||
# Calculate speedup (higher is better)
|
||||
speedup = baseline_results['latency'] / optimized_results['latency']
|
||||
|
||||
# Calculate compression ratio (higher is better)
|
||||
compression_ratio = baseline_results['memory'] / optimized_results['memory']
|
||||
|
||||
# Calculate accuracy delta (closer to 0 is better, negative means degradation)
|
||||
accuracy_delta = optimized_results['accuracy'] - baseline_results['accuracy']
|
||||
|
||||
# Calculate efficiency score (combined metric)
|
||||
# Penalize accuracy loss: the more accuracy you lose, the lower your score
|
||||
accuracy_penalty = max(1.0, 1.0 - accuracy_delta) if accuracy_delta < 0 else 1.0
|
||||
efficiency_score = (speedup * compression_ratio) / accuracy_penalty
|
||||
|
||||
return {
|
||||
'speedup': speedup,
|
||||
'compression_ratio': compression_ratio,
|
||||
'accuracy_delta': accuracy_delta,
|
||||
'efficiency_score': efficiency_score,
|
||||
'baseline': baseline_results.copy(),
|
||||
'optimized': optimized_results.copy()
|
||||
}
|
||||
|
||||
231
tinytorch/competition/submit.py
generated
231
tinytorch/competition/submit.py
generated
@@ -1,22 +1,8 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_submit/submit_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/20_competition/competition_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['validate_installation', 'load_baseline_model', 'generate_baseline', 'worked_example_optimization',
|
||||
'optimize_for_competition', 'generate_submission']
|
||||
'optimize_for_competition', 'validate_submission', 'generate_submission']
|
||||
|
||||
# %% ../../modules/source/20_competition/competition_dev.ipynb 4
|
||||
import numpy as np
|
||||
@@ -24,6 +10,8 @@ import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple, Any, Optional
|
||||
from ..benchmarking.benchmark import Benchmark, calculate_normalized_scores
|
||||
from ..profiling.profiler import Profiler
|
||||
|
||||
def validate_installation() -> Dict[str, bool]:
|
||||
"""
|
||||
@@ -362,31 +350,24 @@ def worked_example_optimization():
|
||||
return submission
|
||||
|
||||
# %% ../../modules/source/20_competition/competition_dev.ipynb 10
|
||||
def optimize_for_competition(baseline_model, event: str = "all_around"):
|
||||
def optimize_for_competition(baseline_model, event: str = "all_around", division: str = "closed"):
|
||||
"""
|
||||
🏅 YOUR COMPETITION ENTRY - IMPLEMENT YOUR STRATEGY HERE!
|
||||
|
||||
This is where you apply optimization techniques from Modules 14-18.
|
||||
|
||||
Available techniques:
|
||||
- Module 14: KV Caching (for transformers) - enable_kv_cache()
|
||||
- Module 16: Acceleration (vectorization, fusion)
|
||||
- Module 17: Quantization (INT8, INT4) - quantize_model()
|
||||
- Module 18: Compression (pruning) - magnitude_prune()
|
||||
|
||||
Args:
|
||||
baseline_model: The unoptimized model
|
||||
event: Which Olympic event you're competing in
|
||||
baseline_model: Starting model (use for Closed, optional for Open)
|
||||
event: Category you're competing in
|
||||
- "latency_sprint": Minimize latency
|
||||
- "memory_challenge": Minimize memory
|
||||
- "accuracy_contest": Maximize accuracy
|
||||
- "all_around": Best balance
|
||||
- "extreme_push": Most aggressive
|
||||
division: "closed" or "open" - which track you chose
|
||||
|
||||
Returns:
|
||||
Your optimized model
|
||||
|
||||
Example:
|
||||
🔒 CLOSED DIVISION Example:
|
||||
from tinytorch.optimization.quantization import quantize_model
|
||||
from tinytorch.optimization.compression import magnitude_prune
|
||||
|
||||
@@ -394,6 +375,15 @@ def optimize_for_competition(baseline_model, event: str = "all_around"):
|
||||
optimized = quantize_model(optimized, bits=8)
|
||||
optimized = magnitude_prune(optimized, sparsity=0.7)
|
||||
return optimized
|
||||
|
||||
🔓 OPEN DIVISION Example:
|
||||
# Build your own model OR
|
||||
# Use your improved implementations from earlier modules
|
||||
# (after you've modified and re-exported them)
|
||||
|
||||
from tinytorch.models import YourCustomArchitecture
|
||||
optimized = YourCustomArchitecture()
|
||||
return optimized
|
||||
"""
|
||||
|
||||
print(f"🏅 YOUR OPTIMIZATION STRATEGY FOR: {event}")
|
||||
@@ -438,74 +428,201 @@ def optimize_for_competition(baseline_model, event: str = "all_around"):
|
||||
|
||||
return optimized_model
|
||||
|
||||
#| export
|
||||
def validate_submission(submission: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Validate competition submission with sanity checks.
|
||||
|
||||
This catches honest mistakes like unrealistic speedups or accidental training.
|
||||
Honor code system - we trust but verify basic reasonableness.
|
||||
|
||||
Args:
|
||||
submission: Submission dictionary to validate
|
||||
|
||||
Returns:
|
||||
Dict with validation results and warnings
|
||||
"""
|
||||
checks = []
|
||||
warnings = []
|
||||
errors = []
|
||||
|
||||
# Extract metrics
|
||||
normalized = submission.get("normalized_scores", {})
|
||||
speedup = normalized.get("speedup", 1.0)
|
||||
compression = normalized.get("compression_ratio", 1.0)
|
||||
accuracy_delta = normalized.get("accuracy_delta", 0.0)
|
||||
|
||||
# Check 1: Speedup is reasonable (not claiming impossible gains)
|
||||
if speedup > 50:
|
||||
errors.append(f"❌ Speedup {speedup:.1f}x seems unrealistic (>50x)")
|
||||
elif speedup > 20:
|
||||
warnings.append(f"⚠️ Speedup {speedup:.1f}x is very high - please verify measurements")
|
||||
else:
|
||||
checks.append(f"✅ Speedup {speedup:.2f}x is reasonable")
|
||||
|
||||
# Check 2: Compression is reasonable
|
||||
if compression > 32:
|
||||
errors.append(f"❌ Compression {compression:.1f}x seems unrealistic (>32x)")
|
||||
elif compression > 16:
|
||||
warnings.append(f"⚠️ Compression {compression:.1f}x is very high - please verify")
|
||||
else:
|
||||
checks.append(f"✅ Compression {compression:.2f}x is reasonable")
|
||||
|
||||
# Check 3: Accuracy didn't improve (Closed Division rule - no training allowed!)
|
||||
division = submission.get("division", "closed")
|
||||
if division == "closed" and accuracy_delta > 1.0:
|
||||
errors.append(f"❌ Accuracy improved by {accuracy_delta:.1f}pp - did you accidentally train the model?")
|
||||
elif accuracy_delta > 0.5:
|
||||
warnings.append(f"⚠️ Accuracy improved by {accuracy_delta:.1f}pp - verify no training occurred")
|
||||
else:
|
||||
checks.append(f"✅ Accuracy change {accuracy_delta:+.2f}pp is reasonable")
|
||||
|
||||
# Check 4: GitHub repo provided
|
||||
github_repo = submission.get("github_repo", "")
|
||||
if not github_repo or github_repo == "":
|
||||
warnings.append("⚠️ No GitHub repo provided - required for verification")
|
||||
else:
|
||||
checks.append(f"✅ GitHub repo provided: {github_repo}")
|
||||
|
||||
# Check 5: Required fields present
|
||||
required_fields = ["division", "event", "athlete_name", "baseline", "optimized", "normalized_scores"]
|
||||
missing = [f for f in required_fields if f not in submission]
|
||||
if missing:
|
||||
errors.append(f"❌ Missing required fields: {', '.join(missing)}")
|
||||
else:
|
||||
checks.append("✅ All required fields present")
|
||||
|
||||
# Check 6: Techniques documented
|
||||
techniques = submission.get("techniques_applied", [])
|
||||
if not techniques or "TODO" in str(techniques):
|
||||
warnings.append("⚠️ No optimization techniques listed")
|
||||
else:
|
||||
checks.append(f"✅ Techniques documented: {', '.join(techniques[:3])}...")
|
||||
|
||||
return {
|
||||
"valid": len(errors) == 0,
|
||||
"checks": checks,
|
||||
"warnings": warnings,
|
||||
"errors": errors
|
||||
}
|
||||
|
||||
#| export
|
||||
def generate_submission(baseline_model, optimized_model,
|
||||
division: str = "closed",
|
||||
event: str = "all_around",
|
||||
athlete_name: str = "YourName",
|
||||
github_repo: str = "",
|
||||
techniques: List[str] = None) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate standardized competition submission.
|
||||
Generate standardized TinyMLPerf competition submission with normalized scoring.
|
||||
|
||||
Args:
|
||||
baseline_model: Original unoptimized model
|
||||
optimized_model: Your optimized model
|
||||
event: Olympic event name
|
||||
athlete_name: Your name for leaderboard
|
||||
techniques: List of techniques applied
|
||||
division: "closed" or "open"
|
||||
event: Competition category (latency_sprint, memory_challenge, all_around, etc.)
|
||||
athlete_name: Your name for submission
|
||||
github_repo: GitHub repository URL for code verification
|
||||
techniques: List of optimization techniques applied
|
||||
|
||||
Returns:
|
||||
Submission dictionary (will be saved as JSON)
|
||||
"""
|
||||
print("📤 Generating Competition Submission...")
|
||||
print("📤 Generating TinyMLPerf Competition Submission...")
|
||||
print("=" * 70)
|
||||
|
||||
# Get baseline metrics
|
||||
baseline_metrics = generate_baseline(quick=True)
|
||||
|
||||
# For demonstration, estimate optimized metrics
|
||||
# In real competition, this would benchmark the actual optimized model
|
||||
# Benchmark optimized model
|
||||
print("🔬 Benchmarking optimized model...")
|
||||
|
||||
# Placeholder: Students' actual optimizations would be measured here
|
||||
# Use Profiler and Benchmark from Module 19
|
||||
profiler = Profiler()
|
||||
|
||||
# For demonstration, we'll use placeholder metrics
|
||||
# In real competition, students would measure their actual optimized model
|
||||
optimized_metrics = {
|
||||
"model": "Your_Optimized_Model",
|
||||
"accuracy": 84.0, # Measured
|
||||
"latency_ms": 28.0, # Measured
|
||||
"memory_mb": 4.0, # Measured
|
||||
"parameters": 2000000, # Measured
|
||||
"model": getattr(optimized_model, 'name', 'Optimized_Model'),
|
||||
"accuracy": 84.0, # Would be measured with actual test set
|
||||
"latency_ms": 28.0, # Would be measured with profiler
|
||||
"memory_mb": 4.0, # Would be measured with profiler
|
||||
"parameters": 2000000, # Would be counted
|
||||
}
|
||||
|
||||
# Calculate improvements
|
||||
improvements = {
|
||||
"accuracy_change": optimized_metrics["accuracy"] - baseline_metrics["accuracy"],
|
||||
"latency_speedup": baseline_metrics["latency_ms"] / optimized_metrics["latency_ms"],
|
||||
"memory_reduction": baseline_metrics["memory_mb"] / optimized_metrics["memory_mb"],
|
||||
# Calculate normalized scores using Module 19's function
|
||||
baseline_for_norm = {
|
||||
"latency": baseline_metrics["latency_ms"],
|
||||
"memory": baseline_metrics["memory_mb"],
|
||||
"accuracy": baseline_metrics["accuracy"]
|
||||
}
|
||||
|
||||
# Create submission
|
||||
optimized_for_norm = {
|
||||
"latency": optimized_metrics["latency_ms"],
|
||||
"memory": optimized_metrics["memory_mb"],
|
||||
"accuracy": optimized_metrics["accuracy"]
|
||||
}
|
||||
|
||||
normalized_scores = calculate_normalized_scores(baseline_for_norm, optimized_for_norm)
|
||||
|
||||
# Create submission with all required fields
|
||||
submission = {
|
||||
"division": division,
|
||||
"event": event,
|
||||
"athlete_name": athlete_name,
|
||||
"github_repo": github_repo,
|
||||
"baseline": baseline_metrics,
|
||||
"optimized": optimized_metrics,
|
||||
"improvements": improvements,
|
||||
"techniques_applied": techniques or ["TODO: List your techniques"],
|
||||
"normalized_scores": {
|
||||
"speedup": normalized_scores["speedup"],
|
||||
"compression_ratio": normalized_scores["compression_ratio"],
|
||||
"accuracy_delta": normalized_scores["accuracy_delta"],
|
||||
"efficiency_score": normalized_scores["efficiency_score"]
|
||||
},
|
||||
"techniques_applied": techniques or ["TODO: Document your optimization techniques"],
|
||||
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"tinytorch_version": "0.1.0",
|
||||
"honor_code": False # Must be explicitly set to True after validation
|
||||
}
|
||||
|
||||
# Validate submission
|
||||
print("\n🔍 Validating submission...")
|
||||
validation = validate_submission(submission)
|
||||
|
||||
# Display validation results
|
||||
print("\n📋 Validation Results:")
|
||||
for check in validation["checks"]:
|
||||
print(f" {check}")
|
||||
for warning in validation["warnings"]:
|
||||
print(f" {warning}")
|
||||
for error in validation["errors"]:
|
||||
print(f" {error}")
|
||||
|
||||
if not validation["valid"]:
|
||||
print("\n❌ Submission has errors - please fix before submitting")
|
||||
return submission
|
||||
|
||||
# Save to JSON
|
||||
output_file = Path("submission.json")
|
||||
with open(output_file, "w") as f:
|
||||
json.dump(submission, f, indent=2)
|
||||
|
||||
print(f"✅ Submission saved to: {output_file}")
|
||||
print(f"\n✅ Submission saved to: {output_file}")
|
||||
print()
|
||||
print("📊 Your Results:")
|
||||
print(f" Event: {event}")
|
||||
print(f" Accuracy: {optimized_metrics['accuracy']:.1f}% (Δ {improvements['accuracy_change']:+.1f}pp)")
|
||||
print(f" Latency: {optimized_metrics['latency_ms']:.1f}ms ({improvements['latency_speedup']:.2f}x faster)")
|
||||
print(f" Memory: {optimized_metrics['memory_mb']:.2f}MB ({improvements['memory_reduction']:.2f}x smaller)")
|
||||
print("📊 Your Normalized Scores (MLPerf-style):")
|
||||
print(f" Division: {division.upper()}")
|
||||
print(f" Event: {event.replace('_', ' ').title()}")
|
||||
print(f" Speedup: {normalized_scores['speedup']:.2f}x faster ⚡")
|
||||
print(f" Compression: {normalized_scores['compression_ratio']:.2f}x smaller 💾")
|
||||
print(f" Accuracy: {optimized_metrics['accuracy']:.1f}% (Δ {normalized_scores['accuracy_delta']:+.2f}pp)")
|
||||
print(f" Efficiency: {normalized_scores['efficiency_score']:.2f}")
|
||||
print()
|
||||
print("📤 Next Steps:")
|
||||
print(" 1. Verify all metrics are correct")
|
||||
print(" 2. Push your code to GitHub (if not done)")
|
||||
print(" 3. Run: tito submit submission.json")
|
||||
print(" (This will validate and prepare final submission)")
|
||||
print()
|
||||
print("📤 Upload submission.json to TorchPerf Olympics platform!")
|
||||
print("=" * 70)
|
||||
|
||||
return submission
|
||||
|
||||
18
tinytorch/core/activations.py
generated
18
tinytorch/core/activations.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/03_activations/activations_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/02_activations/activations_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Sigmoid', 'ReLU', 'Tanh', 'GELU', 'Softmax']
|
||||
|
||||
|
||||
22
tinytorch/core/attention.py
generated
22
tinytorch/core/attention.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/07_attention/attention_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/12_attention/attention_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['scaled_dot_product_attention', 'MultiHeadAttention']
|
||||
|
||||
@@ -293,6 +279,10 @@ class MultiHeadAttention:
|
||||
return output
|
||||
### END SOLUTION
|
||||
|
||||
def __call__(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor:
|
||||
"""Allows the attention layer to be called like a function."""
|
||||
return self.forward(x, mask)
|
||||
|
||||
def parameters(self) -> List[Tensor]:
|
||||
"""
|
||||
Return all trainable parameters.
|
||||
|
||||
547
tinytorch/core/autograd.py
generated
547
tinytorch/core/autograd.py
generated
@@ -1,23 +1,8 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/09_autograd/autograd_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/05_autograd/autograd_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward', 'TransposeBackward',
|
||||
'PermuteBackward', 'EmbeddingBackward', 'ReshapeBackward', 'SumBackward', 'ReLUBackward', 'SigmoidBackward',
|
||||
'SoftmaxBackward', 'GELUBackward', 'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
|
||||
__all__ = ['Function', 'AddBackward', 'MulBackward', 'MatmulBackward', 'SumBackward', 'ReLUBackward', 'SigmoidBackward',
|
||||
'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 1
|
||||
import numpy as np
|
||||
@@ -164,66 +149,7 @@ class MulBackward(Function):
|
||||
|
||||
return grad_a, grad_b
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 13
|
||||
class SubBackward(Function):
|
||||
"""
|
||||
Gradient computation for tensor subtraction.
|
||||
|
||||
**Mathematical Rule:** If z = a - b, then ∂z/∂a = 1 and ∂z/∂b = -1
|
||||
"""
|
||||
|
||||
def apply(self, grad_output):
|
||||
"""
|
||||
Compute gradients for subtraction.
|
||||
|
||||
Returns:
|
||||
Tuple of (grad_a, grad_b) where grad_b is negated
|
||||
"""
|
||||
a, b = self.saved_tensors
|
||||
grad_a = grad_b = None
|
||||
|
||||
if isinstance(a, Tensor) and a.requires_grad:
|
||||
grad_a = grad_output # ∂(a-b)/∂a = 1
|
||||
|
||||
if isinstance(b, Tensor) and b.requires_grad:
|
||||
grad_b = -grad_output # ∂(a-b)/∂b = -1 (note the negative!)
|
||||
|
||||
return grad_a, grad_b
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 15
|
||||
class DivBackward(Function):
|
||||
"""
|
||||
Gradient computation for tensor division.
|
||||
|
||||
**Mathematical Rule:** If z = a / b, then:
|
||||
- ∂z/∂a = 1/b
|
||||
- ∂z/∂b = -a/b²
|
||||
"""
|
||||
|
||||
def apply(self, grad_output):
|
||||
"""
|
||||
Compute gradients for division using quotient rule.
|
||||
|
||||
Returns:
|
||||
Tuple of (grad_a, grad_b)
|
||||
"""
|
||||
a, b = self.saved_tensors
|
||||
grad_a = grad_b = None
|
||||
|
||||
if isinstance(a, Tensor) and a.requires_grad:
|
||||
# ∂(a/b)/∂a = 1/b
|
||||
if isinstance(b, Tensor):
|
||||
grad_a = grad_output / b.data
|
||||
else:
|
||||
grad_a = grad_output / b
|
||||
|
||||
if isinstance(b, Tensor) and b.requires_grad:
|
||||
# ∂(a/b)/∂b = -a/b²
|
||||
grad_b = -grad_output * a.data / (b.data ** 2)
|
||||
|
||||
return grad_a, grad_b
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 17
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 14
|
||||
class MatmulBackward(Function):
|
||||
"""
|
||||
Gradient computation for matrix multiplication.
|
||||
@@ -252,242 +178,21 @@ class MatmulBackward(Function):
|
||||
**Mathematical Foundation:**
|
||||
- ∂(A@B)/∂A = grad_output @ B.T
|
||||
- ∂(A@B)/∂B = A.T @ grad_output
|
||||
|
||||
**Batched Operation:** For 3D+ tensors, we transpose only the last two
|
||||
dimensions using np.swapaxes, preserving batch dimensions.
|
||||
"""
|
||||
a, b = self.saved_tensors
|
||||
grad_a = grad_b = None
|
||||
|
||||
# Gradient for first input: grad_output @ b.T
|
||||
if isinstance(a, Tensor) and a.requires_grad:
|
||||
# For batched tensors, transpose only last two dims
|
||||
if b.data.ndim >= 2:
|
||||
b_T = np.swapaxes(b.data, -2, -1)
|
||||
else:
|
||||
b_T = b.data.T
|
||||
grad_a = np.matmul(grad_output, b_T)
|
||||
grad_a = np.dot(grad_output, b.data.T)
|
||||
|
||||
# Gradient for second input: a.T @ grad_output
|
||||
if isinstance(b, Tensor) and b.requires_grad:
|
||||
# For batched tensors, transpose only last two dims
|
||||
if a.data.ndim >= 2:
|
||||
a_T = np.swapaxes(a.data, -2, -1)
|
||||
else:
|
||||
a_T = a.data.T
|
||||
grad_b = np.matmul(a_T, grad_output)
|
||||
grad_b = np.dot(a.data.T, grad_output)
|
||||
|
||||
return grad_a, grad_b
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 18
|
||||
class TransposeBackward(Function):
|
||||
"""
|
||||
Gradient computation for transpose operation.
|
||||
|
||||
**Mathematical Rule:** If Y = X.T, then:
|
||||
- ∂Y/∂X = grad_Y.T
|
||||
|
||||
**Key Insight:** The gradient of transpose is just transpose the gradient!
|
||||
This is because transpose is a linear operation that just rearranges elements.
|
||||
|
||||
**Applications:** Used in attention (K.T for scores), weight gradients (W.T),
|
||||
and any operation that needs to swap matrix dimensions.
|
||||
"""
|
||||
|
||||
def __init__(self, tensor, dim0, dim1):
|
||||
"""
|
||||
Args:
|
||||
tensor: Input tensor
|
||||
dim0: First dimension to swap (None for default)
|
||||
dim1: Second dimension to swap (None for default)
|
||||
"""
|
||||
super().__init__(tensor)
|
||||
self.dim0 = dim0
|
||||
self.dim1 = dim1
|
||||
|
||||
def apply(self, grad_output):
|
||||
"""
|
||||
Compute gradient for transpose.
|
||||
|
||||
Args:
|
||||
grad_output: Gradient flowing backward from output
|
||||
|
||||
Returns:
|
||||
Tuple with single gradient for input tensor
|
||||
|
||||
**Mathematical Foundation:**
|
||||
- ∂(X.T)/∂X = grad_output.T
|
||||
- Just transpose the gradient back!
|
||||
"""
|
||||
x, = self.saved_tensors
|
||||
grad_x = None
|
||||
|
||||
if isinstance(x, Tensor) and x.requires_grad:
|
||||
# Transpose gradient using the same dims
|
||||
if self.dim0 is None and self.dim1 is None:
|
||||
# Default: transpose last two dimensions
|
||||
if grad_output.ndim < 2:
|
||||
grad_x = grad_output.copy()
|
||||
else:
|
||||
axes = list(range(grad_output.ndim))
|
||||
axes[-2], axes[-1] = axes[-1], axes[-2]
|
||||
grad_x = np.transpose(grad_output, axes)
|
||||
else:
|
||||
# Specific dimensions: swap them back
|
||||
axes = list(range(grad_output.ndim))
|
||||
axes[self.dim0], axes[self.dim1] = axes[self.dim1], axes[self.dim0]
|
||||
grad_x = np.transpose(grad_output, axes)
|
||||
|
||||
return (grad_x,)
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 19
|
||||
class PermuteBackward(Function):
|
||||
"""
|
||||
Gradient computation for arbitrary axis permutation (general transpose).
|
||||
|
||||
**Mathematical Rule:** If Y = X.permute(axes), then:
|
||||
- ∂Y/∂X = grad_Y.permute(inverse_axes)
|
||||
|
||||
**Example:** If axes = (0, 2, 1, 3), the inverse is (0, 2, 1, 3) (self-inverse).
|
||||
More generally, if axes = (2, 0, 1), the inverse is (1, 2, 0).
|
||||
|
||||
**Key Insight:** To reverse a permutation, we need to know where each axis went.
|
||||
If axis i went to position axes[i], then in the inverse, position axes[i] should go to i.
|
||||
|
||||
**Applications:** Multi-head attention uses (0, 2, 1, 3) to rearrange heads.
|
||||
"""
|
||||
|
||||
def __init__(self, tensor, axes):
|
||||
"""
|
||||
Args:
|
||||
tensor: Input tensor
|
||||
axes: Tuple of axis indices defining the permutation
|
||||
"""
|
||||
super().__init__(tensor)
|
||||
self.axes = axes
|
||||
# Compute inverse permutation: if axes[i] = j, then inverse_axes[j] = i
|
||||
self.inverse_axes = tuple(np.argsort(axes))
|
||||
|
||||
def apply(self, grad_output):
|
||||
"""
|
||||
Compute gradient for permutation.
|
||||
|
||||
The gradient is permuted back using the inverse permutation.
|
||||
|
||||
**Mathematical Foundation:**
|
||||
- ∂(X.permute(axes))/∂X = grad_output.permute(inverse_axes)
|
||||
"""
|
||||
x, = self.saved_tensors
|
||||
grad_x = None
|
||||
|
||||
if isinstance(x, Tensor) and x.requires_grad:
|
||||
# Permute gradient back to original axis order
|
||||
grad_x = np.transpose(grad_output, self.inverse_axes)
|
||||
|
||||
return (grad_x,)
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 20
|
||||
class EmbeddingBackward(Function):
|
||||
"""
|
||||
Gradient computation for embedding lookup operation.
|
||||
|
||||
**Mathematical Rule:** If Y = Embedding[indices], then:
|
||||
- ∂Loss/∂Embedding[i] = sum of all gradients where index==i
|
||||
|
||||
**Key Insight:** Embedding lookup is a gather operation. The backward
|
||||
is a scatter operation that accumulates gradients to the embedding weights.
|
||||
|
||||
**Applications:** Word embeddings, positional embeddings, token embeddings
|
||||
in transformers.
|
||||
"""
|
||||
|
||||
def __init__(self, weight, indices):
|
||||
"""
|
||||
Args:
|
||||
weight: Embedding weight matrix
|
||||
indices: Indices used for lookup
|
||||
"""
|
||||
super().__init__(weight)
|
||||
self.indices = indices
|
||||
|
||||
def apply(self, grad_output):
|
||||
"""
|
||||
Compute gradient for embedding lookup.
|
||||
|
||||
Args:
|
||||
grad_output: Gradient flowing backward from output
|
||||
|
||||
Returns:
|
||||
Tuple with single gradient for weight tensor
|
||||
|
||||
**Mathematical Foundation:**
|
||||
- ∂(Embedding[indices])/∂Embedding = scatter gradients to selected rows
|
||||
- Multiple indices can point to same embedding → gradients accumulate
|
||||
"""
|
||||
weight, = self.saved_tensors
|
||||
grad_weight = None
|
||||
|
||||
if isinstance(weight, Tensor) and weight.requires_grad:
|
||||
# Initialize gradient with zeros
|
||||
grad_weight = np.zeros_like(weight.data)
|
||||
|
||||
# Scatter gradients back to embedding weights
|
||||
# np.add.at accumulates gradients for repeated indices
|
||||
indices_flat = self.indices.data.astype(int).flatten()
|
||||
grad_output_reshaped = grad_output.reshape(-1, grad_output.shape[-1])
|
||||
|
||||
np.add.at(grad_weight, indices_flat, grad_output_reshaped)
|
||||
|
||||
return (grad_weight,)
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 21
|
||||
class ReshapeBackward(Function):
|
||||
"""
|
||||
Gradient computation for reshape operation.
|
||||
|
||||
**Mathematical Rule:** If Y = X.reshape(new_shape), then:
|
||||
- ∂Y/∂X = grad_Y.reshape(X.shape)
|
||||
|
||||
**Key Insight:** Reshape just rearranges the same elements.
|
||||
The gradient is simply reshaped back to the original shape!
|
||||
|
||||
**Applications:** Flattening tensors for linear layers, reshaping
|
||||
between convolutional and dense layers.
|
||||
"""
|
||||
|
||||
def __init__(self, tensor, original_shape):
|
||||
"""
|
||||
Args:
|
||||
tensor: Input tensor
|
||||
original_shape: Shape before reshape
|
||||
"""
|
||||
super().__init__(tensor)
|
||||
self.original_shape = original_shape
|
||||
|
||||
def apply(self, grad_output):
|
||||
"""
|
||||
Compute gradient for reshape.
|
||||
|
||||
Args:
|
||||
grad_output: Gradient flowing backward from output
|
||||
|
||||
Returns:
|
||||
Tuple with single gradient for input tensor
|
||||
|
||||
**Mathematical Foundation:**
|
||||
- ∂(X.reshape(...))/∂X = grad_output.reshape(X.shape)
|
||||
- Just reshape the gradient back!
|
||||
"""
|
||||
x, = self.saved_tensors
|
||||
grad_x = None
|
||||
|
||||
if isinstance(x, Tensor) and x.requires_grad:
|
||||
# Reshape gradient back to original shape
|
||||
grad_x = grad_output.reshape(self.original_shape)
|
||||
|
||||
return (grad_x,)
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 16
|
||||
class SumBackward(Function):
|
||||
"""
|
||||
Gradient computation for tensor sum.
|
||||
@@ -521,7 +226,7 @@ class SumBackward(Function):
|
||||
return np.ones_like(tensor.data) * grad_output,
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23
|
||||
class ReLUBackward(Function):
|
||||
"""
|
||||
Gradient computation for ReLU activation.
|
||||
@@ -544,7 +249,7 @@ class ReLUBackward(Function):
|
||||
return grad_output * relu_grad,
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 25
|
||||
class SigmoidBackward(Function):
|
||||
"""
|
||||
Gradient computation for sigmoid activation.
|
||||
@@ -574,101 +279,7 @@ class SigmoidBackward(Function):
|
||||
return grad_output * sigmoid_grad,
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 30
|
||||
class SoftmaxBackward(Function):
|
||||
"""
|
||||
Gradient computation for softmax activation.
|
||||
|
||||
Softmax: softmax(x)[i] = exp(x[i]) / sum(exp(x))
|
||||
Derivative: ∂softmax/∂x[i] = softmax[i] * (δ[i,j] - softmax[j])
|
||||
|
||||
For gradient computation:
|
||||
grad_x[i] = softmax[i] * (grad_y[i] - sum(grad_y * softmax))
|
||||
|
||||
**Key Insight:** The gradient depends on all elements of softmax due to
|
||||
the normalization, not just the element being differentiated.
|
||||
"""
|
||||
|
||||
def __init__(self, input_tensor, output_tensor, dim=-1):
|
||||
"""
|
||||
Initialize with input, output, and dimension.
|
||||
|
||||
Args:
|
||||
input_tensor: Original input to softmax
|
||||
output_tensor: Output of softmax (needed for gradient)
|
||||
dim: Dimension along which softmax was applied
|
||||
"""
|
||||
super().__init__(input_tensor)
|
||||
self.output_data = output_tensor.data
|
||||
self.dim = dim
|
||||
|
||||
def apply(self, grad_output):
|
||||
"""
|
||||
Compute gradient for softmax.
|
||||
|
||||
Mathematical formula:
|
||||
∂L/∂x[i] = softmax[i] * (∂L/∂y[i] - sum_j(∂L/∂y[j] * softmax[j]))
|
||||
|
||||
This can be vectorized as:
|
||||
grad_x = softmax * (grad_y - sum(grad_y * softmax, keepdims=True))
|
||||
"""
|
||||
tensor, = self.saved_tensors
|
||||
|
||||
if isinstance(tensor, Tensor) and tensor.requires_grad:
|
||||
# Compute sum(grad_output * softmax) along the softmax dimension
|
||||
sum_term = np.sum(grad_output * self.output_data, axis=self.dim, keepdims=True)
|
||||
|
||||
# Softmax gradient: softmax * (grad_output - sum_term)
|
||||
grad_x = self.output_data * (grad_output - sum_term)
|
||||
|
||||
return (grad_x,)
|
||||
return (None,)
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 31
|
||||
class GELUBackward(Function):
|
||||
"""
|
||||
Gradient computation for GELU activation.
|
||||
|
||||
GELU: f(x) = x * Φ(x) where Φ is the CDF of standard normal
|
||||
Approximation: gelu(x) ≈ 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))
|
||||
|
||||
**Key Insight:** GELU is smoother than ReLU, providing non-zero gradients
|
||||
for negative values, which helps training deep networks.
|
||||
"""
|
||||
|
||||
def __init__(self, input_tensor):
|
||||
"""Initialize with input tensor."""
|
||||
super().__init__(input_tensor)
|
||||
|
||||
def apply(self, grad_output):
|
||||
"""
|
||||
Compute gradient for GELU.
|
||||
|
||||
Mathematical formula (using approximation):
|
||||
∂gelu/∂x ≈ 0.5 * (1 + tanh(...)) + 0.5 * x * sech²(...) * (...)
|
||||
|
||||
Simplified: We compute the derivative numerically or use the formula.
|
||||
"""
|
||||
tensor, = self.saved_tensors
|
||||
|
||||
if isinstance(tensor, Tensor) and tensor.requires_grad:
|
||||
x = tensor.data
|
||||
# GELU derivative approximation
|
||||
# Using the tanh approximation: gelu(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
|
||||
sqrt_2_over_pi = np.sqrt(2.0 / np.pi)
|
||||
x_cubed = x ** 3
|
||||
tanh_arg = sqrt_2_over_pi * (x + 0.044715 * x_cubed)
|
||||
tanh_out = np.tanh(tanh_arg)
|
||||
sech_squared = 1 - tanh_out ** 2
|
||||
|
||||
# Derivative: 0.5 * (1 + tanh(...)) + 0.5 * x * sech²(...) * d(tanh_arg)/dx
|
||||
d_tanh_arg = sqrt_2_over_pi * (1 + 0.134145 * x ** 2)
|
||||
gelu_grad = 0.5 * (1 + tanh_out) + 0.5 * x * sech_squared * d_tanh_arg
|
||||
|
||||
return (grad_output * gelu_grad,)
|
||||
return (None,)
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 32
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 26
|
||||
class MSEBackward(Function):
|
||||
"""
|
||||
Gradient computation for Mean Squared Error Loss.
|
||||
@@ -694,7 +305,7 @@ class MSEBackward(Function):
|
||||
return grad * grad_output,
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 33
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 27
|
||||
class BCEBackward(Function):
|
||||
"""
|
||||
Gradient computation for Binary Cross-Entropy Loss.
|
||||
@@ -724,7 +335,7 @@ class BCEBackward(Function):
|
||||
return grad * grad_output,
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 34
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28
|
||||
class CrossEntropyBackward(Function):
|
||||
"""
|
||||
Gradient computation for Cross-Entropy Loss.
|
||||
@@ -769,7 +380,7 @@ class CrossEntropyBackward(Function):
|
||||
return grad * grad_output,
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 35
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29
|
||||
def enable_autograd():
|
||||
"""
|
||||
Enable gradient tracking for all Tensor operations.
|
||||
@@ -806,12 +417,8 @@ def enable_autograd():
|
||||
|
||||
# Store original operations
|
||||
_original_add = Tensor.__add__
|
||||
_original_sub = Tensor.__sub__
|
||||
_original_mul = Tensor.__mul__
|
||||
_original_div = Tensor.__truediv__
|
||||
_original_matmul = Tensor.matmul if hasattr(Tensor, 'matmul') else None
|
||||
_original_transpose = Tensor.transpose if hasattr(Tensor, 'transpose') else None
|
||||
_original_reshape = Tensor.reshape if hasattr(Tensor, 'reshape') else None
|
||||
|
||||
# Enhanced operations that track gradients
|
||||
def tracked_add(self, other):
|
||||
@@ -878,98 +485,6 @@ def enable_autograd():
|
||||
|
||||
return result
|
||||
|
||||
def tracked_transpose(self, dim0=None, dim1=None):
|
||||
"""
|
||||
Transpose with gradient tracking.
|
||||
|
||||
Enhances the original transpose method to build computation graphs
|
||||
when requires_grad=True for the input.
|
||||
"""
|
||||
if _original_transpose:
|
||||
result = _original_transpose(self, dim0, dim1)
|
||||
else:
|
||||
# Fallback if transpose doesn't exist
|
||||
if dim0 is None and dim1 is None:
|
||||
axes = list(range(len(self.shape)))
|
||||
if len(axes) >= 2:
|
||||
axes[-2], axes[-1] = axes[-1], axes[-2]
|
||||
result = Tensor(np.transpose(self.data, axes))
|
||||
else:
|
||||
axes = list(range(len(self.shape)))
|
||||
axes[dim0], axes[dim1] = axes[dim1], axes[dim0]
|
||||
result = Tensor(np.transpose(self.data, axes))
|
||||
|
||||
# Track gradient if needed
|
||||
if self.requires_grad:
|
||||
result.requires_grad = True
|
||||
result._grad_fn = TransposeBackward(self, dim0, dim1)
|
||||
|
||||
return result
|
||||
|
||||
def tracked_reshape(self, *shape):
|
||||
"""
|
||||
Reshape with gradient tracking.
|
||||
|
||||
Enhances the original reshape method to build computation graphs
|
||||
when requires_grad=True for the input.
|
||||
"""
|
||||
original_shape = self.shape
|
||||
|
||||
if _original_reshape:
|
||||
result = _original_reshape(self, *shape)
|
||||
else:
|
||||
# Fallback if reshape doesn't exist
|
||||
result = Tensor(self.data.reshape(*shape))
|
||||
|
||||
# Track gradient if needed
|
||||
if self.requires_grad:
|
||||
result.requires_grad = True
|
||||
result._grad_fn = ReshapeBackward(self, original_shape)
|
||||
|
||||
return result
|
||||
|
||||
def tracked_sub(self, other):
|
||||
"""
|
||||
Subtraction with gradient tracking.
|
||||
|
||||
Enhances the original __sub__ method to build computation graphs
|
||||
when requires_grad=True for any input.
|
||||
"""
|
||||
# Convert scalar to Tensor if needed
|
||||
if not isinstance(other, Tensor):
|
||||
other = Tensor(other)
|
||||
|
||||
# Call original operation
|
||||
result = _original_sub(self, other)
|
||||
|
||||
# Track gradient if needed
|
||||
if self.requires_grad or other.requires_grad:
|
||||
result.requires_grad = True
|
||||
result._grad_fn = SubBackward(self, other)
|
||||
|
||||
return result
|
||||
|
||||
def tracked_div(self, other):
|
||||
"""
|
||||
Division with gradient tracking.
|
||||
|
||||
Enhances the original __truediv__ method to build computation graphs
|
||||
when requires_grad=True for any input.
|
||||
"""
|
||||
# Convert scalar to Tensor if needed
|
||||
if not isinstance(other, Tensor):
|
||||
other = Tensor(other)
|
||||
|
||||
# Call original operation
|
||||
result = _original_div(self, other)
|
||||
|
||||
# Track gradient if needed
|
||||
if self.requires_grad or other.requires_grad:
|
||||
result.requires_grad = True
|
||||
result._grad_fn = DivBackward(self, other)
|
||||
|
||||
return result
|
||||
|
||||
def sum_op(self, axis=None, keepdims=False):
|
||||
"""
|
||||
Sum operation with gradient tracking.
|
||||
@@ -1058,26 +573,20 @@ def enable_autograd():
|
||||
|
||||
# Install enhanced operations
|
||||
Tensor.__add__ = tracked_add
|
||||
Tensor.__sub__ = tracked_sub
|
||||
Tensor.__mul__ = tracked_mul
|
||||
Tensor.__truediv__ = tracked_div
|
||||
Tensor.matmul = tracked_matmul
|
||||
Tensor.transpose = tracked_transpose
|
||||
Tensor.reshape = tracked_reshape
|
||||
Tensor.sum = sum_op
|
||||
Tensor.backward = backward
|
||||
Tensor.zero_grad = zero_grad
|
||||
|
||||
# Patch activations and losses to track gradients
|
||||
try:
|
||||
from tinytorch.core.activations import Sigmoid, ReLU, Softmax, GELU
|
||||
from tinytorch.core.activations import Sigmoid, ReLU
|
||||
from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss, CrossEntropyLoss
|
||||
|
||||
# Store original methods
|
||||
_original_sigmoid_forward = Sigmoid.forward
|
||||
_original_relu_forward = ReLU.forward
|
||||
_original_softmax_forward = Softmax.forward
|
||||
_original_gelu_forward = GELU.forward
|
||||
_original_bce_forward = BinaryCrossEntropyLoss.forward
|
||||
_original_mse_forward = MSELoss.forward
|
||||
_original_ce_forward = CrossEntropyLoss.forward
|
||||
@@ -1104,30 +613,6 @@ def enable_autograd():
|
||||
|
||||
return result
|
||||
|
||||
def tracked_softmax_forward(self, x, dim=-1):
|
||||
"""Softmax with gradient tracking."""
|
||||
# Call original forward to get result using Tensor operations
|
||||
result = _original_softmax_forward(self, x, dim=dim)
|
||||
|
||||
# Attach the correct gradient function
|
||||
if x.requires_grad:
|
||||
result.requires_grad = True
|
||||
result._grad_fn = SoftmaxBackward(x, result, dim)
|
||||
|
||||
return result
|
||||
|
||||
def tracked_gelu_forward(self, x):
|
||||
"""GELU with gradient tracking."""
|
||||
# Call original forward to get result
|
||||
result = _original_gelu_forward(self, x)
|
||||
|
||||
# Attach the correct gradient function
|
||||
if x.requires_grad:
|
||||
result.requires_grad = True
|
||||
result._grad_fn = GELUBackward(x)
|
||||
|
||||
return result
|
||||
|
||||
def tracked_bce_forward(self, predictions, targets):
|
||||
"""Binary cross-entropy with gradient tracking."""
|
||||
# Compute BCE loss
|
||||
@@ -1187,8 +672,6 @@ def enable_autograd():
|
||||
# Install patched methods
|
||||
Sigmoid.forward = tracked_sigmoid_forward
|
||||
ReLU.forward = tracked_relu_forward
|
||||
Softmax.forward = tracked_softmax_forward
|
||||
GELU.forward = tracked_gelu_forward
|
||||
BinaryCrossEntropyLoss.forward = tracked_bce_forward
|
||||
MSELoss.forward = tracked_mse_forward
|
||||
CrossEntropyLoss.forward = tracked_ce_forward
|
||||
|
||||
18
tinytorch/core/layers.py
generated
18
tinytorch/core/layers.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/04_layers/layers_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/03_layers/layers_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Linear', 'Dropout']
|
||||
|
||||
|
||||
18
tinytorch/core/losses.py
generated
18
tinytorch/core/losses.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_losses/losses_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/04_losses/losses_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['import_previous_module', 'log_softmax', 'MSELoss', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss']
|
||||
|
||||
|
||||
18
tinytorch/core/optimizers.py
generated
18
tinytorch/core/optimizers.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/10_optimizers/optimizers_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/06_optimizers/optimizers_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Optimizer', 'SGD', 'Adam', 'AdamW']
|
||||
|
||||
|
||||
18
tinytorch/core/spatial.py
generated
18
tinytorch/core/spatial.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/06_spatial/spatial_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/09_spatial/spatial_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Conv2d', 'MaxPool2d', 'AvgPool2d', 'SimpleCNN']
|
||||
|
||||
|
||||
18
tinytorch/core/tensor.py
generated
18
tinytorch/core/tensor.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/02_tensor/tensor_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/01_tensor/tensor_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Tensor']
|
||||
|
||||
|
||||
123
tinytorch/core/training.py
generated
123
tinytorch/core/training.py
generated
@@ -1,21 +1,7 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/11_training/training_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/07_training/training_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['CosineSchedule', 'Trainer']
|
||||
__all__ = ['CosineSchedule', 'save_checkpoint', 'load_checkpoint', 'Trainer']
|
||||
|
||||
# %% ../../modules/source/07_training/training_dev.ipynb 1
|
||||
import numpy as np
|
||||
@@ -72,6 +58,90 @@ class CosineSchedule:
|
||||
### END SOLUTION
|
||||
|
||||
# %% ../../modules/source/07_training/training_dev.ipynb 14
|
||||
def save_checkpoint(checkpoint_dict: Dict[str, Any], path: str):
|
||||
"""
|
||||
Save checkpoint dictionary to disk using pickle.
|
||||
|
||||
This is a low-level utility for saving model state. Use this when you have
|
||||
a custom training loop and want to save just what you need (model params,
|
||||
config, metadata).
|
||||
|
||||
For complete training state with optimizer and scheduler, use
|
||||
Trainer.save_checkpoint() instead.
|
||||
|
||||
TODO: Implement checkpoint saving with pickle
|
||||
|
||||
APPROACH:
|
||||
1. Create parent directory if it doesn't exist (Path(path).parent.mkdir)
|
||||
2. Open file in binary write mode ('wb')
|
||||
3. Use pickle.dump() to serialize the checkpoint dictionary
|
||||
4. Print confirmation message
|
||||
|
||||
EXAMPLE:
|
||||
>>> model = SimpleModel()
|
||||
>>> checkpoint = {
|
||||
... 'model_params': [p.data.copy() for p in model.parameters()],
|
||||
... 'config': {'embed_dim': 32, 'num_layers': 2},
|
||||
... 'metadata': {'final_loss': 0.089, 'training_steps': 5000}
|
||||
... }
|
||||
>>> save_checkpoint(checkpoint, 'checkpoints/model.pkl')
|
||||
✓ Checkpoint saved: checkpoints/model.pkl
|
||||
|
||||
HINTS:
|
||||
- Use Path(path).parent.mkdir(parents=True, exist_ok=True)
|
||||
- pickle.dump(obj, file) writes the object to file
|
||||
- Always print a success message so users know it worked
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Create parent directory if needed
|
||||
Path(path).parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save checkpoint using pickle
|
||||
with open(path, 'wb') as f:
|
||||
pickle.dump(checkpoint_dict, f)
|
||||
|
||||
print(f"✓ Checkpoint saved: {path}")
|
||||
### END SOLUTION
|
||||
|
||||
# %% ../../modules/source/07_training/training_dev.ipynb 15
|
||||
def load_checkpoint(path: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Load checkpoint dictionary from disk using pickle.
|
||||
|
||||
Companion function to save_checkpoint(). Restores the checkpoint dictionary
|
||||
so you can rebuild your model, resume training, or inspect saved metadata.
|
||||
|
||||
TODO: Implement checkpoint loading with pickle
|
||||
|
||||
APPROACH:
|
||||
1. Open file in binary read mode ('rb')
|
||||
2. Use pickle.load() to deserialize the checkpoint
|
||||
3. Print confirmation message
|
||||
4. Return the loaded dictionary
|
||||
|
||||
EXAMPLE:
|
||||
>>> checkpoint = load_checkpoint('checkpoints/model.pkl')
|
||||
✓ Checkpoint loaded: checkpoints/model.pkl
|
||||
>>> print(checkpoint['metadata']['final_loss'])
|
||||
0.089
|
||||
>>> model_params = checkpoint['model_params']
|
||||
>>> # Now restore model: for param, data in zip(model.parameters(), model_params)...
|
||||
|
||||
HINTS:
|
||||
- pickle.load(file) reads and deserializes the object
|
||||
- Return the loaded dictionary
|
||||
- Print a success message for user feedback
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Load checkpoint using pickle
|
||||
with open(path, 'rb') as f:
|
||||
checkpoint = pickle.load(f)
|
||||
|
||||
print(f"✓ Checkpoint loaded: {path}")
|
||||
return checkpoint
|
||||
### END SOLUTION
|
||||
|
||||
# %% ../../modules/source/07_training/training_dev.ipynb 19
|
||||
class Trainer:
|
||||
"""
|
||||
Complete training orchestrator for neural networks.
|
||||
@@ -246,6 +316,11 @@ class Trainer:
|
||||
def save_checkpoint(self, path: str):
|
||||
"""
|
||||
Save complete training state for resumption.
|
||||
|
||||
This high-level method saves everything needed to resume training:
|
||||
model parameters, optimizer state, scheduler state, and training history.
|
||||
|
||||
Uses the low-level save_checkpoint() function internally.
|
||||
|
||||
Args:
|
||||
path: File path to save checkpoint
|
||||
@@ -260,19 +335,23 @@ class Trainer:
|
||||
'training_mode': self.training_mode
|
||||
}
|
||||
|
||||
Path(path).parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(path, 'wb') as f:
|
||||
pickle.dump(checkpoint, f)
|
||||
# Use the standalone save_checkpoint function
|
||||
save_checkpoint(checkpoint, path)
|
||||
|
||||
def load_checkpoint(self, path: str):
|
||||
"""
|
||||
Load training state from checkpoint.
|
||||
|
||||
This high-level method restores complete training state including
|
||||
model parameters, optimizer state, scheduler state, and history.
|
||||
|
||||
Uses the low-level load_checkpoint() function internally.
|
||||
|
||||
Args:
|
||||
path: File path to load checkpoint from
|
||||
"""
|
||||
with open(path, 'rb') as f:
|
||||
checkpoint = pickle.load(f)
|
||||
# Use the standalone load_checkpoint function
|
||||
checkpoint = load_checkpoint(path)
|
||||
|
||||
self.epoch = checkpoint['epoch']
|
||||
self.step = checkpoint['step']
|
||||
|
||||
18
tinytorch/data/loader.py
generated
18
tinytorch/data/loader.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_loader/loader_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/08_dataloader/dataloader_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Dataset', 'TensorDataset', 'DataLoader']
|
||||
|
||||
|
||||
18
tinytorch/generation/kv_cache.py
generated
18
tinytorch/generation/kv_cache.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_kv_cache/kv_cache_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/15_memoization/memoization_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['KVCache', 'enable_kv_cache', 'disable_kv_cache']
|
||||
|
||||
|
||||
45
tinytorch/models/transformer.py
generated
45
tinytorch/models/transformer.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_transformer/transformer_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/13_transformers/transformers_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['LayerNorm', 'MLP', 'TransformerBlock', 'GPT']
|
||||
|
||||
@@ -23,7 +9,6 @@ from ..core.tensor import Tensor
|
||||
from ..core.layers import Linear
|
||||
from ..core.attention import MultiHeadAttention
|
||||
from ..core.activations import GELU
|
||||
from ..text.embeddings import Embedding, PositionalEncoding
|
||||
|
||||
# %% ../../modules/source/13_transformers/transformers_dev.ipynb 9
|
||||
class LayerNorm:
|
||||
@@ -61,6 +46,7 @@ class LayerNorm:
|
||||
self.eps = eps
|
||||
|
||||
# Learnable parameters: scale and shift
|
||||
# CRITICAL: requires_grad=True so optimizer can train these!
|
||||
self.gamma = Tensor(np.ones(normalized_shape), requires_grad=True) # Scale parameter
|
||||
self.beta = Tensor(np.zeros(normalized_shape), requires_grad=True) # Shift parameter
|
||||
### END SOLUTION
|
||||
@@ -83,19 +69,18 @@ class LayerNorm:
|
||||
HINT: Use keepdims=True to maintain tensor dimensions for broadcasting
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# CRITICAL: Use Tensor operations (not .data) to maintain gradient flow!
|
||||
# Compute statistics across last dimension (features)
|
||||
mean = x.mean(axis=-1, keepdims=True)
|
||||
|
||||
# Compute variance: E[(x - μ)²]
|
||||
# Use Tensor operations to preserve computation graph!
|
||||
diff = x - mean
|
||||
variance = (diff * diff).mean(axis=-1, keepdims=True)
|
||||
diff = x - mean # Tensor subtraction maintains gradient
|
||||
variance = (diff * diff).mean(axis=-1, keepdims=True) # Tensor ops maintain gradient
|
||||
|
||||
# Normalize - use Tensor operations to preserve gradients!
|
||||
# Add eps as a Tensor for proper gradient flow
|
||||
eps_tensor = Tensor(np.array(self.eps), requires_grad=False)
|
||||
std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)
|
||||
normalized = (x - mean) / std
|
||||
# Normalize: (x - mean) / sqrt(variance + eps)
|
||||
# Note: sqrt and division need to preserve gradient flow
|
||||
std_data = np.sqrt(variance.data + self.eps)
|
||||
normalized = diff * Tensor(1.0 / std_data) # Scale by reciprocal to maintain gradient
|
||||
|
||||
# Apply learnable transformation
|
||||
output = normalized * self.gamma + self.beta
|
||||
@@ -103,7 +88,7 @@ class LayerNorm:
|
||||
### END SOLUTION
|
||||
|
||||
def __call__(self, x):
|
||||
"""Allows the layer norm to be called like a function."""
|
||||
"""Allows the layer to be called like a function."""
|
||||
return self.forward(x)
|
||||
|
||||
def parameters(self):
|
||||
@@ -147,7 +132,7 @@ class MLP:
|
||||
|
||||
# Two-layer feed-forward network
|
||||
self.linear1 = Linear(embed_dim, hidden_dim)
|
||||
self.gelu = GELU() # Use GELU activation from activations module
|
||||
self.gelu = GELU()
|
||||
self.linear2 = Linear(hidden_dim, embed_dim)
|
||||
### END SOLUTION
|
||||
|
||||
@@ -171,7 +156,7 @@ class MLP:
|
||||
# First linear layer with expansion
|
||||
hidden = self.linear1.forward(x)
|
||||
|
||||
# GELU activation (YOUR activation from Module 03!)
|
||||
# GELU activation
|
||||
hidden = self.gelu.forward(hidden)
|
||||
|
||||
# Second linear layer back to original size
|
||||
@@ -404,10 +389,6 @@ class GPT:
|
||||
return logits
|
||||
### END SOLUTION
|
||||
|
||||
def __call__(self, tokens):
|
||||
"""Allows the GPT model to be called like a function."""
|
||||
return self.forward(tokens)
|
||||
|
||||
def _create_causal_mask(self, seq_len):
|
||||
"""Create causal mask to prevent attending to future positions."""
|
||||
### BEGIN SOLUTION
|
||||
|
||||
18
tinytorch/optimization/acceleration.py
generated
18
tinytorch/optimization/acceleration.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_acceleration/acceleration_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/18_acceleration/acceleration_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = []
|
||||
|
||||
|
||||
339
tinytorch/optimization/compression.py
generated
339
tinytorch/optimization/compression.py
generated
@@ -1,22 +1,7 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_compression/compression_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/17_compression/compression_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Sequential', 'KnowledgeDistillation', 'test_unit_knowledge_distillation', 'CompressionComplete', 'measure_sparsity',
|
||||
'magnitude_prune', 'structured_prune', 'compress_model']
|
||||
__all__ = ['Tensor', 'Linear', 'Sequential']
|
||||
|
||||
# %% ../../modules/source/17_compression/compression_dev.ipynb 1
|
||||
import numpy as np
|
||||
@@ -24,277 +9,77 @@ import copy
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
import time
|
||||
|
||||
# Import from TinyTorch modules
|
||||
from ..core.tensor import Tensor
|
||||
from ..core.layers import Linear
|
||||
# Import from previous modules
|
||||
# Note: In the full package, these would be imports like:
|
||||
# from tinytorch.core.tensor import Tensor
|
||||
# from tinytorch.core.layers import Linear
|
||||
# For development, we'll create minimal implementations
|
||||
|
||||
class Tensor:
|
||||
"""Minimal Tensor class for compression development - imports from Module 01 in practice."""
|
||||
def __init__(self, data, requires_grad=False):
|
||||
self.data = np.array(data)
|
||||
self.shape = self.data.shape
|
||||
self.size = self.data.size
|
||||
self.requires_grad = requires_grad
|
||||
self.grad = None
|
||||
|
||||
def __add__(self, other):
|
||||
if isinstance(other, Tensor):
|
||||
return Tensor(self.data + other.data)
|
||||
return Tensor(self.data + other)
|
||||
|
||||
def __mul__(self, other):
|
||||
if isinstance(other, Tensor):
|
||||
return Tensor(self.data * other.data)
|
||||
return Tensor(self.data * other)
|
||||
|
||||
def matmul(self, other):
|
||||
return Tensor(np.dot(self.data, other.data))
|
||||
|
||||
def abs(self):
|
||||
return Tensor(np.abs(self.data))
|
||||
|
||||
def sum(self, axis=None):
|
||||
return Tensor(self.data.sum(axis=axis))
|
||||
|
||||
def __repr__(self):
|
||||
return f"Tensor(shape={self.shape})"
|
||||
|
||||
class Linear:
|
||||
"""Minimal Linear layer for compression development - imports from Module 03 in practice."""
|
||||
def __init__(self, in_features, out_features, bias=True):
|
||||
self.in_features = in_features
|
||||
self.out_features = out_features
|
||||
# Initialize with He initialization
|
||||
self.weight = Tensor(np.random.randn(in_features, out_features) * np.sqrt(2.0 / in_features))
|
||||
self.bias = Tensor(np.zeros(out_features)) if bias else None
|
||||
|
||||
def forward(self, x):
|
||||
output = x.matmul(self.weight)
|
||||
if self.bias is not None:
|
||||
output = output + self.bias
|
||||
return output
|
||||
|
||||
def parameters(self):
|
||||
params = [self.weight]
|
||||
if self.bias is not None:
|
||||
params.append(self.bias)
|
||||
return params
|
||||
|
||||
# Sequential container for model compression
|
||||
class Sequential:
|
||||
"""Sequential container for compression (not exported from core layers)."""
|
||||
"""Minimal Sequential container for model compression."""
|
||||
def __init__(self, *layers):
|
||||
self.layers = list(layers)
|
||||
|
||||
def forward(self, x):
|
||||
for layer in self.layers:
|
||||
x = layer.forward(x) if hasattr(layer, 'forward') else layer(x)
|
||||
x = layer.forward(x)
|
||||
return x
|
||||
|
||||
def __call__(self, x):
|
||||
return self.forward(x)
|
||||
|
||||
def parameters(self):
|
||||
params = []
|
||||
for layer in self.layers:
|
||||
if hasattr(layer, 'parameters'):
|
||||
params.extend(layer.parameters())
|
||||
return params
|
||||
|
||||
# %% ../../modules/source/17_compression/compression_dev.ipynb 15
|
||||
class KnowledgeDistillation:
|
||||
"""
|
||||
Knowledge distillation for model compression.
|
||||
|
||||
Train a smaller student model to mimic a larger teacher model.
|
||||
"""
|
||||
|
||||
def __init__(self, teacher_model, student_model, temperature=3.0, alpha=0.7):
|
||||
"""
|
||||
Initialize knowledge distillation.
|
||||
|
||||
TODO: Set up teacher and student models with distillation parameters
|
||||
|
||||
APPROACH:
|
||||
1. Store teacher and student models
|
||||
2. Set temperature for softening probability distributions
|
||||
3. Set alpha for balancing hard vs soft targets
|
||||
|
||||
EXAMPLE:
|
||||
>>> teacher = Sequential(Linear(100, 200), Linear(200, 50))
|
||||
>>> student = Sequential(Linear(100, 50))
|
||||
>>> kd = KnowledgeDistillation(teacher, student, temperature=4.0, alpha=0.8)
|
||||
>>> print(f"Temperature: {kd.temperature}, Alpha: {kd.alpha}")
|
||||
Temperature: 4.0, Alpha: 0.8
|
||||
|
||||
HINTS:
|
||||
- Simply assign the parameters to instance variables
|
||||
- Temperature typically ranges from 3-5 for effective softening
|
||||
- Alpha of 0.7 means 70% soft targets, 30% hard targets
|
||||
|
||||
Args:
|
||||
teacher_model: Large, pre-trained model
|
||||
student_model: Smaller model to train
|
||||
temperature: Softening parameter for distributions
|
||||
alpha: Weight for soft target loss (1-alpha for hard targets)
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
self.teacher_model = teacher_model
|
||||
self.student_model = student_model
|
||||
self.temperature = temperature
|
||||
self.alpha = alpha
|
||||
### END SOLUTION
|
||||
|
||||
def distillation_loss(self, student_logits, teacher_logits, true_labels):
|
||||
"""
|
||||
Calculate combined distillation loss.
|
||||
|
||||
TODO: Implement knowledge distillation loss function
|
||||
|
||||
APPROACH:
|
||||
1. Calculate hard target loss (student vs true labels)
|
||||
2. Calculate soft target loss (student vs teacher, with temperature)
|
||||
3. Combine losses: alpha * soft_loss + (1-alpha) * hard_loss
|
||||
|
||||
EXAMPLE:
|
||||
>>> kd = KnowledgeDistillation(teacher, student)
|
||||
>>> loss = kd.distillation_loss(student_out, teacher_out, labels)
|
||||
>>> print(f"Distillation loss: {loss:.4f}")
|
||||
|
||||
HINTS:
|
||||
- Use temperature to soften distributions: logits/temperature
|
||||
- Soft targets use KL divergence or cross-entropy
|
||||
- Hard targets use standard classification loss
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
# Convert to numpy for this implementation
|
||||
if hasattr(student_logits, 'data'):
|
||||
student_logits = student_logits.data
|
||||
if hasattr(teacher_logits, 'data'):
|
||||
teacher_logits = teacher_logits.data
|
||||
if hasattr(true_labels, 'data'):
|
||||
true_labels = true_labels.data
|
||||
|
||||
# Soften distributions with temperature
|
||||
student_soft = self._softmax(student_logits / self.temperature)
|
||||
teacher_soft = self._softmax(teacher_logits / self.temperature)
|
||||
|
||||
# Soft target loss (KL divergence)
|
||||
soft_loss = self._kl_divergence(student_soft, teacher_soft)
|
||||
|
||||
# Hard target loss (cross-entropy)
|
||||
student_hard = self._softmax(student_logits)
|
||||
hard_loss = self._cross_entropy(student_hard, true_labels)
|
||||
|
||||
# Combined loss
|
||||
total_loss = self.alpha * soft_loss + (1 - self.alpha) * hard_loss
|
||||
|
||||
return total_loss
|
||||
### END SOLUTION
|
||||
|
||||
def _softmax(self, logits):
|
||||
"""Compute softmax with numerical stability."""
|
||||
exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
|
||||
return exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
|
||||
|
||||
def _kl_divergence(self, p, q):
|
||||
"""Compute KL divergence between distributions."""
|
||||
return np.sum(p * np.log(p / (q + 1e-8) + 1e-8))
|
||||
|
||||
def _cross_entropy(self, predictions, labels):
|
||||
"""Compute cross-entropy loss."""
|
||||
# Simple implementation for integer labels
|
||||
if labels.ndim == 1:
|
||||
return -np.mean(np.log(predictions[np.arange(len(labels)), labels] + 1e-8))
|
||||
else:
|
||||
return -np.mean(np.sum(labels * np.log(predictions + 1e-8), axis=1))
|
||||
|
||||
def test_unit_knowledge_distillation():
|
||||
"""🔬 Test knowledge distillation functionality."""
|
||||
print("🔬 Unit Test: Knowledge Distillation...")
|
||||
|
||||
# Create teacher and student models
|
||||
teacher = Sequential(Linear(10, 20), Linear(20, 5))
|
||||
student = Sequential(Linear(10, 5)) # Smaller model
|
||||
|
||||
# Initialize knowledge distillation
|
||||
kd = KnowledgeDistillation(teacher, student, temperature=3.0, alpha=0.7)
|
||||
|
||||
# Create dummy data
|
||||
input_data = Tensor(np.random.randn(8, 10)) # Batch of 8
|
||||
true_labels = np.array([0, 1, 2, 3, 4, 0, 1, 2]) # Class labels
|
||||
|
||||
# Forward passes
|
||||
teacher_output = teacher.forward(input_data)
|
||||
student_output = student.forward(input_data)
|
||||
|
||||
# Calculate distillation loss
|
||||
loss = kd.distillation_loss(student_output, teacher_output, true_labels)
|
||||
|
||||
# Verify loss is reasonable
|
||||
assert isinstance(loss, (float, np.floating)), f"Loss should be float, got {type(loss)}"
|
||||
assert loss > 0, f"Loss should be positive, got {loss}"
|
||||
assert not np.isnan(loss), "Loss should not be NaN"
|
||||
|
||||
print("✅ knowledge_distillation works correctly!")
|
||||
|
||||
test_unit_knowledge_distillation()
|
||||
|
||||
# %% ../../modules/source/17_compression/compression_dev.ipynb 29
|
||||
class CompressionComplete:
|
||||
"""
|
||||
Complete compression system for milestone use.
|
||||
|
||||
Provides pruning, distillation, and low-rank approximation techniques.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def measure_sparsity(model) -> float:
|
||||
"""Measure the sparsity of a model (fraction of zero weights)."""
|
||||
total_params = 0
|
||||
zero_params = 0
|
||||
|
||||
if hasattr(model, 'parameters'):
|
||||
for param in model.parameters():
|
||||
total_params += param.size
|
||||
zero_params += np.sum(param.data == 0)
|
||||
|
||||
return zero_params / total_params if total_params > 0 else 0.0
|
||||
|
||||
@staticmethod
|
||||
def magnitude_prune(model, sparsity=0.5):
|
||||
"""
|
||||
Prune model weights by magnitude (smallest weights set to zero).
|
||||
|
||||
Args:
|
||||
model: Model with parameters() method
|
||||
sparsity: Fraction of weights to prune (0-1)
|
||||
"""
|
||||
if hasattr(model, 'parameters'):
|
||||
for param in model.parameters():
|
||||
threshold = np.percentile(np.abs(param.data), sparsity * 100)
|
||||
param.data[np.abs(param.data) < threshold] = 0
|
||||
|
||||
return model
|
||||
|
||||
@staticmethod
|
||||
def structured_prune(model, prune_ratio=0.5):
|
||||
"""
|
||||
Prune entire neurons/channels (structured pruning).
|
||||
|
||||
Args:
|
||||
model: Model to prune
|
||||
prune_ratio: Fraction of structures to prune (0-1)
|
||||
"""
|
||||
if hasattr(model, 'parameters'):
|
||||
params = list(model.parameters())
|
||||
if len(params) > 0 and hasattr(params[0], 'data'):
|
||||
weight = params[0]
|
||||
if len(weight.shape) == 2: # Linear layer
|
||||
# Prune output neurons
|
||||
neuron_norms = np.linalg.norm(weight.data, axis=0)
|
||||
threshold = np.percentile(neuron_norms, prune_ratio * 100)
|
||||
mask = neuron_norms >= threshold
|
||||
weight.data[:, ~mask] = 0
|
||||
|
||||
return model
|
||||
|
||||
@staticmethod
|
||||
def compress_model(model, compression_config: Dict[str, Any]):
|
||||
"""
|
||||
Apply complete compression pipeline to a model.
|
||||
|
||||
Args:
|
||||
model: Model to compress
|
||||
compression_config: Dictionary with compression settings
|
||||
- 'magnitude_sparsity': float (0-1)
|
||||
- 'structured_prune_ratio': float (0-1)
|
||||
|
||||
Returns:
|
||||
Compressed model with sparsity stats
|
||||
"""
|
||||
stats = {
|
||||
'original_sparsity': CompressionComplete.measure_sparsity(model)
|
||||
}
|
||||
|
||||
# Apply magnitude pruning
|
||||
if 'magnitude_sparsity' in compression_config:
|
||||
model = CompressionComplete.magnitude_prune(
|
||||
model, compression_config['magnitude_sparsity']
|
||||
)
|
||||
|
||||
# Apply structured pruning
|
||||
if 'structured_prune_ratio' in compression_config:
|
||||
model = CompressionComplete.structured_prune(
|
||||
model, compression_config['structured_prune_ratio']
|
||||
)
|
||||
|
||||
stats['final_sparsity'] = CompressionComplete.measure_sparsity(model)
|
||||
stats['compression_ratio'] = 1.0 / (1.0 - stats['final_sparsity']) if stats['final_sparsity'] < 1.0 else float('inf')
|
||||
|
||||
return model, stats
|
||||
|
||||
# Convenience functions for backward compatibility
|
||||
def measure_sparsity(model) -> float:
|
||||
"""Measure model sparsity."""
|
||||
return CompressionComplete.measure_sparsity(model)
|
||||
|
||||
def magnitude_prune(model, sparsity=0.5):
|
||||
"""Apply magnitude-based pruning."""
|
||||
return CompressionComplete.magnitude_prune(model, sparsity)
|
||||
|
||||
def structured_prune(model, prune_ratio=0.5):
|
||||
"""Apply structured pruning."""
|
||||
return CompressionComplete.structured_prune(model, prune_ratio)
|
||||
|
||||
def compress_model(model, compression_config: Dict[str, Any]):
|
||||
"""Apply complete compression pipeline."""
|
||||
return CompressionComplete.compress_model(model, compression_config)
|
||||
|
||||
111
tinytorch/optimization/quantization.py
generated
111
tinytorch/optimization/quantization.py
generated
@@ -1,21 +1,7 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_quantization/quantization_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/16_quantization/quantization_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = []
|
||||
__all__ = ['QuantizationComplete', 'quantize_int8', 'dequantize_int8', 'quantize_model']
|
||||
|
||||
# %% ../../modules/source/16_quantization/quantization_dev.ipynb 3
|
||||
import numpy as np
|
||||
@@ -29,3 +15,94 @@ from ..core.layers import Linear
|
||||
from ..core.activations import ReLU
|
||||
|
||||
print("✅ Quantization module imports complete")
|
||||
|
||||
# %% ../../modules/source/16_quantization/quantization_dev.ipynb 34
|
||||
class QuantizationComplete:
    """
    Complete quantization system for milestone use.

    Provides asymmetric INT8 quantization with calibration for 4× memory reduction.
    """

    @staticmethod
    def quantize_tensor(tensor: Tensor) -> Tuple[Tensor, float, int]:
        """Quantize FP32 tensor to INT8.

        Uses asymmetric affine quantization: q = round(x / scale + zero_point),
        clipped to the signed INT8 range [-128, 127].

        Args:
            tensor: Input tensor with FP32 values in ``tensor.data``.

        Returns:
            Tuple of (INT8 tensor, scale, zero_point) such that
            ``dequantize_tensor`` approximately recovers the original values.
        """
        data = tensor.data
        min_val = float(np.min(data))
        max_val = float(np.max(data))

        if abs(max_val - min_val) < 1e-8:
            # Constant tensor: the generic scale formula would collapse to 0.
            # Pick scale/zero_point so dequantization recovers the constant
            # value instead of silently mapping it to 0.
            if abs(min_val) > 1e-8:
                scale = abs(min_val) / 127.0
            else:
                scale = 1.0
            zero_point = int(np.clip(np.round(-min_val / scale), -128, 127))
            quantized_data = np.clip(
                np.round(data / scale + zero_point), -128, 127
            ).astype(np.int8)
            return Tensor(quantized_data), scale, zero_point

        scale = (max_val - min_val) / 255.0
        # Map min_val onto -128; clip in case rounding pushes it out of range.
        zero_point = int(np.round(-128 - min_val / scale))
        zero_point = int(np.clip(zero_point, -128, 127))

        quantized_data = np.round(data / scale + zero_point)
        quantized_data = np.clip(quantized_data, -128, 127).astype(np.int8)

        return Tensor(quantized_data), scale, zero_point

    @staticmethod
    def dequantize_tensor(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
        """Dequantize INT8 tensor back to FP32.

        Args:
            q_tensor: INT8 tensor produced by ``quantize_tensor``.
            scale: Scale factor returned by ``quantize_tensor``.
            zero_point: Zero point returned by ``quantize_tensor``.

        Returns:
            FP32 tensor approximating the original values.
        """
        dequantized_data = (q_tensor.data.astype(np.float32) - zero_point) * scale
        return Tensor(dequantized_data)

    @staticmethod
    def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, Any]:
        """
        Quantize all Linear layers in a model.

        Args:
            model: Model exposing a ``parameters()`` iterable of Tensors.
            calibration_data: Reserved for activation calibration; currently
                unused — weights are quantized directly from their own ranges.

        Returns:
            Dictionary with quantization info and memory savings.
        """
        quantized_layers = {}
        original_size = 0
        quantized_size = 0

        # Quantize every parameter independently (per-tensor scheme).
        if hasattr(model, 'parameters'):
            for i, param in enumerate(model.parameters()):
                param_size = param.data.nbytes
                original_size += param_size

                # Quantize parameter
                q_param, scale, zp = QuantizationComplete.quantize_tensor(param)
                quantized_size += q_param.data.nbytes

                quantized_layers[f'param_{i}'] = {
                    'quantized': q_param,
                    'scale': scale,
                    'zero_point': zp,
                    'original_shape': param.data.shape
                }

        return {
            'quantized_layers': quantized_layers,
            'original_size_mb': original_size / (1024 * 1024),
            'quantized_size_mb': quantized_size / (1024 * 1024),
            # Guard against empty models (nothing quantized).
            'compression_ratio': original_size / quantized_size if quantized_size > 0 else 1.0
        }

    @staticmethod
    def compare_models(original_model, quantized_info: Dict) -> Dict[str, float]:
        """Compare memory usage between original and quantized models.

        Note: ``original_model`` is not inspected — all figures come from
        ``quantized_info`` (parameter kept for interface compatibility).
        """
        return {
            'original_mb': quantized_info['original_size_mb'],
            'quantized_mb': quantized_info['quantized_size_mb'],
            'compression_ratio': quantized_info['compression_ratio'],
            'memory_saved_mb': quantized_info['original_size_mb'] - quantized_info['quantized_size_mb']
        }
|
||||
|
||||
# Convenience functions for backward compatibility
|
||||
def quantize_int8(tensor: Tensor) -> Tuple[Tensor, float, int]:
    """Convert an FP32 tensor into its INT8 representation.

    Backward-compatible wrapper; delegates to
    ``QuantizationComplete.quantize_tensor``.
    """
    quantized, scale, zero_point = QuantizationComplete.quantize_tensor(tensor)
    return quantized, scale, zero_point
|
||||
|
||||
def dequantize_int8(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
    """Recover an FP32 tensor from its INT8 representation.

    Backward-compatible wrapper; delegates to
    ``QuantizationComplete.dequantize_tensor``.
    """
    restored = QuantizationComplete.dequantize_tensor(q_tensor, scale, zero_point)
    return restored
|
||||
|
||||
def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, Any]:
    """Quantize entire model to INT8.

    Backward-compatible wrapper around ``QuantizationComplete.quantize_model``.

    Args:
        model: Model exposing a ``parameters()`` iterable of Tensors.
        calibration_data: Optional calibration inputs (currently unused).

    Returns:
        Dict with per-parameter quantization info and memory statistics.
    """
    return QuantizationComplete.quantize_model(model, calibration_data)
|
||||
|
||||
18
tinytorch/profiling/profiler.py
generated
18
tinytorch/profiling/profiler.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_profiler/profiler_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/14_profiling/profiling_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Profiler', 'quick_profile', 'analyze_weight_distribution']
|
||||
|
||||
|
||||
31
tinytorch/text/embeddings.py
generated
31
tinytorch/text/embeddings.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_embeddings/embeddings_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/11_embeddings/embeddings_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Embedding', 'PositionalEncoding', 'EmbeddingLayer']
|
||||
|
||||
@@ -95,13 +81,10 @@ class Embedding:
|
||||
# This is equivalent to one-hot multiplication but much more efficient
|
||||
embedded = self.weight.data[indices.data.astype(int)]
|
||||
|
||||
# Create result tensor
|
||||
# Create result tensor with gradient tracking
|
||||
# Note: Gradient computation handled by autograd system (Module 05)
|
||||
# The embedding lookup is differentiable through the weight matrix
|
||||
result = Tensor(embedded, requires_grad=self.weight.requires_grad)
|
||||
|
||||
# Attach gradient function (students learned this in Module 05!)
|
||||
if self.weight.requires_grad:
|
||||
from tinytorch.core.autograd import EmbeddingBackward
|
||||
result._grad_fn = EmbeddingBackward(self.weight, indices)
|
||||
|
||||
return result
|
||||
|
||||
@@ -336,10 +319,6 @@ class EmbeddingLayer:
|
||||
|
||||
return output
|
||||
|
||||
def __call__(self, tokens: Tensor) -> Tensor:
    """Allows the embedding layer to be called like a function."""
    # Delegates to forward() so the layer follows the callable-module
    # convention: layer(tokens) is equivalent to layer.forward(tokens).
    return self.forward(tokens)
|
||||
|
||||
def parameters(self) -> List[Tensor]:
|
||||
"""Return all trainable parameters."""
|
||||
params = self.token_embedding.parameters()
|
||||
|
||||
28
tinytorch/text/tokenization.py
generated
28
tinytorch/text/tokenization.py
generated
@@ -1,19 +1,5 @@
|
||||
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||||
# ║ ║
|
||||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||||
# ║ ║
|
||||
# ║ ✅ TO EDIT: modules/source/XX_tokenization/tokenization_dev.py ║
|
||||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||||
# ║ ║
|
||||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||||
# ║ Editing it directly may break module functionality and training. ║
|
||||
# ║ ║
|
||||
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/10_tokenization/tokenization_dev.ipynb.
|
||||
|
||||
# %% auto 0
|
||||
__all__ = ['Tokenizer', 'CharTokenizer', 'BPETokenizer']
|
||||
|
||||
@@ -24,16 +10,6 @@ import json
|
||||
import re
|
||||
from collections import defaultdict, Counter
|
||||
|
||||
# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 3
|
||||
import numpy as np
|
||||
from typing import List, Dict, Tuple, Optional, Set
|
||||
import json
|
||||
import re
|
||||
from collections import defaultdict, Counter
|
||||
|
||||
# Import only Module 01 (Tensor) - this module has minimal dependencies
|
||||
from ..core.tensor import Tensor
|
||||
|
||||
# %% ../../modules/source/10_tokenization/tokenization_dev.ipynb 8
|
||||
class Tokenizer:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user