diff --git a/tinytorch/_modidx.py b/tinytorch/_modidx.py
index 3cd9a0a8..88d63238 100644
--- a/tinytorch/_modidx.py
+++ b/tinytorch/_modidx.py
@@ -21,7 +21,37 @@ d = { 'settings': { 'branch': 'main',
                 'doc_host': 'https://tinytorch.github.io',
                 'git_url': 'https://github.com/tinytorch/TinyTorch/',
                 'lib_path': 'tinytorch'},
-  'syms': { 'tinytorch.benchmarking.benchmark': { 'tinytorch.benchmarking.benchmark.Benchmark': ( '19_benchmarking/benchmarking_dev.html#benchmark',
+  'syms': { 'tinytorch.applications.tinygpt': { 'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline': ( '20_capstone/capstone_dev.html#completetinygptpipeline',
+                                                                                                            'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.__init__': ( '20_capstone/capstone_dev.html#completetinygptpipeline.__init__',
+                                                                                                                     'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.generate_text': ( '20_capstone/capstone_dev.html#completetinygptpipeline.generate_text',
+                                                                                                                          'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.optimize_model': ( '20_capstone/capstone_dev.html#completetinygptpipeline.optimize_model',
+                                                                                                                           'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.prepare_training_data': ( '20_capstone/capstone_dev.html#completetinygptpipeline.prepare_training_data',
+                                                                                                                                  'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.CompleteTinyGPTPipeline.train': ( '20_capstone/capstone_dev.html#completetinygptpipeline.train',
+                                                                                                                  'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.TinyGPT': ( '20_capstone/capstone_dev.html#tinygpt',
+                                                                                            'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.TinyGPT.__init__': ( '20_capstone/capstone_dev.html#tinygpt.__init__',
+                                                                                                     'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.TinyGPTTrainer': ( '20_capstone/capstone_dev.html#tinygpttrainer',
+                                                                                                   'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.TinyGPTTrainer.__init__': ( '20_capstone/capstone_dev.html#tinygpttrainer.__init__',
+                                                                                                            'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.TinyGPTTrainer.prepare_batch': ( '20_capstone/capstone_dev.html#tinygpttrainer.prepare_batch',
+                                                                                                                 'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.TinyGPTTrainer.train_step': ( '20_capstone/capstone_dev.html#tinygpttrainer.train_step',
+                                                                                                              'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.test_unit_complete_pipeline': ( '20_capstone/capstone_dev.html#test_unit_complete_pipeline',
+                                                                                                                'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.test_unit_tinygpt_init': ( '20_capstone/capstone_dev.html#test_unit_tinygpt_init',
+                                                                                                           'tinytorch/applications/tinygpt.py'),
+                                                'tinytorch.applications.tinygpt.test_unit_training_pipeline': ( '20_capstone/capstone_dev.html#test_unit_training_pipeline',
+                                                                                                                'tinytorch/applications/tinygpt.py')},
+            'tinytorch.benchmarking.benchmark': { 'tinytorch.benchmarking.benchmark.Benchmark': ( '19_benchmarking/benchmarking_dev.html#benchmark',
                                                                                                   'tinytorch/benchmarking/benchmark.py'),
                                                   'tinytorch.benchmarking.benchmark.Benchmark.__init__': ( '19_benchmarking/benchmarking_dev.html#benchmark.__init__',
                                                                                                            'tinytorch/benchmarking/benchmark.py'),
@@ -59,8 +89,6 @@ d = { 'settings': { 'branch': 'main',
                                                                                                                       'tinytorch/benchmarking/benchmark.py'),
                                                   'tinytorch.benchmarking.benchmark.TinyMLPerf.run_standard_benchmark': ( '19_benchmarking/benchmarking_dev.html#tinymlperf.run_standard_benchmark',
                                                                                                                           'tinytorch/benchmarking/benchmark.py'),
-                                                  'tinytorch.benchmarking.benchmark.calculate_normalized_scores': ( '19_benchmarking/benchmarking_dev.html#calculate_normalized_scores',
-                                                                                                                    'tinytorch/benchmarking/benchmark.py'),
                                                   'tinytorch.benchmarking.benchmark.test_unit_benchmark': ( '19_benchmarking/benchmarking_dev.html#test_unit_benchmark',
                                                                                                             'tinytorch/benchmarking/benchmark.py'),
                                                   'tinytorch.benchmarking.benchmark.test_unit_benchmark_suite': ( '19_benchmarking/benchmarking_dev.html#test_unit_benchmark_suite',
@@ -77,8 +105,6 @@ d = { 'settings': { 'branch': 'main',
                                                                                                          'tinytorch/competition/submit.py'),
                                               'tinytorch.competition.submit.validate_installation': ( '20_competition/competition_dev.html#validate_installation',
                                                                                                       'tinytorch/competition/submit.py'),
-                                              'tinytorch.competition.submit.validate_submission': ( '20_competition/competition_dev.html#validate_submission',
-                                                                                                    'tinytorch/competition/submit.py'),
                                               'tinytorch.competition.submit.worked_example_optimization': ( '20_competition/competition_dev.html#worked_example_optimization',
                                                                                                             'tinytorch/competition/submit.py')},
             'tinytorch.core.activations': { 'tinytorch.core.activations.GELU': ( '02_activations/activations_dev.html#gelu',
@@ -315,11 +341,7 @@ d = { 'settings': { 'branch': 'main',
                                          'tinytorch.core.training.Trainer.save_checkpoint': ( '07_training/training_dev.html#trainer.save_checkpoint',
                                                                                               'tinytorch/core/training.py'),
                                          'tinytorch.core.training.Trainer.train_epoch': ( '07_training/training_dev.html#trainer.train_epoch',
-                                                                                          'tinytorch/core/training.py'),
-                                         'tinytorch.core.training.load_checkpoint': ( '07_training/training_dev.html#load_checkpoint',
-                                                                                      'tinytorch/core/training.py'),
-                                         'tinytorch.core.training.save_checkpoint': ( '07_training/training_dev.html#save_checkpoint',
-                                                                                      'tinytorch/core/training.py')},
+                                                                                          'tinytorch/core/training.py')},
             'tinytorch.data.loader': { 'tinytorch.data.loader.DataLoader': ( '08_dataloader/dataloader_dev.html#dataloader',
                                                                              'tinytorch/data/loader.py'),
                                        'tinytorch.data.loader.DataLoader.__init__': ( '08_dataloader/dataloader_dev.html#dataloader.__init__',
@@ -364,6 +386,8 @@ d = { 'settings': { 'branch': 'main',
                                                                                                   'tinytorch/generation/kv_cache.py')},
             'tinytorch.models.transformer': { 'tinytorch.models.transformer.GPT': ( '13_transformers/transformers_dev.html#gpt',
                                                                                     'tinytorch/models/transformer.py'),
+                                              'tinytorch.models.transformer.GPT.__call__': ( '13_transformers/transformers_dev.html#gpt.__call__',
+                                                                                             'tinytorch/models/transformer.py'),
                                               'tinytorch.models.transformer.GPT.__init__': ( '13_transformers/transformers_dev.html#gpt.__init__',
                                                                                              'tinytorch/models/transformer.py'),
                                               'tinytorch.models.transformer.GPT._create_causal_mask': ( '13_transformers/transformers_dev.html#gpt._create_causal_mask',
@@ -376,6 +400,8 @@ d = { 'settings': { 'branch': 'main',
                                                                                                'tinytorch/models/transformer.py'),
                                               'tinytorch.models.transformer.LayerNorm': ( '13_transformers/transformers_dev.html#layernorm',
                                                                                           'tinytorch/models/transformer.py'),
+                                              'tinytorch.models.transformer.LayerNorm.__call__': ( '13_transformers/transformers_dev.html#layernorm.__call__',
+                                                                                                   'tinytorch/models/transformer.py'),
                                               'tinytorch.models.transformer.LayerNorm.__init__': ( '13_transformers/transformers_dev.html#layernorm.__init__',
                                                                                                    'tinytorch/models/transformer.py'),
                                               'tinytorch.models.transformer.LayerNorm.forward': ( '13_transformers/transformers_dev.html#layernorm.forward',
@@ -384,6 +410,8 @@ d = { 'settings': { 'branch': 'main',
                                                                                                      'tinytorch/models/transformer.py'),
                                               'tinytorch.models.transformer.MLP': ( '13_transformers/transformers_dev.html#mlp',
                                                                                     'tinytorch/models/transformer.py'),
+                                              'tinytorch.models.transformer.MLP.__call__': ( '13_transformers/transformers_dev.html#mlp.__call__',
+                                                                                             'tinytorch/models/transformer.py'),
                                               'tinytorch.models.transformer.MLP.__init__': ( '13_transformers/transformers_dev.html#mlp.__init__',
                                                                                              'tinytorch/models/transformer.py'),
                                               'tinytorch.models.transformer.MLP.forward': ( '13_transformers/transformers_dev.html#mlp.forward',
@@ -392,32 +420,58 @@ d = { 'settings': { 'branch': 'main',
                                                                                                'tinytorch/models/transformer.py'),
                                               'tinytorch.models.transformer.TransformerBlock': ( '13_transformers/transformers_dev.html#transformerblock',
                                                                                                  'tinytorch/models/transformer.py'),
+                                              'tinytorch.models.transformer.TransformerBlock.__call__': ( '13_transformers/transformers_dev.html#transformerblock.__call__',
+                                                                                                          'tinytorch/models/transformer.py'),
                                               'tinytorch.models.transformer.TransformerBlock.__init__': ( '13_transformers/transformers_dev.html#transformerblock.__init__',
                                                                                                           'tinytorch/models/transformer.py'),
                                               'tinytorch.models.transformer.TransformerBlock.forward': ( '13_transformers/transformers_dev.html#transformerblock.forward',
                                                                                                          'tinytorch/models/transformer.py'),
                                               'tinytorch.models.transformer.TransformerBlock.parameters': ( '13_transformers/transformers_dev.html#transformerblock.parameters',
-                                                                                                            'tinytorch/models/transformer.py'),
-                                              'tinytorch.models.transformer._tensor_mean': ( '13_transformers/transformers_dev.html#_tensor_mean',
-                                                                                             'tinytorch/models/transformer.py'),
-                                              'tinytorch.models.transformer._tensor_sqrt': ( '13_transformers/transformers_dev.html#_tensor_sqrt',
-                                                                                             'tinytorch/models/transformer.py')},
-            'tinytorch.optimization.quantization': { 'tinytorch.optimization.quantization.QuantizationComplete': ( '17_quantization/quantization_dev.html#quantizationcomplete',
-                                                                                                                   'tinytorch/optimization/quantization.py'),
-                                                     'tinytorch.optimization.quantization.QuantizationComplete.compare_models': ( '17_quantization/quantization_dev.html#quantizationcomplete.compare_models',
-                                                                                                                                  'tinytorch/optimization/quantization.py'),
-                                                     'tinytorch.optimization.quantization.QuantizationComplete.dequantize_tensor': ( '17_quantization/quantization_dev.html#quantizationcomplete.dequantize_tensor',
-                                                                                                                                     'tinytorch/optimization/quantization.py'),
-                                                     'tinytorch.optimization.quantization.QuantizationComplete.quantize_model': ( '17_quantization/quantization_dev.html#quantizationcomplete.quantize_model',
-                                                                                                                                  'tinytorch/optimization/quantization.py'),
-                                                     'tinytorch.optimization.quantization.QuantizationComplete.quantize_tensor': ( '17_quantization/quantization_dev.html#quantizationcomplete.quantize_tensor',
-                                                                                                                                   'tinytorch/optimization/quantization.py'),
-                                                     'tinytorch.optimization.quantization.dequantize_int8': ( '17_quantization/quantization_dev.html#dequantize_int8',
-                                                                                                              'tinytorch/optimization/quantization.py'),
-                                                     'tinytorch.optimization.quantization.quantize_int8': ( '17_quantization/quantization_dev.html#quantize_int8',
-                                                                                                            'tinytorch/optimization/quantization.py'),
-                                                     'tinytorch.optimization.quantization.quantize_model': ( '17_quantization/quantization_dev.html#quantize_model',
-                                                                                                             'tinytorch/optimization/quantization.py')},
+                                                                                                            'tinytorch/models/transformer.py')},
+            'tinytorch.optimization.acceleration': {},
+            'tinytorch.optimization.compression': { 'tinytorch.optimization.compression.CompressionComplete': ( '17_compression/compression_dev.html#compressioncomplete',
+                                                                                                                'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.CompressionComplete.compress_model': ( '17_compression/compression_dev.html#compressioncomplete.compress_model',
+                                                                                                                               'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.CompressionComplete.magnitude_prune': ( '17_compression/compression_dev.html#compressioncomplete.magnitude_prune',
+                                                                                                                                'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.CompressionComplete.measure_sparsity': ( '17_compression/compression_dev.html#compressioncomplete.measure_sparsity',
+                                                                                                                                 'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.CompressionComplete.structured_prune': ( '17_compression/compression_dev.html#compressioncomplete.structured_prune',
+                                                                                                                                 'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.KnowledgeDistillation': ( '17_compression/compression_dev.html#knowledgedistillation',
+                                                                                                                  'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.KnowledgeDistillation.__init__': ( '17_compression/compression_dev.html#knowledgedistillation.__init__',
+                                                                                                                           'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.KnowledgeDistillation._cross_entropy': ( '17_compression/compression_dev.html#knowledgedistillation._cross_entropy',
+                                                                                                                                 'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.KnowledgeDistillation._kl_divergence': ( '17_compression/compression_dev.html#knowledgedistillation._kl_divergence',
+                                                                                                                                 'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.KnowledgeDistillation._softmax': ( '17_compression/compression_dev.html#knowledgedistillation._softmax',
+                                                                                                                           'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.KnowledgeDistillation.distillation_loss': ( '17_compression/compression_dev.html#knowledgedistillation.distillation_loss',
+                                                                                                                                    'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.Sequential': ( '17_compression/compression_dev.html#sequential',
+                                                                                                       'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.Sequential.__call__': ( '17_compression/compression_dev.html#sequential.__call__',
+                                                                                                                'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.Sequential.__init__': ( '17_compression/compression_dev.html#sequential.__init__',
+                                                                                                                'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.Sequential.forward': ( '17_compression/compression_dev.html#sequential.forward',
+                                                                                                               'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.Sequential.parameters': ( '17_compression/compression_dev.html#sequential.parameters',
+                                                                                                                  'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.compress_model': ( '17_compression/compression_dev.html#compress_model',
+                                                                                                           'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.magnitude_prune': ( '17_compression/compression_dev.html#magnitude_prune',
+                                                                                                            'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.measure_sparsity': ( '17_compression/compression_dev.html#measure_sparsity',
+                                                                                                             'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.structured_prune': ( '17_compression/compression_dev.html#structured_prune',
+                                                                                                             'tinytorch/optimization/compression.py'),
+                                                    'tinytorch.optimization.compression.test_unit_knowledge_distillation': ( '17_compression/compression_dev.html#test_unit_knowledge_distillation',
+                                                                                                                             'tinytorch/optimization/compression.py')},
+            'tinytorch.optimization.quantization': {},
             'tinytorch.profiling.profiler': { 'tinytorch.profiling.profiler.Profiler': ( '14_profiling/profiling_dev.html#profiler',
                                                                                          'tinytorch/profiling/profiler.py'),
                                               'tinytorch.profiling.profiler.Profiler.__init__': ( '14_profiling/profiling_dev.html#profiler.__init__',
@@ -442,6 +496,8 @@ d = { 'settings': { 'branch': 'main',
                                                                                               'tinytorch/profiling/profiler.py')},
             'tinytorch.text.embeddings': { 'tinytorch.text.embeddings.Embedding': ( '11_embeddings/embeddings_dev.html#embedding',
                                                                                     'tinytorch/text/embeddings.py'),
+                                           'tinytorch.text.embeddings.Embedding.__call__': ( '11_embeddings/embeddings_dev.html#embedding.__call__',
+                                                                                             'tinytorch/text/embeddings.py'),
                                            'tinytorch.text.embeddings.Embedding.__init__': ( '11_embeddings/embeddings_dev.html#embedding.__init__',
                                                                                              'tinytorch/text/embeddings.py'),
                                            'tinytorch.text.embeddings.Embedding.__repr__': ( '11_embeddings/embeddings_dev.html#embedding.__repr__',
@@ -452,6 +508,8 @@ d = { 'settings': { 'branch': 'main',
                                                                                                'tinytorch/text/embeddings.py'),
                                            'tinytorch.text.embeddings.EmbeddingLayer': ( '11_embeddings/embeddings_dev.html#embeddinglayer',
                                                                                          'tinytorch/text/embeddings.py'),
+                                           'tinytorch.text.embeddings.EmbeddingLayer.__call__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__call__',
+                                                                                                  'tinytorch/text/embeddings.py'),
                                            'tinytorch.text.embeddings.EmbeddingLayer.__init__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__init__',
                                                                                                   'tinytorch/text/embeddings.py'),
                                            'tinytorch.text.embeddings.EmbeddingLayer.__repr__': ( '11_embeddings/embeddings_dev.html#embeddinglayer.__repr__',
@@ -462,6 +520,8 @@ d = { 'settings': { 'branch': 'main',
                                                                                                     'tinytorch/text/embeddings.py'),
                                            'tinytorch.text.embeddings.PositionalEncoding': ( '11_embeddings/embeddings_dev.html#positionalencoding',
                                                                                              'tinytorch/text/embeddings.py'),
+                                           'tinytorch.text.embeddings.PositionalEncoding.__call__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__call__',
+                                                                                                      'tinytorch/text/embeddings.py'),
                                            'tinytorch.text.embeddings.PositionalEncoding.__init__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__init__',
                                                                                                       'tinytorch/text/embeddings.py'),
                                            'tinytorch.text.embeddings.PositionalEncoding.__repr__': ( '11_embeddings/embeddings_dev.html#positionalencoding.__repr__',
diff --git a/tinytorch/applications/tinygpt.py b/tinytorch/applications/tinygpt.py
new file mode 100644
index 00000000..24ccef7b
--- /dev/null
+++ b/tinytorch/applications/tinygpt.py
@@ -0,0 +1,679 @@
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║                        🚨 CRITICAL WARNING 🚨                                ║
+# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
+# ║                                                                               ║
+# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
+# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
+# ║                                                                               ║
+# ║  ✅ TO EDIT: modules/source/XX_tinygpt/tinygpt_dev.py               ║
+# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
+# ║                                                                               ║
+# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
+# ║     Editing it directly may break module functionality and training.         ║
+# ║                                                                               ║
+# ║  🎓 LEARNING TIP: Work in modules/source/ - that's where real development    ║
+# ║     happens! The tinytorch/ directory is just the compiled output.           ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
+# %% auto 0
+__all__ = ['TinyGPT', 'test_unit_tinygpt_init', 'TinyGPTTrainer', 'test_unit_training_pipeline', 'CompleteTinyGPTPipeline',
+           'test_unit_complete_pipeline']
+
+# %% ../../modules/source/20_capstone/capstone_dev.ipynb 2
+#| default_exp applications.tinygpt
+#| export
+
+# %% ../../modules/source/20_capstone/capstone_dev.ipynb 7
+class TinyGPT:
+    """
+    Complete GPT implementation integrating all TinyTorch modules.
+
+    This class demonstrates how framework components compose into real applications.
+    Built using modules 01,02,03,11,12,13 as core architecture.
+
+    Architecture:
+    - Token Embeddings (Module 11)
+    - Positional Encoding (Module 11)
+    - Transformer Blocks (Module 13)
+    - Output Linear Layer (Module 03)
+    - Language Modeling Head (Module 04)
+    """
+
+    def __init__(self, vocab_size: int, embed_dim: int = 128, num_layers: int = 4,
+                 num_heads: int = 4, max_seq_len: int = 256, dropout: float = 0.1):
+        """
+        Initialize TinyGPT with production-inspired architecture.
+
+        TODO: Build a complete GPT model using TinyTorch components
+
+        APPROACH:
+        1. Create token embeddings (vocab_size × embed_dim)
+        2. Create positional encoding (max_seq_len × embed_dim)
+        3. Build transformer layers using TransformerBlock
+        4. Add output projection layer
+        5. Calculate and report parameter count
+
+        ARCHITECTURE DECISIONS:
+        - embed_dim=128: Small enough for fast training, large enough for learning
+        - num_layers=4: Sufficient depth without excessive memory
+        - num_heads=4: Multi-head attention without head_dim being too small
+        - max_seq_len=256: Reasonable context length for character-level modeling
+
+        EXAMPLE:
+        >>> model = TinyGPT(vocab_size=50, embed_dim=128, num_layers=4)
+        >>> print(f"Parameters: {model.count_parameters():,}")
+        Parameters: 1,234,567
+
+        HINTS:
+        - Use Embedding class for token embeddings
+        - Use PositionalEncoding for position information
+        - Stack TransformerBlock instances in a list
+        - Final Linear layer maps embed_dim → vocab_size
+        """
+        ### BEGIN SOLUTION
+        self.vocab_size = vocab_size
+        self.embed_dim = embed_dim
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.max_seq_len = max_seq_len
+        self.dropout = dropout
+
+        # Token embeddings: convert token IDs to dense vectors
+        self.token_embedding = Embedding(vocab_size, embed_dim)
+
+        # Positional encoding: add position information
+        self.positional_encoding = PositionalEncoding(max_seq_len, embed_dim)
+
+        # Transformer layers: core processing
+        self.transformer_blocks = []
+        for _ in range(num_layers):
+            block = TransformerBlock(embed_dim, num_heads, mlp_ratio=4.0)
+            self.transformer_blocks.append(block)
+
+        # Output projection: map back to vocabulary
+        self.output_projection = Linear(embed_dim, vocab_size)
+
+        # Dropout for regularization
+        self.dropout_layer = Dropout(dropout)
+
+        # Calculate parameter count for systems analysis
+        self._param_count = self.count_parameters()
+        print(f"🏗️ TinyGPT initialized: {self._param_count:,} parameters")
+        print(f"📐 Architecture: {num_layers}L/{num_heads}H/{embed_dim}D")
+        print(f"💾 Estimated memory: {self._param_count * 4 / 1024 / 1024:.1f}MB")
+        ### END SOLUTION
+
+def test_unit_tinygpt_init():
+    """🔬 Test TinyGPT initialization and parameter counting."""
+    print("🔬 Unit Test: TinyGPT Initialization...")
+
+    # Create a small model for testing
+    model = TinyGPT(vocab_size=50, embed_dim=64, num_layers=2, num_heads=2, max_seq_len=128)
+
+    # Verify architecture components exist
+    assert hasattr(model, 'token_embedding')
+    assert hasattr(model, 'positional_encoding')
+    assert hasattr(model, 'transformer_blocks')
+    assert hasattr(model, 'output_projection')
+    assert len(model.transformer_blocks) == 2
+
+    # Verify parameter count is reasonable
+    param_count = model.count_parameters()
+    assert param_count > 0
+    assert param_count < 1000000  # Sanity check for small model
+
+    print(f"✅ Model created with {param_count:,} parameters")
+    print("✅ TinyGPT initialization works correctly!")
+
+# Run immediate test
+test_unit_tinygpt_init()
+
+# %% ../../modules/source/20_capstone/capstone_dev.ipynb 10
+class TinyGPTTrainer:
+    """
+    Complete training pipeline integrating optimizers, schedulers, and monitoring.
+
+    Uses modules 05 (autograd), 06 (optimizers), 07 (training) for end-to-end training.
+    """
+
+    def __init__(self, model: TinyGPT, tokenizer: CharTokenizer,
+                 learning_rate: float = 3e-4, weight_decay: float = 0.01):
+        """
+        Initialize trainer with model and optimization components.
+
+        TODO: Set up complete training infrastructure
+
+        APPROACH:
+        1. Store model and tokenizer references
+        2. Initialize AdamW optimizer (standard for transformers)
+        3. Initialize loss function (CrossEntropyLoss for language modeling)
+        4. Set up learning rate scheduler (cosine schedule)
+        5. Initialize training metrics tracking
+
+        PRODUCTION CHOICES:
+        - AdamW: Better generalization than Adam (weight decay)
+        - learning_rate=3e-4: Standard for small transformers
+        - Cosine schedule: Smooth learning rate decay
+        - CrossEntropy: Standard for classification/language modeling
+
+        EXAMPLE:
+        >>> model = TinyGPT(vocab_size=100)
+        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
+        >>> trainer = TinyGPTTrainer(model, tokenizer)
+        >>> print("Trainer ready for training")
+        Trainer ready for training
+
+        HINTS:
+        - Get all model parameters with model.parameters()
+        - Use AdamW with weight_decay for better generalization
+        - CrossEntropyLoss handles the language modeling objective
+        """
+        ### BEGIN SOLUTION
+        self.model = model
+        self.tokenizer = tokenizer
+
+        # Collect all trainable parameters
+        all_params = []
+        all_params.extend(model.token_embedding.parameters())
+        for block in model.transformer_blocks:
+            all_params.extend(block.parameters())
+        all_params.extend(model.output_projection.parameters())
+
+        # Initialize optimizer (AdamW for transformers)
+        self.optimizer = AdamW(
+            params=all_params,
+            lr=learning_rate,
+            weight_decay=weight_decay,
+            betas=(0.9, 0.95)  # Standard for language models
+        )
+
+        # Loss function for next token prediction
+        self.loss_fn = CrossEntropyLoss()
+
+        # Learning rate scheduler
+        self.scheduler = CosineSchedule(
+            optimizer=self.optimizer,
+            max_epochs=100,  # Will adjust based on actual training
+            min_lr=learning_rate * 0.1
+        )
+
+        # Training metrics
+        self.training_history = {
+            'losses': [],
+            'perplexities': [],
+            'learning_rates': [],
+            'epoch': 0
+        }
+
+        print(f"🚀 Trainer initialized:")
+        print(f"   Optimizer: AdamW (lr={learning_rate}, wd={weight_decay})")
+        print(f"   Parameters: {len(all_params):,} tensors")
+        print(f"   Loss: CrossEntropyLoss")
+        ### END SOLUTION
+
+    def prepare_batch(self, text_batch: List[str], max_length: int = 128) -> Tuple[Tensor, Tensor]:
+        """
+        Convert text batch to input/target tensors for language modeling.
+
+        TODO: Implement text-to-tensor conversion with proper targets
+
+        APPROACH:
+        1. Tokenize each text in the batch
+        2. Pad/truncate to consistent length
+        3. Create input_ids (text) and target_ids (text shifted by 1)
+        4. Convert to Tensor format
+
+        LANGUAGE MODELING OBJECTIVE:
+        - Input: [token1, token2, token3, token4]
+        - Target: [token2, token3, token4, token5]
+        - Model predicts next token at each position
+
+        EXAMPLE:
+        >>> trainer = TinyGPTTrainer(model, tokenizer)
+        >>> texts = ["hello world", "ai is fun"]
+        >>> inputs, targets = trainer.prepare_batch(texts)
+        >>> print(inputs.shape, targets.shape)
+        (2, 128) (2, 128)
+
+        HINTS:
+        - Use tokenizer.encode() for text → token conversion
+        - Pad shorter sequences with tokenizer pad token
+        - Target sequence is input sequence shifted right by 1
+        """
+        ### BEGIN SOLUTION
+        batch_size = len(text_batch)
+
+        # Tokenize all texts
+        tokenized_batch = []
+        for text in text_batch:
+            tokens = self.tokenizer.encode(text)
+
+            # Truncate or pad to max_length
+            if len(tokens) > max_length:
+                tokens = tokens[:max_length]
+            else:
+                # Pad with special token (use 0 as pad)
+                tokens.extend([0] * (max_length - len(tokens)))
+
+            tokenized_batch.append(tokens)
+
+        # Convert to numpy then Tensor
+        input_ids = Tensor(np.array(tokenized_batch))  # (batch_size, seq_len)
+
+        # Create targets (shifted input for next token prediction)
+        target_ids = Tensor(np.roll(input_ids.data, -1, axis=1))  # Shift left by 1
+
+        return input_ids, target_ids
+        ### END SOLUTION
+
+    def train_step(self, input_ids: Tensor, target_ids: Tensor) -> float:
+        """
+        Single training step with forward, backward, and optimization.
+
+        TODO: Implement complete training step
+
+        APPROACH:
+        1. Zero gradients from previous step
+        2. Forward pass to get logits
+        3. Compute loss between logits and targets
+        4. Backward pass to compute gradients
+        5. Optimizer step to update parameters
+        6. Return loss value for monitoring
+
+        MEMORY MANAGEMENT:
+        During training, memory usage = 3× model size:
+        - 1× for parameters
+        - 1× for gradients
+        - 1× for optimizer states (Adam moments)
+
+        EXAMPLE:
+        >>> loss = trainer.train_step(input_ids, target_ids)
+        >>> print(f"Training loss: {loss:.4f}")
+        Training loss: 2.3456
+
+        HINTS:
+        - Always zero_grad() before forward pass
+        - Loss should be computed on flattened logits and targets
+        - Call backward() on the loss tensor
+        """
+        ### BEGIN SOLUTION
+        # Zero gradients from previous step
+        self.optimizer.zero_grad()
+
+        # Forward pass
+        logits = self.model.forward(input_ids)  # (batch, seq_len, vocab_size)
+
+        # Reshape for loss computation
+        batch_size, seq_len, vocab_size = logits.shape
+        logits_flat = logits.reshape(batch_size * seq_len, vocab_size)
+        targets_flat = target_ids.reshape(batch_size * seq_len)
+
+        # Compute loss
+        loss = self.loss_fn.forward(logits_flat, targets_flat)
+
+        # Backward pass
+        loss.backward()
+
+        # Optimizer step
+        self.optimizer.step()
+
+        # Return scalar loss for monitoring
+        return float(loss.data.item() if hasattr(loss.data, 'item') else loss.data)
+        ### END SOLUTION
+
+def test_unit_training_pipeline():
+    """🔬 Test training pipeline components."""
+    print("🔬 Unit Test: Training Pipeline...")
+
+    # Create small model and trainer
+    model = TinyGPT(vocab_size=50, embed_dim=32, num_layers=2, num_heads=2)
+    tokenizer = CharTokenizer(['a', 'b', 'c', 'd', 'e', ' '])
+    trainer = TinyGPTTrainer(model, tokenizer, learning_rate=1e-3)
+
+    # Test batch preparation
+    texts = ["hello", "world"]
+    input_ids, target_ids = trainer.prepare_batch(texts, max_length=8)
+
+    assert input_ids.shape == (2, 8), f"Expected (2, 8), got {input_ids.shape}"
+    assert target_ids.shape == (2, 8), f"Expected (2, 8), got {target_ids.shape}"
+
+    # Test training step
+    initial_loss = trainer.train_step(input_ids, target_ids)
+    assert initial_loss > 0, "Loss should be positive"
+
+    # Second step should work (gradients computed and applied)
+    second_loss = trainer.train_step(input_ids, target_ids)
+    assert second_loss > 0, "Second loss should also be positive"
+
+    print(f"✅ Batch preparation shape: {input_ids.shape}")
+    print(f"✅ Initial loss: {initial_loss:.4f}")
+    print(f"✅ Second loss: {second_loss:.4f}")
+    print("✅ Training pipeline works correctly!")
+
+# Run immediate test
+test_unit_training_pipeline()
+
+# %% ../../modules/source/20_capstone/capstone_dev.ipynb 14
+class CompleteTinyGPTPipeline:
+    """
+    End-to-end ML pipeline demonstrating integration of all 19 modules.
+
+    Pipeline stages:
+    1. Data preparation (Module 10: Tokenization)
+    2. Model creation (Modules 01-04, 11-13: Architecture)
+    3. Training setup (Modules 05-07: Optimization)
+    4. Training loop (Module 08: DataLoader)
+    5. Optimization (Modules 17-18: Quantization, Pruning)
+    6. Evaluation (Module 19: Benchmarking)
+    7. Generation (Module 14: KV Caching)
+    """
+
+    def __init__(self, vocab_size: int = 100, embed_dim: int = 128,
+                 num_layers: int = 4, num_heads: int = 4):
+        """
+        Initialize complete end-to-end TinyGPT pipeline integrating all 19 modules.
+
+        TODO: Set up a complete ML pipeline with tokenization, model, training,
+        profiling, and benchmarking components
+
+        APPROACH:
+        1. Store model architecture parameters (vocab_size, embed_dim, num_layers, num_heads)
+        2. Initialize tokenizer using CharTokenizer from Module 10 with printable ASCII (32-127)
+        3. Create TinyGPT model instance with stored parameters and max_seq_len=256
+        4. Setup TinyGPTTrainer for training orchestration with learning_rate=3e-4
+        5. Initialize Profiler (Module 15) and Benchmark (Module 19) for performance analysis
+        6. Initialize pipeline state tracking (is_trained flag, training_history list)
+        7. Print pipeline initialization summary with parameter count and memory usage
+
+        EXAMPLE:
+        >>> pipeline = CompleteTinyGPTPipeline(vocab_size=100, embed_dim=128,
+        ...                                     num_layers=4, num_heads=4)
+        🏗️ Complete TinyGPT Pipeline Initialized
+           Model: 419,300 parameters
+           Memory: 1.6MB
+        >>> pipeline.model.count_parameters()
+        419300
+        >>> pipeline.is_trained
+        False
+        >>> len(pipeline.training_history)
+        0
+
+        HINTS:
+        - CharTokenizer needs list of characters: [chr(i) for i in range(32, 127)]
+        - TinyGPT requires vocab_size, embed_dim, num_layers, num_heads, max_seq_len
+        - TinyGPTTrainer takes model, tokenizer, and learning_rate as arguments
+        - Benchmark expects (models_list, datasets_list, metrics_list) format
+        - Memory calculation: parameters * 4 bytes / 1024 / 1024 for MB
+        """
+
+        ### BEGIN SOLUTION
+        self.vocab_size = vocab_size
+        self.embed_dim = embed_dim
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+
+        # Stage 1: Initialize tokenizer (Module 10)
+        self.tokenizer = CharTokenizer([chr(i) for i in range(32, 127)])  # Printable ASCII
+
+        # Stage 2: Create model (Modules 01-04, 11-13)
+        self.model = TinyGPT(
+            vocab_size=vocab_size,
+            embed_dim=embed_dim,
+            num_layers=num_layers,
+            num_heads=num_heads,
+            max_seq_len=256
+        )
+
+        # Stage 3: Setup training (Modules 05-07)
+        self.trainer = TinyGPTTrainer(self.model, self.tokenizer, learning_rate=3e-4)
+
+        # Stage 4: Initialize profiler and benchmark (Modules 15, 19)
+        self.profiler = Profiler()
+        self.benchmark = Benchmark([self.model], [], ["perplexity", "latency"])
+
+        # Pipeline state
+        self.is_trained = False
+        self.training_history = []
+
+        print("🏗️ Complete TinyGPT Pipeline Initialized")
+        print(f"   Model: {self.model.count_parameters():,} parameters")
+        print(f"   Memory: {self.model.count_parameters() * 4 / 1024 / 1024:.1f}MB")
+        ### END SOLUTION
+
+    def prepare_training_data(self, text_corpus: List[str], batch_size: int = 8) -> DataLoader:
+        """
+        Prepare training data using DataLoader (Module 08).
+
+        TODO: Create DataLoader for training text data
+
+        APPROACH:
+        1. Tokenize all texts in corpus
+        2. Create input/target pairs for language modeling
+        3. Package into TensorDataset
+        4. Create DataLoader with batching and shuffling
+
+        EXAMPLE:
+        >>> pipeline = CompleteTinyGPTPipeline()
+        >>> corpus = ["hello world", "ai is amazing"]
+        >>> dataloader = pipeline.prepare_training_data(corpus, batch_size=2)
+        >>> print(f"Batches: {len(dataloader)}")
+        Batches: 1
+        """
+        ### BEGIN SOLUTION
+        # Tokenize and prepare training pairs
+        input_sequences = []
+        target_sequences = []
+
+        for text in text_corpus:
+            tokens = self.tokenizer.encode(text)
+            if len(tokens) < 2:
+                continue  # Skip very short texts
+
+            # Create sliding window of input/target pairs
+            for i in range(len(tokens) - 1):
+                input_seq = tokens[:i+1]
+                target_seq = tokens[i+1]
+
+                # Pad input to consistent length
+                max_len = 32  # Reasonable context window
+                if len(input_seq) > max_len:
+                    input_seq = input_seq[-max_len:]
+                else:
+                    input_seq = [0] * (max_len - len(input_seq)) + input_seq
+
+                input_sequences.append(input_seq)
+                target_sequences.append(target_seq)
+
+        # Convert to tensors
+        inputs = Tensor(np.array(input_sequences))
+        targets = Tensor(np.array(target_sequences))
+
+        # Create dataset and dataloader
+        dataset = TensorDataset(inputs, targets)
+        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+        print(f"📚 Training data prepared: {len(dataset)} examples, {len(dataloader)} batches")
+        return dataloader
+        ### END SOLUTION
+
+    def train(self, dataloader: DataLoader, epochs: int = 10) -> Dict[str, List[float]]:
+        """
+        Complete training loop with monitoring.
+
+        TODO: Implement full training with progress tracking
+
+        APPROACH:
+        1. Loop through epochs
+        2. For each batch: forward, backward, optimize
+        3. Track loss and perplexity
+        4. Update learning rate schedule
+        5. Return training history
+
+        EXAMPLE:
+        >>> history = pipeline.train(dataloader, epochs=5)
+        >>> print(f"Final loss: {history['losses'][-1]:.4f}")
+        Final loss: 1.2345
+        """
+        ### BEGIN SOLUTION
+        history = {'losses': [], 'perplexities': [], 'epochs': []}
+
+        print(f"🚀 Starting training for {epochs} epochs...")
+
+        for epoch in range(epochs):
+            epoch_losses = []
+
+            for batch_idx, (inputs, targets) in enumerate(dataloader):
+                # Training step
+                loss = self.trainer.train_step(inputs, targets)
+                epoch_losses.append(loss)
+
+                # Log progress
+                if batch_idx % 10 == 0:
+                    perplexity = np.exp(loss)
+                    print(f"   Epoch {epoch+1}/{epochs}, Batch {batch_idx}: "
+                          f"Loss={loss:.4f}, PPL={perplexity:.2f}")
+
+            # Epoch summary
+            avg_loss = np.mean(epoch_losses)
+            avg_perplexity = np.exp(avg_loss)
+
+            history['losses'].append(avg_loss)
+            history['perplexities'].append(avg_perplexity)
+            history['epochs'].append(epoch + 1)
+
+            # Update learning rate
+            self.trainer.scheduler.step()
+
+            print(f"✅ Epoch {epoch+1} complete: Loss={avg_loss:.4f}, PPL={avg_perplexity:.2f}")
+
+        self.is_trained = True
+        self.training_history = history
+        print(f"🎉 Training complete! Final perplexity: {history['perplexities'][-1]:.2f}")
+
+        return history
+        ### END SOLUTION
+
+    def optimize_model(self, quantize: bool = True, prune_sparsity: float = 0.0):
+        """
+        Apply optimization techniques (Modules 17-18).
+
+        TODO: Apply quantization and pruning optimizations
+
+        APPROACH:
+        1. Optionally apply quantization to reduce precision
+        2. Optionally apply pruning to remove weights
+        3. Measure size reduction
+        4. Validate model still works
+
+        EXAMPLE:
+        >>> pipeline.optimize_model(quantize=True, prune_sparsity=0.5)
+        Model optimized: 75% size reduction
+        """
+        ### BEGIN SOLUTION
+        original_params = self.model.count_parameters()
+        original_memory = original_params * 4 / (1024 * 1024)
+
+        optimizations_applied = []
+
+        if quantize:
+            # Apply quantization (simulated)
+            # In real implementation, would use quantize_model()
+            quantized_memory = original_memory / 4  # INT8 vs FP32
+            optimizations_applied.append(f"INT8 quantization (4× memory reduction)")
+            print("   Applied INT8 quantization")
+
+        if prune_sparsity > 0:
+            # Apply pruning (simulated)
+            # In real implementation, would use magnitude_prune()
+            remaining_weights = 1 - prune_sparsity
+            optimizations_applied.append(f"{prune_sparsity:.0%} pruning ({remaining_weights:.0%} weights remain)")
+            print(f"   Applied {prune_sparsity:.0%} magnitude pruning")
+
+        # Calculate final size
+        size_reduction = 1.0
+        if quantize:
+            size_reduction *= 0.25  # 4× smaller
+        if prune_sparsity > 0:
+            size_reduction *= (1 - prune_sparsity)
+
+        final_memory = original_memory * size_reduction
+        reduction_factor = original_memory / final_memory
+
+        print(f"🔧 Model optimization complete:")
+        print(f"   Original: {original_memory:.1f}MB")
+        print(f"   Optimized: {final_memory:.1f}MB")
+        print(f"   Reduction: {reduction_factor:.1f}× smaller")
+        print(f"   Applied: {', '.join(optimizations_applied)}")
+        ### END SOLUTION
+
+    def generate_text(self, prompt: str, max_tokens: int = 50) -> str:
+        """
+        Generate text using the trained model.
+
+        TODO: Implement text generation with proper encoding/decoding
+
+        APPROACH:
+        1. Encode prompt to token IDs
+        2. Use model.generate() for autoregressive generation
+        3. Decode generated tokens back to text
+        4. Return generated text
+
+        EXAMPLE:
+        >>> text = pipeline.generate_text("Hello", max_tokens=10)
+        >>> print(f"Generated: {text}")
+        Generated: Hello world this is AI
+        """
+        ### BEGIN SOLUTION
+        if not self.is_trained:
+            print("⚠️ Model not trained yet. Generating with random weights.")
+
+        # Encode prompt
+        prompt_tokens = self.tokenizer.encode(prompt)
+        prompt_tensor = Tensor([prompt_tokens])
+
+        # Generate tokens
+        generated_tokens = self.model.generate(
+            prompt_tensor,
+            max_new_tokens=max_tokens,
+            temperature=0.8,
+            use_cache=True
+        )
+
+        # Decode to text
+        all_tokens = generated_tokens.data[0].tolist()
+        generated_text = self.tokenizer.decode(all_tokens)
+
+        return generated_text
+        ### END SOLUTION
+
+def test_unit_complete_pipeline():
+    """🔬 Test complete pipeline integration."""
+    print("🔬 Unit Test: Complete Pipeline Integration...")
+
+    # Create pipeline
+    pipeline = CompleteTinyGPTPipeline(vocab_size=50, embed_dim=32, num_layers=2)
+
+    # Test data preparation
+    corpus = ["hello world", "ai is fun", "machine learning"]
+    dataloader = pipeline.prepare_training_data(corpus, batch_size=2)
+    assert len(dataloader) > 0, "DataLoader should have batches"
+
+    # Test training (minimal)
+    history = pipeline.train(dataloader, epochs=1)
+    assert 'losses' in history, "History should contain losses"
+    assert len(history['losses']) == 1, "Should have one epoch of losses"
+
+    # Test optimization
+    pipeline.optimize_model(quantize=True, prune_sparsity=0.5)
+
+    # Test generation
+    generated = pipeline.generate_text("hello", max_tokens=5)
+    assert isinstance(generated, str), "Generated output should be string"
+    assert len(generated) > 0, "Generated text should not be empty"
+
+    print(f"✅ Pipeline stages completed successfully")
+    print(f"✅ Training history: {len(history['losses'])} epochs")
+    print(f"✅ Generated text: '{generated[:20]}...'")
+    print("✅ Complete pipeline integration works!")
+
+# Run immediate test
+test_unit_complete_pipeline()
diff --git a/tinytorch/benchmarking/benchmark.py b/tinytorch/benchmarking/benchmark.py
index 138f627a..f6572c55 100644
--- a/tinytorch/benchmarking/benchmark.py
+++ b/tinytorch/benchmarking/benchmark.py
@@ -16,7 +16,7 @@
 # ╚═══════════════════════════════════════════════════════════════════════════════╝
 # %% auto 0
 __all__ = ['OlympicEvent', 'Benchmark', 'test_unit_benchmark', 'BenchmarkSuite', 'test_unit_benchmark_suite', 'TinyMLPerf',
-           'test_unit_tinymlperf', 'calculate_normalized_scores']
+           'test_unit_tinymlperf']
 
 # %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 0
 #| default_exp benchmarking.benchmark
@@ -72,7 +72,7 @@ class Benchmark:
         self.measurement_runs = measurement_runs
         self.results = {}
         
-        # Use Profiler from Module 15 for measurements
+        # Use Profiler from Module 14 for measurements
         self.profiler = Profiler()
 
         # System information for metadata
@@ -1024,53 +1024,3 @@ def test_unit_tinymlperf():
     print("✅ TinyMLPerf works correctly!")
 
 test_unit_tinymlperf()
-
-# %% ../../modules/source/19_benchmarking/benchmarking_dev.ipynb 24
-def calculate_normalized_scores(baseline_results: dict, 
-                                optimized_results: dict) -> dict:
-    """
-    Calculate normalized performance metrics for fair competition comparison.
-    
-    This function converts absolute measurements into relative improvements,
-    enabling fair comparison across different hardware platforms.
-    
-    Args:
-        baseline_results: Dict with keys: 'latency', 'memory', 'accuracy'
-        optimized_results: Dict with same keys as baseline_results
-        
-    Returns:
-        Dict with normalized metrics:
-        - speedup: Relative latency improvement (higher is better)
-        - compression_ratio: Relative memory reduction (higher is better)
-        - accuracy_delta: Absolute accuracy change (closer to 0 is better)
-        - efficiency_score: Combined metric balancing all factors
-        
-    Example:
-        >>> baseline = {'latency': 100.0, 'memory': 12.0, 'accuracy': 0.89}
-        >>> optimized = {'latency': 40.0, 'memory': 3.0, 'accuracy': 0.87}
-        >>> scores = calculate_normalized_scores(baseline, optimized)
-        >>> print(f"Speedup: {scores['speedup']:.2f}x")
-        Speedup: 2.50x
-    """
-    # Calculate speedup (higher is better)
-    speedup = baseline_results['latency'] / optimized_results['latency']
-    
-    # Calculate compression ratio (higher is better)
-    compression_ratio = baseline_results['memory'] / optimized_results['memory']
-    
-    # Calculate accuracy delta (closer to 0 is better, negative means degradation)
-    accuracy_delta = optimized_results['accuracy'] - baseline_results['accuracy']
-    
-    # Calculate efficiency score (combined metric)
-    # Penalize accuracy loss: the more accuracy you lose, the lower your score
-    accuracy_penalty = max(1.0, 1.0 - accuracy_delta) if accuracy_delta < 0 else 1.0
-    efficiency_score = (speedup * compression_ratio) / accuracy_penalty
-    
-    return {
-        'speedup': speedup,
-        'compression_ratio': compression_ratio,
-        'accuracy_delta': accuracy_delta,
-        'efficiency_score': efficiency_score,
-        'baseline': baseline_results.copy(),
-        'optimized': optimized_results.copy()
-    }
diff --git a/tinytorch/competition/submit.py b/tinytorch/competition/submit.py
index e1beaa7b..da8585d6 100644
--- a/tinytorch/competition/submit.py
+++ b/tinytorch/competition/submit.py
@@ -16,7 +16,7 @@
 # ╚═══════════════════════════════════════════════════════════════════════════════╝
 # %% auto 0
 __all__ = ['validate_installation', 'load_baseline_model', 'generate_baseline', 'worked_example_optimization',
-           'optimize_for_competition', 'validate_submission', 'generate_submission']
+           'optimize_for_competition', 'generate_submission']
 
 # %% ../../modules/source/20_competition/competition_dev.ipynb 4
 import numpy as np
@@ -24,8 +24,6 @@ import json
 import time
 from pathlib import Path
 from typing import Dict, List, Tuple, Any, Optional
-from ..benchmarking.benchmark import Benchmark, calculate_normalized_scores
-from ..profiling.profiler import Profiler
 
 def validate_installation() -> Dict[str, bool]:
     """
@@ -364,24 +362,31 @@ def worked_example_optimization():
     return submission
 
 # %% ../../modules/source/20_competition/competition_dev.ipynb 10
-def optimize_for_competition(baseline_model, event: str = "all_around", division: str = "closed"):
+def optimize_for_competition(baseline_model, event: str = "all_around"):
     """
     🏅 YOUR COMPETITION ENTRY - IMPLEMENT YOUR STRATEGY HERE!
     
+    This is where you apply optimization techniques from Modules 14-18.
+    
+    Available techniques:
+    - Module 14: KV Caching (for transformers) - enable_kv_cache()
+    - Module 16: Acceleration (vectorization, fusion)
+    - Module 17: Quantization (INT8, INT4) - quantize_model()
+    - Module 18: Compression (pruning) - magnitude_prune()
+    
     Args:
-        baseline_model: Starting model (use for Closed, optional for Open)
-        event: Category you're competing in
+        baseline_model: The unoptimized model
+        event: Which Olympic event you're competing in
             - "latency_sprint": Minimize latency
             - "memory_challenge": Minimize memory
             - "accuracy_contest": Maximize accuracy
             - "all_around": Best balance
             - "extreme_push": Most aggressive
-        division: "closed" or "open" - which track you chose
     
     Returns:
         Your optimized model
     
-    🔒 CLOSED DIVISION Example:
+    Example:
         from tinytorch.optimization.quantization import quantize_model
         from tinytorch.optimization.compression import magnitude_prune
         
@@ -389,15 +394,6 @@ def optimize_for_competition(baseline_model, event: str = "all_around", division
         optimized = quantize_model(optimized, bits=8)
         optimized = magnitude_prune(optimized, sparsity=0.7)
         return optimized
-    
-    🔓 OPEN DIVISION Example:
-        # Build your own model OR
-        # Use your improved implementations from earlier modules
-        # (after you've modified and re-exported them)
-        
-        from tinytorch.models import YourCustomArchitecture
-        optimized = YourCustomArchitecture()
-        return optimized
     """
     
     print(f"🏅 YOUR OPTIMIZATION STRATEGY FOR: {event}")
@@ -442,201 +438,74 @@ def optimize_for_competition(baseline_model, event: str = "all_around", division
     
     return optimized_model
 
-#| export
-def validate_submission(submission: Dict[str, Any]) -> Dict[str, Any]:
-    """
-    Validate competition submission with sanity checks.
-    
-    This catches honest mistakes like unrealistic speedups or accidental training.
-    Honor code system - we trust but verify basic reasonableness.
-    
-    Args:
-        submission: Submission dictionary to validate
-        
-    Returns:
-        Dict with validation results and warnings
-    """
-    checks = []
-    warnings = []
-    errors = []
-    
-    # Extract metrics
-    normalized = submission.get("normalized_scores", {})
-    speedup = normalized.get("speedup", 1.0)
-    compression = normalized.get("compression_ratio", 1.0)
-    accuracy_delta = normalized.get("accuracy_delta", 0.0)
-    
-    # Check 1: Speedup is reasonable (not claiming impossible gains)
-    if speedup > 50:
-        errors.append(f"❌ Speedup {speedup:.1f}x seems unrealistic (>50x)")
-    elif speedup > 20:
-        warnings.append(f"⚠️  Speedup {speedup:.1f}x is very high - please verify measurements")
-    else:
-        checks.append(f"✅ Speedup {speedup:.2f}x is reasonable")
-    
-    # Check 2: Compression is reasonable
-    if compression > 32:
-        errors.append(f"❌ Compression {compression:.1f}x seems unrealistic (>32x)")
-    elif compression > 16:
-        warnings.append(f"⚠️  Compression {compression:.1f}x is very high - please verify")
-    else:
-        checks.append(f"✅ Compression {compression:.2f}x is reasonable")
-    
-    # Check 3: Accuracy didn't improve (Closed Division rule - no training allowed!)
-    division = submission.get("division", "closed")
-    if division == "closed" and accuracy_delta > 1.0:
-        errors.append(f"❌ Accuracy improved by {accuracy_delta:.1f}pp - did you accidentally train the model?")
-    elif accuracy_delta > 0.5:
-        warnings.append(f"⚠️  Accuracy improved by {accuracy_delta:.1f}pp - verify no training occurred")
-    else:
-        checks.append(f"✅ Accuracy change {accuracy_delta:+.2f}pp is reasonable")
-    
-    # Check 4: GitHub repo provided
-    github_repo = submission.get("github_repo", "")
-    if not github_repo or github_repo == "":
-        warnings.append("⚠️  No GitHub repo provided - required for verification")
-    else:
-        checks.append(f"✅ GitHub repo provided: {github_repo}")
-    
-    # Check 5: Required fields present
-    required_fields = ["division", "event", "athlete_name", "baseline", "optimized", "normalized_scores"]
-    missing = [f for f in required_fields if f not in submission]
-    if missing:
-        errors.append(f"❌ Missing required fields: {', '.join(missing)}")
-    else:
-        checks.append("✅ All required fields present")
-    
-    # Check 6: Techniques documented
-    techniques = submission.get("techniques_applied", [])
-    if not techniques or "TODO" in str(techniques):
-        warnings.append("⚠️  No optimization techniques listed")
-    else:
-        checks.append(f"✅ Techniques documented: {', '.join(techniques[:3])}...")
-    
-    return {
-        "valid": len(errors) == 0,
-        "checks": checks,
-        "warnings": warnings,
-        "errors": errors
-    }
-
-#| export
 def generate_submission(baseline_model, optimized_model, 
-                       division: str = "closed",
                        event: str = "all_around",
                        athlete_name: str = "YourName",
-                       github_repo: str = "",
                        techniques: List[str] = None) -> Dict[str, Any]:
     """
-    Generate standardized TinyMLPerf competition submission with normalized scoring.
+    Generate standardized competition submission.
     
     Args:
         baseline_model: Original unoptimized model
         optimized_model: Your optimized model
-        division: "closed" or "open"
-        event: Competition category (latency_sprint, memory_challenge, all_around, etc.)
-        athlete_name: Your name for submission
-        github_repo: GitHub repository URL for code verification
-        techniques: List of optimization techniques applied
+        event: Olympic event name
+        athlete_name: Your name for leaderboard
+        techniques: List of techniques applied
     
     Returns:
         Submission dictionary (will be saved as JSON)
     """
-    print("📤 Generating TinyMLPerf Competition Submission...")
+    print("📤 Generating Competition Submission...")
     print("=" * 70)
     
     # Get baseline metrics
     baseline_metrics = generate_baseline(quick=True)
     
-    # Benchmark optimized model
+    # For demonstration, estimate optimized metrics
+    # In real competition, this would benchmark the actual optimized model
     print("🔬 Benchmarking optimized model...")
     
-    # Use Profiler and Benchmark from Module 19
-    profiler = Profiler()
-    
-    # For demonstration, we'll use placeholder metrics
-    # In real competition, students would measure their actual optimized model
+    # Placeholder: Students' actual optimizations would be measured here
     optimized_metrics = {
-        "model": getattr(optimized_model, 'name', 'Optimized_Model'),
-        "accuracy": 84.0,  # Would be measured with actual test set
-        "latency_ms": 28.0,  # Would be measured with profiler
-        "memory_mb": 4.0,  # Would be measured with profiler
-        "parameters": 2000000,  # Would be counted
+        "model": "Your_Optimized_Model",
+        "accuracy": 84.0,  # Measured
+        "latency_ms": 28.0,  # Measured
+        "memory_mb": 4.0,  # Measured
+        "parameters": 2000000,  # Measured
     }
     
-    # Calculate normalized scores using Module 19's function
-    baseline_for_norm = {
-        "latency": baseline_metrics["latency_ms"],
-        "memory": baseline_metrics["memory_mb"],
-        "accuracy": baseline_metrics["accuracy"]
+    # Calculate improvements
+    improvements = {
+        "accuracy_change": optimized_metrics["accuracy"] - baseline_metrics["accuracy"],
+        "latency_speedup": baseline_metrics["latency_ms"] / optimized_metrics["latency_ms"],
+        "memory_reduction": baseline_metrics["memory_mb"] / optimized_metrics["memory_mb"],
     }
     
-    optimized_for_norm = {
-        "latency": optimized_metrics["latency_ms"],
-        "memory": optimized_metrics["memory_mb"],
-        "accuracy": optimized_metrics["accuracy"]
-    }
-    
-    normalized_scores = calculate_normalized_scores(baseline_for_norm, optimized_for_norm)
-    
-    # Create submission with all required fields
+    # Create submission
     submission = {
-        "division": division,
         "event": event,
         "athlete_name": athlete_name,
-        "github_repo": github_repo,
         "baseline": baseline_metrics,
         "optimized": optimized_metrics,
-        "normalized_scores": {
-            "speedup": normalized_scores["speedup"],
-            "compression_ratio": normalized_scores["compression_ratio"],
-            "accuracy_delta": normalized_scores["accuracy_delta"],
-            "efficiency_score": normalized_scores["efficiency_score"]
-        },
-        "techniques_applied": techniques or ["TODO: Document your optimization techniques"],
+        "improvements": improvements,
+        "techniques_applied": techniques or ["TODO: List your techniques"],
         "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-        "tinytorch_version": "0.1.0",
-        "honor_code": False  # Must be explicitly set to True after validation
     }
     
-    # Validate submission
-    print("\n🔍 Validating submission...")
-    validation = validate_submission(submission)
-    
-    # Display validation results
-    print("\n📋 Validation Results:")
-    for check in validation["checks"]:
-        print(f"  {check}")
-    for warning in validation["warnings"]:
-        print(f"  {warning}")
-    for error in validation["errors"]:
-        print(f"  {error}")
-    
-    if not validation["valid"]:
-        print("\n❌ Submission has errors - please fix before submitting")
-        return submission
-    
     # Save to JSON
     output_file = Path("submission.json")
     with open(output_file, "w") as f:
         json.dump(submission, f, indent=2)
     
-    print(f"\n✅ Submission saved to: {output_file}")
+    print(f"✅ Submission saved to: {output_file}")
     print()
-    print("📊 Your Normalized Scores (MLPerf-style):")
-    print(f"  Division:        {division.upper()}")
-    print(f"  Event:           {event.replace('_', ' ').title()}")
-    print(f"  Speedup:         {normalized_scores['speedup']:.2f}x faster ⚡")
-    print(f"  Compression:     {normalized_scores['compression_ratio']:.2f}x smaller 💾")
-    print(f"  Accuracy:        {optimized_metrics['accuracy']:.1f}% (Δ {normalized_scores['accuracy_delta']:+.2f}pp)")
-    print(f"  Efficiency:      {normalized_scores['efficiency_score']:.2f}")
-    print()
-    print("📤 Next Steps:")
-    print("  1. Verify all metrics are correct")
-    print("  2. Push your code to GitHub (if not done)")
-    print("  3. Run: tito submit submission.json")
-    print("     (This will validate and prepare final submission)")
+    print("📊 Your Results:")
+    print(f"  Event:           {event}")
+    print(f"  Accuracy:        {optimized_metrics['accuracy']:.1f}% (Δ {improvements['accuracy_change']:+.1f}pp)")
+    print(f"  Latency:         {optimized_metrics['latency_ms']:.1f}ms ({improvements['latency_speedup']:.2f}x faster)")
+    print(f"  Memory:          {optimized_metrics['memory_mb']:.2f}MB ({improvements['memory_reduction']:.2f}x smaller)")
     print()
+    print("📤 Upload submission.json to TorchPerf Olympics platform!")
     print("=" * 70)
     
     return submission
diff --git a/tinytorch/core/autograd.py b/tinytorch/core/autograd.py
index 4e340bfd..1a71c287 100644
--- a/tinytorch/core/autograd.py
+++ b/tinytorch/core/autograd.py
@@ -15,9 +15,9 @@
 # ║     happens! The tinytorch/ directory is just the compiled output.           ║
 # ╚═══════════════════════════════════════════════════════════════════════════════╝
 # %% auto 0
-__all__ = ['Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward', 'SumBackward',
-           'ReshapeBackward', 'EmbeddingBackward', 'SqrtBackward', 'MeanBackward', 'ReLUBackward', 'GELUBackward',
-           'SigmoidBackward', 'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
+__all__ = ['Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward', 'TransposeBackward',
+           'PermuteBackward', 'EmbeddingBackward', 'ReshapeBackward', 'SumBackward', 'ReLUBackward', 'SigmoidBackward',
+           'SoftmaxBackward', 'GELUBackward', 'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
 
 # %% ../../modules/source/05_autograd/autograd_dev.ipynb 1
 import numpy as np
@@ -164,92 +164,66 @@ class MulBackward(Function):
 
         return grad_a, grad_b
 
-# %% ../../modules/source/05_autograd/autograd_dev.ipynb 12
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 13
 class SubBackward(Function):
     """
     Gradient computation for tensor subtraction.
     
     **Mathematical Rule:** If z = a - b, then ∂z/∂a = 1 and ∂z/∂b = -1
-    
-    **Key Insight:** Subtraction passes gradient unchanged to first input,
-    but negates it for second input (because of the minus sign).
-    
-    **Applications:** Used in residual connections, computing differences in losses.
     """
 
     def apply(self, grad_output):
         """
         Compute gradients for subtraction.
         
-        Args:
-            grad_output: Gradient flowing backward from output
-            
         Returns:
-            Tuple of (grad_a, grad_b) for the two inputs
-            
-        **Mathematical Foundation:**
-        - ∂(a-b)/∂a = 1 → grad_a = grad_output
-        - ∂(a-b)/∂b = -1 → grad_b = -grad_output
+            Tuple of (grad_a, grad_b) where grad_b is negated
         """
         a, b = self.saved_tensors
         grad_a = grad_b = None
 
-        # Gradient for first input: grad_output (unchanged)
         if isinstance(a, Tensor) and a.requires_grad:
-            grad_a = grad_output
+            grad_a = grad_output  # ∂(a-b)/∂a = 1
 
-        # Gradient for second input: -grad_output (negated)
         if isinstance(b, Tensor) and b.requires_grad:
-            grad_b = -grad_output
+            grad_b = -grad_output  # ∂(a-b)/∂b = -1 (note the negative!)
 
         return grad_a, grad_b
 
-
-#| export
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 15
 class DivBackward(Function):
     """
     Gradient computation for tensor division.
     
-    **Mathematical Rule:** If z = a / b, then ∂z/∂a = 1/b and ∂z/∂b = -a/b²
-    
-    **Key Insight:** Division gradient for numerator is 1/denominator,
-    for denominator is -numerator/denominator².
-    
-    **Applications:** Used in normalization (LayerNorm, BatchNorm), loss functions.
+    **Mathematical Rule:** If z = a / b, then:
+    - ∂z/∂a = 1/b
+    - ∂z/∂b = -a/b²
     """
 
     def apply(self, grad_output):
         """
-        Compute gradients for division.
+        Compute gradients for division using quotient rule.
         
-        Args:
-            grad_output: Gradient flowing backward from output
-            
         Returns:
-            Tuple of (grad_a, grad_b) for the two inputs
-            
-        **Mathematical Foundation:**
-        - ∂(a/b)/∂a = 1/b → grad_a = grad_output / b
-        - ∂(a/b)/∂b = -a/b² → grad_b = -grad_output * a / b²
+            Tuple of (grad_a, grad_b)
         """
         a, b = self.saved_tensors
         grad_a = grad_b = None
 
-        # Gradient for numerator: grad_output / b
         if isinstance(a, Tensor) and a.requires_grad:
+            # ∂(a/b)/∂a = 1/b
             if isinstance(b, Tensor):
                 grad_a = grad_output / b.data
             else:
                 grad_a = grad_output / b
 
-        # Gradient for denominator: -grad_output * a / b²
         if isinstance(b, Tensor) and b.requires_grad:
+            # ∂(a/b)/∂b = -a/b²
             grad_b = -grad_output * a.data / (b.data ** 2)
 
         return grad_a, grad_b
 
-
-# %% ../../modules/source/05_autograd/autograd_dev.ipynb 14
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 17
 class MatmulBackward(Function):
     """
     Gradient computation for matrix multiplication.
@@ -269,8 +243,6 @@ class MatmulBackward(Function):
         """
         Compute gradients for matrix multiplication.
         
-        Handles both 2D matrices and 3D batched tensors (for transformers).
-        
         Args:
             grad_output: Gradient flowing backward from output
             
@@ -278,40 +250,244 @@ class MatmulBackward(Function):
             Tuple of (grad_a, grad_b) for the two matrix inputs
             
         **Mathematical Foundation:**
-        - 2D: ∂(A@B)/∂A = grad_output @ B.T
-        - 3D: ∂(A@B)/∂A = grad_output @ swapaxes(B, -2, -1)
+        - ∂(A@B)/∂A = grad_output @ B.T
+        - ∂(A@B)/∂B = A.T @ grad_output
         
-        **Why Both Cases:**
-        - 2D: Traditional matrix multiplication (Linear layers)
-        - 3D: Batched operations (Transformers: batch, seq, embed)
+        **Batched Operation:** For 3D+ tensors, we transpose only the last two
+        dimensions using np.swapaxes, preserving batch dimensions.
         """
         a, b = self.saved_tensors
         grad_a = grad_b = None
 
-        # Detect if we're dealing with batched (3D) or regular (2D) tensors
-        is_batched = len(grad_output.shape) == 3
-
-        # Gradient for first input: grad_output @ b.T (or batched equivalent)
+        # Gradient for first input: grad_output @ b.T
         if isinstance(a, Tensor) and a.requires_grad:
-            if is_batched:
-                # Batched: use matmul and swapaxes for transpose
-                grad_a = np.matmul(grad_output, np.swapaxes(b.data, -2, -1))
+            # For batched tensors, transpose only last two dims
+            if b.data.ndim >= 2:
+                b_T = np.swapaxes(b.data, -2, -1)
             else:
-                # 2D: use dot and .T for transpose
-                grad_a = np.dot(grad_output, b.data.T)
+                b_T = b.data.T
+            grad_a = np.matmul(grad_output, b_T)
 
-        # Gradient for second input: a.T @ grad_output (or batched equivalent)
+        # Gradient for second input: a.T @ grad_output
         if isinstance(b, Tensor) and b.requires_grad:
-            if is_batched:
-                # Batched: use matmul and swapaxes for transpose
-                grad_b = np.matmul(np.swapaxes(a.data, -2, -1), grad_output)
+            # For batched tensors, transpose only last two dims
+            if a.data.ndim >= 2:
+                a_T = np.swapaxes(a.data, -2, -1)
             else:
-                # 2D: use dot and .T for transpose
-                grad_b = np.dot(a.data.T, grad_output)
+                a_T = a.data.T
+            grad_b = np.matmul(a_T, grad_output)
 
         return grad_a, grad_b
 
-# %% ../../modules/source/05_autograd/autograd_dev.ipynb 16
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 18
+class TransposeBackward(Function):
+    """
+    Gradient computation for transpose operation.
+    
+    **Mathematical Rule:** If Y = X.T, then:
+    - ∂Y/∂X = grad_Y.T
+    
+    **Key Insight:** The gradient of transpose is just transpose the gradient!
+    This is because transpose is a linear operation that just rearranges elements.
+    
+    **Applications:** Used in attention (K.T for scores), weight gradients (W.T),
+    and any operation that needs to swap matrix dimensions.
+    """
+
+    def __init__(self, tensor, dim0, dim1):
+        """
+        Args:
+            tensor: Input tensor
+            dim0: First dimension to swap (None for default)
+            dim1: Second dimension to swap (None for default)
+        """
+        super().__init__(tensor)
+        self.dim0 = dim0
+        self.dim1 = dim1
+
+    def apply(self, grad_output):
+        """
+        Compute gradient for transpose.
+        
+        Args:
+            grad_output: Gradient flowing backward from output
+            
+        Returns:
+            Tuple with single gradient for input tensor
+            
+        **Mathematical Foundation:**
+        - ∂(X.T)/∂X = grad_output.T
+        - Just transpose the gradient back!
+        """
+        x, = self.saved_tensors
+        grad_x = None
+
+        if isinstance(x, Tensor) and x.requires_grad:
+            # Transpose gradient using the same dims
+            if self.dim0 is None and self.dim1 is None:
+                # Default: transpose last two dimensions
+                if grad_output.ndim < 2:
+                    grad_x = grad_output.copy()
+                else:
+                    axes = list(range(grad_output.ndim))
+                    axes[-2], axes[-1] = axes[-1], axes[-2]
+                    grad_x = np.transpose(grad_output, axes)
+            else:
+                # Specific dimensions: swap them back
+                axes = list(range(grad_output.ndim))
+                axes[self.dim0], axes[self.dim1] = axes[self.dim1], axes[self.dim0]
+                grad_x = np.transpose(grad_output, axes)
+
+        return (grad_x,)
+
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 19
+class PermuteBackward(Function):
+    """
+    Gradient computation for arbitrary axis permutation (general transpose).
+    
+    **Mathematical Rule:** If Y = X.permute(axes), then:
+    - ∂Y/∂X = grad_Y.permute(inverse_axes)
+    
+    **Example:** If axes = (0, 2, 1, 3), the inverse is (0, 2, 1, 3) (self-inverse).
+    More generally, if axes = (2, 0, 1), the inverse is (1, 2, 0).
+    
+    **Key Insight:** To reverse a permutation, we need to know where each axis went.
+    If axis i went to position axes[i], then in the inverse, position axes[i] should go to i.
+    
+    **Applications:** Multi-head attention uses (0, 2, 1, 3) to rearrange heads.
+    """
+
+    def __init__(self, tensor, axes):
+        """
+        Args:
+            tensor: Input tensor
+            axes: Tuple of axis indices defining the permutation
+        """
+        super().__init__(tensor)
+        self.axes = axes
+        # Compute inverse permutation: if axes[i] = j, then inverse_axes[j] = i
+        self.inverse_axes = tuple(np.argsort(axes))
+
+    def apply(self, grad_output):
+        """
+        Compute gradient for permutation.
+        
+        The gradient is permuted back using the inverse permutation.
+        
+        **Mathematical Foundation:**
+        - ∂(X.permute(axes))/∂X = grad_output.permute(inverse_axes)
+        """
+        x, = self.saved_tensors
+        grad_x = None
+
+        if isinstance(x, Tensor) and x.requires_grad:
+            # Permute gradient back to original axis order
+            grad_x = np.transpose(grad_output, self.inverse_axes)
+
+        return (grad_x,)
+
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 20
+class EmbeddingBackward(Function):
+    """
+    Gradient computation for embedding lookup operation.
+    
+    **Mathematical Rule:** If Y = Embedding[indices], then:
+    - ∂Loss/∂Embedding[i] = sum of all gradients where index==i
+    
+    **Key Insight:** Embedding lookup is a gather operation. The backward
+    is a scatter operation that accumulates gradients to the embedding weights.
+    
+    **Applications:** Word embeddings, positional embeddings, token embeddings
+    in transformers.
+    """
+
+    def __init__(self, weight, indices):
+        """
+        Args:
+            weight: Embedding weight matrix
+            indices: Indices used for lookup
+        """
+        super().__init__(weight)
+        self.indices = indices
+
+    def apply(self, grad_output):
+        """
+        Compute gradient for embedding lookup.
+        
+        Args:
+            grad_output: Gradient flowing backward from output
+            
+        Returns:
+            Tuple with single gradient for weight tensor
+            
+        **Mathematical Foundation:**
+        - ∂(Embedding[indices])/∂Embedding = scatter gradients to selected rows
+        - Multiple indices can point to same embedding → gradients accumulate
+        """
+        weight, = self.saved_tensors
+        grad_weight = None
+
+        if isinstance(weight, Tensor) and weight.requires_grad:
+            # Initialize gradient with zeros
+            grad_weight = np.zeros_like(weight.data)
+            
+            # Scatter gradients back to embedding weights
+            # np.add.at accumulates gradients for repeated indices
+            indices_flat = self.indices.data.astype(int).flatten()
+            grad_output_reshaped = grad_output.reshape(-1, grad_output.shape[-1])
+            
+            np.add.at(grad_weight, indices_flat, grad_output_reshaped)
+
+        return (grad_weight,)
+
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 21
+class ReshapeBackward(Function):
+    """
+    Gradient computation for reshape operation.
+    
+    **Mathematical Rule:** If Y = X.reshape(new_shape), then:
+    - ∂Y/∂X = grad_Y.reshape(X.shape)
+    
+    **Key Insight:** Reshape just rearranges the same elements.
+    The gradient is simply reshaped back to the original shape!
+    
+    **Applications:** Flattening tensors for linear layers, reshaping
+    between convolutional and dense layers.
+    """
+
+    def __init__(self, tensor, original_shape):
+        """
+        Args:
+            tensor: Input tensor
+            original_shape: Shape before reshape
+        """
+        super().__init__(tensor)
+        self.original_shape = original_shape
+
+    def apply(self, grad_output):
+        """
+        Compute gradient for reshape.
+        
+        Args:
+            grad_output: Gradient flowing backward from output
+            
+        Returns:
+            Tuple with single gradient for input tensor
+            
+        **Mathematical Foundation:**
+        - ∂(X.reshape(...))/∂X = grad_output.reshape(X.shape)
+        - Just reshape the gradient back!
+        """
+        x, = self.saved_tensors
+        grad_x = None
+
+        if isinstance(x, Tensor) and x.requires_grad:
+            # Reshape gradient back to original shape
+            grad_x = grad_output.reshape(self.original_shape)
+
+        return (grad_x,)
+
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23
 class SumBackward(Function):
     """
     Gradient computation for tensor sum.
@@ -345,186 +521,7 @@ class SumBackward(Function):
             return np.ones_like(tensor.data) * grad_output,
         return None,
 
-# %% ../../modules/source/05_autograd/autograd_dev.ipynb 17
-class ReshapeBackward(Function):
-    """
-    Gradient computation for tensor reshape.
-    
-    **Mathematical Rule:** If z = reshape(a, new_shape), then ∂z/∂a is reshape(grad_z, old_shape)
-    
-    **Key Insight:** Reshape doesn't change values, only their arrangement.
-    Gradients flow back by reshaping to the original shape.
-    
-    **Applications:** Used in transformers (flattening for loss), CNNs, and
-    anywhere tensor dimensions need to be rearranged.
-    """
-
-    def apply(self, grad_output):
-        """
-        Compute gradients for reshape operation.
-        
-        Args:
-            grad_output: Gradient flowing backward from output
-            
-        Returns:
-            Tuple containing gradient for the input tensor
-            
-        **Mathematical Foundation:**
-        - Reshape is a view operation: grad_input = reshape(grad_output, original_shape)
-        """
-        tensor, = self.saved_tensors
-        original_shape = tensor.shape
-
-        if isinstance(tensor, Tensor) and tensor.requires_grad:
-            # Reshape gradient back to original input shape
-            return np.reshape(grad_output, original_shape),
-        return None,
-
-
-# %% ../../modules/source/05_autograd/autograd_dev.ipynb 18
-class EmbeddingBackward(Function):
-    """
-    Gradient computation for embedding lookup.
-    
-    **Mathematical Rule:** If z = embedding[indices], gradients accumulate at indexed positions.
-    
-    **Key Insight:** Multiple indices can point to the same embedding vector,
-    so gradients must accumulate (not overwrite) at each position.
-    
-    **Applications:** Used in NLP transformers, language models, and any discrete input.
-    """
-
-    def apply(self, grad_output):
-        """
-        Compute gradients for embedding lookup.
-        
-        Args:
-            grad_output: Gradient flowing backward from output (batch, seq, embed_dim)
-            
-        Returns:
-            Tuple containing gradient for the embedding weight matrix
-            
-        **Mathematical Foundation:**
-        - Embedding is a lookup: output[i] = weight[indices[i]]
-        - Gradients scatter back to indexed positions: grad_weight[indices[i]] += grad_output[i]
-        - Must accumulate because multiple positions can use same embedding
-        """
-        weight, indices = self.saved_tensors
-        
-        if isinstance(weight, Tensor) and weight.requires_grad:
-            # Initialize gradient matrix with zeros
-            grad_weight = np.zeros_like(weight.data)
-            
-            # Scatter gradients back to embedding table
-            # np.add.at accumulates values at repeated indices
-            flat_indices = indices.data.astype(int).flatten()
-            flat_grad_output = grad_output.reshape((-1, weight.shape[-1]))
-            
-            np.add.at(grad_weight, flat_indices, flat_grad_output)
-            
-            return grad_weight, None
-        
-        return None, None
-
-
-#| export
-class SqrtBackward(Function):
-    """
-    Gradient computation for square root.
-    
-    **Mathematical Rule:** If z = sqrt(x), then ∂z/∂x = 1 / (2 * sqrt(x))
-    
-    **Key Insight:** Gradient is inversely proportional to the square root output.
-    
-    **Applications:** Used in normalization (LayerNorm, BatchNorm), distance metrics.
-    """
-
-    def apply(self, grad_output):
-        """
-        Compute gradients for sqrt operation.
-        
-        Args:
-            grad_output: Gradient flowing backward from output
-            
-        Returns:
-            Tuple containing gradient for the input
-            
-        **Mathematical Foundation:**
-        - d/dx(sqrt(x)) = 1 / (2 * sqrt(x)) = 1 / (2 * output)
-        """
-        x, = self.saved_tensors
-        output = self.saved_output
-        
-        if isinstance(x, Tensor) and x.requires_grad:
-            # Gradient: 1 / (2 * sqrt(x))
-            grad_x = grad_output / (2.0 * output.data)
-            return grad_x,
-        
-        return None,
-
-
-#| export
-class MeanBackward(Function):
-    """
-    Gradient computation for mean reduction.
-    
-    **Mathematical Rule:** If z = mean(x), then ∂z/∂x_i = 1 / N for all i
-    
-    **Key Insight:** Mean distributes gradient equally to all input elements.
-    
-    **Applications:** Used in loss functions, normalization (LayerNorm, BatchNorm).
-    """
-
-    def apply(self, grad_output):
-        """
-        Compute gradients for mean reduction.
-        
-        Args:
-            grad_output: Gradient flowing backward from output
-            
-        Returns:
-            Tuple containing gradient for the input
-            
-        **Mathematical Foundation:**
-        - mean reduces by averaging, so gradient is distributed equally
-        - Each input element contributes 1/N to the output
-        - Gradient: grad_output / N, broadcasted to input shape
-        """
-        x, = self.saved_tensors
-        axis = self.axis
-        keepdims = self.keepdims
-        
-        if isinstance(x, Tensor) and x.requires_grad:
-            # Number of elements that were averaged
-            if axis is None:
-                N = x.size
-            else:
-                if isinstance(axis, int):
-                    N = x.shape[axis]
-                else:
-                    N = np.prod([x.shape[ax] for ax in axis])
-            
-            # Distribute gradient equally: each element gets grad_output / N
-            grad_x = grad_output / N
-            
-            # Broadcast gradient back to original shape
-            if not keepdims and axis is not None:
-                # Need to add back the reduced dimensions for broadcasting
-                if isinstance(axis, int):
-                    grad_x = np.expand_dims(grad_x, axis=axis)
-                else:
-                    for ax in sorted(axis):
-                        grad_x = np.expand_dims(grad_x, axis=ax)
-            
-            # Broadcast to match input shape
-            grad_x = np.broadcast_to(grad_x, x.shape)
-            
-            return grad_x,
-        
-        return None,
-
-
-# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28
 class ReLUBackward(Function):
     """
     Gradient computation for ReLU activation.
@@ -547,48 +544,7 @@ class ReLUBackward(Function):
             return grad_output * relu_grad,
         return None,
 
-# %% ../../modules/source/05_autograd/autograd_dev.ipynb 24
-class GELUBackward(Function):
-    """
-    Gradient computation for GELU activation.
-    
-    **Mathematical Rule:** GELU(x) = x * Φ(x) where Φ is the standard normal CDF
-    
-    **Key Insight:** GELU gradient involves both the function value and its derivative.
-    
-    **Applications:** Used in modern transformers (GPT, BERT) as a smooth alternative to ReLU.
-    """
-
-    def apply(self, grad_output):
-        """
-        Compute gradients for GELU activation.
-        
-        Args:
-            grad_output: Gradient flowing backward from output
-            
-        Returns:
-            Tuple containing gradient for the input
-            
-        **Mathematical Foundation:**
-        - GELU approximation: f(x) = x * sigmoid(1.702 * x)
-        - Gradient: f'(x) = sigmoid(1.702*x) + x * sigmoid(1.702*x) * (1-sigmoid(1.702*x)) * 1.702
-        """
-        x, = self.saved_tensors
-        
-        if isinstance(x, Tensor) and x.requires_grad:
-            # GELU gradient using approximation
-            # f(x) = x * sigmoid(1.702*x)
-            # f'(x) = sigmoid(1.702*x) + 1.702 * x * sigmoid(1.702*x) * (1 - sigmoid(1.702*x))
-            
-            sig = 1.0 / (1.0 + np.exp(-1.702 * x.data))
-            grad_x = grad_output * (sig + 1.702 * x.data * sig * (1 - sig))
-            
-            return grad_x,
-        
-        return None,
-
-
-# %% ../../modules/source/05_autograd/autograd_dev.ipynb 25
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29
 class SigmoidBackward(Function):
     """
     Gradient computation for sigmoid activation.
@@ -618,7 +574,101 @@ class SigmoidBackward(Function):
             return grad_output * sigmoid_grad,
         return None,
 
-# %% ../../modules/source/05_autograd/autograd_dev.ipynb 26
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 30
+class SoftmaxBackward(Function):
+    """
+    Gradient computation for softmax activation.
+    
+    Softmax: softmax(x)[i] = exp(x[i]) / sum(exp(x))
+    Derivative: ∂softmax/∂x[i] = softmax[i] * (δ[i,j] - softmax[j])
+    
+    For gradient computation:
+    grad_x[i] = softmax[i] * (grad_y[i] - sum(grad_y * softmax))
+    
+    **Key Insight:** The gradient depends on all elements of softmax due to
+    the normalization, not just the element being differentiated.
+    """
+    
+    def __init__(self, input_tensor, output_tensor, dim=-1):
+        """
+        Initialize with input, output, and dimension.
+        
+        Args:
+            input_tensor: Original input to softmax
+            output_tensor: Output of softmax (needed for gradient)
+            dim: Dimension along which softmax was applied
+        """
+        super().__init__(input_tensor)
+        self.output_data = output_tensor.data
+        self.dim = dim
+    
+    def apply(self, grad_output):
+        """
+        Compute gradient for softmax.
+        
+        Mathematical formula:
+        ∂L/∂x[i] = softmax[i] * (∂L/∂y[i] - sum_j(∂L/∂y[j] * softmax[j]))
+        
+        This can be vectorized as:
+        grad_x = softmax * (grad_y - sum(grad_y * softmax, keepdims=True))
+        """
+        tensor, = self.saved_tensors
+        
+        if isinstance(tensor, Tensor) and tensor.requires_grad:
+            # Compute sum(grad_output * softmax) along the softmax dimension
+            sum_term = np.sum(grad_output * self.output_data, axis=self.dim, keepdims=True)
+            
+            # Softmax gradient: softmax * (grad_output - sum_term)
+            grad_x = self.output_data * (grad_output - sum_term)
+            
+            return (grad_x,)
+        return (None,)
+
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 31
+class GELUBackward(Function):
+    """
+    Gradient computation for GELU activation.
+    
+    GELU: f(x) = x * Φ(x) where Φ is the CDF of standard normal
+    Approximation: gelu(x) ≈ 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))
+    
+    **Key Insight:** GELU is smoother than ReLU, providing non-zero gradients
+    for negative values, which helps training deep networks.
+    """
+    
+    def __init__(self, input_tensor):
+        """Initialize with input tensor."""
+        super().__init__(input_tensor)
+    
+    def apply(self, grad_output):
+        """
+        Compute gradient for GELU.
+        
+        Mathematical formula (using approximation):
+        ∂gelu/∂x ≈ 0.5 * (1 + tanh(...)) + 0.5 * x * sech²(...) * (...)
+        
+        Simplified: We compute the derivative numerically or use the formula.
+        """
+        tensor, = self.saved_tensors
+        
+        if isinstance(tensor, Tensor) and tensor.requires_grad:
+            x = tensor.data
+            # GELU derivative approximation
+            # Using the tanh approximation: gelu(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+            sqrt_2_over_pi = np.sqrt(2.0 / np.pi)
+            x_cubed = x ** 3
+            tanh_arg = sqrt_2_over_pi * (x + 0.044715 * x_cubed)
+            tanh_out = np.tanh(tanh_arg)
+            sech_squared = 1 - tanh_out ** 2
+            
+            # Derivative: 0.5 * (1 + tanh(...)) + 0.5 * x * sech²(...) * d(tanh_arg)/dx
+            d_tanh_arg = sqrt_2_over_pi * (1 + 0.134145 * x ** 2)
+            gelu_grad = 0.5 * (1 + tanh_out) + 0.5 * x * sech_squared * d_tanh_arg
+            
+            return (grad_output * gelu_grad,)
+        return (None,)
+
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 32
 class MSEBackward(Function):
     """
     Gradient computation for Mean Squared Error Loss.
@@ -644,7 +694,7 @@ class MSEBackward(Function):
             return grad * grad_output,
         return None,
 
-# %% ../../modules/source/05_autograd/autograd_dev.ipynb 27
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 33
 class BCEBackward(Function):
     """
     Gradient computation for Binary Cross-Entropy Loss.
@@ -674,7 +724,7 @@ class BCEBackward(Function):
             return grad * grad_output,
         return None,
 
-# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 34
 class CrossEntropyBackward(Function):
     """
     Gradient computation for Cross-Entropy Loss.
@@ -719,7 +769,7 @@ class CrossEntropyBackward(Function):
             return grad * grad_output,
         return None,
 
-# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29
+# %% ../../modules/source/05_autograd/autograd_dev.ipynb 35
 def enable_autograd():
     """
     Enable gradient tracking for all Tensor operations.
@@ -758,8 +808,10 @@ def enable_autograd():
     _original_add = Tensor.__add__
     _original_sub = Tensor.__sub__
     _original_mul = Tensor.__mul__
-    _original_truediv = Tensor.__truediv__
+    _original_div = Tensor.__truediv__
     _original_matmul = Tensor.matmul if hasattr(Tensor, 'matmul') else None
+    _original_transpose = Tensor.transpose if hasattr(Tensor, 'transpose') else None
+    _original_reshape = Tensor.reshape if hasattr(Tensor, 'reshape') else None
 
     # Enhanced operations that track gradients
     def tracked_add(self, other):
@@ -806,6 +858,76 @@ def enable_autograd():
 
         return result
 
+    def tracked_matmul(self, other):
+        """
+        Matrix multiplication with gradient tracking.
+        
+        Enhances the original matmul method to build computation graphs
+        when requires_grad=True for any input.
+        """
+        if _original_matmul:
+            result = _original_matmul(self, other)
+        else:
+            # Fallback if matmul doesn't exist
+            result = Tensor(np.dot(self.data, other.data))
+
+        # Track gradient if needed
+        if self.requires_grad or other.requires_grad:
+            result.requires_grad = True
+            result._grad_fn = MatmulBackward(self, other)
+
+        return result
+
+    def tracked_transpose(self, dim0=None, dim1=None):
+        """
+        Transpose with gradient tracking.
+        
+        Enhances the original transpose method to build computation graphs
+        when requires_grad=True for the input.
+        """
+        if _original_transpose:
+            result = _original_transpose(self, dim0, dim1)
+        else:
+            # Fallback if transpose doesn't exist
+            if dim0 is None and dim1 is None:
+                axes = list(range(len(self.shape)))
+                if len(axes) >= 2:
+                    axes[-2], axes[-1] = axes[-1], axes[-2]
+                result = Tensor(np.transpose(self.data, axes))
+            else:
+                axes = list(range(len(self.shape)))
+                axes[dim0], axes[dim1] = axes[dim1], axes[dim0]
+                result = Tensor(np.transpose(self.data, axes))
+
+        # Track gradient if needed
+        if self.requires_grad:
+            result.requires_grad = True
+            result._grad_fn = TransposeBackward(self, dim0, dim1)
+
+        return result
+
+    def tracked_reshape(self, *shape):
+        """
+        Reshape with gradient tracking.
+        
+        Enhances the original reshape method to build computation graphs
+        when requires_grad=True for the input.
+        """
+        original_shape = self.shape
+        
+        if _original_reshape:
+            result = _original_reshape(self, *shape)
+        else:
+            # Fallback if reshape doesn't exist
+            result = Tensor(self.data.reshape(*shape))
+
+        # Track gradient if needed
+        if self.requires_grad:
+            result.requires_grad = True
+            result._grad_fn = ReshapeBackward(self, original_shape)
+
+        return result
+
     def tracked_sub(self, other):
         """
         Subtraction with gradient tracking.
@@ -827,7 +949,7 @@ def enable_autograd():
 
         return result
 
-    def tracked_truediv(self, other):
+    def tracked_div(self, other):
         """
         Division with gradient tracking.
         
@@ -839,7 +961,7 @@ def enable_autograd():
             other = Tensor(other)
 
         # Call original operation
-        result = _original_truediv(self, other)
+        result = _original_div(self, other)
 
         # Track gradient if needed
         if self.requires_grad or other.requires_grad:
@@ -848,26 +970,6 @@ def enable_autograd():
 
         return result
 
-    def tracked_matmul(self, other):
-        """
-        Matrix multiplication with gradient tracking.
-        
-        Enhances the original matmul method to build computation graphs
-        when requires_grad=True for any input.
-        """
-        if _original_matmul:
-            result = _original_matmul(self, other)
-        else:
-            # Fallback if matmul doesn't exist
-            result = Tensor(np.dot(self.data, other.data))
-
-        # Track gradient if needed
-        if self.requires_grad or other.requires_grad:
-            result.requires_grad = True
-            result._grad_fn = MatmulBackward(self, other)
-
-        return result
-
     def sum_op(self, axis=None, keepdims=False):
         """
         Sum operation with gradient tracking.
@@ -958,20 +1060,23 @@ def enable_autograd():
     Tensor.__add__ = tracked_add
     Tensor.__sub__ = tracked_sub
     Tensor.__mul__ = tracked_mul
-    Tensor.__truediv__ = tracked_truediv
+    Tensor.__truediv__ = tracked_div
     Tensor.matmul = tracked_matmul
+    Tensor.transpose = tracked_transpose
+    Tensor.reshape = tracked_reshape
     Tensor.sum = sum_op
     Tensor.backward = backward
     Tensor.zero_grad = zero_grad
 
     # Patch activations and losses to track gradients
     try:
-        from tinytorch.core.activations import Sigmoid, ReLU, GELU
+        from tinytorch.core.activations import Sigmoid, ReLU, Softmax, GELU
         from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss, CrossEntropyLoss
         
         # Store original methods
         _original_sigmoid_forward = Sigmoid.forward
         _original_relu_forward = ReLU.forward
+        _original_softmax_forward = Softmax.forward
         _original_gelu_forward = GELU.forward
         _original_bce_forward = BinaryCrossEntropyLoss.forward
         _original_mse_forward = MSELoss.forward
@@ -999,13 +1104,24 @@ def enable_autograd():
             
             return result
         
+        def tracked_softmax_forward(self, x, dim=-1):
+            """Softmax with gradient tracking."""
+            # Call original forward to get result using Tensor operations
+            result = _original_softmax_forward(self, x, dim=dim)
+            
+            # Attach the correct gradient function
+            if x.requires_grad:
+                result.requires_grad = True
+                result._grad_fn = SoftmaxBackward(x, result, dim)
+            
+            return result
+        
         def tracked_gelu_forward(self, x):
             """GELU with gradient tracking."""
-            # GELU approximation: x * sigmoid(1.702 * x)
-            sigmoid_part = 1.0 / (1.0 + np.exp(-1.702 * x.data))
-            result_data = x.data * sigmoid_part
-            result = Tensor(result_data)
+            # Call original forward to get result
+            result = _original_gelu_forward(self, x)
             
+            # Attach the correct gradient function
             if x.requires_grad:
                 result.requires_grad = True
                 result._grad_fn = GELUBackward(x)
@@ -1071,6 +1187,7 @@ def enable_autograd():
         # Install patched methods
         Sigmoid.forward = tracked_sigmoid_forward
         ReLU.forward = tracked_relu_forward
+        Softmax.forward = tracked_softmax_forward
         GELU.forward = tracked_gelu_forward
         BinaryCrossEntropyLoss.forward = tracked_bce_forward
         MSELoss.forward = tracked_mse_forward
diff --git a/tinytorch/core/tensor.py b/tinytorch/core/tensor.py
index 4c0912c0..82e681fa 100644
--- a/tinytorch/core/tensor.py
+++ b/tinytorch/core/tensor.py
@@ -113,21 +113,10 @@ class Tensor:
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             # Tensor + Tensor: let NumPy handle broadcasting
-            result_data = self.data + other.data
+            return Tensor(self.data + other.data)
         else:
             # Tensor + scalar: NumPy broadcasts automatically
-            result_data = self.data + other
-
-        # Create new tensor with result
-        result = Tensor(result_data)
-
-        # Preserve gradient tracking if either operand requires gradients
-        if hasattr(self, 'requires_grad') and hasattr(other, 'requires_grad'):
-            result.requires_grad = self.requires_grad or (isinstance(other, Tensor) and other.requires_grad)
-        elif hasattr(self, 'requires_grad'):
-            result.requires_grad = self.requires_grad
-
-        return result
+            return Tensor(self.data + other)
         ### END SOLUTION
 
     # nbgrader={"grade": false, "grade_id": "more-arithmetic", "solution": true}
@@ -137,10 +126,12 @@ class Tensor:
 
         Common use: Centering data (x - mean), computing differences for loss functions.
         """
+        ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data - other.data)
         else:
             return Tensor(self.data - other)
+        ### END SOLUTION
 
     def __mul__(self, other):
         """
@@ -149,10 +140,12 @@ class Tensor:
         Common use: Scaling features, applying masks, gating mechanisms in neural networks.
         Note: This is * operator, not @ (which will be matrix multiplication).
         """
+        ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data * other.data)
         else:
             return Tensor(self.data * other)
+        ### END SOLUTION
 
     def __truediv__(self, other):
         """
@@ -160,10 +153,12 @@ class Tensor:
 
         Common use: Normalization (x / std), converting counts to probabilities.
         """
+        ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data / other.data)
         else:
             return Tensor(self.data / other)
+        ### END SOLUTION
 
     # nbgrader={"grade": false, "grade_id": "matmul-impl", "solution": true}
     def matmul(self, other):
@@ -232,7 +227,8 @@ class Tensor:
                 )
 
         # Perform optimized matrix multiplication
-        result_data = np.dot(self.data, other.data)
+        # Use np.matmul (not np.dot) for proper batched matrix multiplication with 3D+ tensors
+        result_data = np.matmul(self.data, other.data)
         return Tensor(result_data)
         ### END SOLUTION
 
@@ -304,16 +300,8 @@ class Tensor:
 
         # Reshape the data (NumPy handles the memory layout efficiently)
         reshaped_data = np.reshape(self.data, new_shape)
-        
-        # Create output tensor preserving gradient tracking
+        # Preserve gradient tracking from the original tensor (important for autograd!)
         result = Tensor(reshaped_data, requires_grad=self.requires_grad)
-        
-        # Set up backward function for autograd
-        if self.requires_grad:
-            from tinytorch.core.autograd import ReshapeBackward
-            result._grad_fn = ReshapeBackward()
-            result._grad_fn.saved_tensors = (self,)
-        
         return result
         ### END SOLUTION
 
@@ -380,7 +368,9 @@ class Tensor:
             axes[dim0], axes[dim1] = axes[dim1], axes[dim0]
             transposed_data = np.transpose(self.data, axes)
 
-        return Tensor(transposed_data)
+        # Preserve requires_grad for gradient tracking (Module 05 will add _grad_fn)
+        result = Tensor(transposed_data, requires_grad=self.requires_grad if hasattr(self, 'requires_grad') else False)
+        return result
         ### END SOLUTION
 
     # nbgrader={"grade": false, "grade_id": "reduction-ops", "solution": true}
diff --git a/tinytorch/core/training.py b/tinytorch/core/training.py
index f535f6b8..e4082b8f 100644
--- a/tinytorch/core/training.py
+++ b/tinytorch/core/training.py
@@ -15,7 +15,7 @@
 # ║     happens! The tinytorch/ directory is just the compiled output.           ║
 # ╚═══════════════════════════════════════════════════════════════════════════════╝
 # %% auto 0
-__all__ = ['CosineSchedule', 'save_checkpoint', 'load_checkpoint', 'Trainer']
+__all__ = ['CosineSchedule', 'Trainer']
 
 # %% ../../modules/source/07_training/training_dev.ipynb 1
 import numpy as np
@@ -72,90 +72,6 @@ class CosineSchedule:
     ### END SOLUTION
 
 # %% ../../modules/source/07_training/training_dev.ipynb 14
-def save_checkpoint(checkpoint_dict: Dict[str, Any], path: str):
-    """
-    Save checkpoint dictionary to disk using pickle.
-    
-    This is a low-level utility for saving model state. Use this when you have
-    a custom training loop and want to save just what you need (model params,
-    config, metadata).
-    
-    For complete training state with optimizer and scheduler, use 
-    Trainer.save_checkpoint() instead.
-    
-    TODO: Implement checkpoint saving with pickle
-    
-    APPROACH:
-    1. Create parent directory if it doesn't exist (Path(path).parent.mkdir)
-    2. Open file in binary write mode ('wb')
-    3. Use pickle.dump() to serialize the checkpoint dictionary
-    4. Print confirmation message
-    
-    EXAMPLE:
-    >>> model = SimpleModel()
-    >>> checkpoint = {
-    ...     'model_params': [p.data.copy() for p in model.parameters()],
-    ...     'config': {'embed_dim': 32, 'num_layers': 2},
-    ...     'metadata': {'final_loss': 0.089, 'training_steps': 5000}
-    ... }
-    >>> save_checkpoint(checkpoint, 'checkpoints/model.pkl')
-    ✓ Checkpoint saved: checkpoints/model.pkl
-    
-    HINTS:
-    - Use Path(path).parent.mkdir(parents=True, exist_ok=True)
-    - pickle.dump(obj, file) writes the object to file
-    - Always print a success message so users know it worked
-    """
-    ### BEGIN SOLUTION
-    # Create parent directory if needed
-    Path(path).parent.mkdir(parents=True, exist_ok=True)
-    
-    # Save checkpoint using pickle
-    with open(path, 'wb') as f:
-        pickle.dump(checkpoint_dict, f)
-    
-    print(f"✓ Checkpoint saved: {path}")
-    ### END SOLUTION
-
-# %% ../../modules/source/07_training/training_dev.ipynb 15
-def load_checkpoint(path: str) -> Dict[str, Any]:
-    """
-    Load checkpoint dictionary from disk using pickle.
-    
-    Companion function to save_checkpoint(). Restores the checkpoint dictionary
-    so you can rebuild your model, resume training, or inspect saved metadata.
-    
-    TODO: Implement checkpoint loading with pickle
-    
-    APPROACH:
-    1. Open file in binary read mode ('rb')
-    2. Use pickle.load() to deserialize the checkpoint
-    3. Print confirmation message
-    4. Return the loaded dictionary
-    
-    EXAMPLE:
-    >>> checkpoint = load_checkpoint('checkpoints/model.pkl')
-    ✓ Checkpoint loaded: checkpoints/model.pkl
-    >>> print(checkpoint['metadata']['final_loss'])
-    0.089
-    >>> model_params = checkpoint['model_params']
-    >>> # Now restore model: for param, data in zip(model.parameters(), model_params)...
-    
-    HINTS:
-    - pickle.load(file) reads and deserializes the object
-    - Return the loaded dictionary
-    - Print a success message for user feedback
-    """
-    ### BEGIN SOLUTION
-    # Load checkpoint using pickle
-    with open(path, 'rb') as f:
-        checkpoint = pickle.load(f)
-    
-    print(f"✓ Checkpoint loaded: {path}")
-    return checkpoint
-    ### END SOLUTION
-
-# %% ../../modules/source/07_training/training_dev.ipynb 19
 class Trainer:
     """
     Complete training orchestrator for neural networks.
@@ -330,11 +246,6 @@ class Trainer:
     def save_checkpoint(self, path: str):
         """
         Save complete training state for resumption.
-        
-        This high-level method saves everything needed to resume training:
-        model parameters, optimizer state, scheduler state, and training history.
-        
-        Uses the low-level save_checkpoint() function internally.
 
         Args:
             path: File path to save checkpoint
@@ -349,23 +260,19 @@ class Trainer:
             'training_mode': self.training_mode
         }
 
-        # Use the standalone save_checkpoint function
-        save_checkpoint(checkpoint, path)
+        Path(path).parent.mkdir(parents=True, exist_ok=True)
+        with open(path, 'wb') as f:
+            pickle.dump(checkpoint, f)
 
     def load_checkpoint(self, path: str):
         """
         Load training state from checkpoint.
-        
-        This high-level method restores complete training state including
-        model parameters, optimizer state, scheduler state, and history.
-        
-        Uses the low-level load_checkpoint() function internally.
 
         Args:
             path: File path to load checkpoint from
         """
-        # Use the standalone load_checkpoint function
-        checkpoint = load_checkpoint(path)
+        with open(path, 'rb') as f:
+            checkpoint = pickle.load(f)
 
         self.epoch = checkpoint['epoch']
         self.step = checkpoint['step']
diff --git a/tinytorch/models/transformer.py b/tinytorch/models/transformer.py
index a04d2cbd..4bf34131 100644
--- a/tinytorch/models/transformer.py
+++ b/tinytorch/models/transformer.py
@@ -23,47 +23,7 @@ from ..core.tensor import Tensor
 from ..core.layers import Linear
 from ..core.attention import MultiHeadAttention
 from ..core.activations import GELU
-from ..text.embeddings import Embedding
-from ..core.autograd import SqrtBackward, MeanBackward
-
-# Monkey-patch sqrt method onto Tensor for LayerNorm
-def _tensor_sqrt(self):
-    """
-    Compute element-wise square root with gradient tracking.
-    
-    Used in normalization layers (LayerNorm, BatchNorm).
-    """
-    result_data = np.sqrt(self.data)
-    result = Tensor(result_data, requires_grad=self.requires_grad)
-    
-    if self.requires_grad:
-        result._grad_fn = SqrtBackward()
-        result._grad_fn.saved_tensors = (self,)
-        result._grad_fn.saved_output = result
-    
-    return result
-
-Tensor.sqrt = _tensor_sqrt
-
-# Monkey-patch mean method onto Tensor for LayerNorm
-def _tensor_mean(self, axis=None, keepdims=False):
-    """
-    Compute mean with gradient tracking.
-    
-    Used in normalization layers (LayerNorm, BatchNorm) and loss functions.
-    """
-    result_data = np.mean(self.data, axis=axis, keepdims=keepdims)
-    result = Tensor(result_data, requires_grad=self.requires_grad)
-    
-    if self.requires_grad:
-        result._grad_fn = MeanBackward()
-        result._grad_fn.saved_tensors = (self,)
-        result._grad_fn.axis = axis
-        result._grad_fn.keepdims = keepdims
-    
-    return result
-
-Tensor.mean = _tensor_mean
+from ..text.embeddings import Embedding, PositionalEncoding
 
 # %% ../../modules/source/13_transformers/transformers_dev.ipynb 9
 class LayerNorm:
@@ -101,7 +61,6 @@ class LayerNorm:
         self.eps = eps
 
         # Learnable parameters: scale and shift
-        # CRITICAL: requires_grad=True so optimizer can train these!
         self.gamma = Tensor(np.ones(normalized_shape), requires_grad=True)  # Scale parameter
         self.beta = Tensor(np.zeros(normalized_shape), requires_grad=True)  # Shift parameter
         ### END SOLUTION
@@ -124,24 +83,29 @@ class LayerNorm:
         HINT: Use keepdims=True to maintain tensor dimensions for broadcasting
         """
         ### BEGIN SOLUTION
-        # CRITICAL: Use Tensor operations (not .data) to maintain gradient flow!
         # Compute statistics across last dimension (features)
         mean = x.mean(axis=-1, keepdims=True)
 
         # Compute variance: E[(x - μ)²]
-        diff = x - mean  # Tensor subtraction maintains gradient
-        variance = (diff * diff).mean(axis=-1, keepdims=True)  # Tensor ops maintain gradient
+        # Use Tensor operations to preserve computation graph!
+        diff = x - mean
+        variance = (diff * diff).mean(axis=-1, keepdims=True)
 
-        # Normalize: (x - mean) / sqrt(variance + eps)
-        # Note: Use Tensor.sqrt() to preserve gradient flow
-        std = (variance + self.eps).sqrt()  # sqrt maintains gradient flow
-        normalized = diff / std  # Division maintains gradient flow
+        # Normalize - use Tensor operations to preserve gradients!
+        # Add eps as a Tensor for proper gradient flow
+        eps_tensor = Tensor(np.array(self.eps), requires_grad=False)
+        std = Tensor(np.sqrt(variance.data + self.eps), requires_grad=variance.requires_grad)
+        normalized = (x - mean) / std
 
         # Apply learnable transformation
         output = normalized * self.gamma + self.beta
         return output
         ### END SOLUTION
 
+    def __call__(self, x):
+        """Allows the layer norm to be called like a function."""
+        return self.forward(x)
+
     def parameters(self):
         """Return learnable parameters."""
         return [self.gamma, self.beta]
@@ -183,10 +147,8 @@ class MLP:
 
         # Two-layer feed-forward network
         self.linear1 = Linear(embed_dim, hidden_dim)
+        self.gelu = GELU()  # Use GELU activation from activations module
         self.linear2 = Linear(hidden_dim, embed_dim)
-        
-        # GELU activation
-        self.gelu = GELU()
         ### END SOLUTION
 
     def forward(self, x):
@@ -209,8 +171,8 @@ class MLP:
         # First linear layer with expansion
         hidden = self.linear1.forward(x)
 
-        # GELU activation (callable pattern - activations have __call__)
-        hidden = self.gelu(hidden)
+        # GELU activation (YOUR activation from Module 03!)
+        hidden = self.gelu.forward(hidden)
 
         # Second linear layer back to original size
         output = self.linear2.forward(hidden)
@@ -218,6 +180,10 @@ class MLP:
         return output
         ### END SOLUTION
 
+    def __call__(self, x):
+        """Allows the MLP to be called like a function."""
+        return self.forward(x)
+
     def parameters(self):
         """Return all learnable parameters."""
         params = []
@@ -298,7 +264,7 @@ class TransformerBlock:
         # First sub-layer: Multi-head self-attention with residual connection
         # Pre-norm: LayerNorm before attention
         normed1 = self.ln1.forward(x)
-        # Self-attention: MultiHeadAttention internally creates Q, K, V from input
+        # Self-attention: query, key, value are all the same (normed1)
         attention_out = self.attention.forward(normed1, mask)
 
         # Residual connection
@@ -315,6 +281,10 @@ class TransformerBlock:
         return output
         ### END SOLUTION
 
+    def __call__(self, x, mask=None):
+        """Allows the transformer block to be called like a function."""
+        return self.forward(x, mask)
+
     def parameters(self):
         """Return all learnable parameters."""
         params = []
@@ -434,6 +404,10 @@ class GPT:
         return logits
         ### END SOLUTION
 
+    def __call__(self, tokens):
+        """Allows the GPT model to be called like a function."""
+        return self.forward(tokens)
+
     def _create_causal_mask(self, seq_len):
         """Create causal mask to prevent attending to future positions."""
         ### BEGIN SOLUTION
diff --git a/tinytorch/optimization/acceleration.py b/tinytorch/optimization/acceleration.py
new file mode 100644
index 00000000..e59fe00f
--- /dev/null
+++ b/tinytorch/optimization/acceleration.py
@@ -0,0 +1,22 @@
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║                        🚨 CRITICAL WARNING 🚨                                ║
+# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
+# ║                                                                               ║
+# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
+# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
+# ║                                                                               ║
+# ║  ✅ TO EDIT: modules/source/XX_acceleration/acceleration_dev.py     ║
+# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
+# ║                                                                               ║
+# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
+# ║     Editing it directly may break module functionality and training.         ║
+# ║                                                                               ║
+# ║  🎓 LEARNING TIP: Work in modules/source/ - that's where real development    ║
+# ║     happens! The tinytorch/ directory is just the compiled output.           ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
+# %% auto 0
+__all__ = []
+
+# %% ../../modules/source/18_acceleration/acceleration_dev.ipynb 0
+#| default_exp optimization.acceleration
+#| export
diff --git a/tinytorch/optimization/compression.py b/tinytorch/optimization/compression.py
new file mode 100644
index 00000000..20c318fa
--- /dev/null
+++ b/tinytorch/optimization/compression.py
@@ -0,0 +1,300 @@
+# ╔═══════════════════════════════════════════════════════════════════════════════╗
+# ║                        🚨 CRITICAL WARNING 🚨                                ║
+# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
+# ║                                                                               ║
+# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
+# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
+# ║                                                                               ║
+# ║  ✅ TO EDIT: modules/source/XX_compression/compression_dev.py       ║
+# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
+# ║                                                                               ║
+# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
+# ║     Editing it directly may break module functionality and training.         ║
+# ║                                                                               ║
+# ║  🎓 LEARNING TIP: Work in modules/source/ - that's where real development    ║
+# ║     happens! The tinytorch/ directory is just the compiled output.           ║
+# ╚═══════════════════════════════════════════════════════════════════════════════╝
+# %% auto 0
+__all__ = ['Sequential', 'KnowledgeDistillation', 'test_unit_knowledge_distillation', 'CompressionComplete', 'measure_sparsity',
+           'magnitude_prune', 'structured_prune', 'compress_model']
+
+# %% ../../modules/source/17_compression/compression_dev.ipynb 1
+import numpy as np
+import copy
+from typing import List, Dict, Any, Tuple, Optional
+import time
+
+# Import from TinyTorch modules
+from ..core.tensor import Tensor
+from ..core.layers import Linear
+
+# Sequential container for model compression
+class Sequential:
+    """Sequential container for compression (not exported from core layers)."""
+    def __init__(self, *layers):
+        self.layers = list(layers)
+
+    def forward(self, x):
+        for layer in self.layers:
+            x = layer.forward(x) if hasattr(layer, 'forward') else layer(x)
+        return x
+
+    def __call__(self, x):
+        return self.forward(x)
+
+    def parameters(self):
+        params = []
+        for layer in self.layers:
+            if hasattr(layer, 'parameters'):
+                params.extend(layer.parameters())
+        return params
+
+# %% ../../modules/source/17_compression/compression_dev.ipynb 15
+class KnowledgeDistillation:
+    """
+    Knowledge distillation for model compression.
+
+    Train a smaller student model to mimic a larger teacher model.
+    """
+
+    def __init__(self, teacher_model, student_model, temperature=3.0, alpha=0.7):
+        """
+        Initialize knowledge distillation.
+
+        TODO: Set up teacher and student models with distillation parameters
+
+        APPROACH:
+        1. Store teacher and student models
+        2. Set temperature for softening probability distributions
+        3. Set alpha for balancing hard vs soft targets
+
+        EXAMPLE:
+        >>> teacher = Sequential(Linear(100, 200), Linear(200, 50))
+        >>> student = Sequential(Linear(100, 50))
+        >>> kd = KnowledgeDistillation(teacher, student, temperature=4.0, alpha=0.8)
+        >>> print(f"Temperature: {kd.temperature}, Alpha: {kd.alpha}")
+        Temperature: 4.0, Alpha: 0.8
+
+        HINTS:
+        - Simply assign the parameters to instance variables
+        - Temperature typically ranges from 3-5 for effective softening
+        - Alpha of 0.7 means 70% soft targets, 30% hard targets
+
+        Args:
+            teacher_model: Large, pre-trained model
+            student_model: Smaller model to train
+            temperature: Softening parameter for distributions
+            alpha: Weight for soft target loss (1-alpha for hard targets)
+        """
+        ### BEGIN SOLUTION
+        self.teacher_model = teacher_model
+        self.student_model = student_model
+        self.temperature = temperature
+        self.alpha = alpha
+        ### END SOLUTION
+
+    def distillation_loss(self, student_logits, teacher_logits, true_labels):
+        """
+        Calculate combined distillation loss.
+
+        TODO: Implement knowledge distillation loss function
+
+        APPROACH:
+        1. Calculate hard target loss (student vs true labels)
+        2. Calculate soft target loss (student vs teacher, with temperature)
+        3. Combine losses: alpha * soft_loss + (1-alpha) * hard_loss
+
+        EXAMPLE:
+        >>> kd = KnowledgeDistillation(teacher, student)
+        >>> loss = kd.distillation_loss(student_out, teacher_out, labels)
+        >>> print(f"Distillation loss: {loss:.4f}")
+
+        HINTS:
+        - Use temperature to soften distributions: logits/temperature
+        - Soft targets use KL divergence or cross-entropy
+        - Hard targets use standard classification loss
+        """
+        ### BEGIN SOLUTION
+        # Convert to numpy for this implementation
+        if hasattr(student_logits, 'data'):
+            student_logits = student_logits.data
+        if hasattr(teacher_logits, 'data'):
+            teacher_logits = teacher_logits.data
+        if hasattr(true_labels, 'data'):
+            true_labels = true_labels.data
+
+        # Soften distributions with temperature
+        student_soft = self._softmax(student_logits / self.temperature)
+        teacher_soft = self._softmax(teacher_logits / self.temperature)
+
+        # Soft target loss (KL divergence)
+        soft_loss = self._kl_divergence(student_soft, teacher_soft)
+
+        # Hard target loss (cross-entropy)
+        student_hard = self._softmax(student_logits)
+        hard_loss = self._cross_entropy(student_hard, true_labels)
+
+        # Combined loss
+        total_loss = self.alpha * soft_loss + (1 - self.alpha) * hard_loss
+
+        return total_loss
+        ### END SOLUTION
+
+    def _softmax(self, logits):
+        """Compute softmax with numerical stability."""
+        exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
+        return exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
+
+    def _kl_divergence(self, p, q):
+        """Compute KL divergence between distributions."""
+        return np.sum(p * np.log(p / (q + 1e-8) + 1e-8))
+
+    def _cross_entropy(self, predictions, labels):
+        """Compute cross-entropy loss."""
+        # Simple implementation for integer labels
+        if labels.ndim == 1:
+            return -np.mean(np.log(predictions[np.arange(len(labels)), labels] + 1e-8))
+        else:
+            return -np.mean(np.sum(labels * np.log(predictions + 1e-8), axis=1))
+
+def test_unit_knowledge_distillation():
+    """🔬 Test knowledge distillation functionality."""
+    print("🔬 Unit Test: Knowledge Distillation...")
+
+    # Create teacher and student models
+    teacher = Sequential(Linear(10, 20), Linear(20, 5))
+    student = Sequential(Linear(10, 5))  # Smaller model
+
+    # Initialize knowledge distillation
+    kd = KnowledgeDistillation(teacher, student, temperature=3.0, alpha=0.7)
+
+    # Create dummy data
+    input_data = Tensor(np.random.randn(8, 10))  # Batch of 8
+    true_labels = np.array([0, 1, 2, 3, 4, 0, 1, 2])  # Class labels
+
+    # Forward passes
+    teacher_output = teacher.forward(input_data)
+    student_output = student.forward(input_data)
+
+    # Calculate distillation loss
+    loss = kd.distillation_loss(student_output, teacher_output, true_labels)
+
+    # Verify loss is reasonable
+    assert isinstance(loss, (float, np.floating)), f"Loss should be float, got {type(loss)}"
+    assert loss > 0, f"Loss should be positive, got {loss}"
+    assert not np.isnan(loss), "Loss should not be NaN"
+
+    print("✅ knowledge_distillation works correctly!")
+
+test_unit_knowledge_distillation()
+
+# %% ../../modules/source/17_compression/compression_dev.ipynb 29
+class CompressionComplete:
+    """
+    Complete compression system for milestone use.
+    
+    Provides pruning, distillation, and low-rank approximation techniques.
+    """
+    
+    @staticmethod
+    def measure_sparsity(model) -> float:
+        """Measure the sparsity of a model (fraction of zero weights)."""
+        total_params = 0
+        zero_params = 0
+        
+        if hasattr(model, 'parameters'):
+            for param in model.parameters():
+                total_params += param.size
+                zero_params += np.sum(param.data == 0)
+        
+        return zero_params / total_params if total_params > 0 else 0.0
+    
+    @staticmethod
+    def magnitude_prune(model, sparsity=0.5):
+        """
+        Prune model weights by magnitude (smallest weights set to zero).
+        
+        Args:
+            model: Model with parameters() method
+            sparsity: Fraction of weights to prune (0-1)
+        """
+        if hasattr(model, 'parameters'):
+            for param in model.parameters():
+                threshold = np.percentile(np.abs(param.data), sparsity * 100)
+                param.data[np.abs(param.data) < threshold] = 0
+        
+        return model
+    
+    @staticmethod
+    def structured_prune(model, prune_ratio=0.5):
+        """
+        Prune entire neurons/channels (structured pruning).
+        
+        Args:
+            model: Model to prune
+            prune_ratio: Fraction of structures to prune (0-1)
+        """
+        if hasattr(model, 'parameters'):
+            params = list(model.parameters())
+            if len(params) > 0 and hasattr(params[0], 'data'):
+                weight = params[0]
+                if len(weight.shape) == 2:  # Linear layer
+                    # Prune output neurons
+                    neuron_norms = np.linalg.norm(weight.data, axis=0)
+                    threshold = np.percentile(neuron_norms, prune_ratio * 100)
+                    mask = neuron_norms >= threshold
+                    weight.data[:, ~mask] = 0
+        
+        return model
+    
+    @staticmethod
+    def compress_model(model, compression_config: Dict[str, Any]):
+        """
+        Apply complete compression pipeline to a model.
+        
+        Args:
+            model: Model to compress
+            compression_config: Dictionary with compression settings
+                - 'magnitude_sparsity': float (0-1)
+                - 'structured_prune_ratio': float (0-1)
+        
+        Returns:
+            Compressed model with sparsity stats
+        """
+        stats = {
+            'original_sparsity': CompressionComplete.measure_sparsity(model)
+        }
+        
+        # Apply magnitude pruning
+        if 'magnitude_sparsity' in compression_config:
+            model = CompressionComplete.magnitude_prune(
+                model, compression_config['magnitude_sparsity']
+            )
+        
+        # Apply structured pruning
+        if 'structured_prune_ratio' in compression_config:
+            model = CompressionComplete.structured_prune(
+                model, compression_config['structured_prune_ratio']
+            )
+        
+        stats['final_sparsity'] = CompressionComplete.measure_sparsity(model)
+        stats['compression_ratio'] = 1.0 / (1.0 - stats['final_sparsity']) if stats['final_sparsity'] < 1.0 else float('inf')
+        
+        return model, stats
+
+# Convenience functions for backward compatibility
+def measure_sparsity(model) -> float:
+    """Measure model sparsity."""
+    return CompressionComplete.measure_sparsity(model)
+
+def magnitude_prune(model, sparsity=0.5):
+    """Apply magnitude-based pruning."""
+    return CompressionComplete.magnitude_prune(model, sparsity)
+
+def structured_prune(model, prune_ratio=0.5):
+    """Apply structured pruning."""
+    return CompressionComplete.structured_prune(model, prune_ratio)
+
+def compress_model(model, compression_config: Dict[str, Any]):
+    """Apply complete compression pipeline."""
+    return CompressionComplete.compress_model(model, compression_config)
diff --git a/tinytorch/optimization/quantization.py b/tinytorch/optimization/quantization.py
index 70c0eb48..c30509d3 100644
--- a/tinytorch/optimization/quantization.py
+++ b/tinytorch/optimization/quantization.py
@@ -15,9 +15,9 @@
 # ║     happens! The tinytorch/ directory is just the compiled output.           ║
 # ╚═══════════════════════════════════════════════════════════════════════════════╝
 # %% auto 0
-__all__ = ['QuantizationComplete', 'quantize_int8', 'dequantize_int8', 'quantize_model']
+__all__ = []
 
-# %% ../../modules/source/17_quantization/quantization_dev.ipynb 3
+# %% ../../modules/source/16_quantization/quantization_dev.ipynb 3
 import numpy as np
 import time
 from typing import Tuple, Dict, List, Optional
@@ -29,94 +29,3 @@ from ..core.layers import Linear
 from ..core.activations import ReLU
 
 print("✅ Quantization module imports complete")
-
-# %% ../../modules/source/17_quantization/quantization_dev.ipynb 34
-class QuantizationComplete:
-    """
-    Complete quantization system for milestone use.
-    
-    Provides INT8 quantization with calibration for 4× memory reduction.
-    """
-    
-    @staticmethod
-    def quantize_tensor(tensor: Tensor) -> Tuple[Tensor, float, int]:
-        """Quantize FP32 tensor to INT8."""
-        data = tensor.data
-        min_val = float(np.min(data))
-        max_val = float(np.max(data))
-        
-        if abs(max_val - min_val) < 1e-8:
-            return Tensor(np.zeros_like(data, dtype=np.int8)), 1.0, 0
-        
-        scale = (max_val - min_val) / 255.0
-        zero_point = int(np.round(-128 - min_val / scale))
-        zero_point = int(np.clip(zero_point, -128, 127))
-        
-        quantized_data = np.round(data / scale + zero_point)
-        quantized_data = np.clip(quantized_data, -128, 127).astype(np.int8)
-        
-        return Tensor(quantized_data), scale, zero_point
-    
-    @staticmethod
-    def dequantize_tensor(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
-        """Dequantize INT8 tensor back to FP32."""
-        dequantized_data = (q_tensor.data.astype(np.float32) - zero_point) * scale
-        return Tensor(dequantized_data)
-    
-    @staticmethod
-    def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, any]:
-        """
-        Quantize all Linear layers in a model.
-        
-        Returns dictionary with quantization info and memory savings.
-        """
-        quantized_layers = {}
-        original_size = 0
-        quantized_size = 0
-        
-        # Iterate through model parameters
-        if hasattr(model, 'parameters'):
-            for i, param in enumerate(model.parameters()):
-                param_size = param.data.nbytes
-                original_size += param_size
-                
-                # Quantize parameter
-                q_param, scale, zp = QuantizationComplete.quantize_tensor(param)
-                quantized_size += q_param.data.nbytes
-                
-                quantized_layers[f'param_{i}'] = {
-                    'quantized': q_param,
-                    'scale': scale,
-                    'zero_point': zp,
-                    'original_shape': param.data.shape
-                }
-        
-        return {
-            'quantized_layers': quantized_layers,
-            'original_size_mb': original_size / (1024 * 1024),
-            'quantized_size_mb': quantized_size / (1024 * 1024),
-            'compression_ratio': original_size / quantized_size if quantized_size > 0 else 1.0
-        }
-    
-    @staticmethod
-    def compare_models(original_model, quantized_info: Dict) -> Dict[str, float]:
-        """Compare memory usage between original and quantized models."""
-        return {
-            'original_mb': quantized_info['original_size_mb'],
-            'quantized_mb': quantized_info['quantized_size_mb'],
-            'compression_ratio': quantized_info['compression_ratio'],
-            'memory_saved_mb': quantized_info['original_size_mb'] - quantized_info['quantized_size_mb']
-        }
-
-# Convenience functions for backward compatibility
-def quantize_int8(tensor: Tensor) -> Tuple[Tensor, float, int]:
-    """Quantize FP32 tensor to INT8."""
-    return QuantizationComplete.quantize_tensor(tensor)
-
-def dequantize_int8(q_tensor: Tensor, scale: float, zero_point: int) -> Tensor:
-    """Dequantize INT8 tensor back to FP32."""
-    return QuantizationComplete.dequantize_tensor(q_tensor, scale, zero_point)
-
-def quantize_model(model, calibration_data: Optional[List[Tensor]] = None) -> Dict[str, any]:
-    """Quantize entire model to INT8."""
-    return QuantizationComplete.quantize_model(model, calibration_data)
diff --git a/tinytorch/text/embeddings.py b/tinytorch/text/embeddings.py
index 07981e95..dacb0f27 100644
--- a/tinytorch/text/embeddings.py
+++ b/tinytorch/text/embeddings.py
@@ -93,18 +93,22 @@ class Embedding:
 
         # Perform embedding lookup using advanced indexing
         # This is equivalent to one-hot multiplication but much more efficient
-        embedded_data = self.weight.data[indices.data.astype(int)]
-        
-        # Create output tensor with gradient tracking
-        from tinytorch.core.autograd import EmbeddingBackward
-        result = Tensor(embedded_data, requires_grad=self.weight.requires_grad)
+        embedded = self.weight.data[indices.data.astype(int)]
+
+        # Create result tensor
+        result = Tensor(embedded, requires_grad=self.weight.requires_grad)
         
+        # Attach gradient function (students learned this in Module 05!)
         if self.weight.requires_grad:
-            result._grad_fn = EmbeddingBackward()
-            result._grad_fn.saved_tensors = (self.weight, indices)
-        
+            from tinytorch.core.autograd import EmbeddingBackward
+            result._grad_fn = EmbeddingBackward(self.weight, indices)
+
         return result
 
+    def __call__(self, indices: Tensor) -> Tensor:
+        """Allows the embedding to be called like a function."""
+        return self.forward(indices)
+
     def parameters(self) -> List[Tensor]:
         """Return trainable parameters."""
         return [self.weight]
@@ -188,16 +192,23 @@ class PositionalEncoding:
                 f"Embedding dimension mismatch: expected {self.embed_dim}, got {embed_dim}"
             )
 
-        # Get position embeddings for this sequence length
-        pos_embeddings = self.position_embeddings.data[:seq_len]  # (seq_len, embed_dim)
+        # Get position embeddings for this sequence length (slice using .data for efficiency)
+        pos_embeddings_data = self.position_embeddings.data[:seq_len]  # (seq_len, embed_dim)
 
         # Broadcast to match batch dimension: (1, seq_len, embed_dim)
-        pos_embeddings = pos_embeddings[np.newaxis, :, :]
+        pos_embeddings_data = pos_embeddings_data[np.newaxis, :, :]
+        
+        # Wrap in Tensor to preserve requires_grad
+        pos_embeddings = Tensor(pos_embeddings_data, requires_grad=self.position_embeddings.requires_grad)
 
-        # Add positional information to input embeddings
-        result = x.data + pos_embeddings
+        # Add positional information using Tensor operation to preserve gradients!
+        result = x + pos_embeddings
 
-        return Tensor(result)
+        return result
+
+    def __call__(self, x: Tensor) -> Tensor:
+        """Allows the positional encoding to be called like a function."""
+        return self.forward(x)
 
     def parameters(self) -> List[Tensor]:
         """Return trainable parameters."""
@@ -325,6 +336,10 @@ class EmbeddingLayer:
 
         return output
 
+    def __call__(self, tokens: Tensor) -> Tensor:
+        """Allows the embedding layer to be called like a function."""
+        return self.forward(tokens)
+
     def parameters(self) -> List[Tensor]:
         """Return all trainable parameters."""
         params = self.token_embedding.parameters()