diff --git a/modules/01_tensor/02_tensor.yml b/modules/01_tensor/02_tensor.yml
deleted file mode 100644
index 3440029c..00000000
--- a/modules/01_tensor/02_tensor.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-# TinyTorch Module Metadata
-# Essential system information for CLI tools and build systems
-
-name: "tensor"
-title: "Tensor"
-description: "Core tensor data structure and operations"
-
-# Dependencies - Used by CLI for module ordering and prerequisites
-dependencies:
-  prerequisites: ["setup"] 
-
-# Package Export - What gets built into tinytorch package
-exports_to: "tinytorch.core.tensor"
-
-# File Structure - What files exist in this module
-files:
-  dev_file: "tensor_dev.py"
-  readme: "README.md"
-  tests: "inline"
-
-# Educational Metadata
-difficulty: "⭐⭐"
-time_estimate: "4-6 hours"
-
-# Components - What's implemented in this module
-components:
-  - "Tensor"
-  - "tensor_creation"
-  - "tensor_operations"
-  - "tensor_arithmetic" 
\ No newline at end of file
diff --git a/modules/02_activations/03_activations.yml b/modules/02_activations/03_activations.yml
deleted file mode 100644
index 430ca924..00000000
--- a/modules/02_activations/03_activations.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-# TinyTorch Module Metadata
-# Essential system information for CLI tools and build systems
-
-name: "activations"
-title: "Activation Functions"
-description: "Neural network activation functions (ReLU, Sigmoid, Tanh, Softmax)"
-
-# Dependencies - Used by CLI for module ordering and prerequisites
-dependencies:
-  prerequisites: ["tensor"] 
-
-# Package Export - What gets built into tinytorch package
-exports_to: "tinytorch.core.activations"
-
-# File Structure - What files exist in this module
-files:
-  dev_file: "activations_dev.py"
-  readme: "README.md"
-  tests: "inline"
-
-# Educational Metadata
-difficulty: "⭐⭐"
-time_estimate: "3-4 hours"
-
-# Components - What's implemented in this module
-components:
-  - "ReLU"
-  - "Sigmoid"
-  - "Tanh"
-  - "Softmax" 
\ No newline at end of file
diff --git a/modules/03_layers/04_layers.yml b/modules/03_layers/04_layers.yml
deleted file mode 100644
index f02a6e41..00000000
--- a/modules/03_layers/04_layers.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-# TinyTorch Module Metadata
-# Essential system information for CLI tools and build systems
-
-name: "layers"
-title: "Layers"
-description: "Neural network layers (Linear, activation layers)"
-
-# Dependencies - Used by CLI for module ordering and prerequisites
-dependencies:
-  prerequisites: ["setup", "tensor", "activations"]
-
-# Package Export - What gets built into tinytorch package
-exports_to: "tinytorch.core.layers"
-
-# File Structure - What files exist in this module
-files:
-  dev_file: "layers_dev.py"
-  readme: "README.md"
-  tests: "inline"
-
-# Educational Metadata
-difficulty: "⭐⭐"
-time_estimate: "4-5 hours"
-
-# Components - What's implemented in this module
-components:
-  - "Dense"
-  - "Linear"
-  - "matmul" 
\ No newline at end of file
diff --git a/modules/04_losses/05_losses.yml b/modules/04_losses/05_losses.yml
deleted file mode 100644
index b3c733b2..00000000
--- a/modules/04_losses/05_losses.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-name: "Loss Functions"
-number: 5
-description: "Essential loss functions for neural network training objectives"
-learning_objectives:
-  - "Implement MSE, CrossEntropy, and BinaryCrossEntropy loss functions"
-  - "Understand numerical stability in loss computation"
-  - "Match loss functions to problem types (regression vs classification)"
-  - "Build production-ready loss functions with batch processing"
-prerequisites:
-  - "02_tensor"
-difficulty: "⭐⭐⭐"
-time_estimate: "2-3 hours"
-exports:
-  - "MeanSquaredError"
-  - "CrossEntropyLoss" 
-  - "BinaryCrossEntropyLoss"
-key_concepts:
-  - "Training objectives and optimization"
-  - "Numerical stability in loss computation"
-  - "Regression vs classification loss functions"
-  - "Batch processing for scalable training"
\ No newline at end of file
diff --git a/modules/05_autograd/06_autograd.yml b/modules/05_autograd/06_autograd.yml
deleted file mode 100644
index b5a5424f..00000000
--- a/modules/05_autograd/06_autograd.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-# TinyTorch Module Metadata
-# Essential system information for CLI tools and build systems
-
-name: "autograd"
-title: "Autograd"
-description: "Automatic differentiation engine for gradient computation"
-
-# Dependencies - Used by CLI for module ordering and prerequisites
-dependencies:
-  prerequisites: ["setup", "tensor", "activations"]
-
-# Package Export - What gets built into tinytorch package
-exports_to: "tinytorch.core.autograd"
-
-# File Structure - What files exist in this module
-files:
-  dev_file: "autograd_dev.py"
-  test_file: "tests/test_autograd.py"
-  readme: "README.md"
-
-# Educational Metadata
-difficulty: "⭐⭐⭐⭐"
-time_estimate: "8-10 hours"
-
-# Components - What's implemented in this module
-components:
-  - "Variable"
-  - "backward"
-  - "gradient_computation" 
\ No newline at end of file
diff --git a/modules/06_optimizers/07_optimizers.yml b/modules/06_optimizers/07_optimizers.yml
deleted file mode 100644
index 243baeb4..00000000
--- a/modules/06_optimizers/07_optimizers.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-# TinyTorch Module Metadata
-# Essential system information for CLI tools and build systems
-
-name: "optimizers"
-title: "Optimizers"
-description: "Gradient-based parameter optimization algorithms"
-
-# Dependencies - Used by CLI for module ordering and prerequisites
-dependencies:
-  prerequisites: ["setup", "tensor", "autograd"]
-
-# Package Export - What gets built into tinytorch package
-exports_to: "tinytorch.core.optimizers"
-
-# File Structure - What files exist in this module
-files:
-  dev_file: "optimizers_dev.py"
-  readme: "README.md"
-  tests: "inline"
-
-# Educational Metadata
-difficulty: "⭐⭐⭐⭐"
-time_estimate: "6-8 hours"
-
-# Components - What's implemented in this module
-components:
-  - "SGD"
-  - "Adam"
-  - "StepLR"
-  - "gradient_descent_step" 
\ No newline at end of file
diff --git a/modules/07_training/08_training.yml b/modules/07_training/08_training.yml
deleted file mode 100644
index 09fb7db2..00000000
--- a/modules/07_training/08_training.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-# TinyTorch Module Metadata
-# Essential system information for CLI tools and build systems
-
-name: "training"
-title: "Training"
-description: "Neural network training loops, loss functions, and metrics"
-
-# Dependencies - Used by CLI for module ordering and prerequisites
-dependencies:
-  prerequisites: ["setup", "tensor", "activations", "layers", "networks", "dataloader", "autograd", "optimizers"]
-
-# Package Export - What gets built into tinytorch package
-exports_to: "tinytorch.core.training"
-
-# File Structure - What files exist in this module
-files:
-  dev_file: "training_dev.py"
-  readme: "README.md"
-  tests: "inline"
-
-# Educational Metadata
-difficulty: "⭐⭐⭐⭐"
-time_estimate: "8-10 hours"
-
-# Components - What's implemented in this module
-components:
-  - "MeanSquaredError"
-  - "CrossEntropyLoss"
-  - "BinaryCrossEntropyLoss"
-  - "Accuracy"
-  - "Trainer" 
\ No newline at end of file
diff --git a/modules/08_spatial/09_spatial.yml b/modules/08_spatial/09_spatial.yml
deleted file mode 100644
index 71126497..00000000
--- a/modules/08_spatial/09_spatial.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-# TinyTorch Module Metadata
-# Essential system information for CLI tools and build systems
-
-name: "spatial"
-title: "Spatial Networks"
-description: "Convolutional networks for spatial pattern recognition and image processing"
-
-# Dependencies - Used by CLI for module ordering and prerequisites
-dependencies:
-  prerequisites: ["setup", "tensor", "activations", "layers", "dense"]
-
-# Package Export - What gets built into tinytorch package
-exports_to: "tinytorch.core.spatial"
-
-# File Structure - What files exist in this module
-files:
-  dev_file: "spatial_dev.py"
-  readme: "README.md"
-  tests: "inline"
-
-# Educational Metadata
-difficulty: "⭐⭐⭐"
-time_estimate: "6-8 hours"
-
-# Components - What's implemented in this module
-components:
-  - "conv2d_naive"
-  - "Conv2D"
-  - "flatten" 
\ No newline at end of file
diff --git a/modules/09_dataloader/10_dataloader.yml b/modules/09_dataloader/10_dataloader.yml
deleted file mode 100644
index 620f2786..00000000
--- a/modules/09_dataloader/10_dataloader.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-# TinyTorch Module Metadata
-# Essential system information for CLI tools and build systems
-
-name: "dataloader"
-title: "DataLoader"
-description: "Dataset interfaces and data loading pipelines"
-
-# Dependencies - Used by CLI for module ordering and prerequisites
-dependencies:
-  prerequisites: ["setup", "tensor"]
-
-# Package Export - What gets built into tinytorch package
-exports_to: "tinytorch.core.dataloader"
-
-# File Structure - What files exist in this module
-files:
-  dev_file: "dataloader_dev.py"
-  readme: "README.md"
-  tests: "inline"
-
-# Educational Metadata
-difficulty: "⭐⭐⭐"
-time_estimate: "5-6 hours"
-
-# Components - What's implemented in this module
-components:
-  - "Dataset"
-  - "DataLoader"
-  - "SimpleDataset" 
\ No newline at end of file
diff --git a/modules/10_tokenization/11_tokenization.yml b/modules/10_tokenization/11_tokenization.yml
deleted file mode 100644
index 0b8abfd9..00000000
--- a/modules/10_tokenization/11_tokenization.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-name: "Tokenization"
-number: 11
-description: "Text processing systems that convert raw text into numerical sequences for language models"
-learning_objectives:
-  - "Implement character-level tokenization with special token handling"
-  - "Build BPE (Byte Pair Encoding) tokenizer for subword units"
-  - "Understand tokenization trade-offs: vocabulary size vs sequence length"
-  - "Optimize tokenization performance for production systems"
-  - "Analyze how tokenization affects model memory and training efficiency"
-
-prerequisites:
-  - "02_tensor"
-
-exports:
-  - "CharTokenizer"
-  - "BPETokenizer" 
-  - "TokenizationProfiler"
-  - "OptimizedTokenizer"
-
-systems_concepts:
-  - "Memory efficiency of token representations"
-  - "Vocabulary size vs model size tradeoffs"
-  - "Tokenization throughput optimization" 
-  - "String processing performance"
-  - "Cache-friendly text processing patterns"
-
-ml_systems_focus: "Text processing pipelines, tokenization throughput, memory-efficient vocabulary management"
-
-estimated_time: "4-5 hours"
-
-next_modules:
-  - "12_embeddings"
\ No newline at end of file
diff --git a/modules/11_embeddings/12_embeddings.yml b/modules/11_embeddings/12_embeddings.yml
deleted file mode 100644
index 8c1a50ad..00000000
--- a/modules/11_embeddings/12_embeddings.yml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: "Embeddings"
-number: 12
-description: "Dense vector representations that convert discrete tokens into continuous semantic spaces"
-learning_objectives:
-  - "Implement embedding layers with efficient lookup operations"
-  - "Build sinusoidal and learned positional encoding systems"
-  - "Understand embedding memory scaling and optimization techniques"
-  - "Analyze how embedding choices affect model capacity and performance"
-  - "Design embedding systems for production language model deployment"
-
-prerequisites:
-  - "02_tensor"
-  - "11_tokenization"
-
-exports:
-  - "Embedding"
-  - "PositionalEncoding"
-  - "LearnedPositionalEmbedding"
-  - "EmbeddingProfiler"
-
-systems_concepts:
-  - "Embedding table memory scaling O(vocab_size × embed_dim)"
-  - "Memory-bandwidth bound lookup operations"
-  - "Cache-friendly embedding access patterns"
-  - "Position encoding trade-offs and extrapolation"
-  - "Distributed embedding table management"
-
-ml_systems_focus: "Memory-efficient embedding lookup, position encoding scalability, large-scale parameter management"
-
-estimated_time: "4-5 hours"
-
-next_modules:
-  - "13_attention"
\ No newline at end of file
diff --git a/modules/12_attention/13_attention.yml b/modules/12_attention/13_attention.yml
deleted file mode 100644
index e74bc605..00000000
--- a/modules/12_attention/13_attention.yml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: "Attention"
-number: 13
-description: "Scaled dot-product and multi-head attention mechanisms that enable transformer architectures"
-learning_objectives:
-  - "Implement scaled dot-product attention with proper masking and numerical stability"
-  - "Build multi-head attention with parallel head processing and output projection"
-  - "Design KV-cache systems for efficient autoregressive generation"
-  - "Understand attention's O(N²) scaling and memory optimization techniques"
-  - "Analyze attention performance bottlenecks and production optimization strategies"
-
-prerequisites:
-  - "02_tensor"
-  - "12_embeddings"
-
-exports:
-  - "ScaledDotProductAttention"
-  - "MultiHeadAttention"
-  - "KVCache"
-  - "AttentionProfiler"
-
-systems_concepts:
-  - "Quadratic memory scaling O(N²) with sequence length"
-  - "Memory-bandwidth bound attention computation"
-  - "KV-cache optimization for autoregressive generation"
-  - "Multi-head parallelization and hardware optimization"
-  - "Attention masking patterns and causal dependencies"
-
-ml_systems_focus: "Attention memory scaling, generation efficiency optimization, sequence length limitations"
-
-estimated_time: "5-6 hours"
-
-next_modules:
-  - "14_transformers"
\ No newline at end of file
diff --git a/modules/13_transformers/14_transformers.yml b/modules/13_transformers/14_transformers.yml
deleted file mode 100644
index c4b6631d..00000000
--- a/modules/13_transformers/14_transformers.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-name: "Transformers"
-number: 14
-description: "Complete transformer architecture with LayerNorm, transformer blocks, and language model implementation"
-learning_objectives:
-  - "Implement LayerNorm for stable deep network training"
-  - "Build position-wise feed-forward networks for transformer blocks"
-  - "Create complete transformer blocks with attention, normalization, and residual connections"
-  - "Develop full transformer models with embeddings, multiple layers, and generation capability"
-  - "Understand transformer scaling characteristics and production deployment considerations"
-
-prerequisites:
-  - "02_tensor"
-  - "12_embeddings"
-  - "13_attention"
-
-exports:
-  - "LayerNorm"
-  - "PositionwiseFeedForward"
-  - "TransformerBlock"
-  - "Transformer"
-  - "TransformerProfiler"
-
-systems_concepts:
-  - "Linear memory scaling with transformer depth"
-  - "Layer normalization vs batch normalization trade-offs"
-  - "Residual connection gradient flow optimization"
-  - "Parameter allocation across depth, width, and attention heads"
-  - "Training memory vs inference memory requirements"
-
-ml_systems_focus: "Transformer architecture optimization, memory scaling with depth, production deployment strategies"
-
-estimated_time: "6-7 hours"
-
-next_modules:
-  - "Advanced transformer architectures and optimization techniques"
\ No newline at end of file
diff --git a/modules/14_profiling/15_profiling.yml b/modules/14_profiling/15_profiling.yml
deleted file mode 100644
index d9e13a80..00000000
--- a/modules/14_profiling/15_profiling.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: Profiling
-number: 15
-type: systems
-difficulty: advanced
-estimated_hours: 8-10
-
-description: |
-  Build professional profiling infrastructure to measure and analyze performance.
-  Students learn to create timing, memory, and operation profilers that reveal
-  bottlenecks and guide optimization decisions. Performance detective work that 
-  makes optimization exciting through data-driven insights.
-
-learning_objectives:
-  - Build accurate timing infrastructure with statistical rigor
-  - Implement memory profiling and allocation tracking
-  - Create FLOP counting for computational analysis
-  - Master profiling methodology for bottleneck identification
-  - Connect profiling insights to ML systems optimization decisions
-
-prerequisites:
-  - Module 14: Transformers (need models to profile)
-
-skills_developed:
-  - Performance measurement
-  - Bottleneck identification
-  - Profiling tool development
-  - Statistical analysis
-
-exports:
-  - tinytorch.profiling
\ No newline at end of file
diff --git a/modules/15_acceleration/16_acceleration.yml b/modules/15_acceleration/16_acceleration.yml
deleted file mode 100644
index f43ca066..00000000
--- a/modules/15_acceleration/16_acceleration.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-name: "acceleration"
-title: "Hardware Acceleration - The Simplest Optimization"
-description: "Master the easiest optimization: using better backends! Learn why naive loops are slow, how cache-friendly blocking helps, and why NumPy provides 100x+ speedups."
-learning_objectives:
-  - "Understand CPU cache hierarchy and memory access performance bottlenecks"
-  - "Implement cache-friendly blocked matrix multiplication algorithms"  
-  - "Build vectorized operations with optimized memory access patterns"
-  - "Design transparent backend systems for automatic optimization selection"
-  - "Measure and quantify real performance improvements scientifically"
-  - "Apply systems thinking to optimization decisions in ML workflows"
-prerequisites:
-  - "Module 2: Tensor operations and NumPy fundamentals"
-  - "Module 4: Linear layers and matrix multiplication"
-  - "Understanding of basic algorithmic complexity (O notation)"
-estimated_time: "3-4 hours"
-difficulty: "Advanced"
-tags:
-  - "performance"
-  - "optimization" 
-  - "systems"
-  - "hardware"
-  - "acceleration"
-  - "cache"
-  - "vectorization"
-  - "backends"
-exports:
-  - "matmul_naive"
-  - "matmul_blocked"
-  - "matmul_numpy"
-  - "OptimizedBackend"
-  - "matmul"
-  - "set_backend"
-assessment:
-  - "Understand why naive loops have poor cache performance"
-  - "Implement cache-friendly blocked matrix multiplication showing 10-50x speedups"
-  - "Recognize why NumPy provides 100x+ speedups over custom implementations"
-  - "Build backend system that automatically chooses optimal implementations"
-  - "Apply the 'free speedup' principle: use better tools, don't write faster code"
\ No newline at end of file
diff --git a/modules/16_quantization/17_quantization.yml b/modules/16_quantization/17_quantization.yml
deleted file mode 100644
index f26b691e..00000000
--- a/modules/16_quantization/17_quantization.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: Quantization
-number: 17
-type: optimization
-difficulty: advanced
-estimated_hours: 6-8
-
-description: |
-  Precision optimization through INT8 quantization. Students learn to reduce model size
-  and accelerate inference by using lower precision arithmetic while maintaining accuracy.
-  Especially powerful for CNN convolutions and edge deployment.
-
-learning_objectives:
-  - Understand precision vs performance trade-offs
-  - Implement INT8 quantization for neural networks  
-  - Build calibration-based quantization systems
-  - Optimize CNN inference for mobile deployment
-
-prerequisites:
-  - Module 09: Spatial (CNNs)
-  - Module 16: Acceleration
-
-skills_developed:
-  - Quantization techniques and mathematics
-  - Post-training optimization strategies
-  - Hardware-aware optimization
-  - Mobile and edge deployment patterns
-
-exports:
-  - tinytorch.quantization
\ No newline at end of file
diff --git a/modules/17_compression/18_compression.yml b/modules/17_compression/18_compression.yml
deleted file mode 100644
index ec8a5417..00000000
--- a/modules/17_compression/18_compression.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: Compression
-number: 17
-type: optimization
-difficulty: advanced
-estimated_hours: 8-10
-
-description: |
-  Model compression through pruning and sparsity. Students learn to identify and remove
-  redundant parameters, achieving 70-80% sparsity while maintaining accuracy. Essential
-  for edge deployment and mobile devices.
-
-learning_objectives:
-  - Understand sparsity and redundancy in neural networks
-  - Implement magnitude-based pruning
-  - Build structured and unstructured pruning
-  - Measure accuracy vs model size tradeoffs
-
-prerequisites:
-  - Module 15: Acceleration
-  - Module 16: Quantization
-
-skills_developed:
-  - Pruning techniques
-  - Sparsity management
-  - Model compression
-  - Edge deployment optimization
-
-exports:
-  - tinytorch.optimizations.compression
\ No newline at end of file
diff --git a/modules/18_caching/19_caching.yml b/modules/18_caching/19_caching.yml
deleted file mode 100644
index b6a2eda7..00000000
--- a/modules/18_caching/19_caching.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: Caching
-number: 18
-type: optimization
-difficulty: advanced
-estimated_hours: 8-10
-
-description: |
-  Memory optimization through KV caching for transformer inference. Students learn to
-  transform O(N²) attention complexity into O(N) for autoregressive generation, achieving
-  dramatic speedups in transformer inference.
-
-learning_objectives:
-  - Understand attention memory complexity
-  - Implement KV caching for transformers
-  - Build incremental computation patterns
-  - Optimize autoregressive generation
-
-prerequisites:
-  - Module 14: Transformers
-  - Module 17: Compression
-
-skills_developed:
-  - KV caching implementation
-  - Memory-computation tradeoffs
-  - Incremental computation
-  - Production inference patterns
-
-exports:
-  - tinytorch.optimizations.caching
\ No newline at end of file
diff --git a/modules/20_capstone/20_capstone.yml b/modules/20_capstone/20_capstone.yml
deleted file mode 100644
index c7bade4d..00000000
--- a/modules/20_capstone/20_capstone.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-# TinyTorch Module Metadata
-# Essential system information for CLI tools and build systems
-
-# === CORE IDENTITY ===
-name: "capstone"
-number: 20
-folder_name: "20_capstone"
-
-# === DISPLAY ===
-display:
-  title: "Torch Olympics"
-  subtitle: "MLPerf-Inspired Challenges"
-  emoji: "🏆"
-
-# === DEPENDENCIES ===
-dependencies:
-  prerequisites: ["setup", "tensor", "activations", "layers", "losses", "autograd", "optimizers", "training", "spatial", "dataloader", "tokenization", "embeddings", "attention", "transformers", "profiling", "acceleration", "quantization", "compression", "caching"]
-
-# === BUILD SYSTEM ===
-build:
-  exports_to: "tinytorch.benchmarking"
-  main_file: "capstone_dev.py"
-
-# === EDUCATION ===
-education:
-  stage: "optimization"
-  difficulty: "⭐⭐⭐⭐⭐"
-  time_estimate: "6-8 hours"
-  description: "TinyMLPerf Olympics - the culmination of your TinyTorch journey! Build a comprehensive benchmarking suite using your profiler from Module 19, then compete on speed, memory, and efficiency. Benchmark the models you built throughout the course to see the impact of all your optimizations."
-
-# === CHECKPOINT ===
-checkpoint:
-  unlocks: 15
-  capability: "Can I build unified ML frameworks across modalities?"
-
-# === COMPONENTS ===
-components:
-  - "TinyMLPerf"
-  - "BenchmarkSuite"
-  - "PerformanceReporter"
-  - "CompetitionFramework"
\ No newline at end of file