This commit implements the pedagogically optimal "inevitable discovery" module progression based on expert validation and educational design principles.

## Module Reordering Summary

**Previous Order (Problems)**:
- 05_losses → 06_autograd → 07_dataloader → 08_optimizers → 09_spatial → 10_training
- Issues: Autograd before optimizers, DataLoader before training, scattered dependencies

**New Order (Beautiful Progression)**:
- 05_losses → 06_optimizers → 07_autograd → 08_training → 09_spatial → 10_dataloader
- Benefits: Each module creates an inevitable need for the next

## Pedagogical Flow Achieved

- **05_losses** → "Need systematic weight updates" → **06_optimizers**
- **06_optimizers** → "Need automatic gradients" → **07_autograd**
- **07_autograd** → "Need systematic training" → **08_training**
- **08_training** → "MLPs hit limits on images" → **09_spatial**
- **09_spatial** → "Training is too slow" → **10_dataloader**

## Technical Changes

### Module Directory Renaming
- `06_autograd` → `07_autograd`
- `07_dataloader` → `10_dataloader`
- `08_optimizers` → `06_optimizers`
- `10_training` → `08_training`
- `09_spatial` → `09_spatial` (no change)

### System Integration Updates
- **MODULE_TO_CHECKPOINT mapping**: Updated in tito/commands/export.py (see the sketch below)
- **Test directories**: Renamed module_XX directories to match the new numbers
- **Documentation**: Updated all references in MD files and agent configurations
- **CLI integration**: Updated next-steps suggestions for the proper flow

### Agent Configuration Updates
- **Quality Assurance**: Updated module audit status with the new numbers
- **Module Developer**: Updated work tracking with the new sequence
- **Documentation**: Updated MASTER_PLAN_OF_RECORD.md with the beautiful progression

## Educational Benefits

1. **Inevitable Discovery**: Each module naturally leads to the next
2. **Cognitive Load**: Concepts are introduced exactly when needed
3. **Motivation**: Students understand WHY each tool is necessary
4. **Synthesis**: Everything flows toward complete ML systems understanding
5. **Professional Alignment**: Matches real ML engineering workflows

## Quality Assurance

- ✅ All CLI commands still function
- ✅ Checkpoint system mappings updated
- ✅ Documentation consistency maintained
- ✅ Test directory structure aligned
- ✅ Agent configurations synchronized

**Impact**: This reordering transforms TinyTorch from a collection of modules into a coherent educational journey where each step naturally motivates the next, creating optimal conditions for a deep understanding of ML systems.
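For illustration, the MODULE_TO_CHECKPOINT change amounts to renumbering a module-name-to-checkpoint dictionary. The sketch below is hypothetical: the actual keys and values in tito/commands/export.py are not shown here, so the entries only mirror the new ordering.

```python
# Hypothetical sketch of the updated mapping in tito/commands/export.py;
# the real keys and checkpoint values may differ.
MODULE_TO_CHECKPOINT = {
    "05_losses": 5,
    "06_optimizers": 6,   # was 08_optimizers
    "07_autograd": 7,     # was 06_autograd
    "08_training": 8,     # was 10_training
    "09_spatial": 9,      # unchanged
    "10_dataloader": 10,  # was 07_dataloader
}
```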
#!/usr/bin/env python3
"""
Backend Integration Example: Drop-in Performance Optimization

This demonstrates how the backend system integrates with existing TinyTorch
code to provide dramatic performance improvements without changing APIs.
"""

import numpy as np
import sys
import os

# Add the kernels module to the import path
sys.path.append('/Users/VJ/GitHub/TinyTorch/modules/13_kernels')
from kernels_dev import set_backend, benchmark, run_performance_comparison

# Import existing TinyTorch components
sys.path.append('/Users/VJ/GitHub/TinyTorch/modules/02_tensor')
sys.path.append('/Users/VJ/GitHub/TinyTorch/modules/04_layers')

try:
    from tensor_dev import Tensor
    from layers_dev import Dense, Module
except ImportError:
    print("Creating minimal tensor/layer classes for demo...")

    class Tensor:
        """Minimal stand-in: wraps a float32 NumPy array."""
        def __init__(self, data):
            self.data = np.array(data, dtype=np.float32)
            self.shape = self.data.shape

        def __str__(self):
            return f"Tensor(shape={self.shape})"

    class Dense:
        """Minimal fully connected layer: y = x @ W + b."""
        def __init__(self, in_features, out_features):
            self.weight = Tensor(np.random.randn(in_features, out_features) * 0.1)
            self.bias = Tensor(np.zeros(out_features))

        def forward(self, x):
            # This would normally call tinytorch.matmul, but we'll simulate
            result = x.data @ self.weight.data + self.bias.data
            return Tensor(result)

# Now import our optimized functions
from kernels_dev import fast_matmul

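# --- Illustrative sketch (assumption, not the real kernels_dev API) ---------
# run_performance_comparison is imported above but implemented in kernels_dev,
# which is not shown in this file. A minimal version of such a helper could
# look like the hypothetical function below: run the same callable under each
# backend, time it, and return per-backend results exposing a `time_ms` field.
# Only the idea (time identical code under both backends) is what the demos
# below rely on.
import time
from collections import namedtuple

_SketchResult = namedtuple('_SketchResult', ['time_ms'])

def _run_performance_comparison_sketch(name, fn, repeats=3):
    results = {}
    for backend in ('naive', 'optimized'):
        set_backend(backend)          # switch the kernel implementation
        start = time.perf_counter()
        for _ in range(repeats):
            fn()                      # identical user code on both backends
        elapsed_ms = (time.perf_counter() - start) * 1000 / repeats
        print(f"{name} [{backend}]: {elapsed_ms:.1f} ms")
        results[backend] = _SketchResult(time_ms=elapsed_ms)
    return results
# -----------------------------------------------------------------------------
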
def demo_same_code_different_performance():
    """Demonstrate the same code achieving different performance."""
    print("🎯 DEMONSTRATION: Same Code, Different Performance")
    print("=" * 70)

    # Create a simple neural network model
    class SimpleNet:
        def __init__(self):
            self.layer1 = Dense(784, 512)
            self.layer2 = Dense(512, 256)
            self.layer3 = Dense(256, 10)

        def forward(self, x):
            x = self.layer1.forward(x)
            x = self.layer2.forward(x)
            x = self.layer3.forward(x)
            return x

    # Create model and data
    model = SimpleNet()
    batch_data = Tensor(np.random.randn(128, 784))  # Batch of 128 images

    def run_model():
        """Run the same model forward pass."""
        output = model.forward(batch_data)
        return output

    # This is the magic - SAME CODE, different performance!
    results = run_performance_comparison("Neural Network Forward Pass", run_model)

    return results

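# Rough cost of the forward pass above (worked arithmetic, not a measurement):
# 128*784*512 + 128*512*256 + 128*256*10 ≈ 68.5M multiply-adds. Assuming Dense
# routes through the backend-dispatched matmul, those three matmuls dominate
# the timing, which is why the backend choice dominates the comparison.
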
def demo_competition_scenario():
    """Demonstrate a competition scenario."""
    print("\n🏆 COMPETITION SCENARIO: Matrix Multiplication Optimization")
    print("=" * 70)

    # Different student "submissions"
    def student_alice_submission():
        """Alice's optimized implementation."""
        set_backend('optimized')
        a = Tensor(np.random.randn(400, 300))
        b = Tensor(np.random.randn(300, 200))
        return fast_matmul(a, b)

    def student_bob_submission():
        """Bob's submission, still on the naive implementation."""
        set_backend('naive')
        a = Tensor(np.random.randn(400, 300))
        b = Tensor(np.random.randn(300, 200))
        return fast_matmul(a, b)

    # Simulate competition submissions
    from kernels_dev import submit_to_competition, competition

    print("Student submissions:")
    submit_to_competition("Alice", "Matrix Multiplication", student_alice_submission)
    submit_to_competition("Bob", "Matrix Multiplication", student_bob_submission)

    # Show leaderboard
    competition.show_leaderboard("Matrix Multiplication")

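# --- Illustrative sketch (assumption) of what the two backends might do -----
# The actual kernels live in kernels_dev and are not shown here; these
# hypothetical functions only illustrate why 'naive' and 'optimized' can
# differ by orders of magnitude: explicit Python loops versus a single
# BLAS-backed NumPy call.
def _naive_matmul_sketch(a, b):
    """Triple-loop matmul on 2D NumPy arrays: educational, but very slow."""
    n, k = a.shape
    _, m = b.shape
    out = np.zeros((n, m), dtype=np.float32)
    for i in range(n):
        for j in range(m):
            for p in range(k):
                out[i, j] += a[i, p] * b[p, j]
    return out

def _optimized_matmul_sketch(a, b):
    """Delegate to NumPy's @, which dispatches to optimized BLAS routines."""
    return a @ b
# -----------------------------------------------------------------------------
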
def demo_real_world_scenario():
    """Demonstrate a real-world ML training scenario."""
    print("\n🌍 REAL-WORLD SCENARIO: Training Speed Comparison")
    print("=" * 70)

    # Simulate training step computation
    def training_step():
        """Simulate one training step with multiple operations."""
        # Forward pass operations
        batch_size, seq_len, hidden_dim = 32, 128, 512

        # Attention computation (the expensive part)
        queries = Tensor(np.random.randn(batch_size, seq_len, hidden_dim))
        keys = Tensor(np.random.randn(batch_size, seq_len, hidden_dim))
        values = Tensor(np.random.randn(batch_size, seq_len, hidden_dim))

        # Attention weights: Q @ K^T. K's last two axes must be transposed so
        # the shapes line up: (B, S, D) @ (B, D, S) -> (B, S, S).
        keys_t = Tensor(keys.data.transpose(0, 2, 1))
        attention_weights = fast_matmul(queries, keys_t)  # This gets optimized!

        # Attention output: weights @ V -> (B, S, S) @ (B, S, D) -> (B, S, D)
        attention_output = fast_matmul(attention_weights, values)  # This too!

        # Feed-forward layers
        ff1 = Dense(hidden_dim, hidden_dim * 4)
        ff2 = Dense(hidden_dim * 4, hidden_dim)

        ff_output = ff1.forward(attention_output)
        final_output = ff2.forward(ff_output)

        return final_output

    # Compare training speeds
    results = run_performance_comparison("Transformer Training Step", training_step)

    # Calculate training time implications
    naive_time = results['naive'].time_ms
    opt_time = results['optimized'].time_ms

    print(f"\n📊 Training Time Analysis:")
    print(f"Time per step: Naive={naive_time:.1f}ms, Optimized={opt_time:.1f}ms")

    steps_per_epoch = 1000
    naive_epoch_time = (naive_time * steps_per_epoch) / 1000 / 60  # ms -> minutes
    opt_epoch_time = (opt_time * steps_per_epoch) / 1000 / 60  # ms -> minutes

    print(f"Time per epoch: Naive={naive_epoch_time:.1f}min, Optimized={opt_epoch_time:.1f}min")
    print(f"Training 100 epochs: Naive={naive_epoch_time*100/60:.1f}hrs, Optimized={opt_epoch_time*100/60:.1f}hrs")

    time_saved = (naive_epoch_time - opt_epoch_time) * 100 / 60  # hours saved over 100 epochs
    print(f"⚡ Time saved: {time_saved:.1f} hours over 100 epochs!")

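# Worked example of the epoch arithmetic above (illustrative numbers, not
# measurements): at 50 ms/step and 1000 steps/epoch, one epoch takes 50 s
# ≈ 0.83 min, so 100 epochs take ≈ 1.4 h; at 5 ms/step the same run takes
# ≈ 0.14 h, saving roughly 1.25 hours.
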
if __name__ == "__main__":
    print("🚀 TinyTorch Backend Integration Demo")
    print("Demonstrating competition-ready optimization without API changes")
    print("=" * 80)

    # Run all demonstrations
    demo_same_code_different_performance()
    demo_competition_scenario()
    demo_real_world_scenario()

    print("\n" + "=" * 80)
    print("🎯 KEY INSIGHTS:")
    print("• Same APIs, dramatically different performance")
    print("• Backend switching enables both learning AND competition")
    print("• Real ML training can be 10-100x faster with proper optimization")
    print("• Students see immediate impact of systems engineering")
    print("=" * 80)
print("=" * 80) |