TinyTorch/BACKEND_INTEGRATION_EXAMPLE.py
Vijay Janapa Reddi 753ae52ae0 MAJOR: Implement beautiful module progression through strategic reordering
This commit implements the pedagogically optimal "inevitable discovery" module progression based on expert validation and educational design principles.

## Module Reordering Summary

**Previous Order (Problems)**:
- 05_losses → 06_autograd → 07_dataloader → 08_optimizers → 09_spatial → 10_training
- Issues: Autograd before optimizers, DataLoader before training, scattered dependencies

**New Order (Beautiful Progression)**:
- 05_losses → 06_optimizers → 07_autograd → 08_training → 09_spatial → 10_dataloader
- Benefits: Each module creates inevitable need for the next

## Pedagogical Flow Achieved

- **05_losses** → "Need systematic weight updates" → **06_optimizers**
- **06_optimizers** → "Need automatic gradients" → **07_autograd**
- **07_autograd** → "Need systematic training" → **08_training**
- **08_training** → "MLPs hit limits on images" → **09_spatial**
- **09_spatial** → "Training is too slow" → **10_dataloader**

## Technical Changes

### Module Directory Renaming
- `06_autograd` → `07_autograd`
- `07_dataloader` → `10_dataloader`
- `08_optimizers` → `06_optimizers`
- `10_training` → `08_training`
- `09_spatial` → `09_spatial` (no change)
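
For illustration only (the renames in this commit were applied directly to the repository, not by a script), the mapping above amounts to a small `pathlib` helper; `RENAMES` and `apply_renames` below are hypothetical names:

```python
from pathlib import Path

# Hypothetical helper illustrating the directory renames above; not part of
# this commit. Renames go through temporary names first so that swapped
# numbers (e.g. 06 <-> 08) can never collide on disk.
RENAMES = {
    "06_autograd":   "07_autograd",
    "07_dataloader": "10_dataloader",
    "08_optimizers": "06_optimizers",
    "10_training":   "08_training",
}

def apply_renames(modules_dir: str = "modules") -> None:
    root = Path(modules_dir)
    for old, new in RENAMES.items():
        (root / old).rename(root / f"__tmp_{new}")
    for new in RENAMES.values():
        (root / f"__tmp_{new}").rename(root / new)
```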

### System Integration Updates
- **MODULE_TO_CHECKPOINT mapping**: Updated in tito/commands/export.py (see the sketch after this list)
- **Test directories**: Renamed module_XX directories to match new numbers
- **Documentation**: Updated all references in MD files and agent configurations
- **CLI integration**: Updated next-steps suggestions for proper flow
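
The checkpoint mapping referenced in the first item above is a plain module-to-checkpoint lookup. The actual contents of tito/commands/export.py are not reproduced here, so the checkpoint identifiers below are placeholders; only the module keys reflect this commit's change:

```python
# Hypothetical excerpt of MODULE_TO_CHECKPOINT in tito/commands/export.py
# after the reordering; checkpoint identifiers are placeholders.
MODULE_TO_CHECKPOINT = {
    "05_losses":     "checkpoint_05",
    "06_optimizers": "checkpoint_06",  # was 08_optimizers
    "07_autograd":   "checkpoint_07",  # was 06_autograd
    "08_training":   "checkpoint_08",  # was 10_training
    "09_spatial":    "checkpoint_09",
    "10_dataloader": "checkpoint_10",  # was 07_dataloader
}
```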

### Agent Configuration Updates
- **Quality Assurance**: Updated module audit status with new numbers
- **Module Developer**: Updated work tracking with new sequence
- **Documentation**: Updated MASTER_PLAN_OF_RECORD.md with beautiful progression

## Educational Benefits

1. **Inevitable Discovery**: Each module naturally leads to the next
2. **Cognitive Load**: Concepts introduced exactly when needed
3. **Motivation**: Students understand WHY each tool is necessary
4. **Synthesis**: Everything flows toward complete ML systems understanding
5. **Professional Alignment**: Matches real ML engineering workflows

## Quality Assurance

- All CLI commands still function
- Checkpoint system mappings updated
- Documentation consistency maintained
- Test directory structure aligned
- Agent configurations synchronized

**Impact**: This reordering transforms TinyTorch from a collection of modules into a coherent educational journey where each step naturally motivates the next, creating optimal conditions for deep learning systems understanding.
2025-09-24 15:56:47 -04:00


#!/usr/bin/env python3
"""
Backend Integration Example: Drop-in Performance Optimization
This demonstrates how the backend system integrates with existing TinyTorch
code to provide dramatic performance improvements without changing APIs.
"""
import numpy as np
import sys
import os

# Resolve module paths relative to this file (which lives at the TinyTorch
# repo root) so the demo runs on any machine, not just the original author's.
REPO_ROOT = os.path.dirname(os.path.abspath(__file__))

# Add the kernels module to path
sys.path.append(os.path.join(REPO_ROOT, 'modules', '13_kernels'))
from kernels_dev import set_backend, benchmark, run_performance_comparison

# Import existing TinyTorch components
sys.path.append(os.path.join(REPO_ROOT, 'modules', '02_tensor'))
sys.path.append(os.path.join(REPO_ROOT, 'modules', '04_layers'))
try:
    from tensor_dev import Tensor
    from layers_dev import Dense, Module
except ImportError:
    print("Creating minimal tensor/layer classes for demo...")

    class Tensor:
        def __init__(self, data):
            self.data = np.array(data, dtype=np.float32)
            self.shape = self.data.shape

        def __str__(self):
            return f"Tensor(shape={self.shape})"

    class Dense:
        def __init__(self, in_features, out_features):
            self.weight = Tensor(np.random.randn(in_features, out_features) * 0.1)
            self.bias = Tensor(np.zeros(out_features))

        def forward(self, x):
            # This would normally call tinytorch.matmul, but we'll simulate
            result = x.data @ self.weight.data + self.bias.data
            return Tensor(result)

# Now import our optimized functions
from kernels_dev import fast_matmul
def demo_same_code_different_performance():
    """Demonstrate same code achieving different performance"""
    print("🎯 DEMONSTRATION: Same Code, Different Performance")
    print("=" * 70)

    # Create a simple neural network model
    class SimpleNet:
        def __init__(self):
            self.layer1 = Dense(784, 512)
            self.layer2 = Dense(512, 256)
            self.layer3 = Dense(256, 10)

        def forward(self, x):
            x = self.layer1.forward(x)
            x = self.layer2.forward(x)
            x = self.layer3.forward(x)
            return x

    # Create model and data
    model = SimpleNet()
    batch_data = Tensor(np.random.randn(128, 784))  # Batch of 128 images

    def run_model():
        """Run the same model forward pass"""
        output = model.forward(batch_data)
        return output

    # This is the magic - SAME CODE, different performance!
    results = run_performance_comparison("Neural Network Forward Pass", run_model)
    return results
def demo_competition_scenario():
    """Demonstrate a competition scenario"""
    print("\n🏆 COMPETITION SCENARIO: Matrix Multiplication Optimization")
    print("=" * 70)

    # Different student "submissions"
    def student_alice_submission():
        """Alice's optimized implementation"""
        set_backend('optimized')
        a = Tensor(np.random.randn(400, 300))
        b = Tensor(np.random.randn(300, 200))
        return fast_matmul(a, b)

    def student_bob_submission():
        """Bob still using naive implementation"""
        set_backend('naive')
        a = Tensor(np.random.randn(400, 300))
        b = Tensor(np.random.randn(300, 200))
        return fast_matmul(a, b)

    # Simulate competition submissions
    from kernels_dev import submit_to_competition, competition

    print("Student submissions:")
    submit_to_competition("Alice", "Matrix Multiplication", student_alice_submission)
    submit_to_competition("Bob", "Matrix Multiplication", student_bob_submission)

    # Show leaderboard
    competition.show_leaderboard("Matrix Multiplication")
def demo_real_world_scenario():
    """Demonstrate real-world ML training scenario"""
    print("\n🌍 REAL-WORLD SCENARIO: Training Speed Comparison")
    print("=" * 70)

    # Simulate training step computation
    def training_step():
        """Simulate one training step with multiple operations"""
        # Forward pass operations
        batch_size, seq_len, hidden_dim = 32, 128, 512

        # Attention computation (the expensive part)
        queries = Tensor(np.random.randn(batch_size, seq_len, hidden_dim))
        keys = Tensor(np.random.randn(batch_size, seq_len, hidden_dim))
        values = Tensor(np.random.randn(batch_size, seq_len, hidden_dim))

        # Attention weights: Q @ K^T (K's last two axes are transposed so the
        # batched matmul shapes line up)
        keys_t = Tensor(keys.data.transpose(0, 2, 1))
        attention_weights = fast_matmul(queries, keys_t)  # This gets optimized!

        # Attention output: weights @ V
        attention_output = fast_matmul(attention_weights, values)  # This too!

        # Feed-forward layers
        ff1 = Dense(hidden_dim, hidden_dim * 4)
        ff2 = Dense(hidden_dim * 4, hidden_dim)
        ff_output = ff1.forward(attention_output)
        final_output = ff2.forward(ff_output)
        return final_output

    # Compare training speeds
    results = run_performance_comparison("Transformer Training Step", training_step)

    # Calculate training time implications
    naive_time = results['naive'].time_ms
    opt_time = results['optimized'].time_ms

    print("\n📊 Training Time Analysis:")
    print(f"Time per step: Naive={naive_time:.1f}ms, Optimized={opt_time:.1f}ms")

    steps_per_epoch = 1000
    naive_epoch_time = (naive_time * steps_per_epoch) / 1000 / 60  # minutes
    opt_epoch_time = (opt_time * steps_per_epoch) / 1000 / 60  # minutes

    print(f"Time per epoch: Naive={naive_epoch_time:.1f}min, Optimized={opt_epoch_time:.1f}min")
    print(f"Training 100 epochs: Naive={naive_epoch_time*100/60:.1f}hrs, Optimized={opt_epoch_time*100/60:.1f}hrs")

    time_saved = (naive_epoch_time - opt_epoch_time) * 100 / 60  # hours saved over 100 epochs
    print(f"⚡ Time saved: {time_saved:.1f} hours over 100 epochs!")
if __name__ == "__main__":
    print("🚀 TinyTorch Backend Integration Demo")
    print("Demonstrating competition-ready optimization without API changes")
    print("=" * 80)

    # Run all demonstrations
    demo_same_code_different_performance()
    demo_competition_scenario()
    demo_real_world_scenario()

    print("\n" + "=" * 80)
    print("🎯 KEY INSIGHTS:")
    print("• Same APIs, dramatically different performance")
    print("• Backend switching enables both learning AND competition")
    print("• Real ML training can be 10-100x faster with proper optimization")
    print("• Students see immediate impact of systems engineering")
    print("=" * 80)