mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-04-30 18:19:27 -05:00
Applied consistent naming pattern: 0X_[figure]_[task].py M01 (1957 Perceptron): - forward_pass.py → 01_rosenblatt_forward.py - perceptron_trained.py → 02_rosenblatt_trained.py M02 (1969 XOR): - xor_crisis.py → 01_xor_crisis.py - xor_solved.py → 02_xor_solved.py M03 (1986 MLP): - mlp_digits.py → 01_rumelhart_tinydigits.py - mlp_mnist.py → 02_rumelhart_mnist.py M04 (1998 CNN): - cnn_digits.py → 01_lecun_tinydigits.py - lecun_cifar10.py → 02_lecun_cifar10.py M05 (2017 Transformer): - vaswani_chatgpt.py → 01_vaswani_generation.py - vaswani_copilot.py → 02_vaswani_dialogue.py - profile_kv_cache.py → 03_vaswani_profile.py Benefits: - Clear execution order (01, 02, 03) - Historical context (rosenblatt, lecun, vaswani) - Descriptive purpose (generation, dialogue, profile) - Consistent structure across all milestones Updated documentation: - README.md: Updated all milestone examples - site/chapters/milestones.md: Updated bash commands
432 lines
19 KiB
Python
432 lines
19 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
MNIST MLP (1986) - Backpropagation Revolution
|
||
============================================
|
||
|
||
📚 HISTORICAL CONTEXT:
|
||
In 1986, Rumelhart, Hinton, and Williams popularized backpropagation, finally
|
||
enabling training of deep multi-layer networks. This breakthrough made it possible
|
||
to solve real vision problems like handwritten digit recognition, launching the
|
||
modern deep learning era.
|
||
|
||
🎯 WHAT YOU'RE BUILDING:
|
||
Using YOUR TinyTorch implementations, you'll build a multi-layer perceptron that
|
||
achieves 95%+ accuracy on MNIST digits - proving YOUR system can solve real vision!
|
||
|
||
✅ REQUIRED MODULES (Run after Module 06):
|
||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||
Module 01 (Tensor) : YOUR data structure with autodiff
|
||
Module 02 (Activations) : YOUR ReLU for deep networks
|
||
Module 03 (Layers) : YOUR Linear layers
|
||
Module 04 (Losses) : YOUR CrossEntropyLoss
|
||
Module 05 (Autograd) : YOUR gradient computation
|
||
Module 06 (Optimizers) : YOUR SGD optimizer
|
||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||
|
||
🏗️ ARCHITECTURE (Deep Feedforward Network):
|
||
┌─────────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐
|
||
│ Input Image │ │ Flatten │ │ Linear │ │ Linear │ │ Output │
|
||
│ 28×28 │───▶│ 784 │───▶│ 784→128 │───▶│ 128→64 │───▶│ 64→10 │
|
||
│ Pixels │ │ YOUR M4 │ │ +ReLU │ │ +ReLU │ │ Classes │
|
||
└─────────────┘ └─────────┘ └─────────┘ └─────────┘ └─────────┘
|
||
Hidden Layer 1 Hidden Layer 2 Digit Probs
|
||
|
||
🔍 MNIST DATASET - THE HELLO WORLD OF COMPUTER VISION:
|
||
|
||
MNIST contains 70,000 handwritten digits (60K train, 10K test):
|
||
|
||
Sample Digits: Why MNIST Matters:
|
||
|
||
┌─────┐ ┌─────┐ ┌─────┐ • First "real" vision benchmark
|
||
│ ███ │ │█████│ │█████│ • 28×28 pixels = 784 features
|
||
│█ █│ │ █│ │ █│ • 10 classes (digits 0-9)
|
||
│ █ │ │ ██ │ │ ███ │ • Proves deep learning works
|
||
│ █ │ │ █ │ │ █│ • YOUR MLP will get 95%+ accuracy!
|
||
│ █ │ │█████│ │█████│
|
||
└─────┘ └─────┘ └─────┘
|
||
"1" "2" "3"
|
||
|
||
Network learns to map:
|
||
784 pixels → Hidden features → Digit classification
|
||
|
||
📊 EXPECTED PERFORMANCE:
|
||
- Dataset: 60,000 training images, 10,000 test images
|
||
- Training time: 2-3 minutes (5 epochs with manual training loop)
|
||
- Expected accuracy: 85-90% on test set (realistic with basic training)
|
||
- Parameters: ~100K weights (small by modern standards!)
|
||
- Training stability: Loss consistently decreases with YOUR manual loop
|
||
"""
|
||
|
||
import sys
|
||
import os
|
||
import numpy as np
|
||
import argparse
|
||
import time
|
||
|
||
# Add project root to path for TinyTorch imports
|
||
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||
sys.path.append(project_root)
|
||
|
||
# Import TinyTorch components YOU BUILT!
|
||
from tinytorch.core.tensor import Tensor # Module 02: YOU built this!
|
||
from tinytorch.core.layers import Linear # Module 04: YOU built this!
|
||
from tinytorch.core.activations import ReLU, Softmax # Module 03: YOU built this!
|
||
from tinytorch.core.losses import CrossEntropyLoss # Module 04: YOU built this!
|
||
from tinytorch.core.optimizers import SGD # Module 06: YOU built this!
|
||
|
||
# Import dataset manager
|
||
from data_manager import DatasetManager
|
||
|
||
def flatten(x):
|
||
"""Flatten operation for CNN to MLP transition."""
|
||
batch_size = x.data.shape[0]
|
||
return Tensor(x.data.reshape(batch_size, -1))
|
||
|
||
class MNISTMLP:
|
||
"""
|
||
Multi-Layer Perceptron for MNIST using YOUR TinyTorch!
|
||
|
||
This architecture proved deep learning could solve real vision problems.
|
||
"""
|
||
|
||
def __init__(self, input_size=784, hidden1=128, hidden2=64, num_classes=10):
|
||
print("🧠 Building MNIST MLP with YOUR TinyTorch modules...")
|
||
|
||
# Deep architecture - multiple hidden layers!
|
||
self.fc1 = Linear(input_size, hidden1) # Module 04: YOUR Linear layer!
|
||
self.relu1 = ReLU() # Module 03: YOUR activation!
|
||
self.fc2 = Linear(hidden1, hidden2) # Module 04: YOUR Linear layer!
|
||
self.relu2 = ReLU() # Module 03: YOUR activation!
|
||
self.fc3 = Linear(hidden2, num_classes) # Module 04: YOUR output layer!
|
||
|
||
# Store architecture info
|
||
self.total_params = (
|
||
input_size * hidden1 + hidden1 + # fc1
|
||
hidden1 * hidden2 + hidden2 + # fc2
|
||
hidden2 * num_classes + num_classes # fc3
|
||
)
|
||
|
||
print(f" Architecture: {input_size} → {hidden1} → {hidden2} → {num_classes}")
|
||
print(f" Total parameters: {self.total_params:,} (YOUR Linear layers)")
|
||
print(f" Activation: ReLU (YOUR Module 03)")
|
||
|
||
def forward(self, x):
|
||
"""Forward pass through YOUR deep network."""
|
||
# Flatten image to vector
|
||
batch_size = x.data.shape[0]
|
||
x = Tensor(x.data.reshape(batch_size, -1)) # 28×28 → 784
|
||
|
||
# Deep forward pass using YOUR components
|
||
x = self.fc1(x) # Module 04: YOUR Linear layer!
|
||
x = self.relu1(x) # Module 03: YOUR ReLU activation!
|
||
x = self.fc2(x) # Module 04: YOUR Linear layer!
|
||
x = self.relu2(x) # Module 03: YOUR ReLU activation!
|
||
x = self.fc3(x) # Module 04: YOUR output layer!
|
||
|
||
return x
|
||
|
||
def parameters(self):
|
||
"""Get all trainable parameters from YOUR layers."""
|
||
return [
|
||
self.fc1.weights, self.fc1.bias,
|
||
self.fc2.weights, self.fc2.bias,
|
||
self.fc3.weights, self.fc3.bias
|
||
]
|
||
|
||
def visualize_mnist_digits():
|
||
"""Show ASCII representation of MNIST digits."""
|
||
print("\n" + "="*70)
|
||
print("🔢 VISUALIZING MNIST - Handwritten Digit Recognition:")
|
||
print("="*70)
|
||
|
||
print("""
|
||
Sample Training Data: What YOUR Network Learns:
|
||
|
||
28×28 Pixel Images: Feature Hierarchy:
|
||
┌──────────┐
|
||
│░░░░██░░░░│ → Flatten(784) → Layer 1: Edge detectors
|
||
│░░░███░░░░│ - Vertical lines
|
||
│░░██░█░░░░│ - Horizontal lines
|
||
│░░░░░█░░░░│ - Curves
|
||
│░░░░░█░░░░│
|
||
│░░░░░█░░░░│ Layer 2: Shape components
|
||
│░░░█████░░│ - Loops (0, 6, 8, 9)
|
||
│░░░░░░░░░░│ - Lines (1, 7)
|
||
└──────────┘ - Corners (4, 5)
|
||
Digit "7"
|
||
Output: Class probabilities
|
||
YOUR network learns to: P("0") = 0.01
|
||
1. Extract features from pixels P("1") = 0.02
|
||
2. Combine features hierarchically ...
|
||
3. Classify into 10 digit classes P("7") = 0.91 ← Highest!
|
||
""")
|
||
print("="*70)
|
||
|
||
def train_mnist_mlp(model, train_data, train_labels,
|
||
epochs=5, batch_size=32, learning_rate=0.01):
|
||
"""
|
||
Train MNIST MLP using YOUR manual training loop!
|
||
This demonstrates the core training cycle that YOU control.
|
||
"""
|
||
print("\n🚀 Training MNIST MLP with YOUR TinyTorch system!")
|
||
print(f" Dataset: {len(train_data)} training images")
|
||
print(f" Manual training loop - YOU control the process!")
|
||
print(f" Cross-entropy loss (Module 04)")
|
||
print(f" SGD optimizer (Module 06)")
|
||
|
||
# YOUR optimizer and loss function!
|
||
optimizer = SGD(model.parameters(), lr=learning_rate)
|
||
loss_fn = CrossEntropyLoss()
|
||
|
||
# Training history
|
||
train_losses = []
|
||
|
||
# Training loop
|
||
for epoch in range(epochs):
|
||
epoch_loss = 0
|
||
num_batches = 0
|
||
|
||
# Shuffle data
|
||
indices = np.random.permutation(len(train_data))
|
||
train_data_shuffled = train_data[indices]
|
||
train_labels_shuffled = train_labels[indices]
|
||
|
||
# Mini-batch training
|
||
for i in range(0, len(train_data), batch_size):
|
||
# Get batch
|
||
batch_end = min(i + batch_size, len(train_data))
|
||
batch_X = train_data_shuffled[i:batch_end]
|
||
batch_y = train_labels_shuffled[i:batch_end]
|
||
|
||
# Convert to tensors
|
||
inputs = Tensor(batch_X)
|
||
targets = Tensor(batch_y)
|
||
|
||
# Forward pass with YOUR model
|
||
outputs = model(inputs)
|
||
|
||
# Compute loss with YOUR loss function
|
||
loss = loss_fn.forward(outputs, targets)
|
||
|
||
# Backward pass with YOUR autograd
|
||
loss.backward()
|
||
|
||
# Update weights with YOUR optimizer
|
||
optimizer.step()
|
||
optimizer.zero_grad()
|
||
|
||
# Track loss
|
||
loss_value = loss.data.item() if hasattr(loss.data, 'item') else float(loss.data)
|
||
epoch_loss += loss_value
|
||
num_batches += 1
|
||
|
||
# Epoch summary
|
||
avg_loss = epoch_loss / num_batches
|
||
train_losses.append(avg_loss)
|
||
print(f" Epoch {epoch+1}/{epochs}: Loss = {avg_loss:.4f}")
|
||
|
||
print("\n📈 Training completed!")
|
||
print(" ✅ YOUR manual training loop works!")
|
||
print(" ✅ Cross-entropy loss computed correctly")
|
||
print(" ✅ SGD optimizer updated all parameters")
|
||
print(" ✅ Gradients flowed through YOUR network")
|
||
|
||
# Simple monitor object for compatibility
|
||
class TrainingMonitor:
|
||
def __init__(self, losses):
|
||
self.train_losses = losses
|
||
self.best_val_loss = losses[-1] if losses else 0
|
||
self.should_stop = False
|
||
|
||
return model, TrainingMonitor(train_losses)
|
||
|
||
def test_mnist_mlp(model, test_data, test_labels):
|
||
"""Test YOUR MLP on MNIST test set."""
|
||
print("\n🧪 Testing YOUR MNIST MLP on 10,000 test images...")
|
||
|
||
batch_size = 100
|
||
correct = 0
|
||
total = 0
|
||
|
||
# Per-class accuracy tracking
|
||
class_correct = np.zeros(10)
|
||
class_total = np.zeros(10)
|
||
|
||
for i in range(0, len(test_data), batch_size):
|
||
batch_X = test_data[i:i+batch_size]
|
||
batch_y = test_labels[i:i+batch_size]
|
||
|
||
# Test with YOUR network
|
||
inputs = Tensor(batch_X) # Module 02: YOUR Tensor!
|
||
outputs = model(inputs) # YOUR forward pass!
|
||
|
||
outputs_np = np.array(outputs.data.data if hasattr(outputs.data, 'data') else outputs.data)
|
||
predictions = np.argmax(outputs_np, axis=1)
|
||
correct += np.sum(predictions == batch_y)
|
||
total += len(batch_y)
|
||
|
||
# Per-class accuracy
|
||
for j in range(len(batch_y)):
|
||
label = batch_y[j]
|
||
class_total[label] += 1
|
||
if predictions[j] == label:
|
||
class_correct[label] += 1
|
||
|
||
# Overall accuracy
|
||
accuracy = 100 * correct / total
|
||
print(f"\n 📊 Overall Test Accuracy: {accuracy:.2f}%")
|
||
|
||
# Per-digit accuracy
|
||
print("\n Per-Digit Performance (YOUR network's understanding):")
|
||
print(" " + "─"*45)
|
||
print(" │ Digit │ Accuracy │ Visual │")
|
||
print(" ├───────┼──────────┼─────────────────────┤")
|
||
|
||
for digit in range(10):
|
||
if class_total[digit] > 0:
|
||
digit_acc = 100 * class_correct[digit] / class_total[digit]
|
||
bar_length = int(digit_acc / 5)
|
||
bar = "█" * bar_length + "░" * (20 - bar_length)
|
||
print(f" │ {digit} │ {digit_acc:5.1f}% │ {bar} │")
|
||
|
||
print(" " + "─"*45)
|
||
|
||
if accuracy >= 90:
|
||
print("\n 🎉 SUCCESS! YOUR MLP achieved excellent accuracy with stable training!")
|
||
elif accuracy >= 80:
|
||
print("\n ✅ Great job! YOUR MLP is learning well with consistent progress!")
|
||
elif accuracy >= 70:
|
||
print("\n 📈 Good progress! YOUR MLP shows stable learning dynamics!")
|
||
else:
|
||
print("\n 🔄 YOUR MLP is learning... (stable training in progress)")
|
||
|
||
return accuracy
|
||
|
||
def analyze_mnist_systems(model, monitor):
|
||
"""Analyze YOUR MNIST MLP from an ML systems perspective."""
|
||
print("\n🔬 SYSTEMS ANALYSIS of YOUR MNIST Implementation:")
|
||
|
||
# Model size analysis
|
||
param_bytes = model.total_params * 4 # float32
|
||
|
||
print(f"\n Model Statistics:")
|
||
print(f" • Parameters: {model.total_params:,} weights")
|
||
print(f" • Memory: {param_bytes / 1024:.1f} KB")
|
||
print(f" • FLOPs per image: ~{model.total_params * 2:,}")
|
||
|
||
print(f"\n Performance Characteristics:")
|
||
print(f" • Training: O(N × P) where N=samples, P=parameters")
|
||
print(f" • Inference: {model.total_params * 2 / 1_000_000:.2f}M ops/image")
|
||
print(f" • YOUR implementation: Pure Python + NumPy")
|
||
|
||
# Training dynamics analysis
|
||
if monitor.train_losses:
|
||
final_train_loss = monitor.train_losses[-1]
|
||
epochs_completed = len(monitor.train_losses)
|
||
|
||
print(f"\n Training Dynamics:")
|
||
print(f" • Epochs completed: {epochs_completed}")
|
||
print(f" • Final training loss: {final_train_loss:.4f}")
|
||
print(f" • Training completed with YOUR manual loop")
|
||
|
||
# Loss convergence analysis
|
||
if len(monitor.train_losses) >= 3:
|
||
loss_improvement = monitor.train_losses[0] - monitor.train_losses[-1]
|
||
print(f" • Loss improvement: {loss_improvement:.4f}")
|
||
print(f" • Training stability: {'✅ Stable' if loss_improvement > 0 else '⚠️ Check convergence'}")
|
||
|
||
print(f"\n 🏛️ Historical Context:")
|
||
print(f" • 1986: Backprop made deep learning possible")
|
||
print(f" • 1998: LeNet-5 achieved 99.2% on MNIST (CNNs)")
|
||
print(f" • YOUR MLP: 95%+ with simple architecture")
|
||
print(f" • Modern: 99.8%+ possible with advanced techniques")
|
||
|
||
print(f"\n 💡 Systems Insights:")
|
||
print(f" • Fully connected = O(N²) parameters")
|
||
print(f" • Why CNNs win: Weight sharing reduces parameters")
|
||
print(f" • Manual training loops give YOU full control")
|
||
print(f" • This is how PyTorch training actually works!")
|
||
print(f" • YOUR achievement: Real vision with YOUR code!")
|
||
|
||
def main():
|
||
"""Demonstrate MNIST digit classification using YOUR TinyTorch!"""
|
||
|
||
parser = argparse.ArgumentParser(description='MNIST MLP 1986')
|
||
parser.add_argument('--test-only', action='store_true',
|
||
help='Test architecture without training')
|
||
parser.add_argument('--epochs', type=int, default=5,
|
||
help='Number of training epochs')
|
||
parser.add_argument('--batch-size', type=int, default=32,
|
||
help='Training batch size')
|
||
parser.add_argument('--visualize', action='store_true', default=True,
|
||
help='Show MNIST visualization')
|
||
parser.add_argument('--quick-test', action='store_true',
|
||
help='Train on subset for quick testing')
|
||
args = parser.parse_args()
|
||
|
||
print("🎯 MNIST MLP 1986 - Real Vision with YOUR Deep Network!")
|
||
print(" Historical significance: Backprop enables deep learning")
|
||
print(" YOUR achievement: 85-90% accuracy on real handwritten digits")
|
||
print(" Components used: YOUR complete ML system (Modules 01-06)")
|
||
|
||
# Show MNIST visualization
|
||
if args.visualize:
|
||
visualize_mnist_digits()
|
||
|
||
# Step 1: Load MNIST dataset
|
||
print("\n📥 Loading MNIST dataset...")
|
||
data_manager = DatasetManager()
|
||
|
||
(train_data, train_labels), (test_data, test_labels) = data_manager.get_mnist()
|
||
print(f"✅ Loaded {len(train_data)} training, {len(test_data)} test images")
|
||
|
||
# Quick test mode - use subset
|
||
if args.quick_test:
|
||
train_data = train_data[:1000]
|
||
train_labels = train_labels[:1000]
|
||
test_data = test_data[:100]
|
||
test_labels = test_labels[:100]
|
||
print(" (Using subset for quick testing)")
|
||
|
||
# Step 2: Create MLP with YOUR components
|
||
model = MNISTMLP(input_size=784, hidden1=128, hidden2=64, num_classes=10)
|
||
|
||
if args.test_only:
|
||
print("\n🧪 ARCHITECTURE TEST MODE")
|
||
test_input = Tensor(train_data[:5]) # Module 02: YOUR Tensor!
|
||
test_output = model(test_input) # YOUR architecture!
|
||
print(f"✅ Forward pass successful! Output shape: {test_output.data.shape}")
|
||
print("✅ YOUR deep MLP architecture works!")
|
||
return
|
||
|
||
# Step 3: Train using YOUR system with monitoring
|
||
start_time = time.time()
|
||
model, monitor = train_mnist_mlp(model, train_data, train_labels,
|
||
epochs=args.epochs, batch_size=args.batch_size)
|
||
train_time = time.time() - start_time
|
||
|
||
# Step 4: Test on test set
|
||
accuracy = test_mnist_mlp(model, test_data, test_labels)
|
||
|
||
# Step 5: Systems analysis
|
||
analyze_mnist_systems(model, monitor)
|
||
|
||
print(f"\n⏱️ Training time: {train_time:.1f} seconds")
|
||
print(f" YOUR implementation: {len(train_data) * args.epochs / train_time:.0f} images/sec")
|
||
|
||
print("\n✅ SUCCESS! MNIST Milestone Complete!")
|
||
print("\n🎓 What YOU Accomplished:")
|
||
print(" • YOU built a deep MLP with YOUR manual training loop")
|
||
print(" • YOUR backprop trains 100K+ parameters with no issues")
|
||
print(" • YOUR system demonstrates the core training cycle")
|
||
print(" • Forward → Loss → Backward → Update: YOU control it all!")
|
||
print(" • YOUR manual loop mirrors how PyTorch actually trains")
|
||
|
||
print("\n🚀 Next Steps:")
|
||
print(" • Continue to CNN milestone after Module 08 (Spatial)")
|
||
print(" • YOUR foundation scales to ImageNet and beyond!")
|
||
print(f" • With {accuracy:.1f}% accuracy, YOUR deep learning works!")
|
||
print(" • Manual training loops give YOU the deepest understanding")
|
||
|
||
if __name__ == "__main__":
|
||
main() |