TinyTorch/milestones/03_1986_mlp/02_rumelhart_mnist.py

#!/usr/bin/env python3
"""
MNIST MLP (1986) - Backpropagation Revolution
============================================

📚 HISTORICAL CONTEXT:
In 1986, Rumelhart, Hinton, and Williams popularized backpropagation, finally
enabling training of deep multi-layer networks. This breakthrough made it possible
to solve real vision problems like handwritten digit recognition, launching the
modern deep learning era.

🎯 WHAT YOU'RE BUILDING:
Using YOUR TinyTorch implementations, you'll build a multi-layer perceptron that
achieves 95%+ accuracy on MNIST digits - proving YOUR system can solve real vision!

✅ REQUIRED MODULES (Run after Module 06):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
  Module 01 (Tensor)        : YOUR data structure with autodiff
  Module 02 (Activations)   : YOUR ReLU for deep networks
  Module 03 (Layers)        : YOUR Linear layers
  Module 04 (Losses)        : YOUR CrossEntropyLoss
  Module 05 (Autograd)      : YOUR gradient computation
  Module 06 (Optimizers)    : YOUR SGD optimizer
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

🏗️ ARCHITECTURE (Deep Feedforward Network):
    ┌─────────────┐    ┌─────────┐    ┌─────────┐    ┌─────────┐    ┌─────────┐
    │ Input Image │    │ Flatten │    │ Linear  │    │ Linear  │    │ Output  │
    │   28×28     │───▶│   784   │───▶│ 784→128 │───▶│ 128→64  │───▶│  64→10  │
    │   Pixels    │    │ YOUR M4 │    │  +ReLU  │    │  +ReLU  │    │ Classes │
    └─────────────┘    └─────────┘    └─────────┘    └─────────┘    └─────────┘
                                      Hidden Layer 1  Hidden Layer 2  Digit Probs

🔍 MNIST DATASET - THE HELLO WORLD OF COMPUTER VISION:

MNIST contains 70,000 handwritten digits (60K train, 10K test):

    Sample Digits:                   Why MNIST Matters:

    ┌─────┐ ┌─────┐ ┌─────┐        • First "real" vision benchmark
    │ ███ │ │█████│ │█████│        • 28×28 pixels = 784 features
    │█   █│ │    █│ │    █│        • 10 classes (digits 0-9)
    │   █ │ │  ██ │ │ ███ │        • Proves deep learning works
    │  █  │ │ █   │ │    █│        • YOUR MLP will get 95%+ accuracy!
    │ █   │ │█████│ │█████│
    └─────┘ └─────┘ └─────┘
      "1"     "2"     "3"

    Network learns to map:
    784 pixels → Hidden features → Digit classification

📊 EXPECTED PERFORMANCE:
- Dataset: 60,000 training images, 10,000 test images
- Training time: 2-3 minutes (5 epochs with manual training loop)
- Expected accuracy: 85-90% on test set (realistic with basic training)
- Parameters: ~100K weights (small by modern standards!)
- Training stability: Loss consistently decreases with YOUR manual loop
"""

import sys
import os
import numpy as np
import argparse
import time

# Add project root to path for TinyTorch imports
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(project_root)

# Import TinyTorch components YOU BUILT!
from tinytorch.core.tensor import Tensor          # Module 02: YOU built this!
from tinytorch.core.layers import Linear          # Module 04: YOU built this!
from tinytorch.core.activations import ReLU, Softmax  # Module 03: YOU built this!
from tinytorch.core.losses import CrossEntropyLoss  # Module 04: YOU built this!
from tinytorch.core.optimizers import SGD          # Module 06: YOU built this!

# Import dataset manager
from data_manager import DatasetManager

def flatten(x):
    """Flatten operation for CNN to MLP transition."""
    batch_size = x.data.shape[0]
    return Tensor(x.data.reshape(batch_size, -1))

class MNISTMLP:
    """
    Multi-Layer Perceptron for MNIST using YOUR TinyTorch!

    This architecture proved deep learning could solve real vision problems.
    """

    def __init__(self, input_size=784, hidden1=128, hidden2=64, num_classes=10):
        print("🧠 Building MNIST MLP with YOUR TinyTorch modules...")

        # Deep architecture - multiple hidden layers!
        self.fc1 = Linear(input_size, hidden1)    # Module 04: YOUR Linear layer!
        self.relu1 = ReLU()                       # Module 03: YOUR activation!
        self.fc2 = Linear(hidden1, hidden2)       # Module 04: YOUR Linear layer!
        self.relu2 = ReLU()                       # Module 03: YOUR activation!
        self.fc3 = Linear(hidden2, num_classes)   # Module 04: YOUR output layer!

        # Store architecture info
        self.total_params = (
            input_size * hidden1 + hidden1 +      # fc1
            hidden1 * hidden2 + hidden2 +         # fc2
            hidden2 * num_classes + num_classes   # fc3
        )

        print(f"   Architecture: {input_size} → {hidden1} → {hidden2} → {num_classes}")
        print(f"   Total parameters: {self.total_params:,} (YOUR Linear layers)")
        print(f"   Activation: ReLU (YOUR Module 03)")

    def forward(self, x):
        """Forward pass through YOUR deep network."""
        # Flatten image to vector
        batch_size = x.data.shape[0]
        x = Tensor(x.data.reshape(batch_size, -1))  # 28×28 → 784

        # Deep forward pass using YOUR components
        x = self.fc1(x)        # Module 04: YOUR Linear layer!
        x = self.relu1(x)      # Module 03: YOUR ReLU activation!
        x = self.fc2(x)        # Module 04: YOUR Linear layer!
        x = self.relu2(x)      # Module 03: YOUR ReLU activation!
        x = self.fc3(x)        # Module 04: YOUR output layer!

        return x

    def parameters(self):
        """Get all trainable parameters from YOUR layers."""
        return [
            self.fc1.weights, self.fc1.bias,
            self.fc2.weights, self.fc2.bias,
            self.fc3.weights, self.fc3.bias
        ]

def visualize_mnist_digits():
    """Show ASCII representation of MNIST digits."""
    print("\n" + "="*70)
    print("🔢 VISUALIZING MNIST - Handwritten Digit Recognition:")
    print("="*70)

    print("""
    Sample Training Data:              What YOUR Network Learns:

    28×28 Pixel Images:                Feature Hierarchy:
    ┌──────────┐
    │░░░░██░░░░│ → Flatten(784) →     Layer 1: Edge detectors
    │░░░███░░░░│                       - Vertical lines
    │░░██░█░░░░│                       - Horizontal lines
    │░░░░░█░░░░│                       - Curves
    │░░░░░█░░░░│
    │░░░░░█░░░░│                       Layer 2: Shape components
    │░░░█████░░│                       - Loops (0, 6, 8, 9)
    │░░░░░░░░░░│                       - Lines (1, 7)
    └──────────┘                       - Corners (4, 5)
    Digit "7"
                                       Output: Class probabilities
    YOUR network learns to:            P("0") = 0.01
    1. Extract features from pixels    P("1") = 0.02
    2. Combine features hierarchically  ...
    3. Classify into 10 digit classes  P("7") = 0.91 ← Highest!
    """)
    print("="*70)

def train_mnist_mlp(model, train_data, train_labels,
                   epochs=5, batch_size=32, learning_rate=0.01):
    """
    Train MNIST MLP using YOUR manual training loop!
    This demonstrates the core training cycle that YOU control.
    """
    print("\n🚀 Training MNIST MLP with YOUR TinyTorch system!")
    print(f"   Dataset: {len(train_data)} training images")
    print(f"   Manual training loop - YOU control the process!")
    print(f"   Cross-entropy loss (Module 04)")
    print(f"   SGD optimizer (Module 06)")

    # YOUR optimizer and loss function!
    optimizer = SGD(model.parameters(), lr=learning_rate)
    loss_fn = CrossEntropyLoss()

    # Training history
    train_losses = []

    # Training loop
    for epoch in range(epochs):
        epoch_loss = 0
        num_batches = 0

        # Shuffle data
        indices = np.random.permutation(len(train_data))
        train_data_shuffled = train_data[indices]
        train_labels_shuffled = train_labels[indices]

        # Mini-batch training
        for i in range(0, len(train_data), batch_size):
            # Get batch
            batch_end = min(i + batch_size, len(train_data))
            batch_X = train_data_shuffled[i:batch_end]
            batch_y = train_labels_shuffled[i:batch_end]

            # Convert to tensors
            inputs = Tensor(batch_X)
            targets = Tensor(batch_y)

            # Forward pass with YOUR model
            outputs = model(inputs)

            # Compute loss with YOUR loss function
            loss = loss_fn.forward(outputs, targets)

            # Backward pass with YOUR autograd
            loss.backward()

            # Update weights with YOUR optimizer
            optimizer.step()
            optimizer.zero_grad()

            # Track loss
            loss_value = loss.data.item() if hasattr(loss.data, 'item') else float(loss.data)
            epoch_loss += loss_value
            num_batches += 1

        # Epoch summary
        avg_loss = epoch_loss / num_batches
        train_losses.append(avg_loss)
        print(f"   Epoch {epoch+1}/{epochs}: Loss = {avg_loss:.4f}")

    print("\n📈 Training completed!")
    print("   ✅ YOUR manual training loop works!")
    print("   ✅ Cross-entropy loss computed correctly")
    print("   ✅ SGD optimizer updated all parameters")
    print("   ✅ Gradients flowed through YOUR network")

    # Simple monitor object for compatibility
    class TrainingMonitor:
        def __init__(self, losses):
            self.train_losses = losses
            self.best_val_loss = losses[-1] if losses else 0
            self.should_stop = False

    return model, TrainingMonitor(train_losses)

def test_mnist_mlp(model, test_data, test_labels):
    """Test YOUR MLP on MNIST test set."""
    print("\n🧪 Testing YOUR MNIST MLP on 10,000 test images...")

    batch_size = 100
    correct = 0
    total = 0

    # Per-class accuracy tracking
    class_correct = np.zeros(10)
    class_total = np.zeros(10)

    for i in range(0, len(test_data), batch_size):
        batch_X = test_data[i:i+batch_size]
        batch_y = test_labels[i:i+batch_size]

        # Test with YOUR network
        inputs = Tensor(batch_X)  # Module 02: YOUR Tensor!
        outputs = model(inputs)  # YOUR forward pass!

        outputs_np = np.array(outputs.data.data if hasattr(outputs.data, 'data') else outputs.data)
        predictions = np.argmax(outputs_np, axis=1)
        correct += np.sum(predictions == batch_y)
        total += len(batch_y)

        # Per-class accuracy
        for j in range(len(batch_y)):
            label = batch_y[j]
            class_total[label] += 1
            if predictions[j] == label:
                class_correct[label] += 1

    # Overall accuracy
    accuracy = 100 * correct / total
    print(f"\n   📊 Overall Test Accuracy: {accuracy:.2f}%")

    # Per-digit accuracy
    print("\n   Per-Digit Performance (YOUR network's understanding):")
    print("   " + "─"*45)
    print("   │ Digit │ Accuracy │ Visual              │")
    print("   ├───────┼──────────┼─────────────────────┤")

    for digit in range(10):
        if class_total[digit] > 0:
            digit_acc = 100 * class_correct[digit] / class_total[digit]
            bar_length = int(digit_acc / 5)
            bar = "█" * bar_length + "░" * (20 - bar_length)
            print(f"   │   {digit}   │  {digit_acc:5.1f}%  │ {bar} │")

    print("   " + "─"*45)

    if accuracy >= 90:
        print("\n   🎉 SUCCESS! YOUR MLP achieved excellent accuracy with stable training!")
    elif accuracy >= 80:
        print("\n   ✅ Great job! YOUR MLP is learning well with consistent progress!")
    elif accuracy >= 70:
        print("\n   📈 Good progress! YOUR MLP shows stable learning dynamics!")
    else:
        print("\n   🔄 YOUR MLP is learning... (stable training in progress)")

    return accuracy

def analyze_mnist_systems(model, monitor):
    """Analyze YOUR MNIST MLP from an ML systems perspective."""
    print("\n🔬 SYSTEMS ANALYSIS of YOUR MNIST Implementation:")

    # Model size analysis
    param_bytes = model.total_params * 4  # float32

    print(f"\n   Model Statistics:")
    print(f"   • Parameters: {model.total_params:,} weights")
    print(f"   • Memory: {param_bytes / 1024:.1f} KB")
    print(f"   • FLOPs per image: ~{model.total_params * 2:,}")

    print(f"\n   Performance Characteristics:")
    print(f"   • Training: O(N × P) where N=samples, P=parameters")
    print(f"   • Inference: {model.total_params * 2 / 1_000_000:.2f}M ops/image")
    print(f"   • YOUR implementation: Pure Python + NumPy")

    # Training dynamics analysis
    if monitor.train_losses:
        final_train_loss = monitor.train_losses[-1]
        epochs_completed = len(monitor.train_losses)

        print(f"\n   Training Dynamics:")
        print(f"   • Epochs completed: {epochs_completed}")
        print(f"   • Final training loss: {final_train_loss:.4f}")
        print(f"   • Training completed with YOUR manual loop")

        # Loss convergence analysis
        if len(monitor.train_losses) >= 3:
            loss_improvement = monitor.train_losses[0] - monitor.train_losses[-1]
            print(f"   • Loss improvement: {loss_improvement:.4f}")
            print(f"   • Training stability: {'✅ Stable' if loss_improvement > 0 else '⚠️ Check convergence'}")

    print(f"\n   🏛️ Historical Context:")
    print(f"   • 1986: Backprop made deep learning possible")
    print(f"   • 1998: LeNet-5 achieved 99.2% on MNIST (CNNs)")
    print(f"   • YOUR MLP: 95%+ with simple architecture")
    print(f"   • Modern: 99.8%+ possible with advanced techniques")

    print(f"\n   💡 Systems Insights:")
    print(f"   • Fully connected = O(N²) parameters")
    print(f"   • Why CNNs win: Weight sharing reduces parameters")
    print(f"   • Manual training loops give YOU full control")
    print(f"   • This is how PyTorch training actually works!")
    print(f"   • YOUR achievement: Real vision with YOUR code!")

def main():
    """Demonstrate MNIST digit classification using YOUR TinyTorch!"""

    parser = argparse.ArgumentParser(description='MNIST MLP 1986')
    parser.add_argument('--test-only', action='store_true',
                       help='Test architecture without training')
    parser.add_argument('--epochs', type=int, default=5,
                       help='Number of training epochs')
    parser.add_argument('--batch-size', type=int, default=32,
                       help='Training batch size')
    parser.add_argument('--visualize', action='store_true', default=True,
                       help='Show MNIST visualization')
    parser.add_argument('--quick-test', action='store_true',
                       help='Train on subset for quick testing')
    args = parser.parse_args()

    print("🎯 MNIST MLP 1986 - Real Vision with YOUR Deep Network!")
    print("   Historical significance: Backprop enables deep learning")
    print("   YOUR achievement: 85-90% accuracy on real handwritten digits")
    print("   Components used: YOUR complete ML system (Modules 01-06)")

    # Show MNIST visualization
    if args.visualize:
        visualize_mnist_digits()

    # Step 1: Load MNIST dataset
    print("\n📥 Loading MNIST dataset...")
    data_manager = DatasetManager()

    (train_data, train_labels), (test_data, test_labels) = data_manager.get_mnist()
    print(f"✅ Loaded {len(train_data)} training, {len(test_data)} test images")

    # Quick test mode - use subset
    if args.quick_test:
        train_data = train_data[:1000]
        train_labels = train_labels[:1000]
        test_data = test_data[:100]
        test_labels = test_labels[:100]
        print("   (Using subset for quick testing)")

    # Step 2: Create MLP with YOUR components
    model = MNISTMLP(input_size=784, hidden1=128, hidden2=64, num_classes=10)

    if args.test_only:
        print("\n🧪 ARCHITECTURE TEST MODE")
        test_input = Tensor(train_data[:5])  # Module 02: YOUR Tensor!
        test_output = model(test_input)  # YOUR architecture!
        print(f"✅ Forward pass successful! Output shape: {test_output.data.shape}")
        print("✅ YOUR deep MLP architecture works!")
        return

    # Step 3: Train using YOUR system with monitoring
    start_time = time.time()
    model, monitor = train_mnist_mlp(model, train_data, train_labels,
                                   epochs=args.epochs, batch_size=args.batch_size)
    train_time = time.time() - start_time

    # Step 4: Test on test set
    accuracy = test_mnist_mlp(model, test_data, test_labels)

    # Step 5: Systems analysis
    analyze_mnist_systems(model, monitor)

    print(f"\n⏱️  Training time: {train_time:.1f} seconds")
    print(f"   YOUR implementation: {len(train_data) * args.epochs / train_time:.0f} images/sec")

    print("\n✅ SUCCESS! MNIST Milestone Complete!")
    print("\n🎓 What YOU Accomplished:")
    print("   • YOU built a deep MLP with YOUR manual training loop")
    print("   • YOUR backprop trains 100K+ parameters with no issues")
    print("   • YOUR system demonstrates the core training cycle")
    print("   • Forward → Loss → Backward → Update: YOU control it all!")
    print("   • YOUR manual loop mirrors how PyTorch actually trains")

    print("\n🚀 Next Steps:")
    print("   • Continue to CNN milestone after Module 08 (Spatial)")
    print("   • YOUR foundation scales to ImageNet and beyond!")
    print(f"   • With {accuracy:.1f}% accuracy, YOUR deep learning works!")
    print("   • Manual training loops give YOU the deepest understanding")

if __name__ == "__main__":
    main()