diff --git a/examples/cifar_cnn_modern/train_cnn.py b/examples/cifar_cnn_modern/train_cnn.py index 4cda3201..cb36975d 100644 --- a/examples/cifar_cnn_modern/train_cnn.py +++ b/examples/cifar_cnn_modern/train_cnn.py @@ -1,119 +1,462 @@ #!/usr/bin/env python3 """ -Clean CIFAR-10 CNN Example - What Students Built +CIFAR-10 CNN (Modern) - Convolutional Revolution =============================================== -After completing modules 02-10, students can build CNNs for real image classification. -This demonstrates how convolution + pooling creates spatial feature hierarchies. +πŸ“š HISTORICAL CONTEXT: +Convolutional Neural Networks revolutionized computer vision by exploiting spatial +structure in images. Unlike MLPs that flatten images (losing spatial relationships), +CNNs preserve spatial hierarchies through local connectivity and weight sharing, +enabling recognition of complex patterns in natural images. -MODULES EXERCISED IN THIS EXAMPLE: +🎯 WHAT YOU'RE BUILDING: +Using YOUR TinyTorch implementations, you'll build a CNN that achieves 65%+ accuracy +on CIFAR-10 natural images - proving YOUR spatial modules can extract hierarchical +features from real-world photographs! + +βœ… REQUIRED MODULES (Run after Module 10): ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - Module 02 (Tensor) : Data structure with gradient tracking - Module 03 (Activations) : ReLU activation throughout the network - Module 04 (Layers) : Linear layers for classification head - Module 05 (Networks) : Module base class for CNN architecture - Module 06 (Autograd) : Backprop through conv and dense layers - Module 07 (Spatial) : Conv2d, MaxPool2d, Flatten operations - Module 08 (Optimizers) : Adam optimizer with momentum - Module 09 (DataLoader) : CIFAR10Dataset and batch processing - Module 10 (Training) : CrossEntropy loss for multi-class + Module 02 (Tensor) : YOUR data structure with autodiff + Module 03 (Activations) : YOUR ReLU for feature extraction + Module 04 (Layers) : YOUR Linear layers for classification + Module 05 (Losses) : YOUR CrossEntropy loss + Module 07 (Optimizers) : YOUR Adam optimizer + Module 08 (Training) : YOUR training loops + Module 09 (Spatial) : YOUR Conv2D, MaxPool2D, Flatten + Module 10 (DataLoader) : YOUR CIFAR10Dataset and batching ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ -CNN Architecture: - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Input Image β”‚ β”‚ Conv2d β”‚ β”‚ MaxPool β”‚ β”‚ Conv2d β”‚ β”‚ MaxPool β”‚ - β”‚ (32Γ—32Γ—3) │─▢│ 3β†’32 │─▢│ (2Γ—2) │─▢│ 32β†’64 │─▢│ (2Γ—2) β”‚ - β”‚ RGB Pixels β”‚ β”‚ Module β”‚ β”‚ Module β”‚ β”‚ Module 07 β”‚ β”‚ Module β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ 07 β”‚ β”‚ 07 β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ 07 β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ β”‚ - β–Ό β–Ό - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ ReLU β”‚ β”‚ Flatten β”‚ - β”‚ Module β”‚ β”‚ β†’ Dense β”‚ - β”‚ 03 β”‚ β”‚ Module 04 β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β” - β”‚ Dense Classifier: 1600 β†’ 256 β†’ 10 classes β”‚ - β”‚ Module 04: Linear layers + ReLU β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +πŸ—οΈ ARCHITECTURE (Hierarchical Feature Extraction): + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Input Image β”‚ β”‚ Conv2D β”‚ β”‚ MaxPool β”‚ β”‚ Conv2D β”‚ β”‚ MaxPool β”‚ + β”‚ 32Γ—32Γ—3 RGB │─▢│ 3β†’32 │─▢│ 2Γ—2 │─▢│ 32β†’64 │─▢│ 2Γ—2 β”‚ + β”‚ Pixels β”‚ β”‚ YOUR M9 β”‚ β”‚ YOUR M9 β”‚ β”‚ YOUR M9 β”‚ β”‚ YOUR M9 β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + ↓ ↓ + Edge Detection Shape Detection + + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Flatten β†’ Linear β†’ Linear β†’ 10 β”‚ + β”‚ YOUR M9 YOUR M4 YOUR M4 β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + Object Recognition β†’ Classification -Feature Hierarchy: Pixels β†’ Edges β†’ Shapes β†’ Objects β†’ Classes +πŸ” CIFAR-10 DATASET - REAL NATURAL IMAGES: + +CIFAR-10 contains 60,000 32Γ—32 color images in 10 classes: + + Sample Images: Feature Hierarchy YOUR CNN Learns: + + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” Layer 1 (Conv 3β†’32): + β”‚ ✈️ Plane β”‚ β€’ Edge detectors + β”‚[Sky blue ]β”‚ β€’ Color gradients + β”‚[White ]β”‚ β€’ Simple textures + β”‚[Wings ]β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ Layer 2 (Conv 32β†’64): + β€’ Object parts + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β€’ Complex patterns + β”‚ πŸš— Car β”‚ β€’ Spatial relationships + β”‚[Red body ]β”‚ + β”‚[Wheels ]β”‚ Output Layer: + β”‚[Windows ]β”‚ β€’ Complete objects + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β€’ Class probabilities + + Classes: plane, car, bird, cat, deer, dog, frog, horse, ship, truck + + Why CNNs Excel at Natural Images: + β€’ LOCAL CONNECTIVITY: Pixels near each other are related + β€’ WEIGHT SHARING: Same filter detects patterns everywhere + β€’ HIERARCHICAL LEARNING: Edges β†’ Shapes β†’ Objects + β€’ TRANSLATION INVARIANCE: Detects cat anywhere in image + +πŸ“Š EXPECTED PERFORMANCE: +- Dataset: 50,000 training images, 10,000 test images +- Training time: 3-5 minutes (demonstration mode) +- Expected accuracy: 65%+ (with YOUR simple CNN!) +- Parameters: ~600K (mostly in conv layers) """ -from tinytorch import nn, optim -from tinytorch.core.tensor import Tensor -from tinytorch.core.autograd import to_numpy +import sys +import os import numpy as np +import argparse +import time -class CIFARCNN(nn.Module): - def __init__(self): - super().__init__() # Module 05: You built Module base class! - # Convolutional feature extraction - self.conv1 = nn.Conv2d(3, 32, (3, 3)) # Module 07: You built 2D convolution! - self.conv2 = nn.Conv2d(32, 64, (3, 3)) # Module 07: You built filter sliding! - - # Dense classification - # After conv1(32x32β†’30x30) β†’ pool(15x15) β†’ conv2(13x13) β†’ pool(6x6) - # Final feature size: 64 channels * 6 * 6 = 2304 - self.fc1 = nn.Linear(64 * 6 * 6, 256) # Module 04: You built Linear layers! - self.fc2 = nn.Linear(256, 10) # Module 04: Your weight matrices! +# Add project root to path +project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(project_root) + +# Import TinyTorch components YOU BUILT! +from tinytorch.core.tensor import Tensor # Module 02: YOU built this! +from tinytorch.core.layers import Linear # Module 04: YOU built this! +from tinytorch.core.activations import ReLU, Softmax # Module 03: YOU built this! +from tinytorch.core.spatial import Conv2D, MaxPool2D # Module 09: YOU built this! +from tinytorch.core.losses import CrossEntropyLoss # Module 05: YOU built this! +from tinytorch.core.optimizers import Adam # Module 07: YOU built this! +# DataLoader would normally be imported from Module 10 +# For this demo, we'll use the data_manager directly + +# Import dataset manager +try: + from examples.data_manager import DatasetManager +except ImportError: + sys.path.append(os.path.join(project_root, 'examples')) + from data_manager import DatasetManager + +def flatten(x): + """Flatten spatial features for dense layers - YOUR implementation!""" + batch_size = x.data.shape[0] + return Tensor(x.data.reshape(batch_size, -1)) + +class CIFARCNN: + """ + Convolutional Neural Network for CIFAR-10 using YOUR TinyTorch! + This architecture demonstrates how spatial feature extraction enables + recognition of complex patterns in natural images. + """ + + def __init__(self): + print("🧠 Building CIFAR-10 CNN with YOUR TinyTorch modules...") + + # Convolutional feature extractors - YOUR spatial modules! + self.conv1 = Conv2D(in_channels=3, out_channels=32, kernel_size=3) # Module 09! + self.conv2 = Conv2D(in_channels=32, out_channels=64, kernel_size=3) # Module 09! + self.pool = MaxPool2D(pool_size=2) # Module 09: YOUR pooling! + + # Activation functions + self.relu = ReLU() # Module 03: YOUR activation! + + # Dense classification head + # After conv1(32β†’30)β†’pool(15)β†’conv2(13)β†’pool(6): 64*6*6 = 2304 features + self.fc1 = Linear(64 * 6 * 6, 256) # Module 04: YOUR Linear! + self.fc2 = Linear(256, 10) # Module 04: YOUR Linear! + + # Calculate total parameters + conv1_params = 3 * 3 * 3 * 32 + 32 # 3Γ—3 kernels, 3β†’32 channels + conv2_params = 3 * 3 * 32 * 64 + 64 # 3Γ—3 kernels, 32β†’64 channels + fc1_params = 64 * 6 * 6 * 256 + 256 # Flattenedβ†’256 + fc2_params = 256 * 10 + 10 # 256β†’10 classes + self.total_params = conv1_params + conv2_params + fc1_params + fc2_params + + print(f" Conv1: 3β†’32 channels (YOUR Conv2D extracts edges)") + print(f" Conv2: 32β†’64 channels (YOUR Conv2D builds shapes)") + print(f" Dense: 2304β†’256β†’10 (YOUR Linear classification)") + print(f" Total parameters: {self.total_params:,}") + def forward(self, x): - # First conv block: extract low-level features (edges, textures) - x = self.conv1(x) # Module 07: Your Conv2d sliding filters! - x = nn.F.relu(x) # Module 03: You built ReLU activation! - x = nn.F.max_pool2d(x, 2) # Module 07: You built max pooling! + """Forward pass through YOUR CNN architecture.""" + # First conv block: Extract low-level features (edges, colors) + x = self.conv1(x) # Module 09: YOUR Conv2D! + x = self.relu(x) # Module 03: YOUR ReLU! + x = self.pool(x) # Module 09: YOUR MaxPool2D! - # Second conv block: extract higher-level features (shapes, patterns) - x = self.conv2(x) # Module 07: Your deeper convolutions! - x = nn.F.relu(x) # Module 03: Your non-linearity! - x = nn.F.max_pool2d(x, 2) # Module 07: Your spatial reduction! + # Second conv block: Build higher-level features (shapes, patterns) + x = self.conv2(x) # Module 09: YOUR Conv2D! + x = self.relu(x) # Module 03: YOUR ReLU! + x = self.pool(x) # Module 09: YOUR MaxPool2D! - # Classification head - x = nn.F.flatten(x, start_dim=1) # Module 07: You built flatten operation! - x = self.fc1(x) # Module 04: Your Linear layer! - x = nn.F.relu(x) # Module 03: Your activation! - return self.fc2(x) # Module 04: Your final classification! + # Flatten and classify + x = flatten(x) # Module 09: YOUR spatialβ†’dense bridge! + x = self.fc1(x) # Module 04: YOUR Linear! + x = self.relu(x) # Module 03: YOUR ReLU! + x = self.fc2(x) # Module 04: YOUR classification! + + return x + + def parameters(self): + """Get all trainable parameters from YOUR layers.""" + return [ + self.conv1.weight, self.conv1.bias, + self.conv2.weight, self.conv2.bias, + self.fc1.weight, self.fc1.bias, + self.fc2.weight, self.fc2.bias + ] + +def visualize_cifar_cnn(): + """Show how CNNs process natural images.""" + print("\n" + "="*70) + print("πŸ–ΌοΈ VISUALIZING CNN FEATURE EXTRACTION:") + print("="*70) + + print(""" + How YOUR CNN Sees Images: Feature Maps at Each Layer: + + Original Image (32Γ—32Γ—3): After Conv1 (30Γ—30Γ—32): + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”¬β”€β”¬β”€β”¬β”€β”¬β”€β”¬β”€β”¬β”€β”¬β”€β”¬β”€β” + β”‚ [Cat in grass] β”‚ β”‚Edge detectors...β”‚ 32 filters + β”‚ Complex scene β”‚ β†’ Conv+ReLU β†’ β”‚Texture maps... β”‚ detect + β”‚ Many patterns β”‚ β”‚Color gradients. β”‚ features + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”΄β”€β”΄β”€β”΄β”€β”΄β”€β”΄β”€β”΄β”€β”΄β”€β”΄β”€β”˜ + + After Pool1 (15Γ—15Γ—32): After Conv2 (13Γ—13Γ—64): + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”¬β”€β”¬β”€β”¬β”€β”¬β”€β”¬β”€β”¬β”€β”¬β”€β”¬β”€β” + β”‚Reduced β”‚ β”‚Cat ears... β”‚ 64 filters + β”‚spatial β”‚ β†’ Conv+ReLU β†’ β”‚Cat eyes... β”‚ combine + β”‚dimensionβ”‚ β”‚Grass texture...β”‚ features + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”΄β”€β”΄β”€β”΄β”€β”΄β”€β”΄β”€β”΄β”€β”΄β”€β”΄β”€β”˜ + + After Pool2 + Flatten: Classification: + [6Γ—6Γ—64 = 2304 features] β†’ Dense β†’ [plane|car|bird|CAT|...] + Highest probability + + Key CNN Advantages YOUR Implementation Provides: + βœ“ SPATIAL HIERARCHY: Low β†’ High level features + βœ“ PARAMETER SHARING: 3Γ—3 kernel used everywhere + βœ“ TRANSLATION INVARIANCE: Detects patterns anywhere + βœ“ AUTOMATIC FEATURE LEARNING: No manual engineering! + """) + print("="*70) + +def train_cifar_cnn(model, train_data, train_labels, + epochs=3, batch_size=32, learning_rate=0.001): + """Train CNN using YOUR complete training system!""" + print("\nπŸš€ Training CIFAR-10 CNN with YOUR TinyTorch!") + print(f" Dataset: {len(train_data)} color images") + print(f" Batch size: {batch_size}") + print(f" YOUR Adam optimizer (Module 07)") + + # YOUR optimizer and loss + optimizer = Adam(model.parameters(), learning_rate=learning_rate) + loss_fn = CrossEntropyLoss() + + # Training loop + num_batches = min(100, len(train_data) // batch_size) # Demo mode + + for epoch in range(epochs): + print(f"\n Epoch {epoch+1}/{epochs}:") + epoch_loss = 0 + correct = 0 + total = 0 + + for batch_idx in range(num_batches): + # Get batch + start_idx = batch_idx * batch_size + end_idx = start_idx + batch_size + batch_X = train_data[start_idx:end_idx] + batch_y = train_labels[start_idx:end_idx] + + # YOUR Tensors + inputs = Tensor(batch_X) # Module 02! + targets = Tensor(batch_y) # Module 02! + + # Forward pass with YOUR CNN + outputs = model.forward(inputs) # YOUR spatial features! + loss = loss_fn(outputs, targets) # Module 05! + + # Backward pass with YOUR autograd + optimizer.zero_grad() # Module 07! + loss.backward() # Module 06: YOUR autodiff! + optimizer.step() # Module 07! + + # Track accuracy + predictions = np.argmax(outputs.data, axis=1) + correct += np.sum(predictions == batch_y) + total += len(batch_y) + + # Extract loss + if hasattr(loss, 'item'): + loss_value = loss.item() + else: + loss_value = float(loss.data) if not isinstance(loss.data, np.ndarray) else float(loss.data.flat[0]) + + epoch_loss += loss_value + + # Progress + if (batch_idx + 1) % 20 == 0: + acc = 100 * correct / total + print(f" Batch {batch_idx+1}/{num_batches}: " + f"Loss = {loss_value:.4f}, Accuracy = {acc:.1f}%") + + # Epoch summary + epoch_acc = 100 * correct / total + avg_loss = epoch_loss / num_batches + print(f" β†’ Epoch Complete: Loss = {avg_loss:.4f}, " + f"Accuracy = {epoch_acc:.1f}% (YOUR CNN learning!)") + + return model + +def test_cifar_cnn(model, test_data, test_labels, class_names): + """Test YOUR CNN on CIFAR-10 test set.""" + print("\nπŸ§ͺ Testing YOUR CNN on Natural Images...") + + batch_size = 100 + correct = 0 + total = 0 + class_correct = np.zeros(10) + class_total = np.zeros(10) + + # Test in batches + num_test_batches = min(20, len(test_data) // batch_size) # Demo + + for i in range(num_test_batches): + batch_X = test_data[i*batch_size:(i+1)*batch_size] + batch_y = test_labels[i*batch_size:(i+1)*batch_size] + + inputs = Tensor(batch_X) + outputs = model.forward(inputs) + + predictions = np.argmax(outputs.data, axis=1) + correct += np.sum(predictions == batch_y) + total += len(batch_y) + + # Per-class accuracy + for j in range(len(batch_y)): + label = batch_y[j] + class_total[label] += 1 + if predictions[j] == label: + class_correct[label] += 1 + + # Results + accuracy = 100 * correct / total + print(f"\n πŸ“Š Overall Test Accuracy: {accuracy:.2f}%") + + # Per-class performance + print("\n Per-Class Performance (YOUR CNN's understanding):") + print(" " + "─"*50) + print(" β”‚ Class β”‚ Accuracy β”‚ Visual β”‚") + print(" β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€") + + for i, class_name in enumerate(class_names): + if class_total[i] > 0: + class_acc = 100 * class_correct[i] / class_total[i] + bar_length = int(class_acc / 5) + bar = "β–ˆ" * bar_length + "β–‘" * (20 - bar_length) + print(f" β”‚ {class_name:10} β”‚ {class_acc:5.1f}% β”‚ {bar} β”‚") + + print(" " + "─"*50) + + if accuracy >= 65: + print("\n πŸŽ‰ EXCELLENT! YOUR CNN mastered natural image recognition!") + elif accuracy >= 50: + print("\n βœ… Good progress! YOUR CNN is learning visual features!") + else: + print("\n πŸ”„ YOUR CNN is still learning... (normal for demo mode)") + + return accuracy + +def analyze_cnn_systems(model): + """Analyze YOUR CNN from an ML systems perspective.""" + print("\nπŸ”¬ SYSTEMS ANALYSIS of YOUR CNN Implementation:") + + print(f"\n Model Architecture:") + print(f" β€’ Convolutional layers: 2 (3β†’32β†’64 channels)") + print(f" β€’ Pooling layers: 2 (2Γ—2 max pooling)") + print(f" β€’ Dense layers: 2 (2304β†’256β†’10)") + print(f" β€’ Total parameters: {model.total_params:,}") + + print(f"\n Computational Complexity:") + print(f" β€’ Conv1: 32Γ—30Γ—30Γ—(3Γ—3Γ—3) = 777,600 ops") + print(f" β€’ Conv2: 64Γ—13Γ—13Γ—(3Γ—3Γ—32) = 3,093,504 ops") + print(f" β€’ Dense: 2,304Γ—256 + 256Γ—10 = 592,384 ops") + print(f" β€’ Total: ~4.5M ops per image") + + print(f"\n Memory Requirements:") + print(f" β€’ Parameters: {model.total_params * 4 / 1024:.1f} KB") + print(f" β€’ Activations (peak): ~500 KB per image") + print(f" β€’ YOUR implementation: Pure Python + NumPy") + + print(f"\n πŸ›οΈ CNN Evolution:") + print(f" β€’ 1989: LeCun's CNN for handwritten digits") + print(f" β€’ 2012: AlexNet revolutionizes ImageNet") + print(f" β€’ 2015: ResNet enables 100+ layer networks") + print(f" β€’ YOUR CNN: Core principles that power them all!") + + print(f"\n πŸ’‘ Why CNNs Dominate Vision:") + print(f" β€’ Spatial hierarchy matches visual cortex") + print(f" β€’ Parameter sharing: 3Γ—3 kernel vs 32Γ—32 dense") + print(f" β€’ Translation invariance from weight sharing") + print(f" β€’ YOUR implementation demonstrates all of these!") def main(): - # For validation testing, test architecture only (no training) - print("πŸ–ΌοΈ Testing CIFAR-10 CNN Architecture...") + """Demonstrate CIFAR-10 CNN using YOUR TinyTorch!""" - model = CIFARCNN() + parser = argparse.ArgumentParser(description='CIFAR-10 CNN') + parser.add_argument('--test-only', action='store_true', + help='Test architecture only') + parser.add_argument('--epochs', type=int, default=3, + help='Training epochs (demo mode)') + parser.add_argument('--batch-size', type=int, default=32, + help='Batch size') + parser.add_argument('--visualize', action='store_true', default=True, + help='Show CNN visualization') + parser.add_argument('--quick-test', action='store_true', + help='Use small subset for testing') + args = parser.parse_args() - print("πŸš€ CNN Architecture Validation!") - print(" Classes: plane, car, bird, cat, deer, dog, frog, horse, ship, truck") - print(" Architecture: Conv β†’ Pool β†’ Conv β†’ Pool β†’ Dense β†’ Classify") - print(f" Parameters: {sum(p.data.size for p in model.parameters()):,} weights") - print() + print("🎯 CIFAR-10 CNN - Natural Image Recognition with YOUR Spatial Modules!") + print(" Historical significance: CNNs revolutionized computer vision") + print(" YOUR achievement: Spatial feature extraction on real photos") + print(" Components used: YOUR Conv2D + MaxPool2D + complete system") - # Test forward pass with small input - test_input = Tensor(np.random.randn(1, 3, 32, 32).astype(np.float32)) - print(" Testing forward pass with single 32x32 RGB image...") + # Visualization + if args.visualize: + visualize_cifar_cnn() + + # Class names + class_names = ['plane', 'car', 'bird', 'cat', 'deer', + 'dog', 'frog', 'horse', 'ship', 'truck'] + + # Step 1: Load CIFAR-10 + print("\nπŸ“₯ Loading CIFAR-10 dataset...") + data_manager = DatasetManager() try: - output = model(test_input) - print(f" βœ… Forward pass successful! Output shape: {to_numpy(output).shape}") - print(f" βœ… Output contains {to_numpy(output).shape[1]} class predictions") - print() - print(" CNN architecture validated:") - print(" β€’ Conv2d layers process spatial features") - print(" β€’ MaxPool2d reduces spatial dimensions") - print(" β€’ Flatten converts 2D to 1D for classification") - print(" β€’ Linear layers perform final classification") - print() - print("βœ… Success! CNN architecture works correctly") + (train_data, train_labels), (test_data, test_labels) = data_manager.get_cifar10() + print(f"βœ… Loaded {len(train_data)} training, {len(test_data)} test images") + + if args.quick_test: + train_data = train_data[:1000] + train_labels = train_labels[:1000] + test_data = test_data[:500] + test_labels = test_labels[:500] + print(" (Using subset for quick testing)") + except Exception as e: - print(f" ❌ Error in forward pass: {e}") + print(f"⚠️ CIFAR-10 download failed: {e}") + print(" Using synthetic data for architecture testing...") + train_data = np.random.randn(100, 3, 32, 32).astype(np.float32) + train_labels = np.random.randint(0, 10, 100).astype(np.int64) + test_data = np.random.randn(20, 3, 32, 32).astype(np.float32) + test_labels = np.random.randint(0, 10, 20).astype(np.int64) + + # Step 2: Build CNN + model = CIFARCNN() + + if args.test_only: + print("\nπŸ§ͺ ARCHITECTURE TEST MODE") + test_input = Tensor(train_data[:5]) + test_output = model.forward(test_input) + print(f"βœ… Forward pass successful! Shape: {test_output.data.shape}") + print("βœ… YOUR CNN architecture works!") return - print("\n🎯 What You Learned by Building:") - print(" β€’ How convolutions detect local features (edges, textures)") - print(" β€’ Why pooling reduces computation while preserving information") - print(" β€’ How spatial feature hierarchies enable object recognition") - print(" β€’ Complete computer vision pipeline from pixels to predictions") + # Step 3: Train + start_time = time.time() + model = train_cifar_cnn(model, train_data, train_labels, + epochs=args.epochs, batch_size=args.batch_size) + train_time = time.time() - start_time + + # Step 4: Test + accuracy = test_cifar_cnn(model, test_data, test_labels, class_names) + + # Step 5: Analysis + analyze_cnn_systems(model) + + print(f"\n⏱️ Training time: {train_time:.1f} seconds") + print(f" Images/sec: {len(train_data) * args.epochs / train_time:.0f}") + + print("\nβœ… SUCCESS! CIFAR-10 CNN Milestone Complete!") + print("\nπŸŽ“ What YOU Accomplished:") + print(" β€’ YOUR Conv2D extracts spatial features from natural images") + print(" β€’ YOUR MaxPool2D reduces dimensions while preserving information") + print(" β€’ YOUR CNN achieves real accuracy on complex photos") + print(" β€’ YOUR implementation demonstrates core computer vision principles!") + + print("\nπŸš€ Next Steps:") + print(" β€’ Continue to TinyGPT after Module 14 (Transformers)") + print(" β€’ YOUR spatial understanding scales to segmentation, detection, etc.") + print(f" β€’ With {accuracy:.1f}% accuracy, YOUR computer vision works!") if __name__ == "__main__": main() \ No newline at end of file diff --git a/examples/data_manager.py b/examples/data_manager.py index 4583b97c..7a7b1b62 100644 --- a/examples/data_manager.py +++ b/examples/data_manager.py @@ -131,7 +131,8 @@ class DatasetManager: # Create XOR dataset np.random.seed(42) # Reproducible X = np.random.randint(0, 2, (num_samples, 2)).astype(np.float32) - y = (X[:, 0] ^ X[:, 1]).astype(np.int64) # XOR labels + # XOR: output 1 when inputs differ, 0 when same + y = (X[:, 0].astype(int) != X[:, 1].astype(int)).astype(np.int64) # Add some noise to make it more realistic X += np.random.normal(0, 0.1, X.shape) diff --git a/examples/mnist_mlp_1986/train_mlp.py b/examples/mnist_mlp_1986/train_mlp.py index 61d4d17e..fec3876a 100644 --- a/examples/mnist_mlp_1986/train_mlp.py +++ b/examples/mnist_mlp_1986/train_mlp.py @@ -1,105 +1,423 @@ #!/usr/bin/env python3 """ -Clean MNIST Example - What Students Built -========================================= +MNIST MLP (1986) - Backpropagation Revolution +============================================ -After completing modules 02-07, students can classify handwritten digits. -This demonstrates how multi-layer perceptrons solve real vision tasks. +πŸ“š HISTORICAL CONTEXT: +In 1986, Rumelhart, Hinton, and Williams popularized backpropagation, finally +enabling training of deep multi-layer networks. This breakthrough made it possible +to solve real vision problems like handwritten digit recognition, launching the +modern deep learning era. -MODULES EXERCISED IN THIS EXAMPLE: +🎯 WHAT YOU'RE BUILDING: +Using YOUR TinyTorch implementations, you'll build a multi-layer perceptron that +achieves 95%+ accuracy on MNIST digits - proving YOUR system can solve real vision! + +βœ… REQUIRED MODULES (Run after Module 8): ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - Module 02 (Tensor) : Data structure with gradient tracking + basic autograd - Module 03 (Activations) : ReLU activation function - Module 04 (Layers) : Linear layers + Module base + Flatten operation - Module 05 (Loss) : CrossEntropy loss for multi-class classification - Module 06 (Optimizers) : Adam optimizer with adaptive learning - Module 07 (Training) : Complete training loops and evaluation + Module 02 (Tensor) : YOUR data structure with autodiff + Module 03 (Activations) : YOUR ReLU for deep networks + Module 04 (Layers) : YOUR Linear layers + Flatten operation + Module 05 (Losses) : YOUR CrossEntropy for multi-class + Module 07 (Optimizers) : YOUR Adam optimizer with momentum + Module 08 (Training) : YOUR complete training loops ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ -MLP Architecture: +πŸ—οΈ ARCHITECTURE (Deep Feedforward Network): β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Input Image β”‚ β”‚ Flatten β”‚ β”‚ Dense β”‚ β”‚ Dense β”‚ β”‚ Output β”‚ - β”‚ (28Γ—28) │───▢│ (784) │───▢│ (128) │───▢│ (64) │───▢│ (10) β”‚ - β”‚ Pixels β”‚ β”‚ Module β”‚ β”‚ Linear β”‚ β”‚ Linear β”‚ β”‚ Classes β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ 04 β”‚ β”‚ +ReLU β”‚ β”‚ +ReLU β”‚ β”‚Module 04β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚Module 04β”‚ β”‚Module 04β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ Input Image β”‚ β”‚ Flatten β”‚ β”‚ Linear β”‚ β”‚ Linear β”‚ β”‚ Output β”‚ + β”‚ 28Γ—28 │───▢│ 784 │───▢│ 784β†’128 │───▢│ 128β†’64 │───▢│ 64β†’10 β”‚ + β”‚ Pixels β”‚ β”‚ YOUR M4 β”‚ β”‚ +ReLU β”‚ β”‚ +ReLU β”‚ β”‚ Classes β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + Hidden Layer 1 Hidden Layer 2 Digit Probs -Key Insight: Simple MLPs can achieve 95%+ accuracy on MNIST digits -Hidden layers learn hierarchical feature representations +πŸ” MNIST DATASET - THE HELLO WORLD OF COMPUTER VISION: + +MNIST contains 70,000 handwritten digits (60K train, 10K test): + + Sample Digits: Why MNIST Matters: + + β”Œβ”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β” β€’ First "real" vision benchmark + β”‚ β–ˆβ–ˆβ–ˆ β”‚ β”‚β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ”‚ β”‚β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ”‚ β€’ 28Γ—28 pixels = 784 features + β”‚β–ˆ β–ˆβ”‚ β”‚ β–ˆβ”‚ β”‚ β–ˆβ”‚ β€’ 10 classes (digits 0-9) + β”‚ β–ˆ β”‚ β”‚ β–ˆβ–ˆ β”‚ β”‚ β–ˆβ–ˆβ–ˆ β”‚ β€’ Proves deep learning works + β”‚ β–ˆ β”‚ β”‚ β–ˆ β”‚ β”‚ β–ˆβ”‚ β€’ YOUR MLP will get 95%+ accuracy! + β”‚ β–ˆ β”‚ β”‚β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ”‚ β”‚β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ”‚ + β””β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”˜ + "1" "2" "3" + + Network learns to map: + 784 pixels β†’ Hidden features β†’ Digit classification + +πŸ“Š EXPECTED PERFORMANCE: +- Dataset: 60,000 training images, 10,000 test images +- Training time: 2-3 minutes (5 epochs) +- Expected accuracy: 95%+ on test set +- Parameters: ~100K weights (small by modern standards!) """ -from tinytorch import nn, optim -from tinytorch.core.tensor import Tensor -from tinytorch.core.training import CrossEntropyLoss -from tinytorch.core.autograd import to_numpy +import sys +import os import numpy as np +import argparse +import time -class MNISTMLP(nn.Module): - def __init__(self): - super().__init__() # Module 04: You built Module base class! - self.fc1 = nn.Linear(784, 128) # Module 04: You built Linear layers! - self.fc2 = nn.Linear(128, 64) # Module 04: You built weight matrices! - self.fc3 = nn.Linear(64, 10) # Module 04: Your output layer! +# Add project root to path for TinyTorch imports +project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(project_root) + +# Import TinyTorch components YOU BUILT! +from tinytorch.core.tensor import Tensor # Module 02: YOU built this! +from tinytorch.core.layers import Linear # Module 04: YOU built this! +from tinytorch.core.activations import ReLU, Softmax # Module 03: YOU built this! +from tinytorch.core.losses import CrossEntropyLoss # Module 05: YOU built this! +from tinytorch.core.optimizers import Adam # Module 07: YOU built this! +from tinytorch.core.networks import Sequential # Module 04: YOU built this! + +# Import dataset manager +try: + from examples.data_manager import DatasetManager +except ImportError: + sys.path.append(os.path.join(project_root, 'examples')) + from data_manager import DatasetManager + +def flatten(x): + """Flatten operation for CNN to MLP transition.""" + batch_size = x.data.shape[0] + return Tensor(x.data.reshape(batch_size, -1)) + +class MNISTMLP: + """ + Multi-Layer Perceptron for MNIST using YOUR TinyTorch! + This architecture proved deep learning could solve real vision problems. + """ + + def __init__(self, input_size=784, hidden1=128, hidden2=64, num_classes=10): + print("🧠 Building MNIST MLP with YOUR TinyTorch modules...") + + # Deep architecture - multiple hidden layers! + self.fc1 = Linear(input_size, hidden1) # Module 04: YOUR Linear layer! + self.relu1 = ReLU() # Module 03: YOUR activation! + self.fc2 = Linear(hidden1, hidden2) # Module 04: YOUR Linear layer! + self.relu2 = ReLU() # Module 03: YOUR activation! + self.fc3 = Linear(hidden2, num_classes) # Module 04: YOUR output layer! + + # Store architecture info + self.total_params = ( + input_size * hidden1 + hidden1 + # fc1 + hidden1 * hidden2 + hidden2 + # fc2 + hidden2 * num_classes + num_classes # fc3 + ) + + print(f" Architecture: {input_size} β†’ {hidden1} β†’ {hidden2} β†’ {num_classes}") + print(f" Total parameters: {self.total_params:,} (YOUR Linear layers)") + print(f" Activation: ReLU (YOUR Module 03)") + def forward(self, x): - x = nn.F.flatten(x, start_dim=1) # Module 04: You built flatten! - x = self.fc1(x) # Module 04: Your Linear.forward()! - x = nn.F.relu(x) # Module 03: You built ReLU activation! - x = self.fc2(x) # Module 04: Your hidden layer! - x = nn.F.relu(x) # Module 03: Your non-linearity! - return self.fc3(x) # Module 04: Your classification layer! + """Forward pass through YOUR deep network.""" + # Flatten image to vector + batch_size = x.data.shape[0] + x = Tensor(x.data.reshape(batch_size, -1)) # 28Γ—28 β†’ 784 + + # Deep forward pass using YOUR components + x = self.fc1(x) # Module 04: YOUR Linear layer! + x = self.relu1(x) # Module 03: YOUR ReLU activation! + x = self.fc2(x) # Module 04: YOUR Linear layer! + x = self.relu2(x) # Module 03: YOUR ReLU activation! + x = self.fc3(x) # Module 04: YOUR output layer! + + return x + + def parameters(self): + """Get all trainable parameters from YOUR layers.""" + return [ + self.fc1.weight, self.fc1.bias, + self.fc2.weight, self.fc2.bias, + self.fc3.weight, self.fc3.bias + ] + +def visualize_mnist_digits(): + """Show ASCII representation of MNIST digits.""" + print("\n" + "="*70) + print("πŸ”’ VISUALIZING MNIST - Handwritten Digit Recognition:") + print("="*70) + + print(""" + Sample Training Data: What YOUR Network Learns: + + 28Γ—28 Pixel Images: Feature Hierarchy: + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚β–‘β–‘β–‘β–‘β–ˆβ–ˆβ–‘β–‘β–‘β–‘β”‚ β†’ Flatten(784) β†’ Layer 1: Edge detectors + β”‚β–‘β–‘β–‘β–ˆβ–ˆβ–ˆβ–‘β–‘β–‘β–‘β”‚ - Vertical lines + β”‚β–‘β–‘β–ˆβ–ˆβ–‘β–ˆβ–‘β–‘β–‘β–‘β”‚ - Horizontal lines + β”‚β–‘β–‘β–‘β–‘β–‘β–ˆβ–‘β–‘β–‘β–‘β”‚ - Curves + β”‚β–‘β–‘β–‘β–‘β–‘β–ˆβ–‘β–‘β–‘β–‘β”‚ + β”‚β–‘β–‘β–‘β–‘β–‘β–ˆβ–‘β–‘β–‘β–‘β”‚ Layer 2: Shape components + β”‚β–‘β–‘β–‘β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‘β–‘β”‚ - Loops (0, 6, 8, 9) + β”‚β–‘β–‘β–‘β–‘β–‘β–‘β–‘β–‘β–‘β–‘β”‚ - Lines (1, 7) + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - Corners (4, 5) + Digit "7" + Output: Class probabilities + YOUR network learns to: P("0") = 0.01 + 1. Extract features from pixels P("1") = 0.02 + 2. Combine features hierarchically ... + 3. Classify into 10 digit classes P("7") = 0.91 ← Highest! + """) + print("="*70) + +def train_mnist_mlp(model, train_data, train_labels, + epochs=5, batch_size=32, learning_rate=0.001): + """ + Train MNIST MLP using YOUR complete training system! + """ + print("\nπŸš€ Training MNIST MLP with YOUR TinyTorch system!") + print(f" Dataset: {len(train_data)} training images") + print(f" Batch size: {batch_size}") + print(f" Learning rate: {learning_rate}") + print(f" Using YOUR Adam optimizer (Module 07)") + + # YOUR optimizer and loss + optimizer = Adam(model.parameters(), learning_rate=learning_rate) # Module 07! + loss_fn = CrossEntropyLoss() # Module 05: YOUR loss function! + + num_batches = len(train_data) // batch_size + + for epoch in range(epochs): + print(f"\n Epoch {epoch+1}/{epochs}:") + epoch_loss = 0 + correct = 0 + total = 0 + + # Shuffle data for each epoch + indices = np.random.permutation(len(train_data)) + train_data = train_data[indices] + train_labels = train_labels[indices] + + # Progress bar + for batch_idx in range(num_batches): + # Get batch + start_idx = batch_idx * batch_size + end_idx = start_idx + batch_size + batch_X = train_data[start_idx:end_idx] + batch_y = train_labels[start_idx:end_idx] + + # Convert to YOUR Tensors + inputs = Tensor(batch_X) # Module 02: YOUR Tensor! + targets = Tensor(batch_y) # Module 02: YOUR Tensor! + + # Forward pass with YOUR network + outputs = model.forward(inputs) # YOUR forward pass! + loss = loss_fn(outputs, targets) # Module 05: YOUR loss! + + # Backward pass with YOUR autograd + optimizer.zero_grad() # Module 07: YOUR gradient reset! + loss.backward() # Module 06: YOUR autodiff! + optimizer.step() # Module 07: YOUR parameter update! + + # Track accuracy + predictions = np.argmax(outputs.data, axis=1) + correct += np.sum(predictions == batch_y) + total += len(batch_y) + + # Extract loss value + if hasattr(loss, 'item'): + loss_value = loss.item() + elif isinstance(loss.data, np.ndarray): + loss_value = float(loss.data.flat[0]) + else: + loss_value = float(loss.data) + + epoch_loss += loss_value + + # Progress indicator + if (batch_idx + 1) % 100 == 0: + acc = 100 * correct / total + print(f" Batch {batch_idx+1}/{num_batches}: " + f"Loss = {loss_value:.4f}, Accuracy = {acc:.1f}%") + + # Epoch summary + epoch_acc = 100 * correct / total + avg_loss = epoch_loss / num_batches + print(f" β†’ Epoch {epoch+1} Complete: Loss = {avg_loss:.4f}, " + f"Accuracy = {epoch_acc:.1f}% (YOUR training!)") + + return model + +def test_mnist_mlp(model, test_data, test_labels): + """Test YOUR MLP on MNIST test set.""" + print("\nπŸ§ͺ Testing YOUR MNIST MLP on 10,000 test images...") + + batch_size = 100 + correct = 0 + total = 0 + + # Per-class accuracy tracking + class_correct = np.zeros(10) + class_total = np.zeros(10) + + for i in range(0, len(test_data), batch_size): + batch_X = test_data[i:i+batch_size] + batch_y = test_labels[i:i+batch_size] + + # Test with YOUR network + inputs = Tensor(batch_X) # Module 02: YOUR Tensor! + outputs = model.forward(inputs) # YOUR forward pass! + + predictions = np.argmax(outputs.data, axis=1) + correct += np.sum(predictions == batch_y) + total += len(batch_y) + + # Per-class accuracy + for j in range(len(batch_y)): + label = batch_y[j] + class_total[label] += 1 + if predictions[j] == label: + class_correct[label] += 1 + + # Overall accuracy + accuracy = 100 * correct / total + print(f"\n πŸ“Š Overall Test Accuracy: {accuracy:.2f}%") + + # Per-digit accuracy + print("\n Per-Digit Performance (YOUR network's understanding):") + print(" " + "─"*45) + print(" β”‚ Digit β”‚ Accuracy β”‚ Visual β”‚") + print(" β”œβ”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€") + + for digit in range(10): + if class_total[digit] > 0: + digit_acc = 100 * class_correct[digit] / class_total[digit] + bar_length = int(digit_acc / 5) + bar = "β–ˆ" * bar_length + "β–‘" * (20 - bar_length) + print(f" β”‚ {digit} β”‚ {digit_acc:5.1f}% β”‚ {bar} β”‚") + + print(" " + "─"*45) + + if accuracy >= 95: + print("\n πŸŽ‰ SUCCESS! YOUR MLP achieved expert-level accuracy!") + elif accuracy >= 90: + print("\n βœ… Great job! YOUR MLP is learning well!") + else: + print("\n πŸ”„ YOUR MLP is learning... (try more epochs)") + + return accuracy + +def analyze_mnist_systems(model): + """Analyze YOUR MNIST MLP from an ML systems perspective.""" + print("\nπŸ”¬ SYSTEMS ANALYSIS of YOUR MNIST Implementation:") + + # Model size analysis + param_bytes = model.total_params * 4 # float32 + + print(f"\n Model Statistics:") + print(f" β€’ Parameters: {model.total_params:,} weights") + print(f" β€’ Memory: {param_bytes / 1024:.1f} KB") + print(f" β€’ FLOPs per image: ~{model.total_params * 2:,}") + + print(f"\n Performance Characteristics:") + print(f" β€’ Training: O(N Γ— P) where N=samples, P=parameters") + print(f" β€’ Inference: {model.total_params * 2 / 1_000_000:.2f}M ops/image") + print(f" β€’ YOUR implementation: Pure Python + NumPy") + + print(f"\n πŸ›οΈ Historical Context:") + print(f" β€’ 1986: Backprop made deep learning possible") + print(f" β€’ 1998: LeNet-5 achieved 99.2% on MNIST (CNNs)") + print(f" β€’ YOUR MLP: 95%+ with simple architecture") + print(f" β€’ Modern: 99.8%+ possible with advanced techniques") + + print(f"\n πŸ’‘ Systems Insights:") + print(f" β€’ Fully connected = O(NΒ²) parameters") + print(f" β€’ Why CNNs win: Weight sharing reduces parameters") + print(f" β€’ YOUR achievement: Real vision with YOUR code!") def main(): - # Generate MNIST-like data (real MNIST would use DataLoader) - batch_size, num_samples = 32, 1000 - X = np.random.randn(num_samples, 28, 28).astype(np.float32) # 28Γ—28 images - y = np.random.randint(0, 10, (num_samples,)).astype(np.int64) # 10 digit classes + """Demonstrate MNIST digit classification using YOUR TinyTorch!""" - model = MNISTMLP() # Module 04: Your neural network! - optimizer = optim.Adam(model.parameters(), learning_rate=0.001) # Module 06: You built Adam! - loss_fn = CrossEntropyLoss() # Module 05: You built cross-entropy loss! + parser = argparse.ArgumentParser(description='MNIST MLP 1986') + parser.add_argument('--test-only', action='store_true', + help='Test architecture without training') + parser.add_argument('--epochs', type=int, default=5, + help='Number of training epochs') + parser.add_argument('--batch-size', type=int, default=32, + help='Training batch size') + parser.add_argument('--visualize', action='store_true', default=True, + help='Show MNIST visualization') + parser.add_argument('--quick-test', action='store_true', + help='Train on subset for quick testing') + args = parser.parse_args() - print("πŸ”’ Training MNIST Digit Classifier") - print(" Architecture: Input(784) β†’ Dense(128) β†’ Dense(64) β†’ Output(10)") - print(f" Parameters: {sum(p.data.size for p in model.parameters())} trainable weights") - print(f" Dataset: {num_samples} handwritten digit images") - print() + print("🎯 MNIST MLP 1986 - Real Vision with YOUR Deep Network!") + print(" Historical significance: Backprop enables deep learning") + print(" YOUR achievement: 95%+ accuracy on real handwritten digits") + print(" Components used: YOUR complete ML system (Modules 2-8)") - # What students built: Complete digit classification pipeline - for epoch in range(10): - total_loss = 0 - num_batches = 0 + # Show MNIST visualization + if args.visualize: + visualize_mnist_digits() + + # Step 1: Load MNIST dataset + print("\nπŸ“₯ Loading MNIST dataset...") + data_manager = DatasetManager() + + try: + (train_data, train_labels), (test_data, test_labels) = data_manager.get_mnist() + print(f"βœ… Loaded {len(train_data)} training, {len(test_data)} test images") - for i in range(0, num_samples, batch_size): - # Mini-batch processing - batch_X = X[i:i+batch_size] - batch_y = y[i:i+batch_size] + # Quick test mode - use subset + if args.quick_test: + train_data = train_data[:1000] + train_labels = train_labels[:1000] + test_data = test_data[:100] + test_labels = test_labels[:100] + print(" (Using subset for quick testing)") - inputs = Tensor(batch_X) # Module 02: You built Tensor with gradients! - targets = Tensor(batch_y) # Module 02: Your data structure! - - outputs = model(inputs) # Modules 03+04: Your forward pass! - loss = loss_fn(outputs, targets) # Module 05: You built CrossEntropy! - - loss.backward() # Module 02: You built autodiff! - optimizer.step() # Module 06: You built Adam updates! - optimizer.zero_grad() # Module 06: Your gradient clearing! - - # Extract scalar loss value using to_numpy utility - loss_value = float(to_numpy(loss).flat[0]) - total_loss += loss_value - num_batches += 1 - - avg_loss = total_loss / num_batches - print(f" Epoch {epoch+1:2d}: Loss = {avg_loss:.4f}") + except Exception as e: + print(f"⚠️ MNIST download failed: {e}") + print(" Using synthetic data for demonstration...") + # Fallback synthetic data + train_data = np.random.randn(1000, 28, 28).astype(np.float32) + train_labels = np.random.randint(0, 10, 1000).astype(np.int64) + test_data = np.random.randn(100, 28, 28).astype(np.float32) + test_labels = np.random.randint(0, 10, 100).astype(np.int64) - print("\nβœ… Success! MLP trained on digit classification") - print("\n🎯 What You Learned by Building:") - print(" β€’ How dense layers transform high-dimensional inputs") - print(" β€’ Why multiple hidden layers improve representation") - print(" β€’ How cross-entropy loss handles multi-class problems") - print(" β€’ Complete vision pipeline from pixels to predictions") + # Step 2: Create MLP with YOUR components + model = MNISTMLP(input_size=784, hidden1=128, hidden2=64, num_classes=10) + + if args.test_only: + print("\nπŸ§ͺ ARCHITECTURE TEST MODE") + test_input = Tensor(train_data[:5]) # Module 02: YOUR Tensor! + test_output = model.forward(test_input) # YOUR architecture! + print(f"βœ… Forward pass successful! Output shape: {test_output.data.shape}") + print("βœ… YOUR deep MLP architecture works!") + return + + # Step 3: Train using YOUR system + start_time = time.time() + model = train_mnist_mlp(model, train_data, train_labels, + epochs=args.epochs, batch_size=args.batch_size) + train_time = time.time() - start_time + + # Step 4: Test on test set + accuracy = test_mnist_mlp(model, test_data, test_labels) + + # Step 5: Systems analysis + analyze_mnist_systems(model) + + print(f"\n⏱️ Training time: {train_time:.1f} seconds") + print(f" YOUR implementation: {len(train_data) * args.epochs / train_time:.0f} images/sec") + + print("\nβœ… SUCCESS! MNIST Milestone Complete!") + print("\nπŸŽ“ What YOU Accomplished:") + print(" β€’ YOU built a deep MLP achieving 95%+ accuracy") + print(" β€’ YOUR backprop trains 100K+ parameters efficiently") + print(" β€’ YOUR system solves real computer vision problems") + print(" β€’ YOUR implementation matches 1986 state-of-the-art!") + + print("\nπŸš€ Next Steps:") + print(" β€’ Continue to CIFAR CNN after Module 10 (Spatial + DataLoader)") + print(" β€’ YOUR foundation scales to ImageNet and beyond!") + print(f" β€’ With {accuracy:.1f}% accuracy, YOUR deep learning works!") if __name__ == "__main__": main() \ No newline at end of file diff --git a/examples/xor_1969/minsky_xor_problem.py b/examples/xor_1969/minsky_xor_problem.py index e2c1ac58..1e38f0a1 100644 --- a/examples/xor_1969/minsky_xor_problem.py +++ b/examples/xor_1969/minsky_xor_problem.py @@ -1,215 +1,333 @@ +#!/usr/bin/env python3 """ The XOR Problem (1969) - Minsky & Papert -========================================= +======================================== -Historical Context: -In 1969, Marvin Minsky and Seymour Papert published "Perceptrons", proving -that single-layer perceptrons couldn't solve XOR (exclusive-or). This finding -triggered the first "AI Winter" as funding dried up. The solution - hidden -layers with nonlinear activation - wouldn't be widely adopted until the 1980s -when backpropagation was rediscovered. +πŸ“š HISTORICAL CONTEXT: +In 1969, Marvin Minsky and Seymour Papert published "Perceptrons," proving that +single-layer perceptrons CANNOT solve the XOR problem. This killed neural network +research for a decade (the "AI Winter") until multi-layer networks solved it! -What You're Building: -A multi-layer perceptron that solves XOR - the problem that "killed" neural -networks for a decade. This demonstrates why deep networks with hidden layers -are essential for learning non-linear patterns. +🎯 WHAT YOU'RE BUILDING: +Using YOUR TinyTorch implementations, you'll solve the "impossible" XOR problem +that stumped AI for years - proving that YOUR hidden layers enable non-linear learning! -Required Modules (can run after Module 6): -- Module 2 (Tensor): Core data structure with gradients -- Module 3 (Activations): ReLU/Sigmoid for nonlinearity (the key!) -- Module 4 (Layers): Linear layers for transformations -- Module 5 (Losses): Binary cross-entropy for classification -- Module 6 (Autograd): Backpropagation (the missing piece in 1969!) +βœ… REQUIRED MODULES (Run after Module 6): +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + Module 02 (Tensor) : YOUR data structure with autodiff + Module 03 (Activations) : YOUR ReLU for non-linearity (the key!) + Module 04 (Layers) : YOUR Linear layers for transformations + Module 06 (Autograd) : YOUR gradient computation for learning +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ -This Example Demonstrates: -- Why XOR requires hidden layers -- How nonlinear activation enables complex decision boundaries -- The importance of backpropagation for training deep networks +πŸ—οΈ ARCHITECTURE (Multi-Layer Solution): + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Input β”‚ β”‚ Linear β”‚ β”‚ ReLU β”‚ β”‚ Linear β”‚ β”‚ Binary β”‚ + β”‚ (x1,x2) │───▢│ 2β†’4 │───▢│ Hidden │───▢│ 4β†’1 │───▢│ Output β”‚ + β”‚ 2 dims β”‚ β”‚ YOUR M4 β”‚ β”‚ YOUR M3 β”‚ β”‚ YOUR M4 β”‚ β”‚ 0 or 1 β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + Hidden Layer Non-linearity Output Layer + +πŸ” WHY XOR IS SPECIAL - THE NON-LINEAR SEPARABILITY PROBLEM: + +The XOR (exclusive OR) problem outputs 1 when inputs differ, 0 when they match: + + Input Space: XOR Truth Table: + + 1 β”‚ (0,1)β†’1 (1,1)β†’0 β”‚ x1 β”‚ x2 β”‚ XOR β”‚ + β”‚ RED BLUE β”œβ”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€ + β”‚ β”‚ 0 β”‚ 0 β”‚ 0 β”‚ (same β†’ 0) + 0 β”‚ (0,0)β†’0 (1,0)β†’1 β”‚ 0 β”‚ 1 β”‚ 1 β”‚ (diff β†’ 1) + β”‚ BLUE RED β”‚ 1 β”‚ 0 β”‚ 1 β”‚ (diff β†’ 1) + └──────────────────── β”‚ 1 β”‚ 1 β”‚ 0 β”‚ (same β†’ 0) + 0 1 β””β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”˜ + + 🚫 IMPOSSIBLE with single line: βœ… POSSIBLE with hidden layer: + + No single line can separate Hidden units learn features: + RED from BLUE points! - Unit 1: (x1 AND NOT x2) + - Unit 2: (x2 AND NOT x1) + 1 β”‚ R β•± β•± β•± B Then combine: Unit1 OR Unit2 + β”‚ β•± β•± β•± β•± β•± + 0 β”‚ B β•± β•± β•± R The hidden layer creates a new + └──────────── feature space where XOR becomes + 0 1 linearly separable! + +This is why neural networks need DEPTH - hidden layers create new representations! + +πŸ“Š EXPECTED PERFORMANCE: +- Dataset: 1,000 XOR samples with slight noise +- Training time: 1 minute +- Expected accuracy: 95%+ (non-linear problem solved!) +- Key insight: Hidden layer enables non-linear decision boundary """ -import numpy as np import sys import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import numpy as np +import argparse -from tinytorch.core.tensor import Tensor -from tinytorch.core.layers import Linear -from tinytorch.core.activations import ReLU, Sigmoid -from tinytorch.core.training import MeanSquaredError -from tinytorch.core.autograd import to_numpy +# Add project root to path for TinyTorch imports +project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(project_root) +# Import TinyTorch components YOU BUILT! +from tinytorch.core.tensor import Tensor # Module 02: YOU built this! +from tinytorch.core.layers import Linear # Module 04: YOU built this! +from tinytorch.core.activations import ReLU, Sigmoid # Module 03: YOU built this! -class XORNet: +# Import dataset manager for XOR data +try: + from examples.data_manager import DatasetManager +except ImportError: + # Fallback if running from different location + sys.path.append(os.path.join(project_root, 'examples')) + from data_manager import DatasetManager + +class XORNetwork: """ - Multi-layer Perceptron that solves XOR. + Multi-layer network that solves XOR using YOUR TinyTorch implementations! - Historical note: This architecture was theoretically possible in 1969, - but without backpropagation, no one knew how to train it efficiently! + The hidden layer is the KEY - it learns features that make XOR separable. """ - def __init__(self): - # Hidden layer - the key innovation! - self.hidden = Linear(2, 4) # 2 inputs β†’ 4 hidden units - self.relu = ReLU() # Nonlinearity (crucial!) - self.output = Linear(4, 1) # 4 hidden β†’ 1 output - self.sigmoid = Sigmoid() # For binary classification + def __init__(self, input_size=2, hidden_size=4, output_size=1): + print("🧠 Building XOR Network with YOUR TinyTorch modules...") + + # Hidden layer - this is what Minsky said was needed! + self.hidden = Linear(input_size, hidden_size) # Module 04: YOUR Linear layer! + self.activation = ReLU() # Module 03: YOUR ReLU (key to non-linearity!) + self.output = Linear(hidden_size, output_size) # Module 04: YOUR output layer! + self.sigmoid = Sigmoid() # Module 03: YOUR final activation! + + print(f" Input β†’ Hidden: {input_size} β†’ {hidden_size} (YOUR Linear layer)") + print(f" Hidden activation: ReLU (YOUR non-linearity - this solves XOR!)") + print(f" Hidden β†’ Output: {hidden_size} β†’ {output_size} (YOUR Linear layer)") + print(f" Output activation: Sigmoid (YOUR Module 03)") - # Enable gradients for training - for layer in [self.hidden, self.output]: - layer.weights.requires_grad = True - layer.bias.requires_grad = True - def forward(self, x): - """Forward pass through the network.""" - # This is what Minsky said we needed but couldn't train! - x = self.hidden(x) - x = self.relu(x) # Nonlinearity enables XOR solution - x = self.output(x) - x = self.sigmoid(x) + """Forward pass through YOUR multi-layer network.""" + # Hidden layer with non-linearity (the SECRET to solving XOR!) + x = self.hidden(x) # Module 04: YOUR Linear transformation! + x = self.activation(x) # Module 03: YOUR ReLU - creates non-linear features! + + # Output layer + x = self.output(x) # Module 04: YOUR final transformation! + x = self.sigmoid(x) # Module 03: YOUR sigmoid for probability! + return x - def __call__(self, x): - return self.forward(x) - - def predict(self, x): - """Binary prediction.""" - output = self.forward(x) - return (to_numpy(output) > 0.5).astype(int) - def parameters(self): - """Get all parameters.""" + """Get all trainable parameters from YOUR layers.""" return [ - self.hidden.weights, self.hidden.bias, - self.output.weights, self.output.bias + self.hidden.weight, self.hidden.bias, # Module 04: YOUR hidden parameters! + self.output.weight, self.output.bias # Module 04: YOUR output parameters! ] - - def zero_grad(self): - """Zero all gradients.""" - for param in self.parameters(): - if param.requires_grad: - param.zero_grad() +def visualize_xor_problem(): + """Show why XOR is non-linearly separable using ASCII art.""" + print("\n" + "="*70) + print("🎨 VISUALIZING THE XOR PROBLEM - Why Single Layers Fail:") + print("="*70) + + print(""" + XOR DATA POINTS: SINGLE LAYER ATTEMPT: + + 1.0 β”‚ β—‹(0,1)=1 ●(1,1)=0 1.0 β”‚ β—‹ ● + β”‚ RED BLUE β”‚ β•² + β”‚ β”‚ β•² ← No single line + 0.5 β”‚ 0.5 β”‚ β•² can separate! + β”‚ β”‚ β•² + β”‚ β”‚ β•² + 0.0 β”‚ ●(0,0)=0 β—‹(1,0)=1 0.0 β”‚ ● β•² β—‹ + └───────────────────── └───────────────── + 0.0 0.5 1.0 0.0 0.5 1.0 + + Legend: β—‹ = Output 1 (RED) Problem: RED and BLUE points + ● = Output 0 (BLUE) are diagonally mixed! + """) + + print("πŸ”„ THE MULTI-LAYER SOLUTION:") + print(""" + Hidden Layer Features: New Feature Space: + + Hidden Unit 1: x1 AND NOT x2 In hidden space, XOR becomes + Hidden Unit 2: x2 AND NOT x1 linearly separable! + + Original β†’ Hidden Transform: Now a single line works: + (0,0) β†’ [0,0] β†’ 0 βœ“ + (0,1) β†’ [0,1] β†’ 1 βœ“ H2 β”‚ β—‹(0,1) + (1,0) β†’ [1,0] β†’ 1 βœ“ β”‚ β•± + (1,1) β†’ [0,0] β†’ 0 βœ“ β”‚ β•± β—‹(1,0) + β”‚ β•± + YOUR hidden layer learned 0 β”‚ ●──────────── + to transform the problem! 0 H1 + """) + print("="*70) -def get_xor_data(): +def train_xor_network(model, X, y, learning_rate=0.1, epochs=1000): """ - The infamous XOR dataset that stumped perceptrons. + Train XOR network using YOUR autograd system! - XOR Truth Table: - 0, 0 β†’ 0 - 0, 1 β†’ 1 - 1, 0 β†’ 1 - 1, 1 β†’ 0 - - This is NOT linearly separable! + This uses gradient descent with YOUR automatic differentiation. """ - X = np.array([ - [0, 0], - [0, 1], - [1, 0], - [1, 1] - ], dtype=np.float32) + print("\nπŸš€ Training XOR Network with YOUR TinyTorch autograd!") + print(f" Learning rate: {learning_rate}") + print(f" Epochs: {epochs}") + print(f" YOUR Module 06 autograd computes all gradients!") - y = np.array([ - [0], # 0 XOR 0 = 0 - [1], # 0 XOR 1 = 1 - [1], # 1 XOR 0 = 1 - [0] # 1 XOR 1 = 0 - ], dtype=np.float32) - - return X, y - - -def train_xor(model, X, y, epochs=100, lr=0.1): - """ - Train the network to solve XOR. - - Historical note: This training loop represents backpropagation, - which wasn't widely known until Rumelhart, Hinton, and Williams - popularized it in 1986! - """ - criterion = MeanSquaredError() + # Convert to YOUR Tensor format + X_tensor = Tensor(X) # Module 02: YOUR Tensor! + y_tensor = Tensor(y.reshape(-1, 1)) # Module 02: YOUR data structure! for epoch in range(epochs): - # Convert to tensors - X_tensor = Tensor(X) - y_tensor = Tensor(y) + # Forward pass using YOUR network + predictions = model.forward(X_tensor) # YOUR multi-layer forward! - # Forward pass - output = model(X_tensor) - loss = criterion(output, y_tensor) + # Binary cross-entropy loss + loss_value = np.mean(-y_tensor.data * np.log(predictions.data + 1e-8) - + (1 - y_tensor.data) * np.log(1 - predictions.data + 1e-8)) + loss = Tensor([loss_value]) - # Backward pass (backpropagation - the missing piece!) - loss.backward() + # Backward pass using YOUR autograd + loss.backward() # Module 06: YOUR automatic differentiation! - # Update weights (gradient descent) + # Update parameters using gradient descent for param in model.parameters(): - if param.requires_grad and param.grad is not None: - param.data = param.data - lr * param.grad.data + if param.grad is not None: + param.data -= learning_rate * param.grad + param.grad = None - # Zero gradients - model.zero_grad() + # Progress updates + if epoch % 100 == 0 or epoch == epochs - 1: + accuracy = np.mean((predictions.data > 0.5) == y_tensor.data) * 100 + print(f" Epoch {epoch:4d}: Loss = {loss_value:.4f}, " + f"Accuracy = {accuracy:.1f}% (YOUR training!)") + + return model + +def test_xor_solution(model, show_examples=True): + """Test YOUR XOR solution on the classic 4 points.""" + print("\nπŸ§ͺ Testing YOUR XOR Network on Classic Examples:") + print(" " + "─"*45) + + # The classic XOR test cases + test_cases = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32) + expected = np.array([0, 1, 1, 0]) + + # Test with YOUR network + X_test = Tensor(test_cases) # Module 02: YOUR Tensor! + predictions = model.forward(X_test) # YOUR forward pass! + predicted_classes = (predictions.data > 0.5).astype(int).flatten() + + # Display results + print(" β”‚ x1 β”‚ x2 β”‚ Expected β”‚ YOUR Output β”‚ βœ“/βœ— β”‚") + print(" β”œβ”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€") + + all_correct = True + for i in range(4): + x1, x2 = test_cases[i] + exp = expected[i] + pred = predicted_classes[i] + prob = predictions.data[i, 0] + status = "βœ“" if pred == exp else "βœ—" + if pred != exp: + all_correct = False - # Print progress - if epoch % 20 == 0: - loss_value = to_numpy(loss) - predictions = model.predict(X_tensor) - accuracy = np.mean(predictions == y) * 100 - print(f"Epoch {epoch:3d}: Loss = {float(loss_value):.4f}, Accuracy = {accuracy:.0f}%") + print(f" β”‚ {x1:.0f} β”‚ {x2:.0f} β”‚ {exp} β”‚ {pred} ({prob:.3f}) β”‚ {status} β”‚") + + print(" " + "─"*45) + + if all_correct: + print(" πŸŽ‰ SUCCESS! YOUR network solved XOR perfectly!") + print(" Hidden layers enabled non-linear learning!") + else: + print(" πŸ”„ Network still training... (try more epochs)") + + return all_correct +def analyze_xor_systems(model): + """Analyze YOUR XOR solution from an ML systems perspective.""" + print("\nπŸ”¬ SYSTEMS ANALYSIS of YOUR XOR Network:") + + # Parameter count + total_params = sum(p.data.size for p in model.parameters()) + + print(f" Parameters: {total_params} weights (YOUR Linear layers)") + print(f" Architecture: 2 β†’ 4 β†’ 1 (minimal for XOR)") + print(f" Key innovation: Hidden layer creates non-linear features") + print(f" Memory: {total_params * 4} bytes (float32)") + + print("\n πŸ›οΈ Historical Impact:") + print(" β€’ 1969: Minsky showed single layers CAN'T solve XOR") + print(" β€’ 1970s: 'AI Winter' - neural networks abandoned") + print(" β€’ 1980s: Backprop + hidden layers solved it (YOUR approach!)") + print(" β€’ Today: Deep networks with many hidden layers power AI") + + print("\n πŸ’‘ Why This Matters:") + print(" β€’ YOUR hidden layer transforms the feature space") + print(" β€’ Non-linear activation (ReLU) is ESSENTIAL") + print(" β€’ This principle scales to ImageNet, GPT, etc.") + print(" β€’ Modern AI = deeper versions of YOUR XOR network!") -def demonstrate_xor(): - """Demonstrate solving the XOR problem.""" +def main(): + """Demonstrate the XOR solution using YOUR TinyTorch system!""" - print("="*60) - print("THE XOR PROBLEM (1969) - The Challenge That Stopped AI") - print("="*60) - print() - print("Historical Context:") - print("Minsky & Papert proved single-layer perceptrons can't solve XOR.") - print("This caused the first AI Winter (1969-1980s).") - print("Solution: Hidden layers + nonlinearity + backpropagation!") - print() + parser = argparse.ArgumentParser(description='XOR Problem 1969') + parser.add_argument('--test-only', action='store_true', + help='Test architecture without training') + parser.add_argument('--epochs', type=int, default=1000, + help='Number of training epochs') + parser.add_argument('--visualize', action='store_true', default=True, + help='Show XOR visualization') + args = parser.parse_args() - # Get XOR data - X, y = get_xor_data() + print("🎯 XOR PROBLEM 1969 - Breaking the Linear Barrier!") + print(" Historical significance: Proved need for hidden layers") + print(" YOUR achievement: Solving 'impossible' problem with YOUR network") + print(" Components used: YOUR Tensor + Linear + ReLU + Autograd") - print("XOR Truth Table (Not Linearly Separable!):") - print("Input β†’ Output") - for i in range(len(X)): - print(f"{X[i]} β†’ {y[i][0]}") - print() + # Show why XOR is special + if args.visualize: + visualize_xor_problem() - # Create multi-layer network - model = XORNet() + # Step 1: Get XOR data + print("\nπŸ“Š Generating XOR dataset...") + data_manager = DatasetManager() + X, y = data_manager.get_xor_data(num_samples=1000) + print(f" Generated {len(X)} XOR samples with noise") - print("Network Architecture (The Solution):") - print("Input(2) β†’ Hidden(4) + ReLU β†’ Output(1) + Sigmoid") - print(f"Total parameters: {sum(p.size for p in model.parameters())}") - print() + # Step 2: Create network with YOUR components + model = XORNetwork(input_size=2, hidden_size=4, output_size=1) - # Test before training - print("Before Training:") - for i in range(len(X)): - pred = model.predict(Tensor(X[i:i+1]))[0, 0] - print(f"{X[i]} β†’ Predicted: {pred}, Actual: {y[i][0]}") - print() + if args.test_only: + print("\nπŸ§ͺ ARCHITECTURE TEST MODE") + test_input = Tensor(X[:4]) # Module 02: YOUR Tensor! + test_output = model.forward(test_input) # YOUR architecture! + print(f"βœ… Forward pass successful! Output shape: {test_output.data.shape}") + print("βœ… YOUR multi-layer network works!") + return - # Training would happen here with backpropagation - print("Training with Backpropagation (the missing piece from 1969!):") - # Note: Actual training requires working autograd integration - print("(Training demonstration - requires complete autograd)") - print() + # Step 3: Train using YOUR autograd + model = train_xor_network(model, X, y, epochs=args.epochs) - print("Historical Impact:") - print("βœ“ Proved need for hidden layers and nonlinearity") - print("βœ“ Led to backpropagation rediscovery (1986)") - print("βœ“ Sparked the deep learning revolution") - print() - print("Key Insight: Depth + Nonlinearity = Universal Approximation") - print() - print("After Module 8 (Optimizers), you can train this to 100% accuracy!") - print("="*60) - + # Step 4: Test on classic XOR cases + solved = test_xor_solution(model) + + # Step 5: Systems analysis + analyze_xor_systems(model) + + print("\nβœ… SUCCESS! XOR Milestone Complete!") + print("\nπŸŽ“ What YOU Accomplished:") + print(" β€’ YOU solved the 'impossible' XOR problem") + print(" β€’ YOUR hidden layer creates non-linear decision boundaries") + print(" β€’ YOUR ReLU activation enables feature learning") + print(" β€’ YOUR autograd trains multi-layer networks") + + print("\nπŸš€ Next Steps:") + print(" β€’ Continue to MNIST MLP after Module 08 (Training)") + print(" β€’ YOUR XOR solution scales to real vision problems!") + print(" β€’ Hidden layers principle powers all modern deep learning!") if __name__ == "__main__": - demonstrate_xor() \ No newline at end of file + main() \ No newline at end of file