diff --git a/milestones/02_xor_crisis_1969/xor_crisis.py b/milestones/02_xor_crisis_1969/xor_crisis.py new file mode 100644 index 00000000..f2658586 --- /dev/null +++ b/milestones/02_xor_crisis_1969/xor_crisis.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +""" +The XOR Crisis (1969) - Minsky & Papert +======================================== + +📚 HISTORICAL CONTEXT: +In 1969, Marvin Minsky and Seymour Papert published "Perceptrons," mathematically +proving that single-layer perceptrons CANNOT solve the XOR problem. This revelation +killed neural network research funding for over a decade - the "AI Winter." + +🎯 MILESTONE 2 PART 1: THE CRISIS (After Modules 01-04) + +This demonstrates WHY the crisis happened. Watch a perceptron fail to learn XOR, +no matter how much we train it. This is what convinced the world that neural +networks were a dead end. + +✅ REQUIRED MODULES (Run after Module 04): +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + Module 01 (Tensor) : YOUR data structure + Module 02 (Activations) : YOUR sigmoid activation + Module 03 (Layers) : YOUR Linear layer (single layer only!) + Module 04 (Losses) : YOUR loss function +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +🔍 THE XOR PROBLEM - Why It's Impossible for Single Layers: + +XOR (Exclusive OR) outputs 1 when inputs DIFFER, 0 when they're the SAME: + + Visual Representation: Truth Table: + + 1 │ ○ (0,1) ● (1,1) │ x₁ │ x₂ │ XOR │ + │ [1] [0] ├────┼────┼─────┤ + │ │ 0 │ 0 │ 0 │ ← same + 0 │ ● (0,0) ○ (1,0) │ 0 │ 1 │ 1 │ ← different + │ [0] [1] │ 1 │ 0 │ 1 │ ← different + └───────────────── │ 1 │ 1 │ 0 │ ← same + 0 1 └────┴────┴─────┘ + +🚫 THE FUNDAMENTAL PROBLEM: + +No single straight line can separate the points! + + Try drawing a line: Any line fails: + + 1 │ 1 ╱ ╱ ╱ 0 1 │ 1 ╲ ╲ ╲ 0 + │ ╱ ╱ ╱ ╱ ╱ │ ╲ ╲ ╲ ╲ ╲ + 0 │ 0 ╱ ╱ ╱ 1 0 │ 0 ╲ ╲ ╲ 1 + └──────────── └──────────── + Line can't separate! Still wrong! + +This is called "non-linear separability" - the problem that ended the first +neural network era. + +⚠️ WHAT TO EXPECT: +- Training will complete (no errors) +- Loss will NOT decrease (stuck around 0.69) +- Accuracy will NOT improve (stuck at 50% - random guessing) +- The model CANNOT learn XOR - Minsky was right! + +This failure launched the AI Winter. Part 2 (xor_solved.py) shows the solution! +""" + +import sys +import os +import numpy as np +from rich.console import Console +from rich.table import Table +from rich.panel import Panel + +# Add project root to path +sys.path.insert(0, os.getcwd()) + +# Import TinyTorch components YOU BUILT! +from tinytorch import Tensor, Linear, Sigmoid, BinaryCrossEntropyLoss, SGD + +console = Console() + + +# ============================================================================ +# 🎲 DATA GENERATION +# ============================================================================ + +def generate_xor_data(n_samples=100): + """ + Generate XOR dataset with slight noise. + + Returns clean XOR data to clearly demonstrate the failure. + """ + console.print("\n[bold]Step 1:[/bold] Generating XOR dataset...") + + # Generate each XOR case with repetition + samples_per_case = n_samples // 4 + + # Case 1: (0,0) → 0 + x1 = np.random.randn(samples_per_case, 2) * 0.1 + np.array([0.0, 0.0]) + y1 = np.zeros((samples_per_case, 1)) + + # Case 2: (0,1) → 1 + x2 = np.random.randn(samples_per_case, 2) * 0.1 + np.array([0.0, 1.0]) + y2 = np.ones((samples_per_case, 1)) + + # Case 3: (1,0) → 1 + x3 = np.random.randn(samples_per_case, 2) * 0.1 + np.array([1.0, 0.0]) + y3 = np.ones((samples_per_case, 1)) + + # Case 4: (1,1) → 0 + x4 = np.random.randn(samples_per_case, 2) * 0.1 + np.array([1.0, 1.0]) + y4 = np.zeros((samples_per_case, 1)) + + # Combine and shuffle + X = np.vstack([x1, x2, x3, x4]) + y = np.vstack([y1, y2, y3, y4]) + + indices = np.random.permutation(n_samples) + X = X[indices] + y = y[indices] + + console.print(f" ✓ Created [bold]{n_samples}[/bold] XOR samples") + console.print(f" ✓ Problem: [bold red]NOT linearly separable![/bold red]") + + return Tensor(X), Tensor(y) + + +# ============================================================================ +# 🏗️ SINGLE-LAYER PERCEPTRON (The Architecture That FAILS) +# ============================================================================ + +class SingleLayerPerceptron: + """ + Single-layer perceptron - the architecture that CANNOT solve XOR. + + This is the exact architecture Minsky proved insufficient in 1969. + """ + + def __init__(self): + self.linear = Linear(2, 1) + self.sigmoid = Sigmoid() + + def __call__(self, x): + """Forward pass: Input → Linear → Sigmoid → Output""" + logits = self.linear(x) + output = self.sigmoid(logits) + return output + + def parameters(self): + """Return trainable parameters.""" + return self.linear.parameters() + + +# ============================================================================ +# 🔥 TRAINING FUNCTION (That Will FAIL on XOR) +# ============================================================================ + +def train_perceptron(model, X, y, epochs=100, lr=0.1): + """ + Train single-layer perceptron on XOR. + + This will fail - the model CANNOT learn XOR. + """ + loss_fn = BinaryCrossEntropyLoss() + optimizer = SGD(model.parameters(), lr=lr) + + console.print("\n[bold cyan]🔥 Attempting to Train on XOR...[/bold cyan]") + console.print("[dim](This will fail - Minsky proved it mathematically!)[/dim]\n") + + history = {"loss": [], "accuracy": []} + + for epoch in range(epochs): + # Forward pass + predictions = model(X) + loss = loss_fn(predictions, y) + + # Backward pass + loss.backward() + + # Update weights + optimizer.step() + optimizer.zero_grad() + + # Calculate accuracy + pred_classes = (predictions.data > 0.5).astype(int) + accuracy = (pred_classes == y.data).mean() + + history["loss"].append(loss.data.item()) + history["accuracy"].append(accuracy) + + # Print progress every 20 epochs + if (epoch + 1) % 20 == 0: + console.print(f"Epoch {epoch+1:3d}/{epochs} Loss: {loss.data:.4f} Accuracy: {accuracy:.1%}") + + console.print("\n[bold yellow]⚠️ Training Complete (But Failed to Learn!)[/bold yellow]\n") + + return history + + +# ============================================================================ +# 📊 EVALUATION & VISUALIZATION +# ============================================================================ + +def evaluate_and_explain(model, X, y, history): + """Evaluate the failed model and explain WHY it failed.""" + + predictions = model(X) + pred_classes = (predictions.data > 0.5).astype(int) + final_accuracy = (pred_classes == y.data).mean() + + # Get final metrics + initial_loss = history["loss"][0] + final_loss = history["loss"][-1] + initial_acc = history["accuracy"][0] + final_acc = history["accuracy"][-1] + + # Show results table + table = Table(title="\n🎯 The XOR Crisis - Results", show_header=True) + table.add_column("Metric", style="cyan") + table.add_column("Initial", style="white") + table.add_column("Final", style="white") + table.add_column("Change", style="bold") + + loss_change = "No improvement" if abs(final_loss - initial_loss) < 0.1 else f"{initial_loss - final_loss:+.4f}" + acc_change = "No improvement" if abs(final_acc - initial_acc) < 0.05 else f"{final_acc - initial_acc:+.1%}" + + table.add_row("Loss", f"{initial_loss:.4f}", f"{final_loss:.4f}", loss_change) + table.add_row("Accuracy", f"{initial_acc:.1%}", f"{final_acc:.1%}", acc_change) + + console.print(table) + + # Show the failure + if final_accuracy < 0.6: + console.print(Panel( + "[bold red]❌ FAILURE: Cannot Learn XOR[/bold red]\n\n" + f"Final accuracy: {final_accuracy:.1%} (essentially random guessing)\n" + f"Loss stuck at: {final_loss:.4f} (not decreasing)\n\n" + "[bold]This is the XOR Crisis![/bold]\n" + "Single-layer perceptrons cannot solve non-linearly separable problems.", + title="⚠️ The 1969 AI Winter Begins", + border_style="red" + )) + else: + console.print(Panel( + "[yellow]⚠️ PARTIAL SUCCESS (Unexpected!)[/yellow]\n\n" + f"Accuracy: {final_accuracy:.1%}\n" + "This shouldn't happen with clean XOR data.\n" + "The problem is fundamentally non-linearly separable.", + border_style="yellow" + )) + + # Show XOR truth table vs predictions + console.print("\n[bold]XOR Truth Table vs Model Predictions:[/bold]") + test_inputs = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) + test_preds = model(Tensor(test_inputs)) + + truth_table = Table(show_header=True) + truth_table.add_column("x₁", style="cyan") + truth_table.add_column("x₂", style="cyan") + truth_table.add_column("XOR (True)", style="green") + truth_table.add_column("Predicted", style="yellow") + truth_table.add_column("Correct?", style="white") + + for i, (x1, x2) in enumerate(test_inputs): + true_xor = int(x1 != x2) + pred = int(test_preds.data[i, 0] > 0.5) + correct = "✓" if pred == true_xor else "✗" + truth_table.add_row( + f"{int(x1)}", + f"{int(x2)}", + f"{true_xor}", + f"{pred}", + correct + ) + + console.print(truth_table) + + +# ============================================================================ +# 🎯 MAIN EXECUTION +# ============================================================================ + +def main(): + """Demonstrate the XOR crisis - single-layer perceptron failure.""" + + console.print(Panel.fit( + "[bold]The XOR Crisis (1969) - Minsky & Papert[/bold]\n\n" + "[dim]Watch a single-layer perceptron FAIL to learn XOR.[/dim]\n" + "[dim]This failure convinced the world neural networks were useless.[/dim]", + border_style="red" + )) + + # Generate data + X, y = generate_xor_data(n_samples=100) + + # Create single-layer perceptron + console.print("\n[bold]Step 2:[/bold] Creating single-layer perceptron...") + model = SingleLayerPerceptron() + console.print(" ✓ Architecture: Input(2) → Linear(2→1) → Sigmoid → Output") + console.print(" ⚠️ [bold red]No hidden layer - this is the problem![/bold red]") + + # Attempt to train (will fail) + console.print("\n[bold]Step 3:[/bold] Training on XOR...") + history = train_perceptron(model, X, y, epochs=100, lr=0.5) + + # Evaluate and explain the failure + evaluate_and_explain(model, X, y, history) + + # Historical context + console.print(Panel( + "[bold]💡 Historical Significance[/bold]\n\n" + "[bold cyan]1969:[/bold cyan] Minsky & Papert prove single-layer networks can't solve XOR\n" + "[bold red]1970s:[/bold red] AI Winter begins - funding disappears\n" + "[bold yellow]1986:[/bold yellow] Multi-layer networks + backprop solve it (see xor_solved.py!)\n" + "[bold green]Today:[/bold green] Deep learning powers GPT, AlphaGo, etc.\n\n" + "[dim]The solution? Hidden layers! See [bold]xor_solved.py[/bold] to witness the revival.[/dim]", + title="🌨️ The AI Winter", + border_style="blue" + )) + + +if __name__ == "__main__": + main() diff --git a/milestones/02_xor_crisis_1969/xor_solved.py b/milestones/02_xor_crisis_1969/xor_solved.py new file mode 100644 index 00000000..9ab0ae19 --- /dev/null +++ b/milestones/02_xor_crisis_1969/xor_solved.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python3 +""" +XOR Solved! Multi-Layer Networks (1986) +======================================== + +📚 HISTORICAL CONTEXT: +After the 1969 XOR crisis killed neural networks, research funding dried up for over +a decade. Then in 1986, Rumelhart, Hinton, and Williams published the backpropagation +algorithm for training multi-layer networks - and XOR became trivial! + +🎯 MILESTONE 2 PART 2: THE SOLUTION (After Modules 01-07) + +Watch a multi-layer network SOLVE the "impossible" XOR problem that stumped AI for +17 years. The secret? Hidden layers + backpropagation (which YOU just built!). + +✅ REQUIRED MODULES (Run after Module 07): +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + Module 01 (Tensor) : YOUR data structure with autodiff + Module 02 (Activations) : YOUR ReLU and Sigmoid (non-linearity!) + Module 03 (Layers) : YOUR Linear layers (multiple layers!) + Module 04 (Losses) : YOUR loss function + Module 05 (Autograd) : YOUR backpropagation through hidden layers + Module 06 (Optimizers) : YOUR SGD optimizer + Module 07 (Training) : YOUR training loop +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +🏗️ THE KEY INSIGHT - Hidden Layers Create New Features: + + Single Layer (FAILS): Multi-Layer (SUCCEEDS): + + Input → Linear → Sigmoid Input → Linear → ReLU → Linear → Sigmoid + ↑ ↑ ↑ + No hidden layer Hidden Layer! Non-linearity! + +The hidden layer learns NEW features that make XOR linearly separable! + +🔍 HOW IT WORKS - Feature Learning: + +Original space (XOR not separable): + + 1 │ 1 0 Hidden units learn: + │ • h₁: detects "x₁ AND NOT x₂" + 0 │ 0 1 • h₂: detects "x₂ AND NOT x₁" + └───── • h₃: detects other patterns + 0 1 • h₄: etc. + +New feature space (linearly separable!): + + The hidden layer creates a new representation where + XOR becomes a simple linear decision boundary! + +✅ EXPECTED RESULTS: +- Training time: ~30 seconds +- Accuracy: 95-100% (problem solved!) +- Loss decreases smoothly +- Perfect XOR predictions + +This is the architecture that ended the AI Winter! +""" + +import sys +import os +import numpy as np +from rich.console import Console +from rich.table import Table +from rich.panel import Panel + +# Add project root to path +sys.path.insert(0, os.getcwd()) + +# Import TinyTorch components YOU BUILT! +from tinytorch import Tensor, Linear, ReLU, Sigmoid, BinaryCrossEntropyLoss, SGD + +console = Console() + + +# ============================================================================ +# 🎲 DATA GENERATION +# ============================================================================ + +def generate_xor_data(n_samples=100): + """Generate XOR dataset with slight noise.""" + console.print("\n[bold]Step 1:[/bold] Generating XOR dataset...") + + # Generate each XOR case with repetition + samples_per_case = n_samples // 4 + + # Case 1: (0,0) → 0 + x1 = np.random.randn(samples_per_case, 2) * 0.1 + np.array([0.0, 0.0]) + y1 = np.zeros((samples_per_case, 1)) + + # Case 2: (0,1) → 1 + x2 = np.random.randn(samples_per_case, 2) * 0.1 + np.array([0.0, 1.0]) + y2 = np.ones((samples_per_case, 1)) + + # Case 3: (1,0) → 1 + x3 = np.random.randn(samples_per_case, 2) * 0.1 + np.array([1.0, 0.0]) + y3 = np.ones((samples_per_case, 1)) + + # Case 4: (1,1) → 0 + x4 = np.random.randn(samples_per_case, 2) * 0.1 + np.array([1.0, 1.0]) + y4 = np.zeros((samples_per_case, 1)) + + # Combine and shuffle + X = np.vstack([x1, x2, x3, x4]) + y = np.vstack([y1, y2, y3, y4]) + + indices = np.random.permutation(n_samples) + X = X[indices] + y = y[indices] + + console.print(f" ✓ Created [bold]{n_samples}[/bold] XOR samples") + console.print(f" ✓ Problem: [bold yellow]NOT linearly separable[/bold yellow]") + console.print(f" ✓ Solution: [bold green]Use hidden layers![/bold green]") + + return Tensor(X), Tensor(y) + + +# ============================================================================ +# 🏗️ MULTI-LAYER NETWORK (The Solution!) +# ============================================================================ + +class XORNetwork: + """ + Multi-layer network that SOLVES XOR! + + The hidden layer creates new features that make XOR linearly separable. + This is the architecture that ended the AI Winter. + """ + + def __init__(self, hidden_size=4): + # Hidden layer - THE KEY INNOVATION! + self.hidden = Linear(2, hidden_size) + self.relu = ReLU() # Non-linearity is essential! + + # Output layer + self.output = Linear(hidden_size, 1) + self.sigmoid = Sigmoid() + + def __call__(self, x): + """ + Forward pass through hidden layer. + + Input → Hidden Layer → ReLU → Output Layer → Sigmoid + """ + # Hidden layer transforms input space + h = self.hidden(x) + h_activated = self.relu(h) + + # Output layer in new feature space + logits = self.output(h_activated) + output = self.sigmoid(logits) + + return output + + def parameters(self): + """Return all trainable parameters.""" + return self.hidden.parameters() + self.output.parameters() + + +# ============================================================================ +# 🔥 TRAINING FUNCTION (That Will SUCCEED on XOR!) +# ============================================================================ + +def train_network(model, X, y, epochs=500, lr=0.5): + """ + Train multi-layer network on XOR. + + This WILL succeed - hidden layers solve the problem! + """ + loss_fn = BinaryCrossEntropyLoss() + optimizer = SGD(model.parameters(), lr=lr) + + console.print("\n[bold cyan]🔥 Training Multi-Layer Network...[/bold cyan]") + console.print("[dim](This will work - hidden layers solve XOR!)[/dim]\n") + + history = {"loss": [], "accuracy": []} + + for epoch in range(epochs): + # Forward pass + predictions = model(X) + loss = loss_fn(predictions, y) + + # Backward pass (through hidden layers!) + loss.backward() + + # Update weights + optimizer.step() + optimizer.zero_grad() + + # Calculate accuracy + pred_classes = (predictions.data > 0.5).astype(int) + accuracy = (pred_classes == y.data).mean() + + history["loss"].append(loss.data.item()) + history["accuracy"].append(accuracy) + + # Print progress every 100 epochs + if (epoch + 1) % 100 == 0: + console.print(f"Epoch {epoch+1:3d}/{epochs} Loss: {loss.data:.4f} Accuracy: {accuracy:.1%}") + + console.print("\n[bold green]✅ Training Complete - XOR Solved![/bold green]\n") + + return history + + +# ============================================================================ +# 📊 EVALUATION & CELEBRATION +# ============================================================================ + +def evaluate_and_celebrate(model, X, y, history): + """Evaluate the successful model and celebrate the victory!""" + + predictions = model(X) + pred_classes = (predictions.data > 0.5).astype(int) + final_accuracy = (pred_classes == y.data).mean() + + # Get metrics + initial_loss = history["loss"][0] + final_loss = history["loss"][-1] + initial_acc = history["accuracy"][0] + final_acc = history["accuracy"][-1] + + # Show transformation + table = Table(title="\n🎯 The Transformation", show_header=True) + table.add_column("Metric", style="cyan") + table.add_column("Before Training", style="white") + table.add_column("After Training", style="white") + table.add_column("Improvement", style="bold green") + + loss_improvement = f"-{initial_loss - final_loss:.4f}" + acc_improvement = f"+{final_acc - initial_acc:.1%}" + + table.add_row("Loss", f"{initial_loss:.4f}", f"{final_loss:.4f}", loss_improvement) + table.add_row("Accuracy", f"{initial_acc:.1%}", f"{final_acc:.1%}", acc_improvement) + + console.print(table) + + # Celebrate success! + if final_accuracy >= 0.9: + console.print(Panel( + "[bold green]🎉 SUCCESS! XOR Problem Solved![/bold green]\n\n" + f"Final accuracy: {final_accuracy:.1%}\n" + f"Final loss: {final_loss:.4f}\n\n" + "[bold]The \"impossible\" problem is now trivial![/bold]\n" + "Hidden layers + backpropagation = AI renaissance", + title="✅ 1986 AI Revival", + border_style="green" + )) + else: + console.print(Panel( + f"[yellow]Accuracy: {final_accuracy:.1%}[/yellow]\n\n" + "Try training longer or adjusting learning rate.", + border_style="yellow" + )) + + # Show XOR truth table vs predictions + console.print("\n[bold]XOR Truth Table vs Model Predictions:[/bold]") + test_inputs = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) + test_preds = model(Tensor(test_inputs)) + + truth_table = Table(show_header=True, border_style="green") + truth_table.add_column("x₁", style="cyan") + truth_table.add_column("x₂", style="cyan") + truth_table.add_column("XOR (True)", style="green") + truth_table.add_column("Predicted", style="yellow") + truth_table.add_column("Correct?", style="white") + + all_correct = True + for i, (x1, x2) in enumerate(test_inputs): + true_xor = int(x1 != x2) + pred_prob = test_preds.data[i, 0] + pred = int(pred_prob > 0.5) + correct = pred == true_xor + all_correct = all_correct and correct + + truth_table.add_row( + f"{int(x1)}", + f"{int(x2)}", + f"{true_xor}", + f"{pred} ({pred_prob:.3f})", + "✅" if correct else "❌" + ) + + console.print(truth_table) + + if all_correct: + console.print("\n[bold green]✨ Perfect! All XOR cases correctly predicted![/bold green]") + + +# ============================================================================ +# 🎯 MAIN EXECUTION +# ============================================================================ + +def main(): + """Demonstrate solving XOR with multi-layer networks.""" + + console.print(Panel.fit( + "[bold]XOR Solved! Multi-Layer Networks (1986)[/bold]\n\n" + "[dim]Watch a multi-layer network SOLVE the problem that killed AI.[/dim]\n" + "[dim]Hidden layers + backpropagation = The AI Renaissance![/dim]", + border_style="green" + )) + + # Generate data + X, y = generate_xor_data(n_samples=100) + + # Create multi-layer network + console.print("\n[bold]Step 2:[/bold] Creating multi-layer network...") + model = XORNetwork(hidden_size=4) + console.print(" ✓ Architecture: Input(2) → [bold green]Hidden(4)[/bold green] → ReLU → Output(1) → Sigmoid") + console.print(" ✓ [bold green]Hidden layer is the KEY![/bold green] It learns new features.") + console.print(" ✓ Total parameters: ~17 (vs 3 for single-layer)") + + # Check initial performance + console.print("\n[bold]Initial Performance (random weights):[/bold]") + initial_preds = model(X) + initial_acc = ((initial_preds.data > 0.5).astype(int) == y.data).mean() + console.print(f" Accuracy: {initial_acc:.1%} (random guessing)") + + # Train the network + console.print("\n[bold]Step 3:[/bold] Training on XOR...") + history = train_network(model, X, y, epochs=500, lr=0.5) + + # Evaluate and celebrate + evaluate_and_celebrate(model, X, y, history) + + # Historical context + console.print(Panel( + "[bold]💡 What You Just Accomplished[/bold]\n\n" + "[bold red]1969:[/bold red] XOR crisis - single layers fail\n" + "[bold yellow]1970-1986:[/bold yellow] AI Winter - 17 years of darkness\n" + "[bold green]1986:[/bold green] Backprop + hidden layers solve it\n" + "[bold cyan]TODAY:[/bold cyan] YOU solved it with YOUR TinyTorch!\n\n" + "[bold]The Components YOU Built:[/bold]\n" + " • Tensor with autograd (Module 01 + 05)\n" + " • Linear layers for transformations (Module 03)\n" + " • ReLU for non-linearity (Module 02)\n" + " • Backprop through multiple layers (Module 05)\n" + " • SGD for optimization (Module 06)\n\n" + "[dim]This same pattern scales to GPT-4, AlphaGo, and beyond![/dim]", + title="🎓 Educational Significance", + border_style="blue" + )) + + console.print(Panel( + "[bold cyan]🚀 Next Steps[/bold cyan]\n\n" + "You've solved the problem that stumped AI for 17 years!\n\n" + "[bold]Ready for more?[/bold]\n" + " • Milestone 03: Train deeper networks on real data\n" + " • Module 08: DataLoaders for batch processing\n" + " • Module 09: CNNs for image recognition\n" + " • And beyond: Transformers, attention, etc.\n\n" + "[dim]Every modern AI architecture builds on what you just learned![/dim]", + title="🌟 Your Journey", + border_style="cyan" + )) + + +if __name__ == "__main__": + main() diff --git a/modules/source/05_autograd/autograd_dev.ipynb b/modules/source/05_autograd/autograd_dev.ipynb index 0fb5f4fa..c4c52a8d 100644 --- a/modules/source/05_autograd/autograd_dev.ipynb +++ b/modules/source/05_autograd/autograd_dev.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "e3cfec75", + "id": "64844e96", "metadata": { "cell_marker": "\"\"\"" }, @@ -54,7 +54,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58074465", + "id": "275b8540", "metadata": { "nbgrader": { "grade": false, @@ -77,7 +77,7 @@ }, { "cell_type": "markdown", - "id": "69b165b7", + "id": "1330c3bb", "metadata": { "cell_marker": "\"\"\"" }, @@ -131,7 +131,7 @@ }, { "cell_type": "markdown", - "id": "74b7f7b1", + "id": "162f90a1", "metadata": { "cell_marker": "\"\"\"" }, @@ -190,7 +190,7 @@ }, { "cell_type": "markdown", - "id": "f0ebfa26", + "id": "3a977634", "metadata": { "cell_marker": "\"\"\"" }, @@ -227,7 +227,7 @@ }, { "cell_type": "markdown", - "id": "dbf5a8fe", + "id": "b5ce7cd9", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -255,7 +255,7 @@ { "cell_type": "code", "execution_count": null, - "id": "637e3665", + "id": "13a6880f", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -321,7 +321,7 @@ }, { "cell_type": "markdown", - "id": "d791e7e6", + "id": "07856311", "metadata": { "cell_marker": "\"\"\"" }, @@ -360,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "68eb4e20", + "id": "7435243b", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -389,7 +389,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7a18ba60", + "id": "b5b4727d", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -444,7 +444,7 @@ }, { "cell_type": "markdown", - "id": "923b65a8", + "id": "55bf20de", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6fc95eaf", + "id": "111d8569", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -535,7 +535,7 @@ }, { "cell_type": "markdown", - "id": "fbfc3b8b", + "id": "99575c31", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -570,7 +570,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d26abee2", + "id": "6475d858", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -627,7 +627,7 @@ }, { "cell_type": "markdown", - "id": "d714d4d7", + "id": "9b93dc03", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -658,7 +658,7 @@ { "cell_type": "code", "execution_count": null, - "id": "63a43449", + "id": "a4d5e89a", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -706,7 +706,7 @@ }, { "cell_type": "markdown", - "id": "7c451fcc", + "id": "025c5aeb", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -722,7 +722,7 @@ { "cell_type": "code", "execution_count": null, - "id": "283dd53b", + "id": "6a4f7493", "metadata": { "nbgrader": { "grade": true, @@ -769,7 +769,7 @@ }, { "cell_type": "markdown", - "id": "74b997fa", + "id": "e72f7ca9", "metadata": { "cell_marker": "\"\"\"" }, @@ -804,7 +804,7 @@ }, { "cell_type": "markdown", - "id": "8f86f108", + "id": "e3c0e837", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -830,7 +830,44 @@ { "cell_type": "code", "execution_count": null, - "id": "14fe4ca5", + "id": "f4a2ab86", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "relu-backward", + "solution": true + } + }, + "outputs": [], + "source": [ + "#| export\n", + "class ReLUBackward(Function):\n", + " \"\"\"\n", + " Gradient computation for ReLU activation.\n", + " \n", + " ReLU: f(x) = max(0, x)\n", + " Derivative: f'(x) = 1 if x > 0, else 0\n", + " \"\"\"\n", + " \n", + " def __init__(self, input_tensor):\n", + " \"\"\"Initialize with input tensor.\"\"\"\n", + " super().__init__(input_tensor)\n", + " \n", + " def apply(self, grad_output):\n", + " \"\"\"Compute gradient for ReLU.\"\"\"\n", + " tensor, = self.saved_tensors\n", + " \n", + " if isinstance(tensor, Tensor) and tensor.requires_grad:\n", + " # ReLU gradient: 1 if x > 0, else 0\n", + " relu_grad = (tensor.data > 0).astype(np.float32)\n", + " return grad_output * relu_grad,\n", + " return None," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac426d12", "metadata": { "nbgrader": { "grade": false, @@ -874,7 +911,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bf1dd71d", + "id": "037ebadb", "metadata": { "nbgrader": { "grade": false, @@ -914,7 +951,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7934b8f7", + "id": "c5b090f4", "metadata": { "nbgrader": { "grade": false, @@ -958,7 +995,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4d7816e7", + "id": "c97ef6f9", "metadata": { "nbgrader": { "grade": false, @@ -1169,11 +1206,12 @@ "\n", " # Patch activations and losses to track gradients\n", " try:\n", - " from tinytorch.core.activations import Sigmoid\n", + " from tinytorch.core.activations import Sigmoid, ReLU\n", " from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss\n", " \n", " # Store original methods\n", " _original_sigmoid_forward = Sigmoid.forward\n", + " _original_relu_forward = ReLU.forward\n", " _original_bce_forward = BinaryCrossEntropyLoss.forward\n", " _original_mse_forward = MSELoss.forward\n", " \n", @@ -1188,6 +1226,17 @@ " \n", " return result\n", " \n", + " def tracked_relu_forward(self, x):\n", + " \"\"\"ReLU with gradient tracking.\"\"\"\n", + " result_data = np.maximum(0, x.data)\n", + " result = Tensor(result_data)\n", + " \n", + " if x.requires_grad:\n", + " result.requires_grad = True\n", + " result._grad_fn = ReLUBackward(x)\n", + " \n", + " return result\n", + " \n", " def tracked_bce_forward(self, predictions, targets):\n", " \"\"\"Binary cross-entropy with gradient tracking.\"\"\"\n", " # Compute BCE loss\n", @@ -1223,6 +1272,7 @@ " \n", " # Install patched methods\n", " Sigmoid.forward = tracked_sigmoid_forward\n", + " ReLU.forward = tracked_relu_forward\n", " BinaryCrossEntropyLoss.forward = tracked_bce_forward\n", " MSELoss.forward = tracked_mse_forward\n", " \n", @@ -1244,7 +1294,7 @@ }, { "cell_type": "markdown", - "id": "74bf991c", + "id": "29666f4e", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1260,7 +1310,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c602541a", + "id": "d7e3b901", "metadata": { "nbgrader": { "grade": true, @@ -1308,7 +1358,7 @@ }, { "cell_type": "markdown", - "id": "940e33e0", + "id": "40f82d7b", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1322,7 +1372,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a6b58276", + "id": "23f2e6ff", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1435,7 +1485,7 @@ { "cell_type": "code", "execution_count": null, - "id": "07cf3600", + "id": "13ab8ef7", "metadata": {}, "outputs": [], "source": [ @@ -1446,7 +1496,7 @@ }, { "cell_type": "markdown", - "id": "fd4719db", + "id": "e85f24a6", "metadata": { "cell_marker": "\"\"\"" }, diff --git a/modules/source/05_autograd/autograd_dev.py b/modules/source/05_autograd/autograd_dev.py index e8a89a16..d6230d0f 100644 --- a/modules/source/05_autograd/autograd_dev.py +++ b/modules/source/05_autograd/autograd_dev.py @@ -670,6 +670,31 @@ Enhanced: x + y → addition + gradient tracking (if requires_grad=True) This approach follows PyTorch 2.0 style - clean, modern, and educational. """ +# %% nbgrader={"grade": false, "grade_id": "relu-backward", "solution": true} +#| export +class ReLUBackward(Function): + """ + Gradient computation for ReLU activation. + + ReLU: f(x) = max(0, x) + Derivative: f'(x) = 1 if x > 0, else 0 + """ + + def __init__(self, input_tensor): + """Initialize with input tensor.""" + super().__init__(input_tensor) + + def apply(self, grad_output): + """Compute gradient for ReLU.""" + tensor, = self.saved_tensors + + if isinstance(tensor, Tensor) and tensor.requires_grad: + # ReLU gradient: 1 if x > 0, else 0 + relu_grad = (tensor.data > 0).astype(np.float32) + return grad_output * relu_grad, + return None, + + # %% nbgrader={"grade": false, "grade_id": "sigmoid-backward", "solution": true} #| export class SigmoidBackward(Function): @@ -964,11 +989,12 @@ def enable_autograd(): # Patch activations and losses to track gradients try: - from tinytorch.core.activations import Sigmoid + from tinytorch.core.activations import Sigmoid, ReLU from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss # Store original methods _original_sigmoid_forward = Sigmoid.forward + _original_relu_forward = ReLU.forward _original_bce_forward = BinaryCrossEntropyLoss.forward _original_mse_forward = MSELoss.forward @@ -983,6 +1009,17 @@ def enable_autograd(): return result + def tracked_relu_forward(self, x): + """ReLU with gradient tracking.""" + result_data = np.maximum(0, x.data) + result = Tensor(result_data) + + if x.requires_grad: + result.requires_grad = True + result._grad_fn = ReLUBackward(x) + + return result + def tracked_bce_forward(self, predictions, targets): """Binary cross-entropy with gradient tracking.""" # Compute BCE loss @@ -1018,6 +1055,7 @@ def enable_autograd(): # Install patched methods Sigmoid.forward = tracked_sigmoid_forward + ReLU.forward = tracked_relu_forward BinaryCrossEntropyLoss.forward = tracked_bce_forward MSELoss.forward = tracked_mse_forward diff --git a/tinytorch/core/autograd.py b/tinytorch/core/autograd.py index dbad17cb..412e37af 100644 --- a/tinytorch/core/autograd.py +++ b/tinytorch/core/autograd.py @@ -15,8 +15,8 @@ # ║ happens! The tinytorch/ directory is just the compiled output. ║ # ╚═══════════════════════════════════════════════════════════════════════════════╝ # %% auto 0 -__all__ = ['Function', 'AddBackward', 'MulBackward', 'MatmulBackward', 'SumBackward', 'SigmoidBackward', 'MSEBackward', - 'BCEBackward', 'enable_autograd'] +__all__ = ['Function', 'AddBackward', 'MulBackward', 'MatmulBackward', 'SumBackward', 'ReLUBackward', 'SigmoidBackward', + 'MSEBackward', 'BCEBackward', 'enable_autograd'] # %% ../../modules/source/05_autograd/autograd_dev.ipynb 1 import numpy as np @@ -241,6 +241,29 @@ class SumBackward(Function): return None, # %% ../../modules/source/05_autograd/autograd_dev.ipynb 20 +class ReLUBackward(Function): + """ + Gradient computation for ReLU activation. + + ReLU: f(x) = max(0, x) + Derivative: f'(x) = 1 if x > 0, else 0 + """ + + def __init__(self, input_tensor): + """Initialize with input tensor.""" + super().__init__(input_tensor) + + def apply(self, grad_output): + """Compute gradient for ReLU.""" + tensor, = self.saved_tensors + + if isinstance(tensor, Tensor) and tensor.requires_grad: + # ReLU gradient: 1 if x > 0, else 0 + relu_grad = (tensor.data > 0).astype(np.float32) + return grad_output * relu_grad, + return None, + +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 21 class SigmoidBackward(Function): """ Gradient computation for sigmoid activation. @@ -270,7 +293,7 @@ class SigmoidBackward(Function): return grad_output * sigmoid_grad, return None, -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 21 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 22 class MSEBackward(Function): """ Gradient computation for Mean Squared Error Loss. @@ -296,7 +319,7 @@ class MSEBackward(Function): return grad * grad_output, return None, -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 22 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23 class BCEBackward(Function): """ Gradient computation for Binary Cross-Entropy Loss. @@ -326,7 +349,7 @@ class BCEBackward(Function): return grad * grad_output, return None, -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 24 def enable_autograd(): """ Enable gradient tracking for all Tensor operations. @@ -527,11 +550,12 @@ def enable_autograd(): # Patch activations and losses to track gradients try: - from tinytorch.core.activations import Sigmoid + from tinytorch.core.activations import Sigmoid, ReLU from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss # Store original methods _original_sigmoid_forward = Sigmoid.forward + _original_relu_forward = ReLU.forward _original_bce_forward = BinaryCrossEntropyLoss.forward _original_mse_forward = MSELoss.forward @@ -546,6 +570,17 @@ def enable_autograd(): return result + def tracked_relu_forward(self, x): + """ReLU with gradient tracking.""" + result_data = np.maximum(0, x.data) + result = Tensor(result_data) + + if x.requires_grad: + result.requires_grad = True + result._grad_fn = ReLUBackward(x) + + return result + def tracked_bce_forward(self, predictions, targets): """Binary cross-entropy with gradient tracking.""" # Compute BCE loss @@ -581,6 +616,7 @@ def enable_autograd(): # Install patched methods Sigmoid.forward = tracked_sigmoid_forward + ReLU.forward = tracked_relu_forward BinaryCrossEntropyLoss.forward = tracked_bce_forward MSELoss.forward = tracked_mse_forward