Files
TinyTorch/tests/integration/minimal_training_example.py
Vijay Janapa Reddi 3a885601f9 Clean up repository
- Remove stale feature branches (kept debugging branch with unmerged work)
- Move test_spatial_core.py to correct directory (tests/09_spatial)
- Remove .tito user state from tracking (config.json, progress.json)
- Delete archived CLI commands (tito/commands/_archived/)
- Move standalone integration tests to tests/integration/
- Remove outdated audit/report markdown files
- Remove old template and deprecated test files
- Simplify .gitignore for .tito/ directory
2025-12-02 22:03:16 -05:00

261 lines
8.2 KiB
Python

#!/usr/bin/env python
"""
Minimal Complete Training Example for TinyTorch
================================================
This demonstrates the MINIMUM code needed to get gradient-based training working.
This is what students need to understand to build neural networks that learn.
"""
import numpy as np
import sys
sys.path.insert(0, '.')
from tinytorch.core.tensor import Tensor, Parameter
from tinytorch.core.autograd import Variable
class SimpleLinear:
"""Minimal linear layer that works with autograd."""
def __init__(self, in_features, out_features):
# Initialize weights and bias as Parameters (Tensors with requires_grad=True)
self.weights = Parameter(np.random.randn(in_features, out_features) * 0.1)
self.bias = Parameter(np.random.randn(out_features) * 0.1)
def __call__(self, x):
"""Forward pass maintaining gradient chain."""
# Convert everything to Variables for gradient tracking
if not isinstance(x, Variable):
x = Variable(x)
w = Variable(self.weights)
b = Variable(self.bias)
# Simple matmul using Variable operations
# This is inefficient but shows the concept clearly
output = x @ w + b # Uses Variable.__matmul__ and Variable.__add__
return output
def parameters(self):
"""Return parameters for optimizer."""
return [self.weights, self.bias]
def sigmoid(x):
"""Sigmoid activation as Variable operation."""
if not isinstance(x, Variable):
x = Variable(x)
# Compute sigmoid
sig_data = 1.0 / (1.0 + np.exp(-x.data.data))
# Create gradient function
def sig_grad_fn(grad_output):
# Sigmoid gradient: sig * (1 - sig)
grad = sig_data * (1 - sig_data) * grad_output.data.data
x.backward(Variable(grad))
return Variable(sig_data, requires_grad=x.requires_grad, grad_fn=sig_grad_fn)
class SimpleMSE:
"""Minimal MSE loss that returns a scalar Variable."""
def __call__(self, pred, target):
"""Compute MSE loss."""
# Convert to Variables
if not isinstance(pred, Variable):
pred = Variable(pred)
if not isinstance(target, Variable):
target = Variable(target, requires_grad=False)
# MSE = mean((pred - target)^2)
diff = pred - target
squared = diff * diff
# Manual mean
total = np.sum(squared.data.data)
n = squared.data.data.size
loss_val = total / n
# Create loss Variable with gradient function
def mse_grad_fn(grad_output=Variable(1.0)):
# Gradient of MSE: 2 * (pred - target) / n
grad = 2.0 * (pred.data.data - target.data.data) / n
pred.backward(Variable(grad))
return Variable(loss_val, requires_grad=True, grad_fn=mse_grad_fn)
class SimpleSGD:
"""Minimal SGD optimizer."""
def __init__(self, params, lr=0.01):
self.params = params
self.lr = lr
def zero_grad(self):
"""Clear gradients."""
for p in self.params:
p.grad = None
def step(self):
"""Update parameters."""
for p in self.params:
if p.grad is not None:
# Simple gradient descent: param = param - lr * grad
p.data = p.data - self.lr * p.grad.data
def train_xor_minimal():
"""Train XOR with minimal implementation."""
print("="*60)
print("MINIMAL XOR TRAINING EXAMPLE")
print("This shows the absolute minimum needed for learning")
print("="*60)
# XOR dataset
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
y = np.array([[0], [1], [1], [0]], dtype=np.float32)
# Build simple network
layer1 = SimpleLinear(2, 4)
layer2 = SimpleLinear(4, 1)
# Optimizer and loss
params = layer1.parameters() + layer2.parameters()
optimizer = SimpleSGD(params, lr=0.5)
criterion = SimpleMSE()
# Training loop
for epoch in range(1000):
# Forward pass
h = layer1(Tensor(X))
h = sigmoid(h) # Activation
output = layer2(h)
output = sigmoid(output)
# Compute loss
loss = criterion(output, Tensor(y))
# Extract scalar loss value for printing
loss_val = float(loss.data.data) if hasattr(loss.data, 'data') else float(loss.data)
# Backward pass
optimizer.zero_grad()
loss.backward()
# Update weights
optimizer.step()
if epoch % 200 == 0:
print(f"Epoch {epoch:4d}: Loss = {loss_val:.4f}")
# Final predictions
print("\nFinal predictions:")
with_grad = False # No need for gradients during inference
h = layer1(Tensor(X))
h = sigmoid(h)
output = layer2(h)
output = sigmoid(output)
# Extract predictions
if hasattr(output, 'data'):
if hasattr(output.data, 'data'):
predictions = output.data.data
else:
predictions = output.data
else:
predictions = output
for i, (input_val, pred, target) in enumerate(zip(X, predictions, y)):
print(f" Input: {input_val} → Prediction: {pred[0]:.3f} (Target: {target[0]})")
# Check accuracy
predictions_binary = (predictions > 0.5).astype(int)
accuracy = np.mean(predictions_binary == y)
print(f"\nAccuracy: {accuracy*100:.1f}%")
if accuracy >= 0.75:
print("✅ XOR learned successfully!")
else:
print("⚠️ XOR not fully learned (but training is working)")
def train_linear_regression_minimal():
"""Even simpler: train linear regression."""
print("\n" + "="*60)
print("MINIMAL LINEAR REGRESSION")
print("Simplest possible learning example: y = 2x + 1")
print("="*60)
# Simple linear data
X = np.array([[1], [2], [3], [4]], dtype=np.float32)
y = np.array([[3], [5], [7], [9]], dtype=np.float32) # y = 2x + 1
# Single layer
model = SimpleLinear(1, 1)
optimizer = SimpleSGD(model.parameters(), lr=0.01)
criterion = SimpleMSE()
print(f"Initial weight: {model.weights.data[0,0]:.3f}")
print(f"Initial bias: {model.bias.data[0]:.3f}")
# Training
for epoch in range(100):
output = model(Tensor(X))
loss = criterion(output, Tensor(y))
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch % 20 == 0:
loss_val = float(loss.data.data) if hasattr(loss.data, 'data') else float(loss.data)
print(f"Epoch {epoch:3d}: Loss = {loss_val:.4f}")
print(f"\nFinal weight: {model.weights.data[0,0]:.3f} (should be ≈2.0)")
print(f"Final bias: {model.bias.data[0]:.3f} (should be ≈1.0)")
# Test prediction
test_x = Tensor(np.array([[5]], dtype=np.float32))
pred = model(test_x)
pred_val = float(pred.data.data[0,0]) if hasattr(pred.data, 'data') else float(pred.data[0,0])
print(f"\nTest: x=5 → prediction={pred_val:.3f} (should be ≈11.0)")
if abs(model.weights.data[0,0] - 2.0) < 0.5 and abs(model.bias.data[0] - 1.0) < 0.5:
print("✅ Linear regression learned successfully!")
if __name__ == "__main__":
# Start with simplest example
train_linear_regression_minimal()
# Then show XOR (non-linear problem)
print("\n")
train_xor_minimal()
print("\n" + "="*60)
print("KEY INSIGHTS FOR STUDENTS:")
print("="*60)
print("""
1. GRADIENT CHAIN: Every operation must maintain the Variable chain
- Tensors → Variables → Operations → Loss → Backward
2. PARAMETER UPDATES: Gradients must flow back to the original Parameters
- This requires Variable to keep reference to source Tensor
3. MINIMUM REQUIREMENTS FOR LEARNING:
- Forward pass that maintains computational graph
- Loss function that returns a Variable
- Backward pass that computes gradients
- Optimizer that updates parameters
4. WHAT MAKES IT WORK:
- Variable wrapping maintains gradient tracking
- Operations between Variables create new Variables
- backward() propagates gradients through the chain
- Optimizer uses param.grad to update param.data
This is the CORE of all deep learning frameworks!
""")