diff --git a/modules/source/05_autograd/autograd_dev.ipynb b/modules/source/05_autograd/autograd_dev.ipynb index 769d087d..0fb5f4fa 100644 --- a/modules/source/05_autograd/autograd_dev.ipynb +++ b/modules/source/05_autograd/autograd_dev.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "7d52b57e", + "id": "e3cfec75", "metadata": { "cell_marker": "\"\"\"" }, @@ -54,7 +54,7 @@ { "cell_type": "code", "execution_count": null, - "id": "99f458a5", + "id": "58074465", "metadata": { "nbgrader": { "grade": false, @@ -77,7 +77,7 @@ }, { "cell_type": "markdown", - "id": "d54ede56", + "id": "69b165b7", "metadata": { "cell_marker": "\"\"\"" }, @@ -131,7 +131,7 @@ }, { "cell_type": "markdown", - "id": "071dbac9", + "id": "74b7f7b1", "metadata": { "cell_marker": "\"\"\"" }, @@ -190,7 +190,7 @@ }, { "cell_type": "markdown", - "id": "e5fc52b8", + "id": "f0ebfa26", "metadata": { "cell_marker": "\"\"\"" }, @@ -227,7 +227,7 @@ }, { "cell_type": "markdown", - "id": "1c14a640", + "id": "dbf5a8fe", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -255,7 +255,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5471a5ea", + "id": "637e3665", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -321,7 +321,7 @@ }, { "cell_type": "markdown", - "id": "869a5b2d", + "id": "d791e7e6", "metadata": { "cell_marker": "\"\"\"" }, @@ -360,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "f8016adc", + "id": "68eb4e20", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -389,7 +389,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3a7bbf6b", + "id": "7a18ba60", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -444,7 +444,7 @@ }, { "cell_type": "markdown", - "id": "35957bcb", + "id": "923b65a8", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0e4b3283", + "id": "6fc95eaf", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -535,7 +535,7 @@ }, { "cell_type": "markdown", - "id": "a78a3194", + "id": "fbfc3b8b", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -570,7 +570,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d860c218", + "id": "d26abee2", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -627,7 +627,7 @@ }, { "cell_type": "markdown", - "id": "482f627f", + "id": "d714d4d7", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -658,7 +658,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2e4e4804", + "id": "63a43449", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -706,7 +706,7 @@ }, { "cell_type": "markdown", - "id": "b75cc673", + "id": "7c451fcc", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -722,7 +722,7 @@ { "cell_type": "code", "execution_count": null, - "id": "84e59e31", + "id": "283dd53b", "metadata": { "nbgrader": { "grade": true, @@ -769,7 +769,7 @@ }, { "cell_type": "markdown", - "id": "19eb6107", + "id": "74b997fa", "metadata": { "cell_marker": "\"\"\"" }, @@ -804,7 +804,7 @@ }, { "cell_type": "markdown", - "id": "54572fbb", + "id": "8f86f108", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -830,7 +830,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fb670ba7", + "id": "14fe4ca5", "metadata": { "nbgrader": { "grade": false, @@ -874,7 +874,47 @@ { "cell_type": "code", "execution_count": null, - "id": "73ca569b", + "id": "bf1dd71d", + "metadata": { + "nbgrader": { + "grade": false, + "grade_id": "mse-backward", + "solution": true + } + }, + "outputs": [], + "source": [ + "#| export\n", + "class MSEBackward(Function):\n", + " \"\"\"\n", + " Gradient computation for Mean Squared Error Loss.\n", + " \n", + " MSE: L = mean((predictions - targets)²)\n", + " Derivative: ∂L/∂predictions = 2 * (predictions - targets) / N\n", + " \"\"\"\n", + " \n", + " def __init__(self, predictions, targets):\n", + " \"\"\"Initialize with predictions and targets.\"\"\"\n", + " super().__init__(predictions)\n", + " self.targets_data = targets.data\n", + " self.num_samples = np.size(targets.data)\n", + " \n", + " def apply(self, grad_output):\n", + " \"\"\"Compute gradient for MSE loss.\"\"\"\n", + " predictions, = self.saved_tensors\n", + " \n", + " if isinstance(predictions, Tensor) and predictions.requires_grad:\n", + " # Gradient: 2 * (predictions - targets) / N\n", + " grad = 2.0 * (predictions.data - self.targets_data) / self.num_samples\n", + " \n", + " return grad * grad_output,\n", + " return None," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7934b8f7", "metadata": { "nbgrader": { "grade": false, @@ -918,7 +958,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dd8e3766", + "id": "4d7816e7", "metadata": { "nbgrader": { "grade": false, @@ -1130,11 +1170,12 @@ " # Patch activations and losses to track gradients\n", " try:\n", " from tinytorch.core.activations import Sigmoid\n", - " from tinytorch.core.losses import BinaryCrossEntropyLoss\n", + " from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss\n", " \n", " # Store original methods\n", " _original_sigmoid_forward = Sigmoid.forward\n", " _original_bce_forward = BinaryCrossEntropyLoss.forward\n", + " _original_mse_forward = MSELoss.forward\n", " \n", " def tracked_sigmoid_forward(self, x):\n", " \"\"\"Sigmoid with gradient tracking.\"\"\"\n", @@ -1165,9 +1206,25 @@ " \n", " return result\n", " \n", + " def tracked_mse_forward(self, predictions, targets):\n", + " \"\"\"MSE loss with gradient tracking.\"\"\"\n", + " # Compute MSE loss\n", + " diff = predictions.data - targets.data\n", + " squared_diff = diff ** 2\n", + " mse = np.mean(squared_diff)\n", + " \n", + " result = Tensor(mse)\n", + " \n", + " if predictions.requires_grad:\n", + " result.requires_grad = True\n", + " result._grad_fn = MSEBackward(predictions, targets)\n", + " \n", + " return result\n", + " \n", " # Install patched methods\n", " Sigmoid.forward = tracked_sigmoid_forward\n", " BinaryCrossEntropyLoss.forward = tracked_bce_forward\n", + " MSELoss.forward = tracked_mse_forward\n", " \n", " except ImportError:\n", " # Activations/losses not yet available (happens during module development)\n", @@ -1187,7 +1244,7 @@ }, { "cell_type": "markdown", - "id": "6297d6e1", + "id": "74bf991c", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1203,7 +1260,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e99c2b74", + "id": "c602541a", "metadata": { "nbgrader": { "grade": true, @@ -1251,7 +1308,7 @@ }, { "cell_type": "markdown", - "id": "18ae32ed", + "id": "940e33e0", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1265,7 +1322,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2d5083ff", + "id": "a6b58276", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1378,7 +1435,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d72f5056", + "id": "07cf3600", "metadata": {}, "outputs": [], "source": [ @@ -1389,7 +1446,7 @@ }, { "cell_type": "markdown", - "id": "cf8b02e2", + "id": "fd4719db", "metadata": { "cell_marker": "\"\"\"" }, diff --git a/modules/source/05_autograd/autograd_dev.py b/modules/source/05_autograd/autograd_dev.py index 19987b76..e8a89a16 100644 --- a/modules/source/05_autograd/autograd_dev.py +++ b/modules/source/05_autograd/autograd_dev.py @@ -702,6 +702,34 @@ class SigmoidBackward(Function): return None, +# %% nbgrader={"grade": false, "grade_id": "mse-backward", "solution": true} +#| export +class MSEBackward(Function): + """ + Gradient computation for Mean Squared Error Loss. + + MSE: L = mean((predictions - targets)²) + Derivative: ∂L/∂predictions = 2 * (predictions - targets) / N + """ + + def __init__(self, predictions, targets): + """Initialize with predictions and targets.""" + super().__init__(predictions) + self.targets_data = targets.data + self.num_samples = np.size(targets.data) + + def apply(self, grad_output): + """Compute gradient for MSE loss.""" + predictions, = self.saved_tensors + + if isinstance(predictions, Tensor) and predictions.requires_grad: + # Gradient: 2 * (predictions - targets) / N + grad = 2.0 * (predictions.data - self.targets_data) / self.num_samples + + return grad * grad_output, + return None, + + # %% nbgrader={"grade": false, "grade_id": "bce-backward", "solution": true} #| export class BCEBackward(Function): @@ -937,11 +965,12 @@ def enable_autograd(): # Patch activations and losses to track gradients try: from tinytorch.core.activations import Sigmoid - from tinytorch.core.losses import BinaryCrossEntropyLoss + from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss # Store original methods _original_sigmoid_forward = Sigmoid.forward _original_bce_forward = BinaryCrossEntropyLoss.forward + _original_mse_forward = MSELoss.forward def tracked_sigmoid_forward(self, x): """Sigmoid with gradient tracking.""" @@ -972,9 +1001,25 @@ def enable_autograd(): return result + def tracked_mse_forward(self, predictions, targets): + """MSE loss with gradient tracking.""" + # Compute MSE loss + diff = predictions.data - targets.data + squared_diff = diff ** 2 + mse = np.mean(squared_diff) + + result = Tensor(mse) + + if predictions.requires_grad: + result.requires_grad = True + result._grad_fn = MSEBackward(predictions, targets) + + return result + # Install patched methods Sigmoid.forward = tracked_sigmoid_forward BinaryCrossEntropyLoss.forward = tracked_bce_forward + MSELoss.forward = tracked_mse_forward except ImportError: # Activations/losses not yet available (happens during module development) diff --git a/tests/05_autograd/__init__.py b/tests/05_autograd/__init__.py new file mode 100644 index 00000000..7ec2ba83 --- /dev/null +++ b/tests/05_autograd/__init__.py @@ -0,0 +1,13 @@ +""" +Autograd-specific edge case tests. + +These tests focus on the autograd module's internal behavior: +- Broadcasting in gradients (common bug source) +- Computation graph construction +- Numerical stability in backward pass +- Memory management in gradient accumulation +- Edge cases students encounter + +Complements the inline tests in the autograd module with +focused edge case validation. +""" diff --git a/tests/README.md b/tests/README.md index 15b95194..d15e1ee3 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,67 +1,99 @@ -# 🧪 TinyTorch Integration Tests +# TinyTorch Test Suite -## ⚠️ **CRITICAL DIRECTORY - DO NOT DELETE** +Comprehensive testing organized by purpose and scope. -This directory contains **17 integration test files** that verify cross-module functionality across the entire TinyTorch system. These tests represent significant development effort and are essential for: +## Test Organization -- **Module integration validation** -- **Cross-component compatibility** -- **Real-world ML pipeline testing** -- **System-level regression detection** +### 📦 Module Tests (`XX_modulename/`) +**Purpose**: Test individual module functionality +**Scope**: Single module, isolated behavior +**Example**: `01_tensor/test_progressive_integration.py` -## 📁 **Test Structure** -- `test_*_integration.py` - Cross-module integration tests -- `test_utils.py` - Shared testing utilities -- `test_integration_report.md` - Test documentation +These tests validate that each module works correctly in isolation. -## 🧪 **Integration Test Coverage** +### 🔗 Integration Tests (`integration/`) +**Purpose**: Test cross-module interactions +**Scope**: Multiple modules working together +**Files**: +- `test_gradient_flow.py` - **CRITICAL**: Validates gradients flow through entire training stack +- `test_end_to_end_training.py` - Full training loops (TODO) +- `test_module_compatibility.py` - Module interfaces (TODO) -### Foundation Integration -- `test_tensor_activations_integration.py` - Tensor + Activations -- `test_layers_networks_integration.py` - Layers + Dense Networks -- `test_tensor_autograd_integration.py` - Tensor + Autograd +**Why this matters**: +- Catches bugs that unit tests miss +- Validates the "seams" between modules +- Ensures training actually works end-to-end -### Architecture Integration -- `test_tensor_attention_integration.py` - **NEW**: Tensor + Attention mechanisms -- `test_attention_pipeline_integration.py` - **NEW**: Complete transformer-like pipelines -- `test_tensor_cnn_integration.py` - Tensor + Spatial/CNN -- `test_cnn_networks_integration.py` - Spatial + Dense Networks -- `test_cnn_pipeline_integration.py` - Complete CNN pipelines +### 🐛 Debugging Tests (`debugging/`) +**Purpose**: Catch common student pitfalls +**Scope**: Pedagogical - teaches debugging +**Files**: +- `test_gradient_vanishing.py` - Detect/diagnose vanishing gradients (TODO) +- `test_gradient_explosion.py` - Detect/diagnose exploding gradients (TODO) +- `test_common_mistakes.py` - "Did you forget backward()?" style tests (TODO) -### Training & Data Integration -- `test_dataloader_tensor_integration.py` - DataLoader + Tensor -- `test_training_integration.py` - Complete training workflows -- `test_ml_pipeline_integration.py` - End-to-end ML pipelines +**Philosophy**: When these tests fail, the error message should teach the student what went wrong and how to fix it. -### Inference Serving Integration -- `test_compression_integration.py` - Model compression -- `test_kernels_integration.py` - Custom operations -- `test_benchmarking_integration.py` - Performance measurement -- `test_mlops_integration.py` - Deployment and serving +### ⚡ Autograd Edge Cases (`05_autograd/`) +**Purpose**: Stress-test autograd system +**Scope**: Autograd internals and edge cases +**Files**: +- `test_broadcasting.py` - Broadcasting gradient bugs (TODO) +- `test_computation_graph.py` - Graph construction edge cases (TODO) +- `test_backward_edge_cases.py` - Numerical stability, etc. (TODO) -## 🔧 **Usage** +## Running Tests + +### All tests ```bash -# Run all integration tests pytest tests/ -v - -# Run specific module integration -pytest tests/test_tensor_attention_integration.py -v -pytest tests/test_attention_pipeline_integration.py -v - -# Run attention-related tests -pytest tests/ -k "attention" -v ``` -## 🚨 **Recovery Instructions** -If accidentally deleted: +### Integration tests only (recommended for debugging training issues) ```bash -git checkout HEAD -- tests/ -git status # Verify recovery +pytest tests/integration/ -v ``` -## 📊 **Test Coverage** -These integration tests complement the inline tests in each module's `*_dev.py` files, providing comprehensive system validation with focus on: -- **Real component integration** (not mocks) -- **Cross-module compatibility** -- **Realistic ML workflows** (classification, seq2seq, transformers) -- **Performance and scalability** \ No newline at end of file +### Specific test +```bash +pytest tests/integration/test_gradient_flow.py -v +``` + +### Run without pytest +```bash +python tests/integration/test_gradient_flow.py +``` + +## Test Philosophy + +1. **Integration tests catch real bugs**: The gradient flow test caught the exact bugs that prevented training +2. **Descriptive names**: Test names should explain what they test +3. **Good error messages**: When tests fail, students should understand why +4. **Pedagogical value**: Tests teach correct usage patterns + +## Adding New Tests + +When adding a test, ask: +- **Is it testing one module?** → Put in `XX_modulename/` +- **Is it testing modules working together?** → Put in `integration/` +- **Is it teaching debugging?** → Put in `debugging/` +- **Is it an autograd edge case?** → Put in `05_autograd/` + +## Most Important Tests + +🔥 **Must pass before merging**: +- `integration/test_gradient_flow.py` - If this fails, training is broken + +📚 **Module validation**: +- Each module's inline tests (in `modules/source/`) +- Module-specific tests in `tests/XX_modulename/` + +## Test Coverage Goals + +- ✅ All tensor operations have gradient tests +- ✅ All layers compute gradients correctly +- ✅ All activations integrate with autograd +- ✅ All loss functions compute gradients +- ✅ All optimizers update parameters +- ⏳ End-to-end training converges (TODO) +- ⏳ Common pitfalls are detected (TODO) \ No newline at end of file diff --git a/tests/debugging/__init__.py b/tests/debugging/__init__.py new file mode 100644 index 00000000..6944f771 --- /dev/null +++ b/tests/debugging/__init__.py @@ -0,0 +1,12 @@ +""" +Debugging tests for common student pitfalls. + +These tests identify and diagnose common issues students encounter: +- Vanishing gradients (ReLU dying, sigmoid saturation) +- Exploding gradients (unstable initialization) +- Silent failures (forgot backward(), forgot zero_grad()) +- Common mistakes (wrong loss function, learning rate issues) + +Goal: When a test fails, the error message should guide students +to the solution. These are pedagogical tests that teach debugging. +""" diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py index b83f5731..42337371 100644 --- a/tests/integration/__init__.py +++ b/tests/integration/__init__.py @@ -1,12 +1,14 @@ """ -Integration tests for TinyTorch modules. +Integration tests for TinyTorch. -These tests verify that individual modules integrate correctly with the package: -- Export correctly to the package -- Can be imported without errors -- Basic functionality works -- Don't conflict with other modules +These tests validate that multiple modules work together correctly. +They catch issues that unit tests miss, like: +- Gradient flow through entire training pipelines +- Module compatibility and interface contracts +- End-to-end training scenarios -This is different from checkpoint tests which validate complete capabilities. -Integration tests are quick validation that runs after every module completion. +Critical for catching bugs like: +- Missing autograd integration +- Shape mismatches in broadcasting +- Optimizer parameter updates """ \ No newline at end of file diff --git a/tests/integration/test_gradient_flow.py b/tests/integration/test_gradient_flow.py new file mode 100644 index 00000000..68f8c44e --- /dev/null +++ b/tests/integration/test_gradient_flow.py @@ -0,0 +1,472 @@ +#!/usr/bin/env python3 +""" +Comprehensive gradient flow testing for TinyTorch. + +This test suite systematically validates that gradients propagate correctly +through all components of the training stack. + +Run with: pytest tests/test_gradient_flow.py -v +Or directly: python tests/test_gradient_flow.py +""" + +import numpy as np +import sys +import os + +# Add project root to path +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from tinytorch import Tensor, Linear, Dropout +from tinytorch import Sigmoid, ReLU, Tanh, GELU, Softmax +from tinytorch import MSELoss, CrossEntropyLoss, BinaryCrossEntropyLoss +from tinytorch import SGD, AdamW + + +class TestBasicTensorGradients: + """Test gradient computation for basic tensor operations.""" + + def test_multiplication_gradient(self): + """Test gradient flow through multiplication.""" + x = Tensor([[1.0, 2.0]], requires_grad=True) + y = x * 3 + loss = y.sum() + + loss.backward() + + # dy/dx = 3 + assert x.grad is not None, "Gradient should be computed" + assert np.allclose(x.grad, [[3.0, 3.0]]), f"Expected [[3, 3]], got {x.grad}" + + def test_addition_gradient(self): + """Test gradient flow through addition.""" + x = Tensor([[1.0, 2.0]], requires_grad=True) + y = Tensor([[3.0, 4.0]], requires_grad=True) + z = x + y + loss = z.sum() + + loss.backward() + + # dz/dx = 1, dz/dy = 1 + assert np.allclose(x.grad, [[1.0, 1.0]]), f"x.grad: {x.grad}" + assert np.allclose(y.grad, [[1.0, 1.0]]), f"y.grad: {y.grad}" + + def test_chain_rule(self): + """Test gradient flow through chain of operations.""" + x = Tensor([[2.0]], requires_grad=True) + y = x * 3 # y = 3x + z = y + 1 # z = 3x + 1 + w = z * 2 # w = 2(3x + 1) = 6x + 2 + + w.backward() + + # dw/dx = 6 + assert np.allclose(x.grad, [[6.0]]), f"Expected [[6]], got {x.grad}" + + def test_matmul_gradient(self): + """Test gradient flow through matrix multiplication.""" + x = Tensor([[1.0, 2.0]], requires_grad=True) + W = Tensor([[1.0], [2.0]], requires_grad=True) + y = x.matmul(W) # y = [[5.0]] + + y.backward() + + # dy/dx = W^T = [[1, 2]] + # dy/dW = x^T = [[1], [2]] + assert np.allclose(x.grad, [[1.0, 2.0]]), f"x.grad: {x.grad}" + assert np.allclose(W.grad, [[1.0], [2.0]]), f"W.grad: {W.grad}" + + def test_broadcasting_gradient(self): + """Test gradient flow with broadcasting (e.g., bias addition).""" + x = Tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True) # (2, 2) + bias = Tensor([1.0, 2.0], requires_grad=True) # (2,) + y = x + bias # Broadcasting happens + loss = y.sum() + + loss.backward() + + # Gradient should sum over broadcast dimension + assert x.grad.shape == (2, 2), f"x.grad shape: {x.grad.shape}" + assert bias.grad.shape == (2,), f"bias.grad shape: {bias.grad.shape}" + assert np.allclose(bias.grad, [2.0, 2.0]), f"bias.grad: {bias.grad}" + + +class TestLayerGradients: + """Test gradient computation through neural network layers.""" + + def test_linear_layer_gradients(self): + """Test gradient flow through Linear layer.""" + layer = Linear(2, 3) + x = Tensor([[1.0, 2.0]], requires_grad=True) + + w_before = layer.weight.data.copy() + b_before = layer.bias.data.copy() + + out = layer(x) + loss = out.sum() + loss.backward() + + # All gradients should exist + assert layer.weight.grad is not None, "Weight gradient missing" + assert layer.bias.grad is not None, "Bias gradient missing" + assert x.grad is not None, "Input gradient missing" + + # Gradient shapes should match parameter shapes + assert layer.weight.grad.shape == layer.weight.shape + assert layer.bias.grad.shape == layer.bias.shape + + def test_multi_layer_gradients(self): + """Test gradient flow through multiple layers.""" + layer1 = Linear(2, 3) + layer2 = Linear(3, 1) + + x = Tensor([[1.0, 2.0]], requires_grad=True) + + h = layer1(x) + out = layer2(h) + loss = out.sum() + + loss.backward() + + # All layers should have gradients + assert layer1.weight.grad is not None + assert layer1.bias.grad is not None + assert layer2.weight.grad is not None + assert layer2.bias.grad is not None + + +class TestActivationGradients: + """Test gradient computation through activation functions.""" + + def test_sigmoid_gradient(self): + """Test gradient flow through Sigmoid.""" + x = Tensor([[0.0, 1.0, -1.0]], requires_grad=True) + sigmoid = Sigmoid() + + y = sigmoid(x) + loss = y.sum() + loss.backward() + + assert x.grad is not None, "Sigmoid gradient missing" + # Sigmoid gradient: σ'(x) = σ(x)(1 - σ(x)) + # At x=0: σ(0) = 0.5, σ'(0) = 0.25 + assert x.grad[0, 0] > 0, "Gradient should be positive" + + def test_relu_gradient(self): + """Test gradient flow through ReLU.""" + x = Tensor([[-1.0, 0.0, 1.0]], requires_grad=True) + relu = ReLU() + + y = relu(x) + loss = y.sum() + loss.backward() + + # ReLU gradient: 1 if x > 0, else 0 + # Note: We haven't implemented ReLU backward yet, so this will fail + # TODO: Implement ReLU backward in autograd + + def test_tanh_gradient(self): + """Test gradient flow through Tanh.""" + x = Tensor([[0.0, 1.0]], requires_grad=True) + tanh = Tanh() + + y = tanh(x) + loss = y.sum() + + # TODO: Implement Tanh backward + # loss.backward() + + +class TestLossGradients: + """Test gradient computation through loss functions.""" + + def test_bce_gradient(self): + """Test gradient flow through Binary Cross-Entropy.""" + predictions = Tensor([[0.7, 0.3, 0.9]], requires_grad=True) + targets = Tensor([[1.0, 0.0, 1.0]]) + + loss_fn = BinaryCrossEntropyLoss() + loss = loss_fn(predictions, targets) + + loss.backward() + + assert predictions.grad is not None, "BCE gradient missing" + assert predictions.grad.shape == predictions.shape + # Gradient should be negative for correct predictions + assert predictions.grad[0, 0] < 0, "Gradient sign incorrect" + + def test_mse_gradient(self): + """Test gradient flow through MSE loss.""" + predictions = Tensor([[1.0, 2.0, 3.0]], requires_grad=True) + targets = Tensor([[2.0, 2.0, 2.0]]) + + loss_fn = MSELoss() + loss = loss_fn(predictions, targets) + + # TODO: Implement MSE backward + # loss.backward() + + +class TestOptimizerIntegration: + """Test optimizer integration with gradient flow.""" + + def test_sgd_updates_parameters(self): + """Test that SGD actually updates parameters.""" + layer = Linear(2, 1) + optimizer = SGD(layer.parameters(), lr=0.1) + + w_before = layer.weight.data.copy() + b_before = layer.bias.data.copy() + + # Forward pass + x = Tensor([[1.0, 2.0]], requires_grad=True) + out = layer(x) + loss = out.sum() + + # Backward pass + loss.backward() + + # Optimizer step + optimizer.step() + + # Parameters should change + assert not np.allclose(layer.weight.data, w_before), "Weights didn't update" + assert not np.allclose(layer.bias.data, b_before), "Bias didn't update" + + def test_zero_grad_clears_gradients(self): + """Test that zero_grad() clears gradients.""" + layer = Linear(2, 1) + optimizer = SGD(layer.parameters(), lr=0.1) + + # First backward pass + x = Tensor([[1.0, 2.0]]) + out = layer(x) + loss = out.sum() + loss.backward() + + assert layer.weight.grad is not None, "Gradient should exist" + + # Clear gradients + optimizer.zero_grad() + + assert layer.weight.grad is None, "Gradient should be cleared" + assert layer.bias.grad is None, "Bias gradient should be cleared" + + def test_adamw_updates_parameters(self): + """Test that AdamW optimizer works.""" + layer = Linear(2, 1) + optimizer = AdamW(layer.parameters(), lr=0.01) + + w_before = layer.weight.data.copy() + + x = Tensor([[1.0, 2.0]]) + out = layer(x) + loss = out.sum() + loss.backward() + optimizer.step() + + assert not np.allclose(layer.weight.data, w_before), "AdamW didn't update weights" + + +class TestFullTrainingLoop: + """Test complete training scenarios.""" + + def test_simple_convergence(self): + """Test that a simple model can learn.""" + # Simple task: learn to output 5 from input [1, 2] + layer = Linear(2, 1) + optimizer = SGD(layer.parameters(), lr=0.1) + loss_fn = MSELoss() + + x = Tensor([[1.0, 2.0]]) + target = Tensor([[5.0]]) + + initial_loss = None + final_loss = None + + # Train for a few iterations + for i in range(50): + # Forward + pred = layer(x) + loss = loss_fn(pred, target) + + if i == 0: + initial_loss = loss.data + if i == 49: + final_loss = loss.data + + # Backward + loss.backward() + + # Update + optimizer.step() + optimizer.zero_grad() + + # Loss should decrease + assert final_loss < initial_loss, f"Loss didn't decrease: {initial_loss} → {final_loss}" + + def test_binary_classification(self): + """Test binary classification training.""" + layer = Linear(2, 1) + sigmoid = Sigmoid() + loss_fn = BinaryCrossEntropyLoss() + optimizer = SGD(layer.parameters(), lr=0.1) + + # Simple dataset: [1, 1] → 1, [0, 0] → 0 + X = Tensor([[1.0, 1.0], [0.0, 0.0]]) + y = Tensor([[1.0], [0.0]]) + + initial_loss = None + final_loss = None + + for i in range(50): + # Forward + logits = layer(X) + probs = sigmoid(logits) + loss = loss_fn(probs, y) + + if i == 0: + initial_loss = loss.data + if i == 49: + final_loss = loss.data + + # Backward + loss.backward() + + # Update + optimizer.step() + optimizer.zero_grad() + + assert final_loss < initial_loss, "Binary classification didn't learn" + + +class TestEdgeCases: + """Test edge cases and potential failure modes.""" + + def test_zero_gradient(self): + """Test that zero gradients don't break training.""" + x = Tensor([[0.0, 0.0]], requires_grad=True) + y = x * 0 + loss = y.sum() + + loss.backward() + + assert x.grad is not None + assert np.allclose(x.grad, [[0.0, 0.0]]) + + def test_very_small_values(self): + """Test gradient flow with very small values.""" + x = Tensor([[1e-8, 1e-8]], requires_grad=True) + y = x * 2 + loss = y.sum() + + loss.backward() + + assert x.grad is not None + assert np.allclose(x.grad, [[2.0, 2.0]]) + + def test_gradient_accumulation(self): + """Test that gradients accumulate correctly across multiple backward passes.""" + x = Tensor([[1.0]], requires_grad=True) + + # First backward + y1 = x * 2 + y1.backward() + grad_after_first = x.grad.copy() + + # Second backward (without zero_grad) + y2 = x * 3 + y2.backward() + + # Gradient should accumulate: 2 + 3 = 5 + expected = grad_after_first + np.array([[3.0]]) + assert np.allclose(x.grad, expected), f"Expected {expected}, got {x.grad}" + + +def run_all_tests(): + """Run all tests and print results.""" + import inspect + + test_classes = [ + TestBasicTensorGradients, + TestLayerGradients, + TestActivationGradients, + TestLossGradients, + TestOptimizerIntegration, + TestFullTrainingLoop, + TestEdgeCases, + ] + + total_tests = 0 + passed_tests = 0 + failed_tests = [] + skipped_tests = [] + + print("=" * 80) + print("🧪 TINYTORCH GRADIENT FLOW TEST SUITE") + print("=" * 80) + + for test_class in test_classes: + print(f"\n{'=' * 80}") + print(f"📦 {test_class.__name__}") + print(f"{'=' * 80}") + + instance = test_class() + methods = [m for m in dir(instance) if m.startswith('test_')] + + for method_name in methods: + total_tests += 1 + method = getattr(instance, method_name) + + # Get docstring + doc = method.__doc__ or method_name + doc = doc.strip().split('\n')[0] + + print(f"\n {method_name}") + print(f" {doc}") + + try: + method() + print(f" ✅ PASSED") + passed_tests += 1 + except NotImplementedError as e: + print(f" ⏭️ SKIPPED: {e}") + skipped_tests.append((test_class.__name__, method_name, str(e))) + except AssertionError as e: + print(f" ❌ FAILED: {e}") + failed_tests.append((test_class.__name__, method_name, str(e))) + except Exception as e: + print(f" ❌ ERROR: {e}") + failed_tests.append((test_class.__name__, method_name, str(e))) + + # Summary + print("\n" + "=" * 80) + print("📊 TEST SUMMARY") + print("=" * 80) + print(f"Total tests: {total_tests}") + print(f"✅ Passed: {passed_tests}") + print(f"❌ Failed: {len(failed_tests)}") + print(f"⏭️ Skipped: {len(skipped_tests)}") + + if failed_tests: + print("\n" + "=" * 80) + print("❌ FAILED TESTS:") + print("=" * 80) + for class_name, method_name, error in failed_tests: + print(f"\n {class_name}.{method_name}") + print(f" {error}") + + if skipped_tests: + print("\n" + "=" * 80) + print("⏭️ SKIPPED TESTS (Not Yet Implemented):") + print("=" * 80) + for class_name, method_name, reason in skipped_tests: + print(f" {class_name}.{method_name}") + + print("\n" + "=" * 80) + + return len(failed_tests) == 0 + + +if __name__ == "__main__": + success = run_all_tests() + sys.exit(0 if success else 1) diff --git a/tinytorch/core/autograd.py b/tinytorch/core/autograd.py index fcf7e121..dbad17cb 100644 --- a/tinytorch/core/autograd.py +++ b/tinytorch/core/autograd.py @@ -15,8 +15,8 @@ # ║ happens! The tinytorch/ directory is just the compiled output. ║ # ╚═══════════════════════════════════════════════════════════════════════════════╝ # %% auto 0 -__all__ = ['Function', 'AddBackward', 'MulBackward', 'MatmulBackward', 'SumBackward', 'SigmoidBackward', 'BCEBackward', - 'enable_autograd'] +__all__ = ['Function', 'AddBackward', 'MulBackward', 'MatmulBackward', 'SumBackward', 'SigmoidBackward', 'MSEBackward', + 'BCEBackward', 'enable_autograd'] # %% ../../modules/source/05_autograd/autograd_dev.ipynb 1 import numpy as np @@ -271,6 +271,32 @@ class SigmoidBackward(Function): return None, # %% ../../modules/source/05_autograd/autograd_dev.ipynb 21 +class MSEBackward(Function): + """ + Gradient computation for Mean Squared Error Loss. + + MSE: L = mean((predictions - targets)²) + Derivative: ∂L/∂predictions = 2 * (predictions - targets) / N + """ + + def __init__(self, predictions, targets): + """Initialize with predictions and targets.""" + super().__init__(predictions) + self.targets_data = targets.data + self.num_samples = np.size(targets.data) + + def apply(self, grad_output): + """Compute gradient for MSE loss.""" + predictions, = self.saved_tensors + + if isinstance(predictions, Tensor) and predictions.requires_grad: + # Gradient: 2 * (predictions - targets) / N + grad = 2.0 * (predictions.data - self.targets_data) / self.num_samples + + return grad * grad_output, + return None, + +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 22 class BCEBackward(Function): """ Gradient computation for Binary Cross-Entropy Loss. @@ -300,7 +326,7 @@ class BCEBackward(Function): return grad * grad_output, return None, -# %% ../../modules/source/05_autograd/autograd_dev.ipynb 22 +# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23 def enable_autograd(): """ Enable gradient tracking for all Tensor operations. @@ -502,11 +528,12 @@ def enable_autograd(): # Patch activations and losses to track gradients try: from tinytorch.core.activations import Sigmoid - from tinytorch.core.losses import BinaryCrossEntropyLoss + from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss # Store original methods _original_sigmoid_forward = Sigmoid.forward _original_bce_forward = BinaryCrossEntropyLoss.forward + _original_mse_forward = MSELoss.forward def tracked_sigmoid_forward(self, x): """Sigmoid with gradient tracking.""" @@ -537,9 +564,25 @@ def enable_autograd(): return result + def tracked_mse_forward(self, predictions, targets): + """MSE loss with gradient tracking.""" + # Compute MSE loss + diff = predictions.data - targets.data + squared_diff = diff ** 2 + mse = np.mean(squared_diff) + + result = Tensor(mse) + + if predictions.requires_grad: + result.requires_grad = True + result._grad_fn = MSEBackward(predictions, targets) + + return result + # Install patched methods Sigmoid.forward = tracked_sigmoid_forward BinaryCrossEntropyLoss.forward = tracked_bce_forward + MSELoss.forward = tracked_mse_forward except ImportError: # Activations/losses not yet available (happens during module development)