mirror of https://github.com/MLSysBook/TinyTorch.git (synced 2026-03-25 07:19:40 -05:00)
Test Fixes (external pytest tests, all passing):
- Module 03: reverted .weights for test helper classes
- Module 08: fixed DataLoader data format (tuple → list(zip()))
- Module 10: use CharTokenizer instead of the abstract Tokenizer
- Module 15: fixed KVCache constructor args and seq_len
- Module 19: fixed Benchmark constructor args

Tito CLI Improvements:
- Added a module name resolver: "15" → "15_quantization" (see the sketch below)
- Added .ipynb file support in _get_dev_file_path()
- Added notebook-to-Python conversion using jupytext
- Inline tests now execute notebooks correctly

Results:
- External tests: 36/36 passing (100%)
- Tito inline tests: 15/20 passing (75%)
- Remaining failures are module code bugs, not test framework issues
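A minimal sketch of the name resolution and notebook conversion described above. The helper names and matching behavior here are assumptions for illustration, not the actual Tito implementation:

    import jupytext

    def resolve_module_name(query: str, module_names: list[str]) -> str | None:
        """Hypothetical resolver: map a bare number like "15" to "15_quantization"."""
        if query in module_names:  # already a full module name
            return query
        # match on the zero-padded numeric prefix, e.g. "3" -> "03_layers"
        matches = [m for m in module_names if m.split("_", 1)[0] == query.zfill(2)]
        return matches[0] if len(matches) == 1 else None

    def notebook_to_python(ipynb_path: str) -> str:
        """Convert a notebook to Python script text, e.g. for running inline tests."""
        notebook = jupytext.read(ipynb_path)
        return jupytext.writes(notebook, fmt="py:percent")

    resolve_module_name("15", ["03_layers", "15_quantization"])  # -> "15_quantization"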
188 lines
6.1 KiB
Python
"""
|
|
Test gradient flow through all autograd operations.
|
|
|
|
This test suite validates that all arithmetic operations and activations
|
|
properly preserve gradient tracking and enable backpropagation.
|
|
"""
|
|
|
|
import numpy as np
|
|
import pytest
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
|
|
|
from tinytorch.core.tensor import Tensor
|
|
from tinytorch.core.autograd import enable_autograd
|
|
from tinytorch.core.activations import GELU
|
|
# Import transformer to ensure mean/sqrt monkey-patches are applied
|
|
from tinytorch.models import transformer


def test_arithmetic_gradient_flow():
    """Test that arithmetic operations preserve requires_grad and set _grad_fn."""
    print("Testing arithmetic gradient flow...")

    x = Tensor(np.array([2.0, 3.0]), requires_grad=True)
    y = Tensor(np.array([4.0, 5.0]), requires_grad=True)

    # Test addition
    z_add = x + y
    assert z_add.requires_grad, "Addition should preserve requires_grad"
    assert hasattr(z_add, '_grad_fn'), "Addition should set _grad_fn"

    # Test subtraction
    z_sub = x - y
    assert z_sub.requires_grad, "Subtraction should preserve requires_grad"
    assert hasattr(z_sub, '_grad_fn'), "Subtraction should set _grad_fn"

    # Test multiplication
    z_mul = x * y
    assert z_mul.requires_grad, "Multiplication should preserve requires_grad"
    assert hasattr(z_mul, '_grad_fn'), "Multiplication should set _grad_fn"

    # Test division
    z_div = x / y
    assert z_div.requires_grad, "Division should preserve requires_grad"
    assert hasattr(z_div, '_grad_fn'), "Division should set _grad_fn"

    print("✅ All arithmetic operations preserve gradient tracking")


def test_subtraction_backward():
    """Test that subtraction computes correct gradients."""
    print("Testing subtraction backward pass...")

    a = Tensor(np.array([5.0, 10.0]), requires_grad=True)
    b = Tensor(np.array([2.0, 3.0]), requires_grad=True)

    # Forward: c = a - b
    c = a - b

    # Backward
    loss = c.sum()
    loss.backward()

    # Check gradients: ∂loss/∂a = 1, ∂loss/∂b = -1
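    # (loss = Σ(a_i - b_i), so each element of a contributes +1 and each
    # element of b contributes -1 to the loss)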
    assert a.grad is not None, "Gradient should flow to a"
    assert b.grad is not None, "Gradient should flow to b"
    assert np.allclose(a.grad, np.array([1.0, 1.0])), "Gradient wrt a should be 1"
    assert np.allclose(b.grad, np.array([-1.0, -1.0])), "Gradient wrt b should be -1"

    print("✅ Subtraction backward pass correct")


def test_division_backward():
    """Test that division computes correct gradients."""
    print("Testing division backward pass...")

    a = Tensor(np.array([6.0, 12.0]), requires_grad=True)
    b = Tensor(np.array([2.0, 3.0]), requires_grad=True)

    # Forward: c = a / b
    c = a / b

    # Backward
    loss = c.sum()
    loss.backward()

    # Check gradients: ∂(a/b)/∂a = 1/b, ∂(a/b)/∂b = -a/b²
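    # (worked example: with a = [6, 12] and b = [2, 3], 1/b = [0.5, 0.333...]
    # and -a/b² = [-6/4, -12/9] = [-1.5, -1.333...])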
    assert a.grad is not None, "Gradient should flow to a"
    assert b.grad is not None, "Gradient should flow to b"
    assert np.allclose(a.grad, 1.0 / b.data), "Gradient wrt a should be 1/b"
    expected_b_grad = -a.data / (b.data ** 2)
    assert np.allclose(b.grad, expected_b_grad), "Gradient wrt b should be -a/b²"

    print("✅ Division backward pass correct")


def test_gelu_gradient_flow():
    """Test that GELU activation preserves gradient flow."""
    print("Testing GELU gradient flow...")

    x = Tensor(np.array([1.0, 2.0, 3.0]), requires_grad=True)
    gelu = GELU()

    # Forward
    y = gelu(x)
    assert y.requires_grad, "GELU output should have requires_grad=True"
    assert hasattr(y, '_grad_fn'), "GELU should set _grad_fn"

    # Backward
    loss = y.sum()
    loss.backward()
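    # GELU(x) = x·Φ(x), so GELU'(x) = Φ(x) + x·φ(x); for x > 0 both terms are
    # positive, so the gradient at [1.0, 2.0, 3.0] should be clearly non-zero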
    assert x.grad is not None, "Gradient should flow through GELU"
    assert np.abs(x.grad).max() > 1e-10, "GELU gradient should be non-zero"

    print("✅ GELU gradient flow works correctly")


def test_layernorm_operations():
    """Test gradient flow through LayerNorm operations (sqrt, mean)."""
    print("Testing LayerNorm operations gradient flow...")

    # Test sqrt (monkey-patched in transformer module)
    x = Tensor(np.array([4.0, 9.0, 16.0]), requires_grad=True)

    # Check if sqrt is available (added in later modules)
    if not hasattr(x, 'sqrt'):
        pytest.skip("sqrt operation not yet implemented (requires transformer module)")

    sqrt_x = x.sqrt()
    assert sqrt_x.requires_grad, "Sqrt should preserve requires_grad"
    loss = sqrt_x.sum()
    loss.backward()
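    # d/dx √x = 1/(2√x), so for x = [4, 9, 16] the grads are [0.25, 1/6, 0.125]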
    assert x.grad is not None, "Gradient should flow through sqrt"

    # Test mean (should be available)
    x2 = Tensor(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), requires_grad=True)
    if hasattr(x2, 'mean'):
        mean = x2.mean(axis=-1, keepdims=True)
        # Mean uses monkey-patched version in transformer context
        assert mean.requires_grad, "Mean should preserve requires_grad"
        loss2 = mean.sum()
        loss2.backward()
        assert x2.grad is not None, "Gradient should flow through mean"

    print("✅ LayerNorm operations gradient flow works")


def test_reshape_gradient_flow():
    """Test that reshape preserves gradient flow."""
    print("Testing reshape gradient flow...")

    x = Tensor(np.array([[1.0, 2.0], [3.0, 4.0]]), requires_grad=True)
    y = x.reshape(4)

    assert y.requires_grad, "Reshape should preserve requires_grad"
    assert hasattr(y, '_grad_fn'), "Reshape should set _grad_fn"

    # Backward
    loss = y.sum()
    loss.backward()
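    # reshape's backward simply reshapes the upstream gradient back to the
    # input's shape, so x.grad should be a 2×2 array of ones here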
    assert x.grad is not None, "Gradient should flow through reshape"
    assert x.grad.shape == x.shape, "Gradient shape should match input shape"

    print("✅ Reshape gradient flow works correctly")


if __name__ == "__main__":
    print("\n" + "=" * 70)
    print("GRADIENT FLOW TEST SUITE")
    print("=" * 70 + "\n")

    test_arithmetic_gradient_flow()
    test_subtraction_backward()
    test_division_backward()
    test_gelu_gradient_flow()
    test_layernorm_operations()
    test_reshape_gradient_flow()

    print("\n" + "=" * 70)
    print("✅ ALL GRADIENT FLOW TESTS PASSED")
    print("=" * 70 + "\n")