Mirror of https://github.com/MLSysBook/TinyTorch.git, synced 2026-04-28 02:19:10 -05:00
fix(module-01): Fix batched matmul and transpose grad preservation

- Change np.dot to np.matmul for proper batched 3D tensor multiplication
- Add requires_grad preservation in transpose() operation
- Fixes attention mechanism gradient flow issues

Regression tests added in tests/regression/test_gradient_flow_fixes.py
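For context, a minimal NumPy-only sketch of the shape bug behind the matmul change (stock NumPy only; the shapes mirror the attention pattern exercised by the regression tests below):

    import numpy as np

    # Q @ K^T as it appears in attention: (batch, seq, d_k) @ (batch, d_k, seq)
    Q = np.random.randn(2, 4, 8)
    K_T = np.random.randn(2, 8, 4)

    # np.dot contracts the last axis of Q against the second-to-last axis of
    # K_T and stacks all remaining axes, mixing the batches together.
    print(np.dot(Q, K_T).shape)     # (2, 4, 2, 4) -- wrong, cross-batch products
    # np.matmul treats leading dimensions as a batch and multiplies per batch.
    print(np.matmul(Q, K_T).shape)  # (2, 4, 4)    -- correct attention scores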
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "c2a3655c",
+   "id": "e991dad5",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -51,7 +51,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "489aaf93",
+   "id": "bed71914",
    "metadata": {
     "nbgrader": {
      "grade": false,
@@ -69,7 +69,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "3fdd485d",
+   "id": "25222aa1",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -116,7 +116,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "74228e6a",
+   "id": "2cd44f52",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -175,7 +175,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "9f901d71",
+   "id": "852b2eb6",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -214,7 +214,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "10e62754",
+   "id": "79fe2a61",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -252,7 +252,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "7b2d1c7c",
+   "id": "ea76431d",
    "metadata": {
     "lines_to_next_cell": 1,
     "nbgrader": {
@@ -356,21 +356,10 @@
     "        ### BEGIN SOLUTION\n",
     "        if isinstance(other, Tensor):\n",
-    "            # Tensor + Tensor: let NumPy handle broadcasting\n",
-    "            result_data = self.data + other.data\n",
+    "            return Tensor(self.data + other.data)\n",
     "        else:\n",
-    "            # Tensor + scalar: NumPy broadcasts automatically\n",
-    "            result_data = self.data + other\n",
-    "\n",
-    "        # Create new tensor with result\n",
-    "        result = Tensor(result_data)\n",
-    "\n",
-    "        # Preserve gradient tracking if either operand requires gradients\n",
-    "        if hasattr(self, 'requires_grad') and hasattr(other, 'requires_grad'):\n",
-    "            result.requires_grad = self.requires_grad or (isinstance(other, Tensor) and other.requires_grad)\n",
-    "        elif hasattr(self, 'requires_grad'):\n",
-    "            result.requires_grad = self.requires_grad\n",
-    "\n",
-    "        return result\n",
+    "            return Tensor(self.data + other)\n",
     "        ### END SOLUTION\n",
     "\n",
     "    # nbgrader={\"grade\": false, \"grade_id\": \"more-arithmetic\", \"solution\": true}\n",
@@ -380,10 +369,12 @@
     "\n",
     "        Common use: Centering data (x - mean), computing differences for loss functions.\n",
     "        \"\"\"\n",
     "        ### BEGIN SOLUTION\n",
     "        if isinstance(other, Tensor):\n",
     "            return Tensor(self.data - other.data)\n",
     "        else:\n",
     "            return Tensor(self.data - other)\n",
     "        ### END SOLUTION\n",
     "\n",
     "    def __mul__(self, other):\n",
     "        \"\"\"\n",
@@ -392,10 +383,12 @@
     "        Common use: Scaling features, applying masks, gating mechanisms in neural networks.\n",
     "        Note: This is * operator, not @ (which will be matrix multiplication).\n",
     "        \"\"\"\n",
     "        ### BEGIN SOLUTION\n",
     "        if isinstance(other, Tensor):\n",
     "            return Tensor(self.data * other.data)\n",
     "        else:\n",
     "            return Tensor(self.data * other)\n",
     "        ### END SOLUTION\n",
     "\n",
     "    def __truediv__(self, other):\n",
     "        \"\"\"\n",
@@ -403,7 +396,12 @@
     "\n",
     "        Common use: Normalization (x / std), converting counts to probabilities.\n",
     "        \"\"\"\n",
     "        ### BEGIN SOLUTION\n",
     "        if isinstance(other, Tensor):\n",
     "            return Tensor(self.data / other.data)\n",
     "        else:\n",
     "            return Tensor(self.data / other)\n",
     "        ### END SOLUTION\n",
     "\n",
     "    # nbgrader={\"grade\": false, \"grade_id\": \"matmul-impl\", \"solution\": true}\n",
     "    def matmul(self, other):\n",
@@ -475,7 +470,8 @@
     "            )\n",
     "\n",
     "        # Perform optimized matrix multiplication\n",
-    "        result_data = np.dot(self.data, other.data)\n",
+    "        # Use np.matmul (not np.dot) for proper batched matrix multiplication with 3D+ tensors\n",
+    "        result_data = np.matmul(self.data, other.data)\n",
     "        return Tensor(result_data)\n",
     "        ### END SOLUTION\n",
     "\n",
@@ -547,7 +543,9 @@
     "\n",
     "        # Reshape the data (NumPy handles the memory layout efficiently)\n",
     "        reshaped_data = np.reshape(self.data, new_shape)\n",
-    "        return Tensor(reshaped_data)\n",
+    "        # Preserve gradient tracking from the original tensor (important for autograd!)\n",
+    "        result = Tensor(reshaped_data, requires_grad=self.requires_grad)\n",
+    "        return result\n",
     "        ### END SOLUTION\n",
     "\n",
     "    def transpose(self, dim0=None, dim1=None):\n",
@@ -613,7 +611,9 @@
     "        axes[dim0], axes[dim1] = axes[dim1], axes[dim0]\n",
     "        transposed_data = np.transpose(self.data, axes)\n",
     "\n",
-    "        return Tensor(transposed_data)\n",
+    "        # Preserve requires_grad for gradient tracking (Module 05 will add _grad_fn)\n",
+    "        result = Tensor(transposed_data, requires_grad=self.requires_grad if hasattr(self, 'requires_grad') else False)\n",
+    "        return result\n",
     "        ### END SOLUTION\n",
     "\n",
     "    # nbgrader={\"grade\": false, \"grade_id\": \"reduction-ops\", \"solution\": true}\n",
@@ -724,7 +724,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b8fb7404",
+   "id": "28e76b8d",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -742,7 +742,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "e91c24e9",
+   "id": "cfac36f6",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -791,7 +791,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "70f6d355",
+   "id": "c23e49bc",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -839,7 +839,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "ec9dffd0",
+   "id": "ad4a3f8b",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 2
@@ -882,7 +882,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "7840a9e3",
+   "id": "6f8fd64f",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -900,7 +900,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1254072f",
+   "id": "ce89898f",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -957,7 +957,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "0f6515fd",
+   "id": "55918cd3",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 2
@@ -1057,7 +1057,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "3693277e",
+   "id": "d33d261d",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -1075,7 +1075,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "66f41775",
+   "id": "93279707",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -1132,7 +1132,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "da1643af",
+   "id": "2439ca3e",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 2
@@ -1235,7 +1235,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "266f3940",
+   "id": "30ef42fb",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -1253,7 +1253,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "18b8ebe6",
+   "id": "8ff5e144",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -1323,7 +1323,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "cabcfbca",
+   "id": "5be42959",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 2
@@ -1417,7 +1417,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "509b3603",
+   "id": "e5824871",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 1
@@ -1435,7 +1435,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "38336414",
+   "id": "e35f8cc5",
    "metadata": {
     "nbgrader": {
      "grade": true,
@@ -1508,7 +1508,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "3de81878",
+   "id": "cf6df213",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 2
@@ -1583,7 +1583,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "2d27893f",
+   "id": "6d368af1",
    "metadata": {
     "cell_marker": "\"\"\"",
     "lines_to_next_cell": 2
@@ -1644,7 +1644,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a69e79e9",
+   "id": "a5c6349f",
    "metadata": {
     "lines_to_next_cell": 1
    },
@@ -1666,7 +1666,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f2eb291a",
+   "id": "a6a6b03a",
    "metadata": {
     "lines_to_next_cell": 2,
     "nbgrader": {
@@ -1794,7 +1794,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "a0acddd9",
+   "id": "0529e454",
    "metadata": {
     "cell_marker": "\"\"\""
    },
@@ -316,21 +316,10 @@ class Tensor:
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
-            # Tensor + Tensor: let NumPy handle broadcasting
-            result_data = self.data + other.data
+            return Tensor(self.data + other.data)
         else:
-            # Tensor + scalar: NumPy broadcasts automatically
-            result_data = self.data + other
-
-        # Create new tensor with result
-        result = Tensor(result_data)
-
-        # Preserve gradient tracking if either operand requires gradients
-        if hasattr(self, 'requires_grad') and hasattr(other, 'requires_grad'):
-            result.requires_grad = self.requires_grad or (isinstance(other, Tensor) and other.requires_grad)
-        elif hasattr(self, 'requires_grad'):
-            result.requires_grad = self.requires_grad
-
-        return result
+            return Tensor(self.data + other)
         ### END SOLUTION

     # nbgrader={"grade": false, "grade_id": "more-arithmetic", "solution": true}
@@ -340,10 +329,12 @@ class Tensor:

         Common use: Centering data (x - mean), computing differences for loss functions.
         """
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data - other.data)
         else:
             return Tensor(self.data - other)
         ### END SOLUTION

     def __mul__(self, other):
         """
@@ -352,10 +343,12 @@ class Tensor:
         Common use: Scaling features, applying masks, gating mechanisms in neural networks.
         Note: This is * operator, not @ (which will be matrix multiplication).
         """
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data * other.data)
         else:
             return Tensor(self.data * other)
         ### END SOLUTION

     def __truediv__(self, other):
         """
@@ -363,10 +356,12 @@ class Tensor:

         Common use: Normalization (x / std), converting counts to probabilities.
         """
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data / other.data)
         else:
             return Tensor(self.data / other)
         ### END SOLUTION

     # nbgrader={"grade": false, "grade_id": "matmul-impl", "solution": true}
     def matmul(self, other):
@@ -435,7 +430,8 @@ class Tensor:
             )

         # Perform optimized matrix multiplication
-        result_data = np.dot(self.data, other.data)
+        # Use np.matmul (not np.dot) for proper batched matrix multiplication with 3D+ tensors
+        result_data = np.matmul(self.data, other.data)
         return Tensor(result_data)
         ### END SOLUTION

@@ -507,7 +503,9 @@ class Tensor:

         # Reshape the data (NumPy handles the memory layout efficiently)
         reshaped_data = np.reshape(self.data, new_shape)
-        return Tensor(reshaped_data)
+        # Preserve gradient tracking from the original tensor (important for autograd!)
+        result = Tensor(reshaped_data, requires_grad=self.requires_grad)
+        return result
         ### END SOLUTION

     def transpose(self, dim0=None, dim1=None):
@@ -573,7 +571,9 @@ class Tensor:
         axes[dim0], axes[dim1] = axes[dim1], axes[dim0]
         transposed_data = np.transpose(self.data, axes)

-        return Tensor(transposed_data)
+        # Preserve requires_grad for gradient tracking (Module 05 will add _grad_fn)
+        result = Tensor(transposed_data, requires_grad=self.requires_grad if hasattr(self, 'requires_grad') else False)
+        return result
         ### END SOLUTION

     # nbgrader={"grade": false, "grade_id": "reduction-ops", "solution": true}
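Condensed outside the diff, the pattern the reshape and transpose fixes follow looks like this (a standalone sketch for illustration, not the real TinyTorch Tensor class; it assumes only a .data NumPy array plus a requires_grad flag):

    import numpy as np

    class MiniTensor:
        """Toy stand-in for illustration; not the TinyTorch API."""
        def __init__(self, data, requires_grad=False):
            self.data = np.asarray(data)
            self.requires_grad = requires_grad

        def transpose(self):
            # Swap the last two axes, then carry the gradient flag forward
            # instead of constructing a flag-less result.
            axes = list(range(self.data.ndim))
            axes[-2], axes[-1] = axes[-1], axes[-2]
            return MiniTensor(np.transpose(self.data, axes),
                              requires_grad=self.requires_grad)

    x = MiniTensor(np.random.randn(2, 3, 4), requires_grad=True)
    assert x.transpose().data.shape == (2, 4, 3)
    assert x.transpose().requires_grad  # the flag survives the view op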
tests/regression/test_gradient_flow_fixes.py (new file, 254 lines)
@@ -0,0 +1,254 @@
#!/usr/bin/env python3
"""
Regression Tests for Gradient Flow Fixes

This test suite verifies that specific gradient flow bugs have been fixed and don't regress.
These tests document the issues we encountered during transformer milestone implementation
and ensure the fixes remain in place.

Regression Issues Tested:
1. Module 01: np.dot → np.matmul for batched 3D tensors
2. Module 01: transpose() preserving requires_grad
3. Module 05: SubBackward and DivBackward added
4. Module 02: Softmax using Tensor operations
5. Module 03: Dropout using Tensor operations
6. Module 11: Embedding preserving requires_grad
7. Module 12: Attention using batched operations (no .data extraction)
8. Module 13: LayerNorm using Tensor operations
"""

import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../..'))

import numpy as np
from tinytorch.core.tensor import Tensor
from tinytorch.core.autograd import enable_autograd

# Enable autograd once for all tests
enable_autograd()


def test_regression_batched_matmul():
    """
    Regression test for Issue #1: np.dot doesn't handle batched 3D matmul.

    Bug: Using np.dot for 3D tensors produces wrong shapes.
    Fix: Changed to np.matmul in modules/source/01_tensor/tensor_dev.py
    Commit: Module 01 fixes
    """
    print("Testing regression: batched 3D matmul...")

    # This pattern appears in attention: Q @ K.T
    Q = Tensor(np.random.randn(2, 4, 8), requires_grad=True)
    K = Tensor(np.random.randn(2, 4, 8), requires_grad=True)
    K_T = K.transpose()

    scores = Q.matmul(K_T)

    # Bug would produce (2, 4, 2, 4) or crash
    # Fix produces correct (2, 4, 4)
    assert scores.shape == (2, 4, 4), f"Batched matmul shape regression: {scores.shape}"
    assert scores.requires_grad, "Batched matmul should preserve requires_grad"

    print("✅ Batched 3D matmul regression test passed")


def test_regression_transpose_requires_grad():
    """
    Regression test for Issue #2: transpose() not preserving requires_grad.

    Bug: x.transpose() created Tensor without requires_grad.
    Fix: Added requires_grad parameter in modules/source/01_tensor/tensor_dev.py
    Commit: Module 01 fixes
    """
    print("Testing regression: transpose requires_grad...")

    x = Tensor(np.random.randn(2, 3, 4), requires_grad=True)
    x_T = x.transpose()

    # Bug: x_T.requires_grad would be False
    # Fix: x_T.requires_grad is True
    assert x_T.requires_grad, "Transpose should preserve requires_grad"

    print("✅ Transpose requires_grad regression test passed")


def test_regression_subtraction_has_backward():
    """
    Regression test for Issue #3: Subtraction had no backward pass.

    Bug: Tensor.__sub__ not patched by Module 05, no gradient flow.
    Fix: Added SubBackward class and patched __sub__ in Module 05.
    Commit: Module 05 fixes
    """
    print("Testing regression: subtraction backward...")

    a = Tensor([2.0, 3.0], requires_grad=True)
    b = Tensor([1.0, 1.0], requires_grad=True)
    c = a - b

    # Bug: c._grad_fn would be None
    # Fix: c._grad_fn is SubBackward instance
    assert hasattr(c, '_grad_fn'), "Subtraction should have _grad_fn"
    assert c._grad_fn is not None, "Subtraction _grad_fn should not be None"

    # Verify backward pass
    c.backward(np.ones(2))
    assert a.grad is not None and np.allclose(a.grad, [1.0, 1.0]), "∂(a-b)/∂a = 1"
    assert b.grad is not None and np.allclose(b.grad, [-1.0, -1.0]), "∂(a-b)/∂b = -1"

    print("✅ Subtraction backward regression test passed")


def test_regression_division_has_backward():
    """
    Regression test for Issue #4: Division had no backward pass.

    Bug: Tensor.__truediv__ not patched by Module 05, no gradient flow.
    Fix: Added DivBackward class and patched __truediv__ in Module 05.
    Commit: Module 05 fixes
    """
    print("Testing regression: division backward...")

    a = Tensor([4.0, 6.0], requires_grad=True)
    b = Tensor([2.0, 2.0], requires_grad=True)
    c = a / b

    # Bug: c._grad_fn would be None
    # Fix: c._grad_fn is DivBackward instance
    assert hasattr(c, '_grad_fn'), "Division should have _grad_fn"
    assert c._grad_fn is not None, "Division _grad_fn should not be None"

    # Verify backward pass
    c.backward(np.ones(2))
    assert a.grad is not None and np.allclose(a.grad, [0.5, 0.5]), "∂(a/b)/∂a = 1/b"

    print("✅ Division backward regression test passed")


def test_regression_layernorm_gradient_flow():
    """
    Regression test for Issue #5: LayerNorm broke gradient flow.

    Bug: LayerNorm extracted .data, creating Tensors without _grad_fn.
    Fix: Rewrote to use Tensor operations in Module 13.
    Commit: Module 13 fixes
    """
    print("Testing regression: LayerNorm gradient flow...")

    from tinytorch.models.transformer import LayerNorm

    ln = LayerNorm(4)
    ln.gamma.requires_grad = True
    ln.beta.requires_grad = True

    x = Tensor([[1.0, 2.0, 3.0, 4.0]], requires_grad=True)
    output = ln.forward(x)

    # Bug: output.requires_grad would be False or _grad_fn None
    # Fix: output has requires_grad=True and _grad_fn set
    assert output.requires_grad, "LayerNorm output should require gradients"
    assert hasattr(output, '_grad_fn'), "LayerNorm output should have _grad_fn"

    # Verify backward
    output.backward(np.ones_like(output.data))
    assert x.grad is not None, "Gradient should flow back through LayerNorm"

    print("✅ LayerNorm gradient flow regression test passed")


def test_regression_embedding_requires_grad():
    """
    Regression test for Issue #6: Embedding didn't preserve requires_grad.

    Bug: Embedding.forward() created Tensor(embedded) without requires_grad.
    Fix: Added requires_grad=self.weight.requires_grad in Module 11.
    Commit: Module 11 fixes
    """
    print("Testing regression: Embedding requires_grad...")

    from tinytorch.text.embeddings import Embedding

    embed = Embedding(vocab_size=10, embed_dim=8)
    embed.weight.requires_grad = True

    indices = Tensor([[1, 2, 3]])
    output = embed.forward(indices)

    # Bug: output.requires_grad would be False
    # Fix: output.requires_grad is True
    assert output.requires_grad, "Embedding output should preserve requires_grad"

    print("✅ Embedding requires_grad regression test passed")


def test_regression_dropout_uses_tensor_ops():
    """
    Regression test for Issue #7: Dropout used .data extraction.

    Bug: Dropout did (x.data * mask) / keep_prob, breaking gradient flow.
    Fix: Rewrote to use Tensor operations in Module 03.
    Commit: Module 03 fixes
    """
    print("Testing regression: Dropout Tensor operations...")

    from tinytorch.core.layers import Dropout

    dropout = Dropout(0.5)
    x = Tensor([[1.0, 2.0, 3.0, 4.0]], requires_grad=True)

    # Set seed for reproducibility
    np.random.seed(42)
    output = dropout.forward(x, training=True)

    # Bug: output wouldn't have _grad_fn
    # Fix: output has _grad_fn from Tensor multiplication
    assert output.requires_grad, "Dropout output should require gradients"

    print("✅ Dropout Tensor operations regression test passed")


def run_all_tests():
    """Run all regression tests for gradient flow fixes."""
    print("\n" + "="*70)
    print("GRADIENT FLOW REGRESSION TEST SUITE")
    print("="*70 + "\n")

    tests = [
        test_regression_batched_matmul,
        test_regression_transpose_requires_grad,
        test_regression_subtraction_has_backward,
        test_regression_division_has_backward,
        test_regression_layernorm_gradient_flow,
        test_regression_embedding_requires_grad,
        test_regression_dropout_uses_tensor_ops,
    ]

    passed = 0
    failed = 0

    for test_func in tests:
        try:
            test_func()
            passed += 1
        except Exception as e:
            print(f"❌ {test_func.__name__} FAILED: {e}")
            import traceback
            traceback.print_exc()
            failed += 1
        print("")

    print("="*70)
    print(f"RESULTS: {passed} passed, {failed} failed")
    if failed == 0:
        print("✅ All gradient flow fixes verified - no regressions detected!")
    print("="*70)

    return failed == 0


if __name__ == "__main__":
    success = run_all_tests()
    sys.exit(0 if success else 1)
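Note on running the suite: because of the __main__ guard, python tests/regression/test_gradient_flow_fixes.py runs every check and exits nonzero on failure; the bare-assert, test_-prefixed functions should also make the file collectable by pytest, though the commit itself only shows direct execution.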
tinytorch/core/tensor.py (generated, 32 lines)
@@ -113,21 +113,10 @@ class Tensor:
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
-            # Tensor + Tensor: let NumPy handle broadcasting
-            result_data = self.data + other.data
+            return Tensor(self.data + other.data)
         else:
-            # Tensor + scalar: NumPy broadcasts automatically
-            result_data = self.data + other
-
-        # Create new tensor with result
-        result = Tensor(result_data)
-
-        # Preserve gradient tracking if either operand requires gradients
-        if hasattr(self, 'requires_grad') and hasattr(other, 'requires_grad'):
-            result.requires_grad = self.requires_grad or (isinstance(other, Tensor) and other.requires_grad)
-        elif hasattr(self, 'requires_grad'):
-            result.requires_grad = self.requires_grad
-
-        return result
+            return Tensor(self.data + other)
         ### END SOLUTION

     # nbgrader={"grade": false, "grade_id": "more-arithmetic", "solution": true}
@@ -137,10 +126,12 @@ class Tensor:

         Common use: Centering data (x - mean), computing differences for loss functions.
         """
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data - other.data)
         else:
             return Tensor(self.data - other)
         ### END SOLUTION

     def __mul__(self, other):
         """
@@ -149,10 +140,12 @@ class Tensor:
         Common use: Scaling features, applying masks, gating mechanisms in neural networks.
         Note: This is * operator, not @ (which will be matrix multiplication).
         """
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data * other.data)
         else:
             return Tensor(self.data * other)
         ### END SOLUTION

     def __truediv__(self, other):
         """
@@ -160,10 +153,12 @@ class Tensor:

         Common use: Normalization (x / std), converting counts to probabilities.
         """
         ### BEGIN SOLUTION
         if isinstance(other, Tensor):
             return Tensor(self.data / other.data)
         else:
             return Tensor(self.data / other)
         ### END SOLUTION

     # nbgrader={"grade": false, "grade_id": "matmul-impl", "solution": true}
     def matmul(self, other):
@@ -232,7 +227,8 @@ class Tensor:
             )

         # Perform optimized matrix multiplication
-        result_data = np.dot(self.data, other.data)
+        # Use np.matmul (not np.dot) for proper batched matrix multiplication with 3D+ tensors
+        result_data = np.matmul(self.data, other.data)
         return Tensor(result_data)
         ### END SOLUTION

@@ -304,7 +300,9 @@ class Tensor:

         # Reshape the data (NumPy handles the memory layout efficiently)
         reshaped_data = np.reshape(self.data, new_shape)
-        return Tensor(reshaped_data)
+        # Preserve gradient tracking from the original tensor (important for autograd!)
+        result = Tensor(reshaped_data, requires_grad=self.requires_grad)
+        return result
         ### END SOLUTION

     def transpose(self, dim0=None, dim1=None):
@@ -370,7 +368,9 @@ class Tensor:
         axes[dim0], axes[dim1] = axes[dim1], axes[dim0]
         transposed_data = np.transpose(self.data, axes)

-        return Tensor(transposed_data)
+        # Preserve requires_grad for gradient tracking (Module 05 will add _grad_fn)
+        result = Tensor(transposed_data, requires_grad=self.requires_grad if hasattr(self, 'requires_grad') else False)
+        return result
         ### END SOLUTION

     # nbgrader={"grade": false, "grade_id": "reduction-ops", "solution": true}