Fix gradient propagation: enable autograd and patch activations/losses

CRITICAL FIX: Gradients now flow through the entire training stack!

Changes:
1. Enable autograd in __init__.py - patches Tensor operations on import
2. Extend enable_autograd() to patch Sigmoid and BCE forward methods
3. Fix gradient accumulation to handle broadcasting (bias gradients)
4. Fix optimizer.step() - param.grad is already a numpy array, so read it directly instead of param.grad.data
5. Add debug_gradients.py for systematic gradient flow testing

Architecture:
- Clean patching pattern: all gradient tracking lives in enable_autograd() (see the sketch after this list)
- Activations/losses remain simple (Module 02/04)
- Autograd (Module 05) upgrades them with gradient tracking
- Pedagogically sound: separation of concerns
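
The patching pattern itself is ordinary Python monkey-patching. Below is a self-contained toy version of the idea, not the repo's code: the Sigmoid class here is a stand-in for the Module 02 activation, and enable_autograd_demo/local_grad are illustrative names only.

import numpy as np

class Sigmoid:
    """Stand-in for the Module 02 Sigmoid: plain forward, no gradient bookkeeping."""
    def forward(self, x):
        return 1.0 / (1.0 + np.exp(-x))

def enable_autograd_demo():
    """Toy enable_autograd(): swap in a forward that also records the local derivative."""
    original = Sigmoid.forward
    def tracked_forward(self, x):
        out = original(self, x)
        self.local_grad = out * (1 - out)   # extra bookkeeping the patch adds
        return out
    Sigmoid.forward = tracked_forward       # every existing and future Sigmoid now tracks gradients

enable_autograd_demo()
s = Sigmoid()
print(s.forward(np.array([0.0])))           # 0.5 -- same forward result as before
print(s.local_grad)                         # 0.25 -- new information available for backprop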

Results:
- All 6 debug tests pass
- Perceptron learns: 50% → 93% accuracy
- Loss decreases: 0.79 → 0.36
- Weights update correctly through SGD
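
For reference, one full training step through the now-working stack looks roughly like the sketch below. It uses only the public API exercised by debug_gradients.py; the sample, target, and learning rate are illustrative, not taken from the perceptron example itself.

import numpy as np
from tinytorch import Tensor, Linear, Sigmoid, BinaryCrossEntropyLoss, SGD

layer, act, loss_fn = Linear(2, 1), Sigmoid(), BinaryCrossEntropyLoss()
optimizer = SGD(layer.parameters(), lr=0.1)

x = Tensor([[1.0, 2.0]], requires_grad=True)   # illustrative sample
y = Tensor([[1.0]])                             # illustrative target

for p in layer.parameters():
    p.zero_grad()                               # zero_grad is patched onto Tensor by enable_autograd()
loss = loss_fn(act(layer(x)), y)                # forward: Linear -> Sigmoid -> BCE
loss.backward()                                 # gradients flow back to weight and bias
optimizer.step()                                # SGD reads param.grad (a numpy array) and updates the data
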
Author:  Vijay Janapa Reddi
Date:    2025-09-30 13:51:30 -04:00
Parent:  ba6bd79a67
Commit:  5ae68dd4b4
11 changed files with 549 additions and 113 deletions

debug_gradients.py (new file, 230 lines)

@@ -0,0 +1,230 @@
#!/usr/bin/env python3
"""
Debug script to trace gradient propagation through the TinyTorch stack.
Tests each component step-by-step to find where gradients stop flowing.
"""
import numpy as np
from tinytorch import Tensor, Linear, Sigmoid, BinaryCrossEntropyLoss, SGD
print("=" * 70)
print("🔍 GRADIENT FLOW DEBUGGING")
print("=" * 70)
# ============================================================================
# TEST 1: Basic Tensor Operations
# ============================================================================
print("\n[TEST 1] Basic Tensor Operations")
print("-" * 70)
x = Tensor([[1.0, 2.0]], requires_grad=True)
print(f"✓ Created tensor x: {x.data}")
print(f" requires_grad: {x.requires_grad}")
print(f" grad: {x.grad}")
y = x * 2
print(f"\n✓ Created y = x * 2: {y.data}")
print(f" requires_grad: {y.requires_grad}")
print(f" grad: {y.grad}")
loss = y.sum()
print(f"\n✓ Created loss = y.sum(): {loss.data}")
print(f" requires_grad: {loss.requires_grad}")
print("\n📊 Before backward:")
print(f" x.grad: {x.grad}")
loss.backward()
print("\n📊 After backward:")
print(f" x.grad: {x.grad}")
if x.grad is not None and np.allclose(x.grad, [[2.0, 2.0]]):
    print("✅ TEST 1 PASSED: Basic gradients work!")
else:
    print("❌ TEST 1 FAILED: Basic gradients don't work!")
    print(f" Expected: [[2.0, 2.0]], Got: {x.grad}")
# ============================================================================
# TEST 2: Linear Layer Forward Pass
# ============================================================================
print("\n\n[TEST 2] Linear Layer Forward Pass")
print("-" * 70)
layer = Linear(2, 1)
print(f"✓ Created Linear(2, 1)")
print(f" weight.data: {layer.weight.data}")
print(f" weight.requires_grad: {layer.weight.requires_grad}")
print(f" bias.data: {layer.bias.data}")
print(f" bias.requires_grad: {layer.bias.requires_grad}")
x = Tensor([[1.0, 2.0]], requires_grad=True)
out = layer(x)
print(f"\n✓ Forward pass output: {out.data}")
print(f" out.requires_grad: {out.requires_grad}")
# ============================================================================
# TEST 3: Linear Layer Backward Pass
# ============================================================================
print("\n\n[TEST 3] Linear Layer Backward Pass")
print("-" * 70)
layer = Linear(2, 1)
w_before = layer.weight.data.copy()
b_before = layer.bias.data.copy()
print(f"Before backward:")
print(f" weight: {w_before}")
print(f" bias: {b_before}")
print(f" weight.grad: {layer.weight.grad}")
print(f" bias.grad: {layer.bias.grad}")
x = Tensor([[1.0, 2.0]], requires_grad=True)
out = layer(x)
loss = out.sum()
print(f"\n✓ Created loss: {loss.data}")
loss.backward()
print(f"\nAfter backward:")
print(f" weight.grad: {layer.weight.grad}")
print(f" bias.grad: {layer.bias.grad}")
print(f" x.grad: {x.grad}")
if layer.weight.grad is not None and layer.bias.grad is not None:
    print("✅ TEST 3 PASSED: Linear layer gradients computed!")
else:
    print("❌ TEST 3 FAILED: Linear layer gradients missing!")
# ============================================================================
# TEST 4: Optimizer Step
# ============================================================================
print("\n\n[TEST 4] Optimizer Step")
print("-" * 70)
layer = Linear(2, 1)
optimizer = SGD(layer.parameters(), lr=0.1)
print(f"✓ Created optimizer with lr=0.1")
print(f" Num parameters: {len(optimizer.params)}")
w_before = layer.weight.data.copy()
b_before = layer.bias.data.copy()
print(f"\nBefore training step:")
print(f" weight: {w_before}")
print(f" bias: {b_before}")
# Forward
x = Tensor([[1.0, 2.0]], requires_grad=True)
out = layer(x)
loss = out.sum()
print(f"\n✓ Forward pass, loss: {loss.data}")
# Backward
loss.backward()
print(f"\nAfter backward:")
print(f" weight.grad: {layer.weight.grad}")
print(f" bias.grad: {layer.bias.grad}")
# Step
optimizer.step()
w_after = layer.weight.data.copy()
b_after = layer.bias.data.copy()
print(f"\nAfter optimizer.step():")
print(f" weight: {w_after}")
print(f" bias: {b_after}")
print(f" weight changed: {not np.allclose(w_before, w_after)}")
print(f" bias changed: {not np.allclose(b_before, b_after)}")
if not np.allclose(w_before, w_after) or not np.allclose(b_before, b_after):
    print("✅ TEST 4 PASSED: Optimizer updates parameters!")
else:
    print("❌ TEST 4 FAILED: Optimizer didn't update parameters!")
# ============================================================================
# TEST 5: Full Training Step with Sigmoid + BCE
# ============================================================================
print("\n\n[TEST 5] Full Training Step (Linear + Sigmoid + BCE)")
print("-" * 70)
layer = Linear(2, 1)
sigmoid = Sigmoid()
loss_fn = BinaryCrossEntropyLoss()
optimizer = SGD(layer.parameters(), lr=0.1)
w_before = layer.weight.data.copy()
b_before = layer.bias.data.copy()
print(f"Before training:")
print(f" weight: {w_before}")
print(f" bias: {b_before}")
# Data
x = Tensor([[1.0, 2.0]], requires_grad=True)
y_true = Tensor([[1.0]])
# Forward
logits = layer(x)
print(f"\n✓ Logits: {logits.data}")
probs = sigmoid(logits)
print(f"✓ Probs: {probs.data}")
loss = loss_fn(probs, y_true)
print(f"✓ Loss: {loss.data}")
# Backward
print("\n📊 Calling loss.backward()...")
loss.backward()
print(f"\nAfter backward:")
print(f" loss.grad: {loss.grad}")
print(f" probs.grad: {probs.grad}")
print(f" logits.grad: {logits.grad}")
print(f" weight.grad: {layer.weight.grad}")
print(f" bias.grad: {layer.bias.grad}")
# Update
optimizer.step()
w_after = layer.weight.data.copy()
b_after = layer.bias.data.copy()
print(f"\nAfter optimizer.step():")
print(f" weight: {w_after}")
print(f" bias: {b_after}")
print(f" weight changed: {not np.allclose(w_before, w_after)}")
print(f" bias changed: {not np.allclose(b_before, b_after)}")
if not np.allclose(w_before, w_after) or not np.allclose(b_before, b_after):
    print("✅ TEST 5 PASSED: Full training step works!")
else:
    print("❌ TEST 5 FAILED: Full training step doesn't update weights!")
# ============================================================================
# TEST 6: Parameters Function
# ============================================================================
print("\n\n[TEST 6] Layer parameters() method")
print("-" * 70)
layer = Linear(2, 1)
params = layer.parameters()
print(f"✓ layer.parameters() returned {len(params)} parameters")
for i, p in enumerate(params):
    print(f" param[{i}]: shape={p.shape}, requires_grad={p.requires_grad}, type={type(p)}")
if len(params) == 2:
    print("✅ TEST 6 PASSED: parameters() returns weight and bias!")
else:
    print("❌ TEST 6 FAILED: parameters() should return 2 tensors!")
print("\n" + "=" * 70)
print("🏁 DEBUGGING COMPLETE")
print("=" * 70)


@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
"id": "41637b5b",
"id": "e444d0e8",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -34,7 +34,7 @@
},
{
"cell_type": "markdown",
"id": "eb80f71c",
"id": "f8e11333",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -59,7 +59,7 @@
},
{
"cell_type": "markdown",
"id": "ad445b19",
"id": "9bb0541c",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -78,7 +78,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "7fc4b3ae",
"id": "c50f8430",
"metadata": {
"nbgrader": {
"grade": false,
@@ -102,7 +102,7 @@
},
{
"cell_type": "markdown",
"id": "6c49b0a7",
"id": "7dd50568",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -144,7 +144,7 @@
},
{
"cell_type": "markdown",
"id": "a82d5ffc",
"id": "0402638e",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -166,7 +166,7 @@
},
{
"cell_type": "markdown",
"id": "d954190f",
"id": "9cc4031a",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -190,7 +190,7 @@
},
{
"cell_type": "markdown",
"id": "1d26aa84",
"id": "64941304",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -228,7 +228,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "cd112f28",
"id": "e6b7ee43",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -272,8 +272,15 @@
" \"\"\"\n",
" ### BEGIN SOLUTION\n",
" # Apply sigmoid: 1 / (1 + exp(-x))\n",
" result = 1.0 / (1.0 + np.exp(-x.data))\n",
" return Tensor(result)\n",
" result_data = 1.0 / (1.0 + np.exp(-x.data))\n",
" result = Tensor(result_data)\n",
" \n",
" # Track gradients if autograd is enabled and input requires_grad\n",
" if SigmoidBackward is not None and x.requires_grad:\n",
" result.requires_grad = True\n",
" result._grad_fn = SigmoidBackward(x, result)\n",
" \n",
" return result\n",
" ### END SOLUTION\n",
"\n",
" def __call__(self, x: Tensor) -> Tensor:\n",
@@ -287,7 +294,7 @@
},
{
"cell_type": "markdown",
"id": "87407a56",
"id": "526bd575",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -303,7 +310,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "8599e53a",
"id": "667cc4ec",
"metadata": {
"nbgrader": {
"grade": true,
@@ -344,7 +351,7 @@
},
{
"cell_type": "markdown",
"id": "96438263",
"id": "6b301e45",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -386,7 +393,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "6bdad44d",
"id": "c29349e4",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -442,7 +449,7 @@
},
{
"cell_type": "markdown",
"id": "853265df",
"id": "34c5f3a4",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -458,7 +465,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "e3f2e5fd",
"id": "517b440a",
"metadata": {
"nbgrader": {
"grade": true,
@@ -505,7 +512,7 @@
},
{
"cell_type": "markdown",
"id": "d137e456",
"id": "b92601a1",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -544,7 +551,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "3a3ec4c5",
"id": "9e1cbe76",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -600,7 +607,7 @@
},
{
"cell_type": "markdown",
"id": "b2ad2baa",
"id": "b2d12529",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -616,7 +623,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b92572ae",
"id": "a2e9f944",
"metadata": {
"nbgrader": {
"grade": true,
@@ -664,7 +671,7 @@
},
{
"cell_type": "markdown",
"id": "d1cdd503",
"id": "f24f4022",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -707,7 +714,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "90f15779",
"id": "3d86ca1f",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -768,7 +775,7 @@
},
{
"cell_type": "markdown",
"id": "eb655b3b",
"id": "f16cc4b7",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -784,7 +791,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "838060ac",
"id": "07f74f6a",
"metadata": {
"nbgrader": {
"grade": true,
@@ -832,7 +839,7 @@
},
{
"cell_type": "markdown",
"id": "a8047ea8",
"id": "4f0611e8",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -870,7 +877,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "aa266bb7",
"id": "39ff5a3a",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -942,7 +949,7 @@
},
{
"cell_type": "markdown",
"id": "80e6ad27",
"id": "f0a20259",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -958,7 +965,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "f3db3810",
"id": "3b6dd4d2",
"metadata": {
"nbgrader": {
"grade": true,
@@ -1016,7 +1023,7 @@
},
{
"cell_type": "markdown",
"id": "2db83cef",
"id": "9d0b8c20",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 2
@@ -1029,7 +1036,7 @@
},
{
"cell_type": "markdown",
"id": "428eaa1b",
"id": "9716c7c5",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -1049,7 +1056,7 @@
},
{
"cell_type": "markdown",
"id": "fe7666b9",
"id": "bdcbe6f6",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1063,7 +1070,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "fac9ee55",
"id": "e7a90720",
"metadata": {
"lines_to_next_cell": 2,
"nbgrader": {
@@ -1162,7 +1169,7 @@
},
{
"cell_type": "markdown",
"id": "6a9cc930",
"id": "d82baf1e",
"metadata": {
"cell_marker": "\"\"\""
},


@@ -223,8 +223,15 @@ class Sigmoid:
"""
### BEGIN SOLUTION
# Apply sigmoid: 1 / (1 + exp(-x))
result = 1.0 / (1.0 + np.exp(-x.data))
return Tensor(result)
result_data = 1.0 / (1.0 + np.exp(-x.data))
result = Tensor(result_data)
# Track gradients if autograd is enabled and input requires_grad
if SigmoidBackward is not None and x.requires_grad:
result.requires_grad = True
result._grad_fn = SigmoidBackward(x, result)
return result
### END SOLUTION
def __call__(self, x: Tensor) -> Tensor:
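
SigmoidBackward itself is defined in the autograd module rather than in this hunk, but the local derivative it has to apply is just sigma(x) * (1 - sigma(x)), computable from the cached output alone. A standalone finite-difference check of that fact (pure numpy, no TinyTorch imports):

import numpy as np

x = np.array([[0.5, -1.0]])
s = 1.0 / (1.0 + np.exp(-x))
analytic = s * (1 - s)                         # d(sigmoid)/dx from the cached output

eps = 1e-6
numeric = (1/(1+np.exp(-(x+eps))) - 1/(1+np.exp(-(x-eps)))) / (2*eps)
print(np.allclose(analytic, numeric, atol=1e-6))   # True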


@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
"id": "11a866a5",
"id": "dc404941",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -54,7 +54,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "6f716656",
"id": "6b25fc7d",
"metadata": {
"nbgrader": {
"grade": false,
@@ -77,7 +77,7 @@
},
{
"cell_type": "markdown",
"id": "1c5fcfe6",
"id": "ad79196c",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -131,7 +131,7 @@
},
{
"cell_type": "markdown",
"id": "82cafe21",
"id": "268f66e4",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -190,7 +190,7 @@
},
{
"cell_type": "markdown",
"id": "47bd67c9",
"id": "47d547ca",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -227,7 +227,7 @@
},
{
"cell_type": "markdown",
"id": "cce8538a",
"id": "b09afd8d",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -255,7 +255,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "7c604fa6",
"id": "d02437c4",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -321,7 +321,7 @@
},
{
"cell_type": "markdown",
"id": "f721b07e",
"id": "8e0706ae",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -360,7 +360,7 @@
},
{
"cell_type": "markdown",
"id": "b783a909",
"id": "dea5fcaf",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -389,7 +389,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b8c92aa2",
"id": "a0f8a601",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -444,7 +444,7 @@
},
{
"cell_type": "markdown",
"id": "31a8a1ab",
"id": "e8dc3302",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -477,7 +477,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "1a6762d0",
"id": "c6812744",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -535,7 +535,7 @@
},
{
"cell_type": "markdown",
"id": "11567a68",
"id": "303fb215",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -570,7 +570,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "102ba9f6",
"id": "5e7d9f70",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -627,7 +627,7 @@
},
{
"cell_type": "markdown",
"id": "d9496bda",
"id": "08c1cf65",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -658,7 +658,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "37f9b250",
"id": "21c2fb7b",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -706,7 +706,7 @@
},
{
"cell_type": "markdown",
"id": "116f71ea",
"id": "cb3a65f0",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -722,7 +722,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b2120ecf",
"id": "78182b35",
"metadata": {
"nbgrader": {
"grade": true,
@@ -769,7 +769,7 @@
},
{
"cell_type": "markdown",
"id": "9685115d",
"id": "25c230ca",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -804,7 +804,7 @@
},
{
"cell_type": "markdown",
"id": "5612e207",
"id": "8b293571",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -830,7 +830,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b49922a3",
"id": "eba66f56",
"metadata": {
"nbgrader": {
"grade": false,
@@ -874,7 +874,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "e79f5497",
"id": "0fd1c893",
"metadata": {
"nbgrader": {
"grade": false,
@@ -918,7 +918,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "58b86487",
"id": "1eb3f913",
"metadata": {
"nbgrader": {
"grade": false,
@@ -1084,6 +1084,20 @@
" # Initialize or accumulate gradient\n",
" if self.grad is None:\n",
" self.grad = np.zeros_like(self.data)\n",
" \n",
" # Handle broadcasting: sum gradient to match self.data shape\n",
" if gradient.shape != self.grad.shape:\n",
" # Sum over broadcasted dimensions\n",
" # This handles cases like bias gradients that get broadcast\n",
" ndims_added = len(gradient.shape) - len(self.grad.shape)\n",
" for i in range(ndims_added):\n",
" gradient = np.sum(gradient, axis=0)\n",
" for i, (grad_dim, self_dim) in enumerate(zip(gradient.shape, self.grad.shape)):\n",
" if self_dim == 1 and grad_dim > 1:\n",
" gradient = np.sum(gradient, axis=i, keepdims=True)\n",
" elif self_dim != grad_dim:\n",
" gradient = np.sum(gradient, axis=i, keepdims=True)\n",
" \n",
" self.grad += gradient\n",
"\n",
" # Propagate gradients through computation graph\n",
@@ -1112,6 +1126,52 @@
" Tensor.backward = backward\n",
" Tensor.zero_grad = zero_grad\n",
"\n",
" # Patch activations and losses to track gradients\n",
" try:\n",
" from tinytorch.core.activations import Sigmoid\n",
" from tinytorch.core.losses import BinaryCrossEntropyLoss\n",
" \n",
" # Store original methods\n",
" _original_sigmoid_forward = Sigmoid.forward\n",
" _original_bce_forward = BinaryCrossEntropyLoss.forward\n",
" \n",
" def tracked_sigmoid_forward(self, x):\n",
" \"\"\"Sigmoid with gradient tracking.\"\"\"\n",
" result_data = 1.0 / (1.0 + np.exp(-x.data))\n",
" result = Tensor(result_data)\n",
" \n",
" if x.requires_grad:\n",
" result.requires_grad = True\n",
" result._grad_fn = SigmoidBackward(x, result)\n",
" \n",
" return result\n",
" \n",
" def tracked_bce_forward(self, predictions, targets):\n",
" \"\"\"Binary cross-entropy with gradient tracking.\"\"\"\n",
" # Compute BCE loss\n",
" eps = 1e-7\n",
" clamped_preds = np.clip(predictions.data, eps, 1 - eps)\n",
" log_preds = np.log(clamped_preds)\n",
" log_one_minus_preds = np.log(1 - clamped_preds)\n",
" bce_per_sample = -(targets.data * log_preds + (1 - targets.data) * log_one_minus_preds)\n",
" bce_loss = np.mean(bce_per_sample)\n",
" \n",
" result = Tensor(bce_loss)\n",
" \n",
" if predictions.requires_grad:\n",
" result.requires_grad = True\n",
" result._grad_fn = BCEBackward(predictions, targets)\n",
" \n",
" return result\n",
" \n",
" # Install patched methods\n",
" Sigmoid.forward = tracked_sigmoid_forward\n",
" BinaryCrossEntropyLoss.forward = tracked_bce_forward\n",
" \n",
" except ImportError:\n",
" # Activations/losses not yet available (happens during module development)\n",
" pass\n",
"\n",
" # Mark as enabled\n",
" Tensor._autograd_enabled = True\n",
"\n",
@@ -1126,7 +1186,7 @@
},
{
"cell_type": "markdown",
"id": "d03e54f6",
"id": "19562839",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1142,7 +1202,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "1bae0903",
"id": "469f4a49",
"metadata": {
"nbgrader": {
"grade": true,
@@ -1190,7 +1250,7 @@
},
{
"cell_type": "markdown",
"id": "fc159b24",
"id": "db0b3fec",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1204,7 +1264,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "92f51d47",
"id": "7d5d64d8",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -1317,7 +1377,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "ef3b1668",
"id": "8fbb5c87",
"metadata": {},
"outputs": [],
"source": [
@@ -1328,7 +1388,7 @@
},
{
"cell_type": "markdown",
"id": "7728d17d",
"id": "4e4a0fe1",
"metadata": {
"cell_marker": "\"\"\""
},


@@ -891,6 +891,20 @@ def enable_autograd():
        # Initialize or accumulate gradient
        if self.grad is None:
            self.grad = np.zeros_like(self.data)

        # Handle broadcasting: sum gradient to match self.data shape
        if gradient.shape != self.grad.shape:
            # Sum over broadcasted dimensions
            # This handles cases like bias gradients that get broadcast
            ndims_added = len(gradient.shape) - len(self.grad.shape)
            for i in range(ndims_added):
                gradient = np.sum(gradient, axis=0)
            for i, (grad_dim, self_dim) in enumerate(zip(gradient.shape, self.grad.shape)):
                if self_dim == 1 and grad_dim > 1:
                    gradient = np.sum(gradient, axis=i, keepdims=True)
                elif self_dim != grad_dim:
                    gradient = np.sum(gradient, axis=i, keepdims=True)

        self.grad += gradient

        # Propagate gradients through computation graph
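
Concretely, here is what that reduction does for a bias gradient. The shapes below are hypothetical (a Linear(2, 1) bias receiving a batched upstream gradient), run through the same summation logic as the hunk above, in standalone numpy:

import numpy as np

bias_grad_shape = (1, 1)                   # shape of self.grad for a Linear(2, 1) bias
gradient = np.ones((4, 1))                 # upstream gradient: one row per sample in the batch

ndims_added = len(gradient.shape) - len(bias_grad_shape)
for _ in range(ndims_added):               # drop any extra leading axes (none here)
    gradient = np.sum(gradient, axis=0)
for i, (g_dim, b_dim) in enumerate(zip(gradient.shape, bias_grad_shape)):
    if b_dim == 1 and g_dim > 1:           # collapse axes that were broadcast from size 1
        gradient = np.sum(gradient, axis=i, keepdims=True)

print(gradient.shape)                      # (1, 1) -- now safe to add into self.grad
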
@@ -919,6 +933,52 @@ def enable_autograd():
    Tensor.backward = backward
    Tensor.zero_grad = zero_grad

    # Patch activations and losses to track gradients
    try:
        from tinytorch.core.activations import Sigmoid
        from tinytorch.core.losses import BinaryCrossEntropyLoss

        # Store original methods
        _original_sigmoid_forward = Sigmoid.forward
        _original_bce_forward = BinaryCrossEntropyLoss.forward

        def tracked_sigmoid_forward(self, x):
            """Sigmoid with gradient tracking."""
            result_data = 1.0 / (1.0 + np.exp(-x.data))
            result = Tensor(result_data)

            if x.requires_grad:
                result.requires_grad = True
                result._grad_fn = SigmoidBackward(x, result)

            return result

        def tracked_bce_forward(self, predictions, targets):
            """Binary cross-entropy with gradient tracking."""
            # Compute BCE loss
            eps = 1e-7
            clamped_preds = np.clip(predictions.data, eps, 1 - eps)
            log_preds = np.log(clamped_preds)
            log_one_minus_preds = np.log(1 - clamped_preds)
            bce_per_sample = -(targets.data * log_preds + (1 - targets.data) * log_one_minus_preds)
            bce_loss = np.mean(bce_per_sample)

            result = Tensor(bce_loss)

            if predictions.requires_grad:
                result.requires_grad = True
                result._grad_fn = BCEBackward(predictions, targets)

            return result

        # Install patched methods
        Sigmoid.forward = tracked_sigmoid_forward
        BinaryCrossEntropyLoss.forward = tracked_bce_forward

    except ImportError:
        # Activations/losses not yet available (happens during module development)
        pass

    # Mark as enabled
    Tensor._autograd_enabled = True
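
BCEBackward, like SigmoidBackward, is defined elsewhere in the autograd module and is not shown in this diff; the gradient it has to produce with respect to the predictions is -(y/p - (1-y)/(1-p)) / N for a mean reduction. A standalone finite-difference check of that formula (the probabilities and targets are illustrative):

import numpy as np

p = np.array([[0.7, 0.2]])                 # predicted probabilities
y = np.array([[1.0, 0.0]])                 # targets
N = p.size

analytic = -(y / p - (1 - y) / (1 - p)) / N

eps = 1e-6
bce = lambda q: -np.mean(y * np.log(q) + (1 - y) * np.log(1 - q))
numeric = np.zeros_like(p)
for idx in np.ndindex(p.shape):
    dp = np.zeros_like(p); dp[idx] = eps
    numeric[idx] = (bce(p + dp) - bce(p - dp)) / (2 * eps)

print(np.allclose(analytic, numeric, atol=1e-5))   # True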


@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
"id": "518b6ae0",
"id": "12ec74ba",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -51,7 +51,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "30bbc6f8",
"id": "22b8191c",
"metadata": {
"nbgrader": {
"grade": false,
@@ -73,7 +73,7 @@
},
{
"cell_type": "markdown",
"id": "9057f3bf",
"id": "5e247213",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -130,7 +130,7 @@
},
{
"cell_type": "markdown",
"id": "3b2f074e",
"id": "b02eef44",
"metadata": {
"cell_marker": "\"\"\""
},
@@ -216,7 +216,7 @@
},
{
"cell_type": "markdown",
"id": "3000c581",
"id": "8e1f357d",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -244,7 +244,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "d9343aa4",
"id": "e321b2f5",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -330,7 +330,7 @@
},
{
"cell_type": "markdown",
"id": "0ded4383",
"id": "03cb57b3",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -346,7 +346,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "25d61648",
"id": "144f8d5e",
"metadata": {
"nbgrader": {
"grade": true,
@@ -399,7 +399,7 @@
},
{
"cell_type": "markdown",
"id": "bf5adabc",
"id": "9d29a09d",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -471,7 +471,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "12f0f4b6",
"id": "9f0b044f",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -548,8 +548,8 @@
" if param.grad is None:\n",
" continue\n",
"\n",
" # Get gradient\n",
" grad = param.grad.data\n",
" # Get gradient (param.grad is already a numpy array)\n",
" grad = param.grad\n",
"\n",
" # Apply weight decay\n",
" if self.weight_decay != 0:\n",
@@ -575,7 +575,7 @@
},
{
"cell_type": "markdown",
"id": "815d0bab",
"id": "6ef1174b",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -591,7 +591,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "c01ebc69",
"id": "2bb83981",
"metadata": {
"nbgrader": {
"grade": true,
@@ -658,7 +658,7 @@
},
{
"cell_type": "markdown",
"id": "c656b1b4",
"id": "340304b6",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -732,7 +732,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b545ed16",
"id": "5ea57b8b",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -820,8 +820,8 @@
" if param.grad is None:\n",
" continue\n",
"\n",
" # Get gradient\n",
" grad = param.grad.data\n",
" # Get gradient (param.grad is already a numpy array)\n",
" grad = param.grad\n",
"\n",
" # Apply weight decay\n",
" if self.weight_decay != 0:\n",
@@ -853,7 +853,7 @@
},
{
"cell_type": "markdown",
"id": "b688bced",
"id": "9a2c3a83",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -869,7 +869,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "61fa7116",
"id": "313cea61",
"metadata": {
"nbgrader": {
"grade": true,
@@ -945,7 +945,7 @@
},
{
"cell_type": "markdown",
"id": "7cb028b2",
"id": "2e3dd1a3",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1019,7 +1019,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "277056cc",
"id": "8a7bc513",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -1103,8 +1103,8 @@
" if param.grad is None:\n",
" continue\n",
"\n",
" # Get gradient (NOT modified by weight decay)\n",
" grad = param.grad.data\n",
" # Get gradient (NOT modified by weight decay) - param.grad is already a numpy array\n",
" grad = param.grad\n",
"\n",
" # Initialize buffers if needed\n",
" if self.m_buffers[i] is None:\n",
@@ -1134,7 +1134,7 @@
},
{
"cell_type": "markdown",
"id": "d59b1b2b",
"id": "3e8313ad",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1150,7 +1150,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "619464ee",
"id": "82da7b56",
"metadata": {
"nbgrader": {
"grade": true,
@@ -1225,7 +1225,7 @@
},
{
"cell_type": "markdown",
"id": "29f5ad7b",
"id": "aed65b1f",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 2
@@ -1252,7 +1252,7 @@
},
{
"cell_type": "markdown",
"id": "9dd160f5",
"id": "06a5f38a",
"metadata": {
"cell_marker": "\"\"\"",
"lines_to_next_cell": 1
@@ -1298,7 +1298,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "ab882d12",
"id": "2e3d283c",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -1356,7 +1356,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "abac74aa",
"id": "740b19ea",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -1435,7 +1435,7 @@
},
{
"cell_type": "markdown",
"id": "146f209d",
"id": "54210732",
"metadata": {
"lines_to_next_cell": 1
},
@@ -1457,7 +1457,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "1726f746",
"id": "9ab2a813",
"metadata": {
"lines_to_next_cell": 1,
"nbgrader": {
@@ -1608,7 +1608,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "7328ac69",
"id": "85af7526",
"metadata": {},
"outputs": [],
"source": [
@@ -1619,7 +1619,7 @@
},
{
"cell_type": "markdown",
"id": "c662a5f7",
"id": "5f6cabd5",
"metadata": {
"cell_marker": "\"\"\""
},


@@ -473,8 +473,8 @@ class SGD(Optimizer):
            if param.grad is None:
                continue

            # Get gradient
            grad = param.grad.data
            # Get gradient (param.grad is already a numpy array)
            grad = param.grad

            # Apply weight decay
            if self.weight_decay != 0:
@@ -705,8 +705,8 @@ class Adam(Optimizer):
            if param.grad is None:
                continue

            # Get gradient
            grad = param.grad.data
            # Get gradient (param.grad is already a numpy array)
            grad = param.grad

            # Apply weight decay
            if self.weight_decay != 0:
@@ -948,8 +948,8 @@ class AdamW(Optimizer):
            if param.grad is None:
                continue

            # Get gradient (NOT modified by weight decay)
            grad = param.grad.data
            # Get gradient (NOT modified by weight decay) - param.grad is already a numpy array
            grad = param.grad

            # Initialize buffers if needed
            if self.m_buffers[i] is None:

tinytorch/__init__.py (generated, 5 lines added)

@@ -10,6 +10,11 @@ from .core.activations import Sigmoid, ReLU, Tanh, GELU, Softmax
from .core.losses import MSELoss, CrossEntropyLoss, BinaryCrossEntropyLoss
from .core.optimizers import SGD, AdamW
# 🔥 CRITICAL: Enable automatic differentiation
# This patches Tensor operations to track gradients
from .core.autograd import enable_autograd
enable_autograd()
# Export main public API
__all__ = [
    'core',

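Since enable_autograd() now runs as a side effect of importing the package, the quickest smoke test is to check the flag it sets (assuming tinytorch is importable in the current environment):

import tinytorch

# enable_autograd() sets this flag once the Tensor methods have been patched
print(getattr(tinytorch.Tensor, "_autograd_enabled", False))   # expected: True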

@@ -59,8 +59,15 @@ class Sigmoid:
"""
### BEGIN SOLUTION
# Apply sigmoid: 1 / (1 + exp(-x))
result = 1.0 / (1.0 + np.exp(-x.data))
return Tensor(result)
result_data = 1.0 / (1.0 + np.exp(-x.data))
result = Tensor(result_data)
# Track gradients if autograd is enabled and input requires_grad
if SigmoidBackward is not None and x.requires_grad:
result.requires_grad = True
result._grad_fn = SigmoidBackward(x, result)
return result
### END SOLUTION
def __call__(self, x: Tensor) -> Tensor:


@@ -456,6 +456,20 @@ def enable_autograd():
        # Initialize or accumulate gradient
        if self.grad is None:
            self.grad = np.zeros_like(self.data)

        # Handle broadcasting: sum gradient to match self.data shape
        if gradient.shape != self.grad.shape:
            # Sum over broadcasted dimensions
            # This handles cases like bias gradients that get broadcast
            ndims_added = len(gradient.shape) - len(self.grad.shape)
            for i in range(ndims_added):
                gradient = np.sum(gradient, axis=0)
            for i, (grad_dim, self_dim) in enumerate(zip(gradient.shape, self.grad.shape)):
                if self_dim == 1 and grad_dim > 1:
                    gradient = np.sum(gradient, axis=i, keepdims=True)
                elif self_dim != grad_dim:
                    gradient = np.sum(gradient, axis=i, keepdims=True)

        self.grad += gradient

        # Propagate gradients through computation graph
@@ -484,6 +498,52 @@ def enable_autograd():
    Tensor.backward = backward
    Tensor.zero_grad = zero_grad

    # Patch activations and losses to track gradients
    try:
        from tinytorch.core.activations import Sigmoid
        from tinytorch.core.losses import BinaryCrossEntropyLoss

        # Store original methods
        _original_sigmoid_forward = Sigmoid.forward
        _original_bce_forward = BinaryCrossEntropyLoss.forward

        def tracked_sigmoid_forward(self, x):
            """Sigmoid with gradient tracking."""
            result_data = 1.0 / (1.0 + np.exp(-x.data))
            result = Tensor(result_data)

            if x.requires_grad:
                result.requires_grad = True
                result._grad_fn = SigmoidBackward(x, result)

            return result

        def tracked_bce_forward(self, predictions, targets):
            """Binary cross-entropy with gradient tracking."""
            # Compute BCE loss
            eps = 1e-7
            clamped_preds = np.clip(predictions.data, eps, 1 - eps)
            log_preds = np.log(clamped_preds)
            log_one_minus_preds = np.log(1 - clamped_preds)
            bce_per_sample = -(targets.data * log_preds + (1 - targets.data) * log_one_minus_preds)
            bce_loss = np.mean(bce_per_sample)

            result = Tensor(bce_loss)

            if predictions.requires_grad:
                result.requires_grad = True
                result._grad_fn = BCEBackward(predictions, targets)

            return result

        # Install patched methods
        Sigmoid.forward = tracked_sigmoid_forward
        BinaryCrossEntropyLoss.forward = tracked_bce_forward

    except ImportError:
        # Activations/losses not yet available (happens during module development)
        pass

    # Mark as enabled
    Tensor._autograd_enabled = True


@@ -162,8 +162,8 @@ class SGD(Optimizer):
            if param.grad is None:
                continue

            # Get gradient
            grad = param.grad.data
            # Get gradient (param.grad is already a numpy array)
            grad = param.grad

            # Apply weight decay
            if self.weight_decay != 0:
@@ -263,8 +263,8 @@ class Adam(Optimizer):
            if param.grad is None:
                continue

            # Get gradient
            grad = param.grad.data
            # Get gradient (param.grad is already a numpy array)
            grad = param.grad

            # Apply weight decay
            if self.weight_decay != 0:
@@ -366,8 +366,8 @@ class AdamW(Optimizer):
            if param.grad is None:
                continue

            # Get gradient (NOT modified by weight decay)
            grad = param.grad.data
            # Get gradient (NOT modified by weight decay) - param.grad is already a numpy array
            grad = param.grad

            # Initialize buffers if needed
            if self.m_buffers[i] is None:
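
For context on why grad = param.grad.data had to change in all three optimizers: the patched backward() stores param.grad as a plain numpy array rather than a Tensor, and on an ndarray the .data attribute is the raw memory buffer (a memoryview), not the values, so it cannot be used in the update arithmetic. A small standalone illustration:

import numpy as np

grad = np.array([[0.1, -0.2]])             # what backward() now stores in param.grad
print(type(grad.data))                     # <class 'memoryview'> -- not usable in the update rule

weights = np.zeros((1, 2))
weights -= 0.1 * grad                      # the fixed step: operate on the ndarray itself
print(weights)                             # [[-0.01  0.02]]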