diff --git a/debug_gradients.py b/debug_gradients.py
new file mode 100644
index 00000000..0e89fbbe
--- /dev/null
+++ b/debug_gradients.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+"""
+Debug script to trace gradient propagation through the TinyTorch stack.
+Runs six stages (tensor ops, Linear forward/backward, SGD step, Sigmoid + BCE, parameters()) and prints a verdict for each.
+"""
+
+import numpy as np
+from tinytorch import Tensor, Linear, Sigmoid, BinaryCrossEntropyLoss, SGD
+
+print("=" * 70)
+print("🔍 GRADIENT FLOW DEBUGGING")
+print("=" * 70)
+
+# ============================================================================
+# TEST 1: Basic Tensor Operations (d/dx of sum(2 * x) must be 2 everywhere)
+# ============================================================================
+print("\n[TEST 1] Basic Tensor Operations")
+print("-" * 70)
+
+x = Tensor([[1.0, 2.0]], requires_grad=True)
+print(f"✓ Created tensor x: {x.data}")
+print(f" requires_grad: {x.requires_grad}")
+print(f" grad: {x.grad}")
+
+y = x * 2
+print(f"\n✓ Created y = x * 2: {y.data}")
+print(f" requires_grad: {y.requires_grad}")
+print(f" grad: {y.grad}")
+
+loss = y.sum()
+print(f"\n✓ Created loss = y.sum(): {loss.data}")
+print(f" requires_grad: {loss.requires_grad}")
+
+print("\n📊 Before backward:")
+print(f" x.grad: {x.grad}")
+
+loss.backward()
+
+print("\n📊 After backward:")
+print(f" x.grad: {x.grad}")
+
+if x.grad is not None and np.allclose(x.grad, [[2.0, 2.0]]):
+    print("✅ TEST 1 PASSED: Basic gradients work!")
+else:
+    print("❌ TEST 1 FAILED: Basic gradients don't work!")
+    print(f" Expected: [[2.0, 2.0]], Got: {x.grad}")
+
+# ============================================================================
+# TEST 2: Linear Layer Forward Pass (informational only, prints no PASS/FAIL verdict)
+# ============================================================================
+print("\n\n[TEST 2] Linear Layer Forward Pass")
+print("-" * 70)
+
+layer = Linear(2, 1)
+print("✓ Created Linear(2, 1)")
+print(f" weight.data: {layer.weight.data}")
+print(f" weight.requires_grad: {layer.weight.requires_grad}")
+print(f" bias.data: {layer.bias.data}")
+print(f" bias.requires_grad: {layer.bias.requires_grad}")
+
+x = Tensor([[1.0, 2.0]], requires_grad=True)
+out = layer(x)
+print(f"\n✓ Forward pass output: {out.data}")
+print(f" out.requires_grad: {out.requires_grad}")
+
+# ============================================================================
+# TEST 3: Linear Layer Backward Pass (passes when both weight.grad and bias.grad are set)
+# ============================================================================
+print("\n\n[TEST 3] Linear Layer Backward Pass")
+print("-" * 70)
+
+layer = Linear(2, 1)
+w_before = layer.weight.data.copy()
+b_before = layer.bias.data.copy()
+
+print("Before backward:")
+print(f" weight: {w_before}")
+print(f" bias: {b_before}")
+print(f" weight.grad: {layer.weight.grad}")
+print(f" bias.grad: {layer.bias.grad}")
+
+x = Tensor([[1.0, 2.0]], requires_grad=True)
+out = layer(x)
+loss = out.sum()
+
+print(f"\n✓ Created loss: {loss.data}")
+
+loss.backward()
+
+print("\nAfter backward:")
+print(f" weight.grad: {layer.weight.grad}")
+print(f" bias.grad: {layer.bias.grad}")
+print(f" x.grad: {x.grad}")
+
+if layer.weight.grad is not None and layer.bias.grad is not None:
+    print("✅ TEST 3 PASSED: Linear layer gradients computed!")
+else:
+    print("❌ TEST 3 FAILED: Linear layer gradients missing!")
+
+# ============================================================================
+# TEST 4: Optimizer Step (passes when weight or bias differs after optimizer.step())
+# ============================================================================
+print("\n\n[TEST 4] Optimizer Step")
+print("-" * 70)
+
+layer = Linear(2, 1)
+optimizer = SGD(layer.parameters(), lr=0.1)
+
+print("✓ Created optimizer with lr=0.1")
+print(f" Num parameters: {len(optimizer.params)}")
+
+w_before = layer.weight.data.copy()
+b_before = layer.bias.data.copy()
+
+print("\nBefore training step:")
+print(f" weight: {w_before}")
+print(f" bias: {b_before}")
+
+# Forward: run the layer and reduce to a single value that backward() starts from
+x = Tensor([[1.0, 2.0]], requires_grad=True)
+out = layer(x)
+loss = out.sum()
+
+print(f"\n✓ Forward pass, loss: {loss.data}")
+
+# Backward: expected to populate .grad on the layer's weight and bias
+loss.backward()
+
+print("\nAfter backward:")
+print(f" weight.grad: {layer.weight.grad}")
+print(f" bias.grad: {layer.bias.grad}")
+
+# Step: the optimizer should now change the parameter values
+optimizer.step()
+
+w_after = layer.weight.data.copy()
+b_after = layer.bias.data.copy()
+
+print("\nAfter optimizer.step():")
+print(f" weight: {w_after}")
+print(f" bias: {b_after}")
+print(f" weight changed: {not np.allclose(w_before, w_after)}")
+print(f" bias changed: {not np.allclose(b_before, b_after)}")
+
+if not np.allclose(w_before, w_after) or not np.allclose(b_before, b_after):
+    print("✅ TEST 4 PASSED: Optimizer updates parameters!")
+else:
+    print("❌ TEST 4 FAILED: Optimizer didn't update parameters!")
+
+# ============================================================================
+# TEST 5: Full Training Step with Sigmoid + BCE
+# ============================================================================
+print("\n\n[TEST 5] Full Training Step (Linear + Sigmoid + BCE)")
+print("-" * 70)
+
+layer = Linear(2, 1)
+sigmoid = Sigmoid()
+loss_fn = BinaryCrossEntropyLoss()
+optimizer = SGD(layer.parameters(), lr=0.1)
+
+w_before = layer.weight.data.copy()
+b_before = layer.bias.data.copy()
+
+print("Before training:")
+print(f" weight: {w_before}")
+print(f" bias: {b_before}")
+
+# Data: a single sample with target 1.0
+x = Tensor([[1.0, 2.0]], requires_grad=True)
+y_true = Tensor([[1.0]])
+
+# Forward: Linear -> Sigmoid -> BCE, printing every intermediate value
+logits = layer(x)
+print(f"\n✓ Logits: {logits.data}")
+
+probs = sigmoid(logits)
+print(f"✓ Probs: {probs.data}")
+
+loss = loss_fn(probs, y_true)
+print(f"✓ Loss: {loss.data}")
+
+# Backward: a None .grad printed below shows which stage the gradient did not reach
+print("\n📊 Calling loss.backward()...")
+loss.backward()
+
+print("\nAfter backward:")
+print(f" loss.grad: {loss.grad}")
+print(f" probs.grad: {probs.grad}")
+print(f" logits.grad: {logits.grad}")
+print(f" weight.grad: {layer.weight.grad}")
+print(f" bias.grad: {layer.bias.grad}")
+
+# Update: apply one SGD step, then compare against the saved copies
+optimizer.step()
+
+w_after = layer.weight.data.copy()
+b_after = layer.bias.data.copy()
+
+print("\nAfter optimizer.step():")
+print(f" weight: {w_after}")
+print(f" bias: {b_after}")
+print(f" weight changed: {not np.allclose(w_before, w_after)}")
+print(f" bias changed: {not np.allclose(b_before, b_after)}")
+
+if not np.allclose(w_before, w_after) or not np.allclose(b_before, b_after):
+    print("✅ TEST 5 PASSED: Full training step works!")
+else:
+    print("❌ TEST 5 FAILED: Full training step doesn't update weights!")
+
+# ============================================================================
+# TEST 6: Parameters Function (Linear.parameters() is expected to return two tensors)
+# ============================================================================
+print("\n\n[TEST 6] Layer parameters() method")
+print("-" * 70)
+
+layer = Linear(2, 1)
+params = layer.parameters()
+
+print(f"✓ layer.parameters() returned {len(params)} parameters")
+for i, p in enumerate(params):
+    print(f" param[{i}]: shape={p.shape}, requires_grad={p.requires_grad}, type={type(p)}")
+
+if len(params) == 2:
+    print("✅ TEST 6 PASSED: parameters() returns weight and bias!")
+else:
+    print("❌ TEST 6 FAILED: parameters() should return 2 tensors!")
+
+print("\n" + "=" * 70)
+print("🏁 DEBUGGING COMPLETE")
+print("=" * 70)
diff --git a/modules/source/02_activations/activations_dev.ipynb b/modules/source/02_activations/activations_dev.ipynb index f3dc5445..b2515f9a 100644 --- a/modules/source/02_activations/activations_dev.ipynb +++ b/modules/source/02_activations/activations_dev.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "41637b5b", + "id": "e444d0e8", "metadata": { "cell_marker": "\"\"\"" }, @@ -34,7 +34,7 @@ }, { "cell_type": "markdown", - "id": "eb80f71c", + "id": "f8e11333", "metadata": { "cell_marker": "\"\"\"" }, @@ -59,7 +59,7 @@ }, { "cell_type": "markdown", - "id": "ad445b19", + "id": "9bb0541c", "metadata": { "cell_marker": "\"\"\"" }, @@ -78,7 +78,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7fc4b3ae", + "id": "c50f8430", "metadata": { "nbgrader": { "grade": false, @@ 
-102,7 +102,7 @@ }, { "cell_type": "markdown", - "id": "6c49b0a7", + "id": "7dd50568", "metadata": { "cell_marker": "\"\"\"" }, @@ -144,7 +144,7 @@ }, { "cell_type": "markdown", - "id": "a82d5ffc", + "id": "0402638e", "metadata": { "cell_marker": "\"\"\"" }, @@ -166,7 +166,7 @@ }, { "cell_type": "markdown", - "id": "d954190f", + "id": "9cc4031a", "metadata": { "cell_marker": "\"\"\"" }, @@ -190,7 +190,7 @@ }, { "cell_type": "markdown", - "id": "1d26aa84", + "id": "64941304", "metadata": { "cell_marker": "\"\"\"" }, @@ -228,7 +228,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cd112f28", + "id": "e6b7ee43", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -272,8 +272,15 @@ " \"\"\"\n", " ### BEGIN SOLUTION\n", " # Apply sigmoid: 1 / (1 + exp(-x))\n", - " result = 1.0 / (1.0 + np.exp(-x.data))\n", - " return Tensor(result)\n", + " result_data = 1.0 / (1.0 + np.exp(-x.data))\n", + " result = Tensor(result_data)\n", + " \n", + " # Track gradients if autograd is enabled and input requires_grad\n", + " if SigmoidBackward is not None and x.requires_grad:\n", + " result.requires_grad = True\n", + " result._grad_fn = SigmoidBackward(x, result)\n", + " \n", + " return result\n", " ### END SOLUTION\n", "\n", " def __call__(self, x: Tensor) -> Tensor:\n", @@ -287,7 +294,7 @@ }, { "cell_type": "markdown", - "id": "87407a56", + "id": "526bd575", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -303,7 +310,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8599e53a", + "id": "667cc4ec", "metadata": { "nbgrader": { "grade": true, @@ -344,7 +351,7 @@ }, { "cell_type": "markdown", - "id": "96438263", + "id": "6b301e45", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -386,7 +393,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6bdad44d", + "id": "c29349e4", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -442,7 +449,7 @@ }, { "cell_type": "markdown", - "id": "853265df", + "id": "34c5f3a4", 
"metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -458,7 +465,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e3f2e5fd", + "id": "517b440a", "metadata": { "nbgrader": { "grade": true, @@ -505,7 +512,7 @@ }, { "cell_type": "markdown", - "id": "d137e456", + "id": "b92601a1", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -544,7 +551,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3a3ec4c5", + "id": "9e1cbe76", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -600,7 +607,7 @@ }, { "cell_type": "markdown", - "id": "b2ad2baa", + "id": "b2d12529", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -616,7 +623,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b92572ae", + "id": "a2e9f944", "metadata": { "nbgrader": { "grade": true, @@ -664,7 +671,7 @@ }, { "cell_type": "markdown", - "id": "d1cdd503", + "id": "f24f4022", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -707,7 +714,7 @@ { "cell_type": "code", "execution_count": null, - "id": "90f15779", + "id": "3d86ca1f", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -768,7 +775,7 @@ }, { "cell_type": "markdown", - "id": "eb655b3b", + "id": "f16cc4b7", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -784,7 +791,7 @@ { "cell_type": "code", "execution_count": null, - "id": "838060ac", + "id": "07f74f6a", "metadata": { "nbgrader": { "grade": true, @@ -832,7 +839,7 @@ }, { "cell_type": "markdown", - "id": "a8047ea8", + "id": "4f0611e8", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -870,7 +877,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aa266bb7", + "id": "39ff5a3a", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -942,7 +949,7 @@ }, { "cell_type": "markdown", - "id": "80e6ad27", + "id": "f0a20259", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -958,7 +965,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"f3db3810", + "id": "3b6dd4d2", "metadata": { "nbgrader": { "grade": true, @@ -1016,7 +1023,7 @@ }, { "cell_type": "markdown", - "id": "2db83cef", + "id": "9d0b8c20", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 2 @@ -1029,7 +1036,7 @@ }, { "cell_type": "markdown", - "id": "428eaa1b", + "id": "9716c7c5", "metadata": { "cell_marker": "\"\"\"" }, @@ -1049,7 +1056,7 @@ }, { "cell_type": "markdown", - "id": "fe7666b9", + "id": "bdcbe6f6", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1063,7 +1070,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fac9ee55", + "id": "e7a90720", "metadata": { "lines_to_next_cell": 2, "nbgrader": { @@ -1162,7 +1169,7 @@ }, { "cell_type": "markdown", - "id": "6a9cc930", + "id": "d82baf1e", "metadata": { "cell_marker": "\"\"\"" }, diff --git a/modules/source/02_activations/activations_dev.py b/modules/source/02_activations/activations_dev.py index edadfae7..8f9787f0 100644 --- a/modules/source/02_activations/activations_dev.py +++ b/modules/source/02_activations/activations_dev.py @@ -223,8 +223,15 @@ class Sigmoid: """ ### BEGIN SOLUTION # Apply sigmoid: 1 / (1 + exp(-x)) - result = 1.0 / (1.0 + np.exp(-x.data)) - return Tensor(result) + result_data = 1.0 / (1.0 + np.exp(-x.data)) + result = Tensor(result_data) + + # Track gradients if autograd is enabled and input requires_grad + if SigmoidBackward is not None and x.requires_grad: + result.requires_grad = True + result._grad_fn = SigmoidBackward(x, result) + + return result ### END SOLUTION def __call__(self, x: Tensor) -> Tensor: diff --git a/modules/source/05_autograd/autograd_dev.ipynb b/modules/source/05_autograd/autograd_dev.ipynb index ca9e17fc..a1ca3ffe 100644 --- a/modules/source/05_autograd/autograd_dev.ipynb +++ b/modules/source/05_autograd/autograd_dev.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "11a866a5", + "id": "dc404941", "metadata": { "cell_marker": "\"\"\"" }, @@ -54,7 +54,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "6f716656", + "id": "6b25fc7d", "metadata": { "nbgrader": { "grade": false, @@ -77,7 +77,7 @@ }, { "cell_type": "markdown", - "id": "1c5fcfe6", + "id": "ad79196c", "metadata": { "cell_marker": "\"\"\"" }, @@ -131,7 +131,7 @@ }, { "cell_type": "markdown", - "id": "82cafe21", + "id": "268f66e4", "metadata": { "cell_marker": "\"\"\"" }, @@ -190,7 +190,7 @@ }, { "cell_type": "markdown", - "id": "47bd67c9", + "id": "47d547ca", "metadata": { "cell_marker": "\"\"\"" }, @@ -227,7 +227,7 @@ }, { "cell_type": "markdown", - "id": "cce8538a", + "id": "b09afd8d", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -255,7 +255,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7c604fa6", + "id": "d02437c4", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -321,7 +321,7 @@ }, { "cell_type": "markdown", - "id": "f721b07e", + "id": "8e0706ae", "metadata": { "cell_marker": "\"\"\"" }, @@ -360,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "b783a909", + "id": "dea5fcaf", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -389,7 +389,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b8c92aa2", + "id": "a0f8a601", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -444,7 +444,7 @@ }, { "cell_type": "markdown", - "id": "31a8a1ab", + "id": "e8dc3302", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1a6762d0", + "id": "c6812744", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -535,7 +535,7 @@ }, { "cell_type": "markdown", - "id": "11567a68", + "id": "303fb215", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -570,7 +570,7 @@ { "cell_type": "code", "execution_count": null, - "id": "102ba9f6", + "id": "5e7d9f70", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -627,7 +627,7 @@ }, { "cell_type": "markdown", - "id": "d9496bda", + "id": "08c1cf65", "metadata": { 
"cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -658,7 +658,7 @@ { "cell_type": "code", "execution_count": null, - "id": "37f9b250", + "id": "21c2fb7b", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -706,7 +706,7 @@ }, { "cell_type": "markdown", - "id": "116f71ea", + "id": "cb3a65f0", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -722,7 +722,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b2120ecf", + "id": "78182b35", "metadata": { "nbgrader": { "grade": true, @@ -769,7 +769,7 @@ }, { "cell_type": "markdown", - "id": "9685115d", + "id": "25c230ca", "metadata": { "cell_marker": "\"\"\"" }, @@ -804,7 +804,7 @@ }, { "cell_type": "markdown", - "id": "5612e207", + "id": "8b293571", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -830,7 +830,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b49922a3", + "id": "eba66f56", "metadata": { "nbgrader": { "grade": false, @@ -874,7 +874,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e79f5497", + "id": "0fd1c893", "metadata": { "nbgrader": { "grade": false, @@ -918,7 +918,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58b86487", + "id": "1eb3f913", "metadata": { "nbgrader": { "grade": false, @@ -1084,6 +1084,20 @@ " # Initialize or accumulate gradient\n", " if self.grad is None:\n", " self.grad = np.zeros_like(self.data)\n", + " \n", + " # Handle broadcasting: sum gradient to match self.data shape\n", + " if gradient.shape != self.grad.shape:\n", + " # Sum over broadcasted dimensions\n", + " # This handles cases like bias gradients that get broadcast\n", + " ndims_added = len(gradient.shape) - len(self.grad.shape)\n", + " for i in range(ndims_added):\n", + " gradient = np.sum(gradient, axis=0)\n", + " for i, (grad_dim, self_dim) in enumerate(zip(gradient.shape, self.grad.shape)):\n", + " if self_dim == 1 and grad_dim > 1:\n", + " gradient = np.sum(gradient, axis=i, keepdims=True)\n", + " elif self_dim != grad_dim:\n", + " 
gradient = np.sum(gradient, axis=i, keepdims=True)\n", + " \n", " self.grad += gradient\n", "\n", " # Propagate gradients through computation graph\n", @@ -1112,6 +1126,52 @@ " Tensor.backward = backward\n", " Tensor.zero_grad = zero_grad\n", "\n", + " # Patch activations and losses to track gradients\n", + " try:\n", + " from tinytorch.core.activations import Sigmoid\n", + " from tinytorch.core.losses import BinaryCrossEntropyLoss\n", + " \n", + " # Store original methods\n", + " _original_sigmoid_forward = Sigmoid.forward\n", + " _original_bce_forward = BinaryCrossEntropyLoss.forward\n", + " \n", + " def tracked_sigmoid_forward(self, x):\n", + " \"\"\"Sigmoid with gradient tracking.\"\"\"\n", + " result_data = 1.0 / (1.0 + np.exp(-x.data))\n", + " result = Tensor(result_data)\n", + " \n", + " if x.requires_grad:\n", + " result.requires_grad = True\n", + " result._grad_fn = SigmoidBackward(x, result)\n", + " \n", + " return result\n", + " \n", + " def tracked_bce_forward(self, predictions, targets):\n", + " \"\"\"Binary cross-entropy with gradient tracking.\"\"\"\n", + " # Compute BCE loss\n", + " eps = 1e-7\n", + " clamped_preds = np.clip(predictions.data, eps, 1 - eps)\n", + " log_preds = np.log(clamped_preds)\n", + " log_one_minus_preds = np.log(1 - clamped_preds)\n", + " bce_per_sample = -(targets.data * log_preds + (1 - targets.data) * log_one_minus_preds)\n", + " bce_loss = np.mean(bce_per_sample)\n", + " \n", + " result = Tensor(bce_loss)\n", + " \n", + " if predictions.requires_grad:\n", + " result.requires_grad = True\n", + " result._grad_fn = BCEBackward(predictions, targets)\n", + " \n", + " return result\n", + " \n", + " # Install patched methods\n", + " Sigmoid.forward = tracked_sigmoid_forward\n", + " BinaryCrossEntropyLoss.forward = tracked_bce_forward\n", + " \n", + " except ImportError:\n", + " # Activations/losses not yet available (happens during module development)\n", + " pass\n", + "\n", " # Mark as enabled\n", " Tensor._autograd_enabled = 
True\n", "\n", @@ -1126,7 +1186,7 @@ }, { "cell_type": "markdown", - "id": "d03e54f6", + "id": "19562839", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1142,7 +1202,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1bae0903", + "id": "469f4a49", "metadata": { "nbgrader": { "grade": true, @@ -1190,7 +1250,7 @@ }, { "cell_type": "markdown", - "id": "fc159b24", + "id": "db0b3fec", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1204,7 +1264,7 @@ { "cell_type": "code", "execution_count": null, - "id": "92f51d47", + "id": "7d5d64d8", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1317,7 +1377,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ef3b1668", + "id": "8fbb5c87", "metadata": {}, "outputs": [], "source": [ @@ -1328,7 +1388,7 @@ }, { "cell_type": "markdown", - "id": "7728d17d", + "id": "4e4a0fe1", "metadata": { "cell_marker": "\"\"\"" }, diff --git a/modules/source/05_autograd/autograd_dev.py b/modules/source/05_autograd/autograd_dev.py index d2d27cc1..4248f916 100644 --- a/modules/source/05_autograd/autograd_dev.py +++ b/modules/source/05_autograd/autograd_dev.py @@ -891,6 +891,20 @@ def enable_autograd(): # Initialize or accumulate gradient if self.grad is None: self.grad = np.zeros_like(self.data) + + # Handle broadcasting: sum gradient to match self.data shape + if gradient.shape != self.grad.shape: + # Sum over broadcasted dimensions + # This handles cases like bias gradients that get broadcast + ndims_added = len(gradient.shape) - len(self.grad.shape) + for i in range(ndims_added): + gradient = np.sum(gradient, axis=0) + for i, (grad_dim, self_dim) in enumerate(zip(gradient.shape, self.grad.shape)): + if self_dim == 1 and grad_dim > 1: + gradient = np.sum(gradient, axis=i, keepdims=True) + elif self_dim != grad_dim: + gradient = np.sum(gradient, axis=i, keepdims=True) + self.grad += gradient # Propagate gradients through computation graph @@ -919,6 +933,52 @@ def enable_autograd(): 
Tensor.backward = backward Tensor.zero_grad = zero_grad + # Patch activations and losses to track gradients + try: + from tinytorch.core.activations import Sigmoid + from tinytorch.core.losses import BinaryCrossEntropyLoss + + # Store original methods + _original_sigmoid_forward = Sigmoid.forward + _original_bce_forward = BinaryCrossEntropyLoss.forward + + def tracked_sigmoid_forward(self, x): + """Sigmoid with gradient tracking.""" + result_data = 1.0 / (1.0 + np.exp(-x.data)) + result = Tensor(result_data) + + if x.requires_grad: + result.requires_grad = True + result._grad_fn = SigmoidBackward(x, result) + + return result + + def tracked_bce_forward(self, predictions, targets): + """Binary cross-entropy with gradient tracking.""" + # Compute BCE loss + eps = 1e-7 + clamped_preds = np.clip(predictions.data, eps, 1 - eps) + log_preds = np.log(clamped_preds) + log_one_minus_preds = np.log(1 - clamped_preds) + bce_per_sample = -(targets.data * log_preds + (1 - targets.data) * log_one_minus_preds) + bce_loss = np.mean(bce_per_sample) + + result = Tensor(bce_loss) + + if predictions.requires_grad: + result.requires_grad = True + result._grad_fn = BCEBackward(predictions, targets) + + return result + + # Install patched methods + Sigmoid.forward = tracked_sigmoid_forward + BinaryCrossEntropyLoss.forward = tracked_bce_forward + + except ImportError: + # Activations/losses not yet available (happens during module development) + pass + # Mark as enabled Tensor._autograd_enabled = True diff --git a/modules/source/06_optimizers/optimizers_dev.ipynb b/modules/source/06_optimizers/optimizers_dev.ipynb index 7ef0e46e..85bb7623 100644 --- a/modules/source/06_optimizers/optimizers_dev.ipynb +++ b/modules/source/06_optimizers/optimizers_dev.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "518b6ae0", + "id": "12ec74ba", "metadata": { "cell_marker": "\"\"\"" }, @@ -51,7 +51,7 @@ { "cell_type": "code", "execution_count": null, - "id": "30bbc6f8", + "id": 
"22b8191c", "metadata": { "nbgrader": { "grade": false, @@ -73,7 +73,7 @@ }, { "cell_type": "markdown", - "id": "9057f3bf", + "id": "5e247213", "metadata": { "cell_marker": "\"\"\"" }, @@ -130,7 +130,7 @@ }, { "cell_type": "markdown", - "id": "3b2f074e", + "id": "b02eef44", "metadata": { "cell_marker": "\"\"\"" }, @@ -216,7 +216,7 @@ }, { "cell_type": "markdown", - "id": "3000c581", + "id": "8e1f357d", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -244,7 +244,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d9343aa4", + "id": "e321b2f5", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -330,7 +330,7 @@ }, { "cell_type": "markdown", - "id": "0ded4383", + "id": "03cb57b3", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "25d61648", + "id": "144f8d5e", "metadata": { "nbgrader": { "grade": true, @@ -399,7 +399,7 @@ }, { "cell_type": "markdown", - "id": "bf5adabc", + "id": "9d29a09d", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -471,7 +471,7 @@ { "cell_type": "code", "execution_count": null, - "id": "12f0f4b6", + "id": "9f0b044f", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -548,8 +548,8 @@ " if param.grad is None:\n", " continue\n", "\n", - " # Get gradient\n", - " grad = param.grad.data\n", + " # Get gradient (param.grad is already a numpy array)\n", + " grad = param.grad\n", "\n", " # Apply weight decay\n", " if self.weight_decay != 0:\n", @@ -575,7 +575,7 @@ }, { "cell_type": "markdown", - "id": "815d0bab", + "id": "6ef1174b", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -591,7 +591,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c01ebc69", + "id": "2bb83981", "metadata": { "nbgrader": { "grade": true, @@ -658,7 +658,7 @@ }, { "cell_type": "markdown", - "id": "c656b1b4", + "id": "340304b6", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -732,7 +732,7 
@@ { "cell_type": "code", "execution_count": null, - "id": "b545ed16", + "id": "5ea57b8b", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -820,8 +820,8 @@ " if param.grad is None:\n", " continue\n", "\n", - " # Get gradient\n", - " grad = param.grad.data\n", + " # Get gradient (param.grad is already a numpy array)\n", + " grad = param.grad\n", "\n", " # Apply weight decay\n", " if self.weight_decay != 0:\n", @@ -853,7 +853,7 @@ }, { "cell_type": "markdown", - "id": "b688bced", + "id": "9a2c3a83", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -869,7 +869,7 @@ { "cell_type": "code", "execution_count": null, - "id": "61fa7116", + "id": "313cea61", "metadata": { "nbgrader": { "grade": true, @@ -945,7 +945,7 @@ }, { "cell_type": "markdown", - "id": "7cb028b2", + "id": "2e3dd1a3", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1019,7 +1019,7 @@ { "cell_type": "code", "execution_count": null, - "id": "277056cc", + "id": "8a7bc513", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1103,8 +1103,8 @@ " if param.grad is None:\n", " continue\n", "\n", - " # Get gradient (NOT modified by weight decay)\n", - " grad = param.grad.data\n", + " # Get gradient (NOT modified by weight decay) - param.grad is already a numpy array\n", + " grad = param.grad\n", "\n", " # Initialize buffers if needed\n", " if self.m_buffers[i] is None:\n", @@ -1134,7 +1134,7 @@ }, { "cell_type": "markdown", - "id": "d59b1b2b", + "id": "3e8313ad", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 1 @@ -1150,7 +1150,7 @@ { "cell_type": "code", "execution_count": null, - "id": "619464ee", + "id": "82da7b56", "metadata": { "nbgrader": { "grade": true, @@ -1225,7 +1225,7 @@ }, { "cell_type": "markdown", - "id": "29f5ad7b", + "id": "aed65b1f", "metadata": { "cell_marker": "\"\"\"", "lines_to_next_cell": 2 @@ -1252,7 +1252,7 @@ }, { "cell_type": "markdown", - "id": "9dd160f5", + "id": "06a5f38a", "metadata": { "cell_marker": "\"\"\"", 
"lines_to_next_cell": 1 @@ -1298,7 +1298,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ab882d12", + "id": "2e3d283c", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1356,7 +1356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "abac74aa", + "id": "740b19ea", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1435,7 +1435,7 @@ }, { "cell_type": "markdown", - "id": "146f209d", + "id": "54210732", "metadata": { "lines_to_next_cell": 1 }, @@ -1457,7 +1457,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1726f746", + "id": "9ab2a813", "metadata": { "lines_to_next_cell": 1, "nbgrader": { @@ -1608,7 +1608,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7328ac69", + "id": "85af7526", "metadata": {}, "outputs": [], "source": [ @@ -1619,7 +1619,7 @@ }, { "cell_type": "markdown", - "id": "c662a5f7", + "id": "5f6cabd5", "metadata": { "cell_marker": "\"\"\"" }, diff --git a/modules/source/06_optimizers/optimizers_dev.py b/modules/source/06_optimizers/optimizers_dev.py index 0c778e98..b52546f8 100644 --- a/modules/source/06_optimizers/optimizers_dev.py +++ b/modules/source/06_optimizers/optimizers_dev.py @@ -473,8 +473,8 @@ class SGD(Optimizer): if param.grad is None: continue - # Get gradient - grad = param.grad.data + # Get gradient (param.grad is already a numpy array) + grad = param.grad # Apply weight decay if self.weight_decay != 0: @@ -705,8 +705,8 @@ class Adam(Optimizer): if param.grad is None: continue - # Get gradient - grad = param.grad.data + # Get gradient (param.grad is already a numpy array) + grad = param.grad # Apply weight decay if self.weight_decay != 0: @@ -948,8 +948,8 @@ class AdamW(Optimizer): if param.grad is None: continue - # Get gradient (NOT modified by weight decay) - grad = param.grad.data + # Get gradient (NOT modified by weight decay) - param.grad is already a numpy array + grad = param.grad # Initialize buffers if needed if self.m_buffers[i] is None: diff --git 
a/tinytorch/__init__.py b/tinytorch/__init__.py index 1c559b00..4ee5d1bc 100644 --- a/tinytorch/__init__.py +++ b/tinytorch/__init__.py @@ -10,6 +10,11 @@ from .core.activations import Sigmoid, ReLU, Tanh, GELU, Softmax from .core.losses import MSELoss, CrossEntropyLoss, BinaryCrossEntropyLoss from .core.optimizers import SGD, AdamW +# 🔥 CRITICAL: Enable automatic differentiation +# This patches Tensor operations to track gradients +from .core.autograd import enable_autograd +enable_autograd() + # Export main public API __all__ = [ 'core', diff --git a/tinytorch/core/activations.py b/tinytorch/core/activations.py index fff7d636..fe211edf 100644 --- a/tinytorch/core/activations.py +++ b/tinytorch/core/activations.py @@ -59,8 +59,15 @@ class Sigmoid: """ ### BEGIN SOLUTION # Apply sigmoid: 1 / (1 + exp(-x)) - result = 1.0 / (1.0 + np.exp(-x.data)) - return Tensor(result) + result_data = 1.0 / (1.0 + np.exp(-x.data)) + result = Tensor(result_data) + + # Track gradients if autograd is enabled and input requires_grad + if SigmoidBackward is not None and x.requires_grad: + result.requires_grad = True + result._grad_fn = SigmoidBackward(x, result) + + return result ### END SOLUTION def __call__(self, x: Tensor) -> Tensor: diff --git a/tinytorch/core/autograd.py b/tinytorch/core/autograd.py index 14e18f35..829b7a59 100644 --- a/tinytorch/core/autograd.py +++ b/tinytorch/core/autograd.py @@ -456,6 +456,20 @@ def enable_autograd(): # Initialize or accumulate gradient if self.grad is None: self.grad = np.zeros_like(self.data) + + # Handle broadcasting: sum gradient to match self.data shape + if gradient.shape != self.grad.shape: + # Sum over broadcasted dimensions + # This handles cases like bias gradients that get broadcast + ndims_added = len(gradient.shape) - len(self.grad.shape) + for i in range(ndims_added): + gradient = np.sum(gradient, axis=0) + for i, (grad_dim, self_dim) in enumerate(zip(gradient.shape, self.grad.shape)): + if self_dim == 1 and grad_dim > 1: + 
gradient = np.sum(gradient, axis=i, keepdims=True) + elif self_dim != grad_dim: + gradient = np.sum(gradient, axis=i, keepdims=True) + self.grad += gradient # Propagate gradients through computation graph @@ -484,6 +498,52 @@ def enable_autograd(): Tensor.backward = backward Tensor.zero_grad = zero_grad + # Patch activations and losses to track gradients + try: + from tinytorch.core.activations import Sigmoid + from tinytorch.core.losses import BinaryCrossEntropyLoss + + # Store original methods + _original_sigmoid_forward = Sigmoid.forward + _original_bce_forward = BinaryCrossEntropyLoss.forward + + def tracked_sigmoid_forward(self, x): + """Sigmoid with gradient tracking.""" + result_data = 1.0 / (1.0 + np.exp(-x.data)) + result = Tensor(result_data) + + if x.requires_grad: + result.requires_grad = True + result._grad_fn = SigmoidBackward(x, result) + + return result + + def tracked_bce_forward(self, predictions, targets): + """Binary cross-entropy with gradient tracking.""" + # Compute BCE loss + eps = 1e-7 + clamped_preds = np.clip(predictions.data, eps, 1 - eps) + log_preds = np.log(clamped_preds) + log_one_minus_preds = np.log(1 - clamped_preds) + bce_per_sample = -(targets.data * log_preds + (1 - targets.data) * log_one_minus_preds) + bce_loss = np.mean(bce_per_sample) + + result = Tensor(bce_loss) + + if predictions.requires_grad: + result.requires_grad = True + result._grad_fn = BCEBackward(predictions, targets) + + return result + + # Install patched methods + Sigmoid.forward = tracked_sigmoid_forward + BinaryCrossEntropyLoss.forward = tracked_bce_forward + + except ImportError: + # Activations/losses not yet available (happens during module development) + pass + # Mark as enabled Tensor._autograd_enabled = True diff --git a/tinytorch/core/optimizers.py b/tinytorch/core/optimizers.py index 3698541c..314a8db9 100644 --- a/tinytorch/core/optimizers.py +++ b/tinytorch/core/optimizers.py @@ -162,8 +162,8 @@ class SGD(Optimizer): if param.grad is None: 
continue - # Get gradient - grad = param.grad.data + # Get gradient (param.grad is already a numpy array) + grad = param.grad # Apply weight decay if self.weight_decay != 0: @@ -263,8 +263,8 @@ class Adam(Optimizer): if param.grad is None: continue - # Get gradient - grad = param.grad.data + # Get gradient (param.grad is already a numpy array) + grad = param.grad # Apply weight decay if self.weight_decay != 0: @@ -366,8 +366,8 @@ class AdamW(Optimizer): if param.grad is None: continue - # Get gradient (NOT modified by weight decay) - grad = param.grad.data + # Get gradient (NOT modified by weight decay) - param.grad is already a numpy array + grad = param.grad # Initialize buffers if needed if self.m_buffers[i] is None: