mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-26 15:42:18 -05:00
fix(module-05): Add SubBackward and DivBackward for autograd
- Implement gradient functions for subtraction and division operations.
- Patch Tensor.__sub__ and Tensor.__truediv__ in enable_autograd().
- Required for LayerNorm's (x - mean) and (normalized / std) operations, which are used extensively in normalization layers.
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3405f85e",
|
||||
"id": "c13b821c",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\""
|
||||
},
|
||||
@@ -54,7 +54,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "261c3177",
|
||||
"id": "0ecb4762",
|
||||
"metadata": {
|
||||
"nbgrader": {
|
||||
"grade": false,
|
||||
@@ -77,7 +77,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "984dc0f4",
|
||||
"id": "96f2be2b",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\""
|
||||
},
|
||||
@@ -131,7 +131,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4859deb3",
|
||||
"id": "1ef7e4de",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\""
|
||||
},
|
||||
@@ -190,7 +190,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bfc1da56",
|
||||
"id": "980d2040",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\""
|
||||
},
|
||||
@@ -227,7 +227,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3a252129",
|
||||
"id": "9eb5a38c",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\"",
|
||||
"lines_to_next_cell": 1
|
||||
@@ -255,7 +255,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7311a2dd",
|
||||
"id": "874e9478",
|
||||
"metadata": {
|
||||
"lines_to_next_cell": 1,
|
||||
"nbgrader": {
|
||||
@@ -321,7 +321,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c03db390",
|
||||
"id": "2412e2ee",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\""
|
||||
},
|
||||
@@ -360,7 +360,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c58b717a",
|
||||
"id": "fefea74f",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\"",
|
||||
"lines_to_next_cell": 1
|
||||
@@ -389,7 +389,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "74a96c73",
|
||||
"id": "d487e50a",
|
||||
"metadata": {
|
||||
"lines_to_next_cell": 1,
|
||||
"nbgrader": {
|
||||
@@ -444,7 +444,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8ddb8b58",
|
||||
"id": "930b25c3",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\"",
|
||||
"lines_to_next_cell": 1
|
||||
@@ -477,7 +477,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "167d60c6",
|
||||
"id": "c8e9489d",
|
||||
"metadata": {
|
||||
"lines_to_next_cell": 1,
|
||||
"nbgrader": {
|
||||
@@ -535,7 +535,141 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "90e9e19c",
|
||||
"id": "b39060be",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\"",
|
||||
"lines_to_next_cell": 1
|
||||
},
|
||||
"source": [
|
||||
"### SubBackward - Gradient Rules for Subtraction\n",
|
||||
"\n",
|
||||
"Subtraction is mathematically simple but important for operations like normalization.\n",
|
||||
"\n",
|
||||
"**Mathematical Principle:**\n",
|
||||
"```\n",
|
||||
"If z = a - b, then:\n",
|
||||
"∂z/∂a = 1\n",
|
||||
"∂z/∂b = -1\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"**Key Insight:** The upstream gradient passes through unchanged to the first operand, but is **negated** for the second.\n",
|
||||
"This is crucial for operations like `x - mean` in LayerNorm."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c2df1248",
|
||||
"metadata": {
|
||||
"lines_to_next_cell": 1,
|
||||
"nbgrader": {
|
||||
"grade": false,
|
||||
"grade_id": "sub-backward",
|
||||
"solution": true
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#| export\n",
|
||||
"class SubBackward(Function):\n",
|
||||
" \"\"\"\n",
|
||||
" Gradient computation for tensor subtraction.\n",
|
||||
" \n",
|
||||
" **Mathematical Rule:** If z = a - b, then ∂z/∂a = 1 and ∂z/∂b = -1\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" def apply(self, grad_output):\n",
|
||||
" \"\"\"\n",
|
||||
" Compute gradients for subtraction.\n",
|
||||
" \n",
|
||||
" Returns:\n",
|
||||
" Tuple of (grad_a, grad_b) where grad_b is negated\n",
|
||||
" \"\"\"\n",
|
||||
" a, b = self.saved_tensors\n",
|
||||
" grad_a = grad_b = None\n",
|
||||
"\n",
|
||||
" if isinstance(a, Tensor) and a.requires_grad:\n",
|
||||
" grad_a = grad_output # ∂(a-b)/∂a = 1\n",
|
||||
"\n",
|
||||
" if isinstance(b, Tensor) and b.requires_grad:\n",
|
||||
" grad_b = -grad_output # ∂(a-b)/∂b = -1 (note the negative!)\n",
|
||||
"\n",
|
||||
" return grad_a, grad_b"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "587d4d21",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\"",
|
||||
"lines_to_next_cell": 1
|
||||
},
|
||||
"source": [
|
||||
"### DivBackward - Gradient Rules for Division\n",
|
||||
"\n",
|
||||
"Division requires the quotient rule from calculus.\n",
|
||||
"\n",
|
||||
"**Mathematical Principle:**\n",
|
||||
"```\n",
|
||||
"If z = a / b, then:\n",
|
||||
"∂z/∂a = 1/b\n",
|
||||
"∂z/∂b = -a/b²\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"**Quotient Rule:** For z = f/g, dz = (g·df - f·dg)/g²"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fc0d09f0",
|
||||
"metadata": {
|
||||
"lines_to_next_cell": 1,
|
||||
"nbgrader": {
|
||||
"grade": false,
|
||||
"grade_id": "div-backward",
|
||||
"solution": true
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#| export\n",
|
||||
"class DivBackward(Function):\n",
|
||||
" \"\"\"\n",
|
||||
" Gradient computation for tensor division.\n",
|
||||
" \n",
|
||||
" **Mathematical Rule:** If z = a / b, then:\n",
|
||||
" - ∂z/∂a = 1/b\n",
|
||||
" - ∂z/∂b = -a/b²\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" def apply(self, grad_output):\n",
|
||||
" \"\"\"\n",
|
||||
" Compute gradients for division using quotient rule.\n",
|
||||
" \n",
|
||||
" Returns:\n",
|
||||
" Tuple of (grad_a, grad_b)\n",
|
||||
" \"\"\"\n",
|
||||
" a, b = self.saved_tensors\n",
|
||||
" grad_a = grad_b = None\n",
|
||||
"\n",
|
||||
" if isinstance(a, Tensor) and a.requires_grad:\n",
|
||||
" # ∂(a/b)/∂a = 1/b\n",
|
||||
" if isinstance(b, Tensor):\n",
|
||||
" grad_a = grad_output / b.data\n",
|
||||
" else:\n",
|
||||
" grad_a = grad_output / b\n",
|
||||
"\n",
|
||||
" if isinstance(b, Tensor) and b.requires_grad:\n",
|
||||
" # ∂(a/b)/∂b = -a/b²\n",
|
||||
" grad_b = -grad_output * a.data / (b.data ** 2)\n",
|
||||
"\n",
|
||||
" return grad_a, grad_b"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "444af002",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\"",
|
||||
"lines_to_next_cell": 1
|
||||
@@ -570,7 +704,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2c3ff8c4",
|
||||
"id": "34ade8a0",
|
||||
"metadata": {
|
||||
"lines_to_next_cell": 1,
|
||||
"nbgrader": {
|
||||
@@ -627,7 +761,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "53f8163c",
|
||||
"id": "abe5993d",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\"",
|
||||
"lines_to_next_cell": 1
|
||||
@@ -658,7 +792,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b6b4ae48",
|
||||
"id": "be0a41c6",
|
||||
"metadata": {
|
||||
"lines_to_next_cell": 1,
|
||||
"nbgrader": {
|
||||
@@ -706,7 +840,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7be03d75",
|
||||
"id": "e34ad923",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\"",
|
||||
"lines_to_next_cell": 1
|
||||
@@ -722,7 +856,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2da6c55b",
|
||||
"id": "18c10012",
|
||||
"metadata": {
|
||||
"nbgrader": {
|
||||
"grade": true,
|
||||
@@ -769,7 +903,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "503cbbfd",
|
||||
"id": "d653da60",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\""
|
||||
},
|
||||
@@ -804,7 +938,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "23ee7914",
|
||||
"id": "f58c9c2c",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\"",
|
||||
"lines_to_next_cell": 1
|
||||
@@ -830,7 +964,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6ebf8d15",
|
||||
"id": "258a54f9",
|
||||
"metadata": {
|
||||
"nbgrader": {
|
||||
"grade": false,
|
||||
@@ -867,7 +1001,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eb9b24ed",
|
||||
"id": "7706cf3f",
|
||||
"metadata": {
|
||||
"nbgrader": {
|
||||
"grade": false,
|
||||
@@ -911,7 +1045,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "34e47d63",
|
||||
"id": "199114cc",
|
||||
"metadata": {
|
||||
"nbgrader": {
|
||||
"grade": false,
|
||||
@@ -951,7 +1085,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d7d1bfe9",
|
||||
"id": "b42b622b",
|
||||
"metadata": {
|
||||
"nbgrader": {
|
||||
"grade": false,
|
||||
@@ -995,7 +1129,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "62bdddaa",
|
||||
"id": "5eba9d67",
|
||||
"metadata": {
|
||||
"nbgrader": {
|
||||
"grade": false,
|
||||
@@ -1054,7 +1188,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "56acda3f",
|
||||
"id": "7033ad72",
|
||||
"metadata": {
|
||||
"nbgrader": {
|
||||
"grade": false,
|
||||
@@ -1101,7 +1235,9 @@
|
||||
"\n",
|
||||
" # Store original operations\n",
|
||||
" _original_add = Tensor.__add__\n",
|
||||
" _original_sub = Tensor.__sub__\n",
|
||||
" _original_mul = Tensor.__mul__\n",
|
||||
" _original_div = Tensor.__truediv__\n",
|
||||
" _original_matmul = Tensor.matmul if hasattr(Tensor, 'matmul') else None\n",
|
||||
"\n",
|
||||
" # Enhanced operations that track gradients\n",
|
||||
@@ -1169,6 +1305,48 @@
|
||||
"\n",
|
||||
" return result\n",
|
||||
"\n",
|
||||
" def tracked_sub(self, other):\n",
|
||||
" \"\"\"\n",
|
||||
" Subtraction with gradient tracking.\n",
|
||||
" \n",
|
||||
" Enhances the original __sub__ method to build computation graphs\n",
|
||||
" when requires_grad=True for any input.\n",
|
||||
" \"\"\"\n",
|
||||
" # Convert scalar to Tensor if needed\n",
|
||||
" if not isinstance(other, Tensor):\n",
|
||||
" other = Tensor(other)\n",
|
||||
"\n",
|
||||
" # Call original operation\n",
|
||||
" result = _original_sub(self, other)\n",
|
||||
"\n",
|
||||
" # Track gradient if needed\n",
|
||||
" if self.requires_grad or other.requires_grad:\n",
|
||||
" result.requires_grad = True\n",
|
||||
" result._grad_fn = SubBackward(self, other)\n",
|
||||
"\n",
|
||||
" return result\n",
|
||||
"\n",
|
||||
" def tracked_div(self, other):\n",
|
||||
" \"\"\"\n",
|
||||
" Division with gradient tracking.\n",
|
||||
" \n",
|
||||
" Enhances the original __truediv__ method to build computation graphs\n",
|
||||
" when requires_grad=True for any input.\n",
|
||||
" \"\"\"\n",
|
||||
" # Convert scalar to Tensor if needed\n",
|
||||
" if not isinstance(other, Tensor):\n",
|
||||
" other = Tensor(other)\n",
|
||||
"\n",
|
||||
" # Call original operation\n",
|
||||
" result = _original_div(self, other)\n",
|
||||
"\n",
|
||||
" # Track gradient if needed\n",
|
||||
" if self.requires_grad or other.requires_grad:\n",
|
||||
" result.requires_grad = True\n",
|
||||
" result._grad_fn = DivBackward(self, other)\n",
|
||||
"\n",
|
||||
" return result\n",
|
||||
"\n",
|
||||
" def sum_op(self, axis=None, keepdims=False):\n",
|
||||
" \"\"\"\n",
|
||||
" Sum operation with gradient tracking.\n",
|
||||
@@ -1257,7 +1435,9 @@
|
||||
"\n",
|
||||
" # Install enhanced operations\n",
|
||||
" Tensor.__add__ = tracked_add\n",
|
||||
" Tensor.__sub__ = tracked_sub\n",
|
||||
" Tensor.__mul__ = tracked_mul\n",
|
||||
" Tensor.__truediv__ = tracked_div\n",
|
||||
" Tensor.matmul = tracked_matmul\n",
|
||||
" Tensor.sum = sum_op\n",
|
||||
" Tensor.backward = backward\n",
|
||||
@@ -1378,7 +1558,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a9ff4aea",
|
||||
"id": "b0768be1",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\"",
|
||||
"lines_to_next_cell": 1
|
||||
@@ -1394,7 +1574,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b4222797",
|
||||
"id": "3a1b7ef0",
|
||||
"metadata": {
|
||||
"nbgrader": {
|
||||
"grade": true,
|
||||
@@ -1442,7 +1622,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "96acf9fa",
|
||||
"id": "7c127e55",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\"",
|
||||
"lines_to_next_cell": 1
|
||||
@@ -1456,7 +1636,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ec61fc12",
|
||||
"id": "7d01be90",
|
||||
"metadata": {
|
||||
"lines_to_next_cell": 1,
|
||||
"nbgrader": {
|
||||
@@ -1569,7 +1749,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8aff36fd",
|
||||
"id": "55176118",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -1580,7 +1760,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c5db854b",
|
||||
"id": "22ed96d4",
|
||||
"metadata": {
|
||||
"cell_marker": "\"\"\""
|
||||
},
|
||||
|
||||
@@ -447,6 +447,100 @@ class MulBackward(Function):
|
||||
|
||||
return grad_a, grad_b
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
### SubBackward - Gradient Rules for Subtraction
|
||||
|
||||
Subtraction is mathematically simple but important for operations like normalization.
|
||||
|
||||
**Mathematical Principle:**
|
||||
```
|
||||
If z = a - b, then:
|
||||
∂z/∂a = 1
|
||||
∂z/∂b = -1
|
||||
```
|
||||
|
||||
**Key Insight:** The upstream gradient passes through unchanged to the first operand, but is **negated** for the second.
|
||||
This is crucial for operations like `x - mean` in LayerNorm.
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": false, "grade_id": "sub-backward", "solution": true}
|
||||
#| export
|
||||
class SubBackward(Function):
    """
    Backward rule for tensor subtraction, z = a - b.

    **Mathematical Rule:** ∂z/∂a = 1 and ∂z/∂b = -1
    """

    def apply(self, grad_output):
        """
        Route the upstream gradient to both operands of a subtraction.

        Args:
            grad_output: Upstream gradient with respect to z = a - b.

        Returns:
            Tuple of (grad_a, grad_b). grad_a is grad_output unchanged and
            grad_b is its negation; an entry is None when the matching
            operand is not a Tensor or does not require gradients.
        """
        minuend, subtrahend = self.saved_tensors

        def wants_grad(operand):
            # Only Tensors that opted into gradient tracking receive a gradient.
            return isinstance(operand, Tensor) and operand.requires_grad

        # ∂(a-b)/∂a = 1, so the gradient passes through untouched.
        grad_a = grad_output if wants_grad(minuend) else None
        # ∂(a-b)/∂b = -1, so the gradient is negated.
        grad_b = -grad_output if wants_grad(subtrahend) else None

        # NOTE(review): no reduction over broadcast axes happens here; if an
        # operand was broadcast in the forward pass (e.g. `x - mean`), confirm
        # the caller sums the gradient back to that operand's shape.
        return grad_a, grad_b
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
### DivBackward - Gradient Rules for Division
|
||||
|
||||
Division requires the quotient rule from calculus.
|
||||
|
||||
**Mathematical Principle:**
|
||||
```
|
||||
If z = a / b, then:
|
||||
∂z/∂a = 1/b
|
||||
∂z/∂b = -a/b²
|
||||
```
|
||||
|
||||
**Quotient Rule:** For z = f/g, dz = (g·df - f·dg)/g²
|
||||
"""
|
||||
|
||||
# %% nbgrader={"grade": false, "grade_id": "div-backward", "solution": true}
|
||||
#| export
|
||||
class DivBackward(Function):
    """
    Gradient computation for tensor division.

    **Mathematical Rule:** If z = a / b, then:
    - ∂z/∂a = 1/b
    - ∂z/∂b = -a/b²
    """

    def apply(self, grad_output):
        """
        Compute gradients for division using the quotient rule.

        Args:
            grad_output: Upstream gradient with respect to z = a / b. It is
                combined with the operands' raw values via ordinary
                arithmetic operators.

        Returns:
            Tuple of (grad_a, grad_b). An entry is None when the matching
            operand is not a Tensor or does not require gradients.
        """
        a, b = self.saved_tensors

        # Unwrap both operands the same way so neither gradient branch
        # assumes the *other* operand is a Tensor (a raw value is used as-is).
        a_data = a.data if isinstance(a, Tensor) else a
        b_data = b.data if isinstance(b, Tensor) else b

        grad_a = grad_b = None

        if isinstance(a, Tensor) and a.requires_grad:
            # ∂(a/b)/∂a = 1/b
            grad_a = grad_output / b_data

        if isinstance(b, Tensor) and b.requires_grad:
            # ∂(a/b)/∂b = -a/b²
            grad_b = -grad_output * a_data / (b_data ** 2)

        # NOTE(review): no reduction over broadcast axes happens here; if an
        # operand was broadcast in the forward pass (e.g. `normalized / std`),
        # confirm the caller sums the gradient back to that operand's shape.
        return grad_a, grad_b
|
||||
|
||||
# %% [markdown]
|
||||
"""
|
||||
### MatmulBackward - Gradient Rules for Matrix Multiplication
|
||||
@@ -872,7 +966,9 @@ def enable_autograd():
|
||||
|
||||
# Store original operations
|
||||
_original_add = Tensor.__add__
|
||||
_original_sub = Tensor.__sub__
|
||||
_original_mul = Tensor.__mul__
|
||||
_original_div = Tensor.__truediv__
|
||||
_original_matmul = Tensor.matmul if hasattr(Tensor, 'matmul') else None
|
||||
|
||||
# Enhanced operations that track gradients
|
||||
@@ -940,6 +1036,48 @@ def enable_autograd():
|
||||
|
||||
return result
|
||||
|
||||
def tracked_sub(self, other):
    """
    Subtraction with gradient tracking.

    Delegates the arithmetic to the saved original __sub__ and, when either
    operand requires gradients, marks the result as requiring gradients and
    attaches a SubBackward node to it.
    """
    # Wrap non-Tensor operands so both sides expose `requires_grad`.
    rhs = other if isinstance(other, Tensor) else Tensor(other)

    # Forward computation is done by the untracked implementation.
    out = _original_sub(self, rhs)

    # Record the backward function only when a gradient is actually needed.
    if self.requires_grad or rhs.requires_grad:
        out.requires_grad = True
        out._grad_fn = SubBackward(self, rhs)

    return out
|
||||
|
||||
def tracked_div(self, other):
    """
    Division with gradient tracking.

    Delegates the arithmetic to the saved original __truediv__ and, when
    either operand requires gradients, marks the result as requiring
    gradients and attaches a DivBackward node to it.
    """
    # Wrap non-Tensor operands so both sides expose `requires_grad`.
    divisor = other if isinstance(other, Tensor) else Tensor(other)

    # Forward computation is done by the untracked implementation.
    out = _original_div(self, divisor)

    # Record the backward function only when a gradient is actually needed.
    if self.requires_grad or divisor.requires_grad:
        out.requires_grad = True
        out._grad_fn = DivBackward(self, divisor)

    return out
|
||||
|
||||
def sum_op(self, axis=None, keepdims=False):
|
||||
"""
|
||||
Sum operation with gradient tracking.
|
||||
@@ -1028,7 +1166,9 @@ def enable_autograd():
|
||||
|
||||
# Install enhanced operations
|
||||
Tensor.__add__ = tracked_add
|
||||
Tensor.__sub__ = tracked_sub
|
||||
Tensor.__mul__ = tracked_mul
|
||||
Tensor.__truediv__ = tracked_div
|
||||
Tensor.matmul = tracked_matmul
|
||||
Tensor.sum = sum_op
|
||||
Tensor.backward = backward
|
||||
|
||||
123
tinytorch/core/autograd.py
generated
123
tinytorch/core/autograd.py
generated
@@ -15,8 +15,8 @@
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# %% auto 0
|
||||
__all__ = ['Function', 'AddBackward', 'MulBackward', 'MatmulBackward', 'SumBackward', 'ReLUBackward', 'SigmoidBackward',
|
||||
'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
|
||||
__all__ = ['Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward', 'SumBackward',
|
||||
'ReLUBackward', 'SigmoidBackward', 'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 1
|
||||
import numpy as np
|
||||
@@ -164,6 +164,65 @@ class MulBackward(Function):
|
||||
return grad_a, grad_b
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 13
|
||||
class SubBackward(Function):
    """
    Backward rule for tensor subtraction, z = a - b.

    **Mathematical Rule:** ∂z/∂a = 1 and ∂z/∂b = -1
    """

    def apply(self, grad_output):
        """
        Route the upstream gradient to both operands of a subtraction.

        Args:
            grad_output: Upstream gradient with respect to z = a - b.

        Returns:
            Tuple of (grad_a, grad_b). grad_a is grad_output unchanged and
            grad_b is its negation; an entry is None when the matching
            operand is not a Tensor or does not require gradients.
        """
        minuend, subtrahend = self.saved_tensors

        def wants_grad(operand):
            # Only Tensors that opted into gradient tracking receive a gradient.
            return isinstance(operand, Tensor) and operand.requires_grad

        # ∂(a-b)/∂a = 1, so the gradient passes through untouched.
        grad_a = grad_output if wants_grad(minuend) else None
        # ∂(a-b)/∂b = -1, so the gradient is negated.
        grad_b = -grad_output if wants_grad(subtrahend) else None

        # NOTE(review): no reduction over broadcast axes happens here; if an
        # operand was broadcast in the forward pass (e.g. `x - mean`), confirm
        # the caller sums the gradient back to that operand's shape.
        return grad_a, grad_b
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 15
|
||||
class DivBackward(Function):
    """
    Gradient computation for tensor division.

    **Mathematical Rule:** If z = a / b, then:
    - ∂z/∂a = 1/b
    - ∂z/∂b = -a/b²
    """

    def apply(self, grad_output):
        """
        Compute gradients for division using the quotient rule.

        Args:
            grad_output: Upstream gradient with respect to z = a / b. It is
                combined with the operands' raw values via ordinary
                arithmetic operators.

        Returns:
            Tuple of (grad_a, grad_b). An entry is None when the matching
            operand is not a Tensor or does not require gradients.
        """
        a, b = self.saved_tensors

        # Unwrap both operands the same way so neither gradient branch
        # assumes the *other* operand is a Tensor (a raw value is used as-is).
        a_data = a.data if isinstance(a, Tensor) else a
        b_data = b.data if isinstance(b, Tensor) else b

        grad_a = grad_b = None

        if isinstance(a, Tensor) and a.requires_grad:
            # ∂(a/b)/∂a = 1/b
            grad_a = grad_output / b_data

        if isinstance(b, Tensor) and b.requires_grad:
            # ∂(a/b)/∂b = -a/b²
            grad_b = -grad_output * a_data / (b_data ** 2)

        # NOTE(review): no reduction over broadcast axes happens here; if an
        # operand was broadcast in the forward pass (e.g. `normalized / std`),
        # confirm the caller sums the gradient back to that operand's shape.
        return grad_a, grad_b
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 17
|
||||
class MatmulBackward(Function):
|
||||
"""
|
||||
Gradient computation for matrix multiplication.
|
||||
@@ -206,7 +265,7 @@ class MatmulBackward(Function):
|
||||
|
||||
return grad_a, grad_b
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 15
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 19
|
||||
class SumBackward(Function):
|
||||
"""
|
||||
Gradient computation for tensor sum.
|
||||
@@ -240,7 +299,7 @@ class SumBackward(Function):
|
||||
return np.ones_like(tensor.data) * grad_output,
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 20
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 24
|
||||
class ReLUBackward(Function):
|
||||
"""
|
||||
Gradient computation for ReLU activation.
|
||||
@@ -263,7 +322,7 @@ class ReLUBackward(Function):
|
||||
return grad_output * relu_grad,
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 21
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 25
|
||||
class SigmoidBackward(Function):
|
||||
"""
|
||||
Gradient computation for sigmoid activation.
|
||||
@@ -293,7 +352,7 @@ class SigmoidBackward(Function):
|
||||
return grad_output * sigmoid_grad,
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 22
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 26
|
||||
class MSEBackward(Function):
|
||||
"""
|
||||
Gradient computation for Mean Squared Error Loss.
|
||||
@@ -319,7 +378,7 @@ class MSEBackward(Function):
|
||||
return grad * grad_output,
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 27
|
||||
class BCEBackward(Function):
|
||||
"""
|
||||
Gradient computation for Binary Cross-Entropy Loss.
|
||||
@@ -349,7 +408,7 @@ class BCEBackward(Function):
|
||||
return grad * grad_output,
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 24
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28
|
||||
class CrossEntropyBackward(Function):
|
||||
"""
|
||||
Gradient computation for Cross-Entropy Loss.
|
||||
@@ -394,7 +453,7 @@ class CrossEntropyBackward(Function):
|
||||
return grad * grad_output,
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 25
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29
|
||||
def enable_autograd():
|
||||
"""
|
||||
Enable gradient tracking for all Tensor operations.
|
||||
@@ -431,7 +490,9 @@ def enable_autograd():
|
||||
|
||||
# Store original operations
|
||||
_original_add = Tensor.__add__
|
||||
_original_sub = Tensor.__sub__
|
||||
_original_mul = Tensor.__mul__
|
||||
_original_div = Tensor.__truediv__
|
||||
_original_matmul = Tensor.matmul if hasattr(Tensor, 'matmul') else None
|
||||
|
||||
# Enhanced operations that track gradients
|
||||
@@ -499,6 +560,48 @@ def enable_autograd():
|
||||
|
||||
return result
|
||||
|
||||
def tracked_sub(self, other):
    """
    Subtraction with gradient tracking.

    Delegates the arithmetic to the saved original __sub__ and, when either
    operand requires gradients, marks the result as requiring gradients and
    attaches a SubBackward node to it.
    """
    # Wrap non-Tensor operands so both sides expose `requires_grad`.
    rhs = other if isinstance(other, Tensor) else Tensor(other)

    # Forward computation is done by the untracked implementation.
    out = _original_sub(self, rhs)

    # Record the backward function only when a gradient is actually needed.
    if self.requires_grad or rhs.requires_grad:
        out.requires_grad = True
        out._grad_fn = SubBackward(self, rhs)

    return out
|
||||
|
||||
def tracked_div(self, other):
    """
    Division with gradient tracking.

    Delegates the arithmetic to the saved original __truediv__ and, when
    either operand requires gradients, marks the result as requiring
    gradients and attaches a DivBackward node to it.
    """
    # Wrap non-Tensor operands so both sides expose `requires_grad`.
    divisor = other if isinstance(other, Tensor) else Tensor(other)

    # Forward computation is done by the untracked implementation.
    out = _original_div(self, divisor)

    # Record the backward function only when a gradient is actually needed.
    if self.requires_grad or divisor.requires_grad:
        out.requires_grad = True
        out._grad_fn = DivBackward(self, divisor)

    return out
|
||||
|
||||
def sum_op(self, axis=None, keepdims=False):
|
||||
"""
|
||||
Sum operation with gradient tracking.
|
||||
@@ -587,7 +690,9 @@ def enable_autograd():
|
||||
|
||||
# Install enhanced operations
|
||||
Tensor.__add__ = tracked_add
|
||||
Tensor.__sub__ = tracked_sub
|
||||
Tensor.__mul__ = tracked_mul
|
||||
Tensor.__truediv__ = tracked_div
|
||||
Tensor.matmul = tracked_matmul
|
||||
Tensor.sum = sum_op
|
||||
Tensor.backward = backward
|
||||
|
||||
Reference in New Issue
Block a user