mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-05-23 01:19:29 -05:00
Fixes after merge conflicts:
- Fix tensor reshape error message format
- Fix __init__.py imports (remove BatchNorm2d, fix enable_autograd call)
- Fix attention mask broadcasting for multi-head attention
- Fix memoization module to use matmul instead of @ operator
- Fix capstone module count_parameters and CosineSchedule usage
- Add missing imports to benchmark.py (dataclass, Profiler, platform, os)
- Simplify capstone pipeline test to avoid data shape mismatch
All 20 modules now pass `tito test --all`.
1298 lines
45 KiB
Python
Generated
1298 lines
45 KiB
Python
Generated
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||
# ║ ║
|
||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||
# ║ ║
|
||
# ║ ✅ TO EDIT: src/05_autograd/05_autograd.py ║
|
||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||
# ║ ║
|
||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||
# ║ Editing it directly may break module functionality and training. ║
|
||
# ║ ║
|
||
# ║ 🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners) ║
|
||
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
|
||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||
# %% auto 0
|
||
# Public API of this module: the shared EPSILON constant, the Function base
# class, one backward rule per supported operation, and enable_autograd().
__all__ = ['EPSILON', 'Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward',
           'TransposeBackward', 'PermuteBackward', 'EmbeddingBackward', 'SliceBackward', 'ReshapeBackward',
           'SumBackward', 'ReLUBackward', 'SigmoidBackward', 'SoftmaxBackward', 'GELUBackward', 'MSEBackward',
           'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 1
|
||
import numpy as np
|
||
from typing import Optional, List, Tuple
|
||
import sys
|
||
import os
|
||
|
||
from .tensor import Tensor
|
||
|
||
# Constants for numerical differentiation
# NOTE: within this module EPSILON is also used by BCEBackward as the margin
# that keeps clipped predictions strictly inside (0, 1) before dividing.
EPSILON = 1e-7 # Small perturbation for numerical gradient computation
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 6
|
||
class Function:
    """
    Base class for differentiable operations.

    Each operation that takes part in gradient computation (add, multiply,
    matmul, ...) gets a subclass that overrides :meth:`apply`.

    **Attributes:**
    - **saved_tensors**: Tuple of the constructor arguments, kept so the
      backward pass can read the forward inputs.
    - **next_functions**: The ``_grad_fn`` of every saved input that is a
      Tensor, requires a gradient, and was itself produced by a tracked
      operation. These are the edges of the computation graph.

    **Example Usage:**
    ```python
    class AddBackward(Function):
        def apply(self, grad_output):
            # Addition passes the gradient through to both inputs
            return grad_output, grad_output
    ```
    """

    def __init__(self, *tensors):
        """
        Record the inputs and link to the functions that produced them.

        Args:
            *tensors: Inputs of the forward operation, saved for backward.
        """
        self.saved_tensors = tensors

        # An input contributes a graph edge only when it is a Tensor that
        # requires a gradient AND carries a non-None _grad_fn. Leaf tensors
        # (no _grad_fn attribute, or _grad_fn is None) contribute nothing.
        self.next_functions = [
            candidate._grad_fn
            for candidate in tensors
            if isinstance(candidate, Tensor)
            and candidate.requires_grad
            and getattr(candidate, '_grad_fn', None) is not None
        ]

    def apply(self, grad_output):
        """
        Compute the gradients with respect to the saved inputs.

        Args:
            grad_output: Gradient flowing backward from the output.

        Returns:
            Tuple with one gradient per input tensor (in subclasses).

        Raises:
            NotImplementedError: Always, in this base class; subclasses
                must override this method.
        """
        raise NotImplementedError("Each Function must implement apply() method")
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 9
|
||
class AddBackward(Function):
    """
    Gradient computation for tensor addition.

    **Mathematical Rule:** If z = a + b, then ∂z/∂a = 1 and ∂z/∂b = 1

    **Key Insight:** The incoming gradient is handed to each input that
    requires one, unchanged.

    **Broadcasting:** This method returns ``grad_output`` as-is; it does not
    itself reduce the gradient to an input's shape when the forward addition
    broadcast. NOTE(review): presumably that reduction happens where the
    gradient is accumulated - confirm against the backward() driver.
    """

    def apply(self, grad_output):
        """
        Compute gradients for addition.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple ``(grad_a, grad_b)``. Each entry is ``grad_output`` itself
            when the matching input is a Tensor with ``requires_grad`` set,
            otherwise ``None``.

        **Mathematical Foundation:**
        - ∂(a+b)/∂a = 1 → grad_a = grad_output
        - ∂(a+b)/∂b = 1 → grad_b = grad_output
        """
        lhs, rhs = self.saved_tensors

        def _gradient_for(operand):
            # Only Tensors that asked for gradients receive one.
            if isinstance(operand, Tensor) and operand.requires_grad:
                return grad_output
            return None

        return _gradient_for(lhs), _gradient_for(rhs)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 11
|
||
class MulBackward(Function):
    """
    Gradient computation for element-wise tensor multiplication.

    **Mathematical Rule:** If z = a * b, then ∂z/∂a = b and ∂z/∂b = a

    **Key Insight:** Each input's gradient equals the incoming gradient
    multiplied by the OTHER input's value (product rule).

    **Applications:** Used in weight scaling, attention mechanisms,
    and anywhere element-wise multiplication occurs.
    """

    def apply(self, grad_output):
        """
        Compute gradients for multiplication.

        Either operand may be a Tensor (its ``.data`` is used) or a plain
        value such as a Python scalar (used directly). Both operands are
        treated the same way, so a non-Tensor first operand no longer
        triggers an attribute error while computing ``grad_b``.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple ``(grad_a, grad_b)``; an entry is ``None`` when the
            matching input is not a Tensor or does not require a gradient.

        **Mathematical Foundation:**
        - ∂(a*b)/∂a = b → grad_a = grad_output * b
        - ∂(a*b)/∂b = a → grad_b = grad_output * a
        """
        a, b = self.saved_tensors

        # Unwrap raw values once; non-Tensor operands are used as they are.
        a_value = a.data if isinstance(a, Tensor) else a
        b_value = b.data if isinstance(b, Tensor) else b

        grad_a = grad_b = None

        # Gradient for first input: grad_output * b
        if isinstance(a, Tensor) and a.requires_grad:
            grad_a = grad_output * b_value

        # Gradient for second input: grad_output * a
        if isinstance(b, Tensor) and b.requires_grad:
            grad_b = grad_output * a_value

        return grad_a, grad_b
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 13
|
||
class SubBackward(Function):
    """
    Gradient computation for tensor subtraction.

    **Mathematical Rule:** If z = a - b, then ∂z/∂a = 1 and ∂z/∂b = -1
    """

    def apply(self, grad_output):
        """
        Compute gradients for subtraction.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple ``(grad_a, grad_b)``: ``grad_a`` is ``grad_output`` and
            ``grad_b`` is its negation. An entry is ``None`` when the
            matching input is not a Tensor or does not require a gradient.
        """
        minuend, subtrahend = self.saved_tensors

        wants_a = isinstance(minuend, Tensor) and minuend.requires_grad
        wants_b = isinstance(subtrahend, Tensor) and subtrahend.requires_grad

        # ∂(a-b)/∂a = 1, so the gradient passes through unchanged.
        grad_minuend = grad_output if wants_a else None
        # ∂(a-b)/∂b = -1, so the gradient flips sign.
        grad_subtrahend = -grad_output if wants_b else None

        return grad_minuend, grad_subtrahend
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 15
|
||
class DivBackward(Function):
    """
    Gradient computation for element-wise tensor division.

    **Mathematical Rule:** If z = a / b, then:
    - ∂z/∂a = 1/b
    - ∂z/∂b = -a/b²
    """

    def apply(self, grad_output):
        """
        Compute gradients for division using the quotient rule.

        Either operand may be a Tensor (its ``.data`` is used) or a plain
        value such as a Python scalar (used directly). A non-Tensor
        numerator therefore no longer triggers an attribute error while
        computing ``grad_b``.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple ``(grad_a, grad_b)``; an entry is ``None`` when the
            matching input is not a Tensor or does not require a gradient.
        """
        a, b = self.saved_tensors

        # Unwrap raw values once; non-Tensor operands are used as they are.
        numerator = a.data if isinstance(a, Tensor) else a
        denominator = b.data if isinstance(b, Tensor) else b

        grad_a = grad_b = None

        if isinstance(a, Tensor) and a.requires_grad:
            # ∂(a/b)/∂a = 1/b
            grad_a = grad_output / denominator

        if isinstance(b, Tensor) and b.requires_grad:
            # ∂(a/b)/∂b = -a/b²
            grad_b = -grad_output * numerator / (denominator ** 2)

        return grad_a, grad_b
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 17
|
||
class MatmulBackward(Function):
    """
    Gradient computation for matrix multiplication.

    **Mathematical Rule:** If Z = A @ B, then:
    - ∂Z/∂A = grad_Z @ B.T
    - ∂Z/∂B = A.T @ grad_Z

    **Key Insight:** Matrix multiplication gradients involve transposing
    one input and multiplying with the gradient output.

    **Applications:** Core operation in neural networks for weight updates
    in linear layers, attention mechanisms, and transformers.
    """

    @staticmethod
    def _swap_last_two(array):
        """Transpose the last two axes; arrays below rank 2 fall back to ``.T``."""
        if array.ndim >= 2:
            return np.swapaxes(array, -2, -1)
        return array.T

    def apply(self, grad_output):
        """
        Compute gradients for matrix multiplication.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple of (grad_a, grad_b) for the two inputs; an entry is
            ``None`` when the matching input is not a Tensor or does not
            require a gradient.

        **Mathematical Foundation:**
        - ∂(A@B)/∂A = grad_output @ B.T
        - ∂(A@B)/∂B = A.T @ grad_output

        **Batched Operation:** For 3D+ tensors, only the last two
        dimensions are transposed (np.swapaxes), preserving batch dimensions.

        **Vector Operands:** np.matmul treats a 1-D first operand as a row
        vector and a 1-D second operand as a column vector, then drops the
        added axis from the result. The same promotion is applied here to
        the operands and to ``grad_output``, and the added axis is removed
        from the resulting gradient, so vector inputs receive a gradient
        shaped like the vector. Inputs of rank 2 or more take the same
        computation as before.
        """
        a, b = self.saved_tensors
        a_needs_grad = isinstance(a, Tensor) and a.requires_grad
        b_needs_grad = isinstance(b, Tensor) and b.requires_grad
        if not (a_needs_grad or b_needs_grad):
            return None, None

        a_data, b_data = a.data, b.data
        a_is_vector = a_data.ndim == 1
        b_is_vector = b_data.ndim == 1

        # Promote vectors to matrices the way np.matmul does in the forward
        # pass: (k,) -> (1, k) on the left, (k,) -> (k, 1) on the right.
        a_matrix = a_data[np.newaxis, :] if a_is_vector else a_data
        b_matrix = b_data[:, np.newaxis] if b_is_vector else b_data

        # Re-insert the axes the forward pass dropped from the output.
        # The column axis goes first so that a 0-d gradient (vector @ vector)
        # ends up as (1, 1).
        grad = grad_output
        if b_is_vector:
            grad = np.expand_dims(grad, -1)
        if a_is_vector:
            grad = np.expand_dims(grad, -2)

        grad_a = grad_b = None

        # Gradient for first input: grad_output @ b.T
        if a_needs_grad:
            grad_a = np.matmul(grad, self._swap_last_two(b_matrix))
            if a_is_vector:
                grad_a = np.squeeze(grad_a, axis=-2)

        # Gradient for second input: a.T @ grad_output
        if b_needs_grad:
            grad_b = np.matmul(self._swap_last_two(a_matrix), grad)
            if b_is_vector:
                grad_b = np.squeeze(grad_b, axis=-1)

        return grad_a, grad_b
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 18
|
||
class TransposeBackward(Function):
    """
    Gradient computation for transpose operation.

    **Mathematical Rule:** If Y = X.T, then:
    - ∂Y/∂X = grad_Y.T

    **Key Insight:** Transposing only rearranges elements, so the gradient
    is routed back by applying the same axis swap to it.

    **Applications:** Used in attention (K.T for scores), weight gradients (W.T),
    and any operation that needs to swap matrix dimensions.
    """

    def __init__(self, tensor, dim0, dim1):
        """
        Args:
            tensor: Input tensor
            dim0: First dimension to swap (None for default)
            dim1: Second dimension to swap (None for default)
        """
        super().__init__(tensor)
        self.dim0, self.dim1 = dim0, dim1

    def apply(self, grad_output):
        """
        Compute gradient for transpose.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple holding the gradient for the input tensor, or
            ``(None,)`` when the input does not require a gradient.

        **Behavior:**
        - Both dims ``None``: the last two axes are swapped; a gradient with
          fewer than two dimensions is returned as a copy.
        - Otherwise: axes ``dim0`` and ``dim1`` are swapped (a swap is its
          own inverse).
        """
        (source,) = self.saved_tensors
        if not (isinstance(source, Tensor) and source.requires_grad):
            return (None,)

        if self.dim0 is None and self.dim1 is None:
            # Default transpose acts on the trailing two axes; with fewer
            # than two axes there is nothing to swap.
            if grad_output.ndim < 2:
                return (grad_output.copy(),)
            first, second = -2, -1
        else:
            first, second = self.dim0, self.dim1

        # Build the identity permutation and exchange the two chosen axes.
        order = list(range(grad_output.ndim))
        order[first], order[second] = order[second], order[first]
        return (np.transpose(grad_output, order),)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 19
|
||
class PermuteBackward(Function):
    """
    Gradient computation for arbitrary axis permutation (general transpose).

    **Mathematical Rule:** If Y = X.permute(axes), then:
    - ∂Y/∂X = grad_Y.permute(inverse_axes)

    **Example:** If axes = (0, 2, 1, 3), the inverse is (0, 2, 1, 3) (self-inverse).
    More generally, if axes = (2, 0, 1), the inverse is (1, 2, 0).

    **Key Insight:** To reverse a permutation we need, for every original
    axis, the position it ended up in: if axes[i] = j then inverse_axes[j] = i.

    **Applications:** Multi-head attention uses (0, 2, 1, 3) to rearrange heads.
    """

    def __init__(self, tensor, axes):
        """
        Args:
            tensor: Input tensor
            axes: Sequence of axis indices defining the permutation.
                Negative indices are accepted and counted from the end.
        """
        super().__init__(tensor)
        self.axes = axes

        # argsort only yields the inverse permutation when every axis is
        # given as a non-negative index; e.g. (0, -1, 1) must be read as
        # (0, 2, 1) first, otherwise the -1 sorts to the front and the
        # computed inverse is wrong.
        rank = len(axes)
        non_negative_axes = [axis + rank if axis < 0 else axis for axis in axes]

        # Compute inverse permutation: if axes[i] = j, then inverse_axes[j] = i
        self.inverse_axes = tuple(np.argsort(non_negative_axes))

    def apply(self, grad_output):
        """
        Compute gradient for permutation.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with ``grad_output`` transposed by
            ``inverse_axes``, or ``(None,)`` when the input does not
            require a gradient.

        **Mathematical Foundation:**
        - ∂(X.permute(axes))/∂X = grad_output.permute(inverse_axes)
        """
        x, = self.saved_tensors
        grad_x = None

        if isinstance(x, Tensor) and x.requires_grad:
            # Permute gradient back to original axis order
            grad_x = np.transpose(grad_output, self.inverse_axes)

        return (grad_x,)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 20
|
||
class EmbeddingBackward(Function):
    """
    Gradient computation for embedding lookup operation.

    **Mathematical Rule:** If Y = Embedding[indices], then:
    - ∂Loss/∂Embedding[i] = sum of all gradients where index==i

    **Key Insight:** The lookup is a gather; its backward is a scatter-add
    that accumulates output gradients into the rows that were read.

    **Applications:** Word embeddings, positional embeddings, token embeddings
    in transformers.
    """

    def __init__(self, weight, indices):
        """
        Args:
            weight: Embedding weight matrix
            indices: Indices used for lookup (its ``.data`` is read in
                ``apply``)
        """
        super().__init__(weight)
        self.indices = indices

    def apply(self, grad_output):
        """
        Compute gradient for embedding lookup.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with an array shaped like ``weight.data``, or
            ``(None,)`` when the weight does not require a gradient.

        **Mathematical Foundation:**
        - Rows that were never looked up keep a zero gradient
        - Rows looked up several times receive the sum of their gradients
        """
        (table,) = self.saved_tensors
        if not (isinstance(table, Tensor) and table.requires_grad):
            return (None,)

        accumulated = np.zeros_like(table.data)

        # Flatten the lookup so index k lines up with gradient row k.
        row_ids = self.indices.data.astype(int).flatten()
        per_lookup_grads = grad_output.reshape(-1, grad_output.shape[-1])

        # np.add.at is unbuffered, so repeated row ids accumulate instead of
        # overwriting one another.
        np.add.at(accumulated, row_ids, per_lookup_grads)

        return (accumulated,)
|
||
|
||
|
||
class SliceBackward(Function):
    """
    Gradient computation for tensor slicing/indexing operations.

    **Mathematical Rule:** If Y = X[key], then:
    - ∂Loss/∂X[key] = grad_output
    - ∂Loss/∂X[other positions] = 0

    **Key Insight:** The backward places gradients back into the positions
    that were selected, with zeros everywhere else. A position selected more
    than once (possible with integer-array keys) receives the sum of its
    gradients.

    **Applications:** Positional encodings, sequence slicing, batch selection,
    attention masking in transformers.

    **Examples:**
    >>> x = Tensor([1, 2, 3, 4, 5], requires_grad=True)
    >>> y = x[:3]  # Slice first 3 elements
    >>> loss = y.sum()
    >>> loss.backward()
    >>> # x.grad = [1, 1, 1, 0, 0] - gradients only for sliced positions
    """

    def __init__(self, tensor, key):
        """
        Args:
            tensor: Original tensor being sliced
            key: Slicing key (index, slice, tuple of slices, etc.)
        """
        super().__init__(tensor)
        self.key = key
        self.original_shape = tensor.shape

    @staticmethod
    def _is_basic_index(key):
        """Return True when key holds only ints, slices, Ellipsis or None."""
        parts = key if isinstance(key, tuple) else (key,)
        return all(
            part is None
            or part is Ellipsis
            or isinstance(part, (int, np.integer, slice))
            for part in parts
        )

    def apply(self, grad_output):
        """
        Compute gradient for slicing operation.

        Args:
            grad_output: Gradient flowing backward from sliced output

        Returns:
            One-element tuple with a float32 array of the original shape,
            or ``(None,)`` when the input does not require a gradient.

        **Mathematical Foundation:**
        - Slicing extracts a subset of elements
        - Backward scatters gradients back to original positions
        - Unsliced positions receive zero gradient

        **Example:**
        If X = [a, b, c, d, e] and Y = X[1:4] = [b, c, d]
        Then dL/dX = [0, dL/db, dL/dc, dL/dd, 0]
        """
        tensor, = self.saved_tensors
        if not (isinstance(tensor, Tensor) and tensor.requires_grad):
            return (None,)

        # Create gradient array with same shape as original tensor
        grad_input = np.zeros(self.original_shape, dtype=np.float32)

        if self._is_basic_index(self.key):
            # Basic indexing never selects an element twice, so a plain
            # assignment is the exact inverse of the forward slice.
            grad_input[self.key] = grad_output
        else:
            # Advanced keys (e.g. x[[0, 0, 1]]) may repeat a position.
            # Assignment would keep only the last write; the unbuffered
            # np.add.at sums every contribution.
            np.add.at(grad_input, self.key, grad_output)

        return (grad_input,)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 21
|
||
class ReshapeBackward(Function):
    """
    Gradient computation for reshape operation.

    **Mathematical Rule:** If Y = X.reshape(new_shape), then:
    - ∂Y/∂X = grad_Y.reshape(X.shape)

    **Key Insight:** Reshape keeps the same elements in a new layout, so
    the gradient only needs to be reshaped back to the input's shape.

    **Applications:** Flattening tensors for linear layers, reshaping
    between convolutional and dense layers.
    """

    def __init__(self, tensor, original_shape):
        """
        Args:
            tensor: Input tensor
            original_shape: Shape before reshape
        """
        super().__init__(tensor)
        self.original_shape = original_shape

    def apply(self, grad_output):
        """
        Compute gradient for reshape.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with ``grad_output`` reshaped to
            ``original_shape``, or ``(None,)`` when the input does not
            require a gradient.
        """
        (source,) = self.saved_tensors
        needs_grad = isinstance(source, Tensor) and source.requires_grad
        if not needs_grad:
            return (None,)

        # Undo the forward reshape on the gradient.
        return (grad_output.reshape(self.original_shape),)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 23
|
||
class SumBackward(Function):
    """
    Gradient computation for tensor sum.

    **Mathematical Rule:** If z = sum(a), then ∂z/∂a[i] = 1 for all i

    **Key Insight:** Every input element contributes to the sum with weight
    one, so each receives the incoming gradient.

    **Applications:** Used in loss functions, mean operations, and
    anywhere tensor reduction occurs.
    """

    def apply(self, grad_output):
        """
        Compute gradients for sum operation.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with an array shaped like the input, or
            ``(None,)`` when the input does not require a gradient.

        **Mathematical Foundation:**
        - ∂sum(a)/∂a[i] = 1 → grad_a = ones_like(a) * grad_output

        NOTE(review): the expansion relies on NumPy broadcasting
        ``grad_output`` against the input shape; no reduction axis is
        recorded here, so confirm how axis-wise sums without kept
        dimensions are handled by the caller.
        """
        (summed_input,) = self.saved_tensors
        needs_grad = isinstance(summed_input, Tensor) and summed_input.requires_grad
        if not needs_grad:
            return (None,)

        # A field of ones carries the input's shape; multiplying spreads the
        # incoming gradient over every element.
        unit_weights = np.ones_like(summed_input.data)
        return (unit_weights * grad_output,)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 28
|
||
class ReLUBackward(Function):
    """
    Gradient computation for ReLU activation.

    ReLU: f(x) = max(0, x)
    Derivative: f'(x) = 1 if x > 0, else 0
    """

    def __init__(self, input_tensor):
        """
        Args:
            input_tensor: Input that was fed to ReLU in the forward pass
        """
        super().__init__(input_tensor)

    def apply(self, grad_output):
        """
        Compute gradient for ReLU.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with ``grad_output`` masked to the positions
            where the input was strictly positive, or ``(None,)`` when the
            input does not require a gradient.
        """
        (pre_activation,) = self.saved_tensors
        if not (isinstance(pre_activation, Tensor) and pre_activation.requires_grad):
            return (None,)

        # 1.0 where the unit was active (x > 0), 0.0 elsewhere (x == 0 included).
        active_mask = (pre_activation.data > 0).astype(np.float32)
        return (grad_output * active_mask,)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 29
|
||
class SigmoidBackward(Function):
    """
    Gradient computation for sigmoid activation.

    Sigmoid: σ(x) = 1/(1 + exp(-x))
    Derivative: σ'(x) = σ(x) * (1 - σ(x))
    """

    def __init__(self, input_tensor, output_tensor):
        """
        Keep the input for graph tracking and the output for the derivative.

        Args:
            input_tensor: Original input to sigmoid
            output_tensor: Output of sigmoid; its ``.data`` is stored so the
                backward pass does not have to recompute σ(x)
        """
        super().__init__(input_tensor)
        self.output_data = output_tensor.data

    def apply(self, grad_output):
        """
        Compute gradient for sigmoid.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with ``grad_output * σ(x) * (1 - σ(x))``, or
            ``(None,)`` when the input does not require a gradient.
        """
        (pre_activation,) = self.saved_tensors
        if not (isinstance(pre_activation, Tensor) and pre_activation.requires_grad):
            return (None,)

        # σ'(x) = σ(x) * (1 - σ(x)), evaluated from the cached forward output.
        activation = self.output_data
        local_slope = activation * (1 - activation)
        return (grad_output * local_slope,)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 30
|
||
class SoftmaxBackward(Function):
    """
    Gradient computation for softmax activation.

    Softmax: softmax(x)[i] = exp(x[i]) / sum(exp(x))
    Derivative: ∂softmax[i]/∂x[j] = softmax[i] * (δ[i,j] - softmax[j])

    For gradient computation:
        grad_x[i] = softmax[i] * (grad_y[i] - sum(grad_y * softmax))

    **Key Insight:** Because of the normalization, every input's gradient
    depends on all softmax outputs along the softmax dimension.
    """

    def __init__(self, input_tensor, output_tensor, dim=-1):
        """
        Args:
            input_tensor: Original input to softmax
            output_tensor: Output of softmax; its ``.data`` is stored for
                the backward pass
            dim: Dimension along which softmax was applied
        """
        super().__init__(input_tensor)
        self.output_data = output_tensor.data
        self.dim = dim

    def apply(self, grad_output):
        """
        Compute gradient for softmax.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with the input gradient, or ``(None,)`` when
            the input does not require a gradient.

        Mathematical formula:
            ∂L/∂x[i] = softmax[i] * (∂L/∂y[i] - sum_j(∂L/∂y[j] * softmax[j]))

        Vectorized:
            grad_x = softmax * (grad_y - sum(grad_y * softmax, keepdims=True))
        """
        (logits,) = self.saved_tensors
        if not (isinstance(logits, Tensor) and logits.requires_grad):
            return (None,)

        probabilities = self.output_data

        # Weighted sum of the incoming gradient along the softmax axis;
        # keepdims lets it broadcast back against grad_output.
        weighted_total = np.sum(
            grad_output * probabilities, axis=self.dim, keepdims=True
        )

        return (probabilities * (grad_output - weighted_total),)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 31
|
||
class GELUBackward(Function):
    """
    Gradient computation for GELU activation.

    GELU: f(x) = x * Φ(x) where Φ is the CDF of standard normal
    Approximation: gelu(x) ≈ 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))

    **Key Insight:** GELU is smoother than ReLU, providing non-zero gradients
    for negative values, which helps training deep networks.
    """

    def __init__(self, input_tensor):
        """
        Args:
            input_tensor: Input that was fed to GELU in the forward pass
        """
        super().__init__(input_tensor)

    def apply(self, grad_output):
        """
        Compute gradient for GELU.

        The derivative is evaluated analytically from the tanh
        approximation. With u = √(2/π) * (x + 0.044715 * x³):

            ∂gelu/∂x ≈ 0.5 * (1 + tanh(u)) + 0.5 * x * sech²(u) * du/dx
            du/dx    = √(2/π) * (1 + 0.134145 * x²)

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            One-element tuple with ``grad_output`` times the derivative, or
            ``(None,)`` when the input does not require a gradient.
        """
        (pre_activation,) = self.saved_tensors
        if not (isinstance(pre_activation, Tensor) and pre_activation.requires_grad):
            return (None,)

        x = pre_activation.data
        scale = np.sqrt(2.0 / np.pi)

        # Inner argument u of the tanh approximation and tanh(u).
        x_cubed = x ** 3
        inner = scale * (x + 0.044715 * x_cubed)
        tanh_inner = np.tanh(inner)

        # sech²(u) = 1 - tanh²(u)
        sech_sq = 1 - tanh_inner ** 2

        # du/dx; 0.134145 is 3 * 0.044715 from differentiating the cubic term.
        inner_slope = scale * (1 + 0.134145 * x ** 2)

        local_slope = 0.5 * (1 + tanh_inner) + 0.5 * x * sech_sq * inner_slope
        return (grad_output * local_slope,)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 32
|
||
class MSEBackward(Function):
    """
    Gradient computation for Mean Squared Error Loss.

    MSE: L = mean((predictions - targets)²)
    Derivative: ∂L/∂predictions = 2 * (predictions - targets) / N
    """

    def __init__(self, predictions, targets):
        """
        Args:
            predictions: Model outputs; the only input tracked for gradients
            targets: Reference values; ``.data`` is stored, and N is taken
                as its total element count
        """
        super().__init__(predictions)
        target_values = targets.data
        self.targets_data = target_values
        self.num_samples = np.size(target_values)

    def apply(self, grad_output):
        """
        Compute gradient for MSE loss.

        Args:
            grad_output: Gradient flowing backward from the loss

        Returns:
            One-element tuple with the gradient for ``predictions``, or
            ``(None,)`` when predictions do not require a gradient.
        """
        (predicted,) = self.saved_tensors
        if not (isinstance(predicted, Tensor) and predicted.requires_grad):
            return (None,)

        # 2 * (predictions - targets) / N
        residual = predicted.data - self.targets_data
        local_grad = 2.0 * residual / self.num_samples
        return (local_grad * grad_output,)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 33
|
||
class BCEBackward(Function):
    """
    Gradient computation for Binary Cross-Entropy Loss.

    BCE: L = -[y*log(p) + (1-y)*log(1-p)]
    Derivative: ∂L/∂p = (p - y) / (p*(1-p)*N)
    """

    def __init__(self, predictions, targets):
        """
        Args:
            predictions: Predicted probabilities; the only input tracked
                for gradients
            targets: Reference labels; ``.data`` is stored, and N is taken
                as its total element count
        """
        super().__init__(predictions)
        target_values = targets.data
        self.targets_data = target_values
        self.num_samples = np.size(target_values)

    def apply(self, grad_output):
        """
        Compute gradient for BCE loss.

        Args:
            grad_output: Gradient flowing backward from the loss

        Returns:
            One-element tuple with the gradient for ``predictions``, or
            ``(None,)`` when predictions do not require a gradient.
        """
        (predicted,) = self.saved_tensors
        if not (isinstance(predicted, Tensor) and predicted.requires_grad):
            return (None,)

        # Clip into [EPSILON, 1 - EPSILON] so p * (1 - p) cannot be zero.
        margin = EPSILON
        prob = np.clip(predicted.data, margin, 1 - margin)
        label = self.targets_data

        # (p - y) / (p * (1 - p) * N)
        local_grad = (prob - label) / (prob * (1 - prob) * self.num_samples)
        return (local_grad * grad_output,)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 34
|
||
class CrossEntropyBackward(Function):
    """
    Gradient computation for Cross-Entropy Loss.

    CrossEntropy: L = -mean(log_softmax(logits)[targets])

    The gradient with respect to logits is:
        ∂L/∂logits = (softmax(logits) - one_hot(targets)) / N

    - The gradient is the difference between predicted probabilities and
      the one-hot targets
    - It naturally scales with how wrong the prediction is
    - The softmax is computed with the row maximum subtracted for
      numerical stability
    """

    def __init__(self, logits, targets):
        """
        Args:
            logits: Scores whose ``.data`` has shape (batch, classes); the
                first two dimensions are read as batch size and class count
            targets: Target class indices; ``.data`` is cast to int
        """
        super().__init__(logits)
        self.targets_data = targets.data.astype(int)
        self.batch_size = logits.data.shape[0]
        self.num_classes = logits.data.shape[1]

    def apply(self, grad_output):
        """
        Compute gradient for cross-entropy loss.

        Args:
            grad_output: Gradient flowing backward from the loss

        Returns:
            One-element tuple with the gradient for ``logits``, or
            ``(None,)`` when logits do not require a gradient.
        """
        logits, = self.saved_tensors
        if not (isinstance(logits, Tensor) and logits.requires_grad):
            return (None,)

        # Stable softmax: subtracting the row maximum leaves the result
        # unchanged but keeps exp() from overflowing.
        logits_data = logits.data
        max_logits = np.max(logits_data, axis=1, keepdims=True)
        exp_logits = np.exp(logits_data - max_logits)
        softmax = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

        # Flatten the class indices to one per sample. Targets shaped
        # (batch, 1) would otherwise broadcast against the row index into a
        # (batch, batch) grid and mark every row with every sample's class.
        class_ids = np.reshape(self.targets_data, -1)

        # Create one-hot encoding of targets
        one_hot = np.zeros((self.batch_size, self.num_classes), dtype=np.float32)
        one_hot[np.arange(self.batch_size), class_ids] = 1.0

        # Gradient: (softmax - one_hot) / batch_size
        grad = (softmax - one_hot) / self.batch_size

        return (grad * grad_output,)
|
||
|
||
# %% ../../modules/05_autograd/autograd.ipynb 35
|
||
def enable_autograd():
|
||
"""
|
||
Enable gradient tracking for all Tensor operations.
|
||
|
||
This function enhances the existing Tensor class with autograd capabilities.
|
||
Call this once to activate gradients globally.
|
||
|
||
**What it does:**
|
||
- Replaces Tensor operations with gradient-tracking versions
|
||
- Adds backward() method for reverse-mode differentiation
|
||
- Enables computation graph building
|
||
- Maintains full backward compatibility
|
||
|
||
**After calling this:**
|
||
- Tensor operations will track computation graphs
|
||
- backward() method becomes available
|
||
- Gradients will flow through operations
|
||
- requires_grad=True enables tracking per tensor
|
||
|
||
**Example:**
|
||
```python
|
||
enable_autograd() # Call once
|
||
x = Tensor([2.0], requires_grad=True)
|
||
y = x * 3
|
||
y.backward()
|
||
print(x.grad) # [3.0]
|
||
```
|
||
"""
|
||
|
||
# Educational Note: hasattr() is LEGITIMATE here because:
|
||
# 1. This is a runtime monkey-patch system (meta-programming)
|
||
# 2. We're checking if a class has been dynamically modified
|
||
# 3. _autograd_enabled is a marker attribute we add at runtime
|
||
# This is the CORRECT use of hasattr() for dynamic class modification
|
||
if hasattr(Tensor, '_autograd_enabled'):
|
||
print("⚠️ Autograd already enabled")
|
||
return
|
||
|
||
# Store original operations
|
||
# These are guaranteed to exist from Module 01 (Tensor class)
|
||
_original_add = Tensor.__add__
|
||
_original_sub = Tensor.__sub__
|
||
_original_mul = Tensor.__mul__
|
||
_original_div = Tensor.__truediv__
|
||
_original_getitem = Tensor.__getitem__
|
||
|
||
# These methods are also guaranteed from Module 01 - trust Single Tensor Class
|
||
_original_matmul = Tensor.matmul
|
||
_original_transpose = Tensor.transpose
|
||
_original_reshape = Tensor.reshape
|
||
|
||
# Enhanced operations that track gradients
|
||
def tracked_add(self, other):
|
||
"""
|
||
Addition with gradient tracking.
|
||
|
||
Enhances the original __add__ method to build computation graphs
|
||
when requires_grad=True for any input.
|
||
"""
|
||
# Convert scalar to Tensor if needed
|
||
if not isinstance(other, Tensor):
|
||
other = Tensor(other)
|
||
|
||
# Call original operation
|
||
result = _original_add(self, other)
|
||
|
||
# Track gradient if needed
|
||
if self.requires_grad or other.requires_grad:
|
||
result.requires_grad = True
|
||
result._grad_fn = AddBackward(self, other)
|
||
|
||
return result
|
||
|
||
def tracked_mul(self, other):
|
||
"""
|
||
Multiplication with gradient tracking.
|
||
|
||
Enhances the original __mul__ method to build computation graphs
|
||
when requires_grad=True for any input.
|
||
"""
|
||
# Convert scalar to Tensor if needed for consistency
|
||
if not isinstance(other, Tensor):
|
||
other_tensor = Tensor(other)
|
||
else:
|
||
other_tensor = other
|
||
|
||
# Call original operation
|
||
result = _original_mul(self, other)
|
||
|
||
# Track gradient if needed
|
||
if self.requires_grad or (isinstance(other, Tensor) and other.requires_grad):
|
||
result.requires_grad = True
|
||
result._grad_fn = MulBackward(self, other)
|
||
|
||
return result
|
||
|
||
def tracked_matmul(self, other):
|
||
"""
|
||
Matrix multiplication with gradient tracking.
|
||
|
||
Enhances the original matmul method to build computation graphs
|
||
when requires_grad=True for any input.
|
||
"""
|
||
# Call original matmul from Module 01
|
||
result = _original_matmul(self, other)
|
||
|
||
# Track gradient if needed
|
||
if self.requires_grad or other.requires_grad:
|
||
result.requires_grad = True
|
||
result._grad_fn = MatmulBackward(self, other)
|
||
|
||
return result
|
||
|
||
def tracked_transpose(self, dim0=None, dim1=None):
    """
    Transpose with gradient tracking.

    Delegates to the original ``transpose`` (``_original_transpose``) and,
    when this tensor requires gradients, attaches a ``TransposeBackward``
    node that records the same ``dim0`` / ``dim1`` arguments.

    Args:
        dim0: First dimension argument, forwarded unchanged.
        dim1: Second dimension argument, forwarded unchanged.

    Returns:
        The Tensor produced by the original transpose.
    """
    transposed = _original_transpose(self, dim0, dim1)

    if not self.requires_grad:
        return transposed

    # Remember which axes were swapped so the backward pass can undo it
    transposed.requires_grad = True
    transposed._grad_fn = TransposeBackward(self, dim0, dim1)
    return transposed
|
||
|
||
def tracked_reshape(self, *shape):
    """
    Reshape with gradient tracking.

    Delegates to the original ``reshape`` (``_original_reshape``) and, when
    this tensor requires gradients, attaches a ``ReshapeBackward`` node
    carrying the shape the tensor had before the call.

    Args:
        *shape: Target shape, forwarded unchanged to the original reshape.

    Returns:
        The Tensor produced by the original reshape.
    """
    # Capture the input shape before reshaping; the backward node needs it
    shape_before = self.shape

    reshaped = _original_reshape(self, *shape)

    if not self.requires_grad:
        return reshaped

    reshaped.requires_grad = True
    reshaped._grad_fn = ReshapeBackward(self, shape_before)
    return reshaped
|
||
|
||
def tracked_sub(self, other):
    """
    Subtraction with gradient tracking.

    Wraps the original ``__sub__`` implementation (``_original_sub``) and
    attaches a ``SubBackward`` node to the output when either operand
    requires gradients.

    Args:
        other: Right-hand operand. Anything that is not already a Tensor
            is wrapped in ``Tensor(...)`` first.

    Returns:
        The Tensor produced by the original subtraction.
    """
    # Promote non-Tensor operands so `requires_grad` can be read uniformly
    subtrahend = other if isinstance(other, Tensor) else Tensor(other)

    difference = _original_sub(self, subtrahend)

    needs_graph = self.requires_grad or subtrahend.requires_grad
    if needs_graph:
        difference.requires_grad = True
        difference._grad_fn = SubBackward(self, subtrahend)

    return difference
|
||
|
||
def tracked_div(self, other):
    """
    Division with gradient tracking.

    Wraps the original ``__truediv__`` implementation (``_original_div``)
    and attaches a ``DivBackward`` node to the output when either operand
    requires gradients.

    Args:
        other: Divisor. Anything that is not already a Tensor is wrapped
            in ``Tensor(...)`` first.

    Returns:
        The Tensor produced by the original division.
    """
    # Promote non-Tensor divisors so `requires_grad` can be read uniformly
    divisor = other if isinstance(other, Tensor) else Tensor(other)

    quotient = _original_div(self, divisor)

    # Early exit: no graph node needed for untracked inputs
    if not (self.requires_grad or divisor.requires_grad):
        return quotient

    quotient.requires_grad = True
    quotient._grad_fn = DivBackward(self, divisor)
    return quotient
|
||
|
||
def tracked_getitem(self, key):
    """
    Indexing/slicing with gradient tracking.

    Delegates to the original ``__getitem__`` (``_original_getitem``) and,
    when this tensor requires gradients, attaches a ``SliceBackward`` node
    that records the index expression used.

    Args:
        key: Index or slice expression, forwarded unchanged.

    Returns:
        The Tensor produced by the original indexing operation.
    """
    selection = _original_getitem(self, key)

    if not self.requires_grad:
        return selection

    # Keep the key so the backward node knows which elements were selected
    selection.requires_grad = True
    selection._grad_fn = SliceBackward(self, key)
    return selection
|
||
|
||
def sum_op(self, axis=None, keepdims=False):
    """
    Sum operation with gradient tracking.

    Computes ``np.sum`` over ``self.data`` and wraps the result in a new
    Tensor. When this tensor requires gradients, a ``SumBackward`` node is
    attached to the output.

    Args:
        axis: Axis or axes to reduce over; passed straight to ``np.sum``.
        keepdims: Whether reduced axes are kept with size one; passed
            straight to ``np.sum``.

    Returns:
        A new Tensor holding the summed values.
    """
    total = Tensor(np.sum(self.data, axis=axis, keepdims=keepdims))

    if not self.requires_grad:
        return total

    total.requires_grad = True
    # NOTE: only the input tensor is handed to SumBackward; `axis` and
    # `keepdims` are not forwarded to it.
    total._grad_fn = SumBackward(self)
    return total
|
||
|
||
def backward(self, gradient=None):
    """
    Compute gradients via backpropagation.

    This is the key method that makes training possible!
    It implements reverse-mode automatic differentiation.

    **Algorithm:**
    1. Return immediately if this tensor does not require gradients
    2. Seed the gradient with ones when none is given (single-element outputs only)
    3. Normalise the gradient to a NumPy array
    4. Sum out broadcast dimensions so the gradient matches ``self.data``
    5. Accumulate the gradient in ``self.grad``
    6. If this tensor has a ``_grad_fn``, call it and recursively call
       ``backward()`` on the parent tensors it saved

    Args:
        gradient: Gradient flowing into this tensor. May be a NumPy array,
            an object exposing its values through ``.data`` (such as a
            Tensor), or a plain scalar/sequence. It may be omitted only
            when ``self.data`` holds exactly one element.

    Raises:
        ValueError: If ``gradient`` is omitted and ``self.data`` does not
            hold exactly one element.

    **Example:**
    ```python
    x = Tensor([2.0], requires_grad=True)
    y = x * 3
    y.backward()  # Computes gradients for x
    print(x.grad)  # [3.0]
    ```
    """
    # Only compute gradients if required
    if not self.requires_grad:
        return

    if gradient is None:
        # An implicit seed of ones is only well-defined for a single element
        if self.data.size != 1:
            raise ValueError(
                f"backward() called on non-scalar tensor without gradient argument.\n"
                f"  Tensor shape: {self.shape}\n"
                f"  Issue: For non-scalar outputs, you must provide the gradient from the next layer.\n"
                f"  Fix: Call backward(gradient) with the gradient tensor from the loss function."
            )
        gradient = np.ones_like(self.data)
    elif not isinstance(gradient, (np.ndarray, np.generic)):
        # Accept Tensor-like objects (unwrapped through `.data`) as well as
        # plain scalars and sequences. NumPy values are excluded first
        # because ndarrays expose an unrelated `.data` buffer attribute.
        gradient = np.asarray(getattr(gradient, "data", gradient))

    # Initialize the accumulator on first use
    if self.grad is None:
        self.grad = np.zeros_like(self.data)

    # Handle broadcasting: sum gradient to match self.data shape
    # This happens when operations broadcast tensors (e.g., adding bias to batch)
    if gradient.shape != self.grad.shape:
        # Step 1: Remove extra leading dimensions added during forward pass
        # Example: gradient (batch_size, features) -> self.grad (features,)
        while gradient.ndim > self.grad.ndim:
            gradient = gradient.sum(axis=0)

        # Step 2: Sum over dimensions that were size-1 in original tensor
        # Example: bias with shape (1,) broadcast to (batch_size,) during forward
        for axis in range(gradient.ndim):
            if self.grad.shape[axis] == 1 and gradient.shape[axis] != 1:
                gradient = gradient.sum(axis=axis, keepdims=True)

    # In-place accumulation: repeated backward() calls add up until zero_grad()
    self.grad += gradient

    # Propagate gradients through computation graph
    # _grad_fn is set by autograd enhancement when tensor is created from an operation
    grad_fn = getattr(self, '_grad_fn', None)
    if grad_fn is None:
        return

    # The (shape-reduced) incoming gradient, not the accumulated total, is
    # what gets pushed to the parents
    parent_grads = grad_fn.apply(gradient)

    # Recursively call backward on parent tensors
    for parent, parent_grad in zip(grad_fn.saved_tensors, parent_grads):
        if isinstance(parent, Tensor) and parent.requires_grad and parent_grad is not None:
            parent.backward(parent_grad)
|
||
|
||
def zero_grad(self):
    """
    Clear the stored gradient.

    Sets ``self.grad`` to ``None`` so the next backward pass starts from a
    fresh accumulator instead of adding to gradients left over from
    previous iterations.
    """
    self.grad = None
|
||
|
||
# Install enhanced operations
|
||
Tensor.__add__ = tracked_add
|
||
Tensor.__sub__ = tracked_sub
|
||
Tensor.__mul__ = tracked_mul
|
||
Tensor.__truediv__ = tracked_div
|
||
Tensor.__getitem__ = tracked_getitem
|
||
Tensor.matmul = tracked_matmul
|
||
Tensor.transpose = tracked_transpose
|
||
Tensor.reshape = tracked_reshape
|
||
Tensor.sum = sum_op
|
||
Tensor.backward = backward
|
||
Tensor.zero_grad = zero_grad
|
||
|
||
# Patch activations and losses to track gradients
|
||
try:
|
||
from tinytorch.core.activations import Sigmoid, ReLU, Softmax, GELU
|
||
from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss, CrossEntropyLoss
|
||
|
||
# Store original methods
|
||
_original_sigmoid_forward = Sigmoid.forward
|
||
_original_relu_forward = ReLU.forward
|
||
_original_softmax_forward = Softmax.forward
|
||
_original_gelu_forward = GELU.forward
|
||
_original_bce_forward = BinaryCrossEntropyLoss.forward
|
||
_original_mse_forward = MSELoss.forward
|
||
_original_ce_forward = CrossEntropyLoss.forward
|
||
|
||
def tracked_sigmoid_forward(self, x):
    """
    Sigmoid with gradient tracking.

    Computes ``1 / (1 + exp(-x))`` element-wise on ``x.data`` in a
    numerically stable way and wraps the result in a new Tensor. When
    ``x`` requires gradients, a ``SigmoidBackward`` node holding both the
    input and the output is attached.

    Args:
        x: Input Tensor.

    Returns:
        A new Tensor with the sigmoid of ``x.data``.
    """
    values = x.data

    # exp() is only ever applied to non-positive numbers, so it cannot
    # overflow for large-magnitude negative inputs (the naive exp(-x) does).
    decay = np.exp(-np.abs(values))
    result_data = np.where(
        values >= 0,
        1.0 / (1.0 + decay),    # x >= 0: 1 / (1 + e^-x)
        decay / (1.0 + decay),  # x <  0: e^x / (1 + e^x), same value
    )
    result = Tensor(result_data)

    if x.requires_grad:
        result.requires_grad = True
        result._grad_fn = SigmoidBackward(x, result)

    return result
|
||
|
||
def tracked_relu_forward(self, x):
    """
    ReLU with gradient tracking.

    Computes ``max(0, x)`` element-wise on ``x.data`` and wraps the result
    in a new Tensor. When ``x`` requires gradients, a ``ReLUBackward`` node
    holding the input is attached.

    Args:
        x: Input Tensor.

    Returns:
        A new Tensor with negative entries replaced by zero.
    """
    activated = Tensor(np.maximum(0, x.data))

    if not x.requires_grad:
        return activated

    activated.requires_grad = True
    activated._grad_fn = ReLUBackward(x)
    return activated
|
||
|
||
def tracked_softmax_forward(self, x, dim=-1):
    """
    Softmax with gradient tracking.

    Delegates the forward computation to the original ``Softmax.forward``
    (``_original_softmax_forward``). When ``x`` requires gradients, the
    output's ``_grad_fn`` is set to a ``SoftmaxBackward`` node built from
    the input, the output and ``dim`` (replacing whatever the original
    forward may have set).

    Args:
        x: Input Tensor.
        dim: Dimension argument, forwarded by keyword to the original
            forward and to ``SoftmaxBackward``.

    Returns:
        The Tensor returned by the original forward.
    """
    probabilities = _original_softmax_forward(self, x, dim=dim)

    if not x.requires_grad:
        return probabilities

    probabilities.requires_grad = True
    probabilities._grad_fn = SoftmaxBackward(x, probabilities, dim)
    return probabilities
|
||
|
||
def tracked_gelu_forward(self, x):
    """
    GELU with gradient tracking.

    Delegates the forward computation to the original ``GELU.forward``
    (``_original_gelu_forward``). When ``x`` requires gradients, the
    output's ``_grad_fn`` is set to a ``GELUBackward`` node built from the
    input (replacing whatever the original forward may have set).

    Args:
        x: Input Tensor.

    Returns:
        The Tensor returned by the original forward.
    """
    activated = _original_gelu_forward(self, x)

    if not x.requires_grad:
        return activated

    activated.requires_grad = True
    activated._grad_fn = GELUBackward(x)
    return activated
|
||
|
||
def tracked_bce_forward(self, predictions, targets):
    """
    Binary cross-entropy with gradient tracking.

    Clips ``predictions.data`` to ``[EPSILON, 1 - EPSILON]`` so neither
    logarithm sees zero, computes the mean of
    ``-(t * log(p) + (1 - t) * log(1 - p))`` over all elements, and wraps
    it in a new Tensor. When ``predictions`` requires gradients, a
    ``BCEBackward`` node is attached.

    Args:
        predictions: Tensor of predicted values.
        targets: Tensor of target values; only ``targets.data`` is read.

    Returns:
        A new Tensor holding the mean loss.
    """
    eps = EPSILON
    safe_preds = np.clip(predictions.data, eps, 1 - eps)
    log_p = np.log(safe_preds)
    log_not_p = np.log(1 - safe_preds)

    labels = targets.data
    per_element = -(labels * log_p + (1 - labels) * log_not_p)
    loss = Tensor(np.mean(per_element))

    # Gradients flow only to the predictions, never to the targets
    if not predictions.requires_grad:
        return loss

    loss.requires_grad = True
    loss._grad_fn = BCEBackward(predictions, targets)
    return loss
|
||
|
||
def tracked_mse_forward(self, predictions, targets):
    """
    MSE loss with gradient tracking.

    Computes the mean of the squared element-wise difference between
    ``predictions.data`` and ``targets.data`` and wraps it in a new Tensor.
    When ``predictions`` requires gradients, an ``MSEBackward`` node is
    attached.

    Args:
        predictions: Tensor of predicted values.
        targets: Tensor of target values; only ``targets.data`` is read.

    Returns:
        A new Tensor holding the mean squared error.
    """
    residual = predictions.data - targets.data
    loss = Tensor(np.mean(residual ** 2))

    # Gradients flow only to the predictions, never to the targets
    if not predictions.requires_grad:
        return loss

    loss.requires_grad = True
    loss._grad_fn = MSEBackward(predictions, targets)
    return loss
|
||
|
||
def tracked_ce_forward(self, logits, targets):
    """
    Cross-entropy loss with gradient tracking.

    Applies ``log_softmax`` over the last dimension of ``logits``, picks
    the log-probability at each row's target index, and returns the
    negated mean as a new Tensor. When ``logits`` requires gradients, a
    ``CrossEntropyBackward`` node is attached.

    Args:
        logits: Tensor of scores; ``logits.shape[0]`` is used as the
            number of rows to index.
        targets: Tensor of class indices; ``targets.data`` is cast to
            ``int`` before indexing.

    Returns:
        A new Tensor holding the mean negative log-likelihood.
    """
    from tinytorch.core.losses import log_softmax

    # log-softmax rather than log(softmax(...)) for numerical stability
    log_probs = log_softmax(logits, dim=-1)

    # Pair every row with its target column via integer-array indexing
    rows = np.arange(logits.shape[0])
    cols = targets.data.astype(int)
    picked = log_probs.data[rows, cols]

    loss = Tensor(-np.mean(picked))

    # Gradients flow only to the logits, never to the targets
    if not logits.requires_grad:
        return loss

    loss.requires_grad = True
    loss._grad_fn = CrossEntropyBackward(logits, targets)
    return loss
|
||
|
||
# Install patched methods
|
||
Sigmoid.forward = tracked_sigmoid_forward
|
||
ReLU.forward = tracked_relu_forward
|
||
Softmax.forward = tracked_softmax_forward
|
||
GELU.forward = tracked_gelu_forward
|
||
BinaryCrossEntropyLoss.forward = tracked_bce_forward
|
||
MSELoss.forward = tracked_mse_forward
|
||
CrossEntropyLoss.forward = tracked_ce_forward
|
||
|
||
except ImportError:
|
||
# Activations/losses not yet available (happens during module development)
|
||
pass
|
||
|
||
# Mark as enabled
|
||
Tensor._autograd_enabled = True
|
||
|
||
print("✅ Autograd enabled! Tensors now track gradients.")
|
||
print(" - Operations build computation graphs")
|
||
print(" - backward() computes gradients")
|
||
print(" - requires_grad=True enables tracking")
|
||
|
||
# Auto-enable when module is imported
|
||
enable_autograd()
|