Files
TinyTorch/tinytorch/core/autograd.py
Vijay Janapa Reddi ee9355584f Fix all module tests after merge - 20/20 passing
Fixes after merge conflicts:
- Fix tensor reshape error message format
- Fix __init__.py imports (remove BatchNorm2d, fix enable_autograd call)
- Fix attention mask broadcasting for multi-head attention
- Fix memoization module to use matmul instead of @ operator
- Fix capstone module count_parameters and CosineSchedule usage
- Add missing imports to benchmark.py (dataclass, Profiler, platform, os)
- Simplify capstone pipeline test to avoid data shape mismatch

All 20 modules now pass tito test --all
2025-12-03 08:14:27 -08:00

1298 lines
45 KiB
Python
Generated
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: src/05_autograd/05_autograd.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners) ║
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['EPSILON', 'Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward',
'TransposeBackward', 'PermuteBackward', 'EmbeddingBackward', 'SliceBackward', 'ReshapeBackward',
'SumBackward', 'ReLUBackward', 'SigmoidBackward', 'SoftmaxBackward', 'GELUBackward', 'MSEBackward',
'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
# %% ../../modules/05_autograd/autograd.ipynb 1
import numpy as np
from typing import Optional, List, Tuple
import sys
import os
from .tensor import Tensor
# Numerical constants (EPSILON is also the clipping margin for probabilities in BCEBackward)
EPSILON = 1e-7 # Small perturbation for numerical gradient computation
# %% ../../modules/05_autograd/autograd.ipynb 6
class Function:
    """
    Node of the autograd graph: the backward rule of one differentiable op.

    Subclasses override apply() to turn the gradient of the op's output into
    one gradient per input.

    **Attributes:**
    - **saved_tensors**: tuple of the inputs passed to the constructor
    - **next_functions**: the `_grad_fn` of every saved input that is a Tensor,
      has `requires_grad` set, and carries a non-None `_grad_fn`

    **Example Usage:**
    ```python
    class AddBackward(Function):
        def apply(self, grad_output):
            return grad_output, grad_output
    ```
    """

    def __init__(self, *tensors):
        """
        Remember the op's inputs and link to the functions that produced them.

        Args:
            *tensors: Inputs of the forward operation (Tensors or plain values)
        """
        self.saved_tensors = tensors
        # Leaf tensors and non-Tensor operands have no producer to link to.
        self.next_functions = [
            t._grad_fn
            for t in tensors
            if isinstance(t, Tensor)
            and t.requires_grad
            and getattr(t, '_grad_fn', None) is not None
        ]

    def apply(self, grad_output):
        """
        Map the output gradient to input gradients.

        Args:
            grad_output: Gradient flowing backward from the output
        Returns:
            Tuple with one gradient per saved input
        Raises:
            NotImplementedError: always, in this base class
        """
        raise NotImplementedError("Each Function must implement apply() method")
# %% ../../modules/05_autograd/autograd.ipynb 9
class AddBackward(Function):
    """
    Backward rule for z = a + b.

    **Mathematical Rule:** ∂z/∂a = 1 and ∂z/∂b = 1, so the upstream gradient
    is handed to each input unchanged.

    **Broadcasting:** apply() does not reduce shapes itself; it returns
    grad_output as-is even when the forward add broadcast its operands.
    """

    def apply(self, grad_output):
        """
        Route the upstream gradient to both operands.

        Args:
            grad_output: Gradient flowing backward from a + b
        Returns:
            (grad_a, grad_b); an entry is None when that operand is not a
            Tensor with requires_grad set
        """
        lhs, rhs = self.saved_tensors

        def wants_grad(operand):
            return isinstance(operand, Tensor) and operand.requires_grad

        return (
            grad_output if wants_grad(lhs) else None,
            grad_output if wants_grad(rhs) else None,
        )
# %% ../../modules/05_autograd/autograd.ipynb 11
class MulBackward(Function):
    """
    Backward rule for element-wise z = a * b.

    **Mathematical Rule:** ∂z/∂a = b and ∂z/∂b = a (product rule): each
    operand's gradient is the upstream gradient scaled by the OTHER operand.
    """

    def apply(self, grad_output):
        """
        Compute the gradients of a product.

        Args:
            grad_output: Gradient flowing backward from a * b
        Returns:
            (grad_a, grad_b); an entry is None when that operand is not a
            Tensor with requires_grad set
        """
        a, b = self.saved_tensors
        b_is_tensor = isinstance(b, Tensor)

        grad_a = None
        if isinstance(a, Tensor) and a.requires_grad:
            # b may be a plain scalar/array rather than a Tensor
            multiplier = b.data if b_is_tensor else b
            grad_a = grad_output * multiplier

        grad_b = None
        if b_is_tensor and b.requires_grad:
            grad_b = grad_output * a.data

        return grad_a, grad_b
# %% ../../modules/05_autograd/autograd.ipynb 13
class SubBackward(Function):
    """
    Backward rule for z = a - b.

    **Mathematical Rule:** ∂z/∂a = 1 and ∂z/∂b = -1
    """

    def apply(self, grad_output):
        """
        Compute the gradients of a difference.

        Args:
            grad_output: Gradient flowing backward from a - b
        Returns:
            (grad_a, grad_b): grad_a is grad_output, grad_b its negation;
            None for an operand that is not a Tensor requiring gradients
        """
        minuend, subtrahend = self.saved_tensors
        minuend_tracked = isinstance(minuend, Tensor) and minuend.requires_grad
        subtrahend_tracked = isinstance(subtrahend, Tensor) and subtrahend.requires_grad

        grad_minuend = grad_output if minuend_tracked else None
        # the subtracted operand sees the gradient with flipped sign
        grad_subtrahend = -grad_output if subtrahend_tracked else None
        return grad_minuend, grad_subtrahend
# %% ../../modules/05_autograd/autograd.ipynb 15
class DivBackward(Function):
    """
    Backward rule for z = a / b.

    **Mathematical Rule:**
    - ∂z/∂a = 1/b
    - ∂z/∂b = -a/b²
    """

    def apply(self, grad_output):
        """
        Compute the gradients of a quotient.

        Args:
            grad_output: Gradient flowing backward from a / b
        Returns:
            (grad_a, grad_b); None for an operand that is not a Tensor
            requiring gradients
        """
        numerator, denominator = self.saved_tensors
        denominator_is_tensor = isinstance(denominator, Tensor)

        grad_numerator = None
        if isinstance(numerator, Tensor) and numerator.requires_grad:
            # the divisor may be a plain scalar/array rather than a Tensor
            divisor = denominator.data if denominator_is_tensor else denominator
            grad_numerator = grad_output / divisor

        grad_denominator = None
        if denominator_is_tensor and denominator.requires_grad:
            grad_denominator = -grad_output * numerator.data / (denominator.data ** 2)

        return grad_numerator, grad_denominator
# %% ../../modules/05_autograd/autograd.ipynb 17
class MatmulBackward(Function):
    """
    Backward rule for Z = A @ B.

    **Mathematical Rule:**
    - ∂L/∂A = grad_Z @ B.T
    - ∂L/∂B = A.T @ grad_Z

    **Batched Operation:** only the last two axes are transposed
    (np.swapaxes), so leading batch axes are preserved.

    **1-D operands:** np.matmul treats a 1-D left operand as a row vector and
    a 1-D right operand as a column vector and drops the helper axis from the
    result. apply() repeats that promotion so matrix @ vector, vector @ matrix
    and vector @ vector all get correct gradients.
    """

    def apply(self, grad_output):
        """
        Compute the gradients of a matrix product.

        Args:
            grad_output: Gradient flowing backward from A @ B
        Returns:
            (grad_a, grad_b); None for an operand that is not a Tensor
            requiring gradients. For a 1-D operand multiplied with a batched
            operand the gradient keeps the batch axes (it is not summed here).
        """
        a, b = self.saved_tensors
        a_data = a.data if isinstance(a, Tensor) else np.asarray(a)
        b_data = b.data if isinstance(b, Tensor) else np.asarray(b)
        a_is_vector = a_data.ndim == 1
        b_is_vector = b_data.ndim == 1

        # Redo np.matmul's promotion: (n,) -> (1, n) on the left,
        # (n,) -> (n, 1) on the right, and give grad_output back the axes
        # that matmul removed from the forward result.
        a_mat = a_data[np.newaxis, :] if a_is_vector else a_data
        b_mat = b_data[:, np.newaxis] if b_is_vector else b_data
        grad_mat = np.asarray(grad_output)
        if b_is_vector:
            grad_mat = np.expand_dims(grad_mat, -1)
        if a_is_vector:
            grad_mat = np.expand_dims(grad_mat, -2)

        grad_a = grad_b = None
        if isinstance(a, Tensor) and a.requires_grad:
            grad_a = np.matmul(grad_mat, np.swapaxes(b_mat, -2, -1))
            if a_is_vector:
                # drop the row axis that was added for the promotion
                grad_a = np.squeeze(grad_a, axis=-2)
        if isinstance(b, Tensor) and b.requires_grad:
            grad_b = np.matmul(np.swapaxes(a_mat, -2, -1), grad_mat)
            if b_is_vector:
                # drop the column axis that was added for the promotion
                grad_b = np.squeeze(grad_b, axis=-1)
        return grad_a, grad_b
# %% ../../modules/05_autograd/autograd.ipynb 18
class TransposeBackward(Function):
    """
    Backward rule for a two-axis transpose.

    **Mathematical Rule:** If Y = X.T then ∂L/∂X = (∂L/∂Y).T
    **Key Insight:** Swapping two axes is its own inverse, so the gradient is
    sent back by swapping the same two axes again.
    """

    def __init__(self, tensor, dim0, dim1):
        """
        Args:
            tensor: Input tensor
            dim0: First axis that was swapped (None for the default)
            dim1: Second axis that was swapped (None for the default)
        """
        super().__init__(tensor)
        self.dim0 = dim0
        self.dim1 = dim1

    @staticmethod
    def _swapped_order(ndim, first, second):
        """Return the axis order 0..ndim-1 with `first` and `second` exchanged."""
        order = list(range(ndim))
        order[first], order[second] = order[second], order[first]
        return order

    def apply(self, grad_output):
        """
        Transpose the upstream gradient back to the input layout.

        Args:
            grad_output: Gradient flowing backward from the transposed output
        Returns:
            1-tuple with the gradient for the input (None if the input does
            not require gradients)
        """
        (x,) = self.saved_tensors
        if not (isinstance(x, Tensor) and x.requires_grad):
            return (None,)

        if self.dim0 is None and self.dim1 is None:
            # Default transpose swaps the last two axes; with fewer than two
            # axes there is nothing to swap and a copy is returned.
            if grad_output.ndim < 2:
                return (grad_output.copy(),)
            order = self._swapped_order(grad_output.ndim, -2, -1)
        else:
            order = self._swapped_order(grad_output.ndim, self.dim0, self.dim1)
        return (np.transpose(grad_output, order),)
# %% ../../modules/05_autograd/autograd.ipynb 19
class PermuteBackward(Function):
    """
    Backward rule for an arbitrary axis permutation (general transpose).

    **Mathematical Rule:** If Y = X.permute(axes), then:
    - ∂L/∂X = grad_Y.permute(inverse_axes)
    **Key Insight:** Output axis i is taken from input axis axes[i], so the
    inverse satisfies inverse_axes[axes[i]] = i.
    **Example:** axes = (0, 2, 1, 3) is its own inverse; axes = (2, 0, 1) has
    inverse (1, 2, 0).
    """

    def __init__(self, tensor, axes):
        """
        Args:
            tensor: Input tensor
            axes: Sequence of axis indices defining the permutation; negative
                indices count from the end, as in np.transpose
        """
        super().__init__(tensor)
        self.axes = axes
        ndim = len(axes)
        # argsort only inverts a permutation of 0..ndim-1, so negative
        # indices must first be mapped to their non-negative equivalents.
        normalized = [int(ax) + ndim if ax < 0 else int(ax) for ax in axes]
        self.inverse_axes = tuple(int(i) for i in np.argsort(normalized))

    def apply(self, grad_output):
        """
        Permute the upstream gradient back to the input's axis order.

        Args:
            grad_output: Gradient flowing backward from the permuted output
        Returns:
            1-tuple with the gradient for the input (None if the input does
            not require gradients)
        """
        (x,) = self.saved_tensors
        if not (isinstance(x, Tensor) and x.requires_grad):
            return (None,)
        return (np.transpose(grad_output, self.inverse_axes),)
# %% ../../modules/05_autograd/autograd.ipynb 20
class EmbeddingBackward(Function):
    """
    Backward rule for an embedding lookup Y = Embedding[indices].

    **Mathematical Rule:** ∂L/∂Embedding[i] is the sum of the upstream
    gradient rows whose index equals i.
    **Key Insight:** The forward pass gathers rows; the backward pass scatters
    and accumulates gradients into those same rows.
    """

    def __init__(self, weight, indices):
        """
        Args:
            weight: Embedding weight matrix
            indices: Indices used for the lookup (its `.data` is read in apply)
        """
        super().__init__(weight)
        self.indices = indices

    def apply(self, grad_output):
        """
        Scatter-add the upstream gradient into a weight-shaped array.

        Args:
            grad_output: Gradient flowing backward from the lookup result
        Returns:
            1-tuple with the gradient for the weight (None if the weight does
            not require gradients)
        """
        (weight,) = self.saved_tensors
        if not (isinstance(weight, Tensor) and weight.requires_grad):
            return (None,)

        grad_weight = np.zeros_like(weight.data)
        row_ids = self.indices.data.astype(int).flatten()
        # one gradient row per looked-up index
        row_grads = grad_output.reshape(-1, grad_output.shape[-1])
        # add.at accumulates when the same row id occurs more than once
        np.add.at(grad_weight, row_ids, row_grads)
        return (grad_weight,)
class SliceBackward(Function):
    """
    Backward rule for indexing/slicing Y = X[key].

    **Mathematical Rule:**
    - ∂L/∂X[key] = grad_output
    - ∂L/∂X[other positions] = 0
    **Key Insight:** The backward pass scatters the upstream gradient into a
    zero array of the original shape. When the key selects the same position
    more than once (e.g. X[[0, 0, 1]]), the contributions are summed.

    **Examples:**
    >>> x = Tensor([1, 2, 3, 4, 5], requires_grad=True)
    >>> y = x[:3]  # Slice first 3 elements
    >>> loss = y.sum()
    >>> loss.backward()
    >>> # x.grad = [1, 1, 1, 0, 0] - gradients only for sliced positions
    """

    def __init__(self, tensor, key):
        """
        Args:
            tensor: Original tensor being sliced
            key: Indexing key (index, slice, tuple of slices, index array, ...)
        """
        super().__init__(tensor)
        self.key = key
        self.original_shape = tensor.shape

    def apply(self, grad_output):
        """
        Scatter the upstream gradient back to the positions selected by key.

        Args:
            grad_output: Gradient flowing backward from the sliced output
        Returns:
            1-tuple with a float32 gradient of the original shape (None if the
            input does not require gradients)
        """
        (tensor,) = self.saved_tensors
        if not (isinstance(tensor, Tensor) and tensor.requires_grad):
            return (None,)

        grad_input = np.zeros(self.original_shape, dtype=np.float32)
        # Plain assignment keeps only the last write for a repeated index and
        # would drop gradient; add.at accumulates every contribution.
        np.add.at(grad_input, self.key, grad_output)
        return (grad_input,)
# %% ../../modules/05_autograd/autograd.ipynb 21
class ReshapeBackward(Function):
    """
    Backward rule for Y = X.reshape(new_shape).

    **Mathematical Rule:** ∂L/∂X = (∂L/∂Y).reshape(X.shape)
    **Key Insight:** Reshape only rearranges the same elements, so the
    gradient is reshaped back to the pre-reshape shape.
    """

    def __init__(self, tensor, original_shape):
        """
        Args:
            tensor: Input tensor
            original_shape: Shape of the input before the reshape
        """
        super().__init__(tensor)
        self.original_shape = original_shape

    def apply(self, grad_output):
        """
        Give the upstream gradient the input's original shape.

        Args:
            grad_output: Gradient flowing backward from the reshaped output
        Returns:
            1-tuple with the gradient for the input (None if the input does
            not require gradients)
        """
        (x,) = self.saved_tensors
        tracked = isinstance(x, Tensor) and x.requires_grad
        return (grad_output.reshape(self.original_shape) if tracked else None,)
# %% ../../modules/05_autograd/autograd.ipynb 23
class SumBackward(Function):
    """
    Backward rule for z = sum(a).

    **Mathematical Rule:** ∂z/∂a[i] = 1 for every element i
    **Key Insight:** Every input element receives the upstream gradient,
    obtained here by broadcasting grad_output against an array of ones with
    the input's shape.
    """

    def apply(self, grad_output):
        """
        Broadcast the upstream gradient to the shape of the summed input.

        Args:
            grad_output: Gradient flowing backward from the sum; it must be
                broadcastable against the input's shape
        Returns:
            1-tuple with the gradient for the input (None if the input does
            not require gradients)
        """
        (summed,) = self.saved_tensors
        if not (isinstance(summed, Tensor) and summed.requires_grad):
            return (None,)
        return (np.ones_like(summed.data) * grad_output,)
# %% ../../modules/05_autograd/autograd.ipynb 28
class ReLUBackward(Function):
    """
    Backward rule for ReLU, f(x) = max(0, x).

    Derivative: f'(x) = 1 where x > 0, otherwise 0.
    """

    def __init__(self, input_tensor):
        """Save the activation's input for the backward pass."""
        super().__init__(input_tensor)

    def apply(self, grad_output):
        """
        Mask the upstream gradient to the positions where the input was > 0.

        Returns:
            1-tuple with the gradient for the input (None if the input does
            not require gradients)
        """
        (x,) = self.saved_tensors
        if not (isinstance(x, Tensor) and x.requires_grad):
            return (None,)
        positive_mask = (x.data > 0).astype(np.float32)
        return (grad_output * positive_mask,)
# %% ../../modules/05_autograd/autograd.ipynb 29
class SigmoidBackward(Function):
    """
    Backward rule for sigmoid, σ(x) = 1/(1 + exp(-x)).

    Derivative: σ'(x) = σ(x) * (1 - σ(x)), evaluated from the saved output.
    """

    def __init__(self, input_tensor, output_tensor):
        """
        Args:
            input_tensor: Original input to sigmoid
            output_tensor: Output of sigmoid; its `.data` is kept so the
                derivative needs no recomputation
        """
        super().__init__(input_tensor)
        self.output_data = output_tensor.data

    def apply(self, grad_output):
        """
        Scale the upstream gradient by σ(x) * (1 - σ(x)).

        Returns:
            1-tuple with the gradient for the input (None if the input does
            not require gradients)
        """
        (x,) = self.saved_tensors
        if not (isinstance(x, Tensor) and x.requires_grad):
            return (None,)
        sigma = self.output_data
        local_slope = sigma * (1 - sigma)
        return (grad_output * local_slope,)
# %% ../../modules/05_autograd/autograd.ipynb 30
class SoftmaxBackward(Function):
    """
    Backward rule for softmax, softmax(x)[i] = exp(x[i]) / sum(exp(x)).

    Jacobian: ∂softmax[i]/∂x[j] = softmax[i] * (δ[i,j] - softmax[j])
    Vectorized gradient:
        grad_x = softmax * (grad_y - sum(grad_y * softmax, keepdims=True))
    **Key Insight:** Because of the normalization, each input's gradient
    depends on every output along the softmax dimension.
    """

    def __init__(self, input_tensor, output_tensor, dim=-1):
        """
        Args:
            input_tensor: Original input to softmax
            output_tensor: Output of softmax; its `.data` is kept for apply()
            dim: Dimension along which softmax was applied
        """
        super().__init__(input_tensor)
        self.output_data = output_tensor.data
        self.dim = dim

    def apply(self, grad_output):
        """
        Apply the softmax Jacobian to the upstream gradient.

        Returns:
            1-tuple with the gradient for the input (None if the input does
            not require gradients)
        """
        (x,) = self.saved_tensors
        if not (isinstance(x, Tensor) and x.requires_grad):
            return (None,)
        probs = self.output_data
        # sum_j grad_y[j] * softmax[j], kept as a size-1 axis for broadcasting
        weighted_total = np.sum(grad_output * probs, axis=self.dim, keepdims=True)
        return (probs * (grad_output - weighted_total),)
# %% ../../modules/05_autograd/autograd.ipynb 31
class GELUBackward(Function):
    """
    Backward rule for GELU using the tanh approximation.

    gelu(x) ≈ 0.5 * x * (1 + tanh(u)),  u = √(2/π) * (x + 0.044715 * x³)
    d gelu/dx ≈ 0.5 * (1 + tanh(u)) + 0.5 * x * sech²(u) * du/dx
    with du/dx = √(2/π) * (1 + 0.134145 * x²)   (0.134145 = 3 * 0.044715)
    """

    def __init__(self, input_tensor):
        """Save the activation's input for the backward pass."""
        super().__init__(input_tensor)

    def apply(self, grad_output):
        """
        Scale the upstream gradient by the analytic derivative of the tanh
        approximation of GELU.

        Returns:
            1-tuple with the gradient for the input (None if the input does
            not require gradients)
        """
        (inp,) = self.saved_tensors
        if not (isinstance(inp, Tensor) and inp.requires_grad):
            return (None,)

        x = inp.data
        scale = np.sqrt(2.0 / np.pi)
        cube = x ** 3
        u = scale * (x + 0.044715 * cube)
        tanh_u = np.tanh(u)
        sech2_u = 1 - tanh_u ** 2  # sech² = 1 - tanh²
        du_dx = scale * (1 + 0.134145 * x ** 2)
        local_slope = 0.5 * (1 + tanh_u) + 0.5 * x * sech2_u * du_dx
        return (grad_output * local_slope,)
# %% ../../modules/05_autograd/autograd.ipynb 32
class MSEBackward(Function):
    """
    Backward rule for mean squared error, L = mean((predictions - targets)²).

    Derivative: ∂L/∂predictions = 2 * (predictions - targets) / N,
    where N is the number of elements in targets.
    """

    def __init__(self, predictions, targets):
        """
        Args:
            predictions: Tensor the loss is differentiated with respect to
            targets: Tensor of targets; only its `.data` is kept
        """
        super().__init__(predictions)
        self.targets_data = targets.data
        self.num_samples = np.size(targets.data)

    def apply(self, grad_output):
        """
        Returns:
            1-tuple with the gradient for predictions (None if predictions do
            not require gradients)
        """
        (preds,) = self.saved_tensors
        if not (isinstance(preds, Tensor) and preds.requires_grad):
            return (None,)
        residual = preds.data - self.targets_data
        local_grad = 2.0 * residual / self.num_samples
        return (local_grad * grad_output,)
# %% ../../modules/05_autograd/autograd.ipynb 33
class BCEBackward(Function):
    """
    Backward rule for binary cross-entropy, L = -[y*log(p) + (1-y)*log(1-p)].

    Derivative: ∂L/∂p = (p - y) / (p * (1 - p) * N),
    where N is the number of elements in targets.
    """

    def __init__(self, predictions, targets):
        """
        Args:
            predictions: Tensor of predicted probabilities
            targets: Tensor of targets; only its `.data` is kept
        """
        super().__init__(predictions)
        self.targets_data = targets.data
        self.num_samples = np.size(targets.data)

    def apply(self, grad_output):
        """
        Returns:
            1-tuple with the gradient for predictions (None if predictions do
            not require gradients)
        """
        (preds,) = self.saved_tensors
        if not (isinstance(preds, Tensor) and preds.requires_grad):
            return (None,)
        # keep p strictly inside (0, 1) so the denominator cannot be zero
        p = np.clip(preds.data, EPSILON, 1 - EPSILON)
        y = self.targets_data
        local_grad = (p - y) / (p * (1 - p) * self.num_samples)
        return (local_grad * grad_output,)
# %% ../../modules/05_autograd/autograd.ipynb 34
class CrossEntropyBackward(Function):
    """
    Backward rule for cross-entropy, L = -mean(log_softmax(logits)[targets]).

    Gradient with respect to the logits:
        ∂L/∂logits = (softmax(logits) - one_hot(targets)) / N
    where N is the batch size (first axis of logits). The logits are read as
    a 2-D (batch, classes) array.
    """

    def __init__(self, logits, targets):
        """
        Args:
            logits: Tensor whose `.data` has shape (batch, classes)
            targets: Tensor of target class indices, one per batch row; its
                `.data` is cast to int
        """
        super().__init__(logits)
        self.targets_data = targets.data.astype(int)
        self.batch_size = logits.data.shape[0]
        self.num_classes = logits.data.shape[1]

    def apply(self, grad_output):
        """
        Compute (softmax - one_hot) / batch_size scaled by grad_output.

        Returns:
            1-tuple with the gradient for the logits (None if the logits do
            not require gradients)
        """
        (logits,) = self.saved_tensors
        if not (isinstance(logits, Tensor) and logits.requires_grad):
            return (None,)

        scores = logits.data
        # subtract the row max before exp() for numerical stability
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        softmax = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        one_hot = np.zeros((self.batch_size, self.num_classes), dtype=np.float32)
        # Flatten first: targets shaped (N, 1) would otherwise broadcast
        # against arange(N) and mark an (N, N) grid of entries.
        class_ids = self.targets_data.reshape(-1)
        one_hot[np.arange(self.batch_size), class_ids] = 1.0

        local_grad = (softmax - one_hot) / self.batch_size
        return (local_grad * grad_output,)
# %% ../../modules/05_autograd/autograd.ipynb 35
def enable_autograd():
"""
Enable gradient tracking for all Tensor operations.
This function enhances the existing Tensor class with autograd capabilities.
Call this once to activate gradients globally.
**What it does:**
- Replaces Tensor operations with gradient-tracking versions
- Adds backward() method for reverse-mode differentiation
- Enables computation graph building
- Maintains full backward compatibility
**After calling this:**
- Tensor operations will track computation graphs
- backward() method becomes available
- Gradients will flow through operations
- requires_grad=True enables tracking per tensor
**Example:**
```python
enable_autograd() # Call once
x = Tensor([2.0], requires_grad=True)
y = x * 3
y.backward()
print(x.grad) # [3.0]
```
"""
# Educational Note: hasattr() is LEGITIMATE here because:
# 1. This is a runtime monkey-patch system (meta-programming)
# 2. We're checking if a class has been dynamically modified
# 3. _autograd_enabled is a marker attribute we add at runtime
# This is the CORRECT use of hasattr() for dynamic class modification
if hasattr(Tensor, '_autograd_enabled'):
print("⚠️ Autograd already enabled")
return
# Store original operations
# These are guaranteed to exist from Module 01 (Tensor class)
_original_add = Tensor.__add__
_original_sub = Tensor.__sub__
_original_mul = Tensor.__mul__
_original_div = Tensor.__truediv__
_original_getitem = Tensor.__getitem__
# These methods are also guaranteed from Module 01 - trust Single Tensor Class
_original_matmul = Tensor.matmul
_original_transpose = Tensor.transpose
_original_reshape = Tensor.reshape
# Enhanced operations that track gradients
def tracked_add(self, other):
    """
    Gradient-tracking replacement for Tensor.__add__.

    Wraps a non-Tensor operand in a Tensor, delegates to the original
    __add__, and attaches an AddBackward node when either operand requires
    gradients.
    """
    rhs = other if isinstance(other, Tensor) else Tensor(other)
    result = _original_add(self, rhs)
    if self.requires_grad or rhs.requires_grad:
        result.requires_grad = True
        result._grad_fn = AddBackward(self, rhs)
    return result
def tracked_mul(self, other):
    """
    Gradient-tracking replacement for Tensor.__mul__.

    Delegates to the original __mul__ and attaches a MulBackward node when
    either operand requires gradients. `other` is passed on unconverted
    (MulBackward accepts a non-Tensor second operand), so no Tensor is built
    for scalar operands.
    """
    result = _original_mul(self, other)
    other_tracked = isinstance(other, Tensor) and other.requires_grad
    if self.requires_grad or other_tracked:
        result.requires_grad = True
        result._grad_fn = MulBackward(self, other)
    return result
def tracked_matmul(self, other):
    """
    Gradient-tracking replacement for Tensor.matmul.

    Delegates to the original matmul and attaches a MatmulBackward node when
    either operand requires gradients.
    """
    product = _original_matmul(self, other)
    needs_graph = self.requires_grad or other.requires_grad
    if needs_graph:
        product.requires_grad = True
        product._grad_fn = MatmulBackward(self, other)
    return product
def tracked_transpose(self, dim0=None, dim1=None):
    """
    Gradient-tracking replacement for Tensor.transpose.

    Delegates to the original transpose and, when this tensor requires
    gradients, attaches a TransposeBackward node that remembers the two dims.
    """
    transposed = _original_transpose(self, dim0, dim1)
    if not self.requires_grad:
        return transposed
    transposed.requires_grad = True
    transposed._grad_fn = TransposeBackward(self, dim0, dim1)
    return transposed
def tracked_reshape(self, *shape):
    """
    Gradient-tracking replacement for Tensor.reshape.

    Captures the shape before reshaping, delegates to the original reshape
    and, when this tensor requires gradients, attaches a ReshapeBackward node
    holding that earlier shape.
    """
    shape_before = self.shape
    reshaped = _original_reshape(self, *shape)
    if not self.requires_grad:
        return reshaped
    reshaped.requires_grad = True
    reshaped._grad_fn = ReshapeBackward(self, shape_before)
    return reshaped
def tracked_sub(self, other):
    """
    Gradient-tracking replacement for Tensor.__sub__.

    Wraps a non-Tensor operand in a Tensor, delegates to the original
    __sub__, and attaches a SubBackward node when either operand requires
    gradients.
    """
    rhs = other if isinstance(other, Tensor) else Tensor(other)
    result = _original_sub(self, rhs)
    if self.requires_grad or rhs.requires_grad:
        result.requires_grad = True
        result._grad_fn = SubBackward(self, rhs)
    return result
def tracked_div(self, other):
    """
    Gradient-tracking replacement for Tensor.__truediv__.

    Wraps a non-Tensor operand in a Tensor, delegates to the original
    __truediv__, and attaches a DivBackward node when either operand requires
    gradients.
    """
    rhs = other if isinstance(other, Tensor) else Tensor(other)
    result = _original_div(self, rhs)
    if self.requires_grad or rhs.requires_grad:
        result.requires_grad = True
        result._grad_fn = DivBackward(self, rhs)
    return result
def tracked_getitem(self, key):
    """
    Gradient-tracking replacement for Tensor.__getitem__.

    Delegates to the original __getitem__ and, when this tensor requires
    gradients, attaches a SliceBackward node that remembers the key.
    """
    selected = _original_getitem(self, key)
    if not self.requires_grad:
        return selected
    selected.requires_grad = True
    selected._grad_fn = SliceBackward(self, key)
    return selected
def sum_op(self, axis=None, keepdims=False):
    """
    Sum with gradient tracking (installed as Tensor.sum).

    Args:
        axis: Axis or tuple of axes to reduce, or None to reduce everything
        keepdims: Keep the reduced axes as size-1 axes in the result
    Returns:
        New Tensor holding np.sum(self.data, ...); it carries a SumBackward
        node when this tensor requires gradients.
    """
    result = Tensor(np.sum(self.data, axis=axis, keepdims=keepdims))
    if self.requires_grad:
        result.requires_grad = True
        grad_fn = SumBackward(self)
        if axis is not None and not keepdims:
            # The reduced axes are absent from the upstream gradient, so it
            # may not broadcast (or may broadcast along the wrong axis)
            # against the input shape. Re-insert them as size-1 axes before
            # SumBackward expands the gradient.
            expand_to_input = grad_fn.apply
            grad_fn.apply = lambda grad_output: expand_to_input(
                np.expand_dims(grad_output, axis)
            )
        result._grad_fn = grad_fn
    return result
def backward(self, gradient=None):
    """
    Compute gradients via reverse-mode backpropagation (installed as
    Tensor.backward).

    **Algorithm:**
    1. Default the gradient to ones for a single-element tensor
    2. Reduce the gradient to this tensor's shape (undo broadcasting)
    3. Accumulate it into self.grad
    4. If the tensor has a _grad_fn, ask it for the input gradients and
       recurse into every input that requires gradients

    Args:
        gradient: Gradient of the final output with respect to this tensor.
            May be an ndarray, a Tensor (its `.data` is used) or a scalar.
            Optional only when this tensor holds a single element.
    Raises:
        ValueError: if gradient is omitted for a tensor with more than one
            element.

    **Example:**
    ```python
    x = Tensor([2.0], requires_grad=True)
    y = x * 3
    y.backward()  # Computes gradients for x
    print(x.grad)  # [3.0]
    ```
    """
    if not self.requires_grad:
        return

    if gradient is None:
        if self.data.size != 1:
            raise ValueError(
                f"backward() called on non-scalar tensor without gradient argument.\n"
                f" Tensor shape: {self.shape}\n"
                f" Issue: For non-scalar outputs, you must provide the gradient from the next layer.\n"
                f" Fix: Call backward(gradient) with the gradient tensor from the loss function."
            )
        gradient = np.ones_like(self.data)
    elif isinstance(gradient, Tensor):
        # Accumulate raw arrays only; a Tensor must not end up inside self.grad
        gradient = gradient.data
    # Scalars and sequences need .shape/.ndim/.sum below
    gradient = np.asarray(gradient)

    if self.grad is None:
        self.grad = np.zeros_like(self.data)

    # Undo forward-pass broadcasting (e.g. a bias added to a whole batch)
    if gradient.shape != self.grad.shape:
        # Extra leading axes were added by broadcasting: sum them away
        while gradient.ndim > self.grad.ndim:
            gradient = gradient.sum(axis=0)
        # Axes that are size 1 here but were stretched in the forward pass
        for axis, (own, incoming) in enumerate(zip(self.grad.shape, gradient.shape)):
            if own == 1 and incoming != 1:
                gradient = gradient.sum(axis=axis, keepdims=True)
    self.grad += gradient

    # Leaf tensors have no _grad_fn and end the recursion
    grad_fn = getattr(self, '_grad_fn', None)
    if grad_fn is None:
        return
    input_grads = grad_fn.apply(gradient)
    for parent, parent_grad in zip(grad_fn.saved_tensors, input_grads):
        if isinstance(parent, Tensor) and parent.requires_grad and parent_grad is not None:
            parent.backward(parent_grad)
def zero_grad(self):
    """
    Discard the gradient stored on this tensor.

    The stored gradient is cleared by setting ``self.grad`` to ``None``
    (it is not filled with zeros). Call this before each backward pass
    so that gradients from a previous iteration are not accumulated
    into the next one.
    """
    self.grad = None
# Install enhanced operations: each (attribute, implementation) pair below
# is assigned onto the Tensor class, in this order.
for _attr_name, _implementation in (
    ("__add__", tracked_add),
    ("__sub__", tracked_sub),
    ("__mul__", tracked_mul),
    ("__truediv__", tracked_div),
    ("__getitem__", tracked_getitem),
    ("matmul", tracked_matmul),
    ("transpose", tracked_transpose),
    ("reshape", tracked_reshape),
    ("sum", sum_op),
    ("backward", backward),
    ("zero_grad", zero_grad),
):
    setattr(Tensor, _attr_name, _implementation)
# Patch activations and losses to track gradients.
# The imports are performed inside the try so that a missing
# activations/losses module is tolerated by the `except ImportError` below.
try:
from tinytorch.core.activations import Sigmoid, ReLU, Softmax, GELU
from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss, CrossEntropyLoss
# Store original (unpatched) forward methods before they are replaced.
# NOTE(review): in this section only the softmax and GELU originals are
# called by their tracked replacements; the others are saved but not
# referenced here — confirm whether they are needed elsewhere.
_original_sigmoid_forward = Sigmoid.forward
_original_relu_forward = ReLU.forward
_original_softmax_forward = Softmax.forward
_original_gelu_forward = GELU.forward
_original_bce_forward = BinaryCrossEntropyLoss.forward
_original_mse_forward = MSELoss.forward
_original_ce_forward = CrossEntropyLoss.forward
def tracked_sigmoid_forward(self, x):
    """Sigmoid with gradient tracking.

    Computes the logistic function ``1 / (1 + exp(-v))`` element-wise on
    ``x.data`` and wraps the result in a new ``Tensor``.

    The value is evaluated in a numerically stable form: the naive
    expression calls ``exp(-v)``, which overflows (RuntimeWarning, or an
    error under strict ``np.errstate``) for large-magnitude negative
    inputs. Using ``exp(-|v|)`` keeps the exponent non-positive, so the
    exponential always lies in [0, 1].

    Args:
        x: Input tensor; its ``data`` array is read and ``requires_grad``
            decides whether a backward node is attached.

    Returns:
        A new ``Tensor`` holding the sigmoid values. When
        ``x.requires_grad`` is true, the result has ``requires_grad=True``
        and ``_grad_fn`` set to ``SigmoidBackward(x, result)``.
    """
    values = x.data
    # exp of a non-positive number cannot overflow.
    decay = np.exp(-np.abs(values))
    # v >= 0:  1 / (1 + exp(-v))
    # v <  0:  exp(v) / (1 + exp(v))   (algebraically the same function)
    result_data = np.where(values >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    result = Tensor(result_data)
    if x.requires_grad:
        result.requires_grad = True
        result._grad_fn = SigmoidBackward(x, result)
    return result
def tracked_relu_forward(self, x):
    """ReLU with gradient tracking.

    Applies ``max(0, v)`` element-wise to ``x.data`` and returns the
    result as a new ``Tensor``. If ``x.requires_grad`` is true, the
    output is flagged as requiring grad and gets a ``ReLUBackward(x)``
    node as its ``_grad_fn``.
    """
    out = Tensor(np.maximum(0, x.data))
    if not x.requires_grad:
        # Nothing to track for inputs outside the graph
        return out
    out.requires_grad = True
    out._grad_fn = ReLUBackward(x)
    return out
def tracked_softmax_forward(self, x, dim=-1):
    """Softmax with gradient tracking.

    Delegates the forward computation to the saved, unpatched
    ``Softmax.forward`` and then, when ``x.requires_grad`` is true,
    marks the output as requiring grad and sets its ``_grad_fn`` to
    ``SoftmaxBackward(x, output, dim)``.
    """
    # Forward values come from the original implementation
    out = _original_softmax_forward(self, x, dim=dim)
    if not x.requires_grad:
        return out
    # Set the softmax backward node as the output's _grad_fn
    out.requires_grad = True
    out._grad_fn = SoftmaxBackward(x, out, dim)
    return out
def tracked_gelu_forward(self, x):
    """GELU with gradient tracking.

    Delegates the forward computation to the saved, unpatched
    ``GELU.forward`` and then, when ``x.requires_grad`` is true, marks
    the output as requiring grad and sets its ``_grad_fn`` to
    ``GELUBackward(x)``.
    """
    # Forward values come from the original implementation
    out = _original_gelu_forward(self, x)
    if not x.requires_grad:
        return out
    # Set the GELU backward node as the output's _grad_fn
    out.requires_grad = True
    out._grad_fn = GELUBackward(x)
    return out
def tracked_bce_forward(self, predictions, targets):
    """Binary cross-entropy with gradient tracking.

    Clips ``predictions.data`` into ``[EPSILON, 1 - EPSILON]`` so both
    logarithms are finite, evaluates
    ``-(t * log(p) + (1 - t) * log(1 - p))`` per element, and returns the
    mean as a new ``Tensor``. When ``predictions.requires_grad`` is true,
    the result requires grad and its ``_grad_fn`` is
    ``BCEBackward(predictions, targets)``.
    """
    lower = EPSILON
    # Keep probabilities away from 0 and 1 before taking logs
    probs = np.clip(predictions.data, lower, 1 - lower)
    log_p = np.log(probs)
    log_not_p = np.log(1 - probs)
    per_sample = -(targets.data * log_p + (1 - targets.data) * log_not_p)
    loss = Tensor(np.mean(per_sample))
    if not predictions.requires_grad:
        return loss
    loss.requires_grad = True
    loss._grad_fn = BCEBackward(predictions, targets)
    return loss
def tracked_mse_forward(self, predictions, targets):
    """MSE loss with gradient tracking.

    Returns a new ``Tensor`` holding the mean of the squared element-wise
    difference between ``predictions.data`` and ``targets.data``. When
    ``predictions.requires_grad`` is true, the result requires grad and
    its ``_grad_fn`` is ``MSEBackward(predictions, targets)``.
    """
    residual = predictions.data - targets.data
    loss = Tensor(np.mean(residual ** 2))
    if not predictions.requires_grad:
        return loss
    loss.requires_grad = True
    loss._grad_fn = MSEBackward(predictions, targets)
    return loss
def tracked_ce_forward(self, logits, targets):
    """Cross-entropy loss with gradient tracking.

    Takes the log-softmax of ``logits`` along the last axis, picks the
    entry at ``[i, targets[i]]`` for each row ``i`` in
    ``range(logits.shape[0])`` (targets are cast to ``int`` for
    indexing), and returns the negated mean of those entries as a new
    ``Tensor``. When ``logits.requires_grad`` is true, the result
    requires grad and its ``_grad_fn`` is
    ``CrossEntropyBackward(logits, targets)``.
    """
    from tinytorch.core.losses import log_softmax

    # Log-softmax rather than log(softmax(...)) for numerical stability
    log_probs = log_softmax(logits, dim=-1)
    num_rows = logits.shape[0]
    class_ids = targets.data.astype(int)
    # One log-probability per row: the one belonging to the target class
    picked = log_probs.data[np.arange(num_rows), class_ids]
    loss = Tensor(-np.mean(picked))
    if not logits.requires_grad:
        return loss
    loss.requires_grad = True
    loss._grad_fn = CrossEntropyBackward(logits, targets)
    return loss
# Install patched methods: replace each class's forward with the
# gradient-tracking function of the matching name. These assignments come
# after both imports in the try body, so if either import fails none of
# them is executed.
Sigmoid.forward = tracked_sigmoid_forward
ReLU.forward = tracked_relu_forward
Softmax.forward = tracked_softmax_forward
GELU.forward = tracked_gelu_forward
BinaryCrossEntropyLoss.forward = tracked_bce_forward
MSELoss.forward = tracked_mse_forward
CrossEntropyLoss.forward = tracked_ce_forward
except ImportError:
# Activations/losses not yet available (happens during module development).
# Deliberate best-effort: the failure is ignored and no activation or loss
# forward method is replaced.
pass
# Record on the Tensor class that autograd has been enabled
Tensor._autograd_enabled = True
# Print the status banner, one line per print() call
for _banner_line in (
    "✅ Autograd enabled! Tensors now track gradients.",
    " - Operations build computation graphs",
    " - backward() computes gradients",
    " - requires_grad=True enables tracking",
):
    print(_banner_line)
# Auto-enable when module is imported: this call is a module-level statement,
# so it runs as an import-time side effect of importing this module.
enable_autograd()