mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-12 00:23:34 -05:00
Re-exported all modules after restructuring: - Updated _modidx.py with new module locations - Removed outdated autogeneration headers - Updated all core modules (tensor, autograd, layers, etc.) - Updated optimization modules (quantization, compression, etc.) - Updated TITO commands for new structure Changes include: - 24 tinytorch/ module files - 24 tito/ command and core files - Updated references from modules/source/ to modules/ All modules re-exported via nbdev from their new locations.
693 lines
23 KiB
Python
Generated
693 lines
23 KiB
Python
Generated
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/05_autograd/autograd_dev.ipynb.
|
||
|
||
# %% auto 0
|
||
__all__ = ['Function', 'AddBackward', 'MulBackward', 'MatmulBackward', 'SumBackward', 'ReLUBackward', 'SigmoidBackward',
|
||
'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
|
||
|
||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 1
|
||
import numpy as np
|
||
from typing import Optional, List, Tuple
|
||
import sys
|
||
import os
|
||
|
||
from .tensor import Tensor
|
||
|
||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 6
|
||
class Function:
    """
    Base class for differentiable operations.

    Every gradient-producing operation (add, multiply, matmul, ...)
    subclasses this and implements apply(), which pushes the upstream
    gradient through the chain rule.

    Attributes:
        saved_tensors: Inputs captured at construction for the backward pass.
        next_functions: Upstream grad functions, linking the computation graph.
    """

    def __init__(self, *tensors):
        """
        Capture input tensors and wire up computation-graph connections.

        Args:
            *tensors: Inputs saved for later use in apply().
        """
        self.saved_tensors = tensors
        # Link to the grad functions of any gradient-requiring inputs.
        self.next_functions = [
            t._grad_fn
            for t in tensors
            if isinstance(t, Tensor) and t.requires_grad and hasattr(t, '_grad_fn')
        ]

    def apply(self, grad_output):
        """
        Compute gradients for the saved inputs.

        Args:
            grad_output: Gradient flowing backward from the output.

        Returns:
            Tuple of gradients, one per saved input.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError("Each Function must implement apply() method")
|
||
|
||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 9
|
||
class AddBackward(Function):
    """
    Backward pass for element-wise addition.

    For z = a + b, dz/da = 1 and dz/db = 1: addition distributes the
    upstream gradient unchanged to each input that requires it.
    """

    def apply(self, grad_output):
        """
        Route the upstream gradient to both addends.

        Args:
            grad_output: Gradient of the loss w.r.t. the sum.

        Returns:
            (grad_a, grad_b); an entry is None when that input does not
            require gradients.
        """
        a, b = self.saved_tensors

        def needs_grad(t):
            return isinstance(t, Tensor) and t.requires_grad

        # d(a+b)/da = d(a+b)/db = 1, so the gradient passes straight through.
        grad_a = grad_output if needs_grad(a) else None
        grad_b = grad_output if needs_grad(b) else None
        return grad_a, grad_b
|
||
|
||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 11
|
||
class MulBackward(Function):
    """
    Backward pass for element-wise multiplication.

    Product rule: for z = a * b, dz/da = b and dz/db = a, so each input's
    gradient is the upstream gradient scaled by the OTHER factor.
    """

    def apply(self, grad_output):
        """
        Compute gradients for both factors.

        Args:
            grad_output: Gradient of the loss w.r.t. the product.

        Returns:
            (grad_a, grad_b); an entry is None when that input does not
            require gradients.
        """
        a, b = self.saved_tensors
        grad_a = None
        grad_b = None

        if isinstance(a, Tensor) and a.requires_grad:
            # Scale by the other factor; b may be a raw scalar rather
            # than a Tensor.
            other_value = b.data if isinstance(b, Tensor) else b
            grad_a = grad_output * other_value

        if isinstance(b, Tensor) and b.requires_grad:
            grad_b = grad_output * a.data

        return grad_a, grad_b
|
||
|
||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 14
|
||
class MatmulBackward(Function):
    """
    Backward pass for matrix multiplication.

    For Z = A @ B:
        dL/dA = dL/dZ @ B.T
        dL/dB = A.T @ dL/dZ

    This is the core gradient of linear layers: one input is transposed
    and multiplied against the upstream gradient.
    """

    def apply(self, grad_output):
        """
        Compute gradients for both matrix inputs.

        Args:
            grad_output: Gradient of the loss w.r.t. the matrix product.

        Returns:
            (grad_a, grad_b); an entry is None when that input does not
            require gradients.
        """
        a, b = self.saved_tensors

        # dL/dA = grad_output @ B.T
        grad_a = (
            np.dot(grad_output, b.data.T)
            if isinstance(a, Tensor) and a.requires_grad
            else None
        )
        # dL/dB = A.T @ grad_output
        grad_b = (
            np.dot(a.data.T, grad_output)
            if isinstance(b, Tensor) and b.requires_grad
            else None
        )
        return grad_a, grad_b
|
||
|
||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 16
|
||
class SumBackward(Function):
    """
    Backward pass for tensor summation.

    Every element contributes with weight 1 to the sum, so the upstream
    gradient is spread back over the input's full shape.

    NOTE(review): apply() relies on grad_output broadcasting against the
    input shape. That holds for a full reduction (scalar grad_output) or
    keepdims=True, but an axis reduction with keepdims=False would raise
    a broadcast error — confirm callers never hit that combination.
    """

    def apply(self, grad_output):
        """
        Broadcast the upstream gradient back to the input's shape.

        Args:
            grad_output: Gradient of the loss w.r.t. the reduced output.

        Returns:
            One-element tuple with the input gradient, or (None,).
        """
        (tensor,) = self.saved_tensors

        if not (isinstance(tensor, Tensor) and tensor.requires_grad):
            return (None,)

        # d(sum)/dx[i] = 1 for every element, scaled by the upstream grad.
        return (np.ones_like(tensor.data) * grad_output,)
|
||
|
||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23
|
||
class ReLUBackward(Function):
    """
    Backward pass for ReLU: f(x) = max(0, x).

    The derivative is 1 where the input was positive and 0 elsewhere,
    so the upstream gradient is simply masked.
    """

    def __init__(self, input_tensor):
        """Save the input; its sign pattern determines the gradient mask."""
        super().__init__(input_tensor)

    def apply(self, grad_output):
        """Mask the upstream gradient by where the input was positive."""
        (tensor,) = self.saved_tensors

        if not (isinstance(tensor, Tensor) and tensor.requires_grad):
            return (None,)

        # 1.0 where x > 0, 0.0 elsewhere.
        mask = (tensor.data > 0).astype(np.float32)
        return (grad_output * mask,)
|
||
|
||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 25
|
||
class SigmoidBackward(Function):
    """
    Backward pass for sigmoid: σ(x) = 1 / (1 + exp(-x)).

    Uses the identity σ'(x) = σ(x) * (1 - σ(x)); the forward output is
    cached at construction so σ(x) never has to be recomputed.
    """

    def __init__(self, input_tensor, output_tensor):
        """
        Save the input and cache the forward output.

        Args:
            input_tensor: Original input to sigmoid.
            output_tensor: σ(input); cached to avoid recomputation.
        """
        super().__init__(input_tensor)
        self.output_data = output_tensor.data

    def apply(self, grad_output):
        """Scale the upstream gradient by σ(x) * (1 - σ(x))."""
        (tensor,) = self.saved_tensors

        if not (isinstance(tensor, Tensor) and tensor.requires_grad):
            return (None,)

        local_grad = self.output_data * (1 - self.output_data)
        return (grad_output * local_grad,)
|
||
|
||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 26
|
||
class MSEBackward(Function):
    """
    Backward pass for mean squared error.

    L = mean((p - y)^2)  →  dL/dp = 2 * (p - y) / N
    where N is the total number of target elements.
    """

    def __init__(self, predictions, targets):
        """Save predictions; stash target values and the element count."""
        super().__init__(predictions)
        self.targets_data = targets.data
        self.num_samples = np.size(targets.data)

    def apply(self, grad_output):
        """Return the MSE gradient w.r.t. predictions, scaled upstream."""
        (predictions,) = self.saved_tensors

        if not (isinstance(predictions, Tensor) and predictions.requires_grad):
            return (None,)

        # dL/dp = 2 * (p - y) / N
        local_grad = 2.0 * (predictions.data - self.targets_data) / self.num_samples
        return (local_grad * grad_output,)
|
||
|
||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 27
|
||
class BCEBackward(Function):
    """
    Backward pass for binary cross-entropy.

    L = -[y*log(p) + (1-y)*log(1-p)]  →  dL/dp = (p - y) / (p*(1-p)*N)

    Predictions are clipped away from 0 and 1 to avoid division by zero.
    """

    def __init__(self, predictions, targets):
        """Save predictions; stash target values and the element count."""
        super().__init__(predictions)
        self.targets_data = targets.data
        self.num_samples = np.size(targets.data)

    def apply(self, grad_output):
        """Return the BCE gradient w.r.t. predictions, scaled upstream."""
        (predictions,) = self.saved_tensors

        if not (isinstance(predictions, Tensor) and predictions.requires_grad):
            return (None,)

        # Keep p strictly inside (0, 1) so p*(1-p) never hits zero.
        eps = 1e-7
        p = np.clip(predictions.data, eps, 1 - eps)
        y = self.targets_data

        # dL/dp = (p - y) / (p * (1-p) * N)
        local_grad = (p - y) / (p * (1 - p) * self.num_samples)
        return (local_grad * grad_output,)
|
||
|
||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28
|
||
class CrossEntropyBackward(Function):
    """
    Backward pass for cross-entropy over logits.

    For L = -mean(log_softmax(logits)[targets]) the gradient w.r.t. the
    logits collapses to the elegant form:

        dL/dlogits = (softmax(logits) - one_hot(targets)) / N

    i.e. the difference between predicted probabilities and the target
    distribution, scaled by batch size. Computed via max-shifted softmax
    for numerical stability.
    """

    def __init__(self, logits, targets):
        """Save logits; record target class indices and batch geometry."""
        super().__init__(logits)
        self.targets_data = targets.data.astype(int)
        self.batch_size = logits.data.shape[0]
        self.num_classes = logits.data.shape[1]

    def apply(self, grad_output):
        """Return (softmax - one_hot) / batch_size, scaled upstream."""
        (logits,) = self.saved_tensors

        if not (isinstance(logits, Tensor) and logits.requires_grad):
            return (None,)

        # Numerically stable softmax: subtract the row-wise max before exp.
        raw = logits.data
        shifted_exp = np.exp(raw - np.max(raw, axis=1, keepdims=True))
        probs = shifted_exp / np.sum(shifted_exp, axis=1, keepdims=True)

        # One-hot encode the target classes.
        one_hot = np.zeros((self.batch_size, self.num_classes), dtype=np.float32)
        one_hot[np.arange(self.batch_size), self.targets_data] = 1.0

        local_grad = (probs - one_hot) / self.batch_size
        return (local_grad * grad_output,)
|
||
|
||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29
|
||
def enable_autograd():
    """
    Enable gradient tracking for all Tensor operations.

    Monkey-patches the Tensor class (and, when importable, the activation
    and loss classes) with gradient-tracking versions of their operations.
    Call once to activate autograd globally; repeated calls are no-ops.

    **What it does:**
    - Replaces Tensor.__add__ / __mul__ / matmul with graph-building wrappers
    - Adds sum(), backward() and zero_grad() methods to Tensor
    - Patches Sigmoid/ReLU and the loss classes when they can be imported
    - Maintains backward compatibility: tensors without requires_grad
      behave exactly as before

    **Example:**
    ```python
    enable_autograd()  # Call once
    x = Tensor([2.0], requires_grad=True)
    y = x * 3
    y.backward()
    print(x.grad)  # [3.0]
    ```
    """

    # Idempotence guard: a second call would wrap the already-wrapped
    # operations, so bail out if the marker attribute is present.
    if hasattr(Tensor, '_autograd_enabled'):
        print("⚠️ Autograd already enabled")
        return

    # Keep references to the untracked operations; the wrappers below
    # delegate the actual arithmetic to these.
    _original_add = Tensor.__add__
    _original_mul = Tensor.__mul__
    _original_matmul = Tensor.matmul if hasattr(Tensor, 'matmul') else None

    # Enhanced operations that track gradients
    def tracked_add(self, other):
        """
        Addition with gradient tracking.

        Delegates to the original __add__, then attaches an AddBackward
        node when either operand requires gradients.
        """
        # Convert scalar to Tensor if needed
        if not isinstance(other, Tensor):
            other = Tensor(other)

        # Call original operation
        result = _original_add(self, other)

        # Track gradient if needed
        if self.requires_grad or other.requires_grad:
            result.requires_grad = True
            result._grad_fn = AddBackward(self, other)

        return result

    def tracked_mul(self, other):
        """
        Multiplication with gradient tracking.

        Delegates to the original __mul__, then attaches a MulBackward
        node when either operand requires gradients.
        """
        # Convert scalar to Tensor if needed for consistency
        # NOTE(review): other_tensor is computed but never used — the raw
        # `other` is passed to both _original_mul and MulBackward below;
        # confirm whether this conversion was meant to be used.
        if not isinstance(other, Tensor):
            other_tensor = Tensor(other)
        else:
            other_tensor = other

        # Call original operation
        result = _original_mul(self, other)

        # Track gradient if needed
        if self.requires_grad or (isinstance(other, Tensor) and other.requires_grad):
            result.requires_grad = True
            result._grad_fn = MulBackward(self, other)

        return result

    def tracked_matmul(self, other):
        """
        Matrix multiplication with gradient tracking.

        Uses the original matmul when the Tensor class defines one,
        otherwise falls back to np.dot on the raw arrays.
        """
        if _original_matmul:
            result = _original_matmul(self, other)
        else:
            # Fallback if matmul doesn't exist
            result = Tensor(np.dot(self.data, other.data))

        # Track gradient if needed
        if self.requires_grad or other.requires_grad:
            result.requires_grad = True
            result._grad_fn = MatmulBackward(self, other)

        return result

    def sum_op(self, axis=None, keepdims=False):
        """
        Sum operation with gradient tracking.

        NOTE(review): SumBackward is not told about `axis`/`keepdims`, so
        its backward pass only broadcasts correctly for a full reduction
        or keepdims=True — confirm callers never combine an axis
        reduction with keepdims=False on a tracked tensor.
        """
        result_data = np.sum(self.data, axis=axis, keepdims=keepdims)
        result = Tensor(result_data)

        if self.requires_grad:
            result.requires_grad = True
            result._grad_fn = SumBackward(self)

        return result

    def backward(self, gradient=None):
        """
        Compute gradients via reverse-mode automatic differentiation.

        **Algorithm:**
        1. Default the seed gradient to ones for single-element outputs
        2. Reduce the incoming gradient to this tensor's shape (undoing
           broadcasting) and accumulate it into self.grad
        3. Ask this tensor's _grad_fn for the input gradients
        4. Recurse into each parent tensor that requires gradients

        Args:
            gradient: Upstream gradient; may be omitted only for
                single-element outputs.

        Raises:
            ValueError: If gradient is omitted for a non-scalar output.
        """
        # Only compute gradients if required
        if not self.requires_grad:
            return

        # Initialize gradient if not provided (for scalar outputs)
        if gradient is None:
            if self.data.size == 1:
                gradient = np.ones_like(self.data)
            else:
                raise ValueError("backward() requires gradient for non-scalar outputs")

        # Initialize or accumulate gradient
        if self.grad is None:
            self.grad = np.zeros_like(self.data)

        # Handle broadcasting: sum gradient to match self.data shape.
        # This happens when operations broadcast tensors (e.g., adding bias to batch).
        if gradient.shape != self.grad.shape:
            # Step 1: Remove extra leading dimensions added during forward pass
            # Example: gradient (batch_size, features) → self.grad (features,)
            while gradient.ndim > self.grad.ndim:
                gradient = gradient.sum(axis=0)

            # Step 2: Sum over dimensions that were size-1 in original tensor
            # Example: bias with shape (1,) broadcast to (batch_size,) during forward
            for i in range(gradient.ndim):
                if self.grad.shape[i] == 1 and gradient.shape[i] != 1:
                    gradient = gradient.sum(axis=i, keepdims=True)

        self.grad += gradient

        # Propagate gradients through computation graph.
        # NOTE(review): the (possibly shape-reduced) `gradient` is what is
        # fed to _grad_fn.apply — this looks intentional, since the grad_fn
        # produced an output of this tensor's shape; confirm.
        if hasattr(self, '_grad_fn') and self._grad_fn:
            grads = self._grad_fn.apply(gradient)

            # Recursively call backward on parent tensors
            for tensor, grad in zip(self._grad_fn.saved_tensors, grads):
                if isinstance(tensor, Tensor) and tensor.requires_grad and grad is not None:
                    tensor.backward(grad)

    def zero_grad(self):
        """
        Reset this tensor's gradient to None.

        Call before each backward pass to prevent gradient accumulation
        across training iterations.
        """
        self.grad = None

    # Install enhanced operations
    Tensor.__add__ = tracked_add
    Tensor.__mul__ = tracked_mul
    Tensor.matmul = tracked_matmul
    Tensor.sum = sum_op
    Tensor.backward = backward
    Tensor.zero_grad = zero_grad

    # Patch activations and losses to track gradients
    try:
        from tinytorch.core.activations import Sigmoid, ReLU
        from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss, CrossEntropyLoss

        # Store original methods
        # NOTE(review): these originals are captured but never restored or
        # called afterwards — presumably kept for debugging/symmetry with
        # the Tensor patching above; confirm before removing.
        _original_sigmoid_forward = Sigmoid.forward
        _original_relu_forward = ReLU.forward
        _original_bce_forward = BinaryCrossEntropyLoss.forward
        _original_mse_forward = MSELoss.forward
        _original_ce_forward = CrossEntropyLoss.forward

        def tracked_sigmoid_forward(self, x):
            """Sigmoid with gradient tracking."""
            result_data = 1.0 / (1.0 + np.exp(-x.data))
            result = Tensor(result_data)

            if x.requires_grad:
                result.requires_grad = True
                # Pass the output too so backward can reuse σ(x).
                result._grad_fn = SigmoidBackward(x, result)

            return result

        def tracked_relu_forward(self, x):
            """ReLU with gradient tracking."""
            result_data = np.maximum(0, x.data)
            result = Tensor(result_data)

            if x.requires_grad:
                result.requires_grad = True
                result._grad_fn = ReLUBackward(x)

            return result

        def tracked_bce_forward(self, predictions, targets):
            """Binary cross-entropy with gradient tracking."""
            # Compute BCE loss; clip predictions to avoid log(0).
            eps = 1e-7
            clamped_preds = np.clip(predictions.data, eps, 1 - eps)
            log_preds = np.log(clamped_preds)
            log_one_minus_preds = np.log(1 - clamped_preds)
            bce_per_sample = -(targets.data * log_preds + (1 - targets.data) * log_one_minus_preds)
            bce_loss = np.mean(bce_per_sample)

            result = Tensor(bce_loss)

            if predictions.requires_grad:
                result.requires_grad = True
                result._grad_fn = BCEBackward(predictions, targets)

            return result

        def tracked_mse_forward(self, predictions, targets):
            """MSE loss with gradient tracking."""
            # Compute MSE loss
            diff = predictions.data - targets.data
            squared_diff = diff ** 2
            mse = np.mean(squared_diff)

            result = Tensor(mse)

            if predictions.requires_grad:
                result.requires_grad = True
                result._grad_fn = MSEBackward(predictions, targets)

            return result

        def tracked_ce_forward(self, logits, targets):
            """Cross-entropy loss with gradient tracking."""
            from tinytorch.core.losses import log_softmax

            # Compute log-softmax for numerical stability
            log_probs = log_softmax(logits, dim=-1)

            # Select log-probabilities for correct classes
            batch_size = logits.shape[0]
            target_indices = targets.data.astype(int)
            selected_log_probs = log_probs.data[np.arange(batch_size), target_indices]

            # Return negative mean
            ce_loss = -np.mean(selected_log_probs)

            result = Tensor(ce_loss)

            if logits.requires_grad:
                result.requires_grad = True
                result._grad_fn = CrossEntropyBackward(logits, targets)

            return result

        # Install patched methods
        Sigmoid.forward = tracked_sigmoid_forward
        ReLU.forward = tracked_relu_forward
        BinaryCrossEntropyLoss.forward = tracked_bce_forward
        MSELoss.forward = tracked_mse_forward
        CrossEntropyLoss.forward = tracked_ce_forward

    except ImportError:
        # Activations/losses not yet available (happens during module development)
        pass

    # Mark as enabled
    Tensor._autograd_enabled = True

    print("✅ Autograd enabled! Tensors now track gradients.")
    print(" - Operations build computation graphs")
    print(" - backward() computes gradients")
    print(" - requires_grad=True enables tracking")
|
||
|
||
# Auto-enable when module is imported: importing this module globally
# patches Tensor (and, when available, the activation/loss classes)
# with gradient-tracking versions.
enable_autograd()
|