feat: Add CrossEntropyLoss autograd support + Milestone 03 MLP on digits

Key Changes:
- Implemented CrossEntropyBackward for gradient computation
- Integrated CrossEntropyLoss into enable_autograd() patching
- Created comprehensive loss gradient test suite
- Milestone 03: MLP digits classifier (77.5% accuracy)
- Shipped tiny 8x8 digits dataset (67KB) for instant demos
- Updated DataLoader module with ASCII visualizations

Tests:
- All three losses (MSE, BCE, CrossEntropy) now support gradient flow
- MLP learns digit classification (accuracy: 6.9% → 77.5%)
- Integration tests pass

Technical:
- CrossEntropyBackward: gradient is (softmax - one_hot) / N
- Numerically stable via log-softmax
- Works with raw class labels (no one-hot needed)
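
For intuition, here is a minimal standalone NumPy check (illustrative only, not part of this commit) that the analytic gradient (softmax - one_hot) / N matches a finite-difference estimate:

import numpy as np

rng = np.random.default_rng(0)
logits = rng.normal(size=(4, 3))          # N=4 samples, C=3 classes
targets = np.array([0, 2, 1, 2])
N, C = logits.shape

def ce(z):
    # stable log-softmax, then mean negative log-likelihood
    z = z - z.max(axis=1, keepdims=True)
    log_probs = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    return -log_probs[np.arange(N), targets].mean()

# analytic gradient: (softmax - one_hot) / N
exp_z = np.exp(logits - logits.max(axis=1, keepdims=True))
softmax = exp_z / exp_z.sum(axis=1, keepdims=True)
one_hot = np.zeros((N, C))
one_hot[np.arange(N), targets] = 1.0
analytic = (softmax - one_hot) / N

# numeric gradient via central differences
numeric = np.zeros_like(logits)
eps = 1e-6
for i in range(N):
    for j in range(C):
        zp, zm = logits.copy(), logits.copy()
        zp[i, j] += eps
        zm[i, j] -= eps
        numeric[i, j] = (ce(zp) - ce(zm)) / (2 * eps)

print(np.abs(analytic - numeric).max())   # ~1e-10: the two gradients agree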
Author: Vijay Janapa Reddi
Date: 2025-09-30 16:22:09 -04:00
parent 1c26ce5164
commit 6187725af3
15 changed files with 1015 additions and 353 deletions

tinytorch/_modidx.py (generated)

@@ -114,7 +114,9 @@ d = { 'settings': { 'branch': 'main',
'tinytorch.core.losses.MSELoss.forward': ( '04_losses/losses_dev.html#mseloss.forward',
'tinytorch/core/losses.py'),
'tinytorch.core.losses.import_previous_module': ( '04_losses/losses_dev.html#import_previous_module',
'tinytorch/core/losses.py')},
'tinytorch/core/losses.py'),
'tinytorch.core.losses.log_softmax': ( '04_losses/losses_dev.html#log_softmax',
'tinytorch/core/losses.py')},
'tinytorch.core.optimizers': { 'tinytorch.core.optimizers.Adam': ( '06_optimizers/optimizers_dev.html#adam',
'tinytorch/core/optimizers.py'),
'tinytorch.core.optimizers.Adam.__init__': ( '06_optimizers/optimizers_dev.html#adam.__init__',
@@ -201,4 +203,28 @@ d = { 'settings': { 'branch': 'main',
'tinytorch.core.training.Trainer.save_checkpoint': ( '07_training/training_dev.html#trainer.save_checkpoint',
'tinytorch/core/training.py'),
'tinytorch.core.training.Trainer.train_epoch': ( '07_training/training_dev.html#trainer.train_epoch',
'tinytorch/core/training.py')}}}
'tinytorch/core/training.py')},
'tinytorch.data.loader': { 'tinytorch.data.loader.DataLoader': ( '08_dataloader/dataloader_dev.html#dataloader',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.DataLoader.__init__': ( '08_dataloader/dataloader_dev.html#dataloader.__init__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.DataLoader.__iter__': ( '08_dataloader/dataloader_dev.html#dataloader.__iter__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.DataLoader.__len__': ( '08_dataloader/dataloader_dev.html#dataloader.__len__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.DataLoader._collate_batch': ( '08_dataloader/dataloader_dev.html#dataloader._collate_batch',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.Dataset': ( '08_dataloader/dataloader_dev.html#dataset',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.Dataset.__getitem__': ( '08_dataloader/dataloader_dev.html#dataset.__getitem__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.Dataset.__len__': ( '08_dataloader/dataloader_dev.html#dataset.__len__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.TensorDataset': ( '08_dataloader/dataloader_dev.html#tensordataset',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.TensorDataset.__getitem__': ( '08_dataloader/dataloader_dev.html#tensordataset.__getitem__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.TensorDataset.__init__': ( '08_dataloader/dataloader_dev.html#tensordataset.__init__',
'tinytorch/data/loader.py'),
'tinytorch.data.loader.TensorDataset.__len__': ( '08_dataloader/dataloader_dev.html#tensordataset.__len__',
'tinytorch/data/loader.py')}}}

tinytorch/core/autograd.py

@@ -16,7 +16,7 @@
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['Function', 'AddBackward', 'MulBackward', 'MatmulBackward', 'SumBackward', 'ReLUBackward', 'SigmoidBackward',
'MSEBackward', 'BCEBackward', 'enable_autograd']
'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 1
import numpy as np
@@ -350,6 +350,51 @@ class BCEBackward(Function):
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 24
class CrossEntropyBackward(Function):
"""
Gradient computation for Cross-Entropy Loss.
CrossEntropy: L = -mean(log_softmax(logits)[targets])
The gradient with respect to logits is remarkably elegant:
∂L/∂logits = (softmax(logits) - one_hot(targets)) / N
This is one of the most beautiful results in machine learning:
- The gradient is simply the difference between predictions and targets
- It naturally scales with how wrong we are
- It's numerically stable when computed via softmax
"""
def __init__(self, logits, targets):
"""Initialize with logits and target class indices."""
super().__init__(logits)
self.targets_data = targets.data.astype(int)
self.batch_size = logits.data.shape[0]
self.num_classes = logits.data.shape[1]
def apply(self, grad_output):
"""Compute gradient for cross-entropy loss."""
logits, = self.saved_tensors
if isinstance(logits, Tensor) and logits.requires_grad:
# Compute softmax probabilities
# Using stable softmax: subtract max for numerical stability
logits_data = logits.data
max_logits = np.max(logits_data, axis=1, keepdims=True)
exp_logits = np.exp(logits_data - max_logits)
softmax = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
# Create one-hot encoding of targets
one_hot = np.zeros((self.batch_size, self.num_classes), dtype=np.float32)
one_hot[np.arange(self.batch_size), self.targets_data] = 1.0
# Gradient: (softmax - one_hot) / batch_size
grad = (softmax - one_hot) / self.batch_size
return grad * grad_output,
return None,
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 25
def enable_autograd():
"""
Enable gradient tracking for all Tensor operations.
@@ -551,13 +596,14 @@ def enable_autograd():
# Patch activations and losses to track gradients
try:
from tinytorch.core.activations import Sigmoid, ReLU
from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss
from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss, CrossEntropyLoss
# Store original methods
_original_sigmoid_forward = Sigmoid.forward
_original_relu_forward = ReLU.forward
_original_bce_forward = BinaryCrossEntropyLoss.forward
_original_mse_forward = MSELoss.forward
_original_ce_forward = CrossEntropyLoss.forward
def tracked_sigmoid_forward(self, x):
"""Sigmoid with gradient tracking."""
@@ -614,11 +660,35 @@ def enable_autograd():
return result
def tracked_ce_forward(self, logits, targets):
"""Cross-entropy loss with gradient tracking."""
from tinytorch.core.losses import log_softmax
# Compute log-softmax for numerical stability
log_probs = log_softmax(logits, dim=-1)
# Select log-probabilities for correct classes
batch_size = logits.shape[0]
target_indices = targets.data.astype(int)
selected_log_probs = log_probs.data[np.arange(batch_size), target_indices]
# Return negative mean
ce_loss = -np.mean(selected_log_probs)
result = Tensor(ce_loss)
if logits.requires_grad:
result.requires_grad = True
result._grad_fn = CrossEntropyBackward(logits, targets)
return result
# Install patched methods
Sigmoid.forward = tracked_sigmoid_forward
ReLU.forward = tracked_relu_forward
BinaryCrossEntropyLoss.forward = tracked_bce_forward
MSELoss.forward = tracked_mse_forward
CrossEntropyLoss.forward = tracked_ce_forward
except ImportError:
# Activations/losses not yet available (happens during module development)
pass
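
For orientation, a hedged usage sketch of the newly patched cross-entropy path. The import paths, loss.backward(), and logits.grad below are assumptions inferred from this diff, not verified against the full repo:

import numpy as np
from tinytorch.core.tensor import Tensor
from tinytorch.core.losses import CrossEntropyLoss
from tinytorch.core.autograd import enable_autograd

enable_autograd()  # installs tracked_ce_forward over CrossEntropyLoss.forward

logits = Tensor(np.array([[2.0, 0.5, -1.0],
                          [0.1, 0.2, 3.0]]))
logits.requires_grad = True
targets = Tensor(np.array([0, 2]))  # raw class indices, no one-hot needed

loss = CrossEntropyLoss().forward(logits, targets)  # scalar Tensor with _grad_fn set
loss.backward()     # assumed API: invokes CrossEntropyBackward.apply
print(logits.grad)  # expected: (softmax(logits) - one_hot(targets)) / 2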

tinytorch/core/losses.py

@@ -15,7 +15,7 @@
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['import_previous_module', 'MSELoss', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss']
__all__ = ['import_previous_module', 'log_softmax', 'MSELoss', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss']
# %% ../../modules/source/04_losses/losses_dev.ipynb 3
import numpy as np
@@ -33,6 +33,43 @@ from .tensor import Tensor
from .layers import Linear
from .activations import ReLU
# %% ../../modules/source/04_losses/losses_dev.ipynb 8
def log_softmax(x: Tensor, dim: int = -1) -> Tensor:
"""
Compute log-softmax with numerical stability.
TODO: Implement numerically stable log-softmax using the log-sum-exp trick
APPROACH:
1. Find maximum along dimension (for stability)
2. Subtract max from input (prevents overflow)
3. Compute log(sum(exp(shifted_input)))
4. Return input - max - log_sum_exp
EXAMPLE:
>>> logits = Tensor([[1.0, 2.0, 3.0], [0.1, 0.2, 0.9]])
>>> result = log_softmax(logits, dim=-1)
>>> print(result.shape)
(2, 3)
HINT: Use np.max(x.data, axis=dim, keepdims=True) to preserve dimensions
"""
### BEGIN SOLUTION
# Step 1: Find max along dimension for numerical stability
max_vals = np.max(x.data, axis=dim, keepdims=True)
# Step 2: Subtract max to prevent overflow
shifted = x.data - max_vals
# Step 3: Compute log(sum(exp(shifted)))
log_sum_exp = np.log(np.sum(np.exp(shifted), axis=dim, keepdims=True))
# Step 4: Return log_softmax = input - max - log_sum_exp
result = x.data - max_vals - log_sum_exp
return Tensor(result)
### END SOLUTION
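# --- Illustrative aside (not part of this diff): why the max-shift matters ---
# With large logits, the naive softmax formula overflows to inf and yields nan,
# while the log-sum-exp trick stays finite (numpy imported above in this module).
# x = np.array([[1000.0, 1001.0, 1002.0]])  # large logits
# naive = np.log(np.exp(x) / np.exp(x).sum(axis=-1, keepdims=True))   # [[nan nan nan]]
# m = x.max(axis=-1, keepdims=True)
# stable = x - m - np.log(np.exp(x - m).sum(axis=-1, keepdims=True))  # [[-2.408 -1.408 -0.408]]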
# %% ../../modules/source/04_losses/losses_dev.ipynb 11
class MSELoss:
"""Mean Squared Error loss for regression tasks."""