mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-04-29 07:28:13 -05:00
feat: Add CrossEntropyLoss autograd support + Milestone 03 MLP on digits
Key Changes:
- Implemented CrossEntropyBackward for gradient computation
- Integrated CrossEntropyLoss into enable_autograd() patching
- Created comprehensive loss gradient test suite
- Milestone 03: MLP digits classifier (77.5% accuracy)
- Shipped tiny 8x8 digits dataset (67KB) for instant demos
- Updated DataLoader module with ASCII visualizations

Tests:
- All 3 losses (MSE, BCE, CrossEntropy) now have gradient flow
- MLP successfully learns digit classification (6.9% → 77.5%)
- Integration tests pass

Technical:
- CrossEntropyBackward: softmax - one_hot gradient
- Numerically stable via log-softmax
- Works with raw class labels (no one-hot needed)
This commit is contained in:
30
tinytorch/_modidx.py
generated
30
tinytorch/_modidx.py
generated
@@ -114,7 +114,9 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch.core.losses.MSELoss.forward': ( '04_losses/losses_dev.html#mseloss.forward',
|
||||
'tinytorch/core/losses.py'),
|
||||
'tinytorch.core.losses.import_previous_module': ( '04_losses/losses_dev.html#import_previous_module',
|
||||
'tinytorch/core/losses.py')},
|
||||
'tinytorch/core/losses.py'),
|
||||
'tinytorch.core.losses.log_softmax': ( '04_losses/losses_dev.html#log_softmax',
|
||||
'tinytorch/core/losses.py')},
|
||||
'tinytorch.core.optimizers': { 'tinytorch.core.optimizers.Adam': ( '06_optimizers/optimizers_dev.html#adam',
|
||||
'tinytorch/core/optimizers.py'),
|
||||
'tinytorch.core.optimizers.Adam.__init__': ( '06_optimizers/optimizers_dev.html#adam.__init__',
|
||||
@@ -201,4 +203,28 @@ d = { 'settings': { 'branch': 'main',
|
||||
'tinytorch.core.training.Trainer.save_checkpoint': ( '07_training/training_dev.html#trainer.save_checkpoint',
|
||||
'tinytorch/core/training.py'),
|
||||
'tinytorch.core.training.Trainer.train_epoch': ( '07_training/training_dev.html#trainer.train_epoch',
|
||||
'tinytorch/core/training.py')}}}
|
||||
'tinytorch/core/training.py')},
|
||||
'tinytorch.data.loader': { 'tinytorch.data.loader.DataLoader': ( '08_dataloader/dataloader_dev.html#dataloader',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.DataLoader.__init__': ( '08_dataloader/dataloader_dev.html#dataloader.__init__',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.DataLoader.__iter__': ( '08_dataloader/dataloader_dev.html#dataloader.__iter__',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.DataLoader.__len__': ( '08_dataloader/dataloader_dev.html#dataloader.__len__',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.DataLoader._collate_batch': ( '08_dataloader/dataloader_dev.html#dataloader._collate_batch',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.Dataset': ( '08_dataloader/dataloader_dev.html#dataset',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.Dataset.__getitem__': ( '08_dataloader/dataloader_dev.html#dataset.__getitem__',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.Dataset.__len__': ( '08_dataloader/dataloader_dev.html#dataset.__len__',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.TensorDataset': ( '08_dataloader/dataloader_dev.html#tensordataset',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.TensorDataset.__getitem__': ( '08_dataloader/dataloader_dev.html#tensordataset.__getitem__',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.TensorDataset.__init__': ( '08_dataloader/dataloader_dev.html#tensordataset.__init__',
|
||||
'tinytorch/data/loader.py'),
|
||||
'tinytorch.data.loader.TensorDataset.__len__': ( '08_dataloader/dataloader_dev.html#tensordataset.__len__',
|
||||
'tinytorch/data/loader.py')}}}
|
||||
|
||||
74
tinytorch/core/autograd.py
generated
74
tinytorch/core/autograd.py
generated
@@ -16,7 +16,7 @@
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# %% auto 0
|
||||
__all__ = ['Function', 'AddBackward', 'MulBackward', 'MatmulBackward', 'SumBackward', 'ReLUBackward', 'SigmoidBackward',
|
||||
'MSEBackward', 'BCEBackward', 'enable_autograd']
|
||||
'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 1
|
||||
import numpy as np
|
||||
@@ -350,6 +350,51 @@ class BCEBackward(Function):
|
||||
return None,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 24
|
||||
class CrossEntropyBackward(Function):
    """
    Gradient computation for Cross-Entropy Loss.

    CrossEntropy: L = -mean(log_softmax(logits)[targets])

    The gradient with respect to the logits has a famously simple form:

        dL/dlogits = (softmax(logits) - one_hot(targets)) / N

    - It is just the difference between predicted probabilities and targets
    - It naturally scales with how wrong the prediction is
    - It is numerically stable when computed via a shifted softmax
    """

    def __init__(self, logits, targets):
        """Initialize with logits and target class indices."""
        super().__init__(logits)
        # Targets arrive as raw class labels; cast to int for fancy indexing.
        self.targets_data = targets.data.astype(int)
        self.batch_size = logits.data.shape[0]
        self.num_classes = logits.data.shape[1]

    def apply(self, grad_output):
        """Compute the gradient of the cross-entropy loss w.r.t. the logits."""
        logits, = self.saved_tensors

        # No gradient flows if the input does not require one.
        if not (isinstance(logits, Tensor) and logits.requires_grad):
            return None,

        # Numerically stable softmax: shift by the row max before exp().
        raw = logits.data
        shifted = raw - np.max(raw, axis=1, keepdims=True)
        exps = np.exp(shifted)
        probs = exps / exps.sum(axis=1, keepdims=True)

        # One-hot encode the target classes.
        targets_one_hot = np.zeros((self.batch_size, self.num_classes), dtype=np.float32)
        targets_one_hot[np.arange(self.batch_size), self.targets_data] = 1.0

        # dL/dlogits = (softmax - one_hot) / N, scaled by the upstream gradient.
        local_grad = (probs - targets_one_hot) / self.batch_size
        return local_grad * grad_output,
|
||||
|
||||
# %% ../../modules/source/05_autograd/autograd_dev.ipynb 25
|
||||
def enable_autograd():
|
||||
"""
|
||||
Enable gradient tracking for all Tensor operations.
|
||||
@@ -551,13 +596,14 @@ def enable_autograd():
|
||||
# Patch activations and losses to track gradients
|
||||
try:
|
||||
from tinytorch.core.activations import Sigmoid, ReLU
|
||||
from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss
|
||||
from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss, CrossEntropyLoss
|
||||
|
||||
# Store original methods
|
||||
_original_sigmoid_forward = Sigmoid.forward
|
||||
_original_relu_forward = ReLU.forward
|
||||
_original_bce_forward = BinaryCrossEntropyLoss.forward
|
||||
_original_mse_forward = MSELoss.forward
|
||||
_original_ce_forward = CrossEntropyLoss.forward
|
||||
|
||||
def tracked_sigmoid_forward(self, x):
|
||||
"""Sigmoid with gradient tracking."""
|
||||
@@ -614,11 +660,35 @@ def enable_autograd():
|
||||
|
||||
return result
|
||||
|
||||
def tracked_ce_forward(self, logits, targets):
    """Cross-entropy loss with gradient tracking."""
    from tinytorch.core.losses import log_softmax

    # Work in log space for numerical stability.
    log_probs = log_softmax(logits, dim=-1)

    # Pick out each sample's log-probability for its true class.
    n = logits.shape[0]
    labels = targets.data.astype(int)
    picked = log_probs.data[np.arange(n), labels]

    # Loss is the negative log-likelihood averaged over the batch.
    result = Tensor(-np.mean(picked))

    # Attach the backward node only when gradients are requested.
    if logits.requires_grad:
        result.requires_grad = True
        result._grad_fn = CrossEntropyBackward(logits, targets)

    return result
|
||||
|
||||
# Install patched methods
|
||||
Sigmoid.forward = tracked_sigmoid_forward
|
||||
ReLU.forward = tracked_relu_forward
|
||||
BinaryCrossEntropyLoss.forward = tracked_bce_forward
|
||||
MSELoss.forward = tracked_mse_forward
|
||||
CrossEntropyLoss.forward = tracked_ce_forward
|
||||
|
||||
except ImportError:
|
||||
# Activations/losses not yet available (happens during module development)
|
||||
|
||||
39
tinytorch/core/losses.py
generated
39
tinytorch/core/losses.py
generated
@@ -15,7 +15,7 @@
|
||||
# ║ happens! The tinytorch/ directory is just the compiled output. ║
|
||||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||||
# %% auto 0
|
||||
__all__ = ['import_previous_module', 'MSELoss', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss']
|
||||
__all__ = ['import_previous_module', 'log_softmax', 'MSELoss', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss']
|
||||
|
||||
# %% ../../modules/source/04_losses/losses_dev.ipynb 3
|
||||
import numpy as np
|
||||
@@ -33,6 +33,43 @@ from .tensor import Tensor
|
||||
from .layers import Linear
|
||||
from .activations import ReLU
|
||||
|
||||
# %% ../../modules/source/04_losses/losses_dev.ipynb 8
|
||||
def log_softmax(x: Tensor, dim: int = -1) -> Tensor:
    """
    Compute log-softmax along `dim` with numerical stability.

    Uses the log-sum-exp trick: shifting by the per-slice maximum before
    exponentiating prevents overflow without changing the result, since
    softmax is invariant to adding a constant to every logit.

    EXAMPLE:
        >>> logits = Tensor([[1.0, 2.0, 3.0], [0.1, 0.2, 0.9]])
        >>> result = log_softmax(logits, dim=-1)
        >>> print(result.shape)
        (2, 3)
    """
    ### BEGIN SOLUTION
    data = x.data

    # Shift by the maximum along `dim` (keepdims preserves broadcasting shape).
    peak = np.max(data, axis=dim, keepdims=True)
    stabilized = data - peak

    # log(sum(exp(shifted))) of the stabilized values.
    lse = np.log(np.exp(stabilized).sum(axis=dim, keepdims=True))

    # log_softmax(x) = (x - max) - logsumexp(x - max)
    return Tensor(stabilized - lse)
    ### END SOLUTION
|
||||
|
||||
# %% ../../modules/source/04_losses/losses_dev.ipynb 11
|
||||
class MSELoss:
|
||||
"""Mean Squared Error loss for regression tasks."""
|
||||
|
||||
Reference in New Issue
Block a user