Files
TinyTorch/tinytorch/core/activations.py
Vijay Janapa Reddi 5ae68dd4b4 Fix gradient propagation: enable autograd and patch activations/losses
CRITICAL FIX: Gradients now flow through entire training stack!

Changes:
1. Enable autograd in __init__.py - patches Tensor operations on import
2. Extend enable_autograd() to patch Sigmoid and BCE forward methods
3. Fix gradient accumulation to handle broadcasting (bias gradients)
4. Fix optimizer.step() - param.grad is numpy array, not Tensor.data
5. Add debug_gradients.py for systematic gradient flow testing

Architecture:
- Clean patching pattern - all gradient tracking in enable_autograd()
- Activations/losses remain simple (Module 02/04)
- Autograd (Module 05) upgrades them with gradient tracking
- Pedagogically sound: separation of concerns

Results:
- All 6 debug tests pass
- Perceptron learns: 50% → 93% accuracy
- Loss decreases: 0.79 → 0.36
- Weights update correctly through SGD
2025-09-30 13:51:30 -04:00

269 lines
9.4 KiB
Python
Generated
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/03_activations/activations_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['Sigmoid', 'ReLU', 'Tanh', 'GELU', 'Softmax']
# %% ../../modules/source/02_activations/activations_dev.ipynb 3
import numpy as np
from typing import Optional
import sys
import os
# Import will be in export cell
# %% ../../modules/source/02_activations/activations_dev.ipynb 8
from .tensor import Tensor
class Sigmoid:
    """
    Sigmoid activation: σ(x) = 1/(1 + e^(-x))

    Maps any real number to the (0, 1) range.
    Perfect for probabilities and binary classification.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply sigmoid activation element-wise.

        Parameters:
            x: input Tensor (any shape).

        Returns:
            New Tensor of the same shape with values in (0, 1). If autograd
            has been enabled (Module 05) and ``x.requires_grad`` is set, the
            result carries a ``_grad_fn`` for the backward pass.

        EXAMPLE:
        >>> sigmoid = Sigmoid()
        >>> x = Tensor([-2, 0, 2])
        >>> result = sigmoid(x)
        >>> print(result.data)
        [0.119, 0.5, 0.881]  # All values between 0 and 1
        """
        ### BEGIN SOLUTION
        # Apply sigmoid: 1 / (1 + exp(-x))
        result_data = 1.0 / (1.0 + np.exp(-x.data))
        result = Tensor(result_data)
        # BUGFIX: `SigmoidBackward` is injected into this module's namespace
        # by enable_autograd() (Module 05); it is not defined in this file.
        # Referencing the bare name raised NameError whenever autograd had
        # never been enabled. globals().get() degrades gracefully to the
        # no-gradient path instead, and behaves identically once patched.
        backward_cls = globals().get("SigmoidBackward")
        if backward_cls is not None and x.requires_grad:
            result.requires_grad = True
            result._grad_fn = backward_cls(x, result)
        return result
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Allows the activation to be called like a function."""
        return self.forward(x)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Will implement backward pass in Module 05
# %% ../../modules/source/02_activations/activations_dev.ipynb 12
class ReLU:
    """
    ReLU activation: f(x) = max(0, x)

    Zeroes out negative entries while passing positive entries through
    unchanged. The most popular activation for hidden layers.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply ReLU element-wise.

        Parameters:
            x: input Tensor (any shape).

        Returns:
            New Tensor where every negative entry is replaced by 0.

        EXAMPLE:
        >>> relu = ReLU()
        >>> x = Tensor([-2, -1, 0, 1, 2])
        >>> print(relu(x).data)
        [0, 0, 0, 1, 2]
        """
        ### BEGIN SOLUTION
        # Element-wise maximum against zero clips all negatives away.
        clipped = np.maximum(0, x.data)
        return Tensor(clipped)
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Allows the activation to be called like a function."""
        return self.forward(x)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Will implement backward pass in Module 05
# %% ../../modules/source/02_activations/activations_dev.ipynb 16
class Tanh:
    """
    Tanh activation: f(x) = (e^x - e^(-x))/(e^x + e^(-x))

    Maps any real number into (-1, 1); a zero-centered alternative
    to sigmoid.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply hyperbolic tangent element-wise.

        Parameters:
            x: input Tensor (any shape).

        Returns:
            New Tensor with values in (-1, 1), symmetric around 0.

        EXAMPLE:
        >>> tanh = Tanh()
        >>> x = Tensor([-2, 0, 2])
        >>> print(tanh(x).data)
        [-0.964, 0.0, 0.964]
        """
        ### BEGIN SOLUTION
        # NumPy provides tanh directly; just wrap the array back up.
        return Tensor(np.tanh(x.data))
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Allows the activation to be called like a function."""
        return self.forward(x)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Will implement backward pass in Module 05
# %% ../../modules/source/02_activations/activations_dev.ipynb 20
class GELU:
    """
    GELU activation: f(x) = x * Φ(x) ≈ x * Sigmoid(1.702 * x)

    Smooth approximation to ReLU, used in modern transformers.
    Where Φ(x) is the cumulative distribution function of standard normal.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply the sigmoid-based GELU approximation element-wise.

        Parameters:
            x: input Tensor (any shape).

        Returns:
            New Tensor computed as x * sigmoid(1.702 * x); smooth like
            ReLU but differentiable everywhere.

        EXAMPLE:
        >>> gelu = GELU()
        >>> x = Tensor([-1, 0, 1])
        >>> print(gelu(x).data)
        [-0.159, 0.0, 0.841]
        """
        ### BEGIN SOLUTION
        # Sigmoid gate with the 1.702 scaling constant (from the
        # √(2/π)-based approximation), then modulate the input by it.
        gate = 1.0 / (1.0 + np.exp(-1.702 * x.data))
        return Tensor(x.data * gate)
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Allows the activation to be called like a function."""
        return self.forward(x)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Will implement backward pass in Module 05
# %% ../../modules/source/02_activations/activations_dev.ipynb 24
class Softmax:
    """
    Softmax activation: f(x_i) = e^(x_i) / Σ(e^(x_j))

    Converts any vector into a probability distribution whose
    entries sum to 1.0.
    """

    def forward(self, x: Tensor, dim: int = -1) -> Tensor:
        """
        Apply numerically stable softmax along ``dim``.

        Parameters:
            x: input Tensor.
            dim: axis along which to normalize (default: last axis).

        Returns:
            New Tensor of the same shape; along ``dim`` the entries are
            non-negative and sum to 1.0.

        EXAMPLE:
        >>> softmax = Softmax()
        >>> x = Tensor([1, 2, 3])
        >>> print(softmax(x).data)
        [0.090, 0.245, 0.665]
        """
        ### BEGIN SOLUTION
        # Shift by the per-axis max so the largest exponent is exp(0);
        # this prevents overflow without changing the result.
        shifted = x.data - np.max(x.data, axis=dim, keepdims=True)
        exps = np.exp(shifted)
        # Normalize each slice along `dim` into a probability distribution.
        totals = np.sum(exps, axis=dim, keepdims=True)
        return Tensor(exps / totals)
        ### END SOLUTION

    def __call__(self, x: Tensor, dim: int = -1) -> Tensor:
        """Allows the activation to be called like a function."""
        return self.forward(x, dim)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Will implement backward pass in Module 05