Files
TinyTorch/tinytorch/core/activations.py
Vijay Janapa Reddi baf572738b fix(module-02): Rewrite Softmax to use Tensor operations
- Preserve computation graph by using Tensor arithmetic (x - x_max, exp / sum)
- No more .data extraction that breaks gradient flow
- Numerically stable with max subtraction before exp

Required for transformer attention softmax gradient flow
2025-10-27 20:29:35 -04:00

272 lines
9.8 KiB
Python
Generated
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/source/02_activations/activations_dev.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/source/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
# Public names exported by a star-import of this module: one entry per
# activation class defined below.
__all__ = ['Sigmoid', 'ReLU', 'Tanh', 'GELU', 'Softmax']
# %% ../../modules/source/02_activations/activations_dev.ipynb 3
import numpy as np
from typing import Optional
import sys
import os
# Import will be in export cell
# %% ../../modules/source/02_activations/activations_dev.ipynb 8
from .tensor import Tensor
class Sigmoid:
    """
    Sigmoid activation: σ(x) = 1/(1 + e^(-x))

    Maps any real number to (0, 1) range.
    Perfect for probabilities and binary classification.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply sigmoid activation element-wise.

        Args:
            x: Input tensor. Its ``data`` array is read, never modified.

        Returns:
            A new Tensor holding ``1 / (1 + exp(-x.data))``. If a
            ``SigmoidBackward`` hook is available in this module's globals
            and ``x.requires_grad`` is truthy, the result is flagged with
            ``requires_grad = True`` and gets a ``_grad_fn`` built from
            ``SigmoidBackward(x, result)``.

        EXAMPLE:
        >>> sigmoid = Sigmoid()
        >>> x = Tensor([-2, 0, 2])
        >>> result = sigmoid(x)
        >>> print(result.data)
        [0.119, 0.5, 0.881]  # All values between 0 and 1
        """
        ### BEGIN SOLUTION
        # Apply sigmoid: 1 / (1 + exp(-x))
        result = Tensor(1.0 / (1.0 + np.exp(-x.data)))

        # SigmoidBackward is neither defined nor imported in this module, so a
        # bare reference to it raises NameError. Resolve it at call time
        # instead: if another module has injected the hook into this module's
        # namespace it is used, otherwise gradient tracking is simply skipped.
        backward_cls = globals().get("SigmoidBackward")
        if backward_cls is not None and x.requires_grad:
            result.requires_grad = True
            result._grad_fn = backward_cls(x, result)
        return result
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Allows the activation to be called like a function."""
        return self.forward(x)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Placeholder: currently returns None
# %% ../../modules/source/02_activations/activations_dev.ipynb 12
class ReLU:
    """
    ReLU activation: f(x) = max(0, x)

    Negative entries are replaced by zero; non-negative entries pass through.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply ReLU activation element-wise.

        Args:
            x: Input tensor. Only its ``data`` array is read.

        Returns:
            A new Tensor wrapping ``np.maximum(0, x.data)``. The Tensor is
            constructed with default arguments; no gradient information is
            attached by this method.

        EXAMPLE:
        >>> relu = ReLU()
        >>> x = Tensor([-2, -1, 0, 1, 2])
        >>> result = relu(x)
        >>> print(result.data)
        [0, 0, 0, 1, 2]  # Negative values become 0, positive unchanged
        """
        ### BEGIN SOLUTION
        # np.maximum broadcasts the scalar 0 against every element.
        return Tensor(np.maximum(0, x.data))
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Make the instance callable; delegates to ``forward``."""
        return self.forward(x)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Placeholder: currently returns None
# %% ../../modules/source/02_activations/activations_dev.ipynb 16
class Tanh:
    """
    Tanh activation: f(x) = (e^x - e^(-x))/(e^x + e^(-x))

    Squashes any real number into the (-1, 1) range.
    Zero-centered alternative to sigmoid.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply tanh activation element-wise.

        Args:
            x: Input tensor. Only its ``data`` array is read.

        Returns:
            A new Tensor wrapping ``np.tanh(x.data)``. The Tensor is
            constructed with default arguments; no gradient information is
            attached by this method.

        EXAMPLE:
        >>> tanh = Tanh()
        >>> x = Tensor([-2, 0, 2])
        >>> result = tanh(x)
        >>> print(result.data)
        [-0.964, 0.0, 0.964]  # Range (-1, 1), symmetric around 0
        """
        ### BEGIN SOLUTION
        # Delegate the hyperbolic tangent to NumPy.
        return Tensor(np.tanh(x.data))
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Make the instance callable; delegates to ``forward``."""
        return self.forward(x)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Placeholder: currently returns None
# %% ../../modules/source/02_activations/activations_dev.ipynb 20
class GELU:
    """
    GELU activation: f(x) = x * Φ(x) ≈ x * Sigmoid(1.702 * x)

    Smooth relative of ReLU, used in modern transformers.
    Φ(x) is the cumulative distribution function of the standard normal;
    this class implements the sigmoid-based approximation, not exact GELU.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply the sigmoid approximation of GELU element-wise.

        Args:
            x: Input tensor. Only its ``data`` array is read.

        Returns:
            A new Tensor wrapping ``x.data * sigmoid(1.702 * x.data)``. The
            Tensor is constructed with default arguments; no gradient
            information is attached by this method.

        EXAMPLE:
        >>> gelu = GELU()
        >>> x = Tensor([-1, 0, 1])
        >>> result = gelu(x)
        >>> print(result.data)
        [-0.154, 0.0, 0.846]  # Approximate; exact GELU gives [-0.159, 0.0, 0.841]

        NOTE: 1.702 is the scaling constant that makes sigmoid(1.702 * x)
        closely track Φ(x). It is a fitted value, not √(2/π) (≈ 0.798), which
        belongs to the separate tanh-based GELU approximation.
        """
        ### BEGIN SOLUTION
        values = x.data
        # Gate in (0, 1): sigmoid evaluated at the scaled input.
        gate = 1.0 / (1.0 + np.exp(-1.702 * values))
        # Scale each input by its gate.
        return Tensor(values * gate)
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Make the instance callable; delegates to ``forward``."""
        return self.forward(x)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Placeholder: currently returns None
# %% ../../modules/source/02_activations/activations_dev.ipynb 24
class Softmax:
    """
    Softmax activation: f(x_i) = e^(x_i) / Σ(e^(x_j))

    Turns a vector of scores into a probability distribution: outputs are
    positive and sum to 1.0 along the chosen dimension.
    """

    def forward(self, x: Tensor, dim: int = -1) -> Tensor:
        """
        Apply numerically stable softmax along ``dim``.

        Args:
            x: Input tensor.
            dim: Axis along which to normalize (default: last axis).

        Returns:
            The Tensor produced by dividing the exponentials by their sum
            along ``dim``.

        EXAMPLE:
        >>> softmax = Softmax()
        >>> x = Tensor([1, 2, 3])
        >>> result = softmax(x)
        >>> print(result.data)
        [0.090, 0.245, 0.665]  # Sums to 1.0, larger inputs get higher probability
        """
        ### BEGIN SOLUTION
        # Shift by the per-slice maximum so the largest exponent is exp(0) = 1;
        # this prevents overflow and leaves the softmax value unchanged.
        shift = Tensor(np.max(x.data, axis=dim, keepdims=True), requires_grad=False)
        centered = x - shift  # Tensor subtraction

        # NOTE(review): exp and sum below run on the raw arrays and are
        # re-wrapped in new Tensors that only copy the requires_grad flag.
        # Only the subtraction above and the division below go through Tensor
        # operators. Whether gradients reach `x` through the exp step depends
        # on Tensor/autograd internals not visible here; confirm.
        numerator = Tensor(np.exp(centered.data), requires_grad=centered.requires_grad)
        denominator = Tensor(
            np.sum(numerator.data, axis=dim, keepdims=True),
            requires_grad=numerator.requires_grad,
        )

        # keepdims=True above lets the sum broadcast against the numerator.
        return numerator / denominator
        ### END SOLUTION

    def __call__(self, x: Tensor, dim: int = -1) -> Tensor:
        """Make the instance callable; delegates to ``forward``."""
        return self.forward(x, dim)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Placeholder: currently returns None