mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-11 06:10:52 -05:00
Re-exported all modules after restructuring: - Updated _modidx.py with new module locations - Removed outdated autogeneration headers - Updated all core modules (tensor, autograd, layers, etc.) - Updated optimization modules (quantization, compression, etc.) - Updated TITO commands for new structure Changes include: - 24 tinytorch/ module files - 24 tito/ command and core files - Updated references from modules/source/ to modules/ All modules re-exported via nbdev from their new locations.
258 lines
8.2 KiB
Python
Generated
258 lines
8.2 KiB
Python
Generated
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/02_activations/activations_dev.ipynb.
|
||
|
||
# %% auto 0
|
||
__all__ = ['Sigmoid', 'ReLU', 'Tanh', 'GELU', 'Softmax']
|
||
|
||
# %% ../../modules/source/02_activations/activations_dev.ipynb 3
|
||
import numpy as np
|
||
from typing import Optional
|
||
import sys
|
||
import os
|
||
|
||
|
||
# Import will be in export cell
|
||
|
||
# %% ../../modules/source/02_activations/activations_dev.ipynb 8
|
||
from .tensor import Tensor
|
||
|
||
class Sigmoid:
|
||
"""
|
||
Sigmoid activation: σ(x) = 1/(1 + e^(-x))
|
||
|
||
Maps any real number to (0, 1) range.
|
||
Perfect for probabilities and binary classification.
|
||
"""
|
||
|
||
def forward(self, x: Tensor) -> Tensor:
|
||
"""
|
||
Apply sigmoid activation element-wise.
|
||
|
||
TODO: Implement sigmoid function
|
||
|
||
APPROACH:
|
||
1. Apply sigmoid formula: 1 / (1 + exp(-x))
|
||
2. Use np.exp for exponential
|
||
3. Return result wrapped in new Tensor
|
||
|
||
EXAMPLE:
|
||
>>> sigmoid = Sigmoid()
|
||
>>> x = Tensor([-2, 0, 2])
|
||
>>> result = sigmoid(x)
|
||
>>> print(result.data)
|
||
[0.119, 0.5, 0.881] # All values between 0 and 1
|
||
|
||
HINT: Use np.exp(-x.data) for numerical stability
|
||
"""
|
||
### BEGIN SOLUTION
|
||
# Apply sigmoid: 1 / (1 + exp(-x))
|
||
result_data = 1.0 / (1.0 + np.exp(-x.data))
|
||
result = Tensor(result_data)
|
||
|
||
# Track gradients if autograd is enabled and input requires_grad
|
||
if SigmoidBackward is not None and x.requires_grad:
|
||
result.requires_grad = True
|
||
result._grad_fn = SigmoidBackward(x, result)
|
||
|
||
return result
|
||
### END SOLUTION
|
||
|
||
def __call__(self, x: Tensor) -> Tensor:
|
||
"""Allows the activation to be called like a function."""
|
||
return self.forward(x)
|
||
|
||
def backward(self, grad: Tensor) -> Tensor:
|
||
"""Compute gradient (implemented in Module 05)."""
|
||
pass # Will implement backward pass in Module 05
|
||
|
||
# %% ../../modules/source/02_activations/activations_dev.ipynb 12
|
||
class ReLU:
|
||
"""
|
||
ReLU activation: f(x) = max(0, x)
|
||
|
||
Sets negative values to zero, keeps positive values unchanged.
|
||
Most popular activation for hidden layers.
|
||
"""
|
||
|
||
def forward(self, x: Tensor) -> Tensor:
|
||
"""
|
||
Apply ReLU activation element-wise.
|
||
|
||
TODO: Implement ReLU function
|
||
|
||
APPROACH:
|
||
1. Use np.maximum(0, x.data) for element-wise max with zero
|
||
2. Return result wrapped in new Tensor
|
||
|
||
EXAMPLE:
|
||
>>> relu = ReLU()
|
||
>>> x = Tensor([-2, -1, 0, 1, 2])
|
||
>>> result = relu(x)
|
||
>>> print(result.data)
|
||
[0, 0, 0, 1, 2] # Negative values become 0, positive unchanged
|
||
|
||
HINT: np.maximum handles element-wise maximum automatically
|
||
"""
|
||
### BEGIN SOLUTION
|
||
# Apply ReLU: max(0, x)
|
||
result = np.maximum(0, x.data)
|
||
return Tensor(result)
|
||
### END SOLUTION
|
||
|
||
def __call__(self, x: Tensor) -> Tensor:
|
||
"""Allows the activation to be called like a function."""
|
||
return self.forward(x)
|
||
|
||
def backward(self, grad: Tensor) -> Tensor:
|
||
"""Compute gradient (implemented in Module 05)."""
|
||
pass # Will implement backward pass in Module 05
|
||
|
||
# %% ../../modules/source/02_activations/activations_dev.ipynb 16
|
||
class Tanh:
|
||
"""
|
||
Tanh activation: f(x) = (e^x - e^(-x))/(e^x + e^(-x))
|
||
|
||
Maps any real number to (-1, 1) range.
|
||
Zero-centered alternative to sigmoid.
|
||
"""
|
||
|
||
def forward(self, x: Tensor) -> Tensor:
|
||
"""
|
||
Apply tanh activation element-wise.
|
||
|
||
TODO: Implement tanh function
|
||
|
||
APPROACH:
|
||
1. Use np.tanh(x.data) for hyperbolic tangent
|
||
2. Return result wrapped in new Tensor
|
||
|
||
EXAMPLE:
|
||
>>> tanh = Tanh()
|
||
>>> x = Tensor([-2, 0, 2])
|
||
>>> result = tanh(x)
|
||
>>> print(result.data)
|
||
[-0.964, 0.0, 0.964] # Range (-1, 1), symmetric around 0
|
||
|
||
HINT: NumPy provides np.tanh function
|
||
"""
|
||
### BEGIN SOLUTION
|
||
# Apply tanh using NumPy
|
||
result = np.tanh(x.data)
|
||
return Tensor(result)
|
||
### END SOLUTION
|
||
|
||
def __call__(self, x: Tensor) -> Tensor:
|
||
"""Allows the activation to be called like a function."""
|
||
return self.forward(x)
|
||
|
||
def backward(self, grad: Tensor) -> Tensor:
|
||
"""Compute gradient (implemented in Module 05)."""
|
||
pass # Will implement backward pass in Module 05
|
||
|
||
# %% ../../modules/source/02_activations/activations_dev.ipynb 20
|
||
class GELU:
|
||
"""
|
||
GELU activation: f(x) = x * Φ(x) ≈ x * Sigmoid(1.702 * x)
|
||
|
||
Smooth approximation to ReLU, used in modern transformers.
|
||
Where Φ(x) is the cumulative distribution function of standard normal.
|
||
"""
|
||
|
||
def forward(self, x: Tensor) -> Tensor:
|
||
"""
|
||
Apply GELU activation element-wise.
|
||
|
||
TODO: Implement GELU approximation
|
||
|
||
APPROACH:
|
||
1. Use approximation: x * sigmoid(1.702 * x)
|
||
2. Compute sigmoid part: 1 / (1 + exp(-1.702 * x))
|
||
3. Multiply by x element-wise
|
||
4. Return result wrapped in new Tensor
|
||
|
||
EXAMPLE:
|
||
>>> gelu = GELU()
|
||
>>> x = Tensor([-1, 0, 1])
|
||
>>> result = gelu(x)
|
||
>>> print(result.data)
|
||
[-0.159, 0.0, 0.841] # Smooth, like ReLU but differentiable everywhere
|
||
|
||
HINT: The 1.702 constant comes from √(2/π) approximation
|
||
"""
|
||
### BEGIN SOLUTION
|
||
# GELU approximation: x * sigmoid(1.702 * x)
|
||
# First compute sigmoid part
|
||
sigmoid_part = 1.0 / (1.0 + np.exp(-1.702 * x.data))
|
||
# Then multiply by x
|
||
result = x.data * sigmoid_part
|
||
return Tensor(result)
|
||
### END SOLUTION
|
||
|
||
def __call__(self, x: Tensor) -> Tensor:
|
||
"""Allows the activation to be called like a function."""
|
||
return self.forward(x)
|
||
|
||
def backward(self, grad: Tensor) -> Tensor:
|
||
"""Compute gradient (implemented in Module 05)."""
|
||
pass # Will implement backward pass in Module 05
|
||
|
||
# %% ../../modules/source/02_activations/activations_dev.ipynb 24
|
||
class Softmax:
|
||
"""
|
||
Softmax activation: f(x_i) = e^(x_i) / Σ(e^(x_j))
|
||
|
||
Converts any vector to a probability distribution.
|
||
Sum of all outputs equals 1.0.
|
||
"""
|
||
|
||
def forward(self, x: Tensor, dim: int = -1) -> Tensor:
|
||
"""
|
||
Apply softmax activation along specified dimension.
|
||
|
||
TODO: Implement numerically stable softmax
|
||
|
||
APPROACH:
|
||
1. Subtract max for numerical stability: x - max(x)
|
||
2. Compute exponentials: exp(x - max(x))
|
||
3. Sum along dimension: sum(exp_values)
|
||
4. Divide: exp_values / sum
|
||
5. Return result wrapped in new Tensor
|
||
|
||
EXAMPLE:
|
||
>>> softmax = Softmax()
|
||
>>> x = Tensor([1, 2, 3])
|
||
>>> result = softmax(x)
|
||
>>> print(result.data)
|
||
[0.090, 0.245, 0.665] # Sums to 1.0, larger inputs get higher probability
|
||
|
||
HINTS:
|
||
- Use np.max(x.data, axis=dim, keepdims=True) for max
|
||
- Use np.sum(exp_values, axis=dim, keepdims=True) for sum
|
||
- The max subtraction prevents overflow in exponentials
|
||
"""
|
||
### BEGIN SOLUTION
|
||
# Numerical stability: subtract max to prevent overflow
|
||
# Use Tensor operations to preserve gradient flow!
|
||
x_max_data = np.max(x.data, axis=dim, keepdims=True)
|
||
x_max = Tensor(x_max_data, requires_grad=False) # max is not differentiable in this context
|
||
x_shifted = x - x_max # Tensor subtraction!
|
||
|
||
# Compute exponentials (NumPy operation, but wrapped in Tensor)
|
||
exp_values = Tensor(np.exp(x_shifted.data), requires_grad=x_shifted.requires_grad)
|
||
|
||
# Sum along dimension (Tensor operation)
|
||
exp_sum_data = np.sum(exp_values.data, axis=dim, keepdims=True)
|
||
exp_sum = Tensor(exp_sum_data, requires_grad=exp_values.requires_grad)
|
||
|
||
# Normalize to get probabilities (Tensor division!)
|
||
result = exp_values / exp_sum
|
||
return result
|
||
### END SOLUTION
|
||
|
||
def __call__(self, x: Tensor, dim: int = -1) -> Tensor:
|
||
"""Allows the activation to be called like a function."""
|
||
return self.forward(x, dim)
|
||
|
||
def backward(self, grad: Tensor) -> Tensor:
|
||
"""Compute gradient (implemented in Module 05)."""
|
||
pass # Will implement backward pass in Module 05
|