mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-03-11 19:53:33 -05:00
Major directory restructure to support both developer and learner workflows: Structure Changes: - NEW: src/ directory for Python source files (version controlled) - Files renamed: tensor.py → 01_tensor.py (matches directory naming) - All 20 modules moved from modules/ to src/ - CHANGED: modules/ now holds generated notebooks (gitignored) - Generated from src/*.py using jupytext - Learners work in notebooks, developers work in Python source - UNCHANGED: tinytorch/ package (still auto-generated from notebooks) Workflow: src/*.py → modules/*.ipynb → tinytorch/*.py Command Updates: - Updated export command to read from src/ and generate to modules/ - Export flow: discovers modules in src/, converts to notebooks in modules/, exports to tinytorch/ - All 20 modules tested and working Configuration: - Updated .gitignore to ignore modules/ directory - Updated README.md with new three-layer architecture explanation - Updated export.py source mappings and paths Benefits: - Clean separation: developers edit Python, learners use notebooks - Better version control: only Python source committed, notebooks generated - Flexible learning: can work in notebooks OR Python source - Maintains backward compatibility: tinytorch package unchanged Tested: - Single module export: tito export 01_tensor ✅ - All modules export: tito export --all ✅ - Package imports: from tinytorch.core.tensor import Tensor ✅ - 20/20 modules successfully converted and exported
283 lines
10 KiB
Python
Generated
283 lines
10 KiB
Python
Generated
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
||
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
||
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
||
# ║ ║
|
||
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
||
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
||
# ║ ║
|
||
# ║ ✅ TO EDIT: src/02_activations/02_activations.py ║
|
||
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
||
# ║ ║
|
||
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
||
# ║ Editing it directly may break module functionality and training. ║
|
||
# ║ ║
|
||
# ║ 🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners) ║
|
||
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
|
||
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
||
# %% auto 0
|
||
__all__ = ['TOLERANCE', 'Sigmoid', 'ReLU', 'Tanh', 'GELU', 'Softmax']
|
||
|
||
# %% ../../modules/02_activations/02_activations.ipynb 3
|
||
import numpy as np
|
||
from typing import Optional
|
||
|
||
# Import from TinyTorch package (previous modules must be completed and exported)
|
||
from .tensor import Tensor
|
||
|
||
# Constants for numerical comparisons
|
||
TOLERANCE = 1e-10 # Small tolerance for floating-point comparisons in tests
|
||
|
||
# %% ../../modules/02_activations/02_activations.ipynb 8
|
||
from .tensor import Tensor
|
||
|
||
class Sigmoid:
|
||
"""
|
||
Sigmoid activation: σ(x) = 1/(1 + e^(-x))
|
||
|
||
Maps any real number to (0, 1) range.
|
||
Perfect for probabilities and binary classification.
|
||
"""
|
||
|
||
def forward(self, x: Tensor) -> Tensor:
|
||
"""
|
||
Apply sigmoid activation element-wise.
|
||
|
||
TODO: Implement sigmoid function
|
||
|
||
APPROACH:
|
||
1. Apply sigmoid formula: 1 / (1 + exp(-x))
|
||
2. Use np.exp for exponential
|
||
3. Return result wrapped in new Tensor
|
||
|
||
EXAMPLE:
|
||
>>> sigmoid = Sigmoid()
|
||
>>> x = Tensor([-2, 0, 2])
|
||
>>> result = sigmoid(x)
|
||
>>> print(result.data)
|
||
[0.119, 0.5, 0.881] # All values between 0 and 1
|
||
|
||
HINT: Use np.exp(-x.data) for numerical stability
|
||
"""
|
||
### BEGIN SOLUTION
|
||
# Apply sigmoid: 1 / (1 + exp(-x))
|
||
# Clip extreme values to prevent overflow (sigmoid(-500) ≈ 0, sigmoid(500) ≈ 1)
|
||
# Clipping at ±500 ensures exp() stays within float64 range
|
||
z = np.clip(x.data, -500, 500)
|
||
|
||
# Use numerically stable sigmoid
|
||
# For positive values: 1 / (1 + exp(-x))
|
||
# For negative values: exp(x) / (1 + exp(x)) = 1 / (1 + exp(-x)) after clipping
|
||
result_data = np.zeros_like(z)
|
||
|
||
# Positive values (including zero)
|
||
pos_mask = z >= 0
|
||
result_data[pos_mask] = 1.0 / (1.0 + np.exp(-z[pos_mask]))
|
||
|
||
# Negative values
|
||
neg_mask = z < 0
|
||
exp_z = np.exp(z[neg_mask])
|
||
result_data[neg_mask] = exp_z / (1.0 + exp_z)
|
||
|
||
return Tensor(result_data)
|
||
### END SOLUTION
|
||
|
||
def __call__(self, x: Tensor) -> Tensor:
|
||
"""Allows the activation to be called like a function."""
|
||
return self.forward(x)
|
||
|
||
def backward(self, grad: Tensor) -> Tensor:
|
||
"""Compute gradient (implemented in Module 05)."""
|
||
pass # Will implement backward pass in Module 05
|
||
|
||
# %% ../../modules/02_activations/02_activations.ipynb 12
|
||
class ReLU:
|
||
"""
|
||
ReLU activation: f(x) = max(0, x)
|
||
|
||
Sets negative values to zero, keeps positive values unchanged.
|
||
Most popular activation for hidden layers.
|
||
"""
|
||
|
||
def forward(self, x: Tensor) -> Tensor:
|
||
"""
|
||
Apply ReLU activation element-wise.
|
||
|
||
TODO: Implement ReLU function
|
||
|
||
APPROACH:
|
||
1. Use np.maximum(0, x.data) for element-wise max with zero
|
||
2. Return result wrapped in new Tensor
|
||
|
||
EXAMPLE:
|
||
>>> relu = ReLU()
|
||
>>> x = Tensor([-2, -1, 0, 1, 2])
|
||
>>> result = relu(x)
|
||
>>> print(result.data)
|
||
[0, 0, 0, 1, 2] # Negative values become 0, positive unchanged
|
||
|
||
HINT: np.maximum handles element-wise maximum automatically
|
||
"""
|
||
### BEGIN SOLUTION
|
||
# Apply ReLU: max(0, x)
|
||
result = np.maximum(0, x.data)
|
||
return Tensor(result)
|
||
### END SOLUTION
|
||
|
||
def __call__(self, x: Tensor) -> Tensor:
|
||
"""Allows the activation to be called like a function."""
|
||
return self.forward(x)
|
||
|
||
def backward(self, grad: Tensor) -> Tensor:
|
||
"""Compute gradient (implemented in Module 05)."""
|
||
pass # Will implement backward pass in Module 05
|
||
|
||
# %% ../../modules/02_activations/02_activations.ipynb 16
|
||
class Tanh:
|
||
"""
|
||
Tanh activation: f(x) = (e^x - e^(-x))/(e^x + e^(-x))
|
||
|
||
Maps any real number to (-1, 1) range.
|
||
Zero-centered alternative to sigmoid.
|
||
"""
|
||
|
||
def forward(self, x: Tensor) -> Tensor:
|
||
"""
|
||
Apply tanh activation element-wise.
|
||
|
||
TODO: Implement tanh function
|
||
|
||
APPROACH:
|
||
1. Use np.tanh(x.data) for hyperbolic tangent
|
||
2. Return result wrapped in new Tensor
|
||
|
||
EXAMPLE:
|
||
>>> tanh = Tanh()
|
||
>>> x = Tensor([-2, 0, 2])
|
||
>>> result = tanh(x)
|
||
>>> print(result.data)
|
||
[-0.964, 0.0, 0.964] # Range (-1, 1), symmetric around 0
|
||
|
||
HINT: NumPy provides np.tanh function
|
||
"""
|
||
### BEGIN SOLUTION
|
||
# Apply tanh using NumPy
|
||
result = np.tanh(x.data)
|
||
return Tensor(result)
|
||
### END SOLUTION
|
||
|
||
def __call__(self, x: Tensor) -> Tensor:
|
||
"""Allows the activation to be called like a function."""
|
||
return self.forward(x)
|
||
|
||
def backward(self, grad: Tensor) -> Tensor:
|
||
"""Compute gradient (implemented in Module 05)."""
|
||
pass # Will implement backward pass in Module 05
|
||
|
||
# %% ../../modules/02_activations/02_activations.ipynb 20
|
||
class GELU:
|
||
"""
|
||
GELU activation: f(x) = x * Φ(x) ≈ x * Sigmoid(1.702 * x)
|
||
|
||
Smooth approximation to ReLU, used in modern transformers.
|
||
Where Φ(x) is the cumulative distribution function of standard normal.
|
||
"""
|
||
|
||
def forward(self, x: Tensor) -> Tensor:
|
||
"""
|
||
Apply GELU activation element-wise.
|
||
|
||
TODO: Implement GELU approximation
|
||
|
||
APPROACH:
|
||
1. Use approximation: x * sigmoid(1.702 * x)
|
||
2. Compute sigmoid part: 1 / (1 + exp(-1.702 * x))
|
||
3. Multiply by x element-wise
|
||
4. Return result wrapped in new Tensor
|
||
|
||
EXAMPLE:
|
||
>>> gelu = GELU()
|
||
>>> x = Tensor([-1, 0, 1])
|
||
>>> result = gelu(x)
|
||
>>> print(result.data)
|
||
[-0.159, 0.0, 0.841] # Smooth, like ReLU but differentiable everywhere
|
||
|
||
HINT: The 1.702 constant comes from √(2/π) approximation
|
||
"""
|
||
### BEGIN SOLUTION
|
||
# GELU approximation: x * sigmoid(1.702 * x)
|
||
# First compute sigmoid part
|
||
sigmoid_part = 1.0 / (1.0 + np.exp(-1.702 * x.data))
|
||
# Then multiply by x
|
||
result = x.data * sigmoid_part
|
||
return Tensor(result)
|
||
### END SOLUTION
|
||
|
||
def __call__(self, x: Tensor) -> Tensor:
|
||
"""Allows the activation to be called like a function."""
|
||
return self.forward(x)
|
||
|
||
def backward(self, grad: Tensor) -> Tensor:
|
||
"""Compute gradient (implemented in Module 05)."""
|
||
pass # Will implement backward pass in Module 05
|
||
|
||
# %% ../../modules/02_activations/02_activations.ipynb 24
|
||
class Softmax:
|
||
"""
|
||
Softmax activation: f(x_i) = e^(x_i) / Σ(e^(x_j))
|
||
|
||
Converts any vector to a probability distribution.
|
||
Sum of all outputs equals 1.0.
|
||
"""
|
||
|
||
def forward(self, x: Tensor, dim: int = -1) -> Tensor:
|
||
"""
|
||
Apply softmax activation along specified dimension.
|
||
|
||
TODO: Implement numerically stable softmax
|
||
|
||
APPROACH:
|
||
1. Subtract max for numerical stability: x - max(x)
|
||
2. Compute exponentials: exp(x - max(x))
|
||
3. Sum along dimension: sum(exp_values)
|
||
4. Divide: exp_values / sum
|
||
5. Return result wrapped in new Tensor
|
||
|
||
EXAMPLE:
|
||
>>> softmax = Softmax()
|
||
>>> x = Tensor([1, 2, 3])
|
||
>>> result = softmax(x)
|
||
>>> print(result.data)
|
||
[0.090, 0.245, 0.665] # Sums to 1.0, larger inputs get higher probability
|
||
|
||
HINTS:
|
||
- Use np.max(x.data, axis=dim, keepdims=True) for max
|
||
- Use np.sum(exp_values, axis=dim, keepdims=True) for sum
|
||
- The max subtraction prevents overflow in exponentials
|
||
"""
|
||
### BEGIN SOLUTION
|
||
# Numerical stability: subtract max to prevent overflow
|
||
# Use Tensor operations to preserve gradient flow!
|
||
x_max_data = np.max(x.data, axis=dim, keepdims=True)
|
||
x_max = Tensor(x_max_data, requires_grad=False) # max is not differentiable in this context
|
||
x_shifted = x - x_max # Tensor subtraction!
|
||
|
||
# Compute exponentials (NumPy operation, but wrapped in Tensor)
|
||
exp_values = Tensor(np.exp(x_shifted.data), requires_grad=x_shifted.requires_grad)
|
||
|
||
# Sum along dimension (Tensor operation)
|
||
exp_sum_data = np.sum(exp_values.data, axis=dim, keepdims=True)
|
||
exp_sum = Tensor(exp_sum_data, requires_grad=exp_values.requires_grad)
|
||
|
||
# Normalize to get probabilities (Tensor division!)
|
||
result = exp_values / exp_sum
|
||
return result
|
||
### END SOLUTION
|
||
|
||
def __call__(self, x: Tensor, dim: int = -1) -> Tensor:
|
||
"""Allows the activation to be called like a function."""
|
||
return self.forward(x, dim)
|
||
|
||
def backward(self, grad: Tensor) -> Tensor:
|
||
"""Compute gradient (implemented in Module 05)."""
|
||
pass # Will implement backward pass in Module 05
|