mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-04-28 20:22:34 -05:00
- Enhanced attention proof to use A-Z letters instead of numbers - Shows MCYWUH → HUWYCM instead of [1,2,3] → [3,2,1] - More intuitive and fun for students - Removed quickdemo, generation, dialogue scripts (too slow/gibberish)
276 lines
9.9 KiB
Python
Generated
276 lines
9.9 KiB
Python
Generated
# ╔═══════════════════════════════════════════════════════════════════════════════╗
|
|
# ║ 🚨 CRITICAL WARNING 🚨 ║
|
|
# ║ AUTOGENERATED! DO NOT EDIT! ║
|
|
# ║ ║
|
|
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
|
|
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
|
|
# ║ ║
|
|
# ║ ✅ TO EDIT: src/03_layers/03_layers.py ║
|
|
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
|
|
# ║ ║
|
|
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
|
|
# ║ Editing it directly may break module functionality and training. ║
|
|
# ║ ║
|
|
# ║ 🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners) ║
|
|
# ║ The tinytorch/ directory is generated code - edit source files instead! ║
|
|
# ╚═══════════════════════════════════════════════════════════════════════════════╝
|
|
# %% auto 0
__all__ = ['XAVIER_SCALE_FACTOR', 'HE_SCALE_FACTOR', 'DROPOUT_MIN_PROB', 'DROPOUT_MAX_PROB', 'Layer', 'Linear', 'Dropout']

# %% ../../modules/03_layers/03_layers.ipynb 1
import numpy as np

# Import from TinyTorch package (previous modules must be completed and exported)
from .tensor import Tensor
# NOTE(review): ReLU/Sigmoid are imported but not referenced in this module and
# are absent from __all__ — presumably kept for notebook re-export; confirm
# before removing.
from .activations import ReLU, Sigmoid

# Constants for weight initialization
XAVIER_SCALE_FACTOR = 1.0  # Xavier/Glorot initialization uses sqrt(1/fan_in)
HE_SCALE_FACTOR = 2.0      # He initialization uses sqrt(2/fan_in) for ReLU

# Constants for dropout probability bounds (validated in Dropout.__init__)
DROPOUT_MIN_PROB = 0.0  # Minimum dropout probability (no dropout)
DROPOUT_MAX_PROB = 1.0  # Maximum dropout probability (drop everything)
|
|
# %% ../../modules/03_layers/03_layers.ipynb 6
|
|
class Layer:
    """
    Base class for all neural network layers.

    Subclasses are expected to override:
    - forward(x): compute the layer's output
    - parameters(): return the list of trainable parameters

    Instances are callable: ``layer(x)`` dispatches to ``layer.forward(x)``.
    """

    def forward(self, x):
        """
        Compute the layer output for the given input.

        Args:
            x: Input tensor.

        Returns:
            Output tensor after the layer's transformation.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement forward()")

    def __call__(self, x, *args, **kwargs):
        """Make the layer callable; all arguments are forwarded to forward()."""
        return self.forward(x, *args, **kwargs)

    def parameters(self):
        """
        Return the list of trainable parameters.

        Returns:
            A list of Tensor objects with requires_grad=True. Empty here,
            since the base class owns no parameters.
        """
        return []

    def __repr__(self):
        """String representation of the layer."""
        return f"{type(self).__name__}()"
|
|
|
|
# %% ../../modules/03_layers/03_layers.ipynb 8
|
|
class Linear(Layer):
    """
    Fully connected layer computing ``y = xW + b``.

    The fundamental building block of neural networks: applies a learned
    linear transformation to the incoming data.
    """

    def __init__(self, in_features, out_features, bias=True):
        """
        Create the layer and initialize its parameters.

        Weights use Xavier/Glorot initialization (scale = sqrt(1/in_features)),
        which keeps activation variance roughly constant across layers; the
        bias, when enabled, starts at zero. Both are created with
        requires_grad=True so they are ready for autograd (Module 05).

        Args:
            in_features: Size of each input sample.
            out_features: Size of each output sample.
            bias: If True, include a learnable bias vector; otherwise
                ``self.bias`` is None.

        Example:
            >>> layer = Linear(784, 10)  # MNIST classifier final layer
            >>> layer.weight.shape
            (784, 10)
            >>> layer.bias.shape
            (10,)
        """
        self.in_features = in_features
        self.out_features = out_features

        # Xavier/Glorot: draw from N(0, 1) and shrink by sqrt(1/fan_in)
        # so gradients stay well-scaled at the start of training.
        std = np.sqrt(XAVIER_SCALE_FACTOR / in_features)
        self.weight = Tensor(
            np.random.randn(in_features, out_features) * std,
            requires_grad=True,
        )

        # Bias starts at zero, or is absent entirely when disabled.
        if not bias:
            self.bias = None
        else:
            self.bias = Tensor(np.zeros(out_features), requires_grad=True)

    def forward(self, x):
        """
        Apply the linear transformation ``y = xW (+ b)``.

        Bias addition relies on broadcasting across the batch dimension.

        Args:
            x: Input tensor of shape (batch, in_features).

        Returns:
            Output tensor of shape (batch, out_features).

        Example:
            >>> layer = Linear(3, 2)
            >>> y = layer.forward(Tensor([[1, 2, 3], [4, 5, 6]]))
            >>> y.shape
            (2, 2)
        """
        result = x.matmul(self.weight)
        if self.bias is None:
            return result
        return result + self.bias

    def __call__(self, x):
        """Allows the layer to be called like a function."""
        return self.forward(x)

    def parameters(self):
        """
        Return the trainable parameters for the optimizer.

        Returns:
            ``[weight]`` or ``[weight, bias]`` depending on whether the
            layer was built with a bias.
        """
        if self.bias is None:
            return [self.weight]
        return [self.weight, self.bias]

    def __repr__(self):
        """String representation for debugging."""
        return (
            f"Linear(in_features={self.in_features}, "
            f"out_features={self.out_features}, bias={self.bias is not None})"
        )
|
|
|
|
# %% ../../modules/03_layers/03_layers.ipynb 16
|
|
class Dropout(Layer):
    """
    Dropout layer for regularization (inverted dropout).

    During training: randomly zeros each element with probability p and
    scales the survivors by 1/(1-p), so the expected value of the output
    matches the input.
    During inference: passes the input through unchanged — no scaling is
    needed because of the training-time 1/(1-p) correction.

    This prevents overfitting by forcing the network to not rely on specific neurons.
    """

    def __init__(self, p=0.5):
        """
        Initialize dropout layer.

        Args:
            p: Probability of zeroing each element (0.0 = no dropout, 1.0 = zero everything).

        Raises:
            ValueError: If p is outside [DROPOUT_MIN_PROB, DROPOUT_MAX_PROB].

        EXAMPLE:
            >>> dropout = Dropout(0.5)  # Zero 50% of elements during training
        """
        if not DROPOUT_MIN_PROB <= p <= DROPOUT_MAX_PROB:
            raise ValueError(f"Dropout probability must be between {DROPOUT_MIN_PROB} and {DROPOUT_MAX_PROB}, got {p}")
        self.p = p

    def forward(self, x, training=True):
        """
        Forward pass through dropout layer (inverted dropout).

        During training: each element is kept with probability 1-p, and kept
        elements are scaled by 1/(1-p) so E[output] == E[input].
        During inference (training=False): the input is returned unchanged.

        Args:
            x: Input tensor.
            training: If True, apply dropout; if False, pass x through as-is.

        Returns:
            Tensor with the same shape as x.

        EXAMPLE:
            >>> dropout = Dropout(0.5)
            >>> x = Tensor([1, 2, 3, 4])
            >>> y_train = dropout.forward(x, training=True)   # Some elements zeroed
            >>> y_eval = dropout.forward(x, training=False)   # All elements preserved
        """
        if not training or self.p == DROPOUT_MIN_PROB:
            # Inference mode, or p == 0: identity — no mask, no scaling.
            return x

        if self.p == DROPOUT_MAX_PROB:
            # p == 1 drops everything; preserve requires_grad so downstream
            # gradient bookkeeping stays consistent.
            return Tensor(np.zeros_like(x.data), requires_grad=x.requires_grad)

        # During training, apply inverted dropout.
        keep_prob = 1.0 - self.p

        # Boolean mask: True where the element survives.
        mask = np.random.random(x.data.shape) < keep_prob

        # Wrap mask and scale as constant Tensors (requires_grad=False) so the
        # multiplications go through Tensor ops and stay in the autograd graph.
        mask_tensor = Tensor(mask.astype(np.float32), requires_grad=False)
        scale = Tensor(np.array(1.0 / keep_prob), requires_grad=False)

        # Inverted dropout: x * mask * 1/(1-p)
        output = x * mask_tensor * scale
        return output

    def __call__(self, x, training=True):
        """Allows the layer to be called like a function."""
        return self.forward(x, training)

    def parameters(self):
        """Dropout has no trainable parameters."""
        return []

    def __repr__(self):
        return f"Dropout(p={self.p})"
|