# ╔═══════════════════════════════════════════════════════════════════════════╗
# ║                           🚨 CRITICAL WARNING 🚨                           ║
# ║                        AUTOGENERATED! DO NOT EDIT!                         ║
# ║                                                                           ║
# ║  This file is AUTOMATICALLY GENERATED from source modules.                ║
# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!         ║
# ║                                                                           ║
# ║  ✅ TO EDIT:   src/03_layers/03_layers.py                                 ║
# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                   ║
# ║                                                                           ║
# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.     ║
# ║    Editing it directly may break module functionality and training.       ║
# ║                                                                           ║
# ║  🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners)        ║
# ║    The tinytorch/ directory is generated code - edit source files         ║
# ║    instead!                                                               ║
# ╚═══════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['XAVIER_SCALE_FACTOR', 'HE_SCALE_FACTOR', 'DROPOUT_MIN_PROB', 'DROPOUT_MAX_PROB', 'Layer', 'Linear', 'Dropout']
# %% ../../modules/03_layers/03_layers.ipynb 1
import numpy as np
# Import from TinyTorch package (previous modules must be completed and exported)
from .tensor import Tensor
from .activations import ReLU, Sigmoid
# Constants for weight initialization
XAVIER_SCALE_FACTOR = 1.0 # Xavier/Glorot initialization uses sqrt(1/fan_in)
HE_SCALE_FACTOR = 2.0 # He initialization uses sqrt(2/fan_in) for ReLU
# Constants for dropout
DROPOUT_MIN_PROB = 0.0 # Minimum dropout probability (no dropout)
DROPOUT_MAX_PROB = 1.0 # Maximum dropout probability (drop everything)
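# Worked example (illustrative): for a layer with fan_in = 784,
#   Xavier scale = sqrt(1.0 / 784) ≈ 0.036  (keeps activation variance stable)
#   He scale     = sqrt(2.0 / 784) ≈ 0.051  (compensates for ReLU zeroing half the inputs)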
# %% ../../modules/03_layers/03_layers.ipynb 6
class Layer:
    """
    Base class for all neural network layers.

    All layers should inherit from this class and implement:
    - forward(x): Compute layer output
    - parameters(): Return list of trainable parameters

    The __call__ method is provided to make layers callable.
    """

    def forward(self, x):
        """
        Forward pass through the layer.

        Args:
            x: Input tensor

        Returns:
            Output tensor after transformation
        """
        raise NotImplementedError("Subclasses must implement forward()")

    def __call__(self, x, *args, **kwargs):
        """Allow layer to be called like a function."""
        return self.forward(x, *args, **kwargs)

    def parameters(self):
        """
        Return list of trainable parameters.

        Returns:
            List of Tensor objects with requires_grad=True
        """
        return []  # Base class has no parameters

    def __repr__(self):
        """String representation of the layer."""
        return f"{self.__class__.__name__}()"
# %% ../../modules/03_layers/03_layers.ipynb 8
class Linear(Layer):
    """
    Linear (fully connected) layer: y = xW + b

    This is the fundamental building block of neural networks.
    Applies a linear transformation to incoming data.
    """

    def __init__(self, in_features, out_features, bias=True):
        """
        Initialize linear layer with proper weight initialization.

        TODO: Initialize weights and bias with Xavier initialization

        APPROACH:
        1. Create weight matrix (in_features, out_features) with Xavier scaling
        2. Create bias vector (out_features,) initialized to zeros if bias=True
        3. Set requires_grad=True for parameters (ready for Module 05)

        EXAMPLE:
        >>> layer = Linear(784, 10)  # MNIST classifier final layer
        >>> print(layer.weight.shape)
        (784, 10)
        >>> print(layer.bias.shape)
        (10,)

        HINTS:
        - Xavier init: scale = sqrt(1/in_features)
        - Use np.random.randn() for normal distribution
        - bias=None when bias=False
        """
        ### BEGIN SOLUTION
        self.in_features = in_features
        self.out_features = out_features

        # Xavier/Glorot initialization for stable gradients
        scale = np.sqrt(XAVIER_SCALE_FACTOR / in_features)
        weight_data = np.random.randn(in_features, out_features) * scale
        self.weight = Tensor(weight_data, requires_grad=True)

        # Initialize bias to zeros or None
        if bias:
            bias_data = np.zeros(out_features)
            self.bias = Tensor(bias_data, requires_grad=True)
        else:
            self.bias = None
        ### END SOLUTION
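    # Why Xavier scaling works (brief derivation): for y = xW with zero-mean,
    # independent entries, Var(y_j) = in_features * Var(W_ij) * Var(x_i).
    # Choosing Var(W_ij) = 1/in_features gives Var(y) ≈ Var(x), so signal
    # magnitude neither explodes nor vanishes as layers stack.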
    def forward(self, x):
        """
        Forward pass through linear layer.

        TODO: Implement y = xW + b

        APPROACH:
        1. Matrix multiply input with weights: xW
        2. Add bias if it exists
        3. Return result as new Tensor

        EXAMPLE:
        >>> layer = Linear(3, 2)
        >>> x = Tensor([[1, 2, 3], [4, 5, 6]])  # 2 samples, 3 features
        >>> y = layer.forward(x)
        >>> print(y.shape)
        (2, 2)  # 2 samples, 2 outputs

        HINTS:
        - Use tensor.matmul() for matrix multiplication
        - Handle bias=None case
        - Broadcasting automatically handles bias addition
        """
        ### BEGIN SOLUTION
        # Linear transformation: y = xW
        output = x.matmul(self.weight)

        # Add bias if present
        if self.bias is not None:
            output = output + self.bias

        return output
        ### END SOLUTION

    def __call__(self, x):
        """Allows the layer to be called like a function."""
        return self.forward(x)

    def parameters(self):
        """
        Return list of trainable parameters.

        TODO: Return all tensors that need gradients

        APPROACH:
        1. Start with weight (always present)
        2. Add bias if it exists
        3. Return as list for optimizer
        """
        ### BEGIN SOLUTION
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params
        ### END SOLUTION

    def __repr__(self):
        """String representation for debugging."""
        bias_str = f", bias={self.bias is not None}"
        return f"Linear(in_features={self.in_features}, out_features={self.out_features}{bias_str})"
# %% ../../modules/03_layers/03_layers.ipynb 16
class Dropout(Layer):
    """
    Dropout layer for regularization (inverted dropout).

    During training: randomly zeros elements with probability p and scales
    the survivors by 1/(1-p) so the expected value is unchanged.
    During inference: returns the input unchanged.

    This prevents overfitting by forcing the network to not rely on specific neurons.
    """

    def __init__(self, p=0.5):
        """
        Initialize dropout layer.

        TODO: Store dropout probability

        Args:
            p: Probability of zeroing each element (0.0 = no dropout, 1.0 = zero everything)

        EXAMPLE:
        >>> dropout = Dropout(0.5)  # Zero 50% of elements during training
        """
        ### BEGIN SOLUTION
        if not DROPOUT_MIN_PROB <= p <= DROPOUT_MAX_PROB:
            raise ValueError(f"Dropout probability must be between {DROPOUT_MIN_PROB} and {DROPOUT_MAX_PROB}, got {p}")
        self.p = p
        ### END SOLUTION

    def forward(self, x, training=True):
        """
        Forward pass through dropout layer.

        During training: randomly zeros elements with probability p and
        scales the survivors by 1/(1-p) to maintain the expected value.
        During inference: returns the input unchanged.

        This prevents overfitting by forcing the network to not rely on specific neurons.

        TODO: Implement dropout forward pass

        APPROACH:
        1. If training=False or p=0, return input unchanged
        2. If p=1, return zeros (preserve requires_grad)
        3. Otherwise: create random mask, apply it, scale by 1/(1-p)

        EXAMPLE:
        >>> dropout = Dropout(0.5)
        >>> x = Tensor([1, 2, 3, 4])
        >>> y_train = dropout.forward(x, training=True)   # Some elements zeroed
        >>> y_eval = dropout.forward(x, training=False)   # All elements preserved

        HINTS:
        - Use np.random.random() < keep_prob for mask
        - Scale by 1/(1-p) to maintain expected value
        - training=False should return input unchanged
        """
        ### BEGIN SOLUTION
        if not training or self.p == DROPOUT_MIN_PROB:
            # During inference or no dropout, pass through unchanged
            return x

        if self.p == DROPOUT_MAX_PROB:
            # Drop everything (preserve requires_grad for gradient flow)
            return Tensor(np.zeros_like(x.data), requires_grad=x.requires_grad)

        # During training, apply dropout
        keep_prob = 1.0 - self.p

        # Create random mask: True where we keep elements
        mask = np.random.random(x.data.shape) < keep_prob

        # Apply mask and scale using Tensor operations to preserve gradients!
        mask_tensor = Tensor(mask.astype(np.float32), requires_grad=False)  # Mask doesn't need gradients
        scale = Tensor(np.array(1.0 / keep_prob), requires_grad=False)

        # Use Tensor operations: x * mask * scale
        output = x * mask_tensor * scale
        return output
        ### END SOLUTION
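    # Why scale by 1/keep_prob: each element survives with probability
    # keep_prob, so E[mask * x / keep_prob] = keep_prob * (x / keep_prob) = x.
    # Scaling at training time ("inverted dropout") is what lets inference
    # return the input unchanged.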
    def __call__(self, x, training=True):
        """Allows the layer to be called like a function."""
        return self.forward(x, training)

    def parameters(self):
        """Dropout has no parameters."""
        return []

    def __repr__(self):
        return f"Dropout(p={self.p})"