# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/03_layers/layers_dev.ipynb.

# %% auto 0
__all__ = ['Linear', 'Dropout']

# %% ../../modules/source/03_layers/layers_dev.ipynb 1
import numpy as np
import sys
import os

# Import dependencies from tinytorch package
from .tensor import Tensor
from .activations import ReLU, Sigmoid

# %% ../../modules/source/03_layers/layers_dev.ipynb 6
class Linear:
    """
    Linear (fully connected) layer: y = xW + b

    This is the fundamental building block of neural networks.
    Applies a linear transformation to incoming data.
    """

    def __init__(self, in_features, out_features, bias=True):
        """
        Initialize linear layer with proper weight initialization.

        TODO: Initialize weights and bias with Xavier initialization

        APPROACH:
        1. Create weight matrix (in_features, out_features) with Xavier scaling
        2. Create bias vector (out_features,) initialized to zeros if bias=True
        3. Set requires_grad=True for parameters (ready for Module 05)

        EXAMPLE:
        >>> layer = Linear(784, 10)  # MNIST classifier final layer
        >>> print(layer.weight.shape)
        (784, 10)
        >>> print(layer.bias.shape)
        (10,)

        HINTS:
        - Xavier init: scale = sqrt(1/in_features)
        - Use np.random.randn() for normal distribution
        - bias=None when bias=False
        """
        ### BEGIN SOLUTION
        self.in_features = in_features
        self.out_features = out_features

        # Xavier/Glorot initialization for stable gradients
        scale = np.sqrt(1.0 / in_features)
        weight_data = np.random.randn(in_features, out_features) * scale
        self.weight = Tensor(weight_data, requires_grad=True)

        # Initialize bias to zeros or None
        if bias:
            bias_data = np.zeros(out_features)
            self.bias = Tensor(bias_data, requires_grad=True)
        else:
            self.bias = None
        ### END SOLUTION

    def forward(self, x):
        """
        Forward pass through linear layer.

        TODO: Implement y = xW + b

        APPROACH:
        1. Matrix multiply input with weights: xW
        2. Add bias if it exists
        3. Return result as new Tensor

        EXAMPLE:
        >>> layer = Linear(3, 2)
        >>> x = Tensor([[1, 2, 3], [4, 5, 6]])  # 2 samples, 3 features
        >>> y = layer.forward(x)
        >>> print(y.shape)
        (2, 2)  # 2 samples, 2 outputs

        HINTS:
        - Use tensor.matmul() for matrix multiplication
        - Handle bias=None case
        - Broadcasting automatically handles bias addition
        """
        ### BEGIN SOLUTION
        # Linear transformation: y = xW
        output = x.matmul(self.weight)

        # Add bias if present
        if self.bias is not None:
            output = output + self.bias

        return output
        ### END SOLUTION

    def __call__(self, x):
        """Allows the layer to be called like a function."""
        return self.forward(x)

    def parameters(self):
        """
        Return list of trainable parameters.

        TODO: Return all tensors that need gradients

        APPROACH:
        1. Start with weight (always present)
        2. Add bias if it exists
        3. Return as list for optimizer
        """
        ### BEGIN SOLUTION
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params
        ### END SOLUTION

    def __repr__(self):
        """String representation for debugging."""
        bias_str = f", bias={self.bias is not None}"
        return f"Linear(in_features={self.in_features}, out_features={self.out_features}{bias_str})"

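# A minimal, NumPy-only sketch of the math documented in Linear above: Xavier-style
# scaling (sqrt(1/in_features)) and y = xW + b with a broadcast bias. Plain NumPy
# arrays stand in for Tensor here so the sketch is self-contained; the sizes and seed
# are illustrative, not part of the exported API. (Running this module directly still
# needs the package context because of the relative imports above.)
if __name__ == "__main__":
    rng = np.random.default_rng(0)

    in_features, out_features = 3, 2
    scale = np.sqrt(1.0 / in_features)                         # Xavier-style scale
    W = rng.standard_normal((in_features, out_features)) * scale
    b = np.zeros(out_features)

    x = np.array([[1.0, 2.0, 3.0],
                  [4.0, 5.0, 6.0]])                            # 2 samples, 3 features
    y = x @ W + b                                              # y = xW + b; bias broadcasts over rows
    print("output shape:", y.shape)                            # (2, 2): 2 samples, 2 outputs

    # The scaling keeps weight magnitudes near sqrt(1/in_features); visible with a larger matrix.
    big_W = rng.standard_normal((784, 10)) * np.sqrt(1.0 / 784)
    print("weight std:", round(float(big_W.std()), 4))         # close to sqrt(1/784) ~= 0.0357
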
# %% ../../modules/source/03_layers/layers_dev.ipynb 10
class Dropout:
    """
    Dropout layer for regularization.

    During training: randomly zeros each element with probability p and scales the
    surviving elements by 1/(1-p) so the expected value is preserved (inverted dropout).
    During inference: passes the input through unchanged.

    This prevents overfitting by forcing the network to not rely on specific neurons.
    """

    def __init__(self, p=0.5):
        """
        Initialize dropout layer.

        TODO: Store dropout probability

        Args:
            p: Probability of zeroing each element (0.0 = no dropout, 1.0 = zero everything)

        EXAMPLE:
        >>> dropout = Dropout(0.5)  # Zero 50% of elements during training
        """
        ### BEGIN SOLUTION
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"Dropout probability must be between 0 and 1, got {p}")
        self.p = p
        ### END SOLUTION

    def forward(self, x, training=True):
        """
        Forward pass through dropout layer.

        TODO: Apply dropout during training, pass through during inference

        APPROACH:
        1. If not training, return input unchanged
        2. If training, create a random mask that keeps each element with probability (1-p)
        3. Multiply input by mask and scale by 1/(1-p)
        4. Return result as new Tensor

        EXAMPLE:
        >>> dropout = Dropout(0.5)
        >>> x = Tensor([1, 2, 3, 4])
        >>> y_train = dropout.forward(x, training=True)   # Some elements zeroed
        >>> y_eval = dropout.forward(x, training=False)   # All elements preserved

        HINTS:
        - Use np.random.random() < keep_prob for the mask
        - Scale by 1/(1-p) to maintain the expected value
        - training=False should return the input unchanged
        """
        ### BEGIN SOLUTION
        if not training or self.p == 0.0:
            # During inference or with no dropout, pass through unchanged
            return x

        if self.p == 1.0:
            # Drop everything (preserve requires_grad for gradient flow)
            return Tensor(np.zeros_like(x.data), requires_grad=x.requires_grad if hasattr(x, 'requires_grad') else False)

        # During training, apply dropout
        keep_prob = 1.0 - self.p

        # Create random mask: True where we keep elements
        mask = np.random.random(x.data.shape) < keep_prob

        # Apply mask and scale using Tensor operations to preserve gradients!
        mask_tensor = Tensor(mask.astype(np.float32), requires_grad=False)  # Mask doesn't need gradients
        scale = Tensor(np.array(1.0 / keep_prob), requires_grad=False)

        # Use Tensor operations: x * mask * scale
        output = x * mask_tensor * scale
        return output
        ### END SOLUTION

    def __call__(self, x, training=True):
        """Allows the layer to be called like a function."""
        return self.forward(x, training)

    def parameters(self):
        """Dropout has no parameters."""
        return []

    def __repr__(self):
        return f"Dropout(p={self.p})"
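
# A minimal, NumPy-only sketch of the inverted-dropout arithmetic used in Dropout.forward:
# each element is kept with probability (1 - p) and the survivors are scaled by 1/(1 - p),
# so the expected value of the output matches the input, while inference leaves the input
# unchanged. Plain NumPy stands in for Tensor; the seed and sample size are illustrative.
if __name__ == "__main__":
    rng = np.random.default_rng(0)

    p = 0.5                                    # drop probability
    keep_prob = 1.0 - p
    x = np.ones(100_000)                       # large sample so the mean is stable

    mask = rng.random(x.shape) < keep_prob     # True where elements are kept
    y_train = x * mask / keep_prob             # training mode: mask, then rescale survivors
    y_eval = x                                 # inference mode: identity

    print("train mean:", round(float(y_train.mean()), 3))     # close to 1.0
    print("eval mean: ", round(float(y_eval.mean()), 3))      # exactly 1.0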