mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-06-04 07:26:00 -05:00
484 lines
18 KiB
Python
Generated
484 lines
18 KiB
Python
Generated
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/03_layers/layers_dev.ipynb.
|
|
|
|
# %% auto 0
|
|
__all__ = ['Dense', 'Module', 'matmul', 'Linear', 'Sequential', 'Flatten', 'flatten']
|
|
|
|
# %% ../../modules/03_layers/layers_dev.ipynb 1
|
|
import numpy as np
|
|
import sys
|
|
import os
|
|
from typing import Union, Tuple, Optional, Any
|
|
|
|
# Import our building blocks - try package first, then local modules
|
|
try:
|
|
from tinytorch.core.tensor import Tensor, Parameter
|
|
except ImportError:
|
|
# For development, import from local modules
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor'))
|
|
from tensor_dev import Tensor, Parameter
|
|
|
|
# %% ../../modules/03_layers/layers_dev.ipynb 4
|
|
class Module:
    """
    Base class for all neural network modules.

    Provides automatic parameter collection, forward pass management,
    and clean composition patterns. All layers (Dense, Conv2d, etc.)
    inherit from this class.

    Key Features:
    - Automatic parameter registration when you assign objects whose
      `requires_grad` attribute is truthy
    - Automatic sub-module registration when you assign another Module
    - Re-assigning an attribute replaces its registration instead of
      leaving the old object behind
    - Recursive parameter collection from sub-modules
    - Clean __call__ interface: model(x) instead of model.forward(x)
    - Extensible for custom layers

    Example Usage:
        class MLP(Module):
            def __init__(self):
                super().__init__()
                self.layer1 = Dense(784, 128)  # Auto-registered!
                self.layer2 = Dense(128, 10)   # Auto-registered!

            def forward(self, x):
                x = self.layer1(x)
                return self.layer2(x)

        model = MLP()
        params = model.parameters()  # Gets all parameters automatically!
        output = model(input)        # Clean interface!
    """

    def __init__(self):
        """Initialize module with empty parameter and sub-module storage."""
        self._parameters = []
        self._modules = []

    @staticmethod
    def _unregister(registry, obj):
        """Remove the first entry of `registry` that is `obj` itself (no-op if absent)."""
        # Identity comparison on purpose: tensor-like objects may overload `==`
        # element-wise, which makes list.remove() unreliable.
        for index, entry in enumerate(registry):
            if entry is obj:
                del registry[index]
                return

    def __setattr__(self, name, value):
        """
        Intercept attribute assignment to auto-register parameters and modules.

        When you do self.weight = Parameter(...), this automatically adds
        the parameter to our collection for easy optimization. If `name`
        already held a registered parameter or sub-module, that old object
        is unregistered first, so parameters() never reports objects the
        module no longer references under that name.

        Raises:
            AttributeError: if a parameter or sub-module is assigned before
                Module.__init__() has created the registries.
        """
        # Forget whatever this attribute name held before.
        previous = self.__dict__.get(name)
        if previous is not None:
            self._unregister(self.__dict__.get('_parameters', ()), previous)
            self._unregister(self.__dict__.get('_modules', ()), previous)

        # Check if it's a tensor that needs gradients (a parameter)
        if hasattr(value, 'requires_grad') and value.requires_grad:
            self._parameters.append(value)
        # Check if it's another Module (sub-module)
        elif isinstance(value, Module):
            self._modules.append(value)

        # Always call parent to actually set the attribute
        super().__setattr__(name, value)

    def parameters(self):
        """
        Recursively collect all parameters from this module and sub-modules.

        Returns:
            A new list: this module's own parameters first, followed by the
            parameters of each registered sub-module in registration order.

        This enables: optimizer = Adam(model.parameters())
        """
        # Copy so callers cannot mutate the internal registry through the result.
        params = list(self._parameters)

        # Add parameters from sub-modules recursively
        for module in self._modules:
            params.extend(module.parameters())

        return params

    def __call__(self, *args, **kwargs):
        """
        Makes modules callable: model(x) instead of model.forward(x).

        All positional and keyword arguments are forwarded unchanged to
        forward(), and its return value is returned as-is.
        """
        return self.forward(*args, **kwargs)

    def forward(self, *args, **kwargs):
        """
        Forward pass - must be implemented by subclasses.

        This is where the actual computation happens. Every layer
        defines its own forward() method.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement forward()")
|
|
|
|
# %% ../../modules/03_layers/layers_dev.ipynb 7
|
|
def matmul(a: Tensor, b: Tensor) -> Tensor:
    """
    Matrix multiplication for tensors.

    Args:
        a: Left tensor (shape: ..., m, k)
        b: Right tensor (shape: ..., k, n)

    Returns:
        Result tensor (shape: ..., m, n). A plain Tensor when neither input
        looks like an autograd Variable (i.e. has both `requires_grad` and
        `grad_fn`); otherwise a Variable carrying a backward function.

    TODO: Implement matrix multiplication using numpy's @ operator.

    STEP-BY-STEP IMPLEMENTATION:
    1. Extract numpy arrays from both tensors using .data
    2. Perform matrix multiplication: result_data = a_data @ b_data
    3. Wrap result in a new Tensor and return

    LEARNING CONNECTIONS:
    - This is the core operation in Dense layers: output = input @ weights
    - PyTorch uses optimized BLAS libraries for this operation
    - GPU implementations parallelize this across thousands of cores
    - Understanding this operation is key to neural network performance

    EXAMPLE:
    ```python
    a = Tensor([[1, 2], [3, 4]])  # shape (2, 2)
    b = Tensor([[5, 6], [7, 8]])  # shape (2, 2)
    result = matmul(a, b)
    # result.data = [[19, 22], [43, 50]]
    ```

    IMPLEMENTATION HINTS:
    - Use the @ operator for clean matrix multiplication
    - Ensure you return a Tensor, not a numpy array
    - The operation should work for any compatible matrix shapes
    """
    ### BEGIN SOLUTION
    # Check if we're dealing with Variables (autograd) or plain Tensors
    a_is_variable = hasattr(a, 'requires_grad') and hasattr(a, 'grad_fn')
    b_is_variable = hasattr(b, 'requires_grad') and hasattr(b, 'grad_fn')

    a_data = _matmul_unwrap(a, a_is_variable)
    b_data = _matmul_unwrap(b, b_is_variable)

    # Perform matrix multiplication
    result_data = a_data @ b_data

    if not (a_is_variable or b_is_variable):
        # Both inputs are Tensors, return Tensor (backward compatible)
        return Tensor(result_data)

    # Imported here to avoid circular imports. The import is unconditional:
    # an import statement inside a function makes `Variable` a local name for
    # the whole function, so skipping it (as a `globals()` guard would) leaves
    # that local unbound instead of falling back to a global.
    try:
        from tinytorch.core.autograd import Variable
    except ImportError:
        from autograd_dev import Variable

    a_needs_grad = a_is_variable and a.requires_grad
    b_needs_grad = b_is_variable and b.requires_grad

    def grad_fn(grad_output):
        # Matrix multiplication backward pass:
        # If C = A @ B, then:
        #   dA = grad_output @ B^T
        #   dB = A^T @ grad_output
        # where ^T swaps only the last two axes, so stacked (batched)
        # matrices are transposed individually.
        upstream = _matmul_unwrap(grad_output, True)

        if a_needs_grad:
            grad_a_data = upstream @ _matmul_transpose(b_data)
            # If `a` was broadcast against a batched `b`, fold the batch back.
            grad_a_data = _matmul_reduce_to(grad_a_data, np.shape(a_data))
            a.backward(Variable(grad_a_data))

        if b_needs_grad:
            grad_b_data = _matmul_transpose(a_data) @ upstream
            # Typical case: batched input times a shared 2-D weight matrix.
            grad_b_data = _matmul_reduce_to(grad_b_data, np.shape(b_data))
            b.backward(Variable(grad_b_data))

    # Result tracks gradients if either input does
    requires_grad = a_needs_grad or b_needs_grad

    return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
    ### END SOLUTION


def _matmul_unwrap(operand, is_variable):
    """Return the numpy payload of a Tensor or Variable operand."""
    payload = operand.data
    # Per the original implementation, Variable.data is a Tensor whose own
    # .data is the numpy array. Only unwrap the second level when the first
    # is not already an ndarray (ndarray.data would be a raw memoryview).
    if is_variable and not isinstance(payload, np.ndarray):
        payload = payload.data
    return payload


def _matmul_transpose(array):
    """Swap the last two axes; arrays with fewer than two axes use plain .T."""
    if np.ndim(array) < 2:
        return array.T
    return np.swapaxes(array, -1, -2)


def _matmul_reduce_to(grad, shape):
    """Sum `grad` over axes that broadcasting added or stretched, down to `shape`."""
    shape = tuple(shape)
    if np.shape(grad) == shape or np.ndim(grad) < len(shape):
        return grad
    extra = np.ndim(grad) - len(shape)
    if extra:
        # Leading axes that exist only because the other operand was batched.
        grad = grad.sum(axis=tuple(range(extra)))
    stretched = tuple(
        axis for axis, size in enumerate(shape)
        if size == 1 and grad.shape[axis] != 1
    )
    if stretched:
        grad = grad.sum(axis=stretched, keepdims=True)
    return grad
|
|
|
|
# %% ../../modules/03_layers/layers_dev.ipynb 11
|
|
class Linear(Module):
    """
    Linear (Fully Connected) Layer implementation.

    Applies the transformation: output = input @ weights + bias

    Inherits from Module for automatic parameter management and clean API.
    This is PyTorch's nn.Linear equivalent with the same name for familiarity.

    Features:
    - Automatic parameter registration (weights and bias)
    - Clean call interface: layer(input) instead of layer.forward(input)
    - Works with optimizers via model.parameters()
    """

    def __init__(self, input_size: int, output_size: int, use_bias: bool = True):
        """
        Initialize Linear layer with random weights and optional bias.

        Args:
            input_size: Number of input features
            output_size: Number of output features
            use_bias: Whether to include bias term

        TODO: Implement Linear layer initialization.

        STEP-BY-STEP IMPLEMENTATION:
        1. Store input_size and output_size as instance variables
        2. Initialize weights as Parameter with shape (input_size, output_size)
        3. Use small random values: np.random.randn(...) * 0.1
        4. Initialize bias as Parameter with shape (output_size,) if use_bias is True
        5. Set bias to None if use_bias is False

        LEARNING CONNECTIONS:
        - Random initialization breaks the symmetry between units (identical
          weights would receive identical updates)
        - Weight shape (input_size, output_size) enables matrix multiplication
        - Bias allows shifting the output (like y-intercept in linear regression)
        - PyTorch uses more sophisticated initialization (Xavier, Kaiming)

        IMPLEMENTATION HINTS:
        - Use np.random.randn() for Gaussian random numbers
        - Scale by 0.1 to keep initial values small
        - Remember to wrap numpy arrays in Parameter()
        - Store use_bias flag for forward pass logic
        """
        ### BEGIN SOLUTION
        super().__init__()  # Initialize Module base class

        self.input_size = input_size
        self.output_size = output_size
        self.use_bias = use_bias

        # Initialize weights with small random values using Parameter
        # Shape: (input_size, output_size) for matrix multiplication
        weight_data = np.random.randn(input_size, output_size) * 0.1
        self.weights = Parameter(weight_data)  # Auto-registers for optimization!

        # Initialize bias if requested (drawn after the weights, so seeded
        # runs see the same random stream as before)
        if use_bias:
            bias_data = np.random.randn(output_size) * 0.1
            self.bias = Parameter(bias_data)  # Auto-registers for optimization!
        else:
            self.bias = None
        ### END SOLUTION

    def forward(self, x: Union[Tensor, 'Variable']) -> Union[Tensor, 'Variable']:
        """
        Forward pass through the Linear layer.

        Args:
            x: Input tensor or Variable (shape: ..., input_size)

        Returns:
            Output tensor or Variable (shape: ..., output_size)
            Preserves Variable type for gradient tracking in training

        TODO: Implement autograd-aware forward pass: output = input @ weights + bias

        STEP-BY-STEP IMPLEMENTATION:
        1. Perform matrix multiplication: output = matmul(x, self.weights)
        2. If bias exists, add it appropriately based on input type
        3. Preserve Variable type for gradient tracking if input is Variable
        4. Return result maintaining autograd capabilities

        AUTOGRAD CONSIDERATIONS:
        - If x is Variable: weights and bias should also be Variables for training
        - Preserve gradient tracking through the entire computation
        - Enable backpropagation through this layer's parameters
        - Handle mixed Tensor/Variable scenarios gracefully

        LEARNING CONNECTIONS:
        - This is the core neural network transformation
        - Matrix multiplication scales input features to output features
        - Bias provides offset (like y-intercept in linear equations)
        - Broadcasting handles different batch sizes automatically
        - Autograd support enables automatic parameter optimization

        IMPLEMENTATION HINTS:
        - Use the matmul function you implemented above (now autograd-aware)
        - Handle bias addition based on input/output types
        - Variables support + operator for gradient-tracked addition
        - Check if self.bias is not None before adding
        """
        ### BEGIN SOLUTION
        # Matrix multiplication: input @ weights (now autograd-aware)
        output = matmul(x, self.weights)

        if self.bias is None:
            return output

        # Only one combination needs special handling: a gradient-aware output
        # paired with a bias that carries no `requires_grad` attribute. The
        # bias is then wrapped as a constant Variable before adding.
        if hasattr(output, 'requires_grad') and not hasattr(self.bias, 'requires_grad'):
            # Unconditional local import: an import statement inside a function
            # makes `Variable` a local name, so guarding the import on
            # `globals()` would leave it unbound whenever the guard skipped it.
            try:
                from tinytorch.core.autograd import Variable
            except ImportError:
                from autograd_dev import Variable

            bias_var = Variable(self.bias.data, requires_grad=False)
            return output + bias_var

        # Either both sides are gradient-aware or the output is a plain
        # Tensor; in both cases direct addition is used.
        return output + self.bias
        ### END SOLUTION
|
|
|
|
# Backward compatibility alias
|
|
Dense = Linear  # same class object as Linear; 'Dense' is also exported via __all__
|
|
|
|
class Sequential(Module):
    """
    Sequential Network: Composes layers in sequence.

    The most fundamental network architecture that applies layers in order:
    f(x) = layer_n(...layer_2(layer_1(x)))

    Inherits from Module for automatic parameter collection from all sub-layers.
    This enables optimizers to find all parameters automatically.

    Example Usage:
        # Create a 3-layer MLP
        model = Sequential([
            Linear(784, 128),
            ReLU(),
            Linear(128, 64),
            ReLU(),
            Linear(64, 10)
        ])

        # Use the model
        output = model(input_data)   # Clean interface!
        params = model.parameters()  # All parameters from all layers!
    """

    def __init__(self, layers=None):
        """
        Initialize Sequential network with layers.

        Args:
            layers: Iterable of layers to compose in order (optional). The
                iterable is copied, so the caller's list is never mutated by
                add(), and later changes to it do not affect this network.
        """
        super().__init__()  # Initialize Module base class

        # Own a private copy: sharing the caller's list would let outside
        # mutations change forward() without registering the new layer, so
        # parameters() and forward() would disagree.
        self.layers = list(layers) if layers is not None else []

        # Register all layers as sub-modules for parameter collection.
        # Assigning through setattr routes each layer through
        # Module.__setattr__, which records Module instances in self._modules.
        for index, layer in enumerate(self.layers):
            setattr(self, f'layer_{index}', layer)

    def forward(self, x):
        """
        Forward pass through all layers in sequence.

        Args:
            x: Input tensor

        Returns:
            Output tensor after passing through all layers (x itself when
            the network has no layers)
        """
        for layer in self.layers:
            x = layer(x)
        return x

    def add(self, layer):
        """Append a layer to the end of the network and register it."""
        self.layers.append(layer)
        # Register the new layer for parameter collection under its index
        setattr(self, f'layer_{len(self.layers)-1}', layer)
|
|
|
|
def flatten(x, start_dim=1):
|
|
"""
|
|
Flatten tensor starting from a given dimension.
|
|
|
|
This is essential for transitioning from convolutional layers
|
|
(which output 4D tensors) to linear layers (which expect 2D).
|
|
|
|
Args:
|
|
x: Input tensor (Tensor or any array-like)
|
|
start_dim: Dimension to start flattening from (default: 1 to preserve batch)
|
|
|
|
Returns:
|
|
Flattened tensor preserving batch dimension
|
|
|
|
Examples:
|
|
# Flatten CNN output for Linear layer
|
|
conv_output = Tensor(np.random.randn(32, 64, 8, 8)) # (batch, channels, height, width)
|
|
flat = flatten(conv_output) # (32, 4096) - ready for Linear layer!
|
|
|
|
# Flatten image for MLP
|
|
images = Tensor(np.random.randn(32, 3, 28, 28)) # CIFAR-10 batch
|
|
flat = flatten(images) # (32, 2352) - ready for MLP!
|
|
"""
|
|
# Get the data (handle both Tensor and numpy arrays)
|
|
if hasattr(x, 'data'):
|
|
data = x.data
|
|
else:
|
|
data = x
|
|
|
|
# Calculate new shape
|
|
batch_size = data.shape[0] if start_dim > 0 else 1
|
|
remaining_size = np.prod(data.shape[start_dim:])
|
|
new_shape = (batch_size, remaining_size) if start_dim > 0 else (remaining_size,)
|
|
|
|
# Reshape while preserving the original tensor type
|
|
if hasattr(x, 'data'):
|
|
# It's a Tensor - create a new Tensor with flattened data
|
|
flattened_data = data.reshape(new_shape)
|
|
# Use type(x) to preserve the exact Tensor type (Parameter vs regular Tensor)
|
|
# This ensures that if input was a Parameter, output is also a Parameter
|
|
return type(x)(flattened_data)
|
|
else:
|
|
# It's a numpy array - just reshape and return
|
|
return data.reshape(new_shape)
|
|
|
|
class Flatten(Module):
    """
    Layer form of flatten(): collapses trailing dimensions of its input.

    Typically placed between convolutional layers (4D outputs) and linear
    layers (2D inputs) so the two can be chained. With the default
    start_dim of 1 the leading (batch) dimension is kept.

    Example Usage:
        # In a CNN architecture
        model = Sequential([
            Conv2D(3, 16, kernel_size=3),  # Output: (batch, 16, height, width)
            ReLU(),
            Flatten(),                     # Output: (batch, 16*height*width)
            Linear(16*height*width, 10)    # Now compatible!
        ])
    """

    def __init__(self, start_dim=1):
        """
        Create the layer and remember where flattening begins.

        Args:
            start_dim: First dimension to be merged (default: 1, which
                leaves the batch dimension untouched)
        """
        super().__init__()
        self.start_dim = start_dim

    def forward(self, x):
        """
        Apply flatten() to the input using the stored start_dim.

        Args:
            x: Input tensor

        Returns:
            Whatever flatten(x, start_dim=self.start_dim) returns
        """
        first_merged_dim = self.start_dim
        return flatten(x, start_dim=first_merged_dim)
|