Files
TinyTorch/tinytorch/core/spatial.py
Vijay Janapa Reddi d05daeb83b Add comprehensive milestone learning verification tests
- Created test suite that verifies actual learning (gradient flow, weight updates, loss convergence)
- Fixed MLP Digits (1986): increased training epochs from 15 to 25
- Added requires_grad=True to Conv2d weights (partial fix)
- Identified gradient flow issues in Conv2d, Embedding, and Attention layers
- Comprehensive documentation of issues and fixes needed
2025-11-22 17:02:10 -05:00

572 lines
22 KiB
Python
Generated
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║ 🚨 CRITICAL WARNING 🚨 ║
# ║ AUTOGENERATED! DO NOT EDIT! ║
# ║ ║
# ║ This file is AUTOMATICALLY GENERATED from source modules. ║
# ║ ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported! ║
# ║ ║
# ║ ✅ TO EDIT: modules/09_spatial/spatial.py ║
# ║ ✅ TO EXPORT: Run 'tito module complete <module_name>' ║
# ║ ║
# ║ 🛡️ STUDENT PROTECTION: This file contains optimized implementations. ║
# ║ Editing it directly may break module functionality and training. ║
# ║ ║
# ║ 🎓 LEARNING TIP: Work in modules/ - that's where real development ║
# ║ happens! The tinytorch/ directory is just the compiled output. ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
# Public API of this module: the names exported by a star-import.
__all__ = ['DEFAULT_KERNEL_SIZE', 'DEFAULT_STRIDE', 'DEFAULT_PADDING', 'Conv2d', 'MaxPool2d', 'AvgPool2d', 'SimpleCNN']
# %% ../../modules/09_spatial/spatial.ipynb 1
import numpy as np
import time
from .tensor import Tensor
# Default hyperparameters for convolution layers (exported through __all__).
# NOTE(review): Conv2d.__init__ in this file spells out stride=1 and padding=0
# literally instead of referencing these names, so keep the two in sync by hand.
DEFAULT_KERNEL_SIZE = 3 # Side length of the default (square) convolution kernel
DEFAULT_STRIDE = 1 # Default step between neighbouring kernel positions
DEFAULT_PADDING = 0 # Default number of zero rows/columns added on each border
# %% ../../modules/09_spatial/spatial.ipynb 6
class Conv2d:
    """
    2D convolution layer for spatial feature extraction.

    The convolution is written with explicit Python loops on purpose, to make
    the computational cost and the memory access pattern visible.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output feature maps.
        kernel_size: Size of the convolution kernel (int or ``(h, w)`` pair).
        stride: Stride of the convolution (int or ``(h, w)`` pair, default: 1).
        padding: Zero-padding added to each border (int or ``(h, w)`` pair,
            default: 0).
        bias: Whether to add a learnable bias (default: True).

    Attributes:
        weight: Tensor of shape ``(out_channels, in_channels, kernel_h, kernel_w)``.
        bias: Tensor of shape ``(out_channels,)`` or ``None``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True):
        """
        Store the hyperparameters and create the trainable parameters.

        Weights use He initialization, ``std = sqrt(2 / fan_in)`` with
        ``fan_in = in_channels * kernel_h * kernel_w``, which suits ReLU
        networks. The bias, when enabled, starts at zero.
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = self._as_pair(kernel_size)
        # Stored exactly as passed so existing readers of these attributes are
        # unaffected; they are expanded to (h, w) pairs when the layer runs.
        self.stride = stride
        self.padding = padding

        kernel_h, kernel_w = self.kernel_size
        fan_in = in_channels * kernel_h * kernel_w
        std = np.sqrt(2.0 / fan_in)
        self.weight = Tensor(
            np.random.normal(0, std, (out_channels, in_channels, kernel_h, kernel_w)),
            requires_grad=True,
        )
        self.bias = Tensor(np.zeros(out_channels), requires_grad=True) if bias else None

    @staticmethod
    def _as_pair(value):
        """Return ``value`` as an ``(h, w)`` tuple, duplicating a scalar."""
        if np.ndim(value) == 0:
            return (value, value)
        first, second = value
        return (first, second)

    def _convolve_array(self, data, weight, bias=None):
        """
        Convolve a raw 4D array with ``weight`` and return a new array.

        Args:
            data: Array indexed as ``[batch, channel, row, col]``.
            weight: Array indexed as ``[out_channel, in_channel, k_row, k_col]``.
            bias: Optional array with one value per output channel.

        Returns:
            Array of shape ``(batch, out_channels, out_height, out_width)``.

        Raises:
            ValueError: If the kernel does not fit into the padded input.
        """
        batch_size, in_channels, in_height, in_width = data.shape
        out_channels = self.out_channels
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self._as_pair(self.stride)
        pad_h, pad_w = self._as_pair(self.padding)

        out_height = (in_height + 2 * pad_h - kernel_h) // stride_h + 1
        out_width = (in_width + 2 * pad_w - kernel_w) // stride_w + 1
        if out_height <= 0 or out_width <= 0:
            raise ValueError(
                f"Kernel {self.kernel_size} does not fit input of spatial size "
                f"({in_height}, {in_width}) with padding {self.padding}"
            )

        if pad_h > 0 or pad_w > 0:
            # Only the two spatial axes are padded; batch and channel are not.
            data = np.pad(
                data,
                ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w)),
                mode='constant',
                constant_values=0,
            )

        output = np.zeros((batch_size, out_channels, out_height, out_width))

        # Seven explicit loops: four over output positions, three over the
        # receptive field. Kept unvectorized to expose the cost of convolution.
        for b in range(batch_size):
            for out_ch in range(out_channels):
                for out_h in range(out_height):
                    in_h_start = out_h * stride_h
                    for out_w in range(out_width):
                        in_w_start = out_w * stride_w
                        conv_sum = 0.0
                        for k_h in range(kernel_h):
                            for k_w in range(kernel_w):
                                for in_ch in range(in_channels):
                                    conv_sum += (
                                        data[b, in_ch, in_h_start + k_h, in_w_start + k_w]
                                        * weight[out_ch, in_ch, k_h, k_w]
                                    )
                        output[b, out_ch, out_h, out_w] = conv_sum

        if bias is not None:
            # One bias value per output channel, broadcast over batch and space.
            output += np.reshape(bias, (1, -1, 1, 1))
        return output

    def forward(self, x):
        """
        Apply the convolution to a batch of images.

        Args:
            x: Tensor-like object exposing ``shape``, ``data`` and
                ``requires_grad``; ``shape`` must be
                ``(batch, in_channels, height, width)``.

        Returns:
            Tensor of shape ``(batch, out_channels, out_height, out_width)`` with
            ``out = (in + 2 * padding - kernel) // stride + 1`` per spatial axis.

        Raises:
            ValueError: If ``x`` is not 4D, if its channel count differs from
                ``in_channels``, or if the kernel does not fit the padded input.

        Example:
            >>> conv = Conv2d(3, 16, kernel_size=3, padding=1)
            >>> x = Tensor(np.random.randn(2, 3, 32, 32))  # batch=2, RGB, 32x32
            >>> out = conv(x)
            >>> print(out.shape)  # (2, 16, 32, 32)
        """
        if len(x.shape) != 4:
            raise ValueError(f"Expected 4D input (batch, channels, height, width), got {x.shape}")
        if x.shape[1] != self.in_channels:
            # Without this check a narrower input would silently use only part
            # of the weights and a wider one would fail with a bare IndexError.
            raise ValueError(
                f"Expected input with {self.in_channels} channels, "
                f"got {x.shape[1]} (input shape {x.shape})"
            )

        bias = None if self.bias is None else self.bias.data
        output = self._convolve_array(x.data, self.weight.data, bias)

        # The result is built from a raw array, so no backward function is
        # attached here; requires_grad is only propagated as a flag.
        return Tensor(output, requires_grad=(x.requires_grad or self.weight.requires_grad))

    def parameters(self):
        """Return the trainable parameters: the weight, then the bias if any."""
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params

    def __call__(self, x):
        """Enable ``layer(x)`` syntax."""
        return self.forward(x)
# %% ../../modules/09_spatial/spatial.ipynb 11
class MaxPool2d:
    """
    2D max pooling layer for spatial dimension reduction.

    Takes the maximum over each spatial window, keeping the strongest
    activation while shrinking the feature map. The layer has no parameters.

    Args:
        kernel_size: Size of the pooling window (int or ``(h, w)`` pair).
        stride: Step between windows (int or ``(h, w)`` pair). Defaults to the
            kernel size, which gives non-overlapping windows.
        padding: Padding added to each border (int or ``(h, w)`` pair,
            default: 0). Padded cells hold ``-inf`` so they never win the max.
    """

    def __init__(self, kernel_size, stride=None, padding=0):
        """Store the pooling window, stride and padding."""
        super().__init__()
        self.kernel_size = self._as_pair(kernel_size)
        if stride is None:
            kernel_h, kernel_w = self.kernel_size
            # A square window keeps the historical int stride. A rectangular
            # window needs a per-axis stride, otherwise its windows overlap
            # (or skip cells) along the axis whose extent differs.
            self.stride = kernel_h if kernel_h == kernel_w else (kernel_h, kernel_w)
        else:
            self.stride = stride
        self.padding = padding

    @staticmethod
    def _as_pair(value):
        """Return ``value`` as an ``(h, w)`` tuple, duplicating a scalar."""
        if np.ndim(value) == 0:
            return (value, value)
        first, second = value
        return (first, second)

    def _pool_array(self, data):
        """
        Max-pool a raw 4D array indexed as ``[batch, channel, row, col]``.

        Returns:
            Array of shape ``(batch, channels, out_height, out_width)``.

        Raises:
            ValueError: If the window does not fit into the padded input.
        """
        batch_size, channels, in_height, in_width = data.shape
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self._as_pair(self.stride)
        pad_h, pad_w = self._as_pair(self.padding)

        out_height = (in_height + 2 * pad_h - kernel_h) // stride_h + 1
        out_width = (in_width + 2 * pad_w - kernel_w) // stride_w + 1
        if out_height <= 0 or out_width <= 0:
            raise ValueError(
                f"Pooling window {self.kernel_size} does not fit input of spatial size "
                f"({in_height}, {in_width}) with padding {self.padding}"
            )

        if pad_h > 0 or pad_w > 0:
            # Cast to float first so that -inf is representable even when the
            # input holds integers.
            data = np.pad(
                np.asarray(data, dtype=float),
                ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w)),
                mode='constant',
                constant_values=-np.inf,
            )

        output = np.zeros((batch_size, channels, out_height, out_width))

        # Explicit loops, kept unvectorized for readability of the algorithm.
        for b in range(batch_size):
            for c in range(channels):
                for out_h in range(out_height):
                    in_h_start = out_h * stride_h
                    for out_w in range(out_width):
                        in_w_start = out_w * stride_w
                        max_val = -np.inf
                        for k_h in range(kernel_h):
                            for k_w in range(kernel_w):
                                max_val = max(
                                    max_val,
                                    data[b, c, in_h_start + k_h, in_w_start + k_w],
                                )
                        output[b, c, out_h, out_w] = max_val
        return output

    def forward(self, x):
        """
        Apply max pooling to a batch of feature maps.

        Args:
            x: Tensor-like object exposing ``shape`` and ``data``; ``shape``
                must be ``(batch, channels, height, width)``.

        Returns:
            Tensor of shape ``(batch, channels, out_height, out_width)`` with
            ``out = (in + 2 * padding - kernel) // stride + 1`` per spatial axis.
            It is created from a raw array with the Tensor defaults.

        Raises:
            ValueError: If ``x`` is not 4D or the window does not fit the input.

        Example:
            >>> pool = MaxPool2d(kernel_size=2, stride=2)
            >>> x = Tensor(np.random.randn(1, 3, 8, 8))
            >>> out = pool(x)
            >>> print(out.shape)  # (1, 3, 4, 4)
        """
        if len(x.shape) != 4:
            raise ValueError(f"Expected 4D input (batch, channels, height, width), got {x.shape}")
        return Tensor(self._pool_array(x.data))

    def parameters(self):
        """Return an empty list (pooling has no parameters)."""
        return []

    def __call__(self, x):
        """Enable ``layer(x)`` syntax."""
        return self.forward(x)
# %% ../../modules/09_spatial/spatial.ipynb 13
class AvgPool2d:
    """
    2D average pooling layer for spatial dimension reduction.

    Averages each spatial window, smoothing the features while shrinking the
    feature map. The layer has no parameters.

    Args:
        kernel_size: Size of the pooling window (int or ``(h, w)`` pair).
        stride: Step between windows (int or ``(h, w)`` pair). Defaults to the
            kernel size, which gives non-overlapping windows.
        padding: Zero-padding added to each border (int or ``(h, w)`` pair,
            default: 0). Padded zeros are counted in the average.
    """

    def __init__(self, kernel_size, stride=None, padding=0):
        """Store the pooling window, stride and padding."""
        super().__init__()
        self.kernel_size = self._as_pair(kernel_size)
        if stride is None:
            kernel_h, kernel_w = self.kernel_size
            # A square window keeps the historical int stride. A rectangular
            # window needs a per-axis stride, otherwise its windows overlap
            # (or skip cells) along the axis whose extent differs.
            self.stride = kernel_h if kernel_h == kernel_w else (kernel_h, kernel_w)
        else:
            self.stride = stride
        self.padding = padding

    @staticmethod
    def _as_pair(value):
        """Return ``value`` as an ``(h, w)`` tuple, duplicating a scalar."""
        if np.ndim(value) == 0:
            return (value, value)
        first, second = value
        return (first, second)

    def _pool_array(self, data):
        """
        Average-pool a raw 4D array indexed as ``[batch, channel, row, col]``.

        Returns:
            Array of shape ``(batch, channels, out_height, out_width)``.

        Raises:
            ValueError: If the window does not fit into the padded input.
        """
        batch_size, channels, in_height, in_width = data.shape
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self._as_pair(self.stride)
        pad_h, pad_w = self._as_pair(self.padding)

        out_height = (in_height + 2 * pad_h - kernel_h) // stride_h + 1
        out_width = (in_width + 2 * pad_w - kernel_w) // stride_w + 1
        if out_height <= 0 or out_width <= 0:
            raise ValueError(
                f"Pooling window {self.kernel_size} does not fit input of spatial size "
                f"({in_height}, {in_width}) with padding {self.padding}"
            )

        if pad_h > 0 or pad_w > 0:
            data = np.pad(
                data,
                ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w)),
                mode='constant',
                constant_values=0,
            )

        output = np.zeros((batch_size, channels, out_height, out_width))
        # The divisor is always the full window area, padded cells included.
        window_area = kernel_h * kernel_w

        # Explicit loops, kept unvectorized for readability of the algorithm.
        for b in range(batch_size):
            for c in range(channels):
                for out_h in range(out_height):
                    in_h_start = out_h * stride_h
                    for out_w in range(out_width):
                        in_w_start = out_w * stride_w
                        window_sum = 0.0
                        for k_h in range(kernel_h):
                            for k_w in range(kernel_w):
                                window_sum += data[b, c, in_h_start + k_h, in_w_start + k_w]
                        output[b, c, out_h, out_w] = window_sum / window_area
        return output

    def forward(self, x):
        """
        Apply average pooling to a batch of feature maps.

        Args:
            x: Tensor-like object exposing ``shape`` and ``data``; ``shape``
                must be ``(batch, channels, height, width)``.

        Returns:
            Tensor of shape ``(batch, channels, out_height, out_width)`` with
            ``out = (in + 2 * padding - kernel) // stride + 1`` per spatial axis.
            It is created from a raw array with the Tensor defaults.

        Raises:
            ValueError: If ``x`` is not 4D or the window does not fit the input.
        """
        if len(x.shape) != 4:
            raise ValueError(f"Expected 4D input (batch, channels, height, width), got {x.shape}")
        return Tensor(self._pool_array(x.data))

    def parameters(self):
        """Return an empty list (pooling has no parameters)."""
        return []

    def __call__(self, x):
        """Enable ``layer(x)`` syntax."""
        return self.forward(x)
# %% ../../modules/09_spatial/spatial.ipynb 21
class SimpleCNN:
    """
    Simple CNN demonstrating how the spatial layers fit together.

    Architecture:
    - Conv2d(3→16, 3×3, padding=1) + ReLU + MaxPool(2×2)
    - Conv2d(16→32, 3×3, padding=1) + ReLU + MaxPool(2×2)
    - Flatten

    No Linear classification layer is attached yet, so ``forward`` returns the
    flattened features instead of class scores.

    Args:
        num_classes: Number of target classes. Stored for a future
            classification layer; not used by ``forward``.
    """

    def __init__(self, num_classes=10):
        """
        Build the two convolution/pooling stages.

        Both convolutions use a 3×3 kernel with padding=1, so only the two 2×2
        poolings change the spatial size. For a 32×32 input that is
        32 → 16 → 8, giving 32 channels × 8 × 8 = 2048 flattened features.
        """
        super().__init__()
        self.conv1 = Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.pool1 = MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.pool2 = MaxPool2d(kernel_size=2, stride=2)

        self.num_classes = num_classes
        # Feature count after flattening, valid for 32×32 inputs only. It is
        # informational: forward() flattens whatever size it actually gets.
        self.flattened_size = 32 * 8 * 8

    def forward(self, x):
        """
        Run both conv → ReLU → pool stages and flatten the result.

        Args:
            x: Input accepted by ``conv1``, i.e. a 4D tensor with 3 channels.

        Returns:
            Tensor of shape ``(batch, features)`` built from the raw flattened
            array. In a complete model this would feed a Linear layer.
        """
        # First conv block
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool1(x)

        # Second conv block
        x = self.conv2(x)
        x = self.relu(x)
        x = self.pool2(x)

        # Collapse every non-batch axis into one feature axis.
        batch_size = x.shape[0]
        x_flat = x.data.reshape(batch_size, -1)
        return Tensor(x_flat)

    def relu(self, x):
        """Apply element-wise ReLU and wrap the result in a new Tensor."""
        return Tensor(np.maximum(0, x.data))

    def parameters(self):
        """Return the trainable parameters of both convolution layers."""
        params = []
        params.extend(self.conv1.parameters())
        params.extend(self.conv2.parameters())
        # Linear layer parameters would be added here
        return params

    def __call__(self, x):
        """Enable ``model(x)`` syntax."""
        return self.forward(x)