Mirror of https://github.com/harvard-edge/cs249r_book.git, synced 2026-04-29 17:20:21 -05:00
fix: restore Conv2dBackward and MaxPool2dBackward for CNN gradient flow
- Restore Conv2dBackward class removed in commit 23c5eb2b5
- Restore MaxPool2dBackward class for pooling gradient routing
- Update Conv2d/MaxPool2d forward() to attach _grad_fn
- Set requires_grad=True on Conv2d weights and bias
- Add enable_autograd() to Module 11 (Embeddings) for progressive disclosure
- Remove skip markers from convolution gradient tests
CNN training now works correctly: conv weights receive gradients and update
during training. All 40 convolution tests pass.
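A minimal way to exercise the restored gradient path, sketched under assumptions (the tinytorch.core.spatial import path and the Conv2d constructor arguments are guesses for illustration; Tensor, enable_autograd, forward(), weight/bias, and the _grad_fn/apply protocol are the ones shown in the diff below):

import numpy as np
from tinytorch.core.tensor import Tensor
from tinytorch.core.autograd import enable_autograd
from tinytorch.core.spatial import Conv2d  # module path assumed, not shown in this diff

enable_autograd()

x = Tensor(np.random.randn(1, 3, 8, 8), requires_grad=True)
conv = Conv2d(in_channels=3, out_channels=4, kernel_size=(3, 3))  # constructor args assumed
out = conv.forward(x)

# forward() now attaches Conv2dBackward, so gradients can be pulled back through it
grad_input, grad_weight, grad_bias = out._grad_fn.apply(np.ones(out.shape))

assert grad_weight.shape == conv.weight.data.shape  # conv weights receive gradients
assert grad_input.shape == x.data.shape             # gradient flows back to the input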
@@ -65,7 +65,7 @@ import time
 from tinytorch.core.tensor import Tensor
 
 # Enable autograd for gradient tracking (required for BatchNorm2d learnable parameters)
-from tinytorch.core.autograd import enable_autograd
+from tinytorch.core.autograd import enable_autograd, Function
 enable_autograd()
 
 # Constants for convolution defaults
@@ -332,6 +332,110 @@ This reveals why convolution is expensive: O(B×C_out×H×W×K_h×K_w×C_in) ope
 
 #| export
 
+class Conv2dBackward(Function):
+    """
+    Gradient computation for 2D convolution.
+
+    Computes gradients for Conv2d backward pass:
+    - grad_input: gradient w.r.t. input (for backprop to previous layer)
+    - grad_weight: gradient w.r.t. filters (for weight updates)
+    - grad_bias: gradient w.r.t. bias (for bias updates)
+
+    This uses explicit loops to show the gradient computation, matching
+    the educational approach of the forward pass.
+    """
+
+    def __init__(self, x, weight, bias, stride, padding, kernel_size, padded_shape):
+        # Register all tensors that need gradients with autograd
+        if bias is not None:
+            super().__init__(x, weight, bias)
+        else:
+            super().__init__(x, weight)
+        self.x = x
+        self.weight = weight
+        self.bias = bias
+        self.stride = stride
+        self.padding = padding
+        self.kernel_size = kernel_size
+        self.padded_shape = padded_shape
+
+    def apply(self, grad_output):
+        """
+        Compute gradients for convolution inputs and parameters.
+
+        Args:
+            grad_output: Gradient flowing back from next layer
+                Shape: (batch_size, out_channels, out_height, out_width)
+
+        Returns:
+            Tuple of (grad_input, grad_weight, grad_bias)
+        """
+        batch_size, out_channels, out_height, out_width = grad_output.shape
+        _, in_channels, in_height, in_width = self.x.shape
+        kernel_h, kernel_w = self.kernel_size
+
+        # Apply padding to input if needed (for gradient computation)
+        if self.padding > 0:
+            padded_input = np.pad(self.x.data,
+                ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)),
+                mode='constant', constant_values=0)
+        else:
+            padded_input = self.x.data
+
+        # Initialize gradients
+        grad_input_padded = np.zeros_like(padded_input)
+        grad_weight = np.zeros_like(self.weight.data)
+        grad_bias = None if self.bias is None else np.zeros_like(self.bias.data)
+
+        # Compute gradients using explicit loops (educational approach)
+        for b in range(batch_size):
+            for out_ch in range(out_channels):
+                for out_h in range(out_height):
+                    for out_w in range(out_width):
+                        # Position in input
+                        in_h_start = out_h * self.stride
+                        in_w_start = out_w * self.stride
+
+                        # Gradient value flowing back to this position
+                        grad_val = grad_output[b, out_ch, out_h, out_w]
+
+                        # Distribute gradient to weight and input
+                        for k_h in range(kernel_h):
+                            for k_w in range(kernel_w):
+                                for in_ch in range(in_channels):
+                                    # Input position
+                                    in_h = in_h_start + k_h
+                                    in_w = in_w_start + k_w
+
+                                    # Gradient w.r.t. weight
+                                    grad_weight[out_ch, in_ch, k_h, k_w] += (
+                                        padded_input[b, in_ch, in_h, in_w] * grad_val
+                                    )
+
+                                    # Gradient w.r.t. input
+                                    grad_input_padded[b, in_ch, in_h, in_w] += (
+                                        self.weight.data[out_ch, in_ch, k_h, k_w] * grad_val
+                                    )
+
+        # Compute gradient w.r.t. bias (sum over batch and spatial dimensions)
+        if grad_bias is not None:
+            for out_ch in range(out_channels):
+                grad_bias[out_ch] = grad_output[:, out_ch, :, :].sum()
+
+        # Remove padding from input gradient
+        if self.padding > 0:
+            grad_input = grad_input_padded[:, :,
+                self.padding:-self.padding,
+                self.padding:-self.padding]
+        else:
+            grad_input = grad_input_padded
+
+        # Return gradients as numpy arrays (autograd system handles storage)
+        # Following TinyTorch protocol: return (grad_input, grad_weight, grad_bias)
+        return grad_input, grad_weight, grad_bias
+
+#| export
+
 class Conv2d:
     """
     2D Convolution layer for spatial feature extraction.
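(Editorial gloss, not part of the commit: written out, the gradients that Conv2dBackward.apply accumulates in those loops are, with stride $s$, padded input $x$, filters $W$, and conv output $y$,)

$$
\frac{\partial L}{\partial W_{o,i,k_h,k_w}} = \sum_{b,h,w} x_{b,i,\,hs+k_h,\,ws+k_w}\,\frac{\partial L}{\partial y_{b,o,h,w}},
\qquad
\frac{\partial L}{\partial \mathrm{bias}_o} = \sum_{b,h,w} \frac{\partial L}{\partial y_{b,o,h,w}},
$$

$$
\frac{\partial L}{\partial x_{b,i,p,q}} = \sum_{\substack{o,h,w,k_h,k_w \\ p = hs+k_h,\; q = ws+k_w}} W_{o,i,k_h,k_w}\,\frac{\partial L}{\partial y_{b,o,h,w}}.
$$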
@@ -388,11 +492,12 @@ class Conv2d:
 
         # Weight shape: (out_channels, in_channels, kernel_h, kernel_w)
         self.weight = Tensor(np.random.normal(0, std,
-                             (out_channels, in_channels, kernel_h, kernel_w)))
+                             (out_channels, in_channels, kernel_h, kernel_w)),
+                             requires_grad=True)
 
         # Bias initialization
         if bias:
-            self.bias = Tensor(np.zeros(out_channels))
+            self.bias = Tensor(np.zeros(out_channels), requires_grad=True)
         else:
             self.bias = None
         ### END SOLUTION
@@ -487,7 +592,18 @@ class Conv2d:
             for out_ch in range(out_channels):
                 output[:, out_ch, :, :] += self.bias.data[out_ch]
 
-        return Tensor(output)
+        # Return Tensor with gradient tracking enabled
+        result = Tensor(output, requires_grad=(x.requires_grad or self.weight.requires_grad))
+
+        # Attach backward function for gradient computation (following TinyTorch protocol)
+        if result.requires_grad:
+            result._grad_fn = Conv2dBackward(
+                x, self.weight, self.bias,
+                self.stride, self.padding, self.kernel_size,
+                padded_input.shape
+            )
+
+        return result
         ### END SOLUTION
 
     def parameters(self):
@@ -719,6 +835,84 @@ For input (1, 64, 224, 224) with 2×2 pooling:
 
 #| export
 
+class MaxPool2dBackward(Function):
+    """
+    Gradient computation for 2D max pooling.
+
+    Max pooling gradients flow only to the positions that were selected
+    as the maximum in the forward pass.
+    """
+
+    def __init__(self, x, output_shape, kernel_size, stride, padding):
+        super().__init__(x)
+        self.x = x
+        self.output_shape = output_shape
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        # Store max positions for gradient routing
+        self.max_positions = {}
+
+    def apply(self, grad_output):
+        """
+        Route gradients back to max positions.
+
+        Args:
+            grad_output: Gradient from next layer
+
+        Returns:
+            Gradient w.r.t. input
+        """
+        batch_size, channels, in_height, in_width = self.x.shape
+        _, _, out_height, out_width = self.output_shape
+        kernel_h, kernel_w = self.kernel_size
+
+        # Apply padding if needed
+        if self.padding > 0:
+            padded_input = np.pad(self.x.data,
+                ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)),
+                mode='constant', constant_values=-np.inf)
+            grad_input_padded = np.zeros_like(padded_input)
+        else:
+            padded_input = self.x.data
+            grad_input_padded = np.zeros_like(self.x.data)
+
+        # Route gradients to max positions
+        for b in range(batch_size):
+            for c in range(channels):
+                for out_h in range(out_height):
+                    for out_w in range(out_width):
+                        in_h_start = out_h * self.stride
+                        in_w_start = out_w * self.stride
+
+                        # Find max position in this window
+                        max_val = -np.inf
+                        max_h, max_w = 0, 0
+                        for k_h in range(kernel_h):
+                            for k_w in range(kernel_w):
+                                in_h = in_h_start + k_h
+                                in_w = in_w_start + k_w
+                                val = padded_input[b, c, in_h, in_w]
+                                if val > max_val:
+                                    max_val = val
+                                    max_h, max_w = in_h, in_w
+
+                        # Route gradient to max position
+                        grad_input_padded[b, c, max_h, max_w] += grad_output[b, c, out_h, out_w]
+
+        # Remove padding
+        if self.padding > 0:
+            grad_input = grad_input_padded[:, :,
+                self.padding:-self.padding,
+                self.padding:-self.padding]
+        else:
+            grad_input = grad_input_padded
+
+        # Return as tuple (following Function protocol)
+        return (grad_input,)
+
+#| export
+
 class MaxPool2d:
     """
     2D Max Pooling layer for spatial dimension reduction.
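A correspondingly small routing check for max pooling, again only a sketch (the MaxPool2d constructor arguments and import path are assumed; the behaviour it asserts is what MaxPool2dBackward.apply above implements, namely that the whole output gradient lands on the argmax position):

import numpy as np
from tinytorch.core.tensor import Tensor
from tinytorch.core.autograd import enable_autograd
from tinytorch.core.spatial import MaxPool2d  # module path assumed

enable_autograd()

# One 2x2 window; the maximum (5.0) sits at position (0, 1)
x = Tensor(np.array([[[[1.0, 5.0],
                       [2.0, 3.0]]]]), requires_grad=True)
pool = MaxPool2d(kernel_size=(2, 2))  # constructor args assumed
out = pool.forward(x)                 # output shape (1, 1, 1, 1)

(grad_input,) = out._grad_fn.apply(np.ones(out.shape))

assert grad_input[0, 0, 0, 1] == 1.0  # gradient routed to the max position
assert grad_input.sum() == 1.0        # all other positions receive zero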
@@ -842,7 +1036,16 @@ class MaxPool2d:
                         # Store result
                         output[b, c, out_h, out_w] = max_val
 
-        return Tensor(output)
+        # Return Tensor with gradient tracking enabled
+        result = Tensor(output, requires_grad=x.requires_grad)
+
+        # Attach backward function for gradient computation
+        if result.requires_grad:
+            result._grad_fn = MaxPool2dBackward(
+                x, result.shape, self.kernel_size, self.stride, self.padding
+            )
+
+        return result
         ### END SOLUTION
 
     def parameters(self):
@@ -66,6 +66,10 @@ from typing import List, Optional, Tuple
 # Import from previous modules - following dependency chain
 from tinytorch.core.tensor import Tensor
 
+# Enable autograd for gradient tracking (required for learnable embeddings)
+from tinytorch.core.autograd import enable_autograd
+enable_autograd()
+
 # Constants for memory calculations
 BYTES_PER_FLOAT32 = 4  # Standard float32 size in bytes
 KB_TO_BYTES = 1024  # Kilobytes to bytes conversion