# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/07_autograd/autograd_dev.ipynb.

# %% auto 0
__all__ = ['Variable', 'add', 'multiply', 'subtract', 'divide', 'relu_with_grad', 'sigmoid_with_grad', 'power', 'exp', 'log',
           'sum_all', 'mean', 'clip_gradients', 'collect_parameters', 'zero_gradients']

# %% ../../modules/source/07_autograd/autograd_dev.ipynb 1
import numpy as np
import sys
from typing import Union, List, Tuple, Optional, Any, Callable
from collections import defaultdict

# Import our existing components
from .tensor import Tensor

# %% ../../modules/source/07_autograd/autograd_dev.ipynb 6
class Variable:
    """
    Variable: Tensor wrapper with automatic differentiation capabilities.

    The fundamental class for gradient computation in TinyTorch.
    Wraps Tensor objects and tracks computational history for backpropagation.
    """

    def __init__(self, data: Union[Tensor, np.ndarray, list, float, int],
                 requires_grad: bool = True, grad_fn: Optional[Callable] = None):
        """
        Create a Variable with gradient tracking.

        Args:
            data: The data to wrap (will be converted to Tensor)
            requires_grad: Whether to compute gradients for this Variable
            grad_fn: Function to compute gradients (None for leaf nodes)

        TODO: Implement Variable initialization with gradient tracking.

        APPROACH:
        1. Convert data to Tensor if it's not already
        2. Store the tensor data
        3. Set gradient tracking flag
        4. Initialize gradient to None (will be computed later)
        5. Store the gradient function for backward pass
        6. Track if this is a leaf node (no grad_fn)

        EXAMPLE:
        Variable(5.0) → Variable wrapping Tensor(5.0)
        Variable([1, 2, 3]) → Variable wrapping Tensor([1, 2, 3])

        HINTS:
        - Use isinstance() to check if data is already a Tensor
        - Store requires_grad, grad_fn, and is_leaf flags
        - Initialize self.grad to None
        - A leaf node has grad_fn=None
        """
        ### BEGIN SOLUTION
        # Convert data to Tensor if needed
        if isinstance(data, Tensor):
            self.data = data
        else:
            self.data = Tensor(data)

        # Set gradient tracking
        self.requires_grad = requires_grad
        self.grad = None  # Will be initialized when needed
        self.grad_fn = grad_fn
        self.is_leaf = grad_fn is None

        # For computational graph
        self._backward_hooks = []
        ### END SOLUTION

    @property
    def shape(self) -> Tuple[int, ...]:
        """Get the shape of the underlying tensor."""
        return self.data.shape

    @property
    def size(self) -> int:
        """Get the total number of elements."""
        return self.data.size

    def __repr__(self) -> str:
        """String representation of the Variable."""
        grad_str = f", grad_fn={self.grad_fn.__name__}" if self.grad_fn else ""
        return f"Variable({self.data.data.tolist()}, requires_grad={self.requires_grad}{grad_str})"

    def backward(self, gradient: Optional['Variable'] = None) -> None:
        """
        Compute gradients using backpropagation.

        Args:
            gradient: The gradient to backpropagate (defaults to ones)

        TODO: Implement backward propagation.

        APPROACH:
        1. If gradient is None, create a gradient of ones with same shape
        2. If this Variable doesn't require gradients, return early
        3. If this is a leaf node, accumulate the gradient
        4. If this has a grad_fn, call it to propagate gradients

        EXAMPLE:
        x = Variable(5.0)
        y = x * 2
        y.backward()  # Computes x.grad = 2.0

        HINTS:
        - Use np.ones_like() to create default gradient
        - Accumulate gradients with += for leaf nodes
        - Call self.grad_fn(gradient) for non-leaf nodes
        """
        ### BEGIN SOLUTION
        # Default gradient is ones
        if gradient is None:
            gradient = Variable(np.ones_like(self.data.data))

        # Skip if gradients not required
        if not self.requires_grad:
            return

        # Accumulate gradient for leaf nodes
        if self.is_leaf:
            if self.grad is None:
                self.grad = Variable(np.zeros_like(self.data.data))
            self.grad.data._data += gradient.data.data
        else:
            # Propagate gradients through grad_fn
            if self.grad_fn is not None:
                self.grad_fn(gradient)
        ### END SOLUTION

    def zero_grad(self) -> None:
        """Zero out the gradient."""
        if self.grad is not None:
            self.grad.data._data.fill(0)

    # Arithmetic operations with gradient tracking
    def __add__(self, other: Union['Variable', float, int]) -> 'Variable':
        """Addition with gradient tracking."""
        return add(self, other)

    def __mul__(self, other: Union['Variable', float, int]) -> 'Variable':
        """Multiplication with gradient tracking."""
        return multiply(self, other)

    def __sub__(self, other: Union['Variable', float, int]) -> 'Variable':
        """Subtraction with gradient tracking."""
        return subtract(self, other)

    def __truediv__(self, other: Union['Variable', float, int]) -> 'Variable':
        """Division with gradient tracking."""
        return divide(self, other)

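# --- Illustrative usage (a sketch, kept as comments so the autogenerated export stays import-safe) ---
# How Variable is intended to compose with the operation helpers defined below,
# assuming the solution code above is filled in; the values are arbitrary:
#
#   x = Variable(2.0)
#   y = Variable(3.0)
#   z = x * y + x          # builds a small computational graph via __mul__ / __add__
#   z.backward()           # propagates gradients back to the leaf Variables
#   # expected: x.grad ≈ y + 1 = 4.0 (x is used twice), y.grad ≈ x = 2.0
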
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 8
def add(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
    """
    Addition operation with gradient tracking.

    Args:
        a: First operand
        b: Second operand

    Returns:
        Variable with sum and gradient function

    TODO: Implement addition with gradient computation.

    APPROACH:
    1. Convert inputs to Variables if needed
    2. Compute forward pass: result = a + b
    3. Create gradient function that distributes gradients
    4. Return Variable with result and grad_fn

    MATHEMATICAL RULE:
    If z = x + y, then dz/dx = 1, dz/dy = 1

    EXAMPLE:
    x = Variable(2.0), y = Variable(3.0)
    z = add(x, y)  # z.data = 5.0
    z.backward()  # x.grad = 1.0, y.grad = 1.0

    HINTS:
    - Use isinstance() to check if inputs are Variables
    - Create a closure that captures a and b
    - In grad_fn, call a.backward() and b.backward() with appropriate gradients
    """
    ### BEGIN SOLUTION
    # Convert to Variables if needed
    if not isinstance(a, Variable):
        a = Variable(a, requires_grad=False)
    if not isinstance(b, Variable):
        b = Variable(b, requires_grad=False)

    # Forward pass
    result_data = a.data + b.data

    # Create gradient function
    def grad_fn(grad_output):
        # Addition distributes gradients equally
        if a.requires_grad:
            a.backward(grad_output)
        if b.requires_grad:
            b.backward(grad_output)

    # Determine if result requires gradients
    requires_grad = a.requires_grad or b.requires_grad

    return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

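# A quick note on gradient accumulation (a sketch, assuming the implementation above):
# when the same leaf Variable appears more than once, backward() accumulates into .grad.
#
#   x = Variable(3.0)
#   z = add(x, x)          # z = x + x
#   z.backward()           # x.grad ≈ 2.0 (1.0 contributed by each use of x)
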
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 9
def multiply(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
    """
    Multiplication operation with gradient tracking.

    Args:
        a: First operand
        b: Second operand

    Returns:
        Variable with product and gradient function

    TODO: Implement multiplication with gradient computation.

    APPROACH:
    1. Convert inputs to Variables if needed
    2. Compute forward pass: result = a * b
    3. Create gradient function using product rule
    4. Return Variable with result and grad_fn

    MATHEMATICAL RULE:
    If z = x * y, then dz/dx = y, dz/dy = x

    EXAMPLE:
    x = Variable(2.0), y = Variable(3.0)
    z = multiply(x, y)  # z.data = 6.0
    z.backward()  # x.grad = 3.0, y.grad = 2.0

    HINTS:
    - Store a.data and b.data for gradient computation
    - In grad_fn, multiply incoming gradient by the other operand
    - Handle broadcasting if shapes are different
    """
    ### BEGIN SOLUTION
    # Convert to Variables if needed
    if not isinstance(a, Variable):
        a = Variable(a, requires_grad=False)
    if not isinstance(b, Variable):
        b = Variable(b, requires_grad=False)

    # Forward pass
    result_data = a.data * b.data

    # Create gradient function
    def grad_fn(grad_output):
        # Product rule: d(xy)/dx = y, d(xy)/dy = x
        if a.requires_grad:
            a_grad = Variable(grad_output.data * b.data)
            a.backward(a_grad)
        if b.requires_grad:
            b_grad = Variable(grad_output.data * a.data)
            b.backward(b_grad)

    # Determine if result requires gradients
    requires_grad = a.requires_grad or b.requires_grad

    return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

# %% ../../modules/source/07_autograd/autograd_dev.ipynb 10
def subtract(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
    """
    Subtraction operation with gradient tracking.

    Args:
        a: First operand (minuend)
        b: Second operand (subtrahend)

    Returns:
        Variable with difference and gradient function

    TODO: Implement subtraction with gradient computation.

    APPROACH:
    1. Convert inputs to Variables if needed
    2. Compute forward pass: result = a - b
    3. Create gradient function with correct signs
    4. Return Variable with result and grad_fn

    MATHEMATICAL RULE:
    If z = x - y, then dz/dx = 1, dz/dy = -1

    EXAMPLE:
    x = Variable(5.0), y = Variable(3.0)
    z = subtract(x, y)  # z.data = 2.0
    z.backward()  # x.grad = 1.0, y.grad = -1.0

    HINTS:
    - Forward pass is straightforward: a - b
    - Gradient for a is positive, for b is negative
    - Remember to negate the gradient for b
    """
    ### BEGIN SOLUTION
    # Convert to Variables if needed
    if not isinstance(a, Variable):
        a = Variable(a, requires_grad=False)
    if not isinstance(b, Variable):
        b = Variable(b, requires_grad=False)

    # Forward pass
    result_data = a.data - b.data

    # Create gradient function
    def grad_fn(grad_output):
        # Subtraction rule: d(x-y)/dx = 1, d(x-y)/dy = -1
        if a.requires_grad:
            a.backward(grad_output)
        if b.requires_grad:
            b_grad = Variable(-grad_output.data.data)
            b.backward(b_grad)

    # Determine if result requires gradients
    requires_grad = a.requires_grad or b.requires_grad

    return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

# %% ../../modules/source/07_autograd/autograd_dev.ipynb 11
def divide(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
    """
    Division operation with gradient tracking.

    Args:
        a: Numerator
        b: Denominator

    Returns:
        Variable with quotient and gradient function

    TODO: Implement division with gradient computation.

    APPROACH:
    1. Convert inputs to Variables if needed
    2. Compute forward pass: result = a / b
    3. Create gradient function using quotient rule
    4. Return Variable with result and grad_fn

    MATHEMATICAL RULE:
    If z = x / y, then dz/dx = 1/y, dz/dy = -x/y²

    EXAMPLE:
    x = Variable(6.0), y = Variable(2.0)
    z = divide(x, y)  # z.data = 3.0
    z.backward()  # x.grad = 0.5, y.grad = -1.5

    HINTS:
    - Forward pass: a.data / b.data
    - Gradient for a: grad_output / b.data
    - Gradient for b: -grad_output * a.data / (b.data ** 2)
    - Be careful with numerical stability
    """
    ### BEGIN SOLUTION
    # Convert to Variables if needed
    if not isinstance(a, Variable):
        a = Variable(a, requires_grad=False)
    if not isinstance(b, Variable):
        b = Variable(b, requires_grad=False)

    # Forward pass
    result_data = a.data / b.data

    # Create gradient function
    def grad_fn(grad_output):
        # Quotient rule: d(x/y)/dx = 1/y, d(x/y)/dy = -x/y²
        if a.requires_grad:
            a_grad = Variable(grad_output.data.data / b.data.data)
            a.backward(a_grad)
        if b.requires_grad:
            b_grad = Variable(-grad_output.data.data * a.data.data / (b.data.data ** 2))
            b.backward(b_grad)

    # Determine if result requires gradients
    requires_grad = a.requires_grad or b.requires_grad

    return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

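# --- Illustrative chain-rule check across the arithmetic ops (a comment sketch) ---
# Assuming add/multiply/subtract/divide behave as specified above, composing them
# should follow the quotient and product rules end to end; the values are arbitrary:
#
#   x = Variable(4.0)
#   y = Variable(2.0)
#   z = divide(multiply(x, y), subtract(x, y))   # z = x*y / (x - y)
#   z.backward()
#   # quotient rule: dz/dx = -y² / (x - y)² = -1.0, dz/dy = x² / (x - y)² = 4.0
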
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 17
def relu_with_grad(x: Variable) -> Variable:
    """
    ReLU activation with gradient tracking.

    Args:
        x: Input Variable

    Returns:
        Variable with ReLU applied and gradient function

    TODO: Implement ReLU with gradient computation.

    APPROACH:
    1. Compute forward pass: max(0, x)
    2. Create gradient function using ReLU derivative
    3. Return Variable with result and grad_fn

    MATHEMATICAL RULE:
    f(x) = max(0, x)
    f'(x) = 1 if x > 0, else 0

    EXAMPLE:
    x = Variable([-1.0, 0.0, 1.0])
    y = relu_with_grad(x)  # y.data = [0.0, 0.0, 1.0]
    y.backward()  # x.grad = [0.0, 0.0, 1.0]

    HINTS:
    - Use np.maximum(0, x.data.data) for forward pass
    - Use (x.data.data > 0) for gradient mask
    - Only propagate gradients where input was positive
    """
    ### BEGIN SOLUTION
    # Forward pass
    result_data = Tensor(np.maximum(0, x.data.data))

    # Create gradient function
    def grad_fn(grad_output):
        if x.requires_grad:
            # ReLU derivative: 1 if x > 0, else 0
            mask = (x.data.data > 0).astype(np.float32)
            x_grad = Variable(grad_output.data.data * mask)
            x.backward(x_grad)

    return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

# %% ../../modules/source/07_autograd/autograd_dev.ipynb 18
def sigmoid_with_grad(x: Variable) -> Variable:
    """
    Sigmoid activation with gradient tracking.

    Args:
        x: Input Variable

    Returns:
        Variable with sigmoid applied and gradient function

    TODO: Implement sigmoid with gradient computation.

    APPROACH:
    1. Compute forward pass: 1 / (1 + exp(-x))
    2. Create gradient function using sigmoid derivative
    3. Return Variable with result and grad_fn

    MATHEMATICAL RULE:
    f(x) = 1 / (1 + exp(-x))
    f'(x) = f(x) * (1 - f(x))

    EXAMPLE:
    x = Variable(0.0)
    y = sigmoid_with_grad(x)  # y.data = 0.5
    y.backward()  # x.grad = 0.25

    HINTS:
    - Use np.clip for numerical stability
    - Store sigmoid output for gradient computation
    - Gradient is sigmoid * (1 - sigmoid)
    """
    ### BEGIN SOLUTION
    # Forward pass with numerical stability
    clipped = np.clip(x.data.data, -500, 500)
    sigmoid_output = 1.0 / (1.0 + np.exp(-clipped))
    result_data = Tensor(sigmoid_output)

    # Create gradient function
    def grad_fn(grad_output):
        if x.requires_grad:
            # Sigmoid derivative: sigmoid * (1 - sigmoid)
            sigmoid_grad = sigmoid_output * (1.0 - sigmoid_output)
            x_grad = Variable(grad_output.data.data * sigmoid_grad)
            x.backward(x_grad)

    return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

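# --- Illustrative finite-difference check for the activations (a comment sketch) ---
# One way to sanity-check relu_with_grad / sigmoid_with_grad is to compare the analytic
# gradient against a numerical estimate; the point 0.5 and eps are arbitrary choices:
#
#   x = Variable(0.5)
#   sigmoid_with_grad(x).backward()   # analytic: sigma(0.5) * (1 - sigma(0.5))
#   eps = 1e-5
#   numeric = (1/(1 + np.exp(-(0.5 + eps))) - 1/(1 + np.exp(-(0.5 - eps)))) / (2 * eps)
#   # x.grad and numeric should agree to several decimal places
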
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 23
def power(base: Variable, exponent: Union[float, int]) -> Variable:
    """
    Power operation with gradient tracking: base^exponent.

    Args:
        base: Base Variable
        exponent: Exponent (scalar)

    Returns:
        Variable with power applied and gradient function

    TODO: Implement power operation with gradient computation.

    APPROACH:
    1. Compute forward pass: base^exponent
    2. Create gradient function using power rule
    3. Return Variable with result and grad_fn

    MATHEMATICAL RULE:
    If z = x^n, then dz/dx = n * x^(n-1)

    EXAMPLE:
    x = Variable(2.0)
    y = power(x, 3)  # y.data = 8.0
    y.backward()  # x.grad = 3 * 2^2 = 12.0

    HINTS:
    - Use np.power() for forward pass
    - Power rule: gradient = exponent * base^(exponent-1)
    - Handle edge cases like exponent=0 or base=0
    """
    ### BEGIN SOLUTION
    # Forward pass
    result_data = Tensor(np.power(base.data.data, exponent))

    # Create gradient function
    def grad_fn(grad_output):
        if base.requires_grad:
            # Power rule: d(x^n)/dx = n * x^(n-1)
            if exponent == 0:
                # Special case: derivative of constant is 0
                base_grad = Variable(np.zeros_like(base.data.data))
            else:
                base_grad_data = exponent * np.power(base.data.data, exponent - 1)
                base_grad = Variable(grad_output.data.data * base_grad_data)
            base.backward(base_grad)

    return Variable(result_data, requires_grad=base.requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

# %% ../../modules/source/07_autograd/autograd_dev.ipynb 24
def exp(x: Variable) -> Variable:
    """
    Exponential operation with gradient tracking: e^x.

    Args:
        x: Input Variable

    Returns:
        Variable with exponential applied and gradient function

    TODO: Implement exponential operation with gradient computation.

    APPROACH:
    1. Compute forward pass: e^x
    2. Create gradient function using exponential derivative
    3. Return Variable with result and grad_fn

    MATHEMATICAL RULE:
    If z = e^x, then dz/dx = e^x

    EXAMPLE:
    x = Variable(1.0)
    y = exp(x)  # y.data = e^1 ≈ 2.718
    y.backward()  # x.grad = e^1 ≈ 2.718

    HINTS:
    - Use np.exp() for forward pass
    - Exponential derivative is itself: d(e^x)/dx = e^x
    - Store result for gradient computation
    """
    ### BEGIN SOLUTION
    # Forward pass
    exp_result = np.exp(x.data.data)
    result_data = Tensor(exp_result)

    # Create gradient function
    def grad_fn(grad_output):
        if x.requires_grad:
            # Exponential derivative: d(e^x)/dx = e^x
            x_grad = Variable(grad_output.data.data * exp_result)
            x.backward(x_grad)

    return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

# %% ../../modules/source/07_autograd/autograd_dev.ipynb 25
def log(x: Variable) -> Variable:
    """
    Natural logarithm operation with gradient tracking: ln(x).

    Args:
        x: Input Variable

    Returns:
        Variable with logarithm applied and gradient function

    TODO: Implement logarithm operation with gradient computation.

    APPROACH:
    1. Compute forward pass: ln(x)
    2. Create gradient function using logarithm derivative
    3. Return Variable with result and grad_fn

    MATHEMATICAL RULE:
    If z = ln(x), then dz/dx = 1/x

    EXAMPLE:
    x = Variable(2.0)
    y = log(x)  # y.data = ln(2) ≈ 0.693
    y.backward()  # x.grad = 1/2 = 0.5

    HINTS:
    - Use np.log() for forward pass
    - Logarithm derivative: d(ln(x))/dx = 1/x
    - Handle numerical stability for small x
    """
    ### BEGIN SOLUTION
    # Forward pass with numerical stability
    clipped_x = np.clip(x.data.data, 1e-8, np.inf)  # Avoid log(0)
    result_data = Tensor(np.log(clipped_x))

    # Create gradient function
    def grad_fn(grad_output):
        if x.requires_grad:
            # Logarithm derivative: d(ln(x))/dx = 1/x
            x_grad = Variable(grad_output.data.data / clipped_x)
            x.backward(x_grad)

    return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

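# --- Illustrative composition of exp and log (a comment sketch) ---
# Assuming the implementations above, log(exp(x)) should recover x, and by the
# chain rule its gradient is (1/e^x) * e^x = 1 everywhere; the value is arbitrary:
#
#   x = Variable(1.5)
#   y = log(exp(x))
#   y.backward()           # expected: x.grad ≈ 1.0
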
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 26
def sum_all(x: Variable) -> Variable:
    """
    Sum all elements operation with gradient tracking.

    Args:
        x: Input Variable

    Returns:
        Variable with sum and gradient function

    TODO: Implement sum operation with gradient computation.

    APPROACH:
    1. Compute forward pass: sum of all elements
    2. Create gradient function that broadcasts gradient back
    3. Return Variable with result and grad_fn

    MATHEMATICAL RULE:
    If z = sum(x), then dz/dx_i = 1 for all i

    EXAMPLE:
    x = Variable([[1, 2], [3, 4]])
    y = sum_all(x)  # y.data = 10
    y.backward()  # x.grad = [[1, 1], [1, 1]]

    HINTS:
    - Use np.sum() for forward pass
    - Gradient is ones with same shape as input
    - This is used for loss computation
    """
    ### BEGIN SOLUTION
    # Forward pass
    result_data = Tensor(np.sum(x.data.data))

    # Create gradient function
    def grad_fn(grad_output):
        if x.requires_grad:
            # Sum gradient: broadcasts to all elements
            x_grad = Variable(grad_output.data.data * np.ones_like(x.data.data))
            x.backward(x_grad)

    return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

# %% ../../modules/source/07_autograd/autograd_dev.ipynb 27
def mean(x: Variable) -> Variable:
    """
    Mean operation with gradient tracking.

    Args:
        x: Input Variable

    Returns:
        Variable with mean and gradient function

    TODO: Implement mean operation with gradient computation.

    APPROACH:
    1. Compute forward pass: mean of all elements
    2. Create gradient function that distributes gradient evenly
    3. Return Variable with result and grad_fn

    MATHEMATICAL RULE:
    If z = mean(x), then dz/dx_i = 1/n for all i (where n is number of elements)

    EXAMPLE:
    x = Variable([[1, 2], [3, 4]])
    y = mean(x)  # y.data = 2.5
    y.backward()  # x.grad = [[0.25, 0.25], [0.25, 0.25]]

    HINTS:
    - Use np.mean() for forward pass
    - Gradient is 1/n for each element
    - This is commonly used for loss computation
    """
    ### BEGIN SOLUTION
    # Forward pass
    result_data = Tensor(np.mean(x.data.data))

    # Create gradient function
    def grad_fn(grad_output):
        if x.requires_grad:
            # Mean gradient: 1/n for each element
            n = x.data.size
            x_grad = Variable(grad_output.data.data * np.ones_like(x.data.data) / n)
            x.backward(x_grad)

    return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

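# --- Illustrative loss built from these reductions (a comment sketch) ---
# A mean-squared-error style scalar loss, assuming the ops above are implemented;
# `pred` and `target` are hypothetical example values, not part of this module:
#
#   pred = Variable([2.0, 3.0])
#   target = Variable([1.0, 5.0], requires_grad=False)
#   loss = mean(power(subtract(pred, target), 2))   # mean((pred - target)^2)
#   loss.backward()        # pred.grad ≈ 2 * (pred - target) / n = [1.0, -2.0]
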
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 29
def clip_gradients(variables: List[Variable], max_norm: float = 1.0) -> None:
    """
    Clip gradients to prevent exploding gradients.

    Args:
        variables: List of Variables to clip gradients for
        max_norm: Maximum gradient norm allowed

    TODO: Implement gradient clipping.

    APPROACH:
    1. Compute total gradient norm across all variables
    2. If norm exceeds max_norm, scale all gradients down
    3. Modify gradients in-place

    MATHEMATICAL RULE:
    If ||g|| > max_norm, then g := g * (max_norm / ||g||)

    EXAMPLE:
    variables = [w1, w2, b1, b2]
    clip_gradients(variables, max_norm=1.0)

    HINTS:
    - Compute L2 norm of all gradients combined
    - Scale factor = max_norm / total_norm
    - Only clip if total_norm > max_norm
    """
    ### BEGIN SOLUTION
    # Compute total gradient norm
    total_norm = 0.0
    for var in variables:
        if var.grad is not None:
            total_norm += np.sum(var.grad.data.data ** 2)
    total_norm = np.sqrt(total_norm)

    # Clip if necessary
    if total_norm > max_norm:
        scale_factor = max_norm / total_norm
        for var in variables:
            if var.grad is not None:
                var.grad.data._data *= scale_factor
    ### END SOLUTION

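# A typical call order for clipping (a comment sketch, assuming `params` is a list of
# parameter Variables whose gradients were populated by a backward pass):
#
#   loss.backward()
#   clip_gradients(params, max_norm=1.0)   # rescales only if the global L2 norm exceeds 1.0
#   # ... optimizer update using each param.grad ...
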
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 30
def collect_parameters(*modules) -> List[Variable]:
    """
    Collect all parameters from modules for optimization.

    Args:
        *modules: Variable number of modules/objects with parameters

    Returns:
        List of all Variables that require gradients

    TODO: Implement parameter collection.

    APPROACH:
    1. Iterate through all provided modules
    2. Find all Variable attributes that require gradients
    3. Return list of all such Variables

    EXAMPLE:
    layer1 = SomeLayer()
    layer2 = SomeLayer()
    params = collect_parameters(layer1, layer2)

    HINTS:
    - Use hasattr() and getattr() to find Variable attributes
    - Check if attribute is Variable and requires_grad
    - Handle different module types gracefully
    """
    ### BEGIN SOLUTION
    parameters = []
    for module in modules:
        if hasattr(module, '__dict__'):
            for attr_name, attr_value in module.__dict__.items():
                if isinstance(attr_value, Variable) and attr_value.requires_grad:
                    parameters.append(attr_value)
    return parameters
    ### END SOLUTION

# %% ../../modules/source/07_autograd/autograd_dev.ipynb 31
def zero_gradients(variables: List[Variable]) -> None:
    """
    Zero out gradients for all variables.

    Args:
        variables: List of Variables to zero gradients for

    TODO: Implement gradient zeroing.

    APPROACH:
    1. Iterate through all variables
    2. Call zero_grad() on each variable
    3. Handle None gradients gracefully

    EXAMPLE:
    parameters = [w1, w2, b1, b2]
    zero_gradients(parameters)

    HINTS:
    - Use the zero_grad() method on each Variable
    - Check if variable has gradients before zeroing
    - This is typically called before each training step
    """
    ### BEGIN SOLUTION
    for var in variables:
        if var.grad is not None:
            var.zero_grad()
    ### END SOLUTION

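# --- Illustrative training-step wiring (a comment sketch) ---
# How these utilities are expected to fit together; `model`, `model_output`, `targets`,
# and `lr` are hypothetical placeholders that are not defined in this module:
#
#   params = collect_parameters(model)
#   zero_gradients(params)                           # clear stale gradients from the last step
#   loss = mean(power(subtract(model_output, targets), 2))
#   loss.backward()                                  # populate .grad on the leaf parameters
#   clip_gradients(params, max_norm=1.0)
#   for p in params:
#       p.data._data -= lr * p.grad.data.data        # plain SGD step on the raw arrays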