TinyTorch/tinytorch/core/autograd.py
Vijay Janapa Reddi eafbb4ac8d Fix comprehensive testing and module exports
🔧 TESTING INFRASTRUCTURE FIXES:
- Fixed pytest configuration (removed duplicate timeout)
- Exported all modules to tinytorch package using nbdev
- Converted .py files to .ipynb for proper NBDev processing
- Fixed import issues in test files with fallback strategies

📊 TESTING RESULTS:
- 145 tests passing, 15 failing, 16 skipped
- Major improvement over the previous run, which failed with import errors
- All modules now properly exported and testable
- Analysis tool working correctly on all modules

🎯 MODULE QUALITY STATUS:
- Most modules: Grade C, Scaffolding 3/5
- 01_tensor: Grade C, Scaffolding 2/5 (needs improvement)
- 07_autograd: Grade D, Scaffolding 2/5 (needs improvement)
- Overall: Functional but needs educational enhancement

✅ RESOLVED ISSUES:
- All import errors resolved
- NBDev export process working
- Test infrastructure functional
- Analysis tools operational

🚀 READY FOR NEXT PHASE: Professional report cards and improvements
2025-07-13 09:20:32 -04:00


# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/07_autograd/autograd_dev.ipynb.
# %% auto 0
__all__ = ['Variable', 'add', 'multiply', 'subtract', 'divide', 'relu_with_grad', 'sigmoid_with_grad', 'power', 'exp', 'log',
'sum_all', 'mean', 'clip_gradients', 'collect_parameters', 'zero_gradients']
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 1
import numpy as np
import sys
from typing import Union, List, Tuple, Optional, Any, Callable
from collections import defaultdict
# Import our existing components
from .tensor import Tensor
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 6
class Variable:
"""
Variable: Tensor wrapper with automatic differentiation capabilities.
The fundamental class for gradient computation in TinyTorch.
Wraps Tensor objects and tracks computational history for backpropagation.
"""
def __init__(self, data: Union[Tensor, np.ndarray, list, float, int],
requires_grad: bool = True, grad_fn: Optional[Callable] = None):
"""
Create a Variable with gradient tracking.
Args:
data: The data to wrap (will be converted to Tensor)
requires_grad: Whether to compute gradients for this Variable
grad_fn: Function to compute gradients (None for leaf nodes)
TODO: Implement Variable initialization with gradient tracking.
APPROACH:
1. Convert data to Tensor if it's not already
2. Store the tensor data
3. Set gradient tracking flag
4. Initialize gradient to None (will be computed later)
5. Store the gradient function for backward pass
6. Track if this is a leaf node (no grad_fn)
EXAMPLE:
Variable(5.0) → Variable wrapping Tensor(5.0)
Variable([1, 2, 3]) → Variable wrapping Tensor([1, 2, 3])
HINTS:
- Use isinstance() to check if data is already a Tensor
- Store requires_grad, grad_fn, and is_leaf flags
- Initialize self.grad to None
- A leaf node has grad_fn=None
"""
### BEGIN SOLUTION
# Convert data to Tensor if needed
if isinstance(data, Tensor):
self.data = data
else:
self.data = Tensor(data)
# Set gradient tracking
self.requires_grad = requires_grad
self.grad = None # Will be initialized when needed
self.grad_fn = grad_fn
self.is_leaf = grad_fn is None
# For computational graph
self._backward_hooks = []
### END SOLUTION
@property
def shape(self) -> Tuple[int, ...]:
"""Get the shape of the underlying tensor."""
return self.data.shape
@property
def size(self) -> int:
"""Get the total number of elements."""
return self.data.size
def __repr__(self) -> str:
"""String representation of the Variable."""
grad_str = f", grad_fn={self.grad_fn.__name__}" if self.grad_fn else ""
return f"Variable({self.data.data.tolist()}, requires_grad={self.requires_grad}{grad_str})"
def backward(self, gradient: Optional['Variable'] = None) -> None:
"""
Compute gradients using backpropagation.
Args:
gradient: The gradient to backpropagate (defaults to ones)
TODO: Implement backward propagation.
APPROACH:
1. If gradient is None, create a gradient of ones with same shape
2. If this Variable doesn't require gradients, return early
3. If this is a leaf node, accumulate the gradient
4. If this has a grad_fn, call it to propagate gradients
EXAMPLE:
x = Variable(5.0)
y = x * 2
y.backward() # Computes x.grad = 2.0
HINTS:
- Use np.ones_like() to create default gradient
- Accumulate gradients with += for leaf nodes
- Call self.grad_fn(gradient) for non-leaf nodes
"""
### BEGIN SOLUTION
# Default gradient is ones
if gradient is None:
gradient = Variable(np.ones_like(self.data.data))
# Skip if gradients not required
if not self.requires_grad:
return
# Accumulate gradient for leaf nodes
if self.is_leaf:
if self.grad is None:
self.grad = Variable(np.zeros_like(self.data.data))
self.grad.data._data += gradient.data.data
else:
# Propagate gradients through grad_fn
if self.grad_fn is not None:
self.grad_fn(gradient)
### END SOLUTION
def zero_grad(self) -> None:
"""Zero out the gradient."""
if self.grad is not None:
self.grad.data._data.fill(0)
# Arithmetic operations with gradient tracking
def __add__(self, other: Union['Variable', float, int]) -> 'Variable':
"""Addition with gradient tracking."""
return add(self, other)
def __mul__(self, other: Union['Variable', float, int]) -> 'Variable':
"""Multiplication with gradient tracking."""
return multiply(self, other)
def __sub__(self, other: Union['Variable', float, int]) -> 'Variable':
"""Subtraction with gradient tracking."""
return subtract(self, other)
def __truediv__(self, other: Union['Variable', float, int]) -> 'Variable':
"""Division with gradient tracking."""
return divide(self, other)
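# Illustrative sketch (not exported in __all__): how a completed Variable is expected
# to behave once the TODOs above are filled in. The helper name `_demo_variable_basics`
# is ours, not part of the module API, and it assumes the Tensor class broadcasts a
# scalar factor the way NumPy does.
def _demo_variable_basics():
    x = Variable([1.0, 2.0, 3.0])    # leaf node: no grad_fn, is_leaf is True
    # x.shape == (3,), x.size == 3, and x.grad stays None until backward() runs
    y = x * 2.0                      # non-leaf: wraps the multiply grad_fn closure
    y.backward()                     # seeds the backward pass with a gradient of ones
    return x.grad                    # Variable holding [2.0, 2.0, 2.0]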
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 8
def add(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
"""
Addition operation with gradient tracking.
Args:
a: First operand
b: Second operand
Returns:
Variable with sum and gradient function
TODO: Implement addition with gradient computation.
APPROACH:
1. Convert inputs to Variables if needed
2. Compute forward pass: result = a + b
3. Create gradient function that distributes gradients
4. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = x + y, then dz/dx = 1, dz/dy = 1
EXAMPLE:
x = Variable(2.0), y = Variable(3.0)
z = add(x, y) # z.data = 5.0
z.backward() # x.grad = 1.0, y.grad = 1.0
HINTS:
- Use isinstance() to check if inputs are Variables
- Create a closure that captures a and b
- In grad_fn, call a.backward() and b.backward() with appropriate gradients
"""
### BEGIN SOLUTION
# Convert to Variables if needed
if not isinstance(a, Variable):
a = Variable(a, requires_grad=False)
if not isinstance(b, Variable):
b = Variable(b, requires_grad=False)
# Forward pass
result_data = a.data + b.data
# Create gradient function
def grad_fn(grad_output):
# Addition distributes gradients equally
if a.requires_grad:
a.backward(grad_output)
if b.requires_grad:
b.backward(grad_output)
# Determine if result requires gradients
requires_grad = a.requires_grad or b.requires_grad
return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
### END SOLUTION
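# Quick gradient check for add(), as a sketch. Both inputs should receive a gradient
# of 1.0, matching dz/dx = dz/dy = 1 for z = x + y. Reading the raw value via
# .grad.data.data assumes Tensor exposes its NumPy array as .data, which is how the
# rest of this file accesses it. The helper name is illustrative only.
def _check_add_gradients():
    x, y = Variable(2.0), Variable(3.0)
    z = add(x, y)                    # z.data holds 5.0
    z.backward()                     # default gradient of ones
    return x.grad.data.data, y.grad.data.data   # both ≈ 1.0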
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 9
def multiply(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
"""
Multiplication operation with gradient tracking.
Args:
a: First operand
b: Second operand
Returns:
Variable with product and gradient function
TODO: Implement multiplication with gradient computation.
APPROACH:
1. Convert inputs to Variables if needed
2. Compute forward pass: result = a * b
3. Create gradient function using product rule
4. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = x * y, then dz/dx = y, dz/dy = x
EXAMPLE:
x = Variable(2.0), y = Variable(3.0)
z = multiply(x, y) # z.data = 6.0
z.backward() # x.grad = 3.0, y.grad = 2.0
HINTS:
- Store a.data and b.data for gradient computation
- In grad_fn, multiply incoming gradient by the other operand
- Handle broadcasting if shapes are different
"""
### BEGIN SOLUTION
# Convert to Variables if needed
if not isinstance(a, Variable):
a = Variable(a, requires_grad=False)
if not isinstance(b, Variable):
b = Variable(b, requires_grad=False)
# Forward pass
result_data = a.data * b.data
# Create gradient function
def grad_fn(grad_output):
# Product rule: d(xy)/dx = y, d(xy)/dy = x
if a.requires_grad:
a_grad = Variable(grad_output.data * b.data)
a.backward(a_grad)
if b.requires_grad:
b_grad = Variable(grad_output.data * a.data)
b.backward(b_grad)
# Determine if result requires gradients
requires_grad = a.requires_grad or b.requires_grad
return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
### END SOLUTION
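# Sketch of the product rule in action: for z = x * y the gradient flowing to x is
# y's value and vice versa. Helper name is illustrative, not part of the module.
def _check_multiply_gradients():
    x, y = Variable(2.0), Variable(3.0)
    z = multiply(x, y)               # z.data holds 6.0
    z.backward()
    # Expected: x.grad ≈ 3.0 (the value of y), y.grad ≈ 2.0 (the value of x)
    return x.grad, y.grad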
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 10
def subtract(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
"""
Subtraction operation with gradient tracking.
Args:
a: First operand (minuend)
b: Second operand (subtrahend)
Returns:
Variable with difference and gradient function
TODO: Implement subtraction with gradient computation.
APPROACH:
1. Convert inputs to Variables if needed
2. Compute forward pass: result = a - b
3. Create gradient function with correct signs
4. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = x - y, then dz/dx = 1, dz/dy = -1
EXAMPLE:
x = Variable(5.0), y = Variable(3.0)
z = subtract(x, y) # z.data = 2.0
z.backward() # x.grad = 1.0, y.grad = -1.0
HINTS:
- Forward pass is straightforward: a - b
- Gradient for a is positive, for b is negative
- Remember to negate the gradient for b
"""
### BEGIN SOLUTION
# Convert to Variables if needed
if not isinstance(a, Variable):
a = Variable(a, requires_grad=False)
if not isinstance(b, Variable):
b = Variable(b, requires_grad=False)
# Forward pass
result_data = a.data - b.data
# Create gradient function
def grad_fn(grad_output):
# Subtraction rule: d(x-y)/dx = 1, d(x-y)/dy = -1
if a.requires_grad:
a.backward(grad_output)
if b.requires_grad:
b_grad = Variable(-grad_output.data.data)
b.backward(b_grad)
# Determine if result requires gradients
requires_grad = a.requires_grad or b.requires_grad
return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
### END SOLUTION
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 11
def divide(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
"""
Division operation with gradient tracking.
Args:
a: Numerator
b: Denominator
Returns:
Variable with quotient and gradient function
TODO: Implement division with gradient computation.
APPROACH:
1. Convert inputs to Variables if needed
2. Compute forward pass: result = a / b
3. Create gradient function using quotient rule
4. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = x / y, then dz/dx = 1/y, dz/dy = -x/y²
EXAMPLE:
x = Variable(6.0), y = Variable(2.0)
z = divide(x, y) # z.data = 3.0
z.backward() # x.grad = 0.5, y.grad = -1.5
HINTS:
- Forward pass: a.data / b.data
- Gradient for a: grad_output / b.data
- Gradient for b: -grad_output * a.data / (b.data ** 2)
- Be careful with numerical stability
"""
### BEGIN SOLUTION
# Convert to Variables if needed
if not isinstance(a, Variable):
a = Variable(a, requires_grad=False)
if not isinstance(b, Variable):
b = Variable(b, requires_grad=False)
# Forward pass
result_data = a.data / b.data
# Create gradient function
def grad_fn(grad_output):
# Quotient rule: d(x/y)/dx = 1/y, d(x/y)/dy = -x/y²
if a.requires_grad:
a_grad = Variable(grad_output.data.data / b.data.data)
a.backward(a_grad)
if b.requires_grad:
b_grad = Variable(-grad_output.data.data * a.data.data / (b.data.data ** 2))
b.backward(b_grad)
# Determine if result requires gradients
requires_grad = a.requires_grad or b.requires_grad
return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
### END SOLUTION
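# Combined sketch for subtract() and divide(): a small chained expression whose
# hand-derived gradients are easy to verify. For z = (a - b) / c with a=5, b=3, c=2:
# dz/da = 1/c = 0.5, dz/db = -1/c = -0.5, dz/dc = -(a - b)/c² = -0.5.
def _check_sub_div_gradients():
    a, b, c = Variable(5.0), Variable(3.0), Variable(2.0)
    z = divide(subtract(a, b), c)    # z.data holds 1.0
    z.backward()
    return a.grad, b.grad, c.grad    # ≈ 0.5, -0.5, -0.5 respectively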
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 17
def relu_with_grad(x: Variable) -> Variable:
"""
ReLU activation with gradient tracking.
Args:
x: Input Variable
Returns:
Variable with ReLU applied and gradient function
TODO: Implement ReLU with gradient computation.
APPROACH:
1. Compute forward pass: max(0, x)
2. Create gradient function using ReLU derivative
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
f(x) = max(0, x)
f'(x) = 1 if x > 0, else 0
EXAMPLE:
x = Variable([-1.0, 0.0, 1.0])
y = relu_with_grad(x) # y.data = [0.0, 0.0, 1.0]
y.backward() # x.grad = [0.0, 0.0, 1.0]
HINTS:
- Use np.maximum(0, x.data.data) for forward pass
- Use (x.data.data > 0) for gradient mask
- Only propagate gradients where input was positive
"""
### BEGIN SOLUTION
# Forward pass
result_data = Tensor(np.maximum(0, x.data.data))
# Create gradient function
def grad_fn(grad_output):
if x.requires_grad:
# ReLU derivative: 1 if x > 0, else 0
mask = (x.data.data > 0).astype(np.float32)
x_grad = Variable(grad_output.data.data * mask)
x.backward(x_grad)
return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
### END SOLUTION
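# Sketch of relu_with_grad(): gradients only flow where the input was strictly
# positive, so the zero entry below blocks its gradient entirely.
def _check_relu_gradients():
    x = Variable(np.array([-1.0, 0.0, 2.0]))
    y = relu_with_grad(x)            # y.data holds [0.0, 0.0, 2.0]
    y.backward()                     # default gradient of ones
    return x.grad                    # ≈ [0.0, 0.0, 1.0]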
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 18
def sigmoid_with_grad(x: Variable) -> Variable:
"""
Sigmoid activation with gradient tracking.
Args:
x: Input Variable
Returns:
Variable with sigmoid applied and gradient function
TODO: Implement sigmoid with gradient computation.
APPROACH:
1. Compute forward pass: 1 / (1 + exp(-x))
2. Create gradient function using sigmoid derivative
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
f(x) = 1 / (1 + exp(-x))
f'(x) = f(x) * (1 - f(x))
EXAMPLE:
x = Variable(0.0)
y = sigmoid_with_grad(x) # y.data = 0.5
y.backward() # x.grad = 0.25
HINTS:
- Use np.clip for numerical stability
- Store sigmoid output for gradient computation
- Gradient is sigmoid * (1 - sigmoid)
"""
### BEGIN SOLUTION
# Forward pass with numerical stability
clipped = np.clip(x.data.data, -500, 500)
sigmoid_output = 1.0 / (1.0 + np.exp(-clipped))
result_data = Tensor(sigmoid_output)
# Create gradient function
def grad_fn(grad_output):
if x.requires_grad:
# Sigmoid derivative: sigmoid * (1 - sigmoid)
sigmoid_grad = sigmoid_output * (1.0 - sigmoid_output)
x_grad = Variable(grad_output.data.data * sigmoid_grad)
x.backward(x_grad)
return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
### END SOLUTION
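# Sketch of sigmoid_with_grad(): at x = 0 the output is 0.5 and the derivative peaks
# at 0.25, which makes it a convenient spot-check value.
def _check_sigmoid_gradients():
    x = Variable(0.0)
    y = sigmoid_with_grad(x)         # y.data holds 0.5
    y.backward()
    return x.grad                    # ≈ 0.25, since 0.5 * (1 - 0.5) = 0.25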
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 23
def power(base: Variable, exponent: Union[float, int]) -> Variable:
"""
Power operation with gradient tracking: base^exponent.
Args:
base: Base Variable
exponent: Exponent (scalar)
Returns:
Variable with power applied and gradient function
TODO: Implement power operation with gradient computation.
APPROACH:
1. Compute forward pass: base^exponent
2. Create gradient function using power rule
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = x^n, then dz/dx = n * x^(n-1)
EXAMPLE:
x = Variable(2.0)
y = power(x, 3) # y.data = 8.0
y.backward() # x.grad = 3 * 2^2 = 12.0
HINTS:
- Use np.power() for forward pass
- Power rule: gradient = exponent * base^(exponent-1)
- Handle edge cases like exponent=0 or base=0
"""
### BEGIN SOLUTION
# Forward pass
result_data = Tensor(np.power(base.data.data, exponent))
# Create gradient function
def grad_fn(grad_output):
if base.requires_grad:
# Power rule: d(x^n)/dx = n * x^(n-1)
if exponent == 0:
# Special case: derivative of constant is 0
base_grad = Variable(np.zeros_like(base.data.data))
else:
base_grad_data = exponent * np.power(base.data.data, exponent - 1)
base_grad = Variable(grad_output.data.data * base_grad_data)
base.backward(base_grad)
return Variable(result_data, requires_grad=base.requires_grad, grad_fn=grad_fn)
### END SOLUTION
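# Sketch of power(): the power rule d(x^n)/dx = n * x^(n-1), checked at x = 2, n = 3.
def _check_power_gradients():
    x = Variable(2.0)
    y = power(x, 3)                  # y.data holds 8.0
    y.backward()
    return x.grad                    # ≈ 12.0, i.e. 3 * 2²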
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 24
def exp(x: Variable) -> Variable:
"""
Exponential operation with gradient tracking: e^x.
Args:
x: Input Variable
Returns:
Variable with exponential applied and gradient function
TODO: Implement exponential operation with gradient computation.
APPROACH:
1. Compute forward pass: e^x
2. Create gradient function using exponential derivative
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = e^x, then dz/dx = e^x
EXAMPLE:
x = Variable(1.0)
y = exp(x) # y.data = e^1 ≈ 2.718
y.backward() # x.grad = e^1 ≈ 2.718
HINTS:
- Use np.exp() for forward pass
- Exponential derivative is itself: d(e^x)/dx = e^x
- Store result for gradient computation
"""
### BEGIN SOLUTION
# Forward pass
exp_result = np.exp(x.data.data)
result_data = Tensor(exp_result)
# Create gradient function
def grad_fn(grad_output):
if x.requires_grad:
# Exponential derivative: d(e^x)/dx = e^x
x_grad = Variable(grad_output.data.data * exp_result)
x.backward(x_grad)
return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
### END SOLUTION
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 25
def log(x: Variable) -> Variable:
"""
Natural logarithm operation with gradient tracking: ln(x).
Args:
x: Input Variable
Returns:
Variable with logarithm applied and gradient function
TODO: Implement logarithm operation with gradient computation.
APPROACH:
1. Compute forward pass: ln(x)
2. Create gradient function using logarithm derivative
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = ln(x), then dz/dx = 1/x
EXAMPLE:
x = Variable(2.0)
y = log(x) # y.data = ln(2) ≈ 0.693
y.backward() # x.grad = 1/2 = 0.5
HINTS:
- Use np.log() for forward pass
- Logarithm derivative: d(ln(x))/dx = 1/x
- Handle numerical stability for small x
"""
### BEGIN SOLUTION
# Forward pass with numerical stability
clipped_x = np.clip(x.data.data, 1e-8, np.inf) # Avoid log(0)
result_data = Tensor(np.log(clipped_x))
# Create gradient function
def grad_fn(grad_output):
if x.requires_grad:
# Logarithm derivative: d(ln(x))/dx = 1/x
x_grad = Variable(grad_output.data.data / clipped_x)
x.backward(x_grad)
return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
### END SOLUTION
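# Combined sketch for exp() and log(): composing the two gives log(exp(x)) = x, so the
# end-to-end gradient should come back as 1.0 regardless of x. This is a handy sanity
# check that the chain of grad_fn closures multiplies gradients correctly.
def _check_exp_log_chain():
    x = Variable(1.5)
    y = log(exp(x))                  # y.data ≈ 1.5
    y.backward()
    return x.grad                    # ≈ 1.0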
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 26
def sum_all(x: Variable) -> Variable:
"""
Sum all elements operation with gradient tracking.
Args:
x: Input Variable
Returns:
Variable with sum and gradient function
TODO: Implement sum operation with gradient computation.
APPROACH:
1. Compute forward pass: sum of all elements
2. Create gradient function that broadcasts gradient back
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = sum(x), then dz/dx_i = 1 for all i
EXAMPLE:
x = Variable([[1, 2], [3, 4]])
y = sum_all(x) # y.data = 10
y.backward() # x.grad = [[1, 1], [1, 1]]
HINTS:
- Use np.sum() for forward pass
- Gradient is ones with same shape as input
- This is used for loss computation
"""
### BEGIN SOLUTION
# Forward pass
result_data = Tensor(np.sum(x.data.data))
# Create gradient function
def grad_fn(grad_output):
if x.requires_grad:
# Sum gradient: broadcasts to all elements
x_grad = Variable(grad_output.data.data * np.ones_like(x.data.data))
x.backward(x_grad)
return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
### END SOLUTION
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 27
def mean(x: Variable) -> Variable:
"""
Mean operation with gradient tracking.
Args:
x: Input Variable
Returns:
Variable with mean and gradient function
TODO: Implement mean operation with gradient computation.
APPROACH:
1. Compute forward pass: mean of all elements
2. Create gradient function that distributes gradient evenly
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = mean(x), then dz/dx_i = 1/n for all i (where n is number of elements)
EXAMPLE:
x = Variable([[1, 2], [3, 4]])
y = mean(x) # y.data = 2.5
y.backward() # x.grad = [[0.25, 0.25], [0.25, 0.25]]
HINTS:
- Use np.mean() for forward pass
- Gradient is 1/n for each element
- This is commonly used for loss computation
"""
### BEGIN SOLUTION
# Forward pass
result_data = Tensor(np.mean(x.data.data))
# Create gradient function
def grad_fn(grad_output):
if x.requires_grad:
# Mean gradient: 1/n for each element
n = x.data.size
x_grad = Variable(grad_output.data.data * np.ones_like(x.data.data) / n)
x.backward(x_grad)
return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
### END SOLUTION
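# Combined sketch for sum_all() and mean(): both reduce to a scalar, which is the
# usual entry point for backward() in a loss computation. For a 2x2 input, sum_all
# sends a gradient of 1 to every element while mean sends 1/4. zero_grad() is called
# between the two so the gradients do not accumulate across checks.
def _check_reduction_gradients():
    x = Variable(np.array([[1.0, 2.0], [3.0, 4.0]]))
    sum_all(x).backward()            # x.grad ≈ [[1, 1], [1, 1]]
    x.zero_grad()                    # reset before checking the second reduction
    mean(x).backward()               # x.grad ≈ [[0.25, 0.25], [0.25, 0.25]]
    return x.grad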
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 29
def clip_gradients(variables: List[Variable], max_norm: float = 1.0) -> None:
"""
Clip gradients to prevent exploding gradients.
Args:
variables: List of Variables to clip gradients for
max_norm: Maximum gradient norm allowed
TODO: Implement gradient clipping.
APPROACH:
1. Compute total gradient norm across all variables
2. If norm exceeds max_norm, scale all gradients down
3. Modify gradients in-place
MATHEMATICAL RULE:
If ||g|| > max_norm, then g := g * (max_norm / ||g||)
EXAMPLE:
variables = [w1, w2, b1, b2]
clip_gradients(variables, max_norm=1.0)
HINTS:
- Compute L2 norm of all gradients combined
- Scale factor = max_norm / total_norm
- Only clip if total_norm > max_norm
"""
### BEGIN SOLUTION
# Compute total gradient norm
total_norm = 0.0
for var in variables:
if var.grad is not None:
total_norm += np.sum(var.grad.data.data ** 2)
total_norm = np.sqrt(total_norm)
# Clip if necessary
if total_norm > max_norm:
scale_factor = max_norm / total_norm
for var in variables:
if var.grad is not None:
var.grad.data._data *= scale_factor
### END SOLUTION
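# Sketch of clip_gradients(): build two leaves with known gradients, then clip.
# The combined L2 norm here is 5.0 (gradients of 3.0 and 4.0), so with max_norm=1.0
# every gradient should be scaled by 1/5. Helper name is illustrative only.
def _check_clip_gradients():
    w1, w2 = Variable(1.0), Variable(1.0)
    (w1 * 3.0 + w2 * 4.0).backward()        # w1.grad ≈ 3.0, w2.grad ≈ 4.0
    clip_gradients([w1, w2], max_norm=1.0)
    return w1.grad, w2.grad                  # ≈ 0.6 and 0.8 after scaling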
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 30
def collect_parameters(*modules) -> List[Variable]:
"""
Collect all parameters from modules for optimization.
Args:
*modules: Variable number of modules/objects with parameters
Returns:
List of all Variables that require gradients
TODO: Implement parameter collection.
APPROACH:
1. Iterate through all provided modules
2. Find all Variable attributes that require gradients
3. Return list of all such Variables
EXAMPLE:
layer1 = SomeLayer()
layer2 = SomeLayer()
params = collect_parameters(layer1, layer2)
HINTS:
- Use hasattr() and getattr() to find Variable attributes
- Check if attribute is Variable and requires_grad
- Handle different module types gracefully
"""
### BEGIN SOLUTION
parameters = []
for module in modules:
if hasattr(module, '__dict__'):
for attr_name, attr_value in module.__dict__.items():
if isinstance(attr_value, Variable) and attr_value.requires_grad:
parameters.append(attr_value)
return parameters
### END SOLUTION
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 31
def zero_gradients(variables: List[Variable]) -> None:
"""
Zero out gradients for all variables.
Args:
variables: List of Variables to zero gradients for
TODO: Implement gradient zeroing.
APPROACH:
1. Iterate through all variables
2. Call zero_grad() on each variable
3. Handle None gradients gracefully
EXAMPLE:
parameters = [w1, w2, b1, b2]
zero_gradients(parameters)
HINTS:
- Use the zero_grad() method on each Variable
- Check if variable has gradients before zeroing
- This is typically called before each training step
"""
### BEGIN SOLUTION
for var in variables:
if var.grad is not None:
var.zero_grad()
### END SOLUTION
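# Combined sketch for collect_parameters() and zero_gradients(): a minimal "module"
# is just an object whose attributes include Variables, which is all collect_parameters()
# looks for. The _TinyLayer class below is a stand-in for illustration, not part of
# TinyTorch itself.
class _TinyLayer:
    def __init__(self):
        self.weight = Variable(2.0)
        self.bias = Variable(0.5)

def _demo_parameter_utilities():
    layer = _TinyLayer()
    params = collect_parameters(layer)       # [layer.weight, layer.bias]
    loss = layer.weight * 3.0 + layer.bias   # toy scalar "loss"
    loss.backward()                          # weight.grad ≈ 3.0, bias.grad ≈ 1.0
    zero_gradients(params)                   # both gradients reset to 0 before the next step
    return params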