TinyTorch/tinytorch/core/autograd.py
Vijay Janapa Reddi eafbb4ac8d Fix comprehensive testing and module exports
🔧 TESTING INFRASTRUCTURE FIXES:
- Fixed pytest configuration (removed duplicate timeout)
- Exported all modules to tinytorch package using nbdev
- Converted .py files to .ipynb for proper NBDev processing
- Fixed import issues in test files with fallback strategies

📊 TESTING RESULTS:
- 145 tests passing, 15 failing, 16 skipped
- Major improvement over the previous run, which failed with import errors
- All modules now properly exported and testable
- Analysis tool working correctly on all modules

🎯 MODULE QUALITY STATUS:
- Most modules: Grade C, Scaffolding 3/5
- 01_tensor: Grade C, Scaffolding 2/5 (needs improvement)
- 07_autograd: Grade D, Scaffolding 2/5 (needs improvement)
- Overall: Functional but needs educational enhancement

✅ RESOLVED ISSUES:
- All import errors resolved
- NBDev export process working
- Test infrastructure functional
- Analysis tools operational

🚀 READY FOR NEXT PHASE: Professional report cards and improvements
2025-07-13 09:20:32 -04:00


# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/07_autograd/autograd_dev.ipynb.
# %% auto 0
__all__ = ['Variable', 'add', 'multiply', 'subtract', 'divide', 'relu_with_grad', 'sigmoid_with_grad', 'power', 'exp', 'log',
'sum_all', 'mean', 'clip_gradients', 'collect_parameters', 'zero_gradients']
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 1
import numpy as np
import sys
from typing import Union, List, Tuple, Optional, Any, Callable
from collections import defaultdict
# Import our existing components
from .tensor import Tensor
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 6
class Variable:
"""
Variable: Tensor wrapper with automatic differentiation capabilities.
The fundamental class for gradient computation in TinyTorch.
Wraps Tensor objects and tracks computational history for backpropagation.
"""
def __init__(self, data: Union[Tensor, np.ndarray, list, float, int],
requires_grad: bool = True, grad_fn: Optional[Callable] = None):
"""
Create a Variable with gradient tracking.
Args:
data: The data to wrap (will be converted to Tensor)
requires_grad: Whether to compute gradients for this Variable
grad_fn: Function to compute gradients (None for leaf nodes)
TODO: Implement Variable initialization with gradient tracking.
APPROACH:
1. Convert data to Tensor if it's not already
2. Store the tensor data
3. Set gradient tracking flag
4. Initialize gradient to None (will be computed later)
5. Store the gradient function for backward pass
6. Track if this is a leaf node (no grad_fn)
EXAMPLE:
Variable(5.0) → Variable wrapping Tensor(5.0)
Variable([1, 2, 3]) → Variable wrapping Tensor([1, 2, 3])
HINTS:
- Use isinstance() to check if data is already a Tensor
- Store requires_grad, grad_fn, and is_leaf flags
- Initialize self.grad to None
- A leaf node has grad_fn=None
"""
### BEGIN SOLUTION
# Convert data to Tensor if needed
if isinstance(data, Tensor):
self.data = data
else:
self.data = Tensor(data)
# Set gradient tracking
self.requires_grad = requires_grad
self.grad = None # Will be initialized when needed
self.grad_fn = grad_fn
self.is_leaf = grad_fn is None
# For computational graph
self._backward_hooks = []
### END SOLUTION
@property
def shape(self) -> Tuple[int, ...]:
"""Get the shape of the underlying tensor."""
return self.data.shape
@property
def size(self) -> int:
"""Get the total number of elements."""
return self.data.size
def __repr__(self) -> str:
"""String representation of the Variable."""
grad_str = f", grad_fn={self.grad_fn.__name__}" if self.grad_fn else ""
return f"Variable({self.data.data.tolist()}, requires_grad={self.requires_grad}{grad_str})"
def backward(self, gradient: Optional['Variable'] = None) -> None:
"""
Compute gradients using backpropagation.
Args:
gradient: The gradient to backpropagate (defaults to ones)
TODO: Implement backward propagation.
APPROACH:
1. If gradient is None, create a gradient of ones with same shape
2. If this Variable doesn't require gradients, return early
3. If this is a leaf node, accumulate the gradient
4. If this has a grad_fn, call it to propagate gradients
EXAMPLE:
x = Variable(5.0)
y = x * 2
y.backward() # Computes x.grad = 2.0
HINTS:
- Use np.ones_like() to create default gradient
- Accumulate gradients with += for leaf nodes
- Call self.grad_fn(gradient) for non-leaf nodes
"""
### BEGIN SOLUTION
# Default gradient is ones
if gradient is None:
gradient = Variable(np.ones_like(self.data.data))
# Skip if gradients not required
if not self.requires_grad:
return
# Accumulate gradient for leaf nodes
if self.is_leaf:
if self.grad is None:
self.grad = Variable(np.zeros_like(self.data.data))
self.grad.data._data += gradient.data.data
else:
# Propagate gradients through grad_fn
if self.grad_fn is not None:
self.grad_fn(gradient)
### END SOLUTION
def zero_grad(self) -> None:
"""Zero out the gradient."""
if self.grad is not None:
self.grad.data._data.fill(0)
# Arithmetic operations with gradient tracking
def __add__(self, other: Union['Variable', float, int]) -> 'Variable':
"""Addition with gradient tracking."""
return add(self, other)
def __mul__(self, other: Union['Variable', float, int]) -> 'Variable':
"""Multiplication with gradient tracking."""
return multiply(self, other)
def __sub__(self, other: Union['Variable', float, int]) -> 'Variable':
"""Subtraction with gradient tracking."""
return subtract(self, other)
def __truediv__(self, other: Union['Variable', float, int]) -> 'Variable':
"""Division with gradient tracking."""
return divide(self, other)
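# Illustrative sketch (not exported in __all__): how a completed Variable is expected
# to behave once the TODOs above are filled in. The helper name `_demo_variable_basics`
# is ours, not part of the module API, and it assumes the Tensor class broadcasts a
# scalar factor the way NumPy does.
def _demo_variable_basics():
    x = Variable([1.0, 2.0, 3.0])    # leaf node: no grad_fn, is_leaf is True
    # x.shape == (3,), x.size == 3, and x.grad stays None until backward() runs
    y = x * 2.0                      # non-leaf: wraps the multiply grad_fn closure
    y.backward()                     # seeds the backward pass with a gradient of ones
    return x.grad                    # Variable holding [2.0, 2.0, 2.0]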
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 8
def add(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
"""
Addition operation with gradient tracking.
Args:
a: First operand
b: Second operand
Returns:
Variable with sum and gradient function
TODO: Implement addition with gradient computation.
APPROACH:
1. Convert inputs to Variables if needed
2. Compute forward pass: result = a + b
3. Create gradient function that distributes gradients
4. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = x + y, then dz/dx = 1, dz/dy = 1
EXAMPLE:
x = Variable(2.0), y = Variable(3.0)
z = add(x, y) # z.data = 5.0
z.backward() # x.grad = 1.0, y.grad = 1.0
HINTS:
- Use isinstance() to check if inputs are Variables
- Create a closure that captures a and b
- In grad_fn, call a.backward() and b.backward() with appropriate gradients
"""
### BEGIN SOLUTION
# Convert to Variables if needed
if not isinstance(a, Variable):
a = Variable(a, requires_grad=False)
if not isinstance(b, Variable):
b = Variable(b, requires_grad=False)
# Forward pass
result_data = a.data + b.data
# Create gradient function
def grad_fn(grad_output):
# Addition distributes gradients equally
if a.requires_grad:
a.backward(grad_output)
if b.requires_grad:
b.backward(grad_output)
# Determine if result requires gradients
requires_grad = a.requires_grad or b.requires_grad
return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
### END SOLUTION
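# Quick gradient check for add(), as a sketch. Both inputs should receive a gradient
# of 1.0, matching dz/dx = dz/dy = 1 for z = x + y. Reading the raw value via
# .grad.data.data assumes Tensor exposes its NumPy array as .data, which is how the
# rest of this file accesses it. The helper name is illustrative only.
def _check_add_gradients():
    x, y = Variable(2.0), Variable(3.0)
    z = add(x, y)                    # z.data holds 5.0
    z.backward()                     # default gradient of ones
    return x.grad.data.data, y.grad.data.data   # both ≈ 1.0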
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 9
def multiply(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
"""
Multiplication operation with gradient tracking.
Args:
a: First operand
b: Second operand
Returns:
Variable with product and gradient function
TODO: Implement multiplication with gradient computation.
APPROACH:
1. Convert inputs to Variables if needed
2. Compute forward pass: result = a * b
3. Create gradient function using product rule
4. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = x * y, then dz/dx = y, dz/dy = x
EXAMPLE:
x = Variable(2.0), y = Variable(3.0)
z = multiply(x, y) # z.data = 6.0
z.backward() # x.grad = 3.0, y.grad = 2.0
HINTS:
- Store a.data and b.data for gradient computation
- In grad_fn, multiply incoming gradient by the other operand
- Handle broadcasting if shapes are different
"""
### BEGIN SOLUTION
# Convert to Variables if needed
if not isinstance(a, Variable):
a = Variable(a, requires_grad=False)
if not isinstance(b, Variable):
b = Variable(b, requires_grad=False)
# Forward pass
result_data = a.data * b.data
# Create gradient function
def grad_fn(grad_output):
# Product rule: d(xy)/dx = y, d(xy)/dy = x
if a.requires_grad:
a_grad = Variable(grad_output.data * b.data)
a.backward(a_grad)
if b.requires_grad:
b_grad = Variable(grad_output.data * a.data)
b.backward(b_grad)
# Determine if result requires gradients
requires_grad = a.requires_grad or b.requires_grad
return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
### END SOLUTION
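# Sketch of the product rule in action: for z = x * y the gradient flowing to x is
# y's value and vice versa. Helper name is illustrative, not part of the module.
def _check_multiply_gradients():
    x, y = Variable(2.0), Variable(3.0)
    z = multiply(x, y)               # z.data holds 6.0
    z.backward()
    # Expected: x.grad ≈ 3.0 (the value of y), y.grad ≈ 2.0 (the value of x)
    return x.grad, y.grad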
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 10
def subtract(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
"""
Subtraction operation with gradient tracking.
Args:
a: First operand (minuend)
b: Second operand (subtrahend)
Returns:
Variable with difference and gradient function
TODO: Implement subtraction with gradient computation.
APPROACH:
1. Convert inputs to Variables if needed
2. Compute forward pass: result = a - b
3. Create gradient function with correct signs
4. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = x - y, then dz/dx = 1, dz/dy = -1
EXAMPLE:
x = Variable(5.0), y = Variable(3.0)
z = subtract(x, y) # z.data = 2.0
z.backward() # x.grad = 1.0, y.grad = -1.0
HINTS:
- Forward pass is straightforward: a - b
- Gradient for a is positive, for b is negative
- Remember to negate the gradient for b
"""
### BEGIN SOLUTION
# Convert to Variables if needed
if not isinstance(a, Variable):
a = Variable(a, requires_grad=False)
if not isinstance(b, Variable):
b = Variable(b, requires_grad=False)
# Forward pass
result_data = a.data - b.data
# Create gradient function
def grad_fn(grad_output):
# Subtraction rule: d(x-y)/dx = 1, d(x-y)/dy = -1
if a.requires_grad:
a.backward(grad_output)
if b.requires_grad:
b_grad = Variable(-grad_output.data.data)
b.backward(b_grad)
# Determine if result requires gradients
requires_grad = a.requires_grad or b.requires_grad
return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
### END SOLUTION
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 11
def divide(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
"""
Division operation with gradient tracking.
Args:
a: Numerator
b: Denominator
Returns:
Variable with quotient and gradient function
TODO: Implement division with gradient computation.
APPROACH:
1. Convert inputs to Variables if needed
2. Compute forward pass: result = a / b
3. Create gradient function using quotient rule
4. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = x / y, then dz/dx = 1/y, dz/dy = -x/y²
EXAMPLE:
x = Variable(6.0), y = Variable(2.0)
z = divide(x, y) # z.data = 3.0
z.backward() # x.grad = 0.5, y.grad = -1.5
HINTS:
- Forward pass: a.data / b.data
- Gradient for a: grad_output / b.data
- Gradient for b: -grad_output * a.data / (b.data ** 2)
- Be careful with numerical stability
"""
### BEGIN SOLUTION
# Convert to Variables if needed
if not isinstance(a, Variable):
a = Variable(a, requires_grad=False)
if not isinstance(b, Variable):
b = Variable(b, requires_grad=False)
# Forward pass
result_data = a.data / b.data
# Create gradient function
def grad_fn(grad_output):
# Quotient rule: d(x/y)/dx = 1/y, d(x/y)/dy = -x/y²
if a.requires_grad:
a_grad = Variable(grad_output.data.data / b.data.data)
a.backward(a_grad)
if b.requires_grad:
b_grad = Variable(-grad_output.data.data * a.data.data / (b.data.data ** 2))
b.backward(b_grad)
# Determine if result requires gradients
requires_grad = a.requires_grad or b.requires_grad
return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
### END SOLUTION
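# Combined sketch for subtract() and divide(): a small chained expression whose
# hand-derived gradients are easy to verify. For z = (a - b) / c with a=5, b=3, c=2:
# dz/da = 1/c = 0.5, dz/db = -1/c = -0.5, dz/dc = -(a - b)/c² = -0.5.
def _check_sub_div_gradients():
    a, b, c = Variable(5.0), Variable(3.0), Variable(2.0)
    z = divide(subtract(a, b), c)    # z.data holds 1.0
    z.backward()
    return a.grad, b.grad, c.grad    # ≈ 0.5, -0.5, -0.5 respectively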
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 17
def relu_with_grad(x: Variable) -> Variable:
"""
ReLU activation with gradient tracking.
Args:
x: Input Variable
Returns:
Variable with ReLU applied and gradient function
TODO: Implement ReLU with gradient computation.
APPROACH:
1. Compute forward pass: max(0, x)
2. Create gradient function using ReLU derivative
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
f(x) = max(0, x)
f'(x) = 1 if x > 0, else 0
EXAMPLE:
x = Variable([-1.0, 0.0, 1.0])
y = relu_with_grad(x) # y.data = [0.0, 0.0, 1.0]
y.backward() # x.grad = [0.0, 0.0, 1.0]
HINTS:
- Use np.maximum(0, x.data.data) for forward pass
- Use (x.data.data > 0) for gradient mask
- Only propagate gradients where input was positive
"""
### BEGIN SOLUTION
# Forward pass
result_data = Tensor(np.maximum(0, x.data.data))
# Create gradient function
def grad_fn(grad_output):
if x.requires_grad:
# ReLU derivative: 1 if x > 0, else 0
mask = (x.data.data > 0).astype(np.float32)
x_grad = Variable(grad_output.data.data * mask)
x.backward(x_grad)
return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
### END SOLUTION
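# Sketch of relu_with_grad(): gradients only flow where the input was strictly
# positive, so the zero entry below blocks its gradient entirely.
def _check_relu_gradients():
    x = Variable(np.array([-1.0, 0.0, 2.0]))
    y = relu_with_grad(x)            # y.data holds [0.0, 0.0, 2.0]
    y.backward()                     # default gradient of ones
    return x.grad                    # ≈ [0.0, 0.0, 1.0]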
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 18
def sigmoid_with_grad(x: Variable) -> Variable:
"""
Sigmoid activation with gradient tracking.
Args:
x: Input Variable
Returns:
Variable with sigmoid applied and gradient function
TODO: Implement sigmoid with gradient computation.
APPROACH:
1. Compute forward pass: 1 / (1 + exp(-x))
2. Create gradient function using sigmoid derivative
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
f(x) = 1 / (1 + exp(-x))
f'(x) = f(x) * (1 - f(x))
EXAMPLE:
x = Variable(0.0)
y = sigmoid_with_grad(x) # y.data = 0.5
y.backward() # x.grad = 0.25
HINTS:
- Use np.clip for numerical stability
- Store sigmoid output for gradient computation
- Gradient is sigmoid * (1 - sigmoid)
"""
### BEGIN SOLUTION
# Forward pass with numerical stability
clipped = np.clip(x.data.data, -500, 500)
sigmoid_output = 1.0 / (1.0 + np.exp(-clipped))
result_data = Tensor(sigmoid_output)
# Create gradient function
def grad_fn(grad_output):
if x.requires_grad:
# Sigmoid derivative: sigmoid * (1 - sigmoid)
sigmoid_grad = sigmoid_output * (1.0 - sigmoid_output)
x_grad = Variable(grad_output.data.data * sigmoid_grad)
x.backward(x_grad)
return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
### END SOLUTION
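# Sketch of sigmoid_with_grad(): at x = 0 the output is 0.5 and the derivative peaks
# at 0.25, which makes it a convenient spot-check value.
def _check_sigmoid_gradients():
    x = Variable(0.0)
    y = sigmoid_with_grad(x)         # y.data holds 0.5
    y.backward()
    return x.grad                    # ≈ 0.25, since 0.5 * (1 - 0.5) = 0.25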
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 23
def power(base: Variable, exponent: Union[float, int]) -> Variable:
"""
Power operation with gradient tracking: base^exponent.
Args:
base: Base Variable
exponent: Exponent (scalar)
Returns:
Variable with power applied and gradient function
TODO: Implement power operation with gradient computation.
APPROACH:
1. Compute forward pass: base^exponent
2. Create gradient function using power rule
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = x^n, then dz/dx = n * x^(n-1)
EXAMPLE:
x = Variable(2.0)
y = power(x, 3) # y.data = 8.0
y.backward() # x.grad = 3 * 2^2 = 12.0
HINTS:
- Use np.power() for forward pass
- Power rule: gradient = exponent * base^(exponent-1)
- Handle edge cases like exponent=0 or base=0
"""
### BEGIN SOLUTION
# Forward pass
result_data = Tensor(np.power(base.data.data, exponent))
# Create gradient function
def grad_fn(grad_output):
if base.requires_grad:
# Power rule: d(x^n)/dx = n * x^(n-1)
if exponent == 0:
# Special case: derivative of constant is 0
base_grad = Variable(np.zeros_like(base.data.data))
else:
base_grad_data = exponent * np.power(base.data.data, exponent - 1)
base_grad = Variable(grad_output.data.data * base_grad_data)
base.backward(base_grad)
return Variable(result_data, requires_grad=base.requires_grad, grad_fn=grad_fn)
### END SOLUTION
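# Sketch of power(): the power rule d(x^n)/dx = n * x^(n-1), checked at x = 2, n = 3.
def _check_power_gradients():
    x = Variable(2.0)
    y = power(x, 3)                  # y.data holds 8.0
    y.backward()
    return x.grad                    # ≈ 12.0, i.e. 3 * 2²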
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 24
def exp(x: Variable) -> Variable:
"""
Exponential operation with gradient tracking: e^x.
Args:
x: Input Variable
Returns:
Variable with exponential applied and gradient function
TODO: Implement exponential operation with gradient computation.
APPROACH:
1. Compute forward pass: e^x
2. Create gradient function using exponential derivative
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = e^x, then dz/dx = e^x
EXAMPLE:
x = Variable(1.0)
y = exp(x) # y.data = e^1 ≈ 2.718
y.backward() # x.grad = e^1 ≈ 2.718
HINTS:
- Use np.exp() for forward pass
- Exponential derivative is itself: d(e^x)/dx = e^x
- Store result for gradient computation
"""
### BEGIN SOLUTION
# Forward pass
exp_result = np.exp(x.data.data)
result_data = Tensor(exp_result)
# Create gradient function
def grad_fn(grad_output):
if x.requires_grad:
# Exponential derivative: d(e^x)/dx = e^x
x_grad = Variable(grad_output.data.data * exp_result)
x.backward(x_grad)
return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
### END SOLUTION
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 25
def log(x: Variable) -> Variable:
"""
Natural logarithm operation with gradient tracking: ln(x).
Args:
x: Input Variable
Returns:
Variable with logarithm applied and gradient function
TODO: Implement logarithm operation with gradient computation.
APPROACH:
1. Compute forward pass: ln(x)
2. Create gradient function using logarithm derivative
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = ln(x), then dz/dx = 1/x
EXAMPLE:
x = Variable(2.0)
y = log(x) # y.data = ln(2) ≈ 0.693
y.backward() # x.grad = 1/2 = 0.5
HINTS:
- Use np.log() for forward pass
- Logarithm derivative: d(ln(x))/dx = 1/x
- Handle numerical stability for small x
"""
### BEGIN SOLUTION
# Forward pass with numerical stability
clipped_x = np.clip(x.data.data, 1e-8, np.inf) # Avoid log(0)
result_data = Tensor(np.log(clipped_x))
# Create gradient function
def grad_fn(grad_output):
if x.requires_grad:
# Logarithm derivative: d(ln(x))/dx = 1/x
x_grad = Variable(grad_output.data.data / clipped_x)
x.backward(x_grad)
return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
### END SOLUTION
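# Combined sketch for exp() and log(): composing the two gives log(exp(x)) = x, so the
# end-to-end gradient should come back as 1.0 regardless of x. This is a handy sanity
# check that the chain of grad_fn closures multiplies gradients correctly.
def _check_exp_log_chain():
    x = Variable(1.5)
    y = log(exp(x))                  # y.data ≈ 1.5
    y.backward()
    return x.grad                    # ≈ 1.0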
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 26
def sum_all(x: Variable) -> Variable:
"""
Sum all elements operation with gradient tracking.
Args:
x: Input Variable
Returns:
Variable with sum and gradient function
TODO: Implement sum operation with gradient computation.
APPROACH:
1. Compute forward pass: sum of all elements
2. Create gradient function that broadcasts gradient back
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = sum(x), then dz/dx_i = 1 for all i
EXAMPLE:
x = Variable([[1, 2], [3, 4]])
y = sum_all(x) # y.data = 10
y.backward() # x.grad = [[1, 1], [1, 1]]
HINTS:
- Use np.sum() for forward pass
- Gradient is ones with same shape as input
- This is used for loss computation
"""
### BEGIN SOLUTION
# Forward pass
result_data = Tensor(np.sum(x.data.data))
# Create gradient function
def grad_fn(grad_output):
if x.requires_grad:
# Sum gradient: broadcasts to all elements
x_grad = Variable(grad_output.data.data * np.ones_like(x.data.data))
x.backward(x_grad)
return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
### END SOLUTION
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 27
def mean(x: Variable) -> Variable:
"""
Mean operation with gradient tracking.
Args:
x: Input Variable
Returns:
Variable with mean and gradient function
TODO: Implement mean operation with gradient computation.
APPROACH:
1. Compute forward pass: mean of all elements
2. Create gradient function that distributes gradient evenly
3. Return Variable with result and grad_fn
MATHEMATICAL RULE:
If z = mean(x), then dz/dx_i = 1/n for all i (where n is number of elements)
EXAMPLE:
x = Variable([[1, 2], [3, 4]])
y = mean(x) # y.data = 2.5
y.backward() # x.grad = [[0.25, 0.25], [0.25, 0.25]]
HINTS:
- Use np.mean() for forward pass
- Gradient is 1/n for each element
- This is commonly used for loss computation
"""
### BEGIN SOLUTION
# Forward pass
result_data = Tensor(np.mean(x.data.data))
# Create gradient function
def grad_fn(grad_output):
if x.requires_grad:
# Mean gradient: 1/n for each element
n = x.data.size
x_grad = Variable(grad_output.data.data * np.ones_like(x.data.data) / n)
x.backward(x_grad)
return Variable(result_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
### END SOLUTION
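# Combined sketch for sum_all() and mean(): both reduce to a scalar, which is the
# usual entry point for backward() in a loss computation. For a 2x2 input, sum_all
# sends a gradient of 1 to every element while mean sends 1/4. zero_grad() is called
# between the two so the gradients do not accumulate across checks.
def _check_reduction_gradients():
    x = Variable(np.array([[1.0, 2.0], [3.0, 4.0]]))
    sum_all(x).backward()            # x.grad ≈ [[1, 1], [1, 1]]
    x.zero_grad()                    # reset before checking the second reduction
    mean(x).backward()               # x.grad ≈ [[0.25, 0.25], [0.25, 0.25]]
    return x.grad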
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 29
def clip_gradients(variables: List[Variable], max_norm: float = 1.0) -> None:
"""
Clip gradients to prevent exploding gradients.
Args:
variables: List of Variables to clip gradients for
max_norm: Maximum gradient norm allowed
TODO: Implement gradient clipping.
APPROACH:
1. Compute total gradient norm across all variables
2. If norm exceeds max_norm, scale all gradients down
3. Modify gradients in-place
MATHEMATICAL RULE:
If ||g|| > max_norm, then g := g * (max_norm / ||g||)
EXAMPLE:
variables = [w1, w2, b1, b2]
clip_gradients(variables, max_norm=1.0)
HINTS:
- Compute L2 norm of all gradients combined
- Scale factor = max_norm / total_norm
- Only clip if total_norm > max_norm
"""
### BEGIN SOLUTION
# Compute total gradient norm
total_norm = 0.0
for var in variables:
if var.grad is not None:
total_norm += np.sum(var.grad.data.data ** 2)
total_norm = np.sqrt(total_norm)
# Clip if necessary
if total_norm > max_norm:
scale_factor = max_norm / total_norm
for var in variables:
if var.grad is not None:
var.grad.data._data *= scale_factor
### END SOLUTION
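# Sketch of clip_gradients(): build two leaves with known gradients, then clip.
# The combined L2 norm here is 5.0 (gradients of 3.0 and 4.0), so with max_norm=1.0
# every gradient should be scaled by 1/5. Helper name is illustrative only.
def _check_clip_gradients():
    w1, w2 = Variable(1.0), Variable(1.0)
    (w1 * 3.0 + w2 * 4.0).backward()        # w1.grad ≈ 3.0, w2.grad ≈ 4.0
    clip_gradients([w1, w2], max_norm=1.0)
    return w1.grad, w2.grad                  # ≈ 0.6 and 0.8 after scaling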
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 30
def collect_parameters(*modules) -> List[Variable]:
"""
Collect all parameters from modules for optimization.
Args:
*modules: Variable number of modules/objects with parameters
Returns:
List of all Variables that require gradients
TODO: Implement parameter collection.
APPROACH:
1. Iterate through all provided modules
2. Find all Variable attributes that require gradients
3. Return list of all such Variables
EXAMPLE:
layer1 = SomeLayer()
layer2 = SomeLayer()
params = collect_parameters(layer1, layer2)
HINTS:
- Use hasattr() and getattr() to find Variable attributes
- Check if attribute is Variable and requires_grad
- Handle different module types gracefully
"""
### BEGIN SOLUTION
parameters = []
for module in modules:
if hasattr(module, '__dict__'):
for attr_name, attr_value in module.__dict__.items():
if isinstance(attr_value, Variable) and attr_value.requires_grad:
parameters.append(attr_value)
return parameters
### END SOLUTION
# %% ../../modules/source/07_autograd/autograd_dev.ipynb 31
def zero_gradients(variables: List[Variable]) -> None:
"""
Zero out gradients for all variables.
Args:
variables: List of Variables to zero gradients for
TODO: Implement gradient zeroing.
APPROACH:
1. Iterate through all variables
2. Call zero_grad() on each variable
3. Handle None gradients gracefully
EXAMPLE:
parameters = [w1, w2, b1, b2]
zero_gradients(parameters)
HINTS:
- Use the zero_grad() method on each Variable
- Check if variable has gradients before zeroing
- This is typically called before each training step
"""
### BEGIN SOLUTION
for var in variables:
if var.grad is not None:
var.zero_grad()
### END SOLUTION
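# Combined sketch for collect_parameters() and zero_gradients(): a minimal "module"
# is just an object whose attributes include Variables, which is all collect_parameters()
# looks for. The _TinyLayer class below is a stand-in for illustration, not part of
# TinyTorch itself.
class _TinyLayer:
    def __init__(self):
        self.weight = Variable(2.0)
        self.bias = Variable(0.5)

def _demo_parameter_utilities():
    layer = _TinyLayer()
    params = collect_parameters(layer)       # [layer.weight, layer.bias]
    loss = layer.weight * 3.0 + layer.bias   # toy scalar "loss"
    loss.backward()                          # weight.grad ≈ 3.0, bias.grad ≈ 1.0
    zero_gradients(params)                   # both gradients reset to 0 before the next step
    return params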