TinyTorch/tinytorch/core/autograd.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/05_autograd/autograd_dev.ipynb.

# %% auto 0
__all__ = ['Function', 'AddBackward', 'MulBackward', 'SubBackward', 'DivBackward', 'MatmulBackward', 'SumBackward',
           'ReshapeBackward', 'EmbeddingBackward', 'SqrtBackward', 'MeanBackward', 'ReLUBackward', 'GELUBackward',
           'SigmoidBackward', 'MSEBackward', 'BCEBackward', 'CrossEntropyBackward', 'enable_autograd']

# %% ../../modules/source/05_autograd/autograd_dev.ipynb 1
import numpy as np
from typing import Optional, List, Tuple
import sys
import os

from .tensor import Tensor

# %% ../../modules/source/05_autograd/autograd_dev.ipynb 6
class Function:
    """
    Base class for differentiable operations.

    Every operation that needs gradients (add, multiply, matmul, etc.)
    will inherit from this class and implement the apply() method.

    **Key Concepts:**
    - **saved_tensors**: Store inputs needed for backward pass
    - **apply()**: Compute gradients using chain rule
    - **next_functions**: Track computation graph connections

    **Example Usage:**
    ```python
    class AddBackward(Function):
        def apply(self, grad_output):
            # Addition distributes gradients equally
            return grad_output, grad_output
    ```
    """

    def __init__(self, *tensors):
        """
        Initialize function with input tensors.

        Args:
            *tensors: Input tensors that will be saved for backward pass
        """
        self.saved_tensors = tensors
        self.next_functions = []

        # Build computation graph connections
        for t in tensors:
            if isinstance(t, Tensor) and t.requires_grad:
                if hasattr(t, '_grad_fn'):
                    self.next_functions.append(t._grad_fn)

    def apply(self, grad_output):
        """
        Compute gradients for inputs.

        Args:
            grad_output: Gradient flowing backward from the output

        Returns:
            Tuple of gradients for each input tensor

        **Must be implemented by subclasses**
        """
        raise NotImplementedError("Each Function must implement apply() method")

# %% ../../modules/source/05_autograd/autograd_dev.ipynb 9
class AddBackward(Function):
    """
    Gradient computation for tensor addition.

    **Mathematical Rule:** If z = a + b, then ∂z/∂a = 1 and ∂z/∂b = 1

    **Key Insight:** Addition distributes gradients equally to both inputs.
    The gradient flowing backward is passed unchanged to each input.

    **Broadcasting Handling:** When input shapes differ due to broadcasting,
    we sum gradients appropriately to match original tensor shapes.
    """

    def apply(self, grad_output):
        """
        Compute gradients for addition.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple of (grad_a, grad_b) for the two inputs

        **Mathematical Foundation:**
        - ∂(a+b)/∂a = 1 → grad_a = grad_output
        - ∂(a+b)/∂b = 1 → grad_b = grad_output
        """
        a, b = self.saved_tensors
        grad_a = grad_b = None

        # Gradient for first input
        if isinstance(a, Tensor) and a.requires_grad:
            grad_a = grad_output

        # Gradient for second input
        if isinstance(b, Tensor) and b.requires_grad:
            grad_b = grad_output

        return grad_a, grad_b

# %% ../../modules/source/05_autograd/autograd_dev.ipynb 11
class MulBackward(Function):
    """
    Gradient computation for tensor multiplication.

    **Mathematical Rule:** If z = a * b, then ∂z/∂a = b and ∂z/∂b = a

    **Key Insight:** Each input's gradient equals the gradient output
    multiplied by the OTHER input's value (product rule).

    **Applications:** Used in weight scaling, attention mechanisms,
    and anywhere element-wise multiplication occurs.
    """

    def apply(self, grad_output):
        """
        Compute gradients for multiplication.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple of (grad_a, grad_b) for the two inputs

        **Mathematical Foundation:**
        - ∂(a*b)/∂a = b → grad_a = grad_output * b
        - ∂(a*b)/∂b = a → grad_b = grad_output * a
        """
        a, b = self.saved_tensors
        grad_a = grad_b = None

        # Gradient for first input: grad_output * b
        if isinstance(a, Tensor) and a.requires_grad:
            if isinstance(b, Tensor):
                grad_a = grad_output * b.data
            else:
                grad_a = grad_output * b

        # Gradient for second input: grad_output * a
        if isinstance(b, Tensor) and b.requires_grad:
            grad_b = grad_output * a.data

        return grad_a, grad_b

# %% ../../modules/source/05_autograd/autograd_dev.ipynb 12
class SubBackward(Function):
    """
    Gradient computation for tensor subtraction.

    **Mathematical Rule:** If z = a - b, then ∂z/∂a = 1 and ∂z/∂b = -1

    **Key Insight:** Subtraction passes gradient unchanged to first input,
    but negates it for second input (because of the minus sign).

    **Applications:** Used in residual connections, computing differences in losses.
    """

    def apply(self, grad_output):
        """
        Compute gradients for subtraction.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple of (grad_a, grad_b) for the two inputs

        **Mathematical Foundation:**
        - ∂(a-b)/∂a = 1 → grad_a = grad_output
        - ∂(a-b)/∂b = -1 → grad_b = -grad_output
        """
        a, b = self.saved_tensors
        grad_a = grad_b = None

        # Gradient for first input: grad_output (unchanged)
        if isinstance(a, Tensor) and a.requires_grad:
            grad_a = grad_output

        # Gradient for second input: -grad_output (negated)
        if isinstance(b, Tensor) and b.requires_grad:
            grad_b = -grad_output

        return grad_a, grad_b


#| export
class DivBackward(Function):
    """
    Gradient computation for tensor division.

    **Mathematical Rule:** If z = a / b, then ∂z/∂a = 1/b and ∂z/∂b = -a/b²

    **Key Insight:** Division gradient for numerator is 1/denominator,
    for denominator is -numerator/denominator².

    **Applications:** Used in normalization (LayerNorm, BatchNorm), loss functions.
    """

    def apply(self, grad_output):
        """
        Compute gradients for division.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple of (grad_a, grad_b) for the two inputs

        **Mathematical Foundation:**
        - ∂(a/b)/∂a = 1/b → grad_a = grad_output / b
        - ∂(a/b)/∂b = -a/b² → grad_b = -grad_output * a / b²
        """
        a, b = self.saved_tensors
        grad_a = grad_b = None

        # Gradient for numerator: grad_output / b
        if isinstance(a, Tensor) and a.requires_grad:
            if isinstance(b, Tensor):
                grad_a = grad_output / b.data
            else:
                grad_a = grad_output / b

        # Gradient for denominator: -grad_output * a / b²
        if isinstance(b, Tensor) and b.requires_grad:
            grad_b = -grad_output * a.data / (b.data ** 2)

        return grad_a, grad_b


# %% ../../modules/source/05_autograd/autograd_dev.ipynb 14
class MatmulBackward(Function):
    """
    Gradient computation for matrix multiplication.

    **Mathematical Rule:** If Z = A @ B, then:
    - ∂Z/∂A = grad_Z @ B.T
    - ∂Z/∂B = A.T @ grad_Z

    **Key Insight:** Matrix multiplication gradients involve transposing
    one input and multiplying with the gradient output.

    **Applications:** Core operation in neural networks for weight updates
    in linear layers, attention mechanisms, and transformers.
    """

    def apply(self, grad_output):
        """
        Compute gradients for matrix multiplication.

        Handles both 2D matrices and 3D batched tensors (for transformers).

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple of (grad_a, grad_b) for the two matrix inputs

        **Mathematical Foundation:**
        - 2D: ∂(A@B)/∂A = grad_output @ B.T
        - 3D: ∂(A@B)/∂A = grad_output @ swapaxes(B, -2, -1)

        **Why Both Cases:**
        - 2D: Traditional matrix multiplication (Linear layers)
        - 3D: Batched operations (Transformers: batch, seq, embed)
        """
        a, b = self.saved_tensors
        grad_a = grad_b = None

        # Detect if we're dealing with batched (3D) or regular (2D) tensors
        is_batched = len(grad_output.shape) == 3

        # Gradient for first input: grad_output @ b.T (or batched equivalent)
        if isinstance(a, Tensor) and a.requires_grad:
            if is_batched:
                # Batched: use matmul and swapaxes for transpose
                grad_a = np.matmul(grad_output, np.swapaxes(b.data, -2, -1))
            else:
                # 2D: use dot and .T for transpose
                grad_a = np.dot(grad_output, b.data.T)

        # Gradient for second input: a.T @ grad_output (or batched equivalent)
        if isinstance(b, Tensor) and b.requires_grad:
            if is_batched:
                # Batched: use matmul and swapaxes for transpose
                grad_b = np.matmul(np.swapaxes(a.data, -2, -1), grad_output)
            else:
                # 2D: use dot and .T for transpose
                grad_b = np.dot(a.data.T, grad_output)

        return grad_a, grad_b

# %% ../../modules/source/05_autograd/autograd_dev.ipynb 16
class SumBackward(Function):
    """
    Gradient computation for tensor sum.

    **Mathematical Rule:** If z = sum(a), then ∂z/∂a[i] = 1 for all i

    **Key Insight:** Sum distributes the gradient equally to all input elements.
    The gradient is broadcast from the reduced output back to input shape.

    **Applications:** Used in loss functions, mean operations, and
    anywhere tensor reduction occurs.
    """

    def apply(self, grad_output):
        """
        Compute gradients for sum operation.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple containing gradient for the input tensor

        **Mathematical Foundation:**
        - ∂sum(a)/∂a[i] = 1 → grad_a = ones_like(a) * grad_output
        """
        tensor, = self.saved_tensors

        if isinstance(tensor, Tensor) and tensor.requires_grad:
            # Gradient is 1 for all elements, scaled by grad_output
            return np.ones_like(tensor.data) * grad_output,
        return None,

# %% ../../modules/source/05_autograd/autograd_dev.ipynb 17
class ReshapeBackward(Function):
    """
    Gradient computation for tensor reshape.

    **Mathematical Rule:** If z = reshape(a, new_shape), then ∂z/∂a is reshape(grad_z, old_shape)

    **Key Insight:** Reshape doesn't change values, only their arrangement.
    Gradients flow back by reshaping to the original shape.

    **Applications:** Used in transformers (flattening for loss), CNNs, and
    anywhere tensor dimensions need to be rearranged.
    """

    def apply(self, grad_output):
        """
        Compute gradients for reshape operation.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple containing gradient for the input tensor

        **Mathematical Foundation:**
        - Reshape is a view operation: grad_input = reshape(grad_output, original_shape)
        """
        tensor, = self.saved_tensors
        original_shape = tensor.shape

        if isinstance(tensor, Tensor) and tensor.requires_grad:
            # Reshape gradient back to original input shape
            return np.reshape(grad_output, original_shape),
        return None,


# %% ../../modules/source/05_autograd/autograd_dev.ipynb 18
class EmbeddingBackward(Function):
    """
    Gradient computation for embedding lookup.

    **Mathematical Rule:** If z = embedding[indices], gradients accumulate at indexed positions.

    **Key Insight:** Multiple indices can point to the same embedding vector,
    so gradients must accumulate (not overwrite) at each position.

    **Applications:** Used in NLP transformers, language models, and any discrete input.
    """

    def apply(self, grad_output):
        """
        Compute gradients for embedding lookup.

        Args:
            grad_output: Gradient flowing backward from output (batch, seq, embed_dim)

        Returns:
            Tuple containing gradient for the embedding weight matrix

        **Mathematical Foundation:**
        - Embedding is a lookup: output[i] = weight[indices[i]]
        - Gradients scatter back to indexed positions: grad_weight[indices[i]] += grad_output[i]
        - Must accumulate because multiple positions can use same embedding
        """
        weight, indices = self.saved_tensors

        if isinstance(weight, Tensor) and weight.requires_grad:
            # Initialize gradient matrix with zeros
            grad_weight = np.zeros_like(weight.data)

            # Scatter gradients back to embedding table
            # np.add.at accumulates values at repeated indices
            flat_indices = indices.data.astype(int).flatten()
            flat_grad_output = grad_output.reshape((-1, weight.shape[-1]))

            np.add.at(grad_weight, flat_indices, flat_grad_output)

            return grad_weight, None

        return None, None


#| export
class SqrtBackward(Function):
    """
    Gradient computation for square root.

    **Mathematical Rule:** If z = sqrt(x), then ∂z/∂x = 1 / (2 * sqrt(x))

    **Key Insight:** Gradient is inversely proportional to the square root output.

    **Applications:** Used in normalization (LayerNorm, BatchNorm), distance metrics.
    """

    def apply(self, grad_output):
        """
        Compute gradients for sqrt operation.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple containing gradient for the input

        **Mathematical Foundation:**
        - d/dx(sqrt(x)) = 1 / (2 * sqrt(x)) = 1 / (2 * output)
        """
        x, = self.saved_tensors
        output = self.saved_output

        if isinstance(x, Tensor) and x.requires_grad:
            # Gradient: 1 / (2 * sqrt(x))
            grad_x = grad_output / (2.0 * output.data)
            return grad_x,

        return None,


#| export
class MeanBackward(Function):
    """
    Gradient computation for mean reduction.

    **Mathematical Rule:** If z = mean(x), then ∂z/∂x_i = 1 / N for all i

    **Key Insight:** Mean distributes gradient equally to all input elements.

    **Applications:** Used in loss functions, normalization (LayerNorm, BatchNorm).
    """

    def apply(self, grad_output):
        """
        Compute gradients for mean reduction.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple containing gradient for the input

        **Mathematical Foundation:**
        - mean reduces by averaging, so gradient is distributed equally
        - Each input element contributes 1/N to the output
        - Gradient: grad_output / N, broadcasted to input shape
        """
        x, = self.saved_tensors
        axis = self.axis
        keepdims = self.keepdims

        if isinstance(x, Tensor) and x.requires_grad:
            # Number of elements that were averaged
            if axis is None:
                N = x.size
            else:
                if isinstance(axis, int):
                    N = x.shape[axis]
                else:
                    N = np.prod([x.shape[ax] for ax in axis])

            # Distribute gradient equally: each element gets grad_output / N
            grad_x = grad_output / N

            # Broadcast gradient back to original shape
            if not keepdims and axis is not None:
                # Need to add back the reduced dimensions for broadcasting
                if isinstance(axis, int):
                    grad_x = np.expand_dims(grad_x, axis=axis)
                else:
                    for ax in sorted(axis):
                        grad_x = np.expand_dims(grad_x, axis=ax)

            # Broadcast to match input shape
            grad_x = np.broadcast_to(grad_x, x.shape)

            return grad_x,

        return None,


# %% ../../modules/source/05_autograd/autograd_dev.ipynb 23
class ReLUBackward(Function):
    """
    Gradient computation for ReLU activation.

    ReLU: f(x) = max(0, x)
    Derivative: f'(x) = 1 if x > 0, else 0
    """

    def __init__(self, input_tensor):
        """Initialize with input tensor."""
        super().__init__(input_tensor)

    def apply(self, grad_output):
        """Compute gradient for ReLU."""
        tensor, = self.saved_tensors

        if isinstance(tensor, Tensor) and tensor.requires_grad:
            # ReLU gradient: 1 if x > 0, else 0
            relu_grad = (tensor.data > 0).astype(np.float32)
            return grad_output * relu_grad,
        return None,

# %% ../../modules/source/05_autograd/autograd_dev.ipynb 24
class GELUBackward(Function):
    """
    Gradient computation for GELU activation.

    **Mathematical Rule:** GELU(x) = x * Φ(x) where Φ is the standard normal CDF

    **Key Insight:** GELU gradient involves both the function value and its derivative.

    **Applications:** Used in modern transformers (GPT, BERT) as a smooth alternative to ReLU.
    """

    def apply(self, grad_output):
        """
        Compute gradients for GELU activation.

        Args:
            grad_output: Gradient flowing backward from output

        Returns:
            Tuple containing gradient for the input

        **Mathematical Foundation:**
        - GELU approximation: f(x) = x * sigmoid(1.702 * x)
        - Gradient: f'(x) = sigmoid(1.702*x) + x * sigmoid(1.702*x) * (1-sigmoid(1.702*x)) * 1.702
        """
        x, = self.saved_tensors

        if isinstance(x, Tensor) and x.requires_grad:
            # GELU gradient using approximation
            # f(x) = x * sigmoid(1.702*x)
            # f'(x) = sigmoid(1.702*x) + 1.702 * x * sigmoid(1.702*x) * (1 - sigmoid(1.702*x))

            sig = 1.0 / (1.0 + np.exp(-1.702 * x.data))
            grad_x = grad_output * (sig + 1.702 * x.data * sig * (1 - sig))

            return grad_x,

        return None,


# %% ../../modules/source/05_autograd/autograd_dev.ipynb 25
class SigmoidBackward(Function):
    """
    Gradient computation for sigmoid activation.

    Sigmoid: σ(x) = 1/(1 + exp(-x))
    Derivative: σ'(x) = σ(x) * (1 - σ(x))
    """

    def __init__(self, input_tensor, output_tensor):
        """
        Initialize with both input and output.

        Args:
            input_tensor: Original input to sigmoid
            output_tensor: Output of sigmoid (saves recomputation)
        """
        super().__init__(input_tensor)
        self.output_data = output_tensor.data

    def apply(self, grad_output):
        """Compute gradient for sigmoid."""
        tensor, = self.saved_tensors

        if isinstance(tensor, Tensor) and tensor.requires_grad:
            # σ'(x) = σ(x) * (1 - σ(x))
            sigmoid_grad = self.output_data * (1 - self.output_data)
            return grad_output * sigmoid_grad,
        return None,

# %% ../../modules/source/05_autograd/autograd_dev.ipynb 26
class MSEBackward(Function):
    """
    Gradient computation for Mean Squared Error Loss.

    MSE: L = mean((predictions - targets)²)
    Derivative: ∂L/∂predictions = 2 * (predictions - targets) / N
    """

    def __init__(self, predictions, targets):
        """Initialize with predictions and targets."""
        super().__init__(predictions)
        self.targets_data = targets.data
        self.num_samples = np.size(targets.data)

    def apply(self, grad_output):
        """Compute gradient for MSE loss."""
        predictions, = self.saved_tensors

        if isinstance(predictions, Tensor) and predictions.requires_grad:
            # Gradient: 2 * (predictions - targets) / N
            grad = 2.0 * (predictions.data - self.targets_data) / self.num_samples

            return grad * grad_output,
        return None,

# %% ../../modules/source/05_autograd/autograd_dev.ipynb 27
class BCEBackward(Function):
    """
    Gradient computation for Binary Cross-Entropy Loss.

    BCE: L = -[y*log(p) + (1-y)*log(1-p)]
    Derivative: ∂L/∂p = (p - y) / (p*(1-p)*N)
    """

    def __init__(self, predictions, targets):
        """Initialize with predictions and targets."""
        super().__init__(predictions)
        self.targets_data = targets.data
        self.num_samples = np.size(targets.data)

    def apply(self, grad_output):
        """Compute gradient for BCE loss."""
        predictions, = self.saved_tensors

        if isinstance(predictions, Tensor) and predictions.requires_grad:
            eps = 1e-7
            p = np.clip(predictions.data, eps, 1 - eps)
            y = self.targets_data

            # Gradient: (p - y) / (p * (1-p) * N)
            grad = (p - y) / (p * (1 - p) * self.num_samples)

            return grad * grad_output,
        return None,

# %% ../../modules/source/05_autograd/autograd_dev.ipynb 28
class CrossEntropyBackward(Function):
    """
    Gradient computation for Cross-Entropy Loss.

    CrossEntropy: L = -mean(log_softmax(logits)[targets])

    The gradient with respect to logits is remarkably elegant:
    ∂L/∂logits = (softmax(logits) - one_hot(targets)) / N

    This is one of the most beautiful results in machine learning:
    - The gradient is simply the difference between predictions and targets
    - It naturally scales with how wrong we are
    - It's numerically stable when computed via softmax
    """

    def __init__(self, logits, targets):
        """Initialize with logits and target class indices."""
        super().__init__(logits)
        self.targets_data = targets.data.astype(int)
        self.batch_size = logits.data.shape[0]
        self.num_classes = logits.data.shape[1]

    def apply(self, grad_output):
        """Compute gradient for cross-entropy loss."""
        logits, = self.saved_tensors

        if isinstance(logits, Tensor) and logits.requires_grad:
            # Compute softmax probabilities
            # Using stable softmax: subtract max for numerical stability
            logits_data = logits.data
            max_logits = np.max(logits_data, axis=1, keepdims=True)
            exp_logits = np.exp(logits_data - max_logits)
            softmax = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

            # Create one-hot encoding of targets
            one_hot = np.zeros((self.batch_size, self.num_classes), dtype=np.float32)
            one_hot[np.arange(self.batch_size), self.targets_data] = 1.0

            # Gradient: (softmax - one_hot) / batch_size
            grad = (softmax - one_hot) / self.batch_size

            return grad * grad_output,
        return None,

# %% ../../modules/source/05_autograd/autograd_dev.ipynb 29
def enable_autograd():
    """
    Enable gradient tracking for all Tensor operations.

    This function enhances the existing Tensor class with autograd capabilities.
    Call this once to activate gradients globally.

    **What it does:**
    - Replaces Tensor operations with gradient-tracking versions
    - Adds backward() method for reverse-mode differentiation
    - Enables computation graph building
    - Maintains full backward compatibility

    **After calling this:**
    - Tensor operations will track computation graphs
    - backward() method becomes available
    - Gradients will flow through operations
    - requires_grad=True enables tracking per tensor

    **Example:**
    ```python
    enable_autograd()  # Call once
    x = Tensor([2.0], requires_grad=True)
    y = x * 3
    y.backward()
    print(x.grad)  # [3.0]
    ```
    """

    # Check if already enabled
    if hasattr(Tensor, '_autograd_enabled'):
        print("⚠️ Autograd already enabled")
        return

    # Store original operations
    _original_add = Tensor.__add__
    _original_sub = Tensor.__sub__
    _original_mul = Tensor.__mul__
    _original_truediv = Tensor.__truediv__
    _original_matmul = Tensor.matmul if hasattr(Tensor, 'matmul') else None

    # Enhanced operations that track gradients
    def tracked_add(self, other):
        """
        Addition with gradient tracking.

        Enhances the original __add__ method to build computation graphs
        when requires_grad=True for any input.
        """
        # Convert scalar to Tensor if needed
        if not isinstance(other, Tensor):
            other = Tensor(other)

        # Call original operation
        result = _original_add(self, other)

        # Track gradient if needed
        if self.requires_grad or other.requires_grad:
            result.requires_grad = True
            result._grad_fn = AddBackward(self, other)

        return result

    def tracked_mul(self, other):
        """
        Multiplication with gradient tracking.

        Enhances the original __mul__ method to build computation graphs
        when requires_grad=True for any input.
        """
        # Convert scalar to Tensor if needed for consistency
        if not isinstance(other, Tensor):
            other_tensor = Tensor(other)
        else:
            other_tensor = other

        # Call original operation
        result = _original_mul(self, other)

        # Track gradient if needed
        if self.requires_grad or (isinstance(other, Tensor) and other.requires_grad):
            result.requires_grad = True
            result._grad_fn = MulBackward(self, other)

        return result

    def tracked_sub(self, other):
        """
        Subtraction with gradient tracking.

        Enhances the original __sub__ method to build computation graphs
        when requires_grad=True for any input.
        """
        # Convert scalar to Tensor if needed
        if not isinstance(other, Tensor):
            other = Tensor(other)

        # Call original operation
        result = _original_sub(self, other)

        # Track gradient if needed
        if self.requires_grad or other.requires_grad:
            result.requires_grad = True
            result._grad_fn = SubBackward(self, other)

        return result

    def tracked_truediv(self, other):
        """
        Division with gradient tracking.

        Enhances the original __truediv__ method to build computation graphs
        when requires_grad=True for any input.
        """
        # Convert scalar to Tensor if needed
        if not isinstance(other, Tensor):
            other = Tensor(other)

        # Call original operation
        result = _original_truediv(self, other)

        # Track gradient if needed
        if self.requires_grad or other.requires_grad:
            result.requires_grad = True
            result._grad_fn = DivBackward(self, other)

        return result

    def tracked_matmul(self, other):
        """
        Matrix multiplication with gradient tracking.

        Enhances the original matmul method to build computation graphs
        when requires_grad=True for any input.
        """
        if _original_matmul:
            result = _original_matmul(self, other)
        else:
            # Fallback if matmul doesn't exist
            result = Tensor(np.dot(self.data, other.data))

        # Track gradient if needed
        if self.requires_grad or other.requires_grad:
            result.requires_grad = True
            result._grad_fn = MatmulBackward(self, other)

        return result

    def sum_op(self, axis=None, keepdims=False):
        """
        Sum operation with gradient tracking.

        Creates a new sum method that builds computation graphs
        when requires_grad=True.
        """
        result_data = np.sum(self.data, axis=axis, keepdims=keepdims)
        result = Tensor(result_data)

        if self.requires_grad:
            result.requires_grad = True
            result._grad_fn = SumBackward(self)

        return result

    def backward(self, gradient=None):
        """
        Compute gradients via backpropagation.

        This is the key method that makes training possible!
        It implements reverse-mode automatic differentiation.

        **Algorithm:**
        1. Initialize gradient if not provided (for scalar outputs)
        2. Accumulate gradient in self.grad
        3. If this tensor has a _grad_fn, call it to propagate gradients
        4. Recursively call backward() on parent tensors

        **Example:**
        ```python
        x = Tensor([2.0], requires_grad=True)
        y = x * 3
        y.backward()  # Computes gradients for x
        print(x.grad)  # [3.0]
        ```
        """
        # Only compute gradients if required
        if not self.requires_grad:
            return

        # Initialize gradient if not provided (for scalar outputs)
        if gradient is None:
            if self.data.size == 1:
                gradient = np.ones_like(self.data)
            else:
                raise ValueError("backward() requires gradient for non-scalar outputs")

        # Initialize or accumulate gradient
        if self.grad is None:
            self.grad = np.zeros_like(self.data)

        # Handle broadcasting: sum gradient to match self.data shape
        # This happens when operations broadcast tensors (e.g., adding bias to batch)
        if gradient.shape != self.grad.shape:
            # Step 1: Remove extra leading dimensions added during forward pass
            # Example: gradient (batch_size, features) → self.grad (features,)
            while gradient.ndim > self.grad.ndim:
                gradient = gradient.sum(axis=0)

            # Step 2: Sum over dimensions that were size-1 in original tensor
            # Example: bias with shape (1,) broadcast to (batch_size,) during forward
            for i in range(gradient.ndim):
                if self.grad.shape[i] == 1 and gradient.shape[i] != 1:
                    gradient = gradient.sum(axis=i, keepdims=True)

        self.grad += gradient

        # Propagate gradients through computation graph
        if hasattr(self, '_grad_fn') and self._grad_fn:
            grads = self._grad_fn.apply(gradient)

            # Recursively call backward on parent tensors
            for tensor, grad in zip(self._grad_fn.saved_tensors, grads):
                if isinstance(tensor, Tensor) and tensor.requires_grad and grad is not None:
                    tensor.backward(grad)

    def zero_grad(self):
        """
        Reset gradients to zero.

        Call this before each backward pass to prevent gradient accumulation
        from previous iterations.
        """
        self.grad = None

    # Install enhanced operations
    Tensor.__add__ = tracked_add
    Tensor.__sub__ = tracked_sub
    Tensor.__mul__ = tracked_mul
    Tensor.__truediv__ = tracked_truediv
    Tensor.matmul = tracked_matmul
    Tensor.sum = sum_op
    Tensor.backward = backward
    Tensor.zero_grad = zero_grad

    # Patch activations and losses to track gradients
    try:
        from tinytorch.core.activations import Sigmoid, ReLU, GELU
        from tinytorch.core.losses import BinaryCrossEntropyLoss, MSELoss, CrossEntropyLoss

        # Store original methods
        _original_sigmoid_forward = Sigmoid.forward
        _original_relu_forward = ReLU.forward
        _original_gelu_forward = GELU.forward
        _original_bce_forward = BinaryCrossEntropyLoss.forward
        _original_mse_forward = MSELoss.forward
        _original_ce_forward = CrossEntropyLoss.forward

        def tracked_sigmoid_forward(self, x):
            """Sigmoid with gradient tracking."""
            result_data = 1.0 / (1.0 + np.exp(-x.data))
            result = Tensor(result_data)

            if x.requires_grad:
                result.requires_grad = True
                result._grad_fn = SigmoidBackward(x, result)

            return result

        def tracked_relu_forward(self, x):
            """ReLU with gradient tracking."""
            result_data = np.maximum(0, x.data)
            result = Tensor(result_data)

            if x.requires_grad:
                result.requires_grad = True
                result._grad_fn = ReLUBackward(x)

            return result

        def tracked_gelu_forward(self, x):
            """GELU with gradient tracking."""
            # GELU approximation: x * sigmoid(1.702 * x)
            sigmoid_part = 1.0 / (1.0 + np.exp(-1.702 * x.data))
            result_data = x.data * sigmoid_part
            result = Tensor(result_data)

            if x.requires_grad:
                result.requires_grad = True
                result._grad_fn = GELUBackward(x)

            return result

        def tracked_bce_forward(self, predictions, targets):
            """Binary cross-entropy with gradient tracking."""
            # Compute BCE loss
            eps = 1e-7
            clamped_preds = np.clip(predictions.data, eps, 1 - eps)
            log_preds = np.log(clamped_preds)
            log_one_minus_preds = np.log(1 - clamped_preds)
            bce_per_sample = -(targets.data * log_preds + (1 - targets.data) * log_one_minus_preds)
            bce_loss = np.mean(bce_per_sample)

            result = Tensor(bce_loss)

            if predictions.requires_grad:
                result.requires_grad = True
                result._grad_fn = BCEBackward(predictions, targets)

            return result

        def tracked_mse_forward(self, predictions, targets):
            """MSE loss with gradient tracking."""
            # Compute MSE loss
            diff = predictions.data - targets.data
            squared_diff = diff ** 2
            mse = np.mean(squared_diff)

            result = Tensor(mse)

            if predictions.requires_grad:
                result.requires_grad = True
                result._grad_fn = MSEBackward(predictions, targets)

            return result

        def tracked_ce_forward(self, logits, targets):
            """Cross-entropy loss with gradient tracking."""
            from tinytorch.core.losses import log_softmax

            # Compute log-softmax for numerical stability
            log_probs = log_softmax(logits, dim=-1)

            # Select log-probabilities for correct classes
            batch_size = logits.shape[0]
            target_indices = targets.data.astype(int)
            selected_log_probs = log_probs.data[np.arange(batch_size), target_indices]

            # Return negative mean
            ce_loss = -np.mean(selected_log_probs)

            result = Tensor(ce_loss)

            if logits.requires_grad:
                result.requires_grad = True
                result._grad_fn = CrossEntropyBackward(logits, targets)

            return result

        # Install patched methods
        Sigmoid.forward = tracked_sigmoid_forward
        ReLU.forward = tracked_relu_forward
        GELU.forward = tracked_gelu_forward
        BinaryCrossEntropyLoss.forward = tracked_bce_forward
        MSELoss.forward = tracked_mse_forward
        CrossEntropyLoss.forward = tracked_ce_forward

    except ImportError:
        # Activations/losses not yet available (happens during module development)
        pass

    # Mark as enabled
    Tensor._autograd_enabled = True

    print("✅ Autograd enabled! Tensors now track gradients.")
    print("   - Operations build computation graphs")
    print("   - backward() computes gradients")
    print("   - requires_grad=True enables tracking")

# Auto-enable when module is imported
enable_autograd()