# TinyTorch/tinytorch/core/autograd.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/08_autograd/autograd_dev.ipynb.

# %% auto 0
__all__ = ['Variable', 'add', 'multiply', 'subtract', 'AutogradSystemsProfiler', 'to_numpy']

# %% ../../modules/source/08_autograd/autograd_dev.ipynb 1
import numpy as np
import sys
from typing import Union, List, Tuple, Optional, Any, Callable
from collections import defaultdict

# Import our existing components
try:
    from tinytorch.core.tensor import Tensor
except ImportError:
    # For development, import from local modules
    import os
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
    from tensor_dev import Tensor

def to_numpy(x):
    """
    Extract a NumPy array from a tensor-like object, array, or scalar.

    Resolution order:
        1. NumPy arrays and NumPy scalars are converted with ``np.array``
           (which returns a copy).
        2. Objects exposing ``.numpy()`` (e.g. ``Variable``) return the result
           of that call.
        3. Objects exposing ``.data`` are unwrapped: if ``.data`` is already an
           ndarray it is returned as-is; if ``.data`` itself has a ``.data``
           attribute that inner value is returned as an ndarray; otherwise
           ``.data`` is converted with ``np.array``.
        4. Anything else is converted with ``np.array``.

    Args:
        x: Any tensor-like object (Tensor, Variable, numpy array, or scalar)

    Returns:
        np.ndarray: The underlying numpy array

    Usage:
        data = to_numpy(x)
    """
    # Raw NumPy inputs must be handled first: ndarray and np.generic expose a
    # ``.data`` buffer attribute (a memoryview), which the ``.data`` fallback
    # below would otherwise mistake for a tensor wrapper. Accessing that
    # buffer also raises for dtypes that cannot be exported (e.g. object).
    if isinstance(x, (np.ndarray, np.generic)):
        return np.array(x)

    if hasattr(x, 'numpy'):
        # Tensor-like object with a .numpy() method (preferred)
        return x.numpy()

    if hasattr(x, 'data'):
        inner = x.data
        if isinstance(inner, np.ndarray):
            # ``inner.data`` would be a memoryview, not an array - stop here.
            return inner
        if hasattr(inner, 'data'):
            # Two-level wrapper; asarray is a no-op when this is an ndarray.
            return np.asarray(inner.data)
        return np.array(inner)

    # Plain Python scalar or sequence
    return np.array(x)

# %% ../../modules/source/08_autograd/autograd_dev.ipynb 7
class Variable:
    """
    Variable: Tensor wrapper with automatic differentiation capabilities.

    A Variable holds a ``Tensor`` plus the bookkeeping needed for reverse-mode
    differentiation: whether a gradient is wanted, the gradient accumulated so
    far, and the function that forwards an incoming gradient to the Variables
    this one was computed from.

    Attributes:
        data: The wrapped ``Tensor``.
        requires_grad: Whether ``backward`` stores/accumulates a gradient here.
        grad: Accumulated gradient (a ``Variable``), or ``None`` when nothing
            has been stored since construction or the last ``zero_grad``.
        grad_fn: Callable invoked by ``backward`` with the incoming gradient,
            or ``None`` for leaf Variables.
        is_leaf: ``True`` when ``grad_fn`` is ``None``.
    """

    def __init__(self, data: Union[Tensor, np.ndarray, list, float, int],
                 requires_grad: bool = True, grad_fn: Optional[Callable] = None):
        """
        Create a Variable with gradient tracking.

        Args:
            data: A ``Tensor`` (stored as-is) or anything ``Tensor(...)``
                accepts (wrapped in a new ``Tensor``).
            requires_grad: Whether to track a gradient for this Variable. A
                ``Tensor`` whose own ``requires_grad`` is truthy enables
                tracking even when this flag is ``False``.
            grad_fn: Function that propagates an incoming gradient to the
                inputs of the operation that produced this Variable; ``None``
                marks a leaf.

        Example:
            x = Variable(5.0, requires_grad=True)
            y = Variable([1, 2, 3], requires_grad=True)
            z = x + y  # z carries the grad_fn of the addition
        """
        ### BEGIN SOLUTION
        if isinstance(data, Tensor):
            self.data = data
            # Keep a handle on a gradient-tracking source Tensor so that
            # backward() can also deposit gradients on it.
            self._source_tensor = data if data.requires_grad else None
        else:
            self.data = Tensor(data)
            self._source_tensor = None

        # Set gradient tracking
        self.requires_grad = requires_grad or (isinstance(data, Tensor) and data.requires_grad)
        self.grad = None  # Filled in by backward()
        self.grad_fn = grad_fn
        self.is_leaf = grad_fn is None

        # For computational graph
        self._backward_hooks = []
        ### END SOLUTION

    @property
    def shape(self) -> Tuple[int, ...]:
        """Get the shape of the underlying tensor."""
        return self.data.shape

    @property
    def size(self) -> int:
        """Get the total number of elements."""
        return self.data.size

    def __repr__(self) -> str:
        """String representation of the Variable."""
        grad_str = ""
        if self.grad_fn:
            # Not every callable has __name__ (e.g. functools.partial), so
            # fall back to the callable's type name instead of raising.
            fn_name = getattr(self.grad_fn, '__name__', type(self.grad_fn).__name__)
            grad_str = f", grad_fn={fn_name}"
        return f"Variable({self.data.data.tolist()}, requires_grad={self.requires_grad}{grad_str})"

    def backward(self, gradient: Optional['Variable'] = None) -> None:
        """
        Compute gradients using backpropagation.

        Stores (or accumulates) ``gradient`` on this Variable when it requires
        gradients, mirrors it onto the source ``Tensor`` captured at
        construction (if any), and then calls ``grad_fn`` so the gradient
        continues to this Variable's inputs.

        Args:
            gradient: Gradient of the final output with respect to this
                Variable. ``None`` means a gradient of ones with the shape of
                this Variable's data. A value that is not a ``Variable`` is
                wrapped in one.

        Example:
            x = Variable(2.0, requires_grad=True)
            y = Variable(3.0, requires_grad=True)
            z = add(x, y)  # z = 5.0
            z.backward()
            print(x.grad)  # 1.0 (dz/dx = 1)
            print(y.grad)  # 1.0 (dz/dy = 1)
        """
        ### BEGIN SOLUTION
        if gradient is None:
            gradient = Variable(np.ones_like(self.data.data))
        elif not isinstance(gradient, Variable):
            # Accept raw arrays / scalars / Tensors as the seed gradient.
            gradient = Variable(gradient)

        if self.requires_grad:
            if self.grad is None:
                self.grad = gradient
            else:
                # Accumulate into a fresh Variable so a gradient object shared
                # with other nodes is never modified.
                self.grad = Variable(self.grad.data.data + gradient.data.data)

            # Propagate gradients back to the source Tensor (e.g. a parameter)
            if self._source_tensor is not None and self._source_tensor.requires_grad:
                if self._source_tensor.grad is None:
                    self._source_tensor.grad = gradient.data
                else:
                    # Accumulate gradients in the source tensor
                    self._source_tensor.grad = Tensor(self._source_tensor.grad.data + gradient.data.data)

        if self.grad_fn is not None:
            self.grad_fn(gradient)
        ### END SOLUTION

    def zero_grad(self) -> None:
        """Clear the gradient stored on this Variable (sets ``grad`` to ``None``)."""
        self.grad = None

    def numpy(self) -> np.ndarray:
        """
        Return the data held by the wrapped tensor (``self.data.data``).

        Usage:
            var = Variable([1, 2, 3])
            array = var.numpy()
        """
        return self.data.data

    def __add__(self, other: Union['Variable', float, int]) -> 'Variable':
        """Addition operator: self + other"""
        return add(self, other)

    def __radd__(self, other: Union[float, int]) -> 'Variable':
        """Reflected addition: other + self (also lets ``sum()`` work on Variables)."""
        return add(other, self)

    def __mul__(self, other: Union['Variable', float, int]) -> 'Variable':
        """Multiplication operator: self * other"""
        return multiply(self, other)

    def __rmul__(self, other: Union[float, int]) -> 'Variable':
        """Reflected multiplication: other * self"""
        return multiply(other, self)

    def __sub__(self, other: Union['Variable', float, int]) -> 'Variable':
        """Subtraction operator: self - other"""
        return subtract(self, other)

    def __rsub__(self, other: Union[float, int]) -> 'Variable':
        """Reflected subtraction: other - self"""
        return subtract(other, self)

    def __truediv__(self, other: Union['Variable', float, int]) -> 'Variable':
        """Division operator: self / other"""
        # NOTE(review): `divide` is not defined in the part of this module
        # visible during review - confirm it exists at module level, otherwise
        # this raises NameError when `/` is used.
        return divide(self, other)

    def __matmul__(self, other: 'Variable') -> 'Variable':
        """Matrix multiplication operator: self @ other"""
        return matmul_vars(self, other)

    def __pow__(self, power: Union[int, float]) -> 'Variable':
        """Power operator: self ** power"""
        return power_op(self, power)

# %% ../../modules/source/08_autograd/autograd_dev.ipynb 11
def _sum_to_shape(grad: np.ndarray, shape: Tuple[int, ...]) -> np.ndarray:
    """
    Reduce a broadcast gradient back to the shape of the operand it belongs to.

    NumPy broadcasting may prepend axes to an operand and stretch its size-1
    axes; the gradient for that operand is the sum over exactly those axes.

    Args:
        grad: Gradient with the (broadcast) shape of the operation's result.
        shape: Shape of the operand that should receive the gradient.

    Returns:
        An ndarray of shape ``shape``. ``grad`` is returned unchanged when it
        already matches or has fewer dimensions than ``shape``.
    """
    grad = np.asarray(grad)
    shape = tuple(shape)
    if grad.shape == shape or grad.ndim < len(shape):
        return grad

    # Axes that broadcasting prepended in front of the operand's own axes.
    extra_axes = grad.ndim - len(shape)
    if extra_axes:
        grad = grad.sum(axis=tuple(range(extra_axes)))

    # Axes where the operand had size 1 but was stretched to match.
    stretched = tuple(
        axis for axis, dim in enumerate(shape)
        if dim == 1 and grad.shape[axis] != 1
    )
    if stretched:
        grad = grad.sum(axis=stretched, keepdims=True)

    # A full reduction yields a NumPy scalar; hand back an array regardless.
    return np.asarray(grad)


def add(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
    """
    Addition operation with gradient tracking: a + b

    MATHEMATICAL FOUNDATION:
    - Forward: z = x + y
    - Backward: dz/dx = 1, dz/dy = 1, so each input receives the incoming
      gradient, summed over any axes along which that input was broadcast.

    Args:
        a: Left operand. Anything that is not a Variable is wrapped in a
            Variable with ``requires_grad=False``.
        b: Right operand, converted the same way.

    Returns:
        Variable holding ``a.data + b.data``; it requires gradients when
        either operand does.

    EXAMPLE USAGE:
    ```python
    x = Variable(2.0, requires_grad=True)
    y = Variable(3.0, requires_grad=True)
    z = add(x, y)  # z = 5.0
    z.backward()
    print(x.grad)  # 1.0 (dz/dx = 1)
    print(y.grad)  # 1.0 (dz/dy = 1)
    ```
    """
    ### BEGIN SOLUTION
    # Convert every non-Variable operand (scalars, NumPy scalars, arrays, ...)
    if not isinstance(a, Variable):
        a = Variable(a, requires_grad=False)
    if not isinstance(b, Variable):
        b = Variable(b, requires_grad=False)

    # Forward pass
    result_data = a.data + b.data

    # Backward function
    def grad_fn(grad_output):
        upstream = to_numpy(grad_output)
        for operand in (a, b):
            if not operand.requires_grad:
                continue
            target_shape = np.shape(operand.data.data)
            if np.shape(upstream) == target_shape:
                # No broadcasting happened for this operand: pass through.
                operand.backward(grad_output)
            else:
                operand.backward(Variable(_sum_to_shape(upstream, target_shape)))

    # Return new Variable with gradient function
    requires_grad = a.requires_grad or b.requires_grad
    return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

# %% ../../modules/source/08_autograd/autograd_dev.ipynb 15
def multiply(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
    """
    Multiplication operation with gradient tracking: a * b

    MATHEMATICAL FOUNDATION:
    - Forward: z = x * y
    - Backward (product rule): dL/dx = dL/dz * y, dL/dy = dL/dz * x, each
      summed over any axes along which that input was broadcast.

    Args:
        a: Left operand. Anything that is not a Variable is wrapped in a
            Variable with ``requires_grad=False``.
        b: Right operand, converted the same way.

    Returns:
        Variable holding ``a.data * b.data``; it requires gradients when
        either operand does.

    EXAMPLE USAGE:
    ```python
    x = Variable(2.0, requires_grad=True)
    y = Variable(3.0, requires_grad=True)
    z = multiply(x, y)  # z = 6.0
    z.backward()
    print(x.grad)  # 3.0 (dz/dx = y)
    print(y.grad)  # 2.0 (dz/dy = x)
    ```
    """
    ### BEGIN SOLUTION
    # Convert every non-Variable operand (scalars, NumPy scalars, arrays, ...)
    if not isinstance(a, Variable):
        a = Variable(a, requires_grad=False)
    if not isinstance(b, Variable):
        b = Variable(b, requires_grad=False)

    # Forward pass
    result_data = a.data * b.data

    # Backward function
    def grad_fn(grad_output):
        upstream = to_numpy(grad_output)
        # Product rule: d(xy)/dx = y, d(xy)/dy = x
        if a.requires_grad:
            grad_a = _sum_to_shape(upstream * b.data.data, np.shape(a.data.data))
            a.backward(Variable(grad_a))
        if b.requires_grad:
            grad_b = _sum_to_shape(upstream * a.data.data, np.shape(b.data.data))
            b.backward(Variable(grad_b))

    # Return new Variable with gradient function
    requires_grad = a.requires_grad or b.requires_grad
    return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

# %% ../../modules/source/08_autograd/autograd_dev.ipynb 18
def subtract(a: Union[Variable, float, int], b: Union[Variable, float, int]) -> Variable:
    """
    Subtraction operation with gradient tracking: a - b

    MATHEMATICAL RULE:
    If z = x - y, then dz/dx = 1 and dz/dy = -1. Each input's gradient is
    summed over any axes along which that input was broadcast.

    Args:
        a: First operand (minuend). Anything that is not a Variable is
            wrapped in a Variable with ``requires_grad=False``.
        b: Second operand (subtrahend), converted the same way.

    Returns:
        Variable holding ``a.data - b.data``; it requires gradients when
        either operand does.

    EXAMPLE:
    x = Variable(5.0), y = Variable(3.0)
    z = subtract(x, y)  # z.data = 2.0
    z.backward()        # x.grad = 1.0, y.grad = -1.0
    """
    ### BEGIN SOLUTION
    # Convert to Variables if needed
    if not isinstance(a, Variable):
        a = Variable(a, requires_grad=False)
    if not isinstance(b, Variable):
        b = Variable(b, requires_grad=False)

    # Forward pass
    result_data = a.data - b.data

    # Create gradient function
    def grad_fn(grad_output):
        upstream = to_numpy(grad_output)
        # Subtraction rule: d(x-y)/dx = 1, d(x-y)/dy = -1
        if a.requires_grad:
            a_shape = np.shape(a.data.data)
            if np.shape(upstream) == a_shape:
                # No broadcasting happened for the minuend: pass through.
                a.backward(grad_output)
            else:
                a.backward(Variable(_sum_to_shape(upstream, a_shape)))
        if b.requires_grad:
            b_grad = _sum_to_shape(-upstream, np.shape(b.data.data))
            b.backward(Variable(b_grad))

    # Determine if result requires gradients
    requires_grad = a.requires_grad or b.requires_grad

    return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

# %% ../../modules/source/08_autograd/autograd_dev.ipynb 25
import time
import gc
from collections import defaultdict, deque

class AutogradSystemsProfiler:
    """
    Autograd system performance analysis helper.

    Times a chain of simulated operations at increasing graph depths, derives
    scaling ratios and optimization hints from those measurements, and models
    the memory/time trade-off of gradient checkpointing.

    Attributes:
        profiling_data: Raw per-run results, keyed by analysis name.
        graph_analysis: Result of the most recent graph scaling analysis.
        optimization_strategies: List initialised empty in ``__init__``; not
            written by any method of this class.
    """

    def __init__(self):
        """Initialize autograd systems profiler with empty result stores."""
        self.profiling_data = defaultdict(list)
        self.graph_analysis = defaultdict(list)
        self.optimization_strategies = []

    def profile_computational_graph_depth(self, max_depth=10, operations_per_level=5):
        """
        Profile computational graph performance vs depth.

        For every depth from 1 to ``max_depth`` this builds a chain of
        ``depth * operations_per_level`` operations on a random 100x100
        input, times a forward pass (scale/add operations only), times a
        simulated backward pass (plain arithmetic standing in for gradient
        work), and estimates memory use.

        Args:
            max_depth: Largest graph depth to profile. Values below 1 profile
                nothing and yield empty results.
            operations_per_level: Number of operations added per depth level.

        Returns:
            dict with keys ``'detailed_results'`` (per-depth metrics keyed by
            depth), ``'graph_analysis'`` (see ``_analyze_graph_scaling``) and
            ``'optimization_strategies'`` (list of strings).

        EXAMPLE:
        profiler = AutogradSystemsProfiler()
        report = profiler.profile_computational_graph_depth(max_depth=8)
        print(report['graph_analysis']['efficiency_metrics']['peak_memory_mb'])
        """
        ### BEGIN SOLUTION
        print("🔧 Profiling Computational Graph Depth Impact...")

        results = {}
        timing_iterations = 3

        for depth in range(1, max_depth + 1):
            print(f"  Testing graph depth: {depth}")

            # Start with input variable
            try:
                x = Variable(np.random.randn(100, 100), requires_grad=True)
            except Exception:
                # Fallback for testing - simulate Variable with Tensor
                x = Tensor(np.random.randn(100, 100))

            current_var = self._build_graph(x, depth, operations_per_level)
            avg_forward_time = self._time_forward_pass(
                x, depth, operations_per_level, timing_iterations)
            avg_backward_time = self._time_simulated_backward_pass(
                depth, operations_per_level, timing_iterations)
            base_memory, result_memory = self._measure_memory_mb(x, current_var)

            # Estimate gradient memory (in production, each operation stores gradients)
            estimated_gradient_memory = depth * operations_per_level * base_memory * 0.5
            total_memory = base_memory + result_memory + estimated_gradient_memory

            # Calculate efficiency metrics
            total_operations = depth * operations_per_level
            total_time = avg_forward_time + avg_backward_time
            operations_per_second = total_operations / total_time if total_time > 0 else 0

            results[depth] = {
                'graph_depth': depth,
                'total_operations': total_operations,
                'forward_time_ms': avg_forward_time * 1000,
                'backward_time_ms': avg_backward_time * 1000,
                'total_time_ms': total_time * 1000,
                'base_memory_mb': base_memory,
                'estimated_gradient_memory_mb': estimated_gradient_memory,
                'total_memory_mb': total_memory,
                'operations_per_second': operations_per_second,
                'memory_per_operation': total_memory / total_operations if total_operations > 0 else 0
            }

            print(f"    Forward: {avg_forward_time*1000:.3f}ms, Backward: {avg_backward_time*1000:.3f}ms, Memory: {total_memory:.2f}MB")

        # Analyze scaling patterns
        graph_analysis = self._analyze_graph_scaling(results)

        # Store profiling data (the optimizer below reads self.graph_analysis)
        self.profiling_data['graph_depth_analysis'] = results
        self.graph_analysis = graph_analysis

        return {
            'detailed_results': results,
            'graph_analysis': graph_analysis,
            'optimization_strategies': self._generate_graph_optimizations(results)
        }
        ### END SOLUTION

    @staticmethod
    def _build_graph(x, depth, operations_per_level):
        """Chain ``depth * operations_per_level`` simulated ops onto ``x`` and return the final value."""
        current_var = x
        for _level in range(depth):
            for op_idx in range(operations_per_level):
                try:
                    kind = op_idx % 4
                    if kind == 0:
                        current_var = current_var * 0.9  # Scale operation
                    elif kind == 1:
                        current_var = current_var + 0.1  # Add operation
                    elif kind == 2:
                        # Matrix multiplication (most expensive)
                        weight = Tensor(np.random.randn(100, 100))
                        if hasattr(current_var, 'data'):
                            current_var = Tensor(current_var.data @ weight.data)
                        else:
                            current_var = current_var @ weight
                    elif hasattr(current_var, 'data'):
                        # Activation-like operation (skipped for objects without .data)
                        current_var = Tensor(np.maximum(0, current_var.data))
                except Exception:
                    # Best-effort simulation: replace a failed op with fresh data
                    current_var = Tensor(np.random.randn(100, 100))
        return current_var

    @staticmethod
    def _time_forward_pass(x, depth, operations_per_level, iterations):
        """Return the average seconds for one pass of the scale/add operations over ``x``."""
        start = time.perf_counter()
        for _ in range(iterations):
            temp_x = x
            for _level in range(depth):
                for op_idx in range(operations_per_level):
                    if op_idx % 4 == 0:
                        temp_x = temp_x * 0.9
                    elif op_idx % 4 == 1:
                        temp_x = temp_x + 0.1
                    # Expensive ops are skipped for timing
        return (time.perf_counter() - start) / iterations

    @staticmethod
    def _time_simulated_backward_pass(depth, operations_per_level, iterations):
        """Return the average seconds for the arithmetic loop that stands in for backpropagation."""
        start = time.perf_counter()
        for _ in range(iterations):
            gradient_accumulation = 0
            for level in range(depth):
                for op_idx in range(operations_per_level):
                    # Placeholder work; a real implementation would call loss.backward()
                    gradient_accumulation += level * op_idx * 0.001
        return (time.perf_counter() - start) / iterations

    @staticmethod
    def _measure_memory_mb(x, current_var):
        """Return ``(input_mb, result_mb)`` read from ``nbytes``; both are 1.0 if the lookup raises."""
        bytes_per_mb = 1024 * 1024
        try:
            if hasattr(x, 'data'):
                base_memory = x.data.nbytes / bytes_per_mb
                if hasattr(current_var, 'data'):
                    result_memory = current_var.data.nbytes / bytes_per_mb
                else:
                    result_memory = base_memory
            else:
                base_memory = x.nbytes / bytes_per_mb if hasattr(x, 'nbytes') else 1.0
                result_memory = base_memory
        except Exception:
            base_memory = 1.0
            result_memory = 1.0
        return base_memory, result_memory

    @staticmethod
    def _safe_ratio(numerator, denominator, floor=0.001):
        """Divide, substituting ``floor`` for a non-positive denominator (e.g. a 0.0 timing)."""
        return numerator / (denominator if denominator > 0 else floor)

    def _analyze_graph_scaling(self, results):
        """
        Analyze computational graph scaling patterns.

        Args:
            results: Mapping of depth -> metrics dict containing at least
                ``'forward_time_ms'``, ``'backward_time_ms'`` and
                ``'total_memory_mb'``.

        Returns:
            dict with ``'efficiency_metrics'`` always present, plus
            ``'scaling_metrics'``, ``'primary_bottleneck'`` and
            ``'bottleneck_reason'`` when at least two depths were measured.
        """
        analysis = {}
        depths = sorted(results.keys())

        if not depths:
            # Nothing was profiled: report neutral metrics instead of failing.
            analysis['efficiency_metrics'] = {
                'avg_backward_forward_ratio': 0.0,
                'peak_memory_mb': 0.0,
                'memory_efficiency_trend': 'stable'
            }
            return analysis

        memory_usage = [results[d]['total_memory_mb'] for d in depths]

        # Calculate scaling factors between the shallowest and deepest graph
        if len(depths) >= 2:
            shallow = results[depths[0]]
            deep = results[depths[-1]]

            depth_ratio = depths[-1] / depths[0]
            forward_time_ratio = self._safe_ratio(deep['forward_time_ms'], shallow['forward_time_ms'])
            backward_time_ratio = self._safe_ratio(deep['backward_time_ms'], shallow['backward_time_ms'])
            memory_ratio = self._safe_ratio(deep['total_memory_mb'], shallow['total_memory_mb'])

            analysis['scaling_metrics'] = {
                'depth_ratio': depth_ratio,
                'forward_time_scaling': forward_time_ratio,
                'backward_time_scaling': backward_time_ratio,
                'memory_scaling': memory_ratio,
                'theoretical_linear': depth_ratio  # Expected linear scaling
            }

            # Identify bottlenecks
            if backward_time_ratio > forward_time_ratio * 1.5:
                analysis['primary_bottleneck'] = 'backward_pass'
                analysis['bottleneck_reason'] = 'Gradient computation scaling worse than forward pass'
            elif memory_ratio > depth_ratio * 1.5:
                analysis['primary_bottleneck'] = 'memory'
                analysis['bottleneck_reason'] = 'Memory usage scaling faster than linear'
            else:
                analysis['primary_bottleneck'] = 'balanced'
                analysis['bottleneck_reason'] = 'Forward and backward passes scaling proportionally'

        # Backward/Forward ratio analysis (forward time floored at 0.001 ms)
        backward_forward_ratios = [
            results[d]['backward_time_ms'] / max(results[d]['forward_time_ms'], 0.001)
            for d in depths
        ]
        avg_backward_forward_ratio = sum(backward_forward_ratios) / len(backward_forward_ratios)

        analysis['efficiency_metrics'] = {
            'avg_backward_forward_ratio': avg_backward_forward_ratio,
            'peak_memory_mb': max(memory_usage),
            'memory_efficiency_trend': 'increasing' if memory_usage[-1] > memory_usage[0] * 2 else 'stable'
        }

        return analysis

    def _generate_graph_optimizations(self, results):
        """
        Generate computational graph optimization strategies.

        Args:
            results: Mapping of depth -> metrics dict containing
                ``'total_memory_mb'``.

        Returns:
            List of human-readable strategy strings. Timing-based hints come
            from ``self.graph_analysis`` as stored by the last profiling run.
        """
        strategies = []

        # Analyze memory growth patterns (0.0 when nothing was profiled)
        peak_memory = max((result['total_memory_mb'] for result in results.values()), default=0.0)

        if peak_memory > 50:  # > 50MB memory usage
            strategies.append("💾 High memory usage detected in computational graph")
            strategies.append("🔧 Strategy: Gradient checkpointing for deep graphs")
            strategies.append("🔧 Strategy: In-place operations where mathematically valid")

        # Analyze computational efficiency
        graph_analysis = self.graph_analysis
        if graph_analysis and 'scaling_metrics' in graph_analysis:
            backward_scaling = graph_analysis['scaling_metrics']['backward_time_scaling']
            if backward_scaling > 2.0:
                strategies.append("🐌 Backward pass scaling poorly with graph depth")
                strategies.append("🔧 Strategy: Kernel fusion for backward operations")
                strategies.append("🔧 Strategy: Parallel gradient computation")

        # Memory vs computation trade-offs
        if graph_analysis and 'efficiency_metrics' in graph_analysis:
            backward_forward_ratio = graph_analysis['efficiency_metrics']['avg_backward_forward_ratio']
            if backward_forward_ratio > 3.0:
                strategies.append("⚖️ Backward pass significantly slower than forward")
                strategies.append("🔧 Strategy: Optimize gradient computation with sparse gradients")
                strategies.append("🔧 Strategy: Use mixed precision to reduce memory bandwidth")

        # Production optimization recommendations
        strategies.append("🏭 Production graph optimizations:")
        strategies.append("   • Graph compilation and optimization (TorchScript, XLA)")
        strategies.append("   • Operator fusion to minimize intermediate allocations")
        strategies.append("   • Dynamic shape optimization for variable input sizes")
        strategies.append("   • Gradient accumulation for large effective batch sizes")

        return strategies

    def analyze_memory_checkpointing_trade_offs(self, checkpoint_frequencies=(1, 2, 4, 8)):
        """
        Analyze memory vs computation trade-offs with gradient checkpointing.

        Uses a fixed analytical model (12 layers, 10 MB and 5 ms per layer)
        rather than measurements, and prints a summary for every frequency.

        Args:
            checkpoint_frequencies: Iterable of positive checkpoint intervals
                (store an activation every ``freq`` layers).

        Returns:
            List with one dict per frequency containing
            ``'checkpoint_frequency'``, ``'memory_mb'``,
            ``'memory_reduction_pct'``, ``'recomputation_time_ms'``,
            ``'time_overhead_pct'`` and ``'memory_time_ratio'``. Empty when no
            frequencies are given.
        """
        print("🔍 GRADIENT CHECKPOINTING ANALYSIS")
        print("=" * 45)

        base_graph_depth = 12
        base_memory_per_layer = 10  # MB per layer
        base_computation_time = 5  # ms per layer

        # Neither baseline depends on the checkpoint frequency.
        no_checkpoint_memory = base_graph_depth * base_memory_per_layer
        base_training_time = base_graph_depth * base_computation_time * 2  # forward + backward

        checkpointing_results = []

        for freq in checkpoint_frequencies:
            # With checkpointing: only store every freq-th activation
            checkpointed_memory = (base_graph_depth // freq + 1) * base_memory_per_layer
            memory_savings = no_checkpoint_memory - checkpointed_memory
            memory_reduction_pct = (memory_savings / no_checkpoint_memory) * 100

            # Need to recompute (freq-1) layers for each checkpoint
            recomputation_layers = base_graph_depth * (freq - 1) / freq
            recomputation_time = recomputation_layers * base_computation_time
            time_overhead_pct = (recomputation_time / base_training_time) * 100

            result = {
                'checkpoint_frequency': freq,
                'memory_mb': checkpointed_memory,
                'memory_reduction_pct': memory_reduction_pct,
                'recomputation_time_ms': recomputation_time,
                'time_overhead_pct': time_overhead_pct,
                # Overhead floored at 1% so zero-overhead setups do not divide by zero
                'memory_time_ratio': memory_reduction_pct / max(time_overhead_pct, 1)
            }
            checkpointing_results.append(result)

            print(f"  Checkpoint every {freq} layers:")
            print(f"    Memory: {checkpointed_memory:.0f}MB ({memory_reduction_pct:.1f}% reduction)")
            print(f"    Time overhead: {time_overhead_pct:.1f}%")
            print(f"    Efficiency ratio: {result['memory_time_ratio']:.2f}")

        if not checkpointing_results:
            # No candidates, so there is no optimum to report.
            return checkpointing_results

        # Find optimal trade-off
        optimal = max(checkpointing_results, key=lambda x: x['memory_time_ratio'])

        print(f"\n📈 Checkpointing Analysis:")
        print(f"  Optimal frequency: Every {optimal['checkpoint_frequency']} layers")
        print(f"  Best trade-off: {optimal['memory_reduction_pct']:.1f}% memory reduction")
        print(f"  Cost: {optimal['time_overhead_pct']:.1f}% time overhead")

        return checkpointing_results
def matmul_vars(a: 'Variable', b: 'Variable') -> 'Variable':
    """
    Matrix multiplication for Variables with gradient tracking.

    Backward rule for ``C = A @ B``:
        dA = grad_output @ B^T
        dB = A^T @ grad_output
    where ``^T`` swaps only the last two axes, so stacked (batched) matrices
    are handled. When one operand's batch axes were broadcast against the
    other's, its gradient is summed over those axes so it matches the
    operand's own shape.

    NOTE: 1-D operands get no special treatment; for them the transpose is a
    no-op and the formulas above are applied as written.

    Args:
        a: Left Variable (shape: ..., m, k)
        b: Right Variable (shape: ..., k, n)

    Returns:
        Result Variable (shape: ..., m, n). It carries a gradient function
        only when at least one operand requires gradients.
    """
    # Forward pass
    result_data = a.data.data @ b.data.data

    def _transpose_last_two(matrix):
        # ``.T`` would reverse *all* axes, scrambling any batch dimensions.
        matrix = np.asarray(matrix)
        return np.swapaxes(matrix, -1, -2) if matrix.ndim >= 2 else matrix

    def _sum_to_shape(grad, shape):
        # Undo broadcasting: sum prepended axes, then stretched size-1 axes.
        grad = np.asarray(grad)
        if grad.shape == tuple(shape) or grad.ndim < len(shape):
            return grad
        extra_axes = grad.ndim - len(shape)
        if extra_axes:
            grad = grad.sum(axis=tuple(range(extra_axes)))
        stretched = tuple(
            axis for axis, dim in enumerate(shape)
            if dim == 1 and grad.shape[axis] != 1
        )
        if stretched:
            grad = grad.sum(axis=stretched, keepdims=True)
        return np.asarray(grad)

    # Create gradient function
    def grad_fn(grad_output):
        upstream = grad_output.data.data

        if a.requires_grad:
            grad_a_data = upstream @ _transpose_last_two(b.data.data)
            a.backward(Variable(_sum_to_shape(grad_a_data, np.shape(a.data.data))))

        if b.requires_grad:
            grad_b_data = _transpose_last_two(a.data.data) @ upstream
            b.backward(Variable(_sum_to_shape(grad_b_data, np.shape(b.data.data))))

    # Create result Variable
    requires_grad = a.requires_grad or b.requires_grad
    return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn if requires_grad else None)

def power_op(a: Variable, power: Union[int, float]) -> Variable:
    """
    Power operation with gradient tracking: a ** power

    Backward rule: d(x^n)/dx = n * x^(n-1). For ``power == 0`` the result is
    constant, so the gradient is exactly zero.

    Args:
        a: Base variable
        power: Power to raise to (int or float)

    Returns:
        Variable with power result. It carries a gradient function only when
        ``a`` requires gradients.
    """
    # Forward pass
    result_data = a.data.data ** power

    def grad_fn(grad_output):
        if not a.requires_grad:
            return

        base = a.data.data
        upstream = grad_output.data.data

        if np.isscalar(power) and power == 0:
            # Evaluating 0 * x**(-1) would give NaN at x == 0 and raises for
            # integer data; the derivative of a constant is simply zero.
            grad_a_data = np.zeros_like(base, dtype=float)
        else:
            # Gradient of x^n is n * x^(n-1)
            grad_a_data = power * (base ** (power - 1)) * upstream

        a.backward(Variable(grad_a_data))

    requires_grad = a.requires_grad
    return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn if requires_grad else None)