# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║                        🚨 CRITICAL WARNING 🚨                                ║
# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
# ║                                                                               ║
# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
# ║                                                                               ║
# ║  ✅ TO EDIT: src/09_spatial/09_spatial.py                           ║
# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
# ║                                                                               ║
# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
# ║     Editing it directly may break module functionality and training.         ║
# ║                                                                               ║
# ║  🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners)    ║
# ║     The tinytorch/ directory is generated code - edit source files instead!  ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
# Names exported by a star-import of this module: the three default constants,
# the layer classes, and the two backward Function classes defined below.
__all__ = ['DEFAULT_KERNEL_SIZE', 'DEFAULT_STRIDE', 'DEFAULT_PADDING', 'Conv2dBackward', 'Conv2d', 'MaxPool2dBackward',
           'MaxPool2d', 'AvgPool2d', 'SimpleCNN']

# %% ../../modules/09_spatial/spatial.ipynb 1
import numpy as np
import time

from .tensor import Tensor
from .autograd import Function

# Constants for convolution defaults
# NOTE(review): none of the classes in this file read these constants
# (Conv2d's signature spells out stride=1 and padding=0 as literals); they are
# listed in __all__, so presumably external code imports them — confirm before
# changing or removing.
DEFAULT_KERNEL_SIZE = 3  # Default kernel size for convolutions
DEFAULT_STRIDE = 1  # Default stride for convolutions
DEFAULT_PADDING = 0  # Default padding for convolutions

# %% ../../modules/09_spatial/spatial.ipynb 6
class Conv2dBackward(Function):
    """
    Backward pass of a 2D convolution.

    Given the gradient arriving at the convolution output, this produces:
    - grad_input: gradient w.r.t. the layer input (passed to the previous layer)
    - grad_weight: gradient w.r.t. the filters
    - grad_bias: gradient w.r.t. the bias, or None when the layer has no bias

    Every contribution is accumulated one scalar at a time, mirroring the
    explicit-loop (educational) style of the forward pass.
    """

    def __init__(self, x, weight, bias, stride, padding, kernel_size, padded_shape):
        # Hand autograd only the tensors that exist: the bias is optional
        tracked = (x, weight) if bias is None else (x, weight, bias)
        super().__init__(*tracked)

        self.x, self.weight, self.bias = x, weight, bias
        self.stride, self.padding = stride, padding
        self.kernel_size = kernel_size
        self.padded_shape = padded_shape

    def apply(self, grad_output):
        """
        Turn the output gradient into gradients for input, weight and bias.

        Args:
            grad_output: Gradient flowing back from the next layer
                        Shape: (batch_size, out_channels, out_height, out_width)

        Returns:
            Tuple of (grad_input, grad_weight, grad_bias); grad_bias is None
            when the layer was built without a bias.
        """
        n_batch, n_out, out_rows, out_cols = grad_output.shape
        _, n_in, _, _ = self.x.shape
        k_rows, k_cols = self.kernel_size
        pad = self.padding
        step = self.stride
        filters = self.weight.data

        # Rebuild the zero-padded input the forward pass convolved over
        if pad > 0:
            border = (pad, pad)
            source = np.pad(self.x.data,
                            ((0, 0), (0, 0), border, border),
                            mode='constant', constant_values=0)
        else:
            source = self.x.data

        # Accumulators, one per returned gradient
        d_source = np.zeros_like(source)
        d_filters = np.zeros_like(filters)
        d_bias = np.zeros_like(self.bias.data) if self.bias is not None else None

        # Kernel taps in (k_h, k_w, in_ch) order, built once and reused
        taps = list(np.ndindex(k_rows, k_cols, n_in))

        # Walk every output element and scatter its gradient to the weights
        # and input cells that produced it
        for n, f, row, col in np.ndindex(n_batch, n_out, out_rows, out_cols):
            upstream = grad_output[n, f, row, col]

            # Top-left corner of the receptive field in the padded input
            top = row * step
            left = col * step

            for dr, dc, ch in taps:
                r = top + dr
                c = left + dc

                # d(out)/d(weight) is the input value under this tap
                d_filters[f, ch, dr, dc] += source[n, ch, r, c] * upstream

                # d(out)/d(input) is the weight at this tap
                d_source[n, ch, r, c] += filters[f, ch, dr, dc] * upstream

        # Bias gradient: total upstream gradient per output channel
        if d_bias is not None:
            for f in range(n_out):
                d_bias[f] = grad_output[:, f, :, :].sum()

        # Drop the padding border so grad_input matches the original input
        d_input = d_source[:, :, pad:-pad, pad:-pad] if pad > 0 else d_source

        # Plain numpy arrays, ordered (grad_input, grad_weight, grad_bias)
        return d_input, d_filters, d_bias


class Conv2d:
    """
    2D Convolution layer for spatial feature extraction.

    Implements convolution with explicit loops to demonstrate
    computational complexity and memory access patterns.

    Args:
        in_channels: Number of input channels
        out_channels: Number of output feature maps
        kernel_size: Size of convolution kernel (int or (height, width) pair)
        stride: Stride of convolution, applied to both spatial axes (default: 1)
        padding: Zero-padding added to each spatial side of the input (default: 0)
        bias: Whether to add learnable bias (default: True)
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True):
        """
        Store hyperparameters and create the learnable parameters.

        WEIGHT INITIALIZATION:
        - He init: std = sqrt(2 / (in_channels * kernel_h * kernel_w))
        - Weight shape: (out_channels, in_channels, kernel_h, kernel_w)
        - Bias (if enabled) starts at zero, one value per output channel
        """
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels

        # Accept a single int as shorthand for a square kernel
        if isinstance(kernel_size, int):
            self.kernel_size = (kernel_size, kernel_size)
        else:
            self.kernel_size = kernel_size

        self.stride = stride
        self.padding = padding

        # He initialization for ReLU networks
        kernel_h, kernel_w = self.kernel_size
        fan_in = in_channels * kernel_h * kernel_w
        std = np.sqrt(2.0 / fan_in)

        self.weight = Tensor(np.random.normal(0, std,
                           (out_channels, in_channels, kernel_h, kernel_w)),
                           requires_grad=True)

        self.bias = Tensor(np.zeros(out_channels), requires_grad=True) if bias else None

    def forward(self, x):
        """
        Convolve a batch of images with this layer's filters.

        Args:
            x: Tensor of shape (batch, in_channels, height, width)

        Returns:
            Tensor of shape (batch, out_channels, out_height, out_width) where
            out = (in + 2 * padding - kernel) // stride + 1 per spatial axis.
            The result requires grad when either x or the weight does, and in
            that case carries a Conv2dBackward as its _grad_fn.

        Raises:
            ValueError: if x is not 4D, if its channel count differs from
                in_channels, or if the kernel does not fit inside the padded
                input (which would give an empty or negative-sized output).

        EXAMPLE:
        >>> conv = Conv2d(3, 16, kernel_size=3, padding=1)
        >>> x = Tensor(np.random.randn(2, 3, 32, 32))  # batch=2, RGB, 32x32
        >>> out = conv(x)
        >>> print(out.shape)  # Should be (2, 16, 32, 32)
        """
        # Input validation and shape extraction
        if len(x.shape) != 4:
            raise ValueError(f"Expected 4D input (batch, channels, height, width), got {x.shape}")

        batch_size, in_channels, in_height, in_width = x.shape

        # Without this check, too few channels would silently convolve with
        # only part of each filter and too many would fail with an IndexError
        # deep inside the loops.
        if in_channels != self.in_channels:
            raise ValueError(
                f"Expected input with {self.in_channels} channels, got {in_channels} "
                f"(input shape {x.shape})"
            )

        out_channels = self.out_channels
        kernel_h, kernel_w = self.kernel_size

        # Calculate output dimensions
        out_height = (in_height + 2 * self.padding - kernel_h) // self.stride + 1
        out_width = (in_width + 2 * self.padding - kernel_w) // self.stride + 1

        # A non-positive size means the kernel is larger than the padded input
        if out_height <= 0 or out_width <= 0:
            raise ValueError(
                f"Kernel size {self.kernel_size} does not fit input of spatial size "
                f"({in_height}, {in_width}) with padding={self.padding}"
            )

        # Apply padding if needed
        if self.padding > 0:
            padded_input = np.pad(x.data,
                                ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)),
                                mode='constant', constant_values=0)
        else:
            padded_input = x.data

        # Initialize output
        output = np.zeros((batch_size, out_channels, out_height, out_width))

        # Explicit nested-loop convolution to show complexity
        for b in range(batch_size):
            for out_ch in range(out_channels):
                for out_h in range(out_height):
                    for out_w in range(out_width):
                        # Top-left corner of the receptive field
                        in_h_start = out_h * self.stride
                        in_w_start = out_w * self.stride

                        # Accumulate input * weight over the whole window
                        conv_sum = 0.0
                        for k_h in range(kernel_h):
                            for k_w in range(kernel_w):
                                for in_ch in range(in_channels):
                                    input_val = padded_input[b, in_ch,
                                                           in_h_start + k_h,
                                                           in_w_start + k_w]
                                    weight_val = self.weight.data[out_ch, in_ch, k_h, k_w]
                                    conv_sum += input_val * weight_val

                        output[b, out_ch, out_h, out_w] = conv_sum

        # Add bias if present, broadcast across batch and spatial dimensions
        if self.bias is not None:
            for out_ch in range(out_channels):
                output[:, out_ch, :, :] += self.bias.data[out_ch]

        # The output takes part in autograd if the input or the weight does
        result = Tensor(output, requires_grad=(x.requires_grad or self.weight.requires_grad))

        # Attach backward function for gradient computation
        if result.requires_grad:
            result._grad_fn = Conv2dBackward(
                x, self.weight, self.bias,
                self.stride, self.padding, self.kernel_size,
                padded_input.shape
            )

        return result

    def parameters(self):
        """Return trainable parameters: the weight, plus the bias when present."""
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params

    def __call__(self, x):
        """Enable model(x) syntax."""
        return self.forward(x)

# %% ../../modules/09_spatial/spatial.ipynb 11
class MaxPool2dBackward(Function):
    """
    Gradient computation for 2D max pooling.

    Max pooling gradients flow only to the positions that were selected
    as the maximum in the forward pass. The maxima are re-derived here from
    the saved input rather than cached during the forward pass.
    """

    def __init__(self, x, output_shape, kernel_size, stride, padding):
        super().__init__(x)
        self.x = x
        self.output_shape = output_shape
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        # Not populated anywhere in this class; kept so the attribute
        # remains available to any external code that reads it.
        self.max_positions = {}

    def apply(self, grad_output):
        """
        Route gradients back to max positions.

        Args:
            grad_output: Gradient from next layer, indexed as
                        [batch, channel, out_h, out_w]

        Returns:
            One-element tuple holding the gradient w.r.t. the input.
        """
        batch_size, channels, _, _ = self.x.shape
        _, _, out_height, out_width = self.output_shape
        kernel_h, kernel_w = self.kernel_size

        # Rebuild the same -inf padded view the forward pass searched
        if self.padding > 0:
            padded_input = np.pad(self.x.data,
                                ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)),
                                mode='constant', constant_values=-np.inf)
            grad_input_padded = np.zeros_like(padded_input)
        else:
            padded_input = self.x.data
            grad_input_padded = np.zeros_like(self.x.data)

        # Route gradients to max positions
        for b in range(batch_size):
            for c in range(channels):
                for out_h in range(out_height):
                    for out_w in range(out_width):
                        in_h_start = out_h * self.stride
                        in_w_start = out_w * self.stride

                        # Start from the window's own first cell. Starting at
                        # the absolute (0, 0) would send the gradient to the
                        # corner of the whole tensor whenever no value in the
                        # window compares greater than -inf (all -inf or NaN).
                        max_val = -np.inf
                        max_h, max_w = in_h_start, in_w_start
                        for k_h in range(kernel_h):
                            for k_w in range(kernel_w):
                                in_h = in_h_start + k_h
                                in_w = in_w_start + k_w
                                val = padded_input[b, c, in_h, in_w]
                                # Strict '>' keeps the first maximum on ties
                                if val > max_val:
                                    max_val = val
                                    max_h, max_w = in_h, in_w

                        # '+=' because overlapping windows can pick the same cell
                        grad_input_padded[b, c, max_h, max_w] += grad_output[b, c, out_h, out_w]

        # Remove padding so the gradient matches the original input shape
        if self.padding > 0:
            grad_input = grad_input_padded[:, :,
                                          self.padding:-self.padding,
                                          self.padding:-self.padding]
        else:
            grad_input = grad_input_padded

        # Return as tuple (following Function protocol)
        return (grad_input,)


class MaxPool2d:
    """
    2D max pooling: keep the largest activation in each spatial window.

    Args:
        kernel_size: Pooling window size (int or (height, width) pair)
        stride: Step between windows on both spatial axes
                (default: the window height, kernel_size[0])
        padding: Width of the border added on each spatial side; the border
                 is filled with -inf so it can never be chosen as a maximum
                 (default: 0)
    """

    def __init__(self, kernel_size, stride=None, padding=0):
        """Record the window size, stride and padding; there are no weights."""
        super().__init__()

        # A bare int means a square window
        self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size

        # With no explicit stride the step equals the window height
        self.stride = self.kernel_size[0] if stride is None else stride

        self.padding = padding

    def forward(self, x):
        """
        Apply max pooling to a 4D batch.

        Args:
            x: Tensor of shape (batch, channels, height, width)

        Returns:
            Tensor of shape (batch, channels, out_height, out_width) with
            out = (in + 2 * padding - kernel) // stride + 1 per spatial axis.
            It requires grad exactly when x does, in which case a
            MaxPool2dBackward is attached as its _grad_fn.

        Raises:
            ValueError: if x is not 4D.

        EXAMPLE:
        >>> pool = MaxPool2d(kernel_size=2, stride=2)
        >>> x = Tensor(np.random.randn(1, 3, 8, 8))
        >>> out = pool(x)
        >>> print(out.shape)  # Should be (1, 3, 4, 4)
        """
        if len(x.shape) != 4:
            raise ValueError(f"Expected 4D input (batch, channels, height, width), got {x.shape}")

        n_batch, n_chan, rows, cols = x.shape
        win_h, win_w = self.kernel_size
        pad, step = self.padding, self.stride

        # Output grid size
        out_rows = (rows + 2 * pad - win_h) // step + 1
        out_cols = (cols + 2 * pad - win_w) // step + 1

        # Surround the image with -inf when padding is requested
        if pad > 0:
            border = (pad, pad)
            source = np.pad(x.data,
                            ((0, 0), (0, 0), border, border),
                            mode='constant', constant_values=-np.inf)
        else:
            source = x.data

        pooled = np.zeros((n_batch, n_chan, out_rows, out_cols))

        # Offsets inside one window, row-major, built once
        offsets = [(dr, dc) for dr in range(win_h) for dc in range(win_w)]

        # Visit every output cell and scan its window for the largest value
        for n, ch, r, c in np.ndindex(n_batch, n_chan, out_rows, out_cols):
            top = r * step
            left = c * step

            best = -np.inf
            for dr, dc in offsets:
                candidate = source[n, ch, top + dr, left + dc]
                if candidate > best:
                    best = candidate

            pooled[n, ch, r, c] = best

        # Wrap the result, carrying over the input's gradient-tracking flag
        out = Tensor(pooled, requires_grad=x.requires_grad)

        # Hook up the backward function only when gradients are wanted
        if out.requires_grad:
            out._grad_fn = MaxPool2dBackward(
                x, pooled.shape, self.kernel_size, self.stride, self.padding
            )

        return out

    def parameters(self):
        """Pooling has nothing to train, so this is always an empty list."""
        return []

    def __call__(self, x):
        """Allow layer(x) as shorthand for layer.forward(x)."""
        return self.forward(x)

# %% ../../modules/09_spatial/spatial.ipynb 13
class AvgPool2d:
    """
    2D average pooling: replace each spatial window by its mean.

    Args:
        kernel_size: Pooling window size (int or (height, width) pair)
        stride: Step between windows on both spatial axes
                (default: the window height, kernel_size[0])
        padding: Width of the zero border added on each spatial side
                 (default: 0); padded zeros count towards the average
    """

    def __init__(self, kernel_size, stride=None, padding=0):
        """Record the window size, stride and padding; there are no weights."""
        super().__init__()

        # A bare int means a square window
        self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size

        # With no explicit stride the step equals the window height
        self.stride = self.kernel_size[0] if stride is None else stride

        self.padding = padding

    def forward(self, x):
        """
        Apply average pooling to a 4D batch.

        Args:
            x: Tensor of shape (batch, channels, height, width)

        Returns:
            Tensor of shape (batch, channels, out_height, out_width) with
            out = (in + 2 * padding - kernel) // stride + 1 per spatial axis.

        Raises:
            ValueError: if x is not 4D.

        NOTE(review): the result is built as a plain Tensor(output) with no
        requires_grad flag and no _grad_fn, unlike MaxPool2d — confirm
        whether gradient support is intentionally missing here.
        """
        if len(x.shape) != 4:
            raise ValueError(f"Expected 4D input (batch, channels, height, width), got {x.shape}")

        n_batch, n_chan, rows, cols = x.shape
        win_h, win_w = self.kernel_size
        pad, step = self.padding, self.stride

        # Output grid size
        out_rows = (rows + 2 * pad - win_h) // step + 1
        out_cols = (cols + 2 * pad - win_w) // step + 1

        # Surround the image with zeros when padding is requested
        if pad > 0:
            border = (pad, pad)
            source = np.pad(x.data,
                            ((0, 0), (0, 0), border, border),
                            mode='constant', constant_values=0)
        else:
            source = x.data

        pooled = np.zeros((n_batch, n_chan, out_rows, out_cols))

        # Offsets inside one window, row-major, built once
        offsets = [(dr, dc) for dr in range(win_h) for dc in range(win_w)]

        # Visit every output cell, add up its window, divide by the full area
        for n, ch, r, c in np.ndindex(n_batch, n_chan, out_rows, out_cols):
            top = r * step
            left = c * step

            total = 0.0
            for dr, dc in offsets:
                total += source[n, ch, top + dr, left + dc]

            pooled[n, ch, r, c] = total / (win_h * win_w)

        return Tensor(pooled)

    def parameters(self):
        """Pooling has nothing to train, so this is always an empty list."""
        return []

    def __call__(self, x):
        """Allow layer(x) as shorthand for layer.forward(x)."""
        return self.forward(x)

# %% ../../modules/09_spatial/spatial.ipynb 21
class SimpleCNN:
    """
    Simple CNN demonstrating spatial operations integration.

    Architecture:
    - Conv2d(3→16, 3×3, padding=1) + ReLU + MaxPool(2×2)
    - Conv2d(16→32, 3×3, padding=1) + ReLU + MaxPool(2×2)
    - Flatten (no Linear classifier is part of this class yet, so forward()
      returns the flattened features)
    """

    def __init__(self, num_classes=10):
        """
        Build the two conv/pool stages and record the feature size.

        Args:
            num_classes: Number of target classes. Only stored; no layer in
                this class consumes it yet.

        For a 32×32 input the padding=1, 3×3 convolutions keep the spatial
        size and each 2×2 pool halves it: 32 → 16 → 8. The flattened feature
        size is therefore 32 channels × 8×8 = 2048.
        """
        super().__init__()

        # Convolutional stages
        self.conv1 = Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.pool1 = MaxPool2d(kernel_size=2, stride=2)

        self.conv2 = Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.pool2 = MaxPool2d(kernel_size=2, stride=2)

        # Feature count after pool2 for a 32×32 input: 32 channels × 8 × 8.
        # Reserved for the future Linear layer; not read anywhere in this class.
        self.flattened_size = 32 * 8 * 8

        self.num_classes = num_classes

    def forward(self, x):
        """
        Run both conv blocks and flatten the result.

        Args:
            x: Tensor of shape (batch, 3, height, width)

        Returns:
            Tensor of shape (batch, features) holding the flattened output of
            the second pooling layer.

        NOTE(review): relu() and the final flatten both wrap raw .data in a
        new Tensor without requires_grad or _grad_fn, so no backward function
        is attached at those steps — confirm whether this forward pass is
        meant to be inference-only.
        """
        # First conv block
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool1(x)

        # Second conv block
        x = self.conv2(x)
        x = self.relu(x)
        x = self.pool2(x)

        # Flatten everything except the batch axis
        batch_size = x.shape[0]
        x_flat = x.data.reshape(batch_size, -1)

        # In a complete implementation this would go through a Linear layer
        return Tensor(x_flat)

    def relu(self, x):
        """Return a new Tensor with max(0, value) applied element-wise to x.data."""
        return Tensor(np.maximum(0, x.data))

    def parameters(self):
        """Return the trainable parameters of conv1 followed by those of conv2."""
        params = []
        params.extend(self.conv1.parameters())
        params.extend(self.conv2.parameters())
        # Linear layer parameters would be added here
        return params

    def __call__(self, x):
        """Enable model(x) syntax."""
        return self.forward(x)