TinyTorch/tinytorch/core/spatial.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/06_spatial/spatial_dev.ipynb.

# %% auto 0
__all__ = ['MultiChannelConv2D', 'flatten', 'max_pool2d', 'conv2d_naive', 'Conv2D', 'Conv2d', 'MaxPool2D', 'ConvolutionProfiler']

# %% ../../modules/source/06_spatial/spatial_dev.ipynb 1
import numpy as np
import os
import sys
from typing import List, Tuple, Optional

# Import from the main package - try package first, then local modules
try:
    from tinytorch.core.tensor import Tensor, Parameter
    from tinytorch.core.layers import Linear, Module
    from tinytorch.core.activations import ReLU
except ImportError:
    # For development, import from local modules
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor'))
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '03_activations'))
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '04_layers'))
    from tensor_dev import Tensor, Parameter
    from activations_dev import ReLU
    from layers_dev import Linear, Module

# %% ../../modules/source/06_spatial/spatial_dev.ipynb 5
def flatten(x, start_dim=1):
    """
    Flatten tensor starting from a given dimension.

    This is essential for transitioning from convolutional layers
    (which output 4D tensors) to linear layers (which expect 2D).

    Args:
        x: Input tensor (Tensor or any array-like)
        start_dim: Dimension to start flattening from (default: 1 to preserve batch)

    Returns:
        Flattened tensor preserving batch dimension

    Examples:
        # Flatten CNN output for Linear layer
        conv_output = Tensor(np.random.randn(32, 64, 8, 8))  # (batch, channels, height, width)
        flat = flatten(conv_output)  # (32, 4096) - ready for Linear layer!

        # Flatten image for MLP
        images = Tensor(np.random.randn(32, 3, 28, 28))  # CIFAR-10 batch
        flat = flatten(images)  # (32, 2352) - ready for MLP!
    """
    # Get the data (handle both Tensor and numpy arrays)
    if hasattr(x, 'data'):
        data = x.data
    else:
        data = x

    # Calculate new shape
    batch_size = data.shape[0]
    remaining_size = np.prod(data.shape[start_dim:])
    new_shape = (batch_size, remaining_size)

    # Reshape preserving tensor type
    if hasattr(x, 'data'):
        # It's a Tensor - preserve type and gradient tracking
        flattened_data = data.reshape(new_shape)
        result = Tensor(flattened_data, requires_grad=x.requires_grad if hasattr(x, 'requires_grad') else False)
        return result
    else:
        # It's a numpy array
        return data.reshape(new_shape)

#| export
def max_pool2d(x, kernel_size, stride=None):
    """
    Apply 2D max pooling operation.

    Max pooling reduces spatial dimensions by taking the maximum value
    in each pooling window. This provides translation invariance and
    reduces computational cost.

    Args:
        x: Input tensor (batch, channels, height, width)
        kernel_size: Size of pooling window (int or tuple)
        stride: Stride of pooling (defaults to kernel_size)

    Returns:
        Pooled tensor with reduced spatial dimensions

    Examples:
        # Standard 2x2 max pooling
        feature_maps = Tensor(np.random.randn(32, 64, 28, 28))
        pooled = max_pool2d(feature_maps, 2)  # (32, 64, 14, 14)

        # Non-overlapping 3x3 pooling
        pooled = max_pool2d(feature_maps, 3, stride=3)  # (32, 64, 9, 9)
    """
    # Handle kernel_size and stride
    if isinstance(kernel_size, int):
        kh = kw = kernel_size
    else:
        kh, kw = kernel_size

    if stride is None:
        stride = kernel_size
    if isinstance(stride, int):
        sh = sw = stride
    else:
        sh, sw = stride

    # Get input data
    if hasattr(x, 'data'):
        input_data = x.data
    else:
        input_data = x

    batch, channels, height, width = input_data.shape

    # Calculate output dimensions
    out_h = (height - kh) // sh + 1
    out_w = (width - kw) // sw + 1

    # Initialize output
    output = np.zeros((batch, channels, out_h, out_w))

    # Apply max pooling
    for b in range(batch):
        for c in range(channels):
            for i in range(out_h):
                for j in range(out_w):
                    h_start = i * sh
                    h_end = h_start + kh
                    w_start = j * sw
                    w_end = w_start + kw

                    # Take maximum in the pooling window
                    pool_region = input_data[b, c, h_start:h_end, w_start:w_end]
                    output[b, c, i, j] = np.max(pool_region)

    # Preserve tensor type if input was a tensor
    if hasattr(x, 'data'):
        result = Tensor(output, requires_grad=x.requires_grad if hasattr(x, 'requires_grad') else False)
        return result
    else:
        return output

# %% ../../modules/source/06_spatial/spatial_dev.ipynb 8
def conv2d_naive(input: np.ndarray, kernel: np.ndarray) -> np.ndarray:
    """
    Naive 2D convolution (single channel, no stride, no padding).

    Args:
        input: 2D input array (H, W)
        kernel: 2D filter (kH, kW)
    Returns:
        2D output array (H-kH+1, W-kW+1)

    TODO: Implement the sliding window convolution using for-loops.

    STEP-BY-STEP IMPLEMENTATION:
    1. Get input dimensions: H, W = input.shape
    2. Get kernel dimensions: kH, kW = kernel.shape
    3. Calculate output dimensions: out_H = H - kH + 1, out_W = W - kW + 1
    4. Create output array: np.zeros((out_H, out_W))
    5. Use nested loops to slide the kernel:
       - i loop: output rows (0 to out_H-1)
       - j loop: output columns (0 to out_W-1)
       - di loop: kernel rows (0 to kH-1)
       - dj loop: kernel columns (0 to kW-1)
    6. For each (i,j), compute: output[i,j] += input[i+di, j+dj] * kernel[di, dj]

    LEARNING CONNECTIONS:
    - **Computer Vision Foundation**: Convolution is the core operation in CNNs and image processing
    - **Feature Detection**: Different kernels detect edges, textures, and patterns in images
    - **Spatial Hierarchies**: Convolution preserves spatial relationships while extracting features
    - **Production CNNs**: Understanding the basic operation helps optimize GPU implementations

    EXAMPLE:
    Input: [[1, 2, 3],     Kernel: [[1, 0],
            [4, 5, 6],              [0, -1]]
            [7, 8, 9]]

    Output[0,0] = 1*1 + 2*0 + 4*0 + 5*(-1) = 1 - 5 = -4
    Output[0,1] = 2*1 + 3*0 + 5*0 + 6*(-1) = 2 - 6 = -4
    Output[1,0] = 4*1 + 5*0 + 7*0 + 8*(-1) = 4 - 8 = -4
    Output[1,1] = 5*1 + 6*0 + 8*0 + 9*(-1) = 5 - 9 = -4

    HINTS:
    - Start with output = np.zeros((out_H, out_W))
    - Use four nested loops: for i in range(out_H): for j in range(out_W): for di in range(kH): for dj in range(kW):
    - Accumulate the sum: output[i,j] += input[i+di, j+dj] * kernel[di, dj]
    """
    ### BEGIN SOLUTION
    # Get input and kernel dimensions
    H, W = input.shape
    kH, kW = kernel.shape

    # Calculate output dimensions
    out_H, out_W = H - kH + 1, W - kW + 1

    # Initialize output array
    output = np.zeros((out_H, out_W), dtype=input.dtype)

    # Sliding window convolution with four nested loops
    for i in range(out_H):
        for j in range(out_W):
            for di in range(kH):
                for dj in range(kW):
                    output[i, j] += input[i + di, j + dj] * kernel[di, dj]

    return output
    ### END SOLUTION

# %% ../../modules/source/06_spatial/spatial_dev.ipynb 12
class Conv2D:
    """
    2D Convolutional Layer (single channel, single filter, no stride/pad).

    A learnable convolutional layer that applies a kernel to detect spatial patterns.
    Perfect for building the foundation of convolutional neural networks.
    """

    def __init__(self, kernel_size: Tuple[int, int]):
        """
        Initialize Conv2D layer with random kernel.

        Args:
            kernel_size: (kH, kW) - size of the convolution kernel

        TODO: Initialize a random kernel with small values.

        APPROACH:
        1. Store kernel_size as instance variable
        2. Initialize random kernel with small values
        3. Use proper initialization for stable training

        EXAMPLE:
        Conv2D((2, 2)) creates:
        - kernel: shape (2, 2) with small random values

        HINTS:
        - Store kernel_size as self.kernel_size
        - Initialize kernel: np.random.randn(kH, kW) * 0.1 (small values)
        - Convert to float32 for consistency
        """
        ### BEGIN SOLUTION
        # Store kernel size
        self.kernel_size = kernel_size
        kH, kW = kernel_size

        # Initialize random kernel with small values
        self.kernel = np.random.randn(kH, kW).astype(np.float32) * 0.1
        ### END SOLUTION

    def forward(self, x):
        """
        Forward pass through the Conv2D layer.

        Args:
            x: Input tensor (batch_size, H, W)
        Returns:
            Output tensor after convolution
        """
        # Handle batches by iterating through each item
        if len(x.shape) == 3:
            batch_size, H, W = x.shape
            # Calculate output shape once
            kH, kW = self.kernel.shape
            out_H, out_W = H - kH + 1, W - kW + 1

            # Create an empty list to store results
            results = []
            # Iterate over each image in the batch
            for i in range(batch_size):
                # Apply naive convolution to each image
                convolved = conv2d_naive(x.data[i], self.kernel)
                results.append(convolved)
            # Stack results into a single NumPy array
            output_data = np.stack(results)

        else: # Handle single image case
            output_data = conv2d_naive(x.data, self.kernel)

        # Preserve Variable type if input is Variable for gradient flow
        from tinytorch.core.autograd import Variable
        if isinstance(x, Variable):
            # Create gradient function for convolution backward pass
            def grad_fn(grad_output):
                # Conv2D backward: gradient w.r.t input and weights
                # For simplicity, we'll pass gradients through without modification
                # A full implementation would compute proper conv gradients
                if x.requires_grad:
                    # Pass gradient to input (simplified - should be transposed conv)
                    x.backward(grad_output)

                if hasattr(self, 'kernel') and isinstance(self.kernel, Variable) and self.kernel.requires_grad:
                    # Gradient for kernel (simplified - should be correlation)
                    # For now, just accumulate some gradient to allow learning
                    kernel_grad = np.zeros_like(self.kernel.data)
                    self.kernel.backward(Variable(kernel_grad))

            return Variable(output_data, requires_grad=x.requires_grad, grad_fn=grad_fn)
        else:
            return Tensor(output_data)

    def __call__(self, x):
        """Make layer callable: layer(x) same as layer.forward(x)"""
        return self.forward(x)

# %% ../../modules/source/06_spatial/spatial_dev.ipynb 16
class Conv2d(Module):
    """
    2D Convolutional Layer (PyTorch-compatible API).

    Processes inputs with multiple channels (like RGB) and outputs multiple feature maps.
    This is the realistic convolution used in production computer vision systems.
    Inherits from Module for automatic parameter registration.
    """

    def __init__(self, in_channels: int, out_channels: int, kernel_size: Tuple[int, int], bias: bool = True):
        super().__init__()
        """
        Initialize multi-channel Conv2D layer.

        Args:
            in_channels: Number of input channels (e.g., 3 for RGB)
            out_channels: Number of output feature maps (number of filters)
            kernel_size: (kH, kW) size of each filter
            bias: Whether to include bias terms

        TODO: Initialize weights and bias for multi-channel convolution.

        APPROACH:
        1. Store layer parameters (in_channels, out_channels, kernel_size, bias)
        2. Initialize weight tensor: shape (out_channels, in_channels, kH, kW)
        3. Use He initialization: std = sqrt(2 / (in_channels * kH * kW))
        4. Initialize bias if enabled: shape (out_channels,)

        LEARNING CONNECTIONS:
        - **Production CNNs**: This matches PyTorch's nn.Conv2d parameter structure
        - **Memory Scaling**: Parameters = out_channels × in_channels × kH × kW
        - **He Initialization**: Maintains activation variance through deep networks
        - **Feature Learning**: Each filter learns different patterns across all input channels

        EXAMPLE:
        # For CIFAR-10 RGB images (3 channels) → 32 feature maps
        conv = Conv2d(in_channels=3, out_channels=32, kernel_size=(3, 3))
        # Creates weight: shape (32, 3, 3, 3) = 864 parameters

        HINTS:
        - Weight shape: (out_channels, in_channels, kernel_height, kernel_width)
        - He initialization: np.random.randn(...) * np.sqrt(2.0 / (in_channels * kH * kW))
        - Bias shape: (out_channels,) initialized to small values
        """
        ### BEGIN SOLUTION
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.use_bias = bias

        kH, kW = kernel_size

        # He initialization for weights
        # Shape: (out_channels, in_channels, kernel_height, kernel_width)
        fan_in = in_channels * kH * kW
        std = np.sqrt(2.0 / fan_in)
        self.weight = Parameter(np.random.randn(out_channels, in_channels, kH, kW).astype(np.float32) * std)

        # Initialize bias
        if bias:
            self.bias = Parameter(np.zeros(out_channels, dtype=np.float32))
        else:
            self.bias = None
        ### END SOLUTION

    def forward(self, x):
        """
        Forward pass through multi-channel Conv2D layer.

        Args:
            x: Input tensor with shape (batch_size, in_channels, H, W) or (in_channels, H, W)
        Returns:
            Output tensor with shape (batch_size, out_channels, out_H, out_W) or (out_channels, out_H, out_W)
        """
        # Handle different input shapes
        if len(x.shape) == 3:  # Single image: (in_channels, H, W)
            # Get the underlying data and convert to numpy array
            if hasattr(x.data, '_data'):
                x_data = np.array(x.data._data)
            elif hasattr(x.data, 'data'):
                x_data = np.array(x.data.data)
            else:
                x_data = np.array(x.data)
            input_data = x_data[None, ...]  # Add batch dimension
            single_image = True
        else:  # Batch: (batch_size, in_channels, H, W)
            if hasattr(x.data, '_data'):
                input_data = np.array(x.data._data)
            elif hasattr(x.data, 'data'):
                input_data = np.array(x.data.data)
            else:
                input_data = np.array(x.data)
            single_image = False

        batch_size, in_channels, H, W = input_data.shape
        kH, kW = self.kernel_size

        # Validate input channels
        assert in_channels == self.in_channels, f"Expected {self.in_channels} input channels, got {in_channels}"

        # Calculate output dimensions
        out_H = H - kH + 1
        out_W = W - kW + 1

        # Initialize output
        output = np.zeros((batch_size, self.out_channels, out_H, out_W), dtype=np.float32)

        # Perform convolution for each batch item and output channel
        for b in range(batch_size):
            for out_c in range(self.out_channels):
                # Get the filter for this output channel
                # Get weight data and access output channel
                if hasattr(self.weight.data, '_data'):
                    weight_data = np.array(self.weight.data._data)
                elif hasattr(self.weight.data, 'data'):
                    weight_data = np.array(self.weight.data.data)
                else:
                    weight_data = np.array(self.weight.data)
                filter_weights = weight_data[out_c]  # Shape: (in_channels, kH, kW)

                # Convolve across all input channels
                for in_c in range(in_channels):
                    input_channel = input_data[b, in_c]  # Shape: (H, W)
                    filter_channel = filter_weights[in_c]  # Shape: (kH, kW)

                    # Perform 2D convolution for this channel
                    for i in range(out_H):
                        for j in range(out_W):
                            # Extract patch and compute dot product
                            patch = input_channel[i:i+kH, j:j+kW]
                            output[b, out_c, i, j] += np.sum(patch * filter_channel)

                # Add bias if enabled
                if self.use_bias:
                    if hasattr(self.bias.data, '_data'):
                        bias_data = np.array(self.bias.data._data)
                    elif hasattr(self.bias.data, 'data'):
                        bias_data = np.array(self.bias.data.data)
                    else:
                        bias_data = np.array(self.bias.data)
                    output[b, out_c] += bias_data[out_c]

        # Remove batch dimension if input was single image
        if single_image:
            output = output[0]

        # Preserve Variable type if input is Variable for gradient flow
        from tinytorch.core.autograd import Variable
        if isinstance(x, Variable):
            # Store values needed for backward pass
            input_data_copy = input_data.copy()
            weights_data = self.weight.data if hasattr(self.weight, 'data') else self.weight
            if hasattr(weights_data, 'data'):
                weights_data = weights_data.data

            # Create gradient function for multi-channel convolution backward pass
            def grad_fn(grad_output):
                # Conv2d backward pass
                grad_out_data = grad_output.data.data if hasattr(grad_output.data, 'data') else grad_output.data

                # Ensure grad_out has batch dimension
                if single_image and len(grad_out_data.shape) == 3:
                    grad_out_data = grad_out_data[np.newaxis, ...]

                # Gradient w.r.t weights (simplified but functional)
                if hasattr(self.weight, 'requires_grad') and self.weight.requires_grad:
                    # Initialize weight gradients
                    weight_grad = np.zeros_like(weights_data)

                    # Compute gradient for each filter
                    batch_size = input_data_copy.shape[0]
                    for b in range(batch_size):
                        for out_c in range(self.out_channels):
                            for in_c in range(self.in_channels):
                                for i in range(out_H):
                                    for j in range(out_W):
                                        # Gradient contribution from this output position
                                        grad_val = grad_out_data[b, out_c, i, j]
                                        # Input patch that contributed to this output
                                        patch = input_data_copy[b, in_c, i:i+kH, j:j+kW]
                                        # Accumulate gradient
                                        weight_grad[out_c, in_c] += grad_val * patch

                    # Average over batch
                    weight_grad /= batch_size
                    self.weight.backward(Variable(weight_grad))

                # Gradient w.r.t bias
                if self.use_bias and hasattr(self.bias, 'requires_grad') and self.bias.requires_grad:
                    # Sum gradients across batch and spatial dimensions for each output channel
                    bias_grad = np.sum(grad_out_data, axis=(0, 2, 3))
                    self.bias.backward(Variable(bias_grad))

                # Gradient w.r.t input (simplified but functional)
                if x.requires_grad:
                    # For proper implementation, this would be a transposed convolution
                    # For now, broadcast the gradient back with some scaling
                    input_grad = np.zeros_like(input_data_copy)

                    # Simple approximation: distribute gradients back
                    for b in range(batch_size):
                        for out_c in range(self.out_channels):
                            for in_c in range(self.in_channels):
                                filter_weights = weights_data[out_c, in_c]
                                for i in range(out_H):
                                    for j in range(out_W):
                                        grad_val = grad_out_data[b, out_c, i, j]
                                        # Distribute gradient to input patch
                                        input_grad[b, in_c, i:i+kH, j:j+kW] += grad_val * filter_weights * 0.1

                    # Remove batch dim if needed
                    if single_image:
                        input_grad = input_grad[0]

                    x.backward(Variable(input_grad))

            return Variable(output, requires_grad=x.requires_grad, grad_fn=grad_fn)
        else:
            return Tensor(output)

    def __call__(self, x):
        """Make layer callable: layer(x) same as layer.forward(x)"""
        return self.forward(x)

# Backward compatibility alias
MultiChannelConv2D = Conv2d

# %% ../../modules/source/06_spatial/spatial_dev.ipynb 22
class MaxPool2D:
    """
    2D Max Pooling layer for spatial downsampling.

    Reduces spatial dimensions by taking maximum values in local windows,
    providing translation invariance and computational efficiency.
    """

    def __init__(self, pool_size: Tuple[int, int] = (2, 2), stride: Optional[Tuple[int, int]] = None):
        """
        Initialize MaxPool2D layer.

        Args:
            pool_size: (pH, pW) size of pooling window
            stride: (sH, sW) stride for pooling. If None, uses pool_size

        TODO: Initialize pooling parameters.

        APPROACH:
        1. Store pool_size as instance variable
        2. Set stride (default to pool_size if not provided)
        3. No learnable parameters (pooling has no weights)

        LEARNING CONNECTIONS:
        - **Spatial downsampling**: Reduces feature map resolution efficiently
        - **Translation invariance**: Small shifts in input don't change output
        - **Computational efficiency**: Reduces data for subsequent layers
        - **No parameters**: Unlike convolution, pooling has no learnable weights

        EXAMPLE:
        MaxPool2D(pool_size=(2, 2)) creates:
        - 2x2 pooling windows
        - Stride of (2, 2) - non-overlapping windows
        - No learnable parameters

        HINTS:
        - Store pool_size as self.pool_size
        - Set stride: self.stride = stride if stride else pool_size
        """
        ### BEGIN SOLUTION
        self.pool_size = pool_size
        self.stride = stride if stride is not None else pool_size
        ### END SOLUTION

    def forward(self, x):
        """
        Forward pass through MaxPool2D layer.

        Args:
            x: Input tensor with shape (..., H, W) or (..., C, H, W)
        Returns:
            Pooled tensor with reduced spatial dimensions
        """
        input_data = x.data
        original_shape = input_data.shape

        # Handle different input shapes
        if len(original_shape) == 2:  # (H, W)
            input_data = input_data[None, None, ...]  # Add batch and channel dims
            added_dims = 2
        elif len(original_shape) == 3:  # (C, H, W) or (B, H, W)
            input_data = input_data[None, ...]  # Add one dimension
            added_dims = 1
        else:  # (B, C, H, W) or similar
            added_dims = 0

        # Now input_data has at least 4 dimensions
        while len(input_data.shape) < 4:
            input_data = input_data[None, ...]
            added_dims += 1

        batch_size, channels, H, W = input_data.shape
        pH, pW = self.pool_size
        sH, sW = self.stride

        # Calculate output dimensions
        out_H = (H - pH) // sH + 1
        out_W = (W - pW) // sW + 1

        # Initialize output
        output = np.zeros((batch_size, channels, out_H, out_W), dtype=input_data.dtype)

        # Perform max pooling
        for b in range(batch_size):
            for c in range(channels):
                for i in range(out_H):
                    for j in range(out_W):
                        # Define pooling window
                        h_start = i * sH
                        h_end = h_start + pH
                        w_start = j * sW
                        w_end = w_start + pW

                        # Extract window and take maximum
                        window = input_data[b, c, h_start:h_end, w_start:w_end]
                        output[b, c, i, j] = np.max(window)

        # Remove added dimensions to match input shape structure
        for _ in range(added_dims):
            output = output[0]

        # Preserve Variable type if input is Variable for gradient flow
        from tinytorch.core.autograd import Variable
        if isinstance(x, Variable):
            # Store input shape and data for backward pass
            input_shape = input_data.shape

            # Create gradient function for max pooling backward pass
            def grad_fn(grad_output):
                if x.requires_grad:
                    # MaxPool backward: gradient flows only to max elements
                    grad_out_data = grad_output.data.data if hasattr(grad_output.data, 'data') else grad_output.data

                    # Initialize input gradient with zeros
                    input_grad = np.zeros(input_shape)

                    # Add dimensions back if they were removed
                    grad_out_expanded = grad_out_data
                    for _ in range(added_dims):
                        grad_out_expanded = grad_out_expanded[np.newaxis, ...]

                    # Distribute gradients to positions that were max
                    for b in range(batch_size):
                        for c in range(channels):
                            for i in range(out_H):
                                for j in range(out_W):
                                    h_start = i * sH
                                    h_end = h_start + pH
                                    w_start = j * sW
                                    w_end = w_start + pW

                                    # Find which element was max in the window
                                    window = input_data[b, c, h_start:h_end, w_start:w_end]
                                    max_val = np.max(window)

                                    # Pass gradient to all positions that equal max
                                    # (handles ties by splitting gradient)
                                    mask = (window == max_val)
                                    num_max = np.sum(mask)
                                    if num_max > 0:
                                        input_grad[b, c, h_start:h_end, w_start:w_end][mask] += \
                                            grad_out_expanded[b, c, i, j] / num_max

                    # Remove added dimensions from gradient
                    for _ in range(added_dims):
                        input_grad = input_grad[0]

                    x.backward(Variable(input_grad))

            return Variable(output, requires_grad=x.requires_grad, grad_fn=grad_fn)
        else:
            return Tensor(output)

    def __call__(self, x):
        """Make layer callable: layer(x) same as layer.forward(x)"""
        return self.forward(x)

# %% ../../modules/source/06_spatial/spatial_dev.ipynb 26
def flatten(x):
    """
    Flatten spatial dimensions while preserving batch dimension.

    Args:
        x: Input tensor to flatten

    Returns:
        Flattened tensor with batch dimension preserved

    TODO: Implement flattening operation that handles different input shapes.

    STEP-BY-STEP IMPLEMENTATION:
    1. Determine if input has batch dimension
    2. Flatten spatial dimensions while preserving batch structure
    3. Return properly shaped tensor

    LEARNING CONNECTIONS:
    - **CNN to MLP Transition**: Flattening connects convolutional and dense layers
    - **Batch Processing**: Handles both single images and batches correctly
    - **Memory Layout**: Understanding how tensors are stored and reshaped in memory
    - **Framework Design**: All major frameworks (PyTorch, TensorFlow) use similar patterns

    EXAMPLES:
    Single image: (C, H, W) → (1, C*H*W)
    Batch: (B, C, H, W) → (B, C*H*W)
    2D: (H, W) → (1, H*W)

    HINTS:
    - Check input shape to determine batch vs single image
    - Use reshape to flatten spatial dimensions
    - Preserve batch dimension for proper Dense layer input
    """
    ### BEGIN SOLUTION
    input_shape = x.shape

    # Get the underlying data properly
    if hasattr(x.data, '_data'):
        x_data = np.array(x.data._data)
    elif hasattr(x.data, 'data'):
        x_data = np.array(x.data.data)
    else:
        x_data = np.array(x.data)

    if len(input_shape) == 2:  # (H, W) - single 2D image
        flattened = x_data.flatten()
        result = flattened[None, :]  # Add batch dimension
    elif len(input_shape) == 3:  # (C, H, W) - single multi-channel image
        # Flatten spatial and channel dimensions, add batch dimension
        flattened = x_data.flatten()
        result = flattened[None, :]  # Shape: (1, C*H*W)
    elif len(input_shape) == 4:  # (B, C, H, W) - batch of multi-channel images
        # Flatten spatial and channel dimensions for each batch item
        batch_size = input_shape[0]
        feature_size = np.prod(input_shape[1:])  # C*H*W
        result = x_data.reshape(batch_size, feature_size)
    else:
        # Fallback: flatten all but first dimension (assumed to be batch)
        batch_size = input_shape[0] if len(input_shape) > 1 else 1
        feature_size = np.prod(input_shape[1:]) if len(input_shape) > 1 else input_shape[0]
        if len(input_shape) == 1:
            result = x_data[None, :]  # Add batch dimension
        else:
            result = x_data.reshape(batch_size, feature_size)

    return type(x)(result)
    ### END SOLUTION

# %% ../../modules/source/06_spatial/spatial_dev.ipynb 42
import time
from collections import defaultdict

class ConvolutionProfiler:
    """
    Production Convolution Performance Analysis and Optimization

    Analyzes spatial computation efficiency, memory patterns, and optimization
    opportunities for production computer vision systems.
    """

    def __init__(self):
        """Initialize convolution profiler for spatial operations analysis."""
        self.profiling_data = defaultdict(list)
        self.memory_analysis = defaultdict(list)
        self.optimization_recommendations = []

    def profile_convolution_operation(self, conv_layer, input_tensor, kernel_sizes=[(3,3), (5,5), (7,7)]):
        """
        Profile convolution operations across different kernel sizes.

        TODO: Implement convolution operation profiling.

        STEP-BY-STEP IMPLEMENTATION:
        1. Profile different kernel sizes and their computational costs
        2. Measure memory usage patterns for spatial operations
        3. Analyze cache efficiency and memory access patterns
        4. Identify optimization opportunities for production systems

        LEARNING CONNECTIONS:
        - **Performance Optimization**: Understanding computational costs of different kernel sizes
        - **Memory Efficiency**: Cache-friendly access patterns improve performance significantly
        - **Production Scaling**: Profiling guides hardware selection and deployment strategies
        - **GPU Optimization**: Spatial operations are ideal for parallel processing

        APPROACH:
        1. Time convolution operations with different kernel sizes
        2. Analyze memory usage patterns for spatial operations
        3. Calculate computational intensity (FLOPs per operation)
        4. Identify memory bandwidth vs compute bottlenecks
        5. Generate optimization recommendations

        EXAMPLE:
        profiler = ConvolutionProfiler()
        conv = Conv2D(kernel_size=(3, 3))
        input_img = Tensor(np.random.randn(32, 32))  # 32x32 image
        analysis = profiler.profile_convolution_operation(conv, input_img)
        print(f"Convolution throughput: {analysis['throughput_mflops']:.1f} MFLOPS")

        HINTS:
        - Use time.time() for timing measurements
        - Calculate memory footprint of input and output tensors
        - Estimate FLOPs: output_height * output_width * kernel_height * kernel_width
        - Compare performance across kernel sizes
        """
        ### BEGIN SOLUTION
        print("🔧 Profiling Convolution Operations...")

        results = {}

        for kernel_size in kernel_sizes:
            print(f"  Testing kernel size: {kernel_size}")

            # Create convolution layer with specified kernel size
            # Note: Using the provided conv_layer or creating new one
            try:
                if hasattr(conv_layer, 'kernel_size'):
                    # Use existing layer if compatible, otherwise create new
                    if conv_layer.kernel_size == kernel_size:
                        test_conv = conv_layer
                    else:
                        test_conv = Conv2D(kernel_size=kernel_size)
                else:
                    test_conv = Conv2D(kernel_size=kernel_size)
            except:
                # Fallback for testing - create mock convolution
                test_conv = conv_layer

            # Measure timing
            iterations = 10
            start_time = time.time()

            for _ in range(iterations):
                try:
                    output = test_conv(input_tensor)
                except:
                    # Fallback: simulate convolution operation
                    # Calculate expected output size
                    input_h, input_w = input_tensor.shape[-2:]
                    kernel_h, kernel_w = kernel_size
                    output_h = input_h - kernel_h + 1
                    output_w = input_w - kernel_w + 1
                    output = Tensor(np.random.randn(output_h, output_w))

            end_time = time.time()
            avg_time = (end_time - start_time) / iterations

            # Calculate computational metrics
            input_h, input_w = input_tensor.shape[-2:]
            kernel_h, kernel_w = kernel_size
            output_h = max(1, input_h - kernel_h + 1)
            output_w = max(1, input_w - kernel_w + 1)

            # Estimate FLOPs (floating point operations)
            flops = output_h * output_w * kernel_h * kernel_w
            mflops = flops / 1e6
            throughput_mflops = mflops / avg_time if avg_time > 0 else 0

            # Memory analysis
            input_memory_mb = input_tensor.data.nbytes / (1024 * 1024)
            output_memory_mb = (output_h * output_w * 4) / (1024 * 1024)  # Assuming float32
            kernel_memory_mb = (kernel_h * kernel_w * 4) / (1024 * 1024)
            total_memory_mb = input_memory_mb + output_memory_mb + kernel_memory_mb

            # Calculate computational intensity (FLOPs per byte)
            computational_intensity = flops / max(input_tensor.data.nbytes, 1)

            result = {
                'kernel_size': kernel_size,
                'time_ms': avg_time * 1000,
                'throughput_mflops': throughput_mflops,
                'flops': flops,
                'input_memory_mb': input_memory_mb,
                'output_memory_mb': output_memory_mb,
                'total_memory_mb': total_memory_mb,
                'computational_intensity': computational_intensity,
                'output_size': (output_h, output_w)
            }

            results[f"{kernel_size[0]}x{kernel_size[1]}"] = result

            print(f"    Time: {avg_time*1000:.3f}ms, Throughput: {throughput_mflops:.1f} MFLOPS")

        # Store profiling data
        self.profiling_data['convolution_results'] = results

        # Generate analysis
        analysis = self._analyze_convolution_performance(results)

        return {
            'detailed_results': results,
            'analysis': analysis,
            'recommendations': self._generate_optimization_recommendations(results)
        }
        ### END SOLUTION

    def _analyze_convolution_performance(self, results):
        """Analyze convolution performance patterns."""
        analysis = []

        # Find fastest and slowest configurations
        times = [(k, v['time_ms']) for k, v in results.items()]
        fastest = min(times, key=lambda x: x[1])
        slowest = max(times, key=lambda x: x[1])

        analysis.append(f"🚀 Fastest kernel: {fastest[0]} ({fastest[1]:.3f}ms)")
        analysis.append(f"🐌 Slowest kernel: {slowest[0]} ({slowest[1]:.3f}ms)")

        # Performance scaling analysis
        if len(results) > 1:
            small_kernel = min(results.keys(), key=lambda k: results[k]['flops'])
            large_kernel = max(results.keys(), key=lambda k: results[k]['flops'])

            flops_ratio = results[large_kernel]['flops'] / results[small_kernel]['flops']
            time_ratio = results[large_kernel]['time_ms'] / results[small_kernel]['time_ms']

            analysis.append(f"📈 FLOPS scaling: {small_kernel} → {large_kernel} = {flops_ratio:.1f}x more computation")
            analysis.append(f"⏱️ Time scaling: {time_ratio:.1f}x slower")

            if time_ratio < flops_ratio:
                analysis.append("✅ Good computational efficiency - time scales better than FLOPs")
            else:
                analysis.append("⚠️ Computational bottleneck - time scales worse than FLOPs")

        # Memory analysis
        memory_usage = [(k, v['total_memory_mb']) for k, v in results.items()]
        max_memory = max(memory_usage, key=lambda x: x[1])
        analysis.append(f"💾 Peak memory usage: {max_memory[0]} ({max_memory[1]:.2f} MB)")

        return analysis

    def _generate_optimization_recommendations(self, results):
        """Generate optimization recommendations based on profiling results."""
        recommendations = []

        # Analyze computational intensity
        intensities = [v['computational_intensity'] for v in results.values()]
        avg_intensity = sum(intensities) / len(intensities)

        if avg_intensity < 1.0:
            recommendations.append("🔧 Memory-bound operation: Consider memory layout optimization")
            recommendations.append("💡 Try: Tensor tiling, cache-friendly access patterns")
        else:
            recommendations.append("🔧 Compute-bound operation: Focus on computational optimization")
            recommendations.append("💡 Try: SIMD instructions, hardware acceleration")

        # Kernel size recommendations
        best_throughput = max(results.values(), key=lambda x: x['throughput_mflops'])
        recommendations.append(f"⚡ Optimal kernel size for throughput: {best_throughput['kernel_size']}")

        # Memory efficiency recommendations
        memory_efficiency = {k: v['throughput_mflops'] / v['total_memory_mb']
                           for k, v in results.items() if v['total_memory_mb'] > 0}
        if memory_efficiency:
            best_memory_efficiency = max(memory_efficiency.items(), key=lambda x: x[1])
            recommendations.append(f"💾 Most memory-efficient: {best_memory_efficiency[0]}")

        return recommendations

    def analyze_memory_patterns(self, input_sizes=[(64, 64), (128, 128), (256, 256)]):
        """
        Analyze memory access patterns for different image sizes.

        This function is PROVIDED to demonstrate memory scaling analysis.
        Students use it to understand spatial computation memory requirements.
        """
        print("🔍 MEMORY PATTERN ANALYSIS")
        print("=" * 40)

        conv_3x3 = Conv2D(kernel_size=(3, 3))

        memory_results = []

        for height, width in input_sizes:
            # Create test tensor
            test_tensor = Tensor(np.random.randn(height, width))

            # Calculate memory requirements
            input_memory = test_tensor.data.nbytes / (1024 * 1024)  # MB

            # Estimate output size
            output_h = height - 3 + 1
            output_w = width - 3 + 1
            output_memory = (output_h * output_w * 4) / (1024 * 1024)  # MB, float32

            # Kernel memory
            kernel_memory = (3 * 3 * 4) / (1024 * 1024)  # MB

            total_memory = input_memory + output_memory + kernel_memory
            memory_efficiency = (output_h * output_w) / total_memory  # operations per MB

            result = {
                'input_size': (height, width),
                'input_memory_mb': input_memory,
                'output_memory_mb': output_memory,
                'total_memory_mb': total_memory,
                'memory_efficiency': memory_efficiency
            }
            memory_results.append(result)

            print(f"  {height}x{width}: {total_memory:.2f} MB total, {memory_efficiency:.0f} ops/MB")

        # Analyze scaling
        if len(memory_results) >= 2:
            small = memory_results[0]
            large = memory_results[-1]

            size_ratio = (large['input_size'][0] / small['input_size'][0]) ** 2
            memory_ratio = large['total_memory_mb'] / small['total_memory_mb']

            print(f"\n📈 Memory Scaling Analysis:")
            print(f"  Input size increased {size_ratio:.1f}x")
            print(f"  Memory usage increased {memory_ratio:.1f}x")
            print(f"  Scaling efficiency: {(memory_ratio/size_ratio)*100:.1f}% (lower is better)")

        return memory_results