# TinyTorch/tinytorch/core/layers.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/03_layers/layers_dev.ipynb.

# %% auto 0
# Public API of this module: the names exported by a star-import.
# 'Dense' and 'Linear' name the same class ('Dense' is a compatibility alias).
__all__ = ['Dense', 'Module', 'matmul', 'Linear', 'Sequential', 'Flatten', 'flatten']

# %% ../../modules/03_layers/layers_dev.ipynb 1
import numpy as np
import sys
import os
from typing import Union, Tuple, Optional, Any

# Import our building blocks - try package first, then local modules
try:
    from tinytorch.core.tensor import Tensor, Parameter
except ImportError:
    # For development, import from local modules
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor'))
    from tensor_dev import Tensor, Parameter

# %% ../../modules/03_layers/layers_dev.ipynb 4
class Module:
    """
    Base class for all neural network modules.

    Provides automatic parameter collection, forward pass management,
    and clean composition patterns. Layers inherit from this class.

    Key Features:
    - Automatic parameter registration when you assign an object whose
      ``requires_grad`` attribute is truthy
    - Automatic sub-module registration when you assign another Module
    - Re-assigning an attribute replaces its registration, so
      ``parameters()`` never returns objects the module no longer holds
    - Recursive parameter collection from sub-modules
    - Clean __call__ interface: model(x) instead of model.forward(x)

    Example Usage:
        class MLP(Module):
            def __init__(self):
                super().__init__()
                self.layer1 = Dense(784, 128)  # Auto-registered!
                self.layer2 = Dense(128, 10)   # Auto-registered!

            def forward(self, x):
                x = self.layer1(x)
                return self.layer2(x)

        model = MLP()
        params = model.parameters()  # Gets all parameters automatically!
        output = model(input)        # Clean interface!
    """

    def __init__(self):
        """Initialize module with empty parameter and sub-module storage."""
        # Both registries are plain lists, kept in assignment order.
        self._parameters = []
        self._modules = []

    def __setattr__(self, name, value):
        """
        Intercept attribute assignment to auto-register parameters and modules.

        When you do self.weight = Parameter(...), this automatically adds
        the parameter to our collection for easy optimization. If ``name``
        was already bound, the object it previously held is unregistered
        first (one entry, matched by identity), so rebinding an attribute
        does not leave a stale parameter or sub-module behind.

        Args:
            name: Attribute name being assigned.
            value: New attribute value.

        Raises:
            AttributeError: If a parameter or sub-module is assigned before
                ``Module.__init__`` has created the registries.
        """
        state = self.__dict__

        # Rebinding an existing attribute: drop the registration of the old
        # value. Identity (not ==) is used because tensor-like objects may
        # overload equality.
        if name in state:
            stale = state[name]
            for registry in (state.get('_parameters'), state.get('_modules')):
                if not isinstance(registry, list):
                    # Registries are missing when an attribute is set before
                    # __init__ ran; nothing can be registered yet.
                    continue
                position = next(
                    (i for i, item in enumerate(registry) if item is stale),
                    None,
                )
                if position is not None:
                    # Remove a single entry: each registering assignment
                    # added exactly one.
                    del registry[position]
                    break

        # Check if it's a tensor that needs gradients (a parameter)
        if hasattr(value, 'requires_grad') and value.requires_grad:
            self._parameters.append(value)
        # Check if it's another Module (sub-module)
        elif isinstance(value, Module):
            self._modules.append(value)

        # Always call parent to actually set the attribute
        super().__setattr__(name, value)

    def parameters(self):
        """
        Recursively collect all parameters from this module and sub-modules.

        Returns:
            A new list containing this module's own registered parameters
            followed by those of each registered sub-module, in
            registration order.

        This enables: optimizer = Adam(model.parameters())
        """
        # Copy so callers cannot mutate the internal registry
        params = list(self._parameters)

        # Add parameters from sub-modules recursively
        for module in self._modules:
            params.extend(module.parameters())

        return params

    def __call__(self, *args, **kwargs):
        """
        Makes modules callable: model(x) instead of model.forward(x).

        All positional and keyword arguments are passed to ``forward``
        unchanged and its return value is returned as-is.
        """
        return self.forward(*args, **kwargs)

    def forward(self, *args, **kwargs):
        """
        Forward pass - must be implemented by subclasses.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement forward()")

# %% ../../modules/03_layers/layers_dev.ipynb 7
def _swap_last_two_axes(array):
    """Transpose the trailing matrix axes; plain ``.T`` for arrays below 3-D."""
    if array.ndim < 3:
        return array.T
    # ``.T`` would reverse *every* axis of a batched array; only the matrix
    # axes must be swapped.
    return np.swapaxes(array, -1, -2)


def _sum_to_operand_shape(grad, operand_shape):
    """Sum out batch axes that matmul broadcasting added to or stretched in ``grad``."""
    extra_axes = np.ndim(grad) - len(operand_shape)
    if extra_axes < 0:
        # Gradient has fewer axes than the operand (1-D corner cases):
        # nothing was broadcast, leave it untouched.
        return grad
    if extra_axes:
        # Leading axes the operand never had.
        grad = grad.sum(axis=tuple(range(extra_axes)))
    # Batch axes (everything but the last two) where the operand had size 1
    # but was stretched to match the other operand.
    stretched = tuple(
        axis for axis in range(len(operand_shape) - 2)
        if operand_shape[axis] == 1 and grad.shape[axis] != 1
    )
    if stretched:
        grad = grad.sum(axis=stretched, keepdims=True)
    return grad


def matmul(a: Tensor, b: Tensor) -> Tensor:
    """
    Matrix multiplication for tensors.

    Args:
        a: Left tensor (shape: ..., m, k)
        b: Right tensor (shape: ..., k, n)

    Returns:
        Result (shape: ..., m, n). A plain Tensor when neither input is a
        Variable; otherwise a Variable carrying a gradient function.

    An input is treated as a Variable when it exposes both ``requires_grad``
    and ``grad_fn``; its numpy array is then read from ``.data.data``
    (Variable.data is a Tensor). Otherwise ``.data`` is used directly.

    BACKWARD PASS:
    If C = A @ B, then
        dA = grad_output @ B^T
        dB = A^T @ grad_output
    where ^T swaps only the last two axes. Batch axes introduced by
    broadcasting are summed so each gradient has its operand's shape.
    For inputs with at most two dimensions this is the same computation
    as using ``.T``.

    LEARNING CONNECTIONS:
    - This is the core operation in Dense layers: output = input @ weights
    - PyTorch uses optimized BLAS libraries for this operation
    - GPU implementations parallelize this across thousands of cores
    - Understanding this operation is key to neural network performance

    EXAMPLE:
    ```python
    a = Tensor([[1, 2], [3, 4]])  # shape (2, 2)
    b = Tensor([[5, 6], [7, 8]])  # shape (2, 2)
    result = matmul(a, b)
    # result.data = [[19, 22], [43, 50]]
    ```
    """
    ### BEGIN SOLUTION
    # Check if we're dealing with Variables (autograd) or plain Tensors
    a_is_variable = hasattr(a, 'requires_grad') and hasattr(a, 'grad_fn')
    b_is_variable = hasattr(b, 'requires_grad') and hasattr(b, 'grad_fn')

    # Extract numpy data appropriately
    a_data = a.data.data if a_is_variable else a.data
    b_data = b.data.data if b_is_variable else b.data

    # Perform matrix multiplication
    result_data = a_data @ b_data

    if not (a_is_variable or b_is_variable):
        # Both inputs are Tensors, return Tensor (backward compatible)
        return Tensor(result_data)

    # Import Variable locally to avoid circular imports. The import is
    # unconditional on purpose: an import statement inside a function makes
    # the name local, so skipping it behind a globals() check would leave
    # `Variable` unbound whenever the check failed.
    try:
        from tinytorch.core.autograd import Variable
    except ImportError:
        from autograd_dev import Variable

    def grad_fn(grad_output):
        """Propagate ``grad_output`` to whichever inputs require gradients."""
        if a_is_variable and a.requires_grad:
            # Gradient w.r.t. A: grad_output @ B^T
            grad_a_data = grad_output.data.data @ _swap_last_two_axes(b_data)
            grad_a_data = _sum_to_operand_shape(grad_a_data, np.shape(a_data))
            a.backward(Variable(grad_a_data))

        if b_is_variable and b.requires_grad:
            # Gradient w.r.t. B: A^T @ grad_output
            grad_b_data = _swap_last_two_axes(a_data) @ grad_output.data.data
            grad_b_data = _sum_to_operand_shape(grad_b_data, np.shape(b_data))
            b.backward(Variable(grad_b_data))

    # The result tracks gradients if any Variable input does
    requires_grad = (a_is_variable and a.requires_grad) or (b_is_variable and b.requires_grad)

    return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
    ### END SOLUTION

# %% ../../modules/03_layers/layers_dev.ipynb 11
class Linear(Module):
    """
    Linear (Fully Connected) Layer implementation.

    Applies the transformation: output = input @ weights + bias

    Inherits from Module for automatic parameter management and clean API.
    This is PyTorch's nn.Linear equivalent with the same name for familiarity.

    Features:
    - Automatic parameter registration (weights and bias)
    - Clean call interface: layer(input) instead of layer.forward(input)
    - Works with optimizers via model.parameters()
    """

    def __init__(self, input_size: int, output_size: int, use_bias: bool = True):
        """
        Initialize Linear layer with random weights and optional bias.

        Args:
            input_size: Number of input features
            output_size: Number of output features
            use_bias: Whether to include bias term

        Attributes set:
            input_size, output_size, use_bias: the constructor arguments.
            weights: Parameter of shape (input_size, output_size).
            bias: Parameter of shape (output_size,), or None when
                use_bias is False.

        LEARNING CONNECTIONS:
        - Small random initialization breaks symmetry between units
          (identical weights would receive identical updates)
        - Weight shape (input_size, output_size) enables matrix multiplication
        - Bias allows shifting the output (like y-intercept in linear regression)
        - PyTorch uses more sophisticated initialization (Xavier, Kaiming)
        """
        ### BEGIN SOLUTION
        super().__init__()  # Initialize Module base class

        self.input_size = input_size
        self.output_size = output_size
        self.use_bias = use_bias

        # Gaussian values scaled by 0.1 to keep initial weights small.
        # Shape: (input_size, output_size) so that `x @ weights` works.
        weight_data = np.random.randn(input_size, output_size) * 0.1
        self.weights = Parameter(weight_data)  # Auto-registers for optimization!

        # Initialize bias if requested
        if use_bias:
            bias_data = np.random.randn(output_size) * 0.1
            self.bias = Parameter(bias_data)  # Auto-registers for optimization!
        else:
            self.bias = None
        ### END SOLUTION

    def forward(self, x: Union[Tensor, 'Variable']) -> Union[Tensor, 'Variable']:
        """
        Forward pass through the Linear layer: output = input @ weights + bias.

        Args:
            x: Input tensor or Variable (shape: ..., input_size)

        Returns:
            Output tensor or Variable (shape: ..., output_size), i.e.
            whatever ``matmul`` returns, with the bias added via ``+``
            when a bias is present.

        AUTOGRAD CONSIDERATIONS:
        - If the matmul result exposes ``requires_grad`` but the bias does
          not, the bias is wrapped in a non-trainable Variable first so the
          addition happens between two Variables.
        - In every other case the bias is added directly with ``+``.

        LEARNING CONNECTIONS:
        - This is the core neural network transformation
        - Matrix multiplication scales input features to output features
        - Bias provides offset (like y-intercept in linear equations)
        """
        ### BEGIN SOLUTION
        # Matrix multiplication: input @ weights (autograd-aware)
        output = matmul(x, self.weights)

        if self.bias is None:
            return output

        if hasattr(output, 'requires_grad') and not hasattr(self.bias, 'requires_grad'):
            # Gradient-tracking output but plain-Tensor bias: promote the
            # bias to a Variable. The import is unconditional because an
            # import statement inside a function makes the name local;
            # guarding it with a globals() check would leave `Variable`
            # unbound whenever the guard skipped the import.
            try:
                from tinytorch.core.autograd import Variable
            except ImportError:
                from autograd_dev import Variable

            bias_var = Variable(self.bias.data, requires_grad=False)
            return output + bias_var

        # Either both sides are gradient-aware or the output is a plain
        # Tensor: direct addition.
        return output + self.bias
        ### END SOLUTION

# Backward compatibility alias: `Dense` is bound to the very same class object
# as `Linear`, so existing code that refers to `Dense` keeps working.
Dense = Linear

class Sequential(Module):
    """
    Sequential Network: Composes layers in sequence.

    The most fundamental network architecture that applies layers in order:
    f(x) = layer_n(...layer_2(layer_1(x)))

    Inherits from Module for automatic parameter collection from all sub-layers.
    This enables optimizers to find all parameters automatically.

    Example Usage:
        # Create a 3-layer MLP
        model = Sequential([
            Linear(784, 128),
            ReLU(),
            Linear(128, 64),
            ReLU(),
            Linear(64, 10)
        ])

        # Use the model
        output = model(input_data)  # Clean interface!
        params = model.parameters()  # All parameters from all layers!
    """

    def __init__(self, layers=None):
        """
        Initialize Sequential network with layers.

        Args:
            layers: Iterable of layers to compose in order (optional).
                The layers are copied into a new list, so the caller's
                container is never modified by ``add`` and later changes
                to it do not affect this network.
        """
        super().__init__()  # Initialize Module base class

        # Copy instead of aliasing: otherwise add() would append to the
        # caller's list, and layers the caller appended afterwards would run
        # in forward() without ever being registered.
        self.layers = list(layers) if layers is not None else []

        # Register all layers as sub-modules for parameter collection
        for i, layer in enumerate(self.layers):
            # Assigning through setattr triggers Module.__setattr__, which
            # registers the layer if it is a Module
            setattr(self, f'layer_{i}', layer)

    def forward(self, x):
        """
        Forward pass through all layers in sequence.

        Args:
            x: Input tensor

        Returns:
            Output after calling each layer, in order, on the result of the
            previous one. With no layers, ``x`` is returned unchanged.
        """
        for layer in self.layers:
            x = layer(x)
        return x

    def add(self, layer):
        """
        Add a layer to the end of the network.

        Args:
            layer: Layer to append; it is also assigned to the attribute
                ``layer_<index>`` so it is registered for parameter
                collection.
        """
        self.layers.append(layer)
        # Register the new layer under its position in the list
        setattr(self, f'layer_{len(self.layers)-1}', layer)

def flatten(x, start_dim=1):
    """
    Flatten tensor starting from a given dimension.

    All dimensions before ``start_dim`` are kept as they are; every
    dimension from ``start_dim`` onward is collapsed into a single trailing
    dimension. This is essential for transitioning from convolutional
    layers (which output 4D tensors) to linear layers (which expect 2D).

    Args:
        x: Input to flatten. One of:
            - a numpy array (returned as a numpy array),
            - an object with a ``.data`` attribute such as a Tensor
              (returned as a new instance of the same type, built by
              calling ``type(x)`` on the reshaped data),
            - any other array-like, e.g. nested lists (converted with
              ``np.asarray`` and returned as a numpy array).
        start_dim: First dimension to flatten (default: 1 to preserve
            batch). 0 flattens everything to 1-D. Negative values count
            from the end, following Python slicing rules.

    Returns:
        The reshaped data with shape ``shape[:start_dim] + (prod(shape[start_dim:]),)``.
        When ``start_dim`` is at or beyond the number of dimensions, a
        trailing dimension of size 1 is appended (e.g. (N,) -> (N, 1)).

    Examples:
        # Flatten CNN output for Linear layer
        conv_output = Tensor(np.random.randn(32, 64, 8, 8))  # (batch, channels, height, width)
        flat = flatten(conv_output)  # (32, 4096) - ready for Linear layer!

        # Flatten image for MLP
        images = Tensor(np.random.randn(32, 3, 28, 28))  # CIFAR-10 batch
        flat = flatten(images)  # (32, 2352) - ready for MLP!

        # Keep the first two dimensions
        flatten(np.zeros((2, 3, 4, 5)), start_dim=2).shape  # (2, 3, 20)
    """
    # numpy arrays must be recognised first: they also expose a `.data`
    # attribute (a raw memoryview without `reshape`), so a bare
    # hasattr(x, 'data') check would mistake them for Tensors.
    if isinstance(x, np.ndarray):
        wrapper, data = None, x
    elif hasattr(x, 'data'):
        # Tensor-like: remember the exact type so that, e.g., a Parameter
        # input produces a Parameter output.
        wrapper, data = type(x), x.data
    else:
        wrapper, data = None, np.asarray(x)

    # Keep every leading dimension, not just the first one, so that
    # start_dim > 1 yields a shape with the right number of elements.
    shape = tuple(data.shape)
    kept_dims = shape[:start_dim]
    # int(): np.prod of an empty tuple is the float 1.0, which reshape rejects.
    flattened_size = int(np.prod(shape[start_dim:]))
    new_shape = kept_dims + (flattened_size,)

    flattened_data = data.reshape(new_shape)
    if wrapper is None:
        return flattened_data
    return wrapper(flattened_data)

class Flatten(Module):
    """
    Layer wrapper around the module-level ``flatten`` function.

    Lets a flattening step sit inside a layer stack, typically between
    convolutional layers (4D outputs) and linear layers (2D inputs).

    Example Usage:
        # In a CNN architecture
        model = Sequential([
            Conv2D(3, 16, kernel_size=3),  # Output: (batch, 16, height, width)
            ReLU(),
            Flatten(),                     # Output: (batch, 16*height*width)
            Linear(16*height*width, 10)    # Now compatible!
        ])
    """

    def __init__(self, start_dim=1):
        """
        Create the layer and remember where flattening begins.

        Args:
            start_dim: Dimension to start flattening from, stored on the
                instance as ``self.start_dim`` (default: 1, which keeps the
                batch dimension).
        """
        super().__init__()  # Set up Module's parameter/sub-module registries
        self.start_dim = start_dim

    def forward(self, x):
        """
        Apply ``flatten`` to ``x`` using the configured ``start_dim``.

        Args:
            x: Input tensor

        Returns:
            Whatever ``flatten(x, start_dim=self.start_dim)`` returns.
        """
        first_flattened_dim = self.start_dim
        return flatten(x, start_dim=first_flattened_dim)