# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/03_layers/layers_dev.ipynb.

# %% auto 0
__all__ = ['Linear', 'Dropout']

# %% ../../modules/source/03_layers/layers_dev.ipynb 1
import numpy as np
import sys
import os

# Import dependencies from tinytorch package
from .tensor import Tensor
from .activations import ReLU, Sigmoid

# %% ../../modules/source/03_layers/layers_dev.ipynb 6
class Linear:
    """
    Linear (fully connected) layer: y = xW + b

    This is the fundamental building block of neural networks.
    Applies a linear transformation to incoming data.
    """

    def __init__(self, in_features, out_features, bias=True):
        """
        Initialize linear layer with proper weight initialization.

        TODO: Initialize weights and bias with Xavier initialization

        APPROACH:
        1. Create weight matrix (in_features, out_features) with Xavier scaling
        2. Create bias vector (out_features,) initialized to zeros if bias=True
        3. Set requires_grad=True for parameters (ready for Module 05)

        EXAMPLE:
        >>> layer = Linear(784, 10)  # MNIST classifier final layer
        >>> print(layer.weight.shape)
        (784, 10)
        >>> print(layer.bias.shape)
        (10,)

        HINTS:
        - Xavier init: scale = sqrt(1/in_features)
        - Use np.random.randn() for normal distribution
        - bias=None when bias=False
        """
        ### BEGIN SOLUTION
        self.in_features = in_features
        self.out_features = out_features

        # Xavier/Glorot initialization for stable gradients
        scale = np.sqrt(1.0 / in_features)
        weight_data = np.random.randn(in_features, out_features) * scale
        self.weight = Tensor(weight_data, requires_grad=True)

        # Initialize bias to zeros or None
        if bias:
            bias_data = np.zeros(out_features)
            self.bias = Tensor(bias_data, requires_grad=True)
        else:
            self.bias = None
        ### END SOLUTION

    def forward(self, x):
        """
        Forward pass through linear layer.

        TODO: Implement y = xW + b

        APPROACH:
        1. Matrix multiply input with weights: xW
        2. Add bias if it exists
        3. Return result as new Tensor

        EXAMPLE:
        >>> layer = Linear(3, 2)
        >>> x = Tensor([[1, 2, 3], [4, 5, 6]])  # 2 samples, 3 features
        >>> y = layer.forward(x)
        >>> print(y.shape)
        (2, 2)  # 2 samples, 2 outputs

        HINTS:
        - Use tensor.matmul() for matrix multiplication
        - Handle bias=None case
        - Broadcasting automatically handles bias addition
        """
        ### BEGIN SOLUTION
        # Linear transformation: y = xW
        output = x.matmul(self.weight)

        # Add bias if present
        if self.bias is not None:
            output = output + self.bias

        return output
        ### END SOLUTION

    def __call__(self, x):
        """Allows the layer to be called like a function."""
        return self.forward(x)

    def parameters(self):
        """
        Return list of trainable parameters.

        TODO: Return all tensors that need gradients

        APPROACH:
        1. Start with weight (always present)
        2. Add bias if it exists
        3. Return as list for optimizer
        """
        ### BEGIN SOLUTION
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params
        ### END SOLUTION

    def __repr__(self):
        """String representation for debugging."""
        bias_str = f", bias={self.bias is not None}"
        return f"Linear(in_features={self.in_features}, out_features={self.out_features}{bias_str})"

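# A minimal, NumPy-only sketch of the math documented in Linear above: Xavier-style
# scaling (sqrt(1/in_features)) and y = xW + b with a broadcast bias. Plain NumPy
# arrays stand in for Tensor here so the sketch is self-contained; the sizes and seed
# are illustrative, not part of the exported API. (Running this module directly still
# needs the package context because of the relative imports above.)
if __name__ == "__main__":
    rng = np.random.default_rng(0)

    in_features, out_features = 3, 2
    scale = np.sqrt(1.0 / in_features)                         # Xavier-style scale
    W = rng.standard_normal((in_features, out_features)) * scale
    b = np.zeros(out_features)

    x = np.array([[1.0, 2.0, 3.0],
                  [4.0, 5.0, 6.0]])                            # 2 samples, 3 features
    y = x @ W + b                                              # y = xW + b; bias broadcasts over rows
    print("output shape:", y.shape)                            # (2, 2): 2 samples, 2 outputs

    # The scaling keeps weight magnitudes near sqrt(1/in_features); visible with a larger matrix.
    big_W = rng.standard_normal((784, 10)) * np.sqrt(1.0 / 784)
    print("weight std:", round(float(big_W.std()), 4))         # close to sqrt(1/784) ~= 0.0357
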
# %% ../../modules/source/03_layers/layers_dev.ipynb 10
class Dropout:
    """
    Dropout layer for regularization.

    During training: randomly zeros each element with probability p and scales the
    surviving elements by 1/(1-p) so the expected value is preserved (inverted dropout).
    During inference: passes the input through unchanged.

    This prevents overfitting by forcing the network to not rely on specific neurons.
    """

    def __init__(self, p=0.5):
        """
        Initialize dropout layer.

        TODO: Store dropout probability

        Args:
            p: Probability of zeroing each element (0.0 = no dropout, 1.0 = zero everything)

        EXAMPLE:
        >>> dropout = Dropout(0.5)  # Zero 50% of elements during training
        """
        ### BEGIN SOLUTION
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"Dropout probability must be between 0 and 1, got {p}")
        self.p = p
        ### END SOLUTION

    def forward(self, x, training=True):
        """
        Forward pass through dropout layer.

        TODO: Apply dropout during training, pass through during inference

        APPROACH:
        1. If not training, return input unchanged
        2. If training, create a random mask that keeps each element with probability (1-p)
        3. Multiply input by mask and scale by 1/(1-p)
        4. Return result as new Tensor

        EXAMPLE:
        >>> dropout = Dropout(0.5)
        >>> x = Tensor([1, 2, 3, 4])
        >>> y_train = dropout.forward(x, training=True)   # Some elements zeroed
        >>> y_eval = dropout.forward(x, training=False)   # All elements preserved

        HINTS:
        - Use np.random.random() < keep_prob for the mask
        - Scale by 1/(1-p) to maintain the expected value
        - training=False should return the input unchanged
        """
        ### BEGIN SOLUTION
        if not training or self.p == 0.0:
            # During inference or with no dropout, pass through unchanged
            return x

        if self.p == 1.0:
            # Drop everything (preserve requires_grad for gradient flow)
            return Tensor(np.zeros_like(x.data), requires_grad=x.requires_grad if hasattr(x, 'requires_grad') else False)

        # During training, apply dropout
        keep_prob = 1.0 - self.p

        # Create random mask: True where we keep elements
        mask = np.random.random(x.data.shape) < keep_prob

        # Apply mask and scale using Tensor operations to preserve gradients!
        mask_tensor = Tensor(mask.astype(np.float32), requires_grad=False)  # Mask doesn't need gradients
        scale = Tensor(np.array(1.0 / keep_prob), requires_grad=False)

        # Use Tensor operations: x * mask * scale
        output = x * mask_tensor * scale
        return output
        ### END SOLUTION

    def __call__(self, x, training=True):
        """Allows the layer to be called like a function."""
        return self.forward(x, training)

    def parameters(self):
        """Dropout has no parameters."""
        return []

    def __repr__(self):
        return f"Dropout(p={self.p})"
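
# A minimal, NumPy-only sketch of the inverted-dropout arithmetic used in Dropout.forward:
# each element is kept with probability (1 - p) and the survivors are scaled by 1/(1 - p),
# so the expected value of the output matches the input, while inference leaves the input
# unchanged. Plain NumPy stands in for Tensor; the seed and sample size are illustrative.
if __name__ == "__main__":
    rng = np.random.default_rng(0)

    p = 0.5                                    # drop probability
    keep_prob = 1.0 - p
    x = np.ones(100_000)                       # large sample so the mean is stable

    mask = rng.random(x.shape) < keep_prob     # True where elements are kept
    y_train = x * mask / keep_prob             # training mode: mask, then rescale survivors
    y_eval = x                                 # inference mode: identity

    print("train mean:", round(float(y_train.mean()), 3))     # close to 1.0
    print("eval mean: ", round(float(y_eval.mean()), 3))      # exactly 1.0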