# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/04_layers/layers_dev.ipynb.

# %% auto 0
__all__ = ['Dense', 'Module', 'matmul', 'Linear']

# %% ../../modules/source/04_layers/layers_dev.ipynb 1
import numpy as np
import sys
import os
from typing import Union, Tuple, Optional, Any

# Import our building blocks - try package first, then local modules
try:
    from tinytorch.core.tensor import Tensor, Parameter
except ImportError:
    # For development, import from local modules
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_tensor'))
    from tensor_dev import Tensor, Parameter


# %% ../../modules/source/04_layers/layers_dev.ipynb 4
class Module:
    """
    Base class for all neural network modules.

    Provides automatic parameter collection, forward pass management,
    and clean composition patterns. All layers (Dense, Conv2d, etc.)
    inherit from this class.

    Key Features:
    - Automatic parameter registration when you assign Tensors with requires_grad=True
    - Recursive parameter collection from sub-modules
    - Clean __call__ interface: model(x) instead of model.forward(x)
    - Extensible for custom layers

    Example Usage:
        class MLP(Module):
            def __init__(self):
                super().__init__()
                self.layer1 = Dense(784, 128)  # Auto-registered!
                self.layer2 = Dense(128, 10)   # Auto-registered!

            def forward(self, x):
                x = self.layer1(x)
                return self.layer2(x)

        model = MLP()
        params = model.parameters()  # Gets all parameters automatically!
        output = model(input)        # Clean interface!
    """

    def __init__(self):
        """Initialize module with empty parameter and sub-module storage."""
        self._parameters = []
        self._modules = []

    @staticmethod
    def _discard_by_identity(registry, obj):
        """Remove the first entry of ``registry`` that *is* ``obj``, if any.

        Identity is used instead of ``list.remove`` because the registered
        objects may overload ``==`` (NOTE(review): tensor-like objects often
        do - confirm against Tensor), which would make equality-based
        removal unreliable.
        """
        for index, item in enumerate(registry):
            if item is obj:
                del registry[index]
                return

    def __setattr__(self, name, value):
        """
        Intercept attribute assignment to auto-register parameters and modules.

        When you do self.weight = Parameter(...), this automatically adds
        the parameter to our collection for easy optimization.

        Re-assigning an attribute first unregisters the value it previously
        held, so parameters() never returns a stale or duplicated entry for
        an attribute that was overwritten.
        """
        state = self.__dict__

        # Unregister whatever this attribute held before, otherwise the old
        # parameter/sub-module would stay in the registry forever.
        previous = state.get(name)
        if previous is not None:
            for registry_name in ('_parameters', '_modules'):
                registry = state.get(registry_name)
                if registry is not None:
                    self._discard_by_identity(registry, previous)

        # Check if it's a tensor that needs gradients (a parameter)
        if hasattr(value, 'requires_grad') and value.requires_grad:
            self._parameters.append(value)
        # Check if it's another Module (sub-module)
        elif isinstance(value, Module):
            self._modules.append(value)

        # Always call parent to actually set the attribute
        super().__setattr__(name, value)

    def parameters(self):
        """
        Recursively collect all parameters from this module and sub-modules.

        Returns:
            List of all parameters (Tensors with requires_grad=True)

        This enables: optimizer = Adam(model.parameters())
        """
        # Start with our own parameters (copy so callers cannot mutate the registry)
        params = list(self._parameters)

        # Add parameters from sub-modules recursively
        for module in self._modules:
            params.extend(module.parameters())

        return params

    def __call__(self, *args, **kwargs):
        """
        Makes modules callable: model(x) instead of model.forward(x).

        This is the magic that enables clean syntax like:
            output = model(input)
        instead of:
            output = model.forward(input)
        """
        return self.forward(*args, **kwargs)

    def forward(self, *args, **kwargs):
        """
        Forward pass - must be implemented by subclasses.

        This is where the actual computation happens. Every layer
        defines its own forward() method.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError("Subclasses must implement forward()")


# %% ../../modules/source/04_layers/layers_dev.ipynb 7
def _load_variable_class():
    """Import and return the autograd ``Variable`` class.

    Imported lazily (not at module import time) to avoid circular imports
    between the layers and autograd modules. Tries the installed package
    first, then falls back to the local development module, adding its
    directory to ``sys.path`` at most once.
    """
    try:
        from tinytorch.core.autograd import Variable
    except ImportError:
        # Fallback for development
        dev_dir = os.path.join(os.path.dirname(__file__), '..', '06_autograd')
        if dev_dir not in sys.path:
            sys.path.append(dev_dir)
        from autograd_dev import Variable
    return Variable


def _unwrap_array(value):
    """Return the numpy array stored inside a Tensor or Variable.

    Tensor.data is a numpy array directly; Variable.data is a Tensor, so one
    more ``.data`` hop is needed. Checking the type (rather than assuming the
    nesting depth) avoids descending into ``ndarray.data``, which is a raw
    memory buffer rather than an array.
    """
    data = value.data
    if not isinstance(data, np.ndarray) and hasattr(data, 'data'):
        data = data.data
    return np.asarray(data)


def matmul(a: Tensor, b: Tensor) -> Tensor:
    """
    Matrix multiplication for tensors using explicit loops.

    This implementation uses triple-nested loops for educational understanding
    of the fundamental operations. Module 15 will show the optimization progression
    from loops → blocking → vectorized operations.

    Args:
        a: Left tensor (2D, shape: m, k)
        b: Right tensor (2D, shape: k, n)

    Returns:
        Result tensor (shape: m, n). A Variable with a backward function is
        returned if either input is a Variable; otherwise a plain Tensor.
        The result dtype is the NumPy promotion of both input dtypes.

    Raises:
        ValueError: If either input is not 2D or the inner dimensions differ.

    TODO: Implement matrix multiplication using explicit loops.

    STEP-BY-STEP IMPLEMENTATION:
    1. Extract numpy arrays from both tensors using .data
    2. Check tensor shapes for compatibility
    3. Use triple-nested loops to show every operation
    4. Wrap result in a new Tensor and return

    LEARNING CONNECTIONS:
    - This is the core operation in Dense layers: output = input @ weights
    - Shows the fundamental computation before optimization
    - Module 15 will demonstrate the progression to high-performance implementations
    - Understanding loops helps appreciate vectorization and GPU parallelization

    EDUCATIONAL APPROACH:
    - Intentionally simple for understanding, not performance
    - Makes every multiply-add operation explicit
    - Sets up Module 15 to show optimization techniques

    EXAMPLE:
    ```python
    a = Tensor([[1, 2], [3, 4]])  # shape (2, 2)
    b = Tensor([[5, 6], [7, 8]])  # shape (2, 2)
    result = matmul(a, b)
    # result.data = [[19, 22], [43, 50]]
    ```

    IMPLEMENTATION HINTS:
    - Use explicit loops to show every operation
    - This is educational, not optimized for performance
    - Module 15 will show the progression to fast implementations
    """
    ### BEGIN SOLUTION
    # Check if we're dealing with Variables (autograd) or plain Tensors
    a_is_variable = hasattr(a, 'requires_grad') and hasattr(a, 'grad_fn')
    b_is_variable = hasattr(b, 'requires_grad') and hasattr(b, 'grad_fn')

    # Extract the underlying numpy arrays
    a_data = _unwrap_array(a)
    b_data = _unwrap_array(b)

    # Get dimensions and validate compatibility
    if len(a_data.shape) != 2 or len(b_data.shape) != 2:
        raise ValueError("matmul requires 2D tensors")

    m, k = a_data.shape
    k2, n = b_data.shape

    if k != k2:
        raise ValueError(f"Inner dimensions must match: {k} != {k2}")

    # Initialize result matrix with the promoted dtype of both operands so
    # that e.g. int @ float does not silently truncate every accumulation.
    result_data = np.zeros((m, n), dtype=np.result_type(a_data.dtype, b_data.dtype))

    # Triple nested loops - educational, shows every operation
    # This is intentionally simple to understand the fundamental computation
    # Module 15 will show the optimization journey:
    #   Step 1 (here): Educational loops - slow but clear
    #   Step 2: Loop blocking for cache efficiency
    #   Step 3: Vectorized operations with NumPy
    #   Step 4: GPU acceleration and BLAS libraries
    for i in range(m):                  # For each row in result
        for j in range(n):              # For each column in result
            for k_idx in range(k):      # Dot product: sum over inner dimension
                result_data[i, j] += a_data[i, k_idx] * b_data[k_idx, j]

    # Both inputs are Tensors, return Tensor (backward compatible)
    if not (a_is_variable or b_is_variable):
        return Tensor(result_data)

    # If any input is a Variable, return Variable with gradient tracking.
    # Resolved unconditionally: guarding a function-local import behind a
    # globals() check leaves the local name unbound when the guard is false.
    Variable = _load_variable_class()

    # Create gradient function for matrix multiplication
    def grad_fn(grad_output):
        # Matrix multiplication backward pass:
        # If C = A @ B, then:
        #   dA = grad_output @ B^T
        #   dB = A^T @ grad_output
        grad_data = _unwrap_array(grad_output)

        if a_is_variable and a.requires_grad:
            # Gradient w.r.t. A: grad_output @ B^T
            a.backward(Variable(grad_data @ b_data.T))

        if b_is_variable and b.requires_grad:
            # Gradient w.r.t. B: A^T @ grad_output
            b.backward(Variable(a_data.T @ grad_data))

    # Determine if result should require gradients
    requires_grad = (a_is_variable and a.requires_grad) or (b_is_variable and b.requires_grad)

    return Variable(result_data, requires_grad=requires_grad, grad_fn=grad_fn)
    ### END SOLUTION


# %% ../../modules/source/04_layers/layers_dev.ipynb 11
class Linear(Module):
    """
    Linear (Fully Connected) Layer implementation.

    Applies the transformation: output = input @ weights + bias

    Inherits from Module for automatic parameter management and clean API.
    This is PyTorch's nn.Linear equivalent with the same name for familiarity.

    Features:
    - Automatic parameter registration (weights and bias)
    - Clean call interface: layer(input) instead of layer.forward(input)
    - Works with optimizers via model.parameters()
    """

    def __init__(self, input_size: int, output_size: int, use_bias: bool = True):
        """
        Initialize Linear layer with random weights and optional bias.

        Args:
            input_size: Number of input features
            output_size: Number of output features
            use_bias: Whether to include bias term

        TODO: Implement Linear layer initialization.

        STEP-BY-STEP IMPLEMENTATION:
        1. Store input_size and output_size as instance variables
        2. Initialize weights as Tensor with shape (input_size, output_size)
        3. Use small random values: np.random.randn(...) * 0.1
        4. Initialize bias as Tensor with shape (output_size,) if use_bias is True
        5. Set bias to None if use_bias is False

        LEARNING CONNECTIONS:
        - Random initialization breaks symmetry, so units do not all learn the same thing
        - Weight shape (input_size, output_size) enables matrix multiplication
        - Bias allows shifting the output (like y-intercept in linear regression)
        - PyTorch uses more sophisticated initialization (Xavier, Kaiming)

        IMPLEMENTATION HINTS:
        - Use np.random.randn() for Gaussian random numbers
        - Scale by 0.1 to keep initial values small
        - Remember to wrap numpy arrays in Tensor()
        - Store use_bias flag for forward pass logic
        """
        ### BEGIN SOLUTION
        super().__init__()  # Initialize Module base class

        self.input_size = input_size
        self.output_size = output_size
        self.use_bias = use_bias

        # Initialize weights with small random values using Parameter
        # Shape: (input_size, output_size) for matrix multiplication
        weight_data = np.random.randn(input_size, output_size) * 0.1
        self.weights = Parameter(weight_data)  # Auto-registers for optimization!

        # Initialize bias if requested
        if use_bias:
            bias_data = np.random.randn(output_size) * 0.1
            self.bias = Parameter(bias_data)  # Auto-registers for optimization!
        else:
            self.bias = None
        ### END SOLUTION

    def forward(self, x: Union[Tensor, 'Variable']) -> Union[Tensor, 'Variable']:
        """
        Forward pass through the Linear layer.

        Args:
            x: Input tensor or Variable (shape: ..., input_size)

        Returns:
            Output tensor or Variable (shape: ..., output_size)
            Preserves Variable type for gradient tracking in training
        """
        ### BEGIN SOLUTION
        # Import Variable for gradient tracking (lazy, shared with matmul)
        Variable = _load_variable_class()

        # Ensure input supports autograd if it's a Variable
        if isinstance(x, Variable):
            input_var = x
        else:
            input_var = Variable(x, requires_grad=False)

        # Convert parameters to Variables to maintain gradient connections
        if isinstance(self.weights, Variable):
            weight_var = self.weights
        else:
            weight_var = Variable(self.weights, requires_grad=True)

        # Matrix multiplication using Variable.__matmul__ which calls matmul_vars
        output = input_var @ weight_var

        # Add bias if it exists
        if self.bias is not None:
            if isinstance(self.bias, Variable):
                bias_var = self.bias
            else:
                bias_var = Variable(self.bias, requires_grad=True)
            output = output + bias_var

        return output
        ### END SOLUTION


# Backward compatibility alias
#| export
Dense = Linear