# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/03_layers/layers_dev.ipynb.

# %% auto 0
__all__ = ['matmul_naive', 'Dense']

# %% ../../modules/source/03_layers/layers_dev.ipynb 1
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
from typing import Union, List, Tuple, Optional

# Import our dependencies - try from package first, then local modules
try:
    from tinytorch.core.tensor import Tensor
    from tinytorch.core.activations import ReLU, Sigmoid, Tanh, Softmax
except ImportError:
    # For development, import from local modules
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '02_activations'))
    from tensor_dev import Tensor
    from activations_dev import ReLU, Sigmoid, Tanh, Softmax

# %% ../../modules/source/03_layers/layers_dev.ipynb 2
def _should_show_plots():
    """Check if we should show plots (disable during testing)"""
    # Check multiple conditions that indicate we're in test mode
    is_pytest = (
        'pytest' in sys.modules or
        'test' in sys.argv or
        os.environ.get('PYTEST_CURRENT_TEST') is not None or
        any('test' in arg for arg in sys.argv) or
        any('pytest' in arg for arg in sys.argv)
    )
    
    # Show plots in development mode (when not in test mode)
    return not is_pytest

# %% ../../modules/source/03_layers/layers_dev.ipynb 7
def matmul_naive(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """
    Naive matrix multiplication using explicit for-loops.
    
    This helps you understand what matrix multiplication really does!
    
    Args:
        A: Matrix of shape (m, n)
        B: Matrix of shape (n, p)
        
    Returns:
        Matrix of shape (m, p) where C[i,j] = sum(A[i,k] * B[k,j] for k in range(n))
        
    TODO: Implement matrix multiplication using three nested for-loops.
    
    APPROACH:
    1. Get the dimensions: m, n from A and n2, p from B
    2. Check that n == n2 (matrices must be compatible)
    3. Create output matrix C of shape (m, p) filled with zeros
    4. Use three nested loops:
       - i loop: rows of A (0 to m-1)
       - j loop: columns of B (0 to p-1) 
       - k loop: shared dimension (0 to n-1)
    5. For each (i,j), compute: C[i,j] += A[i,k] * B[k,j]
    
    EXAMPLE:
    A = [[1, 2],     B = [[5, 6],
         [3, 4]]          [7, 8]]
    
    C[0,0] = A[0,0]*B[0,0] + A[0,1]*B[1,0] = 1*5 + 2*7 = 19
    C[0,1] = A[0,0]*B[0,1] + A[0,1]*B[1,1] = 1*6 + 2*8 = 22
    C[1,0] = A[1,0]*B[0,0] + A[1,1]*B[1,0] = 3*5 + 4*7 = 43
    C[1,1] = A[1,0]*B[0,1] + A[1,1]*B[1,1] = 3*6 + 4*8 = 50
    
    HINTS:
    - Start with C = np.zeros((m, p))
    - Use three nested for loops: for i in range(m): for j in range(p): for k in range(n):
    - Accumulate the sum: C[i,j] += A[i,k] * B[k,j]
    """
    ### BEGIN SOLUTION
    # Get matrix dimensions
    m, n = A.shape
    n2, p = B.shape
    
    # Check compatibility
    if n != n2:
        raise ValueError(f"Incompatible matrix dimensions: A is {m}x{n}, B is {n2}x{p}")
    
    # Initialize result matrix
    C = np.zeros((m, p))
    
    # Triple nested loop for matrix multiplication
    for i in range(m):
        for j in range(p):
            for k in range(n):
                C[i, j] += A[i, k] * B[k, j]
    
    return C
    ### END SOLUTION

# %% ../../modules/source/03_layers/layers_dev.ipynb 11
class Dense:
    """
    Dense (Linear) Layer: y = Wx + b
    
    The fundamental building block of neural networks.
    Performs linear transformation: matrix multiplication + bias addition.
    """
    
    def __init__(self, input_size: int, output_size: int, use_bias: bool = True, 
                 use_naive_matmul: bool = False):
        """
        Initialize Dense layer with random weights.
        
        Args:
            input_size: Number of input features
            output_size: Number of output features
            use_bias: Whether to include bias term (default: True)
            use_naive_matmul: Whether to use naive matrix multiplication (for learning)
            
        TODO: Implement Dense layer initialization with proper weight initialization.
        
        APPROACH:
        1. Store layer parameters (input_size, output_size, use_bias, use_naive_matmul)
        2. Initialize weights with Xavier/Glorot initialization
        3. Initialize bias to zeros (if use_bias=True)
        4. Convert to float32 for consistency
        
        EXAMPLE:
        Dense(3, 2) creates:
        - weights: shape (3, 2) with small random values
        - bias: shape (2,) with zeros
        
        HINTS:
        - Use np.random.randn() for random initialization
        - Scale weights by sqrt(2/(input_size + output_size)) for Xavier init
        - Use np.zeros() for bias initialization
        - Convert to float32 with .astype(np.float32)
        """
        ### BEGIN SOLUTION
        # Store parameters
        self.input_size = input_size
        self.output_size = output_size
        self.use_bias = use_bias
        self.use_naive_matmul = use_naive_matmul
        
        # Xavier/Glorot initialization
        scale = np.sqrt(2.0 / (input_size + output_size))
        self.weights = np.random.randn(input_size, output_size).astype(np.float32) * scale
        
        # Initialize bias
        if use_bias:
            self.bias = np.zeros(output_size, dtype=np.float32)
        else:
            self.bias = None
        ### END SOLUTION
    
    def forward(self, x):
        """
        Forward pass: y = Wx + b
        
        Args:
            x: Input tensor of shape (batch_size, input_size)
            
        Returns:
            Output tensor of shape (batch_size, output_size)
            
        TODO: Implement matrix multiplication and bias addition.
        
        APPROACH:
        1. Choose matrix multiplication method based on use_naive_matmul flag
        2. Perform matrix multiplication: Wx
        3. Add bias if use_bias=True
        4. Return result wrapped in Tensor
        
        EXAMPLE:
        Input x: Tensor([[1, 2, 3]])  # shape (1, 3)
        Weights: shape (3, 2)
        Output: Tensor([[val1, val2]])  # shape (1, 2)
        
        HINTS:
        - Use self.use_naive_matmul to choose between matmul_naive and @
        - x.data gives you the numpy array
        - Use broadcasting for bias addition: result + self.bias
        - Return Tensor(result) to wrap the result
        """
        ### BEGIN SOLUTION
        # Matrix multiplication
        if self.use_naive_matmul:
            result = matmul_naive(x.data, self.weights)
        else:
            result = x.data @ self.weights
        
        # Add bias
        if self.use_bias:
            result += self.bias
        
        return type(x)(result)
        ### END SOLUTION
    
    def __call__(self, x):
        """Make layer callable: layer(x) same as layer.forward(x)"""
        return self.forward(x)