# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/03_activations/activations_dev.ipynb.

# %% auto 0
__all__ = ['ReLU', 'Sigmoid', 'Tanh', 'Softmax']

# %% ../../modules/source/03_activations/activations_dev.ipynb 1
import math
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
from typing import Union, List

# Import our Tensor class - try from package first, then from local module
try:
    from tinytorch.core.tensor import Tensor
except ImportError:
    # For development, import from local tensor module
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
    from tensor_dev import Tensor

# %% ../../modules/source/03_activations/activations_dev.ipynb 2
def _should_show_plots():
    """Check if we should show plots (disabled during testing)."""
    # Check multiple conditions that indicate we're in test mode
    is_pytest = (
        'pytest' in sys.modules or
        'test' in sys.argv or
        os.environ.get('PYTEST_CURRENT_TEST') is not None or
        any('test' in arg for arg in sys.argv) or
        any('pytest' in arg for arg in sys.argv)
    )

    # Show plots in development mode (when not in test mode)
    return not is_pytest

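# Illustrative sketch (added here; not part of the original notebook export):
# how a development cell might gate optional matplotlib output on
# _should_show_plots() so imports and test runs stay headless. The specific
# curves and labels below are assumptions chosen only to demonstrate the pattern.
if __name__ == "__main__" and _should_show_plots():
    _xs = np.linspace(-5.0, 5.0, 200)
    plt.plot(_xs, np.maximum(0, _xs), label="ReLU")
    plt.plot(_xs, 1 / (1 + np.exp(-_xs)), label="Sigmoid")
    plt.plot(_xs, np.tanh(_xs), label="Tanh")
    plt.legend()
    plt.title("Common activation functions")
    plt.show()
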
# %% ../../modules/source/03_activations/activations_dev.ipynb 7
class ReLU:
    """
    ReLU Activation Function: f(x) = max(0, x)

    The most popular activation function in deep learning.
    Simple, fast, and effective for most applications.
    """

    def forward(self, x):
        """
        Apply ReLU activation: f(x) = max(0, x)

        TODO: Implement ReLU activation function.

        STEP-BY-STEP IMPLEMENTATION:
        1. For each element in the input tensor, apply max(0, element)
        2. Use NumPy's maximum function for an efficient element-wise operation
        3. Return a new tensor of the same type with the results
        4. Preserve the input tensor's shape

        EXAMPLE USAGE:
        ```python
        relu = ReLU()
        input_tensor = Tensor([[-2, -1, 0, 1, 2]])
        output = relu(input_tensor)
        print(output.data)  # [[0, 0, 0, 1, 2]]
        ```

        IMPLEMENTATION HINTS:
        - Use np.maximum(0, x.data) for element-wise max with 0
        - Return the same type as input: return type(x)(result)
        - The shape should remain the same as the input
        - Don't modify the input tensor (immutable operations)

        LEARNING CONNECTIONS:
        - This is like torch.nn.ReLU() in PyTorch
        - Used in virtually every modern neural network
        - Enables deep networks by preventing vanishing gradients
        - Creates sparse representations (many zeros)
        """
        ### BEGIN SOLUTION
        result = np.maximum(0, x.data)
        return type(x)(result)
        ### END SOLUTION

    def __call__(self, x):
        """Make the class callable: relu(x) instead of relu.forward(x)"""
        return self.forward(x)

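# Sanity-check sketch (added for illustration; not exported by nbdev): mirrors
# the docstring example above. Assumes the Tensor class exposes its values as a
# NumPy array via `.data`, as the fallback tensor_dev.Tensor does.
if __name__ == "__main__":
    _relu = ReLU()
    _relu_out = _relu(Tensor([[-2, -1, 0, 1, 2]]))
    assert np.array_equal(_relu_out.data, np.array([[0, 0, 0, 1, 2]]))
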
# %% ../../modules/source/03_activations/activations_dev.ipynb 11
class Sigmoid:
    """
    Sigmoid Activation Function: f(x) = 1 / (1 + e^(-x))

    Maps any real number to the range (0, 1).
    Useful for binary classification and probability outputs.
    """

    def forward(self, x):
        """
        Apply Sigmoid activation: f(x) = 1 / (1 + e^(-x))

        TODO: Implement Sigmoid activation function.

        STEP-BY-STEP IMPLEMENTATION:
        1. Compute the negative of the input: -x.data
        2. Compute the exponential: np.exp(-x.data)
        3. Add 1 to the exponential: 1 + np.exp(-x.data)
        4. Take the reciprocal: 1 / (1 + np.exp(-x.data))
        5. Return as a new Tensor

        EXAMPLE USAGE:
        ```python
        sigmoid = Sigmoid()
        input_tensor = Tensor([[-2, -1, 0, 1, 2]])
        output = sigmoid(input_tensor)
        print(output.data)  # [[0.119, 0.269, 0.5, 0.731, 0.881]]
        ```

        IMPLEMENTATION HINTS:
        - Use np.exp() for the exponential function
        - Formula: 1 / (1 + np.exp(-x.data))
        - Handle potential overflow with np.clip(-x.data, -500, 500)
        - Return the same type as input: return type(x)(result)

        LEARNING CONNECTIONS:
        - This is like torch.nn.Sigmoid() in PyTorch
        - Used in binary classification output layers
        - Key component in LSTM and GRU gating mechanisms
        - Historically important for early neural networks
        """
        ### BEGIN SOLUTION
        # Clip the negated input to prevent overflow in np.exp
        clipped_input = np.clip(-x.data, -500, 500)
        result = 1 / (1 + np.exp(clipped_input))
        return type(x)(result)
        ### END SOLUTION

    def __call__(self, x):
        """Make the class callable: sigmoid(x) instead of sigmoid.forward(x)"""
        return self.forward(x)

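# Sanity-check sketch (added for illustration; not exported by nbdev): sigmoid(0)
# should be exactly 0.5, and extreme inputs should stay inside [0, 1] thanks to
# the clipping in forward(). Assumes Tensor exposes `.data` as a NumPy array.
if __name__ == "__main__":
    _sigmoid = Sigmoid()
    _sig_out = _sigmoid(Tensor([[-1000.0, 0.0, 1000.0]]))
    assert np.isclose(_sig_out.data[0, 1], 0.5)
    assert np.all(_sig_out.data >= 0.0) and np.all(_sig_out.data <= 1.0)
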
# %% ../../modules/source/03_activations/activations_dev.ipynb 15
class Tanh:
    """
    Tanh Activation Function: f(x) = (e^x - e^(-x)) / (e^x + e^(-x))

    Zero-centered activation function with range (-1, 1).
    Better gradient properties than sigmoid.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply Tanh activation: f(x) = (e^x - e^(-x)) / (e^x + e^(-x))

        TODO: Implement Tanh activation function.

        STEP-BY-STEP IMPLEMENTATION:
        1. Use NumPy's built-in tanh function: np.tanh(x.data)
        2. Alternatively, implement manually:
           - Compute e^x and e^(-x)
           - Calculate (e^x - e^(-x)) / (e^x + e^(-x))
        3. Return as a new Tensor

        EXAMPLE USAGE:
        ```python
        tanh = Tanh()
        input_tensor = Tensor([[-2, -1, 0, 1, 2]])
        output = tanh(input_tensor)
        print(output.data)  # [[-0.964, -0.762, 0, 0.762, 0.964]]
        ```

        IMPLEMENTATION HINTS:
        - Use np.tanh(x.data) for simplicity
        - Manual implementation: (np.exp(x.data) - np.exp(-x.data)) / (np.exp(x.data) + np.exp(-x.data))
        - In the manual version, handle overflow by clipping inputs: np.clip(x.data, -500, 500)
        - Return the same type as input: return type(x)(result)

        LEARNING CONNECTIONS:
        - This is like torch.nn.Tanh() in PyTorch
        - Used in RNN, LSTM, and GRU cells
        - Better than sigmoid for hidden layers
        - Zero-centered outputs help with gradient flow
        """
        ### BEGIN SOLUTION
        # Use NumPy's built-in tanh function (numerically stable)
        result = np.tanh(x.data)
        return type(x)(result)
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Make the class callable: tanh(x) instead of tanh.forward(x)"""
        return self.forward(x)

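# Sanity-check sketch (added for illustration; not exported by nbdev): tanh is an
# odd, zero-centered function, so f(0) == 0 and f(-x) == -f(x). Assumes Tensor
# exposes `.data` as a NumPy array.
if __name__ == "__main__":
    _tanh = Tanh()
    _tanh_out = _tanh(Tensor([[-2.0, 0.0, 2.0]]))
    assert np.isclose(_tanh_out.data[0, 1], 0.0)
    assert np.isclose(_tanh_out.data[0, 0], -_tanh_out.data[0, 2])
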
# %% ../../modules/source/03_activations/activations_dev.ipynb 19
class Softmax:
    """
    Softmax Activation Function: f(x_i) = e^(x_i) / Σ(e^(x_j))

    Converts a vector of real numbers into a probability distribution.
    Essential for multi-class classification.
    """

    def forward(self, x):
        """
        Apply Softmax activation: f(x_i) = e^(x_i) / Σ(e^(x_j))

        TODO: Implement Softmax activation function.

        STEP-BY-STEP IMPLEMENTATION:
        1. Handle the empty-input case
        2. Subtract the max value for numerical stability: x - max(x)
        3. Compute exponentials: np.exp(x - max(x))
        4. Compute the sum of exponentials: np.sum(exp_values)
        5. Divide each exponential by the sum: exp_values / sum
        6. Return as the same tensor type as the input

        EXAMPLE USAGE:
        ```python
        softmax = Softmax()
        input_tensor = Tensor([[1, 2, 3]])
        output = softmax(input_tensor)
        print(output.data)          # [[0.09, 0.24, 0.67]] (approximately)
        print(np.sum(output.data))  # 1.0
        ```

        IMPLEMENTATION HINTS:
        - Handle the empty case: if x.data.size == 0: return type(x)(x.data.copy())
        - Subtract the max for numerical stability: x_shifted = x.data - np.max(x.data, axis=-1, keepdims=True)
        - Compute exponentials: exp_values = np.exp(x_shifted)
        - Sum along the last axis: sum_exp = np.sum(exp_values, axis=-1, keepdims=True)
        - Divide: result = exp_values / sum_exp
        - Return the same type as input: return type(x)(result)

        LEARNING CONNECTIONS:
        - This is like torch.nn.Softmax() in PyTorch
        - Used in classification output layers
        - Key component in attention mechanisms
        - Enables probability-based decision making
        """
        ### BEGIN SOLUTION
        # Handle empty input
        if x.data.size == 0:
            return type(x)(x.data.copy())

        # Subtract max for numerical stability
        x_shifted = x.data - np.max(x.data, axis=-1, keepdims=True)

        # Compute exponentials
        exp_values = np.exp(x_shifted)

        # Sum along last axis
        sum_exp = np.sum(exp_values, axis=-1, keepdims=True)

        # Divide to get probabilities
        result = exp_values / sum_exp

        return type(x)(result)
        ### END SOLUTION

    def __call__(self, x):
        """Make the class callable: softmax(x) instead of softmax.forward(x)"""
        return self.forward(x)
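
# Sanity-check sketch (added for illustration; not exported by nbdev): each row of
# the softmax output should sum to 1, and large logits should not overflow thanks
# to the max-subtraction trick. Assumes Tensor exposes `.data` as a NumPy array.
if __name__ == "__main__":
    _softmax = Softmax()
    _soft_out = _softmax(Tensor([[1.0, 2.0, 3.0], [1000.0, 1000.0, 1000.0]]))
    assert np.allclose(np.sum(_soft_out.data, axis=-1), 1.0)
    assert np.all(np.isfinite(_soft_out.data))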