TinyTorch/tinytorch/core/activations.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/03_activations/activations_dev.ipynb.

# %% auto 0
__all__ = ['ReLU', 'Sigmoid', 'Tanh', 'Softmax', 'ActivationProfiler', 'benchmark_activation_suite']

# %% ../../modules/source/03_activations/activations_dev.ipynb 1
import math
import numpy as np
import os
import sys
from typing import Union, List

# Import our Tensor class - try from package first, then from local module
try:
    from tinytorch.core.tensor import Tensor
except ImportError:
    # For development, import from local tensor module
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
    from tensor_dev import Tensor

# %% ../../modules/source/03_activations/activations_dev.ipynb 7
class ReLU:
    """
    ReLU Activation Function: f(x) = max(0, x)

    The most popular activation function in deep learning.
    Simple, fast, and effective for most applications.
    """

    def forward(self, x):
        """
        Apply ReLU activation: f(x) = max(0, x)

        TODO: Implement ReLU activation function.

        STEP-BY-STEP IMPLEMENTATION:
        1. For each element in the input tensor, apply max(0, element)
        2. Use NumPy's maximum function for efficient element-wise operation
        3. Return a new tensor of the same type with the results
        4. Preserve the input tensor's shape

        EXAMPLE USAGE:
        ```python
        relu = ReLU()
        input_tensor = Tensor([[-2, -1, 0, 1, 2]])
        output = relu(input_tensor)
        print(output.data)  # [[0, 0, 0, 1, 2]]
        ```

        IMPLEMENTATION HINTS:
        - Use np.maximum(0, x.data) for element-wise max with 0
        - Return the same type as input: return type(x)(result)
        - The shape should remain the same as input
        - Do not modify the input tensor (immutable operations)

        LEARNING CONNECTIONS:
        - This is like torch.nn.ReLU() in PyTorch
        - Used in virtually every modern neural network
        - Enables deep networks by preventing vanishing gradients
        - Creates sparse representations (many zeros)
        """
        ### BEGIN SOLUTION
        # Check if input is a Variable (autograd-enabled)
        if hasattr(x, 'requires_grad') and hasattr(x, 'grad_fn'):
            # Input is a Variable - preserve autograd capabilities

            # Forward pass: ReLU activation
            input_data = x.data.data if hasattr(x.data, 'data') else x.data
            output_data = np.maximum(0, input_data)

            # Create gradient function for backward pass
            def relu_grad_fn(grad_output):
                if x.requires_grad:
                    # ReLU gradient: 1 where input > 0, 0 elsewhere
                    relu_mask = (input_data > 0).astype(np.float32)
                    grad_input_data = grad_output.data.data * relu_mask
                    # Import Variable locally to avoid circular imports
                    try:
                        from tinytorch.core.autograd import Variable
                    except ImportError:
                        from autograd_dev import Variable
                    grad_input = Variable(grad_input_data)
                    x.backward(grad_input)

            # Return Variable with gradient function
            requires_grad = x.requires_grad
            # Import Variable locally to avoid circular imports
            try:
                from tinytorch.core.autograd import Variable
            except ImportError:
                from autograd_dev import Variable
            result = Variable(output_data, requires_grad=requires_grad, grad_fn=relu_grad_fn if requires_grad else None)
            return result
        else:
            # Input is a Tensor - use original implementation
            result = np.maximum(0, x.data)
            return type(x)(result)
        ### END SOLUTION

    def __call__(self, x):
        """Make the class callable: relu(x) instead of relu.forward(x)"""
        return self.forward(x)

# %% ../../modules/source/03_activations/activations_dev.ipynb 11
class Sigmoid:
    """
    Sigmoid Activation Function: f(x) = 1 / (1 + e^(-x))

    Maps any real number to the range (0, 1).
    Useful for binary classification and probability outputs.
    """

    def forward(self, x):
        """
        Apply Sigmoid activation: f(x) = 1 / (1 + e^(-x))

        TODO: Implement Sigmoid activation function.

        STEP-BY-STEP IMPLEMENTATION:
        1. Compute the negative of input: -x.data
        2. Compute the exponential: np.exp(-x.data)
        3. Add 1 to the exponential: 1 + np.exp(-x.data)
        4. Take the reciprocal: 1 / (1 + np.exp(-x.data))
        5. Return as new Tensor

        EXAMPLE USAGE:
        ```python
        sigmoid = Sigmoid()
        input_tensor = Tensor([[-2, -1, 0, 1, 2]])
        output = sigmoid(input_tensor)
        print(output.data)  # [[0.119, 0.269, 0.5, 0.731, 0.881]]
        ```

        IMPLEMENTATION HINTS:
        - Use np.exp() for exponential function
        - Formula: 1 / (1 + np.exp(-x.data))
        - Handle potential overflow with np.clip(-x.data, -500, 500)
        - Return Tensor(result)

        LEARNING CONNECTIONS:
        - This is like torch.nn.Sigmoid() in PyTorch
        - Used in binary classification output layers
        - Key component in LSTM and GRU gating mechanisms
        - Historically important for early neural networks
        """
        ### BEGIN SOLUTION
        # Handle both Variable (x.data) and Tensor (x._data) inputs
        if hasattr(x, 'data'):
            # x is a Variable, get the tensor data
            if hasattr(x.data, '_data'):
                # x.data is a Tensor
                data = x.data._data
            else:
                # x.data is already numpy array
                data = x.data
        else:
            # x is a Tensor
            data = x._data

        # Clip to prevent overflow
        clipped_input = np.clip(-data, -500, 500)
        result = 1 / (1 + np.exp(clipped_input))
        return type(x)(result)
        ### END SOLUTION

    def __call__(self, x):
        """Make the class callable: sigmoid(x) instead of sigmoid.forward(x)"""
        return self.forward(x)

# %% ../../modules/source/03_activations/activations_dev.ipynb 15
class Tanh:
    """
    Tanh Activation Function: f(x) = (e^x - e^(-x)) / (e^x + e^(-x))

    Zero-centered activation function with range (-1, 1).
    Better gradient properties than sigmoid.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply Tanh activation: f(x) = (e^x - e^(-x)) / (e^x + e^(-x))

        TODO: Implement Tanh activation function.

        STEP-BY-STEP IMPLEMENTATION:
        1. Use NumPy's built-in tanh function: np.tanh(x.data)
        2. Alternatively, implement manually:
           - Compute e^x and e^(-x)
           - Calculate (e^x - e^(-x)) / (e^x + e^(-x))
        3. Return as new Tensor

        EXAMPLE USAGE:
        ```python
        tanh = Tanh()
        input_tensor = Tensor([[-2, -1, 0, 1, 2]])
        output = tanh(input_tensor)
        print(output.data)  # [[-0.964, -0.762, 0, 0.762, 0.964]]
        ```

        IMPLEMENTATION HINTS:
        - Use np.tanh(x.data) for simplicity
        - Manual implementation: (np.exp(x.data) - np.exp(-x.data)) / (np.exp(x.data) + np.exp(-x.data))
        - Handle overflow by clipping inputs: np.clip(x.data, -500, 500)
        - Return Tensor(result)

        LEARNING CONNECTIONS:
        - This is like torch.nn.Tanh() in PyTorch
        - Used in RNN, LSTM, and GRU cells
        - Better than sigmoid for hidden layers
        - Zero-centered outputs help with gradient flow
        """
        ### BEGIN SOLUTION
        # Use NumPy's built-in tanh function
        result = np.tanh(x.data)
        return type(x)(result)
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Make the class callable: tanh(x) instead of tanh.forward(x)"""
        return self.forward(x)

# %% ../../modules/source/03_activations/activations_dev.ipynb 19
class Softmax:
    """
    Softmax Activation Function: f(x_i) = e^(x_i) / Σ(e^(x_j))

    Converts a vector of real numbers into a probability distribution.
    Essential for multi-class classification.
    """

    def forward(self, x):
        """
        Apply Softmax activation: f(x_i) = e^(x_i) / Σ(e^(x_j))

        TODO: Implement Softmax activation function.

        STEP-BY-STEP IMPLEMENTATION:
        1. Handle empty input case
        2. Subtract max value for numerical stability: x - max(x)
        3. Compute exponentials: np.exp(x - max(x))
        4. Compute sum of exponentials: np.sum(exp_values)
        5. Divide each exponential by the sum: exp_values / sum
        6. Return as same tensor type as input

        EXAMPLE USAGE:
        ```python
        softmax = Softmax()
        input_tensor = Tensor([[1, 2, 3]])
        output = softmax(input_tensor)
        print(output.data)  # [[0.09, 0.24, 0.67]]
        print(np.sum(output.data))  # 1.0
        ```

        IMPLEMENTATION HINTS:
        - Handle empty case: if x.data.size == 0: return type(x)(x.data.copy())
        - Subtract max for numerical stability: x_shifted = x.data - np.max(x.data, axis=-1, keepdims=True)
        - Compute exponentials: exp_values = np.exp(x_shifted)
        - Sum along last axis: sum_exp = np.sum(exp_values, axis=-1, keepdims=True)
        - Divide: result = exp_values / sum_exp
        - Return same type as input: return type(x)(result)

        LEARNING CONNECTIONS:
        - This is like torch.nn.Softmax() in PyTorch
        - Used in classification output layers
        - Key component in attention mechanisms
        - Enables probability-based decision making
        """
        ### BEGIN SOLUTION
        # Check if input is a Variable (autograd-enabled)
        if hasattr(x, 'requires_grad') and hasattr(x, 'grad_fn'):
            # Input is a Variable - preserve autograd capabilities

            # Forward pass: Softmax activation
            input_data = x.data.data if hasattr(x.data, 'data') else x.data

            # Handle empty input
            if input_data.size == 0:
                # Import Variable locally to avoid circular imports
                try:
                    from tinytorch.core.autograd import Variable
                except ImportError:
                    from autograd_dev import Variable
                return Variable(input_data.copy(), requires_grad=x.requires_grad)

            # Subtract max for numerical stability
            x_shifted = input_data - np.max(input_data, axis=-1, keepdims=True)

            # Compute exponentials
            exp_values = np.exp(x_shifted)

            # Sum along last axis
            sum_exp = np.sum(exp_values, axis=-1, keepdims=True)

            # Divide to get probabilities
            output_data = exp_values / sum_exp

            # Create gradient function for backward pass
            def softmax_grad_fn(grad_output):
                if x.requires_grad:
                    # Softmax gradient: softmax(x) * (grad_output - (softmax(x) * grad_output).sum())
                    grad_input_data = output_data * (grad_output.data.data - np.sum(output_data * grad_output.data.data, axis=-1, keepdims=True))
                    # Import Variable locally to avoid circular imports
                    try:
                        from tinytorch.core.autograd import Variable
                    except ImportError:
                        from autograd_dev import Variable
                    grad_input = Variable(grad_input_data)
                    x.backward(grad_input)

            # Return Variable with gradient function
            requires_grad = x.requires_grad
            # Import Variable locally to avoid circular imports
            try:
                from tinytorch.core.autograd import Variable
            except ImportError:
                from autograd_dev import Variable
            result = Variable(output_data, requires_grad=requires_grad, grad_fn=softmax_grad_fn if requires_grad else None)
            return result
        else:
            # Input is a Tensor - use original implementation
            # Handle empty input
            if x.data.size == 0:
                return type(x)(x.data.copy())

            # Subtract max for numerical stability
            x_shifted = x.data - np.max(x.data, axis=-1, keepdims=True)

            # Compute exponentials
            exp_values = np.exp(x_shifted)

            # Sum along last axis
            sum_exp = np.sum(exp_values, axis=-1, keepdims=True)

            # Divide to get probabilities
            result = exp_values / sum_exp

            return type(x)(result)
        ### END SOLUTION

    def __call__(self, x):
        """Make the class callable: softmax(x) instead of softmax.forward(x)"""
        return self.forward(x)

# %% ../../modules/source/03_activations/activations_dev.ipynb 26
import time

class ActivationProfiler:
    """
    Performance profiling toolkit for activation functions.

    Helps ML engineers understand computational costs and optimize
    neural network performance for production deployment.
    """

    def __init__(self):
        self.results = {}

    def time_activation(self, activation_fn, tensor, activation_name, iterations=100):
        """
        Time how long an activation function takes to run.

        TODO: Implement activation timing.

        STEP-BY-STEP IMPLEMENTATION:
        1. Record start time using time.time()
        2. Run the activation function for specified iterations
        3. Record end time
        4. Calculate average time per iteration
        5. Return the average time in milliseconds

        EXAMPLE:
        profiler = ActivationProfiler()
        relu = ReLU()
        test_tensor = Tensor(np.random.randn(1000, 1000))
        avg_time = profiler.time_activation(relu, test_tensor, "ReLU")
        print(f"ReLU took {avg_time:.3f} ms on average")

        HINTS:
        - Use time.time() for timing
        - Run multiple iterations for better accuracy
        - Calculate: (end_time - start_time) / iterations * 1000 for ms
        - Return the average time per call in milliseconds
        """
        ### BEGIN SOLUTION
        start_time = time.time()

        for _ in range(iterations):
            result = activation_fn(tensor)

        end_time = time.time()
        avg_time_ms = (end_time - start_time) / iterations * 1000

        return avg_time_ms
        ### END SOLUTION

    def compare_activations(self, tensor_size=(1000, 1000), iterations=50):
        """
        Compare performance of all activation functions.

        This function is PROVIDED to show systems analysis.
        Students run it to understand performance differences.
        """
        print(f"⚡ ACTIVATION PERFORMANCE COMPARISON")
        print(f"=" * 50)
        print(f"Tensor size: {tensor_size}, Iterations: {iterations}")

        # Create test tensor
        test_tensor = Tensor(np.random.randn(*tensor_size))
        tensor_mb = test_tensor.data.nbytes / (1024 * 1024)
        print(f"Test tensor: {tensor_mb:.2f} MB")

        # Test all activation functions
        activations = {
            'ReLU': ReLU(),
            'Sigmoid': Sigmoid(),
            'Tanh': Tanh(),
            'Softmax': Softmax()
        }

        results = {}
        for name, activation_fn in activations.items():
            avg_time = self.time_activation(activation_fn, test_tensor, name, iterations)
            results[name] = avg_time
            print(f"   {name:8}: {avg_time:.3f} ms")

        # Calculate speed ratios relative to fastest
        fastest_time = min(results.values())
        fastest_name = min(results, key=results.get)

        print(f"\n📊 SPEED ANALYSIS:")
        for name, time_ms in sorted(results.items(), key=lambda x: x[1]):
            speed_ratio = time_ms / fastest_time
            if name == fastest_name:
                print(f"   {name:8}: {speed_ratio:.1f}x (fastest)")
            else:
                print(f"   {name:8}: {speed_ratio:.1f}x slower than {fastest_name}")

        return results

    def analyze_scaling(self, activation_fn, activation_name, sizes=[100, 500, 1000]):
        """
        Analyze how activation performance scales with tensor size.

        This function is PROVIDED to demonstrate scaling patterns.
        Students use it to understand computational complexity.
        """
        print(f"\n🔍 SCALING ANALYSIS: {activation_name}")
        print(f"=" * 40)

        scaling_results = []

        for size in sizes:
            test_tensor = Tensor(np.random.randn(size, size))
            avg_time = self.time_activation(activation_fn, test_tensor, activation_name, iterations=20)

            elements = size * size
            time_per_element = avg_time / elements * 1e6  # microseconds per element

            result = {
                'size': size,
                'elements': elements,
                'time_ms': avg_time,
                'time_per_element_us': time_per_element
            }
            scaling_results.append(result)

            print(f"   {size}x{size}: {avg_time:.3f}ms ({time_per_element:.3f}μs/element)")

        # Analyze scaling pattern
        if len(scaling_results) >= 2:
            small = scaling_results[0]
            large = scaling_results[-1]

            size_ratio = large['size'] / small['size']
            time_ratio = large['time_ms'] / small['time_ms']

            print(f"\n📈 Scaling Pattern:")
            print(f"   Size increased {size_ratio:.1f}x ({small['size']} → {large['size']})")
            print(f"   Time increased {time_ratio:.1f}x")

            if abs(time_ratio - size_ratio**2) < abs(time_ratio - size_ratio):
                print(f"   Pattern: O(n^2) - linear in tensor size")
            else:
                print(f"   Pattern: ~O(n) - very efficient scaling")

        return scaling_results

def benchmark_activation_suite():
    """
    Comprehensive benchmark of all activation functions.

    This function is PROVIDED to show complete systems analysis.
    Students run it to understand production performance implications.
    """
    profiler = ActivationProfiler()

    print("🏆 COMPREHENSIVE ACTIVATION BENCHMARK")
    print("=" * 60)

    # Test 1: Performance comparison
    comparison_results = profiler.compare_activations(tensor_size=(800, 800), iterations=30)

    # Test 2: Scaling analysis for each activation
    activations_to_test = [
        (ReLU(), "ReLU"),
        (Sigmoid(), "Sigmoid"),
        (Tanh(), "Tanh")
    ]

    for activation_fn, name in activations_to_test:
        profiler.analyze_scaling(activation_fn, name, sizes=[200, 400, 600])

    # Test 3: Memory vs Performance trade-offs
    print(f"\n💾 MEMORY vs PERFORMANCE ANALYSIS:")
    print(f"=" * 40)

    test_tensor = Tensor(np.random.randn(500, 500))
    original_memory = test_tensor.data.nbytes / (1024 * 1024)

    for name, activation_fn in [("ReLU", ReLU()), ("Sigmoid", Sigmoid())]:
        start_time = time.time()
        result = activation_fn(test_tensor)
        end_time = time.time()

        result_memory = result.data.nbytes / (1024 * 1024)
        time_ms = (end_time - start_time) * 1000

        print(f"   {name}:")
        print(f"     Input: {original_memory:.2f} MB")
        print(f"     Output: {result_memory:.2f} MB")
        print(f"     Memory overhead: {result_memory - original_memory:.2f} MB")
        print(f"     Time: {time_ms:.3f} ms")

    print(f"\n🎯 PRODUCTION INSIGHTS:")
    print(f"   - ReLU is typically fastest (simple max operation)")
    print(f"   - Sigmoid/Tanh slower due to exponential calculations")
    print(f"   - All operations scale linearly with tensor size")
    print(f"   - Memory usage doubles (input + output tensors)")
    print(f"   - Choose activation based on accuracy vs speed trade-offs")

    return comparison_results