TinyTorch/tinytorch/core/activations.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/03_activations/activations_dev.ipynb.

# %% auto 0
__all__ = ['ReLU', 'Sigmoid', 'Tanh', 'Softmax', 'ActivationProfiler', 'benchmark_activation_suite']

# %% ../../modules/source/03_activations/activations_dev.ipynb 1
import math
import numpy as np
import os
import sys
from typing import Union, List

# Import our Tensor class - try from package first, then from local module
try:
    from tinytorch.core.tensor import Tensor
except ImportError:
    # For development, import from local tensor module
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
    from tensor_dev import Tensor

# Import Variable for autograd support
try:
    from tinytorch.core.autograd import Variable
except ImportError:
    # For development, import from local autograd module
    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '09_autograd'))
    from autograd_dev import Variable

# %% ../../modules/source/03_activations/activations_dev.ipynb 7
class ReLU:
    """
    ReLU Activation Function: f(x) = max(0, x)

    The most popular activation function in deep learning.
    Simple, fast, and effective for most applications.
    """

    def forward(self, x):
        """
        Apply ReLU activation: f(x) = max(0, x)

        Now supports both Tensor and Variable inputs with automatic differentiation.

        STEP-BY-STEP IMPLEMENTATION:
        1. Check if input is Variable (for autograd) or Tensor
        2. For each element in the input tensor, apply max(0, element)
        3. If input is Variable: create Variable output with proper gradient function
        4. If input is Tensor: return Tensor as before

        MATHEMATICAL FOUNDATION:
        - Forward: f(x) = max(0, x)
        - Backward: f'(x) = 1 if x > 0, else 0

        EXAMPLE USAGE:
        ```python
        relu = ReLU()
        # With Tensor (no gradients)
        tensor_input = Tensor([[-2, -1, 0, 1, 2]])
        tensor_output = relu(tensor_input)

        # With Variable (with gradients)
        var_input = Variable([[-2, -1, 0, 1, 2]], requires_grad=True)
        var_output = relu(var_input)
        var_output.backward()
        print(var_input.grad)  # Gradients: [0, 0, 0, 1, 1]
        ```

        IMPLEMENTATION HINTS:
        - Check type with hasattr(x, 'requires_grad')
        - For Variables: implement gradient function for backward pass
        - ReLU gradient: 1 where input > 0, 0 elsewhere
        - Use np.maximum(0, x.data) for forward pass

        LEARNING CONNECTIONS:
        - This is like torch.nn.ReLU() in PyTorch with autograd support
        - Enables gradient-based training of neural networks
        - ReLU's simple gradient (0 or 1) prevents vanishing gradients
        - Creates sparse representations and efficient gradient flow
        """
        ### BEGIN SOLUTION
        # Check if input is a Variable (autograd-enabled)
        if hasattr(x, 'requires_grad') and hasattr(x, 'grad_fn'):
            # Input is a Variable - preserve autograd capabilities

            # Forward pass: ReLU activation
            input_data = x.data.data if hasattr(x.data, 'data') else x.data
            output_data = np.maximum(0, input_data)

            # Create gradient function for backward pass
            def relu_grad_fn(grad_output):
                if x.requires_grad:
                    # ReLU gradient: 1 where input > 0, 0 elsewhere
                    relu_mask = (input_data > 0).astype(np.float32)
                    grad_input_data = grad_output.data.data * relu_mask
                    grad_input = Variable(grad_input_data)
                    x.backward(grad_input)

            # Return Variable with gradient function
            requires_grad = x.requires_grad
            result = Variable(output_data, requires_grad=requires_grad, grad_fn=relu_grad_fn if requires_grad else None)
            return result
        else:
            # Input is a Tensor - use original implementation
            result = np.maximum(0, x.data)
            return type(x)(result)
        ### END SOLUTION

    def __call__(self, x):
        """Make the class callable: relu(x) instead of relu.forward(x)"""
        return self.forward(x)

# %% ../../modules/source/03_activations/activations_dev.ipynb 11
class Sigmoid:
    """
    Sigmoid Activation Function: f(x) = 1 / (1 + e^(-x))

    Maps any real number to the range (0, 1).
    Useful for binary classification and probability outputs.
    """

    def forward(self, x):
        """
        Apply Sigmoid activation: f(x) = 1 / (1 + e^(-x))

        Now supports both Tensor and Variable inputs with automatic differentiation.

        STEP-BY-STEP IMPLEMENTATION:
        1. Check if input is Variable (for autograd) or Tensor
        2. Compute sigmoid: 1 / (1 + exp(-x))
        3. If input is Variable: create Variable output with proper gradient function
        4. If input is Tensor: return Tensor as before

        MATHEMATICAL FOUNDATION:
        - Forward: f(x) = 1 / (1 + e^(-x))
        - Backward: f'(x) = f(x) * (1 - f(x)) = sigmoid(x) * (1 - sigmoid(x))

        EXAMPLE USAGE:
        ```python
        sigmoid = Sigmoid()
        # With Variable (with gradients)
        var_input = Variable([[0.0]], requires_grad=True)
        var_output = sigmoid(var_input)  # 0.5
        var_output.backward()
        print(var_input.grad)  # 0.25 = 0.5 * (1 - 0.5)
        ```

        IMPLEMENTATION HINTS:
        - Check type with hasattr(x, 'requires_grad')
        - For Variables: implement gradient function for backward pass
        - Sigmoid gradient: sigmoid(x) * (1 - sigmoid(x))
        - Use numerical stability: clip inputs to prevent overflow

        LEARNING CONNECTIONS:
        - This is like torch.nn.Sigmoid() in PyTorch with autograd support
        - Used in binary classification and gating mechanisms
        - Smooth gradients enable stable training
        - Self-normalizing gradient (max at x=0, decreases at extremes)
        """
        ### BEGIN SOLUTION
        # Check if input is a Variable (autograd-enabled)
        if hasattr(x, 'requires_grad') and hasattr(x, 'grad_fn'):
            # Input is a Variable - preserve autograd capabilities

            # Forward pass: Sigmoid activation with numerical stability
            input_data = x.data.data if hasattr(x.data, 'data') else x.data
            clipped_input = np.clip(-input_data, -500, 500)
            output_data = 1 / (1 + np.exp(clipped_input))

            # Create gradient function for backward pass
            def sigmoid_grad_fn(grad_output):
                if x.requires_grad:
                    # Sigmoid gradient: sigmoid(x) * (1 - sigmoid(x))
                    sigmoid_grad = output_data * (1 - output_data)
                    grad_input_data = grad_output.data.data * sigmoid_grad
                    grad_input = Variable(grad_input_data)
                    x.backward(grad_input)

            # Return Variable with gradient function
            requires_grad = x.requires_grad
            result = Variable(output_data, requires_grad=requires_grad, grad_fn=sigmoid_grad_fn if requires_grad else None)
            return result
        else:
            # Input is a Tensor - use original implementation
            clipped_input = np.clip(-x.data, -500, 500)
            result = 1 / (1 + np.exp(clipped_input))
            return type(x)(result)
        ### END SOLUTION

    def __call__(self, x):
        """Make the class callable: sigmoid(x) instead of sigmoid.forward(x)"""
        return self.forward(x)

# %% ../../modules/source/03_activations/activations_dev.ipynb 15
class Tanh:
    """
    Tanh Activation Function: f(x) = (e^x - e^(-x)) / (e^x + e^(-x))

    Zero-centered activation function with range (-1, 1).
    Better gradient properties than sigmoid.
    """

    def forward(self, x):
        """
        Apply Tanh activation: f(x) = (e^x - e^(-x)) / (e^x + e^(-x))

        Now supports both Tensor and Variable inputs with automatic differentiation.

        STEP-BY-STEP IMPLEMENTATION:
        1. Check if input is Variable (for autograd) or Tensor
        2. Compute tanh: (e^x - e^(-x)) / (e^x + e^(-x))
        3. If input is Variable: create Variable output with proper gradient function
        4. If input is Tensor: return Tensor as before

        MATHEMATICAL FOUNDATION:
        - Forward: f(x) = tanh(x)
        - Backward: f'(x) = 1 - tanh²(x) = 1 - f(x)²

        EXAMPLE USAGE:
        ```python
        tanh = Tanh()
        # With Variable (with gradients)
        var_input = Variable([[0.0]], requires_grad=True)
        var_output = tanh(var_input)  # 0.0
        var_output.backward()
        print(var_input.grad)  # 1.0 = 1 - 0²
        ```

        IMPLEMENTATION HINTS:
        - Check type with hasattr(x, 'requires_grad')
        - For Variables: implement gradient function for backward pass
        - Tanh gradient: 1 - tanh²(x)
        - Use np.tanh() for numerical stability

        LEARNING CONNECTIONS:
        - This is like torch.nn.Tanh() in PyTorch with autograd support
        - Used in RNN, LSTM, and GRU cells
        - Zero-centered outputs improve gradient flow
        - Strong gradients near zero, weaker at extremes
        """
        ### BEGIN SOLUTION
        # Check if input is a Variable (autograd-enabled)
        if hasattr(x, 'requires_grad') and hasattr(x, 'grad_fn'):
            # Input is a Variable - preserve autograd capabilities

            # Forward pass: Tanh activation
            input_data = x.data.data if hasattr(x.data, 'data') else x.data
            output_data = np.tanh(input_data)

            # Create gradient function for backward pass
            def tanh_grad_fn(grad_output):
                if x.requires_grad:
                    # Tanh gradient: 1 - tanh²(x)
                    tanh_grad = 1 - output_data ** 2
                    grad_input_data = grad_output.data.data * tanh_grad
                    grad_input = Variable(grad_input_data)
                    x.backward(grad_input)

            # Return Variable with gradient function
            requires_grad = x.requires_grad
            result = Variable(output_data, requires_grad=requires_grad, grad_fn=tanh_grad_fn if requires_grad else None)
            return result
        else:
            # Input is a Tensor - use original implementation
            result = np.tanh(x.data)
            return type(x)(result)
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Make the class callable: tanh(x) instead of tanh.forward(x)"""
        return self.forward(x)

# %% ../../modules/source/03_activations/activations_dev.ipynb 19
class Softmax:
    """
    Softmax Activation Function: f(x_i) = e^(x_i) / Σ(e^(x_j))

    Converts a vector of real numbers into a probability distribution.
    Essential for multi-class classification.
    """

    def forward(self, x):
        """
        Apply Softmax activation: f(x_i) = e^(x_i) / Σ(e^(x_j))

        Now supports both Tensor and Variable inputs with automatic differentiation.

        STEP-BY-STEP IMPLEMENTATION:
        1. Check if input is Variable (for autograd) or Tensor
        2. Compute softmax with numerical stability
        3. If input is Variable: create Variable output with proper gradient function
        4. If input is Tensor: return Tensor as before

        MATHEMATICAL FOUNDATION:
        - Forward: f(x_i) = e^(x_i) / Σ(e^(x_j))
        - Backward: ∂f_i/∂x_j = f_i * (δ_ij - f_j) where δ_ij is Kronecker delta
        - Simplified: ∂f_i/∂x_i = f_i * (1 - f_i), ∂f_i/∂x_j = -f_i * f_j (i ≠ j)

        EXAMPLE USAGE:
        ```python
        softmax = Softmax()
        # With Variable (with gradients)
        var_input = Variable([[1.0, 2.0]], requires_grad=True)
        var_output = softmax(var_input)
        var_output.backward(Variable([[1.0, 0.0]]))
        # Gradients computed automatically
        ```

        IMPLEMENTATION HINTS:
        - Check type with hasattr(x, 'requires_grad')
        - For Variables: implement gradient function for backward pass
        - Softmax gradient: Jacobian matrix with f_i * (δ_ij - f_j)
        - Use numerical stability: subtract max before exponential

        LEARNING CONNECTIONS:
        - This is like torch.nn.Softmax() in PyTorch with autograd support
        - Used in classification and attention mechanisms
        - Converts logits to probability distributions
        - Complex gradient structure due to normalization
        """
        ### BEGIN SOLUTION
        # Check if input is a Variable (autograd-enabled)
        if hasattr(x, 'requires_grad') and hasattr(x, 'grad_fn'):
            # Input is a Variable - preserve autograd capabilities

            # Forward pass: Softmax activation with numerical stability
            input_data = x.data.data if hasattr(x.data, 'data') else x.data

            # Handle empty input
            if input_data.size == 0:
                return Variable(input_data.copy(), requires_grad=x.requires_grad)

            # Subtract max for numerical stability
            x_shifted = input_data - np.max(input_data, axis=-1, keepdims=True)

            # Compute exponentials
            exp_values = np.exp(x_shifted)

            # Sum along last axis
            sum_exp = np.sum(exp_values, axis=-1, keepdims=True)

            # Divide to get probabilities
            output_data = exp_values / sum_exp

            # Create gradient function for backward pass
            def softmax_grad_fn(grad_output):
                if x.requires_grad:
                    # Softmax gradient: for each element i,j: ∂f_i/∂x_j = f_i * (δ_ij - f_j)
                    # For vector input, this becomes: grad_input = softmax * (grad_output - (softmax * grad_output).sum(keepdims=True))
                    grad_out_data = grad_output.data.data
                    softmax_grad_sum = np.sum(output_data * grad_out_data, axis=-1, keepdims=True)
                    grad_input_data = output_data * (grad_out_data - softmax_grad_sum)
                    grad_input = Variable(grad_input_data)
                    x.backward(grad_input)

            # Return Variable with gradient function
            requires_grad = x.requires_grad
            result = Variable(output_data, requires_grad=requires_grad, grad_fn=softmax_grad_fn if requires_grad else None)
            return result
        else:
            # Input is a Tensor - use original implementation
            # Handle empty input
            if x.data.size == 0:
                return type(x)(x.data.copy())

            # Subtract max for numerical stability
            x_shifted = x.data - np.max(x.data, axis=-1, keepdims=True)

            # Compute exponentials
            exp_values = np.exp(x_shifted)

            # Sum along last axis
            sum_exp = np.sum(exp_values, axis=-1, keepdims=True)

            # Divide to get probabilities
            result = exp_values / sum_exp

            return type(x)(result)
        ### END SOLUTION

    def __call__(self, x):
        """Make the class callable: softmax(x) instead of softmax.forward(x)"""
        return self.forward(x)

# %% ../../modules/source/03_activations/activations_dev.ipynb 30
import time

class ActivationProfiler:
    """
    Performance profiling toolkit for activation functions.

    Helps ML engineers understand computational costs and optimize
    neural network performance for production deployment.
    """

    def __init__(self):
        self.results = {}

    def time_activation(self, activation_fn, tensor, activation_name, iterations=100):
        """
        Time how long an activation function takes to run.

        TODO: Implement activation timing.

        STEP-BY-STEP IMPLEMENTATION:
        1. Record start time using time.time()
        2. Run the activation function for specified iterations
        3. Record end time
        4. Calculate average time per iteration
        5. Return the average time in milliseconds

        EXAMPLE:
        profiler = ActivationProfiler()
        relu = ReLU()
        test_tensor = Tensor(np.random.randn(1000, 1000))
        avg_time = profiler.time_activation(relu, test_tensor, "ReLU")
        print(f"ReLU took {avg_time:.3f} ms on average")

        HINTS:
        - Use time.time() for timing
        - Run multiple iterations for better accuracy
        - Calculate: (end_time - start_time) / iterations * 1000 for ms
        - Return the average time per call in milliseconds
        """
        ### BEGIN SOLUTION
        start_time = time.time()

        for _ in range(iterations):
            result = activation_fn(tensor)

        end_time = time.time()
        avg_time_ms = (end_time - start_time) / iterations * 1000

        return avg_time_ms
        ### END SOLUTION

    def compare_activations(self, tensor_size=(1000, 1000), iterations=50):
        """
        Compare performance of all activation functions.

        This function is PROVIDED to show systems analysis.
        Students run it to understand performance differences.
        """
        print(f"⚡ ACTIVATION PERFORMANCE COMPARISON")
        print(f"=" * 50)
        print(f"Tensor size: {tensor_size}, Iterations: {iterations}")

        # Create test tensor
        test_tensor = Tensor(np.random.randn(*tensor_size))
        tensor_mb = test_tensor.data.nbytes / (1024 * 1024)
        print(f"Test tensor: {tensor_mb:.2f} MB")

        # Test all activation functions
        activations = {
            'ReLU': ReLU(),
            'Sigmoid': Sigmoid(),
            'Tanh': Tanh(),
            'Softmax': Softmax()
        }

        results = {}
        for name, activation_fn in activations.items():
            avg_time = self.time_activation(activation_fn, test_tensor, name, iterations)
            results[name] = avg_time
            print(f"   {name:8}: {avg_time:.3f} ms")

        # Calculate speed ratios relative to fastest
        fastest_time = min(results.values())
        fastest_name = min(results, key=results.get)

        print(f"\n📊 SPEED ANALYSIS:")
        for name, time_ms in sorted(results.items(), key=lambda x: x[1]):
            speed_ratio = time_ms / fastest_time
            if name == fastest_name:
                print(f"   {name:8}: {speed_ratio:.1f}x (fastest)")
            else:
                print(f"   {name:8}: {speed_ratio:.1f}x slower than {fastest_name}")

        return results

    def analyze_scaling(self, activation_fn, activation_name, sizes=[100, 500, 1000]):
        """
        Analyze how activation performance scales with tensor size.

        This function is PROVIDED to demonstrate scaling patterns.
        Students use it to understand computational complexity.
        """
        print(f"\n🔍 SCALING ANALYSIS: {activation_name}")
        print(f"=" * 40)

        scaling_results = []

        for size in sizes:
            test_tensor = Tensor(np.random.randn(size, size))
            avg_time = self.time_activation(activation_fn, test_tensor, activation_name, iterations=20)

            elements = size * size
            time_per_element = avg_time / elements * 1e6  # microseconds per element

            result = {
                'size': size,
                'elements': elements,
                'time_ms': avg_time,
                'time_per_element_us': time_per_element
            }
            scaling_results.append(result)

            print(f"   {size}x{size}: {avg_time:.3f}ms ({time_per_element:.3f}μs/element)")

        # Analyze scaling pattern
        if len(scaling_results) >= 2:
            small = scaling_results[0]
            large = scaling_results[-1]

            size_ratio = large['size'] / small['size']
            time_ratio = large['time_ms'] / small['time_ms']

            print(f"\n📈 Scaling Pattern:")
            print(f"   Size increased {size_ratio:.1f}x ({small['size']} → {large['size']})")
            print(f"   Time increased {time_ratio:.1f}x")

            if abs(time_ratio - size_ratio**2) < abs(time_ratio - size_ratio):
                print(f"   Pattern: O(n^2) - linear in tensor size")
            else:
                print(f"   Pattern: ~O(n) - very efficient scaling")

        return scaling_results

def benchmark_activation_suite():
    """
    Comprehensive benchmark of all activation functions.

    This function is PROVIDED to show complete systems analysis.
    Students run it to understand production performance implications.
    """
    profiler = ActivationProfiler()

    print("🏆 COMPREHENSIVE ACTIVATION BENCHMARK")
    print("=" * 60)

    # Test 1: Performance comparison
    comparison_results = profiler.compare_activations(tensor_size=(800, 800), iterations=30)

    # Test 2: Scaling analysis for each activation
    activations_to_test = [
        (ReLU(), "ReLU"),
        (Sigmoid(), "Sigmoid"),
        (Tanh(), "Tanh")
    ]

    for activation_fn, name in activations_to_test:
        profiler.analyze_scaling(activation_fn, name, sizes=[200, 400, 600])

    # Test 3: Memory vs Performance trade-offs
    print(f"\n💾 MEMORY vs PERFORMANCE ANALYSIS:")
    print(f"=" * 40)

    test_tensor = Tensor(np.random.randn(500, 500))
    original_memory = test_tensor.data.nbytes / (1024 * 1024)

    for name, activation_fn in [("ReLU", ReLU()), ("Sigmoid", Sigmoid())]:
        start_time = time.time()
        result = activation_fn(test_tensor)
        end_time = time.time()

        result_memory = result.data.nbytes / (1024 * 1024)
        time_ms = (end_time - start_time) * 1000

        print(f"   {name}:")
        print(f"     Input: {original_memory:.2f} MB")
        print(f"     Output: {result_memory:.2f} MB")
        print(f"     Memory overhead: {result_memory - original_memory:.2f} MB")
        print(f"     Time: {time_ms:.3f} ms")

    print(f"\n🎯 PRODUCTION INSIGHTS:")
    print(f"   - ReLU is typically fastest (simple max operation)")
    print(f"   - Sigmoid/Tanh slower due to exponential calculations")
    print(f"   - All operations scale linearly with tensor size")
    print(f"   - Memory usage doubles (input + output tensors)")
    print(f"   - Choose activation based on accuracy vs speed trade-offs")

    return comparison_results