TinyTorch/tinytorch/core/activations.py

# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║                        🚨 CRITICAL WARNING 🚨                                ║
# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
# ║                                                                               ║
# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
# ║                                                                               ║
# ║  ✅ TO EDIT: modules/source/03_activations/activations_dev.py       ║
# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
# ║                                                                               ║
# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
# ║     Editing it directly may break module functionality and training.         ║
# ║                                                                               ║
# ║  🎓 LEARNING TIP: Work in modules/source/ - that's where real development    ║
# ║     happens! The tinytorch/ directory is just the compiled output.           ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
__all__ = ['Sigmoid', 'ReLU', 'Tanh', 'GELU', 'Softmax']

# %% ../../modules/source/02_activations/activations_dev.ipynb 3
import numpy as np
from typing import Optional
import sys
import os


# Import will be in export cell

# %% ../../modules/source/02_activations/activations_dev.ipynb 8
from .tensor import Tensor

class Sigmoid:
    """
    Sigmoid activation: σ(x) = 1/(1 + e^(-x))

    Maps any real number to (0, 1) range.
    Perfect for probabilities and binary classification.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply sigmoid activation element-wise.

        TODO: Implement sigmoid function

        APPROACH:
        1. Apply sigmoid formula: 1 / (1 + exp(-x))
        2. Use np.exp for exponential
        3. Return result wrapped in new Tensor

        EXAMPLE:
        >>> sigmoid = Sigmoid()
        >>> x = Tensor([-2, 0, 2])
        >>> result = sigmoid(x)
        >>> print(result.data)
        [0.119, 0.5, 0.881]  # All values between 0 and 1

        HINT: Use np.exp(-x.data) for numerical stability
        """
        ### BEGIN SOLUTION
        # Apply sigmoid: 1 / (1 + exp(-x))
        result_data = 1.0 / (1.0 + np.exp(-x.data))
        result = Tensor(result_data)

        # Track gradients if autograd is enabled and input requires_grad
        if SigmoidBackward is not None and x.requires_grad:
            result.requires_grad = True
            result._grad_fn = SigmoidBackward(x, result)

        return result
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Allows the activation to be called like a function."""
        return self.forward(x)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Will implement backward pass in Module 05

# %% ../../modules/source/02_activations/activations_dev.ipynb 12
class ReLU:
    """
    ReLU activation: f(x) = max(0, x)

    Sets negative values to zero, keeps positive values unchanged.
    Most popular activation for hidden layers.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply ReLU activation element-wise.

        TODO: Implement ReLU function

        APPROACH:
        1. Use np.maximum(0, x.data) for element-wise max with zero
        2. Return result wrapped in new Tensor

        EXAMPLE:
        >>> relu = ReLU()
        >>> x = Tensor([-2, -1, 0, 1, 2])
        >>> result = relu(x)
        >>> print(result.data)
        [0, 0, 0, 1, 2]  # Negative values become 0, positive unchanged

        HINT: np.maximum handles element-wise maximum automatically
        """
        ### BEGIN SOLUTION
        # Apply ReLU: max(0, x)
        result = np.maximum(0, x.data)
        return Tensor(result)
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Allows the activation to be called like a function."""
        return self.forward(x)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Will implement backward pass in Module 05

# %% ../../modules/source/02_activations/activations_dev.ipynb 16
class Tanh:
    """
    Tanh activation: f(x) = (e^x - e^(-x))/(e^x + e^(-x))

    Maps any real number to (-1, 1) range.
    Zero-centered alternative to sigmoid.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply tanh activation element-wise.

        TODO: Implement tanh function

        APPROACH:
        1. Use np.tanh(x.data) for hyperbolic tangent
        2. Return result wrapped in new Tensor

        EXAMPLE:
        >>> tanh = Tanh()
        >>> x = Tensor([-2, 0, 2])
        >>> result = tanh(x)
        >>> print(result.data)
        [-0.964, 0.0, 0.964]  # Range (-1, 1), symmetric around 0

        HINT: NumPy provides np.tanh function
        """
        ### BEGIN SOLUTION
        # Apply tanh using NumPy
        result = np.tanh(x.data)
        return Tensor(result)
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Allows the activation to be called like a function."""
        return self.forward(x)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Will implement backward pass in Module 05

# %% ../../modules/source/02_activations/activations_dev.ipynb 20
class GELU:
    """
    GELU activation: f(x) = x * Φ(x) ≈ x * Sigmoid(1.702 * x)

    Smooth approximation to ReLU, used in modern transformers.
    Where Φ(x) is the cumulative distribution function of standard normal.
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply GELU activation element-wise.

        TODO: Implement GELU approximation

        APPROACH:
        1. Use approximation: x * sigmoid(1.702 * x)
        2. Compute sigmoid part: 1 / (1 + exp(-1.702 * x))
        3. Multiply by x element-wise
        4. Return result wrapped in new Tensor

        EXAMPLE:
        >>> gelu = GELU()
        >>> x = Tensor([-1, 0, 1])
        >>> result = gelu(x)
        >>> print(result.data)
        [-0.159, 0.0, 0.841]  # Smooth, like ReLU but differentiable everywhere

        HINT: The 1.702 constant comes from √(2/π) approximation
        """
        ### BEGIN SOLUTION
        # GELU approximation: x * sigmoid(1.702 * x)
        # First compute sigmoid part
        sigmoid_part = 1.0 / (1.0 + np.exp(-1.702 * x.data))
        # Then multiply by x
        result = x.data * sigmoid_part
        return Tensor(result)
        ### END SOLUTION

    def __call__(self, x: Tensor) -> Tensor:
        """Allows the activation to be called like a function."""
        return self.forward(x)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Will implement backward pass in Module 05

# %% ../../modules/source/02_activations/activations_dev.ipynb 24
class Softmax:
    """
    Softmax activation: f(x_i) = e^(x_i) / Σ(e^(x_j))

    Converts any vector to a probability distribution.
    Sum of all outputs equals 1.0.
    """

    def forward(self, x: Tensor, dim: int = -1) -> Tensor:
        """
        Apply softmax activation along specified dimension.

        TODO: Implement numerically stable softmax

        APPROACH:
        1. Subtract max for numerical stability: x - max(x)
        2. Compute exponentials: exp(x - max(x))
        3. Sum along dimension: sum(exp_values)
        4. Divide: exp_values / sum
        5. Return result wrapped in new Tensor

        EXAMPLE:
        >>> softmax = Softmax()
        >>> x = Tensor([1, 2, 3])
        >>> result = softmax(x)
        >>> print(result.data)
        [0.090, 0.245, 0.665]  # Sums to 1.0, larger inputs get higher probability

        HINTS:
        - Use np.max(x.data, axis=dim, keepdims=True) for max
        - Use np.sum(exp_values, axis=dim, keepdims=True) for sum
        - The max subtraction prevents overflow in exponentials
        """
        ### BEGIN SOLUTION
        # Numerical stability: subtract max to prevent overflow
        # Use Tensor operations to preserve gradient flow!
        x_max_data = np.max(x.data, axis=dim, keepdims=True)
        x_max = Tensor(x_max_data, requires_grad=False)  # max is not differentiable in this context
        x_shifted = x - x_max  # Tensor subtraction!

        # Compute exponentials (NumPy operation, but wrapped in Tensor)
        exp_values = Tensor(np.exp(x_shifted.data), requires_grad=x_shifted.requires_grad)

        # Sum along dimension (Tensor operation)
        exp_sum_data = np.sum(exp_values.data, axis=dim, keepdims=True)
        exp_sum = Tensor(exp_sum_data, requires_grad=exp_values.requires_grad)

        # Normalize to get probabilities (Tensor division!)
        result = exp_values / exp_sum
        return result
        ### END SOLUTION

    def __call__(self, x: Tensor, dim: int = -1) -> Tensor:
        """Allows the activation to be called like a function."""
        return self.forward(x, dim)

    def backward(self, grad: Tensor) -> Tensor:
        """Compute gradient (implemented in Module 05)."""
        pass  # Will implement backward pass in Module 05