diff --git a/modules/activations/activations_dev.py b/modules/activations/activations_dev.py index 162a03cf..07a0abfb 100644 --- a/modules/activations/activations_dev.py +++ b/modules/activations/activations_dev.py @@ -10,37 +10,29 @@ # %% [markdown] """ -# 🔥 TinyTorch Activations Module +# Module 2: Activations - Nonlinearity in Neural Networks -Welcome to the **Activations** module! This is where you'll implement the mathematical functions that give neural networks their power. +Welcome to the Activations module! This is where neural networks get their power through nonlinearity. -## 🎯 Learning Objectives +## Learning Goals +- Understand why activation functions are essential for neural networks +- Implement the three most important activation functions: ReLU, Sigmoid, and Tanh +- Visualize how activations transform data and enable complex learning +- See how activations work with layers to build powerful networks -By the end of this module, you will: -1. **Understand** why activation functions are essential for neural networks -2. **Implement** the three most important activation functions: ReLU, Sigmoid, and Tanh -3. **Test** your functions with various inputs to understand their behavior -4. **Use** these functions as building blocks for neural networks +## Build → Use → Understand +1. **Build**: Activation functions that add nonlinearity +2. **Use**: Transform tensors and see immediate results +3. **Understand**: How nonlinearity enables complex pattern learning -## 🧠 Why Activation Functions Matter - -**Without activation functions, neural networks are just linear transformations!** - -``` -Linear → Linear → Linear = Still just Linear -Linear → Activation → Linear = Can learn complex patterns! -``` - -**Key insight**: Activation functions add **nonlinearity**, allowing networks to learn complex patterns that linear functions cannot capture.
- -## ๐Ÿ“š What You'll Build - -- **ReLU**: `f(x) = max(0, x)` - The workhorse of deep learning -- **Sigmoid**: `f(x) = 1 / (1 + e^(-x))` - Squashes to (0, 1) -- **Tanh**: `f(x) = tanh(x)` - Squashes to (-1, 1) - -Each function serves different purposes and has different mathematical properties. +## Module Dependencies +This module builds on the **tensor** module: +- **tensor** โ†’ **activations** โ†’ **layers** โ†’ **networks** +- Clean separation: data structures โ†’ math functions โ†’ building blocks โ†’ complete systems +""" +# %% [markdown] +""" ## ๐Ÿ“ฆ Where This Code Lives in the Final Package **Learning Side:** You work in `modules/activations/activations_dev.py` @@ -57,74 +49,115 @@ from tinytorch.core.layers import Dense, Conv2D - **Learning:** Focused modules for deep understanding - **Production:** Proper organization like PyTorch's `torch.nn.functional` - **Consistency:** All activation functions live together in `core.activations` - ---- - -Let's start building! ๐Ÿš€ """ # %% #| default_exp core.activations -# Standard library imports +# Setup and imports import math import numpy as np import matplotlib.pyplot as plt import os import sys +from typing import Union, List -# TinyTorch imports +# Import our Tensor class +from tinytorch.core.tensor import Tensor + +print("๐Ÿ”ฅ TinyTorch Activations Module") +print(f"NumPy version: {np.__version__}") +print(f"Python version: {sys.version_info.major}.{sys.version_info.minor}") +print("Ready to build activation functions!") + +# %% +#| export +import math +import numpy as np +import matplotlib.pyplot as plt +import os +import sys +from typing import Union, List + +# Import our Tensor class from tinytorch.core.tensor import Tensor # %% -# Helper function to detect if we're in a testing environment +#| hide +#| export def _should_show_plots(): - """ - Determine if we should show plots based on the execution context. 
- - Returns False if: - - Running in pytest (detected by 'pytest' in sys.modules) - - Running in test environment (detected by environment variables) - - Running from command line test runner - - Returns True if: - - Running in Jupyter notebook - - Running interactively in Python - """ - # Check if we're running in pytest - if 'pytest' in sys.modules: - return False - - # Check if we're in a test environment - if os.environ.get('PYTEST_CURRENT_TEST'): - return False - - # Check if we're running from a test file (more specific check) - if any(arg.endswith('.py') and 'test_' in os.path.basename(arg) and 'tests/' in arg for arg in sys.argv): - return False - - # Check if we're running from the tito CLI test command - if len(sys.argv) > 0 and 'tito.py' in sys.argv[0] and 'test' in sys.argv: - return False - - # Default to showing plots (notebook/interactive environment) - return True + """Check if we should show plots (disable during testing)""" + return 'pytest' not in sys.modules and 'test' not in sys.argv # %% [markdown] """ -## Step 1: ReLU Activation Function +## Step 1: What is an Activation Function? + +### Definition +An **activation function** is a mathematical function that adds nonlinearity to neural networks. It transforms the output of a layer before passing it to the next layer. + +### Why Activation Functions Matter +**Without activation functions, neural networks are just linear transformations!** + +``` +Linear โ†’ Linear โ†’ Linear = Still just Linear +Linear โ†’ Activation โ†’ Linear = Can learn complex patterns! +``` + +**The fundamental insight**: Activation functions add **nonlinearity**, allowing networks to learn complex patterns that linear functions cannot capture. 
+ +### Real-World Examples +- **ReLU**: Detects when features are "active" (positive) +- **Sigmoid**: Outputs probabilities between 0 and 1 +- **Tanh**: Outputs values between -1 and 1 (centered) + +### Visual Intuition +``` +Input: [-2, -1, 0, 1, 2] +ReLU: [0, 0, 0, 1, 2] (clips negatives to 0) +Sigmoid: [0.12, 0.27, 0.5, 0.73, 0.88] (squashes to 0-1) +Tanh: [-0.96, -0.76, 0, 0.76, 0.96] (squashes to -1 to 1) +``` + +### The Math Behind It +Each activation function has different mathematical properties: +- **ReLU**: `f(x) = max(0, x)` - Simple thresholding +- **Sigmoid**: `f(x) = 1 / (1 + e^(-x))` - Smooth squashing +- **Tanh**: `f(x) = (e^x - e^(-x)) / (e^x + e^(-x))` - Centered squashing + +Let's implement these step by step! +""" + +# %% [markdown] +""" -## Step 1: ReLU Activation Function +## Step 2: ReLU Activation Function **ReLU** (Rectified Linear Unit) is the most popular activation function in deep learning. -**Formula**: `f(x) = max(0, x)` +### What is ReLU? +- **Formula**: `f(x) = max(0, x)` +- **Behavior**: Keeps positive values unchanged, sets negative values to zero +- **Range**: [0, ∞) - unbounded above, bounded below at zero -**Properties**: +### Why ReLU is Popular - **Simple**: Easy to compute and understand -- **Sparse**: Outputs exactly zero for negative inputs -- **Unbounded**: No upper limit on positive outputs +- **Sparse**: Outputs exactly zero for negative inputs (sparsity) - **Non-saturating**: Doesn't suffer from vanishing gradients +- **Computationally efficient**: Just a max operation -**When to use**: Almost everywhere! It's the default choice for hidden layers. +### Real-World Analogy +Think of ReLU as a **threshold detector**: +- If a feature is "active" (positive), let it through +- If a feature is "inactive" (negative), ignore it +- Like a neuron that only fires when stimulated enough + +### Visual Example +``` +Input: [-3, -1, 0, 1, 3] +ReLU: [0, 0, 0, 1, 3] +``` + +Let's implement it!
""" # %% @@ -137,6 +170,20 @@ class ReLU: Simple, effective, and computationally efficient. TODO: Implement ReLU activation function. + + APPROACH: + 1. Extract the numpy array from the input tensor + 2. Apply element-wise max(0, x) operation + 3. Return a new Tensor with the result + + EXAMPLE: + Input: Tensor([[-3, -1, 0, 1, 3]]) + Output: Tensor([[0, 0, 0, 1, 3]]) + + HINTS: + - Use x.data to get the numpy array + - Use np.maximum(0, x.data) for element-wise max + - Return Tensor(result) to wrap the result """ def forward(self, x: Tensor) -> Tensor: @@ -150,7 +197,19 @@ class ReLU: Output tensor with ReLU applied element-wise TODO: Implement element-wise max(0, x) operation - Hint: Use np.maximum(0, x.data) + + STEP-BY-STEP: + 1. Get the numpy array: data = x.data + 2. Apply ReLU: result = np.maximum(0, data) + 3. Return Tensor(result) + + EXAMPLE: + Input: Tensor([[-2, 1, 0]]) + Expected: Tensor([[0, 1, 0]]) + + HINTS: + - np.maximum(0, x.data) applies max(0, x) to each element + - This keeps positive values unchanged and sets negatives to 0 """ raise NotImplementedError("Student implementation required") @@ -174,181 +233,76 @@ class ReLU: # %% [markdown] """ ### ๐Ÿงช Test Your ReLU Function - -Once you implement ReLU above, run this cell to test it: """ # %% # Test ReLU function +print("Testing ReLU function...") + try: - print("=== Testing ReLU Function ===") - # Test data: mix of positive, negative, and zero x = Tensor([[-3.0, -1.0, 0.0, 1.0, 3.0]]) - print(f"Input: {x.data}") + print(f"โœ… Input: {x.data}") # Test ReLU relu = ReLU() y = relu(x) - print(f"ReLU output: {y.data}") - print(f"Expected: [[0. 0. 0. 1. 3.]]") + print(f"โœ… ReLU output: {y.data}") + print(f"โœ… Expected: [[0. 0. 0. 1. 3.]]") + + # Verify the result + expected = np.array([[0.0, 0.0, 0.0, 1.0, 3.0]]) + assert np.allclose(y.data, expected), "โŒ ReLU output doesn't match expected!" 
+ print("๐ŸŽ‰ ReLU works correctly!") # Test with different shapes x_2d = Tensor([[-2.0, 1.0], [0.5, -0.5]]) y_2d = relu(x_2d) - print(f"\n2D Input: {x_2d.data}") - print(f"2D ReLU output: {y_2d.data}") + print(f"โœ… 2D Input: {x_2d.data}") + print(f"โœ… 2D ReLU output: {y_2d.data}") - print("โœ… ReLU working!") + print("\n๐ŸŽ‰ All ReLU tests passed!") except Exception as e: print(f"โŒ Error: {e}") - print("Make sure to implement the ReLU function above!") + print("Make sure to implement ReLU above!") # %% [markdown] """ -### ๐Ÿ“Š Visualize ReLU Function +## Step 3: Sigmoid Activation Function -Let's plot the ReLU function to see how it transforms inputs: -""" +**Sigmoid** is a smooth, S-shaped function that squashes any input to the range (0, 1). -# %% -# Plot ReLU function -try: - print("=== Plotting ReLU Function ===") - - # Create a range of input values - x_range = np.linspace(-5, 5, 100) - x_tensor = Tensor([x_range]) - - # Apply ReLU (student implementation) - relu = ReLU() - y_tensor = relu(x_tensor) - y_range = y_tensor.data[0] - - # Create ideal ReLU for comparison - y_ideal = np.maximum(0, x_range) - - # Only show plots if we're not in a testing environment - if _should_show_plots(): - # Create the plot - plt.figure(figsize=(12, 8)) - - # Plot both student implementation and ideal - plt.subplot(2, 2, 1) - plt.plot(x_range, y_range, 'b-', linewidth=3, label='Your ReLU Implementation') - plt.plot(x_range, y_ideal, 'r--', linewidth=2, alpha=0.7, label='Ideal ReLU') - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.axvline(x=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('Output') - plt.title('ReLU: Your Implementation vs Ideal') - plt.grid(True, alpha=0.3) - plt.legend() - plt.xlim(-5, 5) - plt.ylim(-1, 5) - - # Mathematical explanation plot - plt.subplot(2, 2, 2) - # Show the mathematical definition - x_math = np.array([-3, -2, -1, 0, 1, 2, 3]) - y_math = np.maximum(0, x_math) - plt.stem(x_math, y_math, 
basefmt=' ', linefmt='g-', markerfmt='go') - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.axvline(x=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('max(0, x)') - plt.title('Mathematical Definition: max(0, x)') - plt.grid(True, alpha=0.3) - plt.xlim(-4, 4) - plt.ylim(-0.5, 3.5) - - # Show the piecewise nature - plt.subplot(2, 2, 3) - x_left = np.linspace(-5, 0, 50) - x_right = np.linspace(0, 5, 50) - plt.plot(x_left, np.zeros_like(x_left), 'r-', linewidth=3, label='f(x) = 0 for x < 0') - plt.plot(x_right, x_right, 'b-', linewidth=3, label='f(x) = x for x โ‰ฅ 0') - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.axvline(x=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('Output') - plt.title('Piecewise Function Definition') - plt.grid(True, alpha=0.3) - plt.legend() - plt.xlim(-5, 5) - plt.ylim(-1, 5) - - # Error analysis - plt.subplot(2, 2, 4) - difference = np.abs(y_range - y_ideal) - max_error = np.max(difference) - plt.plot(x_range, difference, 'purple', linewidth=2) - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('|Your Output - Ideal Output|') - plt.title(f'Implementation Error (Max: {max_error:.6f})') - plt.grid(True, alpha=0.3) - plt.xlim(-5, 5) - - plt.tight_layout() - plt.show() - - # Print analysis - print(f"\n๐Ÿ“Š Analysis:") - print(f"โœ… Maximum error: {max_error:.10f}") - if max_error < 1e-10: - print("๐ŸŽ‰ Perfect implementation!") - elif max_error < 1e-6: - print("๐ŸŒŸ Excellent implementation!") - elif max_error < 1e-3: - print("๐Ÿ‘ Good implementation!") - else: - print("๐Ÿ”ง Implementation needs work.") - - print(f"๐Ÿ“ˆ Function properties:") - print(f" โ€ข Range: [0, โˆž)") - print(f" โ€ข Piecewise: f(x) = 0 for x < 0, f(x) = x for x โ‰ฅ 0") - print(f" โ€ข Monotonic: Always increasing for x โ‰ฅ 0") - print(f" โ€ข Sparse: Exactly zero for negative inputs") - else: - print("๐Ÿ“Š Plots disabled during testing - 
this is normal!") - - # Always show the mathematical analysis - difference = np.abs(y_range - y_ideal) - max_error = np.max(difference) - print(f"\n๐Ÿ“Š Mathematical Analysis:") - print(f"โœ… Maximum error: {max_error:.10f}") - if max_error < 1e-10: - print("๐ŸŽ‰ Perfect implementation!") - elif max_error < 1e-6: - print("๐ŸŒŸ Excellent implementation!") - elif max_error < 1e-3: - print("๐Ÿ‘ Good implementation!") - else: - print("๐Ÿ”ง Implementation needs work.") - -except Exception as e: - print(f"โŒ Error in plotting: {e}") - print("Make sure to implement the ReLU function above!") +### What is Sigmoid? +- **Formula**: `f(x) = 1 / (1 + e^(-x))` +- **Behavior**: Smoothly transforms any real number to (0, 1) +- **Range**: (0, 1) - always positive, bounded -# %% [markdown] -""" -## Step 2: Sigmoid Activation Function +### Why Sigmoid Matters +- **Probability interpretation**: Output can be interpreted as probability +- **Smooth**: Continuous and differentiable everywhere +- **Bounded**: Output is always between 0 and 1 +- **Historical importance**: Was the default choice before ReLU -**Sigmoid** squashes any input to the range (0, 1), making it useful for probabilities. 
+### Real-World Analogy +Think of Sigmoid as a **probability converter**: +- Takes any input (positive or negative) +- Converts it to a probability between 0 and 1 +- Like a confidence score that's always positive -**Formula**: `f(x) = 1 / (1 + e^(-x))` +### Visual Example +``` +Input: [-3, -1, 0, 1, 3] +Sigmoid: [0.05, 0.27, 0.5, 0.73, 0.95] +``` -**Properties**: -- **Bounded**: Always outputs between 0 and 1 -- **Smooth**: Differentiable everywhere -- **S-shaped**: Smooth transition from 0 to 1 -- **Saturating**: Can suffer from vanishing gradients +### The Math Behind It +The sigmoid function uses the exponential function: +- For large positive x: e^(-x) ≈ 0, so f(x) ≈ 1 +- For large negative x: e^(-x) → ∞, so f(x) ≈ 0 +- For x = 0: e^0 = 1, so f(x) = 0.5 -**When to use**: Binary classification (final layer), gates in RNNs/LSTMs. - -**⚠️ Numerical Stability**: Be careful with large inputs to avoid overflow! +Let's implement it! """ # %% @@ -357,9 +311,25 @@ class Sigmoid: """ Sigmoid Activation: f(x) = 1 / (1 + e^(-x)) - Squashes input to range (0, 1). Often used for binary classification. + Smooth function that squashes inputs to (0, 1). + Historically important, still used for probability outputs. TODO: Implement Sigmoid activation function. + + APPROACH: + 1. Extract the numpy array from the input tensor + 2. Apply the sigmoid formula: 1 / (1 + e^(-x)) + 3. Return a new Tensor with the result + + EXAMPLE: + Input: Tensor([[-2, 0, 2]]) + Output: Tensor([[0.12, 0.5, 0.88]]) + + HINTS: + - Use x.data to get the numpy array + - Use np.exp(-x.data) for e^(-x) + - Use 1 / (1 + np.exp(-x.data)) for the full formula + - Return Tensor(result) to wrap the result """ def forward(self, x: Tensor) -> Tensor: @@ -372,15 +342,27 @@ class Sigmoid: Returns: Output tensor with Sigmoid applied element-wise - TODO: Implement sigmoid function (be careful with numerical stability!)
+ TODO: Implement the sigmoid formula - Hint: For numerical stability, use: - - For x >= 0: sigmoid(x) = 1 / (1 + exp(-x)) - - For x < 0: sigmoid(x) = exp(x) / (1 + exp(x)) + STEP-BY-STEP: + 1. Get the numpy array: data = x.data + 2. Compute e^(-x): exp_neg = np.exp(-data) + 3. Apply sigmoid: result = 1 / (1 + exp_neg) + 4. Return Tensor(result) + + EXAMPLE: + Input: Tensor([[-1, 0, 1]]) + Expected: Tensor([[0.27, 0.5, 0.73]]) + + HINTS: + - np.exp(-x.data) computes e^(-x) for each element + - 1 / (1 + np.exp(-x.data)) applies the full sigmoid formula + - This squashes any input to the range (0, 1) """ raise NotImplementedError("Student implementation required") def __call__(self, x: Tensor) -> Tensor: + """Make activation callable: sigmoid(x) same as sigmoid.forward(x)""" return self.forward(x) # %% @@ -390,19 +372,8 @@ class Sigmoid: """Sigmoid Activation: f(x) = 1 / (1 + e^(-x))""" def forward(self, x: Tensor) -> Tensor: - """Apply Sigmoid with numerical stability""" - # Use the numerically stable version to avoid overflow - # For x >= 0: sigmoid(x) = 1 / (1 + exp(-x)) - # For x < 0: sigmoid(x) = exp(x) / (1 + exp(x)) - x_data = x.data - result = np.zeros_like(x_data) - - # Stable computation - positive_mask = x_data >= 0 - result[positive_mask] = 1.0 / (1.0 + np.exp(-x_data[positive_mask])) - result[~positive_mask] = np.exp(x_data[~positive_mask]) / (1.0 + np.exp(x_data[~positive_mask])) - - return Tensor(result) + """Apply Sigmoid: f(x) = 1 / (1 + e^(-x))""" + return Tensor(1 / (1 + np.exp(-x.data))) def __call__(self, x: Tensor) -> Tensor: return self.forward(x) @@ -410,210 +381,104 @@ class Sigmoid: # %% [markdown] """ ### ๐Ÿงช Test Your Sigmoid Function - -Once you implement Sigmoid above, run this cell to test it: """ # %% # Test Sigmoid function +print("Testing Sigmoid function...") + try: - print("=== Testing Sigmoid Function ===") - - # Test data: mix of positive, negative, and zero - x = Tensor([[-5.0, -1.0, 0.0, 1.0, 5.0]]) - print(f"Input: {x.data}") 
+ # Test data: mix of negative, zero, and positive + x = Tensor([[-3.0, -1.0, 0.0, 1.0, 3.0]]) + print(f"โœ… Input: {x.data}") # Test Sigmoid sigmoid = Sigmoid() y = sigmoid(x) - print(f"Sigmoid output: {y.data}") - print("Expected: values between 0 and 1") - print(f"All values in (0,1)? {np.all((y.data > 0) & (y.data < 1))}") + print(f"โœ… Sigmoid output: {y.data}") + + # Verify key properties + assert np.all(y.data > 0), "โŒ Sigmoid should always be positive!" + assert np.all(y.data < 1), "โŒ Sigmoid should always be less than 1!" + assert np.isclose(y.data[0, 2], 0.5, atol=0.01), "โŒ Sigmoid(0) should be 0.5!" + print("โœ… Sigmoid properties verified!") # Test specific values - x_zero = Tensor([[0.0]]) - y_zero = sigmoid(x_zero) - print(f"\nSigmoid(0) = {y_zero.data[0, 0]:.4f} (should be 0.5)") - - # Test extreme values (numerical stability) - x_extreme = Tensor([[-100.0, 100.0]]) - y_extreme = sigmoid(x_extreme) - print(f"Sigmoid([-100, 100]) = {y_extreme.data}") - print("Should be close to [0, 1] without overflow errors") - - print("โœ… Sigmoid working!") + expected_approx = np.array([[0.05, 0.27, 0.5, 0.73, 0.95]]) + assert np.allclose(y.data, expected_approx, atol=0.1), "โŒ Sigmoid values don't match expected!" + print("๐ŸŽ‰ Sigmoid works correctly!") except Exception as e: print(f"โŒ Error: {e}") - print("Make sure to implement the Sigmoid function above!") + print("Make sure to implement Sigmoid above!") # %% [markdown] """ -### ๐Ÿ“Š Visualize Sigmoid Function +## Step 4: Tanh Activation Function -Let's plot the Sigmoid function to see its S-shaped curve: -""" +**Tanh** (Hyperbolic Tangent) is a centered version of sigmoid that outputs values between -1 and 1. 
-# %% -# Plot Sigmoid function -try: - print("=== Plotting Sigmoid Function ===") - - # Create a range of input values - x_range = np.linspace(-10, 10, 100) - x_tensor = Tensor([x_range]) - - # Apply Sigmoid (student implementation) - sigmoid = Sigmoid() - y_tensor = sigmoid(x_tensor) - y_range = y_tensor.data[0] - - # Create ideal Sigmoid for comparison - y_ideal = 1.0 / (1.0 + np.exp(-x_range)) - - # Only show plots if we're not in a testing environment - if _should_show_plots(): - # Create the plot - plt.figure(figsize=(12, 8)) - - # Plot both student implementation and ideal - plt.subplot(2, 2, 1) - plt.plot(x_range, y_range, 'g-', linewidth=3, label='Your Sigmoid Implementation') - plt.plot(x_range, y_ideal, 'r--', linewidth=2, alpha=0.7, label='Ideal Sigmoid') - plt.axhline(y=0.5, color='orange', linestyle='--', alpha=0.5, label='y = 0.5') - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.axhline(y=1, color='k', linestyle='-', alpha=0.3) - plt.axvline(x=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('Output') - plt.title('Sigmoid: Your Implementation vs Ideal') - plt.grid(True, alpha=0.3) - plt.legend() - plt.xlim(-10, 10) - plt.ylim(-0.1, 1.1) - - # Mathematical explanation plot - plt.subplot(2, 2, 2) - # Show key points - x_key = np.array([-5, -2, -1, 0, 1, 2, 5]) - y_key = 1.0 / (1.0 + np.exp(-x_key)) - plt.stem(x_key, y_key, basefmt=' ', linefmt='orange', markerfmt='o') - plt.axhline(y=0.5, color='orange', linestyle='--', alpha=0.5) - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.axhline(y=1, color='k', linestyle='-', alpha=0.3) - plt.axvline(x=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('1/(1+e^(-x))') - plt.title('Mathematical Definition: 1/(1+e^(-x))') - plt.grid(True, alpha=0.3) - plt.xlim(-6, 6) - plt.ylim(-0.1, 1.1) - - # Show the S-curve properties - plt.subplot(2, 2, 3) - x_detailed = np.linspace(-8, 8, 200) - y_detailed = 1.0 / (1.0 + np.exp(-x_detailed)) 
- plt.plot(x_detailed, y_detailed, 'g-', linewidth=3) - # Add asymptotes - plt.axhline(y=0, color='r', linestyle='--', alpha=0.7, label='Lower asymptote: y = 0') - plt.axhline(y=1, color='r', linestyle='--', alpha=0.7, label='Upper asymptote: y = 1') - plt.axhline(y=0.5, color='orange', linestyle='--', alpha=0.7, label='Midpoint: y = 0.5') - plt.axvline(x=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('Output') - plt.title('S-Curve Properties') - plt.grid(True, alpha=0.3) - plt.legend() - plt.xlim(-8, 8) - plt.ylim(-0.1, 1.1) - - # Error analysis - plt.subplot(2, 2, 4) - difference = np.abs(y_range - y_ideal) - max_error = np.max(difference) - plt.plot(x_range, difference, 'purple', linewidth=2) - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('|Your Output - Ideal Output|') - plt.title(f'Implementation Error (Max: {max_error:.6f})') - plt.grid(True, alpha=0.3) - plt.xlim(-10, 10) - - plt.tight_layout() - plt.show() - - # Print analysis - print(f"\n๐Ÿ“Š Analysis:") - print(f"โœ… Maximum error: {max_error:.10f}") - if max_error < 1e-10: - print("๐ŸŽ‰ Perfect implementation!") - elif max_error < 1e-6: - print("๐ŸŒŸ Excellent implementation!") - elif max_error < 1e-3: - print("๐Ÿ‘ Good implementation!") - else: - print("๐Ÿ”ง Implementation needs work.") - - print(f"๐Ÿ“ˆ Function properties:") - print(f" โ€ข Range: (0, 1)") - print(f" โ€ข Symmetric around (0, 0.5)") - print(f" โ€ข Smooth and differentiable everywhere") - print(f" โ€ข Saturates for large |x| (vanishing gradient problem)") - print(f" โ€ข Useful for binary classification (outputs probabilities)") - else: - print("๐Ÿ“Š Plots disabled during testing - this is normal!") - - # Always show the mathematical analysis - difference = np.abs(y_range - y_ideal) - max_error = np.max(difference) - print(f"\n๐Ÿ“Š Mathematical Analysis:") - print(f"โœ… Maximum error: {max_error:.10f}") - if max_error < 1e-10: - print("๐ŸŽ‰ Perfect 
implementation!") - elif max_error < 1e-6: - print("๐ŸŒŸ Excellent implementation!") - elif max_error < 1e-3: - print("๐Ÿ‘ Good implementation!") - else: - print("๐Ÿ”ง Implementation needs work.") - -except Exception as e: - print(f"โŒ Error in plotting: {e}") - print("Make sure to implement the Sigmoid function above!") +### What is Tanh? +- **Formula**: `f(x) = (e^x - e^(-x)) / (e^x + e^(-x))` +- **Behavior**: Smoothly transforms any real number to (-1, 1) +- **Range**: (-1, 1) - centered around zero -# %% [markdown] -""" -## Step 3: Tanh Activation Function +### Why Tanh Matters +- **Centered**: Output is centered around zero (unlike sigmoid) +- **Zero-centered**: Better for gradient flow in deep networks +- **Smooth**: Continuous and differentiable everywhere +- **Bounded**: Output is always between -1 and 1 -**Tanh** (Hyperbolic Tangent) squashes inputs to the range (-1, 1). +### Real-World Analogy +Think of Tanh as a **centered probability converter**: +- Takes any input (positive or negative) +- Converts it to a value between -1 and 1 +- Like a confidence score that can be positive or negative -**Formula**: `f(x) = tanh(x) = (e^x - e^(-x)) / (e^x + e^(-x))` +### Visual Example +``` +Input: [-3, -1, 0, 1, 3] +Tanh: [-0.99, -0.76, 0, 0.76, 0.99] +``` -**Properties**: -- **Bounded**: Always outputs between -1 and 1 -- **Zero-centered**: Output is centered around 0 -- **Smooth**: Differentiable everywhere -- **Stronger gradients**: Than sigmoid around zero +### The Math Behind It +Tanh is related to sigmoid: `tanh(x) = 2 * sigmoid(2x) - 1` +- For large positive x: f(x) โ‰ˆ 1 +- For large negative x: f(x) โ‰ˆ -1 +- For x = 0: f(x) = 0 -**When to use**: Hidden layers when you want zero-centered outputs, RNNs. - -**Advantage over Sigmoid**: Zero-centered outputs help with gradient flow. +Let's implement it! 
""" # %% #| export class Tanh: """ - Tanh Activation: f(x) = tanh(x) + Tanh Activation: f(x) = (e^x - e^(-x)) / (e^x + e^(-x)) - Squashes input to range (-1, 1). Zero-centered output. + Centered version of sigmoid that outputs values in (-1, 1). + Better for gradient flow in deep networks. TODO: Implement Tanh activation function. + + APPROACH: + 1. Extract the numpy array from the input tensor + 2. Apply the tanh formula using numpy's tanh function + 3. Return a new Tensor with the result + + EXAMPLE: + Input: Tensor([[-2, 0, 2]]) + Output: Tensor([[-0.96, 0, 0.96]]) + + HINTS: + - Use x.data to get the numpy array + - Use np.tanh(x.data) for the tanh function + - Return Tensor(result) to wrap the result """ def forward(self, x: Tensor) -> Tensor: """ - Apply Tanh: f(x) = tanh(x) + Apply Tanh: f(x) = (e^x - e^(-x)) / (e^x + e^(-x)) Args: x: Input tensor @@ -621,22 +486,36 @@ class Tanh: Returns: Output tensor with Tanh applied element-wise - TODO: Implement tanh function - Hint: Use np.tanh(x.data) + TODO: Implement the tanh function + + STEP-BY-STEP: + 1. Get the numpy array: data = x.data + 2. Apply tanh: result = np.tanh(data) + 3. 
Return Tensor(result) + + EXAMPLE: + Input: Tensor([[-1, 0, 1]]) + Expected: Tensor([[-0.76, 0, 0.76]]) + + HINTS: + - np.tanh(x.data) computes tanh for each element + - This squashes any input to the range (-1, 1) + - The output is centered around zero """ raise NotImplementedError("Student implementation required") def __call__(self, x: Tensor) -> Tensor: + """Make activation callable: tanh(x) same as tanh.forward(x)""" return self.forward(x) # %% #| hide #| export class Tanh: - """Tanh Activation: f(x) = tanh(x)""" + """Tanh Activation: f(x) = (e^x - e^(-x)) / (e^x + e^(-x))""" def forward(self, x: Tensor) -> Tensor: - """Apply Tanh""" + """Apply Tanh: f(x) = (e^x - e^(-x)) / (e^x + e^(-x))""" return Tensor(np.tanh(x.data)) def __call__(self, x: Tensor) -> Tensor: @@ -645,195 +524,54 @@ class Tanh: # %% [markdown] """ ### ๐Ÿงช Test Your Tanh Function - -Once you implement Tanh above, run this cell to test it: """ # %% # Test Tanh function +print("Testing Tanh function...") + try: - print("=== Testing Tanh Function ===") - - # Test data: mix of positive, negative, and zero + # Test data: mix of negative, zero, and positive x = Tensor([[-3.0, -1.0, 0.0, 1.0, 3.0]]) - print(f"Input: {x.data}") + print(f"โœ… Input: {x.data}") # Test Tanh tanh = Tanh() y = tanh(x) - print(f"Tanh output: {y.data}") - print("Expected: values between -1 and 1") - print(f"All values in (-1,1)? {np.all((y.data > -1) & (y.data < 1))}") + print(f"โœ… Tanh output: {y.data}") + + # Verify key properties + assert np.all(y.data >= -1), "โŒ Tanh should always be >= -1!" + assert np.all(y.data <= 1), "โŒ Tanh should always be <= 1!" + assert np.isclose(y.data[0, 2], 0.0, atol=0.01), "โŒ Tanh(0) should be 0!" 
+ print("โœ… Tanh properties verified!") # Test specific values - x_zero = Tensor([[0.0]]) - y_zero = tanh(x_zero) - print(f"\nTanh(0) = {y_zero.data[0, 0]:.4f} (should be 0.0)") - - # Test extreme values - x_extreme = Tensor([[-10.0, 10.0]]) - y_extreme = tanh(x_extreme) - print(f"Tanh([-10, 10]) = {y_extreme.data}") - print("Should be close to [-1, 1]") - - print("โœ… Tanh working!") + expected_approx = np.array([[-0.99, -0.76, 0.0, 0.76, 0.99]]) + assert np.allclose(y.data, expected_approx, atol=0.1), "โŒ Tanh values don't match expected!" + print("๐ŸŽ‰ Tanh works correctly!") except Exception as e: print(f"โŒ Error: {e}") - print("Make sure to implement the Tanh function above!") + print("Make sure to implement Tanh above!") # %% [markdown] """ -### ๐Ÿ“Š Visualize Tanh Function +## Step 5: Comparing Activation Functions -Let's plot the Tanh function to see its zero-centered S-shaped curve: +Now let's compare all three activation functions to understand their differences and when to use each one. 
""" # %% -# Plot Tanh function +# Compare activation functions +print("Comparing activation functions...") + try: - print("=== Plotting Tanh Function ===") + # Test data + x = Tensor([[-3.0, -1.0, 0.0, 1.0, 3.0]]) + print(f"โœ… Input: {x.data}") - # Create a range of input values - x_range = np.linspace(-5, 5, 100) - x_tensor = Tensor([x_range]) - - # Apply Tanh (student implementation) - tanh = Tanh() - y_tensor = tanh(x_tensor) - y_range = y_tensor.data[0] - - # Create ideal Tanh for comparison - y_ideal = np.tanh(x_range) - - # Only show plots if we're not in a testing environment - if _should_show_plots(): - # Create the plot - plt.figure(figsize=(12, 8)) - - # Plot both student implementation and ideal - plt.subplot(2, 2, 1) - plt.plot(x_range, y_range, 'orange', linewidth=3, label='Your Tanh Implementation') - plt.plot(x_range, y_ideal, 'r--', linewidth=2, alpha=0.7, label='Ideal Tanh') - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.axhline(y=1, color='k', linestyle='--', alpha=0.3) - plt.axhline(y=-1, color='k', linestyle='--', alpha=0.3) - plt.axvline(x=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('Output') - plt.title('Tanh: Your Implementation vs Ideal') - plt.grid(True, alpha=0.3) - plt.legend() - plt.xlim(-5, 5) - plt.ylim(-1.2, 1.2) - - # Mathematical explanation plot - plt.subplot(2, 2, 2) - # Show key points - x_key = np.array([-3, -2, -1, 0, 1, 2, 3]) - y_key = np.tanh(x_key) - plt.stem(x_key, y_key, basefmt=' ', linefmt='purple', markerfmt='o') - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.axhline(y=1, color='k', linestyle='--', alpha=0.3) - plt.axhline(y=-1, color='k', linestyle='--', alpha=0.3) - plt.axvline(x=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('tanh(x)') - plt.title('Mathematical Definition: tanh(x)') - plt.grid(True, alpha=0.3) - plt.xlim(-4, 4) - plt.ylim(-1.2, 1.2) - - # Show symmetry property - plt.subplot(2, 2, 3) - x_sym = 
np.linspace(-4, 4, 100) - y_sym = np.tanh(x_sym) - plt.plot(x_sym, y_sym, 'orange', linewidth=3, label='tanh(x)') - plt.plot(-x_sym, -y_sym, 'b--', linewidth=2, alpha=0.7, label='-tanh(-x)') - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.axhline(y=1, color='r', linestyle='--', alpha=0.7, label='Upper asymptote: y = 1') - plt.axhline(y=-1, color='r', linestyle='--', alpha=0.7, label='Lower asymptote: y = -1') - plt.axvline(x=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('Output') - plt.title('Symmetry: tanh(-x) = -tanh(x)') - plt.grid(True, alpha=0.3) - plt.legend() - plt.xlim(-4, 4) - plt.ylim(-1.2, 1.2) - - # Error analysis - plt.subplot(2, 2, 4) - difference = np.abs(y_range - y_ideal) - max_error = np.max(difference) - plt.plot(x_range, difference, 'purple', linewidth=2) - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('|Your Output - Ideal Output|') - plt.title(f'Implementation Error (Max: {max_error:.6f})') - plt.grid(True, alpha=0.3) - plt.xlim(-5, 5) - - plt.tight_layout() - plt.show() - - # Print analysis - print(f"\n๐Ÿ“Š Analysis:") - print(f"โœ… Maximum error: {max_error:.10f}") - if max_error < 1e-10: - print("๐ŸŽ‰ Perfect implementation!") - elif max_error < 1e-6: - print("๐ŸŒŸ Excellent implementation!") - elif max_error < 1e-3: - print("๐Ÿ‘ Good implementation!") - else: - print("๐Ÿ”ง Implementation needs work.") - - print(f"๐Ÿ“ˆ Function properties:") - print(f" โ€ข Range: (-1, 1)") - print(f" โ€ข Odd function: tanh(-x) = -tanh(x)") - print(f" โ€ข Symmetric around origin (0, 0)") - print(f" โ€ข Smooth and differentiable everywhere") - print(f" โ€ข Stronger gradients than sigmoid around zero") - print(f" โ€ข Related to sigmoid: tanh(x) = 2*sigmoid(2x) - 1") - else: - print("๐Ÿ“Š Plots disabled during testing - this is normal!") - - # Always show the mathematical analysis - difference = np.abs(y_range - y_ideal) - max_error = np.max(difference) - 
print(f"\n📊 Mathematical Analysis:") - print(f"✅ Maximum error: {max_error:.10f}") - if max_error < 1e-10: - print("🎉 Perfect implementation!") - elif max_error < 1e-6: - print("🌟 Excellent implementation!") - elif max_error < 1e-3: - print("👍 Good implementation!") - else: - print("🔧 Implementation needs work.") - -except Exception as e: - print(f"❌ Error in plotting: {e}") - print("Make sure to implement the Tanh function above!") - -# %% [markdown] -""" -## Step 4: Compare All Activation Functions - -Let's see how all three functions behave on the same input: -""" - -# %% -# Compare all activation functions -try: - print("=== Comparing All Activation Functions ===") - - # Test data: range from -5 to 5 - x = Tensor([[-5.0, -2.0, -1.0, 0.0, 1.0, 2.0, 5.0]]) - print(f"Input: {x.data}") - - # Apply all activations + # Apply all three activations relu = ReLU() sigmoid = Sigmoid() tanh = Tanh() @@ -842,338 +580,136 @@ try: y_sigmoid = sigmoid(x) y_tanh = tanh(x) - print(f"\nReLU: {y_relu.data}") - print(f"Sigmoid: {y_sigmoid.data}") - print(f"Tanh: {y_tanh.data}") + print(f"✅ ReLU: {y_relu.data}") + print(f"✅ Sigmoid: {y_sigmoid.data}") + print(f"✅ Tanh: {y_tanh.data}") - print("\n📊 Key Differences:") - print("- ReLU: Zeros out negative values, unbounded positive") - print("- Sigmoid: Squashes to (0, 1), always positive") - print("- Tanh: Squashes to (-1, 1), zero-centered") + print("\n💡 Key Differences:") + print(" ReLU: [0, ∞) - unbounded, sparse") + print(" Sigmoid: (0, 1) - bounded, always positive") + print(" Tanh: (-1, 1) - bounded, centered") - print("\n✅ All activation functions working!") + print("\n🎉 All activation functions working!") except Exception as e: print(f"❌ Error: {e}") - print("Make sure to implement all activation functions above!") # %% [markdown] """ -### 📊 Comprehensive Activation Function Comparison +## Step 6: Understanding When to Use Each Activation -Let's plot all three functions together to see 
their differences: -""" +### ReLU - The Default Choice +**Use ReLU for:** +- Hidden layers in most neural networks +- When you want computational efficiency +- When you want sparse representations +- When you want to avoid vanishing gradients -# %% -# Plot all activation functions together -try: - print("=== Plotting All Activation Functions Together ===") - - # Create a range of input values - x_range = np.linspace(-5, 5, 100) - x_tensor = Tensor([x_range]) - - # Apply all activations (student implementations) - relu = ReLU() - sigmoid = Sigmoid() - tanh = Tanh() - - y_relu = relu(x_tensor).data[0] - y_sigmoid = sigmoid(x_tensor).data[0] - y_tanh = tanh(x_tensor).data[0] - - # Create ideal functions for comparison - y_relu_ideal = np.maximum(0, x_range) - y_sigmoid_ideal = 1.0 / (1.0 + np.exp(-x_range)) - y_tanh_ideal = np.tanh(x_range) - - # Only show plots if we're not in a testing environment - if _should_show_plots(): - # Create the comprehensive plot - plt.figure(figsize=(15, 10)) - - # Main comparison plot - plt.subplot(2, 3, (1, 2)) - plt.plot(x_range, y_relu, 'b-', linewidth=3, label='Your ReLU') - plt.plot(x_range, y_sigmoid, 'g-', linewidth=3, label='Your Sigmoid') - plt.plot(x_range, y_tanh, 'orange', linewidth=3, label='Your Tanh') - - # Add ideal functions as dashed lines - plt.plot(x_range, y_relu_ideal, 'b--', linewidth=1, alpha=0.7, label='Ideal ReLU') - plt.plot(x_range, y_sigmoid_ideal, 'g--', linewidth=1, alpha=0.7, label='Ideal Sigmoid') - plt.plot(x_range, y_tanh_ideal, '--', color='orange', linewidth=1, alpha=0.7, label='Ideal Tanh') - - # Add reference lines - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.axhline(y=1, color='k', linestyle='--', alpha=0.3) - plt.axhline(y=-1, color='k', linestyle='--', alpha=0.3) - plt.axvline(x=0, color='k', linestyle='-', alpha=0.3) - - # Formatting - plt.xlabel('Input (x)', fontsize=12) - plt.ylabel('Output f(x)', fontsize=12) - plt.title('Activation Functions: Your Implementation vs Ideal', 
fontsize=14, fontweight='bold') - plt.grid(True, alpha=0.3) - plt.legend(fontsize=10, loc='upper left') - plt.xlim(-5, 5) - plt.ylim(-1.5, 5) - - # Mathematical definitions - plt.subplot(2, 3, 3) - plt.text(0.05, 0.95, 'Mathematical Definitions:', fontsize=12, fontweight='bold', - transform=plt.gca().transAxes, verticalalignment='top') - plt.text(0.05, 0.85, 'ReLU:', fontsize=11, fontweight='bold', color='blue', - transform=plt.gca().transAxes, verticalalignment='top') - plt.text(0.05, 0.80, 'f(x) = max(0, x)', fontsize=10, fontfamily='monospace', - transform=plt.gca().transAxes, verticalalignment='top') - plt.text(0.05, 0.70, 'Sigmoid:', fontsize=11, fontweight='bold', color='green', - transform=plt.gca().transAxes, verticalalignment='top') - plt.text(0.05, 0.65, 'f(x) = 1/(1+e^(-x))', fontsize=10, fontfamily='monospace', - transform=plt.gca().transAxes, verticalalignment='top') - plt.text(0.05, 0.55, 'Tanh:', fontsize=11, fontweight='bold', color='orange', - transform=plt.gca().transAxes, verticalalignment='top') - plt.text(0.05, 0.50, 'f(x) = tanh(x)', fontsize=10, fontfamily='monospace', - transform=plt.gca().transAxes, verticalalignment='top') - plt.text(0.05, 0.45, ' = (e^x-e^(-x))/(e^x+e^(-x))', fontsize=10, fontfamily='monospace', - transform=plt.gca().transAxes, verticalalignment='top') - - plt.text(0.05, 0.30, 'Key Properties:', fontsize=12, fontweight='bold', - transform=plt.gca().transAxes, verticalalignment='top') - plt.text(0.05, 0.25, 'โ€ข ReLU: Sparse, unbounded', fontsize=10, color='blue', - transform=plt.gca().transAxes, verticalalignment='top') - plt.text(0.05, 0.20, 'โ€ข Sigmoid: Bounded (0,1)', fontsize=10, color='green', - transform=plt.gca().transAxes, verticalalignment='top') - plt.text(0.05, 0.15, 'โ€ข Tanh: Zero-centered (-1,1)', fontsize=10, color='orange', - transform=plt.gca().transAxes, verticalalignment='top') - plt.axis('off') - - # Error analysis for ReLU - plt.subplot(2, 3, 4) - error_relu = np.abs(y_relu - y_relu_ideal) - 
plt.plot(x_range, error_relu, 'b-', linewidth=2) - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('Error') - plt.title(f'ReLU Error (Max: {np.max(error_relu):.2e})') - plt.grid(True, alpha=0.3) - plt.xlim(-5, 5) - - # Error analysis for Sigmoid - plt.subplot(2, 3, 5) - error_sigmoid = np.abs(y_sigmoid - y_sigmoid_ideal) - plt.plot(x_range, error_sigmoid, 'g-', linewidth=2) - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('Error') - plt.title(f'Sigmoid Error (Max: {np.max(error_sigmoid):.2e})') - plt.grid(True, alpha=0.3) - plt.xlim(-5, 5) - - # Error analysis for Tanh - plt.subplot(2, 3, 6) - error_tanh = np.abs(y_tanh - y_tanh_ideal) - plt.plot(x_range, error_tanh, 'orange', linewidth=2) - plt.axhline(y=0, color='k', linestyle='-', alpha=0.3) - plt.xlabel('Input (x)') - plt.ylabel('Error') - plt.title(f'Tanh Error (Max: {np.max(error_tanh):.2e})') - plt.grid(True, alpha=0.3) - plt.xlim(-5, 5) - - plt.tight_layout() - plt.show() - - # Comprehensive analysis - print("\n๐Ÿ“Š Comprehensive Analysis:") - print("=" * 60) - - # Function ranges - print("๐Ÿ“ˆ Output Ranges:") - print(f" ReLU: [{np.min(y_relu):.3f}, {np.max(y_relu):.3f}]") - print(f" Sigmoid: [{np.min(y_sigmoid):.3f}, {np.max(y_sigmoid):.3f}]") - print(f" Tanh: [{np.min(y_tanh):.3f}, {np.max(y_tanh):.3f}]") - - # Implementation accuracy - print("\n๐ŸŽฏ Implementation Accuracy:") - max_errors = [np.max(error_relu), np.max(error_sigmoid), np.max(error_tanh)] - functions = ['ReLU', 'Sigmoid', 'Tanh'] - - for func, error in zip(functions, max_errors): - if error < 1e-10: - status = "โœ… PERFECT" - elif error < 1e-6: - status = "โœ… EXCELLENT" - elif error < 1e-3: - status = "โš ๏ธ GOOD" - else: - status = "โŒ NEEDS WORK" - print(f" {func:8s}: {status:12s} (error: {error:.2e})") - - # Mathematical properties verification - print("\n๐Ÿ” Mathematical Properties:") - - # Zero-centered test - x_zero = Tensor([[0.0]]) - 
print(" Zero-centered test (f(0) should be 0):") - for name, func in [("ReLU", relu), ("Sigmoid", sigmoid), ("Tanh", tanh)]: - output = func(x_zero).data[0, 0] - is_zero = abs(output) < 1e-6 - expected = 0.0 if name != "Sigmoid" else 0.5 - print(f" {name:8s}: f(0) = {output:.4f} {'✅' if abs(output - expected) < 1e-6 else '❌'}") - - # Monotonicity test - print(" Monotonicity test (should be increasing):") - test_vals = np.array([-2, -1, 0, 1, 2]) - x_test = Tensor([test_vals]) - for name, func in [("ReLU", relu), ("Sigmoid", sigmoid), ("Tanh", tanh)]: - outputs = func(x_test).data[0] - is_monotonic = np.all(outputs[1:] >= outputs[:-1]) - print(f" {name:8s}: {'✅ Monotonic' if is_monotonic else '❌ Not monotonic'}") - - print("\n🎉 Comparison complete! Use these insights to understand each function's role in neural networks.") - else: - print("📊 Plots disabled during testing - this is normal!") - -except Exception as e: - print(f"❌ Error in plotting: {e}") - print("Make sure matplotlib is installed and all functions are implemented!") +**Example**: `Dense → ReLU → Dense → ReLU → Dense` -# %% [markdown] -""" -## Step 5: Understanding Activation Function Properties +### Sigmoid - Probability Outputs +**Use Sigmoid for:** +- Binary classification outputs (0 or 1) +- When you need probability interpretation +- When you need outputs between 0 and 1 -Let's explore the mathematical properties of each function: -""" +**Example**: `Dense → ReLU → Dense → Sigmoid` (binary classifier) -# %% -# Explore activation function properties -try: - print("=== Activation Function Properties ===") - - # Create test functions - relu = ReLU() - sigmoid = Sigmoid() - tanh = Tanh() - - # Test with a range of values - test_values = np.linspace(-5, 5, 11) - x = Tensor([test_values]) - - print(f"Input range: {test_values}") - print(f"ReLU range: [{np.min(relu(x).data):.2f}, {np.max(relu(x).data):.2f}]") - print(f"Sigmoid range: [{np.min(sigmoid(x).data):.2f}, 
{np.max(sigmoid(x).data):.2f}]") - print(f"Tanh range: [{np.min(tanh(x).data):.2f}, {np.max(tanh(x).data):.2f}]") - - # Test monotonicity (should all be increasing functions) - print(f"\n📈 Monotonicity Test:") - for name, func in [("ReLU", relu), ("Sigmoid", sigmoid), ("Tanh", tanh)]: - outputs = func(x).data[0] - is_monotonic = np.all(outputs[1:] >= outputs[:-1]) - print(f"{name}: {'✅ Monotonic' if is_monotonic else '❌ Not monotonic'}") - - # Test zero-centered property - print(f"\n🎯 Zero-Centered Test (f(0) = 0):") - x_zero = Tensor([[0.0]]) - for name, func in [("ReLU", relu), ("Sigmoid", sigmoid), ("Tanh", tanh)]: - output = func(x_zero).data[0, 0] - is_zero_centered = abs(output) < 1e-6 - print(f"{name}: f(0) = {output:.4f} {'✅ Zero-centered' if is_zero_centered else '❌ Not zero-centered'}") - - print("\n🎉 Property analysis complete!") - -except Exception as e: - print(f"❌ Error: {e}") - print("Check your activation function implementations!") +### Tanh - Centered Outputs +**Use Tanh for:** +- When you want outputs centered around zero +- When you want better gradient flow +- When you need outputs between -1 and 1 -# %% [markdown] -""" -## Step 6: Practical Usage Examples +**Example**: `Dense → Tanh → Dense → Tanh` (centered features) -Let's see how these functions would be used in practice: -""" - -# %% -# Practical usage examples -try: - print("=== Practical Usage Examples ===") - - # Example 1: Binary classification with sigmoid - print("1. Binary Classification (Sigmoid):") - logits = Tensor([[2.5, -1.2, 0.8, -0.3]]) # Raw network outputs - sigmoid = Sigmoid() - probabilities = sigmoid(logits) - print(f" Logits: {logits.data}") - print(f" Probabilities: {probabilities.data}") - print(f" Predictions: {(probabilities.data > 0.5).astype(int)}") - - # Example 2: Feature processing with ReLU - print("\n2. 
Feature Processing (ReLU):") - features = Tensor([[-0.5, 1.2, -2.1, 0.8, -0.1]]) # Mixed positive/negative - relu = ReLU() - processed = relu(features) - print(f" Raw features: {features.data}") - print(f" After ReLU: {processed.data}") - print(f" Sparsity: {np.mean(processed.data == 0):.1%} zeros") - - # Example 3: Normalized features with Tanh - print("\n3. Normalized Features (Tanh):") - raw_features = Tensor([[3.2, -1.8, 0.5, -2.4, 1.1]]) - tanh = Tanh() - normalized = tanh(raw_features) - print(f" Raw features: {raw_features.data}") - print(f" Normalized: {normalized.data}") - print(f" Mean: {np.mean(normalized.data):.3f} (close to 0)") - - print("\nโœ… Practical examples complete!") - -except Exception as e: - print(f"โŒ Error: {e}") - print("Check your activation function implementations!") - -# %% [markdown] -""" -## ๐ŸŽ‰ Congratulations! - -You've successfully implemented the three most important activation functions in deep learning! - -### ๐Ÿงฑ What You Built -1. **ReLU**: The workhorse activation that enables deep networks -2. **Sigmoid**: The probability activation for binary classification -3. 
**Tanh**: The zero-centered activation for better gradient flow - -### 🎯 Key Insights -- **Nonlinearity is essential**: Without activations, neural networks are just linear transformations -- **Different functions serve different purposes**: ReLU for hidden layers, Sigmoid for probabilities, Tanh for zero-centered outputs -- **Mathematical properties matter**: Monotonicity, boundedness, and zero-centering affect learning - -### 🚀 What's Next -These activation functions will be used in: -- **Layers Module**: Building neural network layers -- **Loss Functions**: Computing training objectives -- **Advanced Architectures**: CNNs, RNNs, and more - -### 🔧 Export to Package -Run this to export your activations to the TinyTorch package: -```bash -python bin/tito.py sync +### Visual Comparison ``` - -Then test your implementation: -```bash -python bin/tito.py test --module activations +Input: [-2, -1, 0, 1, 2] +ReLU: [0, 0, 0, 1, 2] (sparse, unbounded) +Sigmoid: [0.12, 0.27, 0.50, 0.73, 0.88] (smooth, 0-1) +Tanh: [-0.96, -0.76, 0, 0.76, 0.96] (smooth, -1 to 1) ``` +""" -**Excellent work! 
You've mastered the mathematical foundations of neural networks!** 🎉 +# %% +# Demonstrate activation usage patterns +print("Demonstrating activation usage patterns...") ---- +try: + # Create a simple network with different activations + from tinytorch.core.layers import Dense + + # Binary classification network + network = [ + Dense(input_size=3, output_size=4), + ReLU(), # Hidden layer + Dense(input_size=4, output_size=1), + Sigmoid() # Output layer (probability) + ] + + # Test input + x = Tensor([[1.0, 2.0, 3.0]]) + print(f"✅ Input: {x}") + + # Forward pass + current = x + for i, layer in enumerate(network): + current = layer(current) + print(f"✅ After layer {i+1} ({type(layer).__name__}): {current}") + + print("\n💡 This network could classify inputs as 0 or 1!") + print(" The final Sigmoid output is a probability between 0 and 1.") + +except Exception as e: + print(f"❌ Error: {e}") + print("Make sure your activations and layers are working!") -## 📚 Further Reading +# %% [markdown] +""" +## 🎯 Module Summary -**Want to learn more about activation functions?** -- **ReLU variants**: Leaky ReLU, ELU, Swish -- **Advanced activations**: GELU, Mish, SiLU -- **Activation choice**: When to use which function -- **Gradient flow**: How activations affect training +Congratulations! 
You've built the foundation of neural network nonlinearity: -**Next modules**: Layers, Loss Functions, Optimization -""" \ No newline at end of file +### What You've Accomplished +✅ **ReLU Activation**: Simple, efficient, and widely used +✅ **Sigmoid Activation**: Smooth probability converter +✅ **Tanh Activation**: Centered version for better gradients +✅ **Activation Comparison**: Understanding when to use each +✅ **Real-world Usage**: Seeing activations in networks + +### Key Concepts You've Learned +- **Activation functions** add nonlinearity to neural networks +- **ReLU** is the default choice for hidden layers +- **Sigmoid** is used for probability outputs +- **Tanh** is used when you need centered outputs +- **Nonlinearity** is essential for learning complex patterns + +### What's Next +In the next modules, you'll build on this foundation: +- **Layers**: Combine activations with linear transformations +- **Networks**: Compose layers and activations into architectures +- **Training**: Learn parameters using gradients and optimization +- **Applications**: Solve real problems with neural networks + +### Real-World Connection +Your activation functions are now ready to: +- Add nonlinearity to neural network layers +- Enable learning of complex patterns +- Provide appropriate outputs for different tasks +- Integrate with the rest of the TinyTorch ecosystem + +**Ready for the next challenge?** Let's move on to building layers that combine linear transformations with your activation functions! +""" + +# %% +# Final verification +print("\n" + "="*50) +print("🎉 ACTIVATIONS MODULE COMPLETE!") +print("="*50) +print("✅ ReLU activation function") +print("✅ Sigmoid activation function") +print("✅ Tanh activation function") +print("✅ Activation comparison and usage") +print("✅ Real-world network integration") +print("\n🚀 Ready to build layers in the next module!") \ No newline at end of file