mirror of
https://github.com/MLSysBook/TinyTorch.git
synced 2026-04-28 18:00:02 -05:00
🎯 Major Accomplishments: • ✅ All 15 module dev files validated and unit tests passing • ✅ Comprehensive integration tests (11/11 pass) • ✅ All 3 examples working with PyTorch-like API (XOR, MNIST, CIFAR-10) • ✅ Training capability verified (4/4 tests pass, XOR shows 35.8% improvement) • ✅ Clean directory structure (modules/source/ → modules/) 🧹 Repository Cleanup: • Removed experimental/debug files and old logos • Deleted redundant documentation (API_SIMPLIFICATION_COMPLETE.md, etc.) • Removed empty module directories and backup files • Streamlined examples (kept modern API versions only) • Cleaned up old TinyGPT implementation (moved to examples concept) 📊 Validation Results: • Module unit tests: 15/15 ✅ • Integration tests: 11/11 ✅ • Example validation: 3/3 ✅ • Training validation: 4/4 ✅ 🔧 Key Fixes: • Fixed activations module requires_grad test • Fixed networks module layer name test (Dense → Linear) • Fixed spatial module Conv2D weights attribute issues • Updated all documentation to reflect new structure 📁 Structure Improvements: • Simplified modules/source/ → modules/ (removed unnecessary nesting) • Added comprehensive validation test suites • Created VALIDATION_COMPLETE.md and WORKING_MODULES.md documentation • Updated book structure to reflect ML evolution story 🚀 System Status: READY FOR PRODUCTION All components validated, examples working, training capability verified. Test-first approach successfully implemented and proven.
582 lines
23 KiB
Python
Generated
582 lines
23 KiB
Python
Generated
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/03_activations/activations_dev.ipynb.
|
|
|
|
# %% auto 0
|
|
__all__ = ['ReLU', 'Sigmoid', 'Tanh', 'Softmax', 'ActivationProfiler', 'benchmark_activation_suite']
|
|
|
|
# %% ../../modules/source/03_activations/activations_dev.ipynb 1
|
|
import math
|
|
import numpy as np
|
|
import os
|
|
import sys
|
|
from typing import Union, List
|
|
|
|
# Import our Tensor class - try from package first, then from local module
|
|
try:
|
|
from tinytorch.core.tensor import Tensor
|
|
except ImportError:
|
|
# For development, import from local tensor module
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '01_tensor'))
|
|
from tensor_dev import Tensor
|
|
|
|
# Import Variable for autograd support
|
|
try:
|
|
from tinytorch.core.autograd import Variable
|
|
except ImportError:
|
|
# For development, import from local autograd module
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '09_autograd'))
|
|
from autograd_dev import Variable
|
|
|
|
# %% ../../modules/source/03_activations/activations_dev.ipynb 7
|
|
class ReLU:
    """
    ReLU Activation Function: f(x) = max(0, x)

    The most popular activation function in deep learning.
    Simple, fast, and effective for most applications.
    """

    def forward(self, x):
        """
        Apply ReLU activation: f(x) = max(0, x).

        Accepts either a plain Tensor (no gradient tracking) or a
        Variable (autograd-enabled). For Variables, the returned
        Variable carries a grad_fn so backward() propagates the ReLU
        gradient: 1 where the input was > 0, 0 elsewhere.
        """
        # Variables are detected structurally: they expose both
        # `requires_grad` and `grad_fn`; plain Tensors do not.
        is_variable = hasattr(x, 'requires_grad') and hasattr(x, 'grad_fn')

        if not is_variable:
            # Plain Tensor path: elementwise max(0, x), same wrapper type out.
            return type(x)(np.maximum(0, x.data))

        # Variable path. `x.data` may itself wrap a Tensor, so unwrap
        # one more level if needed to reach the raw ndarray.
        raw = x.data.data if hasattr(x.data, 'data') else x.data
        activated = np.maximum(0, raw)

        def relu_grad_fn(grad_output):
            # Route the upstream gradient through the ReLU mask and
            # continue the backward pass into the input Variable.
            if x.requires_grad:
                mask = (raw > 0).astype(np.float32)
                x.backward(Variable(grad_output.data.data * mask))

        track = x.requires_grad
        return Variable(activated, requires_grad=track,
                        grad_fn=relu_grad_fn if track else None)

    def __call__(self, x):
        """Make the class callable: relu(x) instead of relu.forward(x)"""
        return self.forward(x)
|
|
|
|
# %% ../../modules/source/03_activations/activations_dev.ipynb 11
|
|
class Sigmoid:
    """
    Sigmoid Activation Function: f(x) = 1 / (1 + e^(-x))

    Maps any real number to the range (0, 1).
    Useful for binary classification and probability outputs.
    """

    def forward(self, x):
        """
        Apply Sigmoid activation: f(x) = 1 / (1 + e^(-x)).

        Accepts either a plain Tensor or an autograd Variable. For
        Variables, the returned Variable carries a grad_fn implementing
        the sigmoid derivative: f'(x) = f(x) * (1 - f(x)).
        Inputs are clipped to [-500, 500] before exponentiation so
        np.exp never overflows.
        """
        is_variable = hasattr(x, 'requires_grad') and hasattr(x, 'grad_fn')

        if not is_variable:
            # Plain Tensor path with the same numerical-stability clipping.
            safe_neg = np.clip(-x.data, -500, 500)
            return type(x)(1 / (1 + np.exp(safe_neg)))

        # Variable path: unwrap to the raw ndarray if x.data wraps a Tensor.
        raw = x.data.data if hasattr(x.data, 'data') else x.data
        safe_neg = np.clip(-raw, -500, 500)
        probs = 1 / (1 + np.exp(safe_neg))

        def sigmoid_grad_fn(grad_output):
            # Chain rule through sigmoid: grad * s * (1 - s), where s is
            # the forward output (reused instead of recomputing).
            if x.requires_grad:
                local_grad = probs * (1 - probs)
                x.backward(Variable(grad_output.data.data * local_grad))

        track = x.requires_grad
        return Variable(probs, requires_grad=track,
                        grad_fn=sigmoid_grad_fn if track else None)

    def __call__(self, x):
        """Make the class callable: sigmoid(x) instead of sigmoid.forward(x)"""
        return self.forward(x)
|
|
|
|
# %% ../../modules/source/03_activations/activations_dev.ipynb 15
|
|
class Tanh:
    """
    Tanh Activation Function: f(x) = (e^x - e^(-x)) / (e^x + e^(-x))

    Zero-centered activation function with range (-1, 1).
    Better gradient properties than sigmoid.
    """

    def forward(self, x):
        """
        Apply Tanh activation: f(x) = tanh(x).

        Supports both Tensor and Variable inputs with automatic
        differentiation.

        MATHEMATICAL FOUNDATION:
        - Forward: f(x) = tanh(x)
        - Backward: f'(x) = 1 - tanh²(x) = 1 - f(x)²

        Args:
            x: Tensor (no gradient tracking) or Variable (autograd).

        Returns:
            Same wrapper kind as the input, holding tanh(x). For a
            Variable with requires_grad=True, the result carries a
            grad_fn that propagates the tanh gradient on backward().
        """
        # Variables are detected structurally via their autograd attributes.
        if hasattr(x, 'requires_grad') and hasattr(x, 'grad_fn'):
            # Variable path: unwrap to the raw ndarray (x.data may wrap a Tensor).
            input_data = x.data.data if hasattr(x.data, 'data') else x.data
            # np.tanh is numerically stable across the full input range.
            output_data = np.tanh(input_data)

            def tanh_grad_fn(grad_output):
                if x.requires_grad:
                    # d/dx tanh(x) = 1 - tanh²(x); reuse the forward output.
                    tanh_grad = 1 - output_data ** 2
                    grad_input = Variable(grad_output.data.data * tanh_grad)
                    x.backward(grad_input)

            requires_grad = x.requires_grad
            return Variable(output_data, requires_grad=requires_grad,
                            grad_fn=tanh_grad_fn if requires_grad else None)
        else:
            # Plain Tensor path.
            return type(x)(np.tanh(x.data))

    def __call__(self, x):
        """Make the class callable: tanh(x) instead of tanh.forward(x)

        NOTE: the previous `x: Tensor -> Tensor` annotation was removed —
        forward() also accepts Variables, and the sibling activations
        (ReLU, Sigmoid, Softmax) leave __call__ unannotated.
        """
        return self.forward(x)
|
|
|
|
# %% ../../modules/source/03_activations/activations_dev.ipynb 19
|
|
class Softmax:
    """
    Softmax Activation Function: f(x_i) = e^(x_i) / Σ(e^(x_j))

    Converts a vector of real numbers into a probability distribution.
    Essential for multi-class classification.
    """

    def forward(self, x):
        """
        Apply Softmax activation along the last axis.

        Accepts either a plain Tensor or an autograd Variable. Uses the
        standard max-subtraction trick for numerical stability before
        exponentiation. For Variables, the grad_fn implements the
        Jacobian-vector product s * (g - sum(s * g)).
        Empty inputs are returned unchanged (copied).
        """
        is_variable = hasattr(x, 'requires_grad') and hasattr(x, 'grad_fn')

        if not is_variable:
            data = x.data
            # Empty input: nothing to normalize.
            if data.size == 0:
                return type(x)(data.copy())
            # Shift by the row max so np.exp never overflows.
            shifted = data - np.max(data, axis=-1, keepdims=True)
            exps = np.exp(shifted)
            return type(x)(exps / np.sum(exps, axis=-1, keepdims=True))

        # Variable path: unwrap to the raw ndarray if x.data wraps a Tensor.
        raw = x.data.data if hasattr(x.data, 'data') else x.data
        if raw.size == 0:
            return Variable(raw.copy(), requires_grad=x.requires_grad)

        shifted = raw - np.max(shifted_input := raw, axis=-1, keepdims=True) if False else raw - np.max(raw, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        probs = exps / np.sum(exps, axis=-1, keepdims=True)

        def softmax_grad_fn(grad_output):
            # Jacobian-vector product for softmax along the last axis:
            # grad_input = s * (g - sum(s * g)), equivalent to applying
            # ∂f_i/∂x_j = f_i * (δ_ij - f_j) row by row.
            if x.requires_grad:
                g = grad_output.data.data
                weighted = np.sum(probs * g, axis=-1, keepdims=True)
                x.backward(Variable(probs * (g - weighted)))

        track = x.requires_grad
        return Variable(probs, requires_grad=track,
                        grad_fn=softmax_grad_fn if track else None)

    def __call__(self, x):
        """Make the class callable: softmax(x) instead of softmax.forward(x)"""
        return self.forward(x)
|
|
|
|
# %% ../../modules/source/03_activations/activations_dev.ipynb 30
|
|
import time
|
|
|
|
class ActivationProfiler:
    """
    Performance profiling toolkit for activation functions.

    Helps ML engineers understand computational costs and optimize
    neural network performance for production deployment.
    """

    def __init__(self):
        # Scratch space for callers that want to record runs; the
        # profiling methods below also return their results directly.
        self.results = {}

    def time_activation(self, activation_fn, tensor, activation_name, iterations=100):
        """
        Measure the average wall-clock time of one activation call.

        Args:
            activation_fn: callable applied to `tensor` each iteration.
            tensor: input passed to the activation function.
            activation_name: label for the run (kept for interface
                compatibility; not used in the timing itself).
            iterations: number of calls to average over.

        Returns:
            Average time per call, in milliseconds (float).
        """
        # time.perf_counter() is the recommended timer for benchmarking:
        # monotonic and high-resolution, unlike time.time(), which can
        # be coarse and can jump on system clock adjustments.
        start_time = time.perf_counter()

        for _ in range(iterations):
            activation_fn(tensor)  # result intentionally discarded

        end_time = time.perf_counter()
        return (end_time - start_time) / iterations * 1000

    def compare_activations(self, tensor_size=(1000, 1000), iterations=50):
        """
        Compare performance of all activation functions.

        This function is PROVIDED to show systems analysis.
        Students run it to understand performance differences.

        Returns:
            dict mapping activation name -> average time per call (ms).
        """
        print(f"⚡ ACTIVATION PERFORMANCE COMPARISON")
        print(f"=" * 50)
        print(f"Tensor size: {tensor_size}, Iterations: {iterations}")

        # Create one shared test tensor so every activation sees identical input.
        test_tensor = Tensor(np.random.randn(*tensor_size))
        tensor_mb = test_tensor.data.nbytes / (1024 * 1024)
        print(f"Test tensor: {tensor_mb:.2f} MB")

        # Test all activation functions
        activations = {
            'ReLU': ReLU(),
            'Sigmoid': Sigmoid(),
            'Tanh': Tanh(),
            'Softmax': Softmax()
        }

        results = {}
        for name, activation_fn in activations.items():
            avg_time = self.time_activation(activation_fn, test_tensor, name, iterations)
            results[name] = avg_time
            print(f" {name:8}: {avg_time:.3f} ms")

        # Calculate speed ratios relative to the fastest activation.
        fastest_time = min(results.values())
        fastest_name = min(results, key=results.get)

        print(f"\n📊 SPEED ANALYSIS:")
        for name, time_ms in sorted(results.items(), key=lambda x: x[1]):
            speed_ratio = time_ms / fastest_time
            if name == fastest_name:
                print(f" {name:8}: {speed_ratio:.1f}x (fastest)")
            else:
                print(f" {name:8}: {speed_ratio:.1f}x slower than {fastest_name}")

        return results

    def analyze_scaling(self, activation_fn, activation_name, sizes=(100, 500, 1000)):
        """
        Analyze how activation performance scales with tensor size.

        This function is PROVIDED to demonstrate scaling patterns.
        Students use it to understand computational complexity.

        Args:
            activation_fn: activation instance to profile.
            activation_name: label used in the printed report.
            sizes: square tensor side lengths to test. (Default is now a
                tuple — the previous list default was a mutable default
                argument; behavior is unchanged since it is only read.)

        Returns:
            list of per-size dicts with keys 'size', 'elements',
            'time_ms', 'time_per_element_us'.
        """
        print(f"\n🔍 SCALING ANALYSIS: {activation_name}")
        print(f"=" * 40)

        scaling_results = []

        for size in sizes:
            test_tensor = Tensor(np.random.randn(size, size))
            avg_time = self.time_activation(activation_fn, test_tensor, activation_name, iterations=20)

            elements = size * size
            time_per_element = avg_time / elements * 1e6  # microseconds per element

            result = {
                'size': size,
                'elements': elements,
                'time_ms': avg_time,
                'time_per_element_us': time_per_element
            }
            scaling_results.append(result)

            print(f" {size}x{size}: {avg_time:.3f}ms ({time_per_element:.3f}μs/element)")

        # Compare smallest vs largest run to classify the scaling pattern.
        if len(scaling_results) >= 2:
            small = scaling_results[0]
            large = scaling_results[-1]

            size_ratio = large['size'] / small['size']
            time_ratio = large['time_ms'] / small['time_ms']

            print(f"\n📈 Scaling Pattern:")
            print(f" Size increased {size_ratio:.1f}x ({small['size']} → {large['size']})")
            print(f" Time increased {time_ratio:.1f}x")

            # Time growing ~size_ratio² means linear in element count
            # (elements grow quadratically with side length).
            if abs(time_ratio - size_ratio**2) < abs(time_ratio - size_ratio):
                print(f" Pattern: O(n^2) - linear in tensor size")
            else:
                print(f" Pattern: ~O(n) - very efficient scaling")

        return scaling_results
|
|
|
|
def benchmark_activation_suite():
    """
    Run the complete activation benchmark suite.

    Three parts: head-to-head timing of all activations, per-activation
    scaling analysis, and a memory-vs-speed comparison. Provided so
    students can observe production performance implications.

    Returns:
        The head-to-head comparison results dict from compare_activations.
    """
    profiler = ActivationProfiler()

    print("🏆 COMPREHENSIVE ACTIVATION BENCHMARK")
    print("=" * 60)

    # Part 1: head-to-head timing of all four activations.
    comparison_results = profiler.compare_activations(tensor_size=(800, 800), iterations=30)

    # Part 2: how each activation scales with tensor size.
    for activation, label in ((ReLU(), "ReLU"), (Sigmoid(), "Sigmoid"), (Tanh(), "Tanh")):
        profiler.analyze_scaling(activation, label, sizes=[200, 400, 600])

    # Part 3: memory overhead vs runtime for two representative activations.
    print("\n💾 MEMORY vs PERFORMANCE ANALYSIS:")
    print("=" * 40)

    test_tensor = Tensor(np.random.randn(500, 500))
    input_mb = test_tensor.data.nbytes / (1024 * 1024)

    for label, activation in (("ReLU", ReLU()), ("Sigmoid", Sigmoid())):
        started = time.time()
        output = activation(test_tensor)
        elapsed_ms = (time.time() - started) * 1000

        output_mb = output.data.nbytes / (1024 * 1024)

        print(f" {label}:")
        print(f" Input: {input_mb:.2f} MB")
        print(f" Output: {output_mb:.2f} MB")
        print(f" Memory overhead: {output_mb - input_mb:.2f} MB")
        print(f" Time: {elapsed_ms:.3f} ms")

    print("\n🎯 PRODUCTION INSIGHTS:")
    for insight in (
        " - ReLU is typically fastest (simple max operation)",
        " - Sigmoid/Tanh slower due to exponential calculations",
        " - All operations scale linearly with tensor size",
        " - Memory usage doubles (input + output tensors)",
        " - Choose activation based on accuracy vs speed trade-offs",
    ):
        print(insight)

    return comparison_results
|