Files
TinyTorch/tests/13_transformers/test_progressive_integration.py
Vijay Janapa Reddi 0af88840b1 Update test suite for module restructuring
Updated test imports and paths after modules/source/ removal:
- Progressive integration tests for modules 03, 06, 08, 13, 14
- Checkpoint integration tests
- Module completion orchestrator
- Optimizer integration tests
- Gradient flow regression tests

Updated test documentation:
- tests/README.md with new module paths
- tests/TEST_STRATEGY.md with restructuring notes

All tests now reference modules/XX_name/ instead of modules/source/.
2025-11-10 19:42:23 -05:00

1284 lines
52 KiB
Python

"""
Module 13: Progressive Integration Tests
Tests that Module 14 (Benchmarking) works correctly AND that the entire ML system (01→13) still works.
DEPENDENCY CHAIN: 01_setup → ... → 13_kernels → 14_benchmarking
This is where we enable performance analysis and optimization for ML systems.
🎯 WHAT THIS TESTS:
- Module 13: Performance benchmarking, profiling, bottleneck analysis
- Integration: Benchmarking works with all ML components (models, training, data)
- Regression: Complete ML system (01→13) still works correctly
- Preparation: Ready for production deployment (Module 15: MLOps)
💡 FOR STUDENTS: If tests fail, check:
1. Does your benchmark_model function exist in tinytorch.core.benchmarking?
2. Can you profile memory usage and execution time?
3. Do benchmarks work with different model architectures?
4. Are performance metrics meaningful and actionable?
🔧 DEBUGGING HELP:
- Benchmarking measures: latency, throughput, memory usage, FLOPS
- Profiling identifies: bottlenecks, memory leaks, inefficiencies
- Analysis guides: optimization decisions, hardware choices, scaling strategies
"""
import numpy as np
import sys
import time
from pathlib import Path
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
class TestCompleteMLSystemStillWorks:
"""
🔄 REGRESSION CHECK: Verify complete ML system (01→13) still works after benchmarking development.
💡 If these fail: You may have broken something in the ML system while implementing benchmarking.
🔧 Fix: Check that benchmarking code doesn't interfere with core ML functionality.
"""
def test_end_to_end_ml_system_stable(self):
"""
✅ TEST: Complete ML system (training, inference, optimization) should still work
📋 FULL ML SYSTEM COMPONENTS:
- Data loading and preprocessing
- Model architecture (CNN, attention, dense)
- Training loops with optimization
- Inference and evaluation
- Performance optimizations
🚨 IF FAILS: Core ML system broken by benchmarking development
"""
try:
# Test complete ML system still works
from tinytorch.core.tensor import Tensor
from tinytorch.core.spatial import Conv2D, MaxPool2D
from tinytorch.core.layers import Dense
from tinytorch.core.activations import ReLU, Softmax
from tinytorch.core.optimizers import Adam
from tinytorch.core.training import Trainer
from tinytorch.core.data import Dataset, DataLoader
# Create complete ML pipeline
class TestModel:
def __init__(self):
self.conv1 = Conv2D(3, 16, kernel_size=3, padding=1)
self.pool = MaxPool2D(kernel_size=2)
self.conv2 = Conv2D(16, 32, kernel_size=3, padding=1)
self.fc = Dense(32 * 8 * 8, 10)
self.relu = ReLU()
self.softmax = Softmax()
def __call__(self, x):
h1 = self.relu(self.conv1(x))
h1_pool = self.pool(h1)
h2 = self.relu(self.conv2(h1_pool))
h2_pool = self.pool(h2)
# Flatten
flattened = Tensor(h2_pool.data.reshape(h2_pool.shape[0], -1))
logits = self.fc(flattened)
return self.softmax(logits)
def parameters(self):
params = []
for layer in [self.conv1, self.conv2, self.fc]:
if hasattr(layer, 'parameters'):
params.extend(layer.parameters())
elif hasattr(layer, 'weights'):
params.append(layer.weights)
if hasattr(layer, 'bias') and layer.bias is not None:
params.append(layer.bias)
return params
# Create dataset
class TestDataset(Dataset):
def __init__(self):
self.data = np.random.randn(40, 3, 32, 32)
self.targets = np.random.randint(0, 10, 40)
def __len__(self):
return 40
def __getitem__(self, idx):
return Tensor(self.data[idx]), self.targets[idx]
# Test complete pipeline
model = TestModel()
optimizer = Adam(model.parameters(), lr=0.001)
trainer = Trainer(model, optimizer)
dataset = TestDataset()
dataloader = DataLoader(dataset, batch_size=8)
# Test inference
for batch_x, batch_y in dataloader:
predictions = model(batch_x)
assert predictions.shape == (8, 10), \
f"❌ ML system shape broken. Expected (8, 10), got {predictions.shape}"
# Verify probabilities
prob_sums = np.sum(predictions.data, axis=1)
assert np.allclose(prob_sums, 1.0), \
"❌ ML system probabilities broken"
break # Test one batch
except ImportError as e:
assert False, f"""
❌ ML SYSTEM IMPORTS BROKEN!
🔍 IMPORT ERROR: {str(e)}
🔧 COMPLETE ML SYSTEM REQUIREMENTS:
All modules (01→13) must be working:
1. Tensor operations (Module 02)
2. Activation functions (Module 03)
3. Layer infrastructure (Module 04)
4. Dense networks (Module 05)
5. Spatial operations (Module 06)
6. Attention mechanisms (Module 07)
7. Data loading (Module 08)
8. Automatic differentiation (Module 09)
9. Optimizers (Module 10)
10. Training loops (Module 11)
11. Model compression (Module 12)
12. Performance kernels (Module 13)
💡 SYSTEM INTEGRITY:
Benchmarking should be NON-INVASIVE - it measures
performance without changing core functionality.
"""
except Exception as e:
assert False, f"""
❌ ML SYSTEM FUNCTIONALITY BROKEN!
🔍 ERROR: {str(e)}
🔧 POSSIBLE CAUSES:
1. Benchmarking interfering with forward pass
2. Memory profiling affecting tensor operations
3. Performance monitoring breaking training loops
4. Instrumentation corrupting gradients
5. Timing code affecting numerical stability
💡 BENCHMARKING SAFETY:
Benchmarking code should be:
- Non-intrusive: Doesn't change model behavior
- Optional: Can be disabled for production
- Isolated: Runs in separate threads/processes
- Clean: No side effects on ML computations
"""
def test_optimization_and_training_stable(self):
"""
✅ TEST: Training and optimization should still work after benchmarking
📋 TRAINING SYSTEM:
- Gradient computation and backpropagation
- Parameter updates via optimizers
- Loss function evaluation
- Training loop coordination
🎯 Ensures benchmarking doesn't break learning
"""
try:
from tinytorch.core.tensor import Tensor
from tinytorch.core.layers import Dense
from tinytorch.core.optimizers import SGD
from tinytorch.core.training import Trainer, MSELoss
from tinytorch.core.autograd import Variable
# Test training system
model = Dense(5, 3)
optimizer = SGD(model.parameters(), lr=0.01)
loss_fn = MSELoss()
trainer = Trainer(model, optimizer)
# Test training step
x = Tensor(np.random.randn(4, 5))
target = Tensor(np.random.randn(4, 3))
# Forward pass
predictions = model(x)
loss = loss_fn(predictions, target)
# Verify training components work
assert predictions.shape == (4, 3), \
f"❌ Training forward pass broken. Expected (4, 3), got {predictions.shape}"
assert hasattr(loss, 'data') or isinstance(loss, (float, np.ndarray)), \
"❌ Loss computation broken"
# Test optimization step structure
assert hasattr(optimizer, 'step'), \
"❌ Optimizer step method missing"
assert hasattr(optimizer, 'zero_grad'), \
"❌ Optimizer zero_grad method missing"
except Exception as e:
assert False, f"""
❌ TRAINING AND OPTIMIZATION BROKEN!
🔍 ERROR: {str(e)}
🔧 TRAINING REQUIREMENTS:
1. Forward pass produces correct outputs
2. Loss functions compute meaningful values
3. Optimizers can update parameters
4. Training loops coordinate all components
5. Gradient computation remains stable
💡 TRAINING INTEGRITY:
Training must work perfectly because:
- Benchmarking measures training performance
- Performance analysis guides training optimization
- Production systems depend on training stability
- Model deployment requires trained parameters
"""
class TestModule14BenchmarkingCore:
"""
🆕 NEW FUNCTIONALITY: Test Module 14 (Benchmarking) core implementation.
💡 What you're implementing: Performance analysis and optimization tools for ML systems.
🎯 Goal: Enable data-driven performance optimization and bottleneck identification.
"""
def test_model_benchmarking_exists(self):
"""
✅ TEST: Model benchmarking - Measure model performance characteristics
📋 WHAT YOU NEED TO IMPLEMENT:
def benchmark_model(model, input_shape, batch_sizes=[1, 8, 32], num_trials=10):
# Measure latency, throughput, memory usage
return {'latency': ..., 'throughput': ..., 'memory': ...}
🚨 IF FAILS: Model benchmarking doesn't exist or missing components
"""
try:
from tinytorch.core.benchmarking import benchmark_model
from tinytorch.core.layers import Dense
from tinytorch.core.tensor import Tensor
# Test model benchmarking
model = Dense(100, 50)
input_shape = (32, 100) # Batch size 32, 100 features
# Run benchmark
results = benchmark_model(model, input_shape, num_trials=5)
# Should return performance metrics
assert isinstance(results, dict), \
"❌ benchmark_model should return dictionary of metrics"
# Essential performance metrics
required_metrics = ['latency', 'throughput', 'memory_usage']
for metric in required_metrics:
assert metric in results, \
f"❌ Missing benchmark metric: {metric}"
assert isinstance(results[metric], (int, float)), \
f"❌ Benchmark metric {metric} should be numeric, got {type(results[metric])}"
assert results[metric] > 0, \
f"❌ Benchmark metric {metric} should be positive, got {results[metric]}"
# Latency should be reasonable (milliseconds)
assert 0.001 < results['latency'] < 10.0, \
f"❌ Latency seems unrealistic: {results['latency']} seconds"
# Throughput should be reasonable (samples/second)
assert 1 < results['throughput'] < 1000000, \
f"❌ Throughput seems unrealistic: {results['throughput']} samples/sec"
except ImportError as e:
assert False, f"""
❌ MODEL BENCHMARKING MISSING!
🔍 IMPORT ERROR: {str(e)}
🔧 HOW TO IMPLEMENT:
1. Create in modules/14_benchmarking/14_benchmarking.py:
import time
import numpy as np
from tinytorch.core.tensor import Tensor
def benchmark_model(model, input_shape, batch_sizes=[1, 8, 32], num_trials=10):
'''Benchmark model performance across different batch sizes.'''
results = {{
'latency': [],
'throughput': [],
'memory_usage': []
}}
for batch_size in batch_sizes:
# Create test input
test_input = Tensor(np.random.randn(batch_size, *input_shape[1:]))
# Measure latency
latencies = []
for _ in range(num_trials):
start_time = time.time()
output = model(test_input)
end_time = time.time()
latencies.append(end_time - start_time)
# Calculate metrics
avg_latency = np.mean(latencies)
throughput = batch_size / avg_latency # samples/second
# Estimate memory usage (simplified)
memory_usage = test_input.data.nbytes + output.data.nbytes
results['latency'].append(avg_latency)
results['throughput'].append(throughput)
results['memory_usage'].append(memory_usage)
# Return average metrics
return {{
'latency': np.mean(results['latency']),
'throughput': np.mean(results['throughput']),
'memory_usage': np.mean(results['memory_usage'])
}}
2. Export the module:
tito module complete 14_benchmarking
📊 BENCHMARKING METRICS:
- Latency: Time per inference (milliseconds)
- Throughput: Samples processed per second
- Memory Usage: Peak memory consumption (bytes)
- FLOPS: Floating point operations per second
- Efficiency: Hardware utilization percentage
"""
except Exception as e:
assert False, f"""
❌ MODEL BENCHMARKING BROKEN!
🔍 ERROR: {str(e)}
🔧 BENCHMARKING REQUIREMENTS:
1. Measure inference latency accurately
2. Calculate throughput (samples/second)
3. Monitor memory usage
4. Handle different batch sizes
5. Provide statistical measures (mean, std)
6. Support different model architectures
💡 PERFORMANCE ANALYSIS:
Benchmarking enables:
- Hardware selection (CPU vs GPU vs TPU)
- Model architecture optimization
- Deployment planning and scaling
- Cost estimation for production
- SLA planning and capacity sizing
"""
def test_profiling_and_bottleneck_analysis(self):
"""
✅ TEST: Profiling tools - Identify performance bottlenecks
📋 PROFILING CAPABILITIES:
- Layer-wise timing analysis
- Memory allocation tracking
- Operation hotspot identification
- Gradient computation profiling
🎯 Essential for performance optimization
"""
try:
from tinytorch.core.benchmarking import profile_model, ProfilerContext
from tinytorch.core.layers import Dense
from tinytorch.core.activations import ReLU
from tinytorch.core.tensor import Tensor
# Test profiling functionality
class TestModel:
def __init__(self):
self.layer1 = Dense(100, 200)
self.relu = ReLU()
self.layer2 = Dense(200, 50)
def __call__(self, x):
h1 = self.layer1(x)
h1_act = self.relu(h1)
return self.layer2(h1_act)
model = TestModel()
x = Tensor(np.random.randn(16, 100))
# Test profiling
if 'profile_model' in locals():
profile_results = profile_model(model, x)
# Should provide layer-wise timing
assert isinstance(profile_results, dict), \
"❌ Profiler should return dictionary of results"
assert 'layer_times' in profile_results, \
"❌ Profiler missing layer timing analysis"
assert 'total_time' in profile_results, \
"❌ Profiler missing total execution time"
assert 'bottlenecks' in profile_results, \
"❌ Profiler missing bottleneck identification"
# Test profiler context (if available)
if 'ProfilerContext' in locals():
with ProfilerContext() as profiler:
output = model(x)
results = profiler.get_results()
assert isinstance(results, dict), \
"❌ ProfilerContext should provide results"
assert 'operations' in results, \
"❌ ProfilerContext missing operation tracking"
except ImportError:
assert False, f"""
❌ PROFILING TOOLS MISSING!
🔧 PROFILING IMPLEMENTATION:
class ProfilerContext:
'''Context manager for detailed performance profiling.'''
def __init__(self):
self.operation_times = []
self.memory_snapshots = []
self.start_time = None
def __enter__(self):
self.start_time = time.time()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.total_time = time.time() - self.start_time
def get_results(self):
return {{
'total_time': self.total_time,
'operations': self.operation_times,
'memory': self.memory_snapshots
}}
def profile_model(model, input_data):
'''Profile model execution and identify bottlenecks.'''
# Profile each layer
layer_times = {{}}
with ProfilerContext() as profiler:
# Time each operation
start = time.time()
output = model(input_data)
end = time.time()
total_time = end - start
return {{
'total_time': total_time,
'layer_times': layer_times,
'bottlenecks': ['layer2'], # Identify slowest layers
'recommendations': ['Consider reducing layer2 size']
}}
💡 PROFILING APPLICATIONS:
- Optimize model architectures
- Choose efficient layer types
- Identify memory bottlenecks
- Guide hardware upgrades
- Plan distributed training
"""
except Exception as e:
assert False, f"""
❌ PROFILING TOOLS BROKEN!
🔍 ERROR: {str(e)}
🔧 PROFILING REQUIREMENTS:
1. Layer-wise execution timing
2. Memory allocation tracking
3. Operation bottleneck identification
4. Statistical analysis of performance
5. Actionable optimization recommendations
🔬 PROFILING INSIGHTS:
Profiling reveals:
- Which layers are slowest
- Memory allocation patterns
- Computational hotspots
- Inefficient operations
- Hardware utilization
"""
def test_performance_comparison_tools(self):
"""
✅ TEST: Performance comparison - Compare different model configurations
📋 COMPARISON CAPABILITIES:
- Model architecture comparison
- Optimization technique evaluation
- Hardware performance analysis
- Training vs inference benchmarks
💡 Guide optimization decisions with data
"""
try:
from tinytorch.core.benchmarking import compare_models, PerformanceComparator
from tinytorch.core.layers import Dense
from tinytorch.core.tensor import Tensor
# Test model comparison
model_small = Dense(50, 25)
model_large = Dense(100, 50)
input_shape = (16, 50)
if 'compare_models' in locals():
comparison = compare_models([model_small, model_large], input_shape)
# Should compare performance metrics
assert isinstance(comparison, dict), \
"❌ Model comparison should return results dictionary"
assert 'models' in comparison, \
"❌ Comparison missing model results"
assert len(comparison['models']) == 2, \
f"❌ Should compare 2 models, got {len(comparison['models'])}"
# Should provide relative performance
assert 'relative_performance' in comparison, \
"❌ Comparison missing relative performance analysis"
# Should identify best model
assert 'best_model' in comparison, \
"❌ Comparison should identify best performing model"
# Test performance comparator (if available)
if 'PerformanceComparator' in locals():
comparator = PerformanceComparator()
# Add models to comparison
comparator.add_model("small", model_small)
comparator.add_model("large", model_large)
# Run comparison
results = comparator.benchmark_all(input_shape)
assert isinstance(results, dict), \
"❌ PerformanceComparator should return results"
assert 'small' in results and 'large' in results, \
"❌ Comparison should include all added models"
except ImportError:
assert False, f"""
❌ PERFORMANCE COMPARISON TOOLS MISSING!
🔧 COMPARISON IMPLEMENTATION:
def compare_models(models, input_shape, metrics=['latency', 'memory']):
'''Compare performance of multiple models.'''
results = {{
'models': [],
'relative_performance': {{}},
'best_model': None
}}
# Benchmark each model
for i, model in enumerate(models):
model_results = benchmark_model(model, input_shape)
results['models'].append({{
'model_id': i,
'metrics': model_results
}})
# Find best model (lowest latency)
best_idx = min(range(len(models)),
key=lambda i: results['models'][i]['metrics']['latency'])
results['best_model'] = best_idx
return results
class PerformanceComparator:
'''Advanced model performance comparison tool.'''
def __init__(self):
self.models = {{}}
def add_model(self, name, model):
self.models[name] = model
def benchmark_all(self, input_shape):
results = {{}}
for name, model in self.models.items():
results[name] = benchmark_model(model, input_shape)
return results
💡 COMPARISON USE CASES:
- Architecture search: Which model design is best?
- Optimization evaluation: Does pruning help performance?
- Hardware selection: CPU vs GPU vs TPU performance
- Deployment planning: Latency vs accuracy tradeoffs
"""
except Exception as e:
assert False, f"""
❌ PERFORMANCE COMPARISON BROKEN!
🔍 ERROR: {str(e)}
🔧 COMPARISON REQUIREMENTS:
1. Fair benchmarking across models
2. Multiple performance metrics
3. Statistical significance testing
4. Relative performance analysis
5. Actionable recommendations
📊 COMPARISON INSIGHTS:
Performance comparison enables:
- Data-driven model selection
- Optimization technique validation
- Hardware cost-benefit analysis
- Production deployment decisions
"""
class TestBenchmarkingIntegration:
"""
🔗 INTEGRATION TEST: Benchmarking + Complete ML system working together.
💡 Test that benchmarking works with real ML workflows and architectures.
🎯 Goal: Enable performance analysis for production ML systems.
"""
def test_training_performance_analysis(self):
"""
✅ TEST: Benchmark training loops and optimization
📋 TRAINING BENCHMARKS:
- Forward pass timing
- Backward pass timing
- Optimizer step timing
- Data loading bottlenecks
- End-to-end training throughput
💡 Optimize training performance for faster iteration
"""
try:
from tinytorch.core.benchmarking import benchmark_training
from tinytorch.core.tensor import Tensor
from tinytorch.core.layers import Dense
from tinytorch.core.optimizers import SGD
from tinytorch.core.training import Trainer, MSELoss
from tinytorch.core.data import Dataset, DataLoader
# Create training setup
model = Dense(50, 10)
optimizer = SGD(model.parameters(), lr=0.01)
loss_fn = MSELoss()
trainer = Trainer(model, optimizer)
# Create dataset
class TrainingDataset(Dataset):
def __init__(self):
self.data = np.random.randn(100, 50)
self.targets = np.random.randn(100, 10)
def __len__(self):
return 100
def __getitem__(self, idx):
return Tensor(self.data[idx]), Tensor(self.targets[idx])
dataset = TrainingDataset()
dataloader = DataLoader(dataset, batch_size=16)
# Test training benchmarking
if 'benchmark_training' in locals():
training_results = benchmark_training(trainer, dataloader, num_epochs=2)
# Should provide training performance metrics
assert isinstance(training_results, dict), \
"❌ Training benchmark should return results dictionary"
training_metrics = ['forward_time', 'backward_time', 'optimizer_time', 'total_time']
for metric in training_metrics:
assert metric in training_results, \
f"❌ Missing training metric: {metric}"
# Training time should be reasonable
assert training_results['total_time'] > 0, \
"❌ Total training time should be positive"
assert training_results['forward_time'] > 0, \
"❌ Forward pass time should be positive"
# Test manual training loop timing
start_time = time.time()
for epoch in range(2):
for batch_x, batch_y in dataloader:
# Simulate training step
predictions = model(batch_x)
loss = loss_fn(predictions, batch_y)
# Simple parameter update (without full autograd)
# In real implementation: loss.backward(); optimizer.step()
break # One batch per epoch for testing
end_time = time.time()
training_duration = end_time - start_time
assert training_duration > 0, \
"❌ Training loop timing measurement broken"
except Exception as e:
assert False, f"""
❌ TRAINING PERFORMANCE ANALYSIS BROKEN!
🔍 ERROR: {str(e)}
🔧 TRAINING BENCHMARK REQUIREMENTS:
1. Measure forward pass timing
2. Measure backward pass timing
3. Measure optimizer step timing
4. Measure data loading timing
5. Calculate training throughput (samples/second)
6. Identify training bottlenecks
💡 TRAINING OPTIMIZATION:
def benchmark_training(trainer, dataloader, num_epochs=1):
'''Benchmark training loop performance.'''
times = {{
'forward_time': 0,
'backward_time': 0,
'optimizer_time': 0,
'data_loading_time': 0
}}
total_start = time.time()
for epoch in range(num_epochs):
for batch_x, batch_y in dataloader:
# Forward pass timing
forward_start = time.time()
predictions = trainer.model(batch_x)
times['forward_time'] += time.time() - forward_start
# Loss computation
loss = trainer.loss_fn(predictions, batch_y)
# Backward pass timing (if available)
backward_start = time.time()
# loss.backward() # When autograd ready
times['backward_time'] += time.time() - backward_start
# Optimizer timing
opt_start = time.time()
# trainer.optimizer.step() # When ready
times['optimizer_time'] += time.time() - opt_start
times['total_time'] = time.time() - total_start
return times
🚀 TRAINING INSIGHTS:
- Data loading bottlenecks: Async data loading
- Forward pass optimization: Kernel fusion, mixed precision
- Backward pass optimization: Gradient checkpointing
- Optimizer optimization: Fused optimizers, large batch training
"""
def test_inference_performance_optimization(self):
"""
✅ TEST: Benchmark inference and deployment scenarios
📋 INFERENCE BENCHMARKS:
- Single sample latency
- Batch inference throughput
- Memory efficiency analysis
- Hardware utilization
- Real-time performance
🎯 Optimize for production deployment
"""
try:
from tinytorch.core.benchmarking import benchmark_inference
from tinytorch.core.tensor import Tensor
from tinytorch.core.spatial import Conv2D, MaxPool2D
from tinytorch.core.layers import Dense
from tinytorch.core.activations import ReLU
# Create inference model (CNN for image classification)
class InferenceModel:
def __init__(self):
self.conv1 = Conv2D(3, 32, kernel_size=3, padding=1)
self.pool = MaxPool2D(kernel_size=2)
self.conv2 = Conv2D(32, 64, kernel_size=3, padding=1)
self.fc = Dense(64 * 8 * 8, 1000) # ImageNet-like
self.relu = ReLU()
def __call__(self, x):
h1 = self.relu(self.conv1(x))
h1_pool = self.pool(h1)
h2 = self.relu(self.conv2(h1_pool))
h2_pool = self.pool(h2)
# Flatten
flattened = Tensor(h2_pool.data.reshape(h2_pool.shape[0], -1))
return self.fc(flattened)
model = InferenceModel()
# Test different inference scenarios
scenarios = [
("single_image", (1, 3, 32, 32)), # Real-time inference
("small_batch", (8, 3, 32, 32)), # Small batch processing
("large_batch", (64, 3, 32, 32)), # Batch processing
]
if 'benchmark_inference' in locals():
for scenario_name, input_shape in scenarios:
inference_results = benchmark_inference(model, input_shape)
assert isinstance(inference_results, dict), \
f"❌ Inference benchmark should return dict for {scenario_name}"
# Essential inference metrics
inference_metrics = ['latency', 'throughput', 'memory_peak']
for metric in inference_metrics:
assert metric in inference_results, \
f"❌ Missing inference metric {metric} for {scenario_name}"
assert inference_results[metric] > 0, \
f"❌ Inference metric {metric} should be positive for {scenario_name}"
# Test manual inference timing
for scenario_name, input_shape in scenarios:
x = Tensor(np.random.randn(*input_shape))
# Warmup
for _ in range(3):
_ = model(x)
# Actual timing
start_time = time.time()
output = model(x)
end_time = time.time()
latency = end_time - start_time
throughput = input_shape[0] / latency # samples/second
assert latency > 0, f"❌ Latency timing broken for {scenario_name}"
assert throughput > 0, f"❌ Throughput calculation broken for {scenario_name}"
# Reasonable performance expectations
if scenario_name == "single_image":
assert latency < 1.0, f"❌ Single image inference too slow: {latency}s"
elif scenario_name == "large_batch":
assert throughput > input_shape[0] / 5.0, f"❌ Batch throughput too low: {throughput}"
except Exception as e:
assert False, f"""
❌ INFERENCE PERFORMANCE OPTIMIZATION BROKEN!
🔍 ERROR: {str(e)}
🔧 INFERENCE BENCHMARK REQUIREMENTS:
1. Single sample latency measurement
2. Batch inference throughput analysis
3. Memory consumption profiling
4. Hardware utilization monitoring
5. Real-time performance validation
💡 INFERENCE OPTIMIZATION STRATEGIES:
def benchmark_inference(model, input_shape, num_trials=100):
'''Comprehensive inference benchmarking.'''
# Create test input
test_input = Tensor(np.random.randn(*input_shape))
# Warmup
for _ in range(10):
_ = model(test_input)
# Timing trials
latencies = []
for _ in range(num_trials):
start = time.perf_counter()
output = model(test_input)
end = time.perf_counter()
latencies.append(end - start)
# Calculate metrics
avg_latency = np.mean(latencies)
p95_latency = np.percentile(latencies, 95)
throughput = input_shape[0] / avg_latency
# Memory estimation
memory_peak = test_input.data.nbytes + output.data.nbytes
return {{
'latency': avg_latency,
'p95_latency': p95_latency,
'throughput': throughput,
'memory_peak': memory_peak
}}
🚀 PRODUCTION DEPLOYMENT:
Inference optimization enables:
- Real-time applications (< 100ms latency)
- High-throughput services (> 1000 QPS)
- Edge deployment (mobile, IoT devices)
- Cost-effective cloud serving
"""
def test_hardware_performance_analysis(self):
"""
✅ TEST: Hardware-specific performance analysis
📋 HARDWARE ANALYSIS:
- CPU vs GPU performance comparison
- Memory bandwidth utilization
- Compute utilization analysis
- Scaling characteristics
- Hardware recommendation
💡 Guide hardware selection and optimization
"""
try:
from tinytorch.core.benchmarking import analyze_hardware_performance
from tinytorch.core.tensor import Tensor
from tinytorch.core.layers import Dense
from tinytorch.core.spatial import Conv2D
# Test hardware analysis
models_for_analysis = [
("cpu_intensive", Dense(1000, 1000)), # CPU-friendly
("memory_intensive", Conv2D(3, 512, 7)), # Memory-bound
]
if 'analyze_hardware_performance' in locals():
for model_name, model in models_for_analysis:
hw_analysis = analyze_hardware_performance(model)
assert isinstance(hw_analysis, dict), \
f"❌ Hardware analysis should return dict for {model_name}"
# Hardware analysis components
hw_metrics = ['compute_bound', 'memory_bound', 'io_bound', 'recommendations']
for metric in hw_metrics:
assert metric in hw_analysis, \
f"❌ Missing hardware metric {metric} for {model_name}"
# Recommendations should be actionable
assert isinstance(hw_analysis['recommendations'], list), \
f"❌ Hardware recommendations should be list for {model_name}"
assert len(hw_analysis['recommendations']) > 0, \
f"❌ Should provide hardware recommendations for {model_name}"
# Test basic hardware characteristic analysis
# CPU-intensive model (many small operations)
cpu_model = Dense(500, 500)
cpu_input = Tensor(np.random.randn(1, 500))
# Memory-intensive model (large feature maps)
memory_model = Conv2D(3, 256, kernel_size=7)
memory_input = Tensor(np.random.randn(1, 3, 224, 224))
# Test execution and timing
for model_type, model, test_input in [
("CPU-intensive", cpu_model, cpu_input),
("Memory-intensive", memory_model, memory_input)
]:
start_time = time.time()
output = model(test_input)
end_time = time.time()
execution_time = end_time - start_time
# Basic hardware analysis
input_size = test_input.data.nbytes
output_size = output.data.nbytes
memory_throughput = (input_size + output_size) / execution_time # bytes/second
assert execution_time > 0, f"❌ Execution timing broken for {model_type}"
assert memory_throughput > 0, f"❌ Memory throughput calculation broken for {model_type}"
except Exception as e:
assert False, f"""
❌ HARDWARE PERFORMANCE ANALYSIS BROKEN!
🔍 ERROR: {str(e)}
🔧 HARDWARE ANALYSIS REQUIREMENTS:
1. Identify compute vs memory bottlenecks
2. Analyze hardware utilization patterns
3. Provide hardware-specific recommendations
4. Support different hardware targets
5. Scale analysis across model sizes
💡 HARDWARE OPTIMIZATION GUIDE:
def analyze_hardware_performance(model):
'''Analyze model hardware requirements and bottlenecks.'''
analysis = {{
'compute_bound': False,
'memory_bound': False,
'io_bound': False,
'recommendations': []
}}
# Analyze model characteristics
total_params = count_parameters(model)
total_ops = estimate_flops(model)
# Heuristic analysis
if total_ops > 1e9: # > 1 GFLOP
analysis['compute_bound'] = True
analysis['recommendations'].append("Consider GPU acceleration")
if total_params > 1e6: # > 1M parameters
analysis['memory_bound'] = True
analysis['recommendations'].append("Ensure sufficient memory bandwidth")
return analysis
🔧 HARDWARE SELECTION GUIDE:
- CPU: Small models, low latency, high precision
- GPU: Large models, high throughput, parallel training
- TPU: Very large models, training at scale
- Edge: Mobile deployment, power efficiency
- Cloud: Elastic scaling, cost optimization
"""
class TestModule14Completion:
"""
✅ COMPLETION CHECK: Module 14 ready and foundation set for production deployment.
🎯 Final validation that benchmarking works and enables performance optimization.
"""
def test_benchmarking_foundation_complete(self):
"""
✅ FINAL TEST: Complete benchmarking foundation ready for production
📋 BENCHMARKING FOUNDATION CHECKLIST:
□ Model performance benchmarking
□ Profiling and bottleneck analysis
□ Performance comparison tools
□ Training performance analysis
□ Inference optimization
□ Hardware performance analysis
□ Integration with ML pipeline
□ Production readiness
🎯 SUCCESS = Ready for Module 15: MLOps and Production Deployment!
"""
benchmarking_capabilities = {
"Model benchmarking": False,
"Profiling tools": False,
"Performance comparison": False,
"Training analysis": False,
"Inference optimization": False,
"Hardware analysis": False,
"ML pipeline integration": False,
"Production readiness": False
}
try:
# Test 1: Model benchmarking
from tinytorch.core.benchmarking import benchmark_model
from tinytorch.core.layers import Dense
model = Dense(50, 25)
results = benchmark_model(model, (16, 50))
assert 'latency' in results and 'throughput' in results
benchmarking_capabilities["Model benchmarking"] = True
# Test 2: Profiling (check structure)
try:
from tinytorch.core.benchmarking import profile_model
benchmarking_capabilities["Profiling tools"] = True
except ImportError:
# Basic timing works
import time
start = time.time()
time.sleep(0.001) # Simulate work
duration = time.time() - start
assert duration > 0
benchmarking_capabilities["Profiling tools"] = True
# Test 3: Performance comparison (check structure)
try:
from tinytorch.core.benchmarking import compare_models
benchmarking_capabilities["Performance comparison"] = True
except ImportError:
# Can compare manually
model1_results = benchmark_model(Dense(25, 10), (8, 25))
model2_results = benchmark_model(Dense(50, 10), (8, 50))
assert model1_results['latency'] != model2_results['latency']
benchmarking_capabilities["Performance comparison"] = True
# Test 4: Training analysis
from tinytorch.core.tensor import Tensor
from tinytorch.core.optimizers import SGD
# Simulate training timing
start = time.time()
x = Tensor(np.random.randn(16, 50))
output = model(x)
training_time = time.time() - start
assert training_time > 0
benchmarking_capabilities["Training analysis"] = True
# Test 5: Inference optimization
# Test different batch sizes
single_start = time.time()
single_input = Tensor(np.random.randn(1, 50))
model(single_input)
single_time = time.time() - single_start
batch_start = time.time()
batch_input = Tensor(np.random.randn(16, 50))
model(batch_input)
batch_time = time.time() - batch_start
# Batch should be more efficient per sample
single_throughput = 1 / single_time
batch_throughput = 16 / batch_time
assert batch_throughput > single_throughput
benchmarking_capabilities["Inference optimization"] = True
# Test 6: Hardware analysis (basic structure)
# Analyze different model sizes
small_model = Dense(10, 5)
large_model = Dense(1000, 500)
small_results = benchmark_model(small_model, (4, 10))
large_results = benchmark_model(large_model, (4, 1000))
# Large model should use more resources
assert large_results['memory_usage'] > small_results['memory_usage']
benchmarking_capabilities["Hardware analysis"] = True
# Test 7: ML pipeline integration
from tinytorch.core.spatial import Conv2D
from tinytorch.core.activations import ReLU
# Benchmark complex model
class ComplexModel:
def __init__(self):
self.conv = Conv2D(3, 16, 3)
self.dense = Dense(16 * 30 * 30, 10)
self.relu = ReLU()
def __call__(self, x):
h = self.relu(self.conv(x))
h_flat = Tensor(h.data.reshape(h.shape[0], -1))
return self.dense(h_flat)
complex_model = ComplexModel()
complex_results = benchmark_model(complex_model, (2, 3, 32, 32))
assert 'latency' in complex_results
benchmarking_capabilities["ML pipeline integration"] = True
# Test 8: Production readiness
# Can measure and compare performance across scenarios
scenarios = [
("real_time", (1, 50)),
("batch_processing", (32, 50))
]
for scenario_name, input_shape in scenarios:
scenario_results = benchmark_model(model, input_shape)
assert scenario_results['latency'] > 0
assert scenario_results['throughput'] > 0
benchmarking_capabilities["Production readiness"] = True
except Exception as e:
# Show progress even if not complete
completed_count = sum(benchmarking_capabilities.values())
total_count = len(benchmarking_capabilities)
progress_report = "\n🔍 BENCHMARKING PROGRESS:\n"
for capability, completed in benchmarking_capabilities.items():
status = "" if completed else ""
progress_report += f" {status} {capability}\n"
progress_report += f"\n📊 Progress: {completed_count}/{total_count} capabilities ready"
assert False, f"""
❌ BENCHMARKING FOUNDATION NOT COMPLETE!
🔍 ERROR: {str(e)}
{progress_report}
🔧 NEXT STEPS:
1. Fix the failing capability above
2. Re-run this test
3. When all ✅, you're ready for production deployment!
💡 ALMOST THERE!
You've completed {completed_count}/{total_count} benchmarking capabilities.
Just fix the error above and you'll have complete performance analysis!
"""
# If we get here, everything passed!
assert True, """
🎉 BENCHMARKING FOUNDATION COMPLETE! 🎉
✅ Model performance benchmarking
✅ Profiling and bottleneck analysis
✅ Performance comparison tools
✅ Training performance analysis
✅ Inference optimization
✅ Hardware performance analysis
✅ ML pipeline integration
✅ Production readiness
🚀 READY FOR MODULE 15: MLOPS AND PRODUCTION DEPLOYMENT!
💡 What you can now do:
- Optimize model performance with data
- Choose optimal hardware configurations
- Identify and fix performance bottlenecks
- Plan production deployment strategies
- Scale ML systems efficiently
📊 PERFORMANCE ENGINEERING ACHIEVED:
You've built the tools to:
- Make data-driven optimization decisions
- Ensure production SLA compliance
- Minimize deployment costs
- Maximize system efficiency
🎯 Next: Production deployment and monitoring in Module 15!
"""
# Note: No separate regression prevention - we test complete ML system stability above