""" Performance Tests for Module 16: Hardware Acceleration Tests whether the acceleration techniques actually provide measurable speedups over baseline implementations. Key questions: - Does blocked matrix multiplication actually improve cache performance? - How much faster is NumPy compared to naive loops? - Does the smart backend system work correctly? - Are the claimed 10-100ร— speedups realistic? """ import sys import os import time import numpy as np from pathlib import Path # Add the performance framework to path sys.path.append(str(Path(__file__).parent)) from performance_test_framework import PerformanceTestSuite, PerformanceComparator, WorkloadGenerator # Add module path sys.path.append(str(Path(__file__).parent.parent.parent / 'modules' / '16_acceleration')) try: from acceleration_dev import ( matmul_naive, matmul_blocked, matmul_numpy, OptimizedBackend, matmul ) ACCELERATION_AVAILABLE = True except ImportError: print("โŒ Module 16 acceleration tools not available") ACCELERATION_AVAILABLE = False class Module16PerformanceTests: """Test suite for Module 16 acceleration techniques.""" def __init__(self): self.suite = PerformanceTestSuite() self.comparator = PerformanceComparator() self.workloads = WorkloadGenerator() def test_naive_vs_blocked_matmul(self): """Test whether blocked matrix multiplication improves over naive loops.""" if not ACCELERATION_AVAILABLE: return "Acceleration module not available" print("๐Ÿ”„ Testing naive vs blocked matrix multiplication") # Use small matrices for naive implementation (it's very slow) size = 64 # Small enough that naive doesn't take forever A, B = self.workloads.matrix_multiply_workload(size) # Wrapper functions for testing def naive_implementation(): return matmul_naive(A, B) def blocked_implementation(): return matmul_blocked(A, B, block_size=32) # First verify results are the same try: naive_result = naive_implementation() blocked_result = blocked_implementation() numpy_result = A @ B # Check correctness naive_correct = np.allclose(naive_result, numpy_result, rtol=1e-3, atol=1e-3) blocked_correct = np.allclose(blocked_result, numpy_result, rtol=1e-3, atol=1e-3) if not naive_correct: return "Naive implementation produces incorrect results" if not blocked_correct: return "Blocked implementation produces incorrect results" except Exception as e: return f"Implementation error: {e}" # Performance comparison comparison = self.comparator.compare_implementations( naive_implementation, blocked_implementation, baseline_name="naive_matmul", optimized_name="blocked_matmul" ) # Blocked should be faster than naive (cache-friendly access) speedup_achieved = comparison.speedup > 1.2 # At least 20% improvement result = { 'correctness_naive': naive_correct, 'correctness_blocked': blocked_correct, 'speedup': comparison.speedup, 'speedup_achieved': speedup_achieved, 'naive_time_ms': comparison.baseline.mean_time_ms, 'blocked_time_ms': comparison.optimized.mean_time_ms, 'matrix_size': size } if speedup_achieved: print(f"โœ… Blocked matmul speedup achieved: {comparison.speedup:.2f}ร—") else: print(f"โŒ Blocked matmul speedup insufficient: {comparison.speedup:.2f}ร—") return comparison def test_blocked_vs_numpy_matmul(self): """Test blocked implementation against NumPy (production baseline).""" if not ACCELERATION_AVAILABLE: return "Acceleration module not available" print("๐Ÿš€ Testing blocked vs NumPy matrix multiplication") # Use medium size matrices size = 256 A, B = self.workloads.matrix_multiply_workload(size) def blocked_implementation(): return matmul_blocked(A, B, block_size=64) def numpy_implementation(): return matmul_numpy(A, B) # Verify correctness try: blocked_result = blocked_implementation() numpy_result = numpy_implementation() results_match = np.allclose(blocked_result, numpy_result, rtol=1e-3, atol=1e-3) if not results_match: return "Blocked and NumPy implementations produce different results" except Exception as e: return f"Implementation error: {e}" # Performance comparison comparison = self.comparator.compare_implementations( blocked_implementation, numpy_implementation, baseline_name="blocked_matmul", optimized_name="numpy_matmul" ) # NumPy should be significantly faster than blocked numpy_advantage = comparison.speedup > 2.0 # NumPy should be 2ร—+ faster result = { 'correctness': results_match, 'numpy_speedup': comparison.speedup, 'numpy_advantage': numpy_advantage, 'blocked_time_ms': comparison.baseline.mean_time_ms, 'numpy_time_ms': comparison.optimized.mean_time_ms, 'matrix_size': size } if numpy_advantage: print(f"โœ… NumPy dominance confirmed: {comparison.speedup:.2f}ร— faster than blocked") else: print(f"โš ๏ธ NumPy advantage lower than expected: {comparison.speedup:.2f}ร—") return comparison def test_naive_vs_numpy_full_spectrum(self): """Test the full optimization spectrum: naive โ†’ blocked โ†’ NumPy.""" if not ACCELERATION_AVAILABLE: return "Acceleration module not available" print("๐Ÿ“Š Testing full optimization spectrum") # Use very small matrix for naive (it's extremely slow) size = 32 A, B = self.workloads.matrix_multiply_workload(size) def naive_impl(): return matmul_naive(A, B) def numpy_impl(): return matmul_numpy(A, B) # Test naive vs NumPy to see full improvement comparison = self.comparator.compare_implementations( naive_impl, numpy_impl, baseline_name="naive_loops", optimized_name="numpy_optimized" ) # Should see dramatic improvement (10ร—+ claimed in module) dramatic_improvement = comparison.speedup > 5.0 result = { 'full_spectrum_speedup': comparison.speedup, 'dramatic_improvement': dramatic_improvement, 'naive_time_ms': comparison.baseline.mean_time_ms, 'numpy_time_ms': comparison.optimized.mean_time_ms, 'matrix_size': size } if dramatic_improvement: print(f"๐ŸŽ‰ Dramatic optimization achieved: {comparison.speedup:.1f}ร— improvement!") else: print(f"โš ๏ธ Full optimization less dramatic: {comparison.speedup:.1f}ร— improvement") return comparison def test_backend_system(self): """Test the smart backend dispatch system.""" if not ACCELERATION_AVAILABLE: return "Acceleration module not available" print("๐Ÿง  Testing smart backend system") size = 128 A, B = self.workloads.matrix_multiply_workload(size) # Test backend function def backend_matmul(): return matmul(A, B) def direct_numpy(): return matmul_numpy(A, B) # Verify results match try: backend_result = backend_matmul() numpy_result = direct_numpy() results_match = np.allclose(backend_result, numpy_result, rtol=1e-5, atol=1e-5) if not results_match: return "Backend system produces different results than NumPy" except Exception as e: return f"Backend system error: {e}" # Performance should be equivalent (backend uses NumPy) comparison = self.comparator.compare_implementations( backend_matmul, direct_numpy, baseline_name="backend_matmul", optimized_name="direct_numpy" ) # Backend should have minimal overhead (< 20%) low_overhead = comparison.speedup < 1.2 and comparison.speedup > 0.8 result = { 'correctness': results_match, 'overhead_factor': comparison.speedup, 'low_overhead': low_overhead, 'backend_time_ms': comparison.baseline.mean_time_ms, 'numpy_time_ms': comparison.optimized.mean_time_ms } if low_overhead: print(f"โœ… Backend overhead acceptable: {comparison.speedup:.2f}ร— factor") else: print(f"โŒ Backend overhead too high: {comparison.speedup:.2f}ร— factor") return result def test_scaling_behavior(self): """Test how optimizations scale with matrix size.""" if not ACCELERATION_AVAILABLE: return "Acceleration module not available" print("๐Ÿ“ˆ Testing optimization scaling behavior") sizes = [64, 128, 256] # Keep reasonable for testing results = {} for size in sizes: print(f" Testing size {size}ร—{size}") A, B = self.workloads.matrix_multiply_workload(size) # Compare blocked vs NumPy at this size def blocked_impl(): return matmul_blocked(A, B, block_size=min(64, size//2)) def numpy_impl(): return matmul_numpy(A, B) # Quick timing comparison (fewer runs for speed) timer = self.comparator.timer timer.measurement_runs = 10 comparison = self.comparator.compare_implementations( blocked_impl, numpy_impl, baseline_name=f"blocked_{size}", optimized_name=f"numpy_{size}" ) results[size] = { 'speedup': comparison.speedup, 'blocked_time_ms': comparison.baseline.mean_time_ms, 'numpy_time_ms': comparison.optimized.mean_time_ms } # Analyze scaling trends speedups = [results[size]['speedup'] for size in sizes] speedup_increases = all(speedups[i] <= speedups[i+1] for i in range(len(speedups)-1)) scaling_result = { 'size_results': results, 'speedup_increases_with_size': speedup_increases, 'speedups': speedups, 'sizes': sizes } print(f"Speedup scaling: {' โ†’ '.join(f'{s:.1f}ร—' for s in speedups)}") if speedup_increases: print("โœ… NumPy advantage increases with size (expected)") else: print("โš ๏ธ Inconsistent scaling behavior") return scaling_result def test_cache_blocking_effectiveness(self): """Test whether blocking actually improves cache performance.""" if not ACCELERATION_AVAILABLE: return "Acceleration module not available" print("๐Ÿ’พ Testing cache blocking effectiveness") # Test different block sizes size = 128 A, B = self.workloads.matrix_multiply_workload(size) block_sizes = [16, 32, 64, 128] block_results = {} for block_size in block_sizes: def blocked_impl(): return matmul_blocked(A, B, block_size=block_size) timer = self.comparator.timer timer.measurement_runs = 10 result = timer.measure_function(blocked_impl, name=f"block_{block_size}") block_results[block_size] = result.mean_time_ms # Find optimal block size (should be around 32-64 for typical L1 cache) optimal_block_size = min(block_results.keys(), key=lambda k: block_results[k]) performance_variation = max(block_results.values()) / min(block_results.values()) cache_result = { 'block_sizes': list(block_sizes), 'timings_ms': list(block_results.values()), 'optimal_block_size': optimal_block_size, 'performance_variation': performance_variation, 'cache_blocking_effective': performance_variation > 1.2 } print(f"Block size performance: {dict(block_results)}") print(f"Optimal block size: {optimal_block_size}") if cache_result['cache_blocking_effective']: print(f"โœ… Cache blocking shows {performance_variation:.1f}ร— variation") else: print(f"โŒ Cache blocking shows minimal impact: {performance_variation:.1f}ร— variation") return cache_result def test_ml_model_acceleration(self): """Test acceleration on realistic ML model operations.""" if not ACCELERATION_AVAILABLE: return "Acceleration module not available" print("๐Ÿค– Testing acceleration on ML model operations") # Simulate MLP forward pass batch_size = 32 input_dim = 256 hidden_dim = 128 output_dim = 64 # Create model data x = np.random.randn(batch_size, input_dim).astype(np.float32) W1 = np.random.randn(input_dim, hidden_dim).astype(np.float32) W2 = np.random.randn(hidden_dim, output_dim).astype(np.float32) def naive_mlp(): # Use naive matmul for "educational" version (very small for speed) x_small = x[:4, :32] # Much smaller for naive W1_small = W1[:32, :16] W2_small = W2[:16, :8] h1 = matmul_naive(x_small, W1_small) h1_relu = np.maximum(0, h1) output = matmul_naive(h1_relu, W2_small) return output def optimized_mlp(): h1 = matmul(x, W1) h1_relu = np.maximum(0, h1) output = matmul(h1_relu, W2) return output try: # Time both implementations timer = self.comparator.timer timer.measurement_runs = 5 # Fewer runs since naive is slow naive_result = timer.measure_function(naive_mlp, name="naive_mlp") optimized_result = timer.measure_function(optimized_mlp, name="optimized_mlp") # Compare (note: different sizes, so this is qualitative) ml_acceleration = { 'naive_time_ms': naive_result.mean_time_ms, 'optimized_time_ms': optimized_result.mean_time_ms, 'operations_comparison': "Different sizes - qualitative comparison", 'naive_much_slower': naive_result.mean_time_ms > optimized_result.mean_time_ms } if ml_acceleration['naive_much_slower']: print("โœ… ML acceleration effective - optimized version much faster") else: print("โŒ ML acceleration test inconclusive") return ml_acceleration except Exception as e: return f"ML acceleration test error: {e}" def run_module_16_performance_tests(): """Run all performance tests for Module 16.""" print("๐Ÿงช TESTING MODULE 16: HARDWARE ACCELERATION") print("=" * 60) print("Verifying that acceleration techniques provide real speedups") if not ACCELERATION_AVAILABLE: print("โŒ Cannot test Module 16 - acceleration tools not available") return test_suite = Module16PerformanceTests() tests = { 'naive_vs_blocked': test_suite.test_naive_vs_blocked_matmul, 'blocked_vs_numpy': test_suite.test_blocked_vs_numpy_matmul, 'full_spectrum': test_suite.test_naive_vs_numpy_full_spectrum, 'backend_system': test_suite.test_backend_system, 'scaling_behavior': test_suite.test_scaling_behavior, 'cache_blocking': test_suite.test_cache_blocking_effectiveness, 'ml_model_acceleration': test_suite.test_ml_model_acceleration } results = test_suite.suite.run_module_tests('module_16_acceleration', tests) # Summary print(f"\n๐Ÿ“Š MODULE 16 TEST SUMMARY") print("=" * 40) speedup_tests = [] correctness_tests = [] for test_name, result in results.items(): if hasattr(result, 'speedup'): # ComparisonResult speedup_tests.append((test_name, result.speedup, result.is_significant)) print(f"โšก {test_name}: {result.speedup:.2f}ร— speedup {'โœ…' if result.is_significant else 'โŒ'}") elif isinstance(result, dict): # Check for various success criteria success = False if 'speedup_achieved' in result: success = result['speedup_achieved'] elif 'dramatic_improvement' in result: success = result['dramatic_improvement'] elif 'low_overhead' in result: success = result['low_overhead'] elif 'cache_blocking_effective' in result: success = result['cache_blocking_effective'] correctness_tests.append((test_name, success)) print(f"๐Ÿ”ง {test_name}: {'โœ… PASS' if success else 'โŒ FAIL'}") else: print(f"โŒ {test_name}: ERROR - {result}") # Overall assessment significant_speedups = sum(1 for _, speedup, significant in speedup_tests if significant and speedup > 1.5) successful_tests = sum(1 for _, success in correctness_tests if success) total_meaningful_tests = len(speedup_tests) + len(correctness_tests) total_successes = significant_speedups + successful_tests success_rate = total_successes / total_meaningful_tests if total_meaningful_tests > 0 else 0 print(f"\nSUCCESS RATE: {success_rate:.1%} ({total_successes}/{total_meaningful_tests})") print(f"Significant speedups: {significant_speedups}/{len(speedup_tests)}") print(f"System tests passed: {successful_tests}/{len(correctness_tests)}") if success_rate >= 0.7: print("๐ŸŽ‰ Module 16 acceleration techniques are working well!") else: print("โš ๏ธ Module 16 acceleration techniques need improvement") return results if __name__ == "__main__": run_module_16_performance_tests()