""" Performance Tests for Module 20: Benchmarking Tests whether the benchmarking suite actually provides meaningful performance measurements and can drive optimization competitions. Key questions: - Does TinyMLPerf provide fair, reproducible benchmarks? - Can the benchmarking system detect real performance differences? - Do the competition metrics correlate with actual improvements? - Is the benchmarking framework scientifically sound? """ import sys import os import time import numpy as np from pathlib import Path # Add the performance framework to path sys.path.append(str(Path(__file__).parent)) from performance_test_framework import PerformanceTestSuite, PerformanceComparator, WorkloadGenerator # Add module path sys.path.append(str(Path(__file__).parent.parent.parent / 'modules' / '20_benchmarking')) try: from benchmarking_dev import TinyMLPerf BENCHMARKING_AVAILABLE = True except ImportError: print("โŒ Module 20 benchmarking tools not available") BENCHMARKING_AVAILABLE = False class Module20PerformanceTests: """Test suite for Module 20 benchmarking system.""" def __init__(self): self.suite = PerformanceTestSuite() self.comparator = PerformanceComparator() self.workloads = WorkloadGenerator() def test_benchmark_suite_loading(self): """Test whether TinyMLPerf benchmark suite loads correctly.""" if not BENCHMARKING_AVAILABLE: return "Benchmarking module not available" print("๐Ÿ“‹ Testing TinyMLPerf benchmark suite loading") try: # Initialize benchmark suite tinyperf = TinyMLPerf(profiler_warmup_runs=2, profiler_timing_runs=3) # Test available events events = tinyperf.get_available_events() expected_events = {'mlp_sprint', 'cnn_marathon', 'transformer_decathlon'} has_all_events = expected_events.issubset(set(events.keys())) # Test loading each benchmark load_results = {} for event_name in expected_events: try: model, dataset = tinyperf.load_benchmark(event_name) # Test model inference inputs = dataset['inputs'] outputs = model.predict(inputs) # Verify output shape batch_size = inputs.shape[0] output_shape_correct = outputs.shape[0] == batch_size load_results[event_name] = { 'loaded': True, 'inference_works': True, 'output_shape_correct': output_shape_correct, 'input_shape': inputs.shape, 'output_shape': outputs.shape } except Exception as e: load_results[event_name] = {'loaded': False, 'error': str(e)} all_benchmarks_work = all( result.get('loaded', False) and result.get('inference_works', False) and result.get('output_shape_correct', False) for result in load_results.values() ) loading_result = { 'has_all_events': has_all_events, 'load_results': load_results, 'all_benchmarks_work': all_benchmarks_work, 'events_available': list(events.keys()), 'suite_loading_successful': has_all_events and all_benchmarks_work } if loading_result['suite_loading_successful']: print("โœ… TinyMLPerf benchmark suite loaded successfully") print(f" Events: {', '.join(events.keys())}") else: print("โŒ TinyMLPerf benchmark suite loading issues") return loading_result except Exception as e: return f"Benchmark suite loading error: {e}" def test_benchmark_reproducibility(self): """Test whether benchmarks produce reproducible results.""" if not BENCHMARKING_AVAILABLE: return "Benchmarking module not available" print("๐Ÿ”„ Testing benchmark reproducibility") try: tinyperf = TinyMLPerf(profiler_warmup_runs=2, profiler_timing_runs=5) model, dataset = tinyperf.load_benchmark('mlp_sprint') inputs = dataset['inputs'] # Run inference multiple times results = [] for run in range(5): outputs = model.predict(inputs) 

class Module20PerformanceTests:
    """Test suite for the Module 20 benchmarking system."""

    def __init__(self):
        self.suite = PerformanceTestSuite()
        self.comparator = PerformanceComparator()
        self.workloads = WorkloadGenerator()

    def test_benchmark_suite_loading(self):
        """Test whether the TinyMLPerf benchmark suite loads correctly."""
        if not BENCHMARKING_AVAILABLE:
            return "Benchmarking module not available"

        print("📋 Testing TinyMLPerf benchmark suite loading")

        try:
            # Initialize benchmark suite
            tinyperf = TinyMLPerf(profiler_warmup_runs=2, profiler_timing_runs=3)

            # Test available events
            events = tinyperf.get_available_events()
            expected_events = {'mlp_sprint', 'cnn_marathon', 'transformer_decathlon'}
            has_all_events = expected_events.issubset(set(events.keys()))

            # Test loading each benchmark
            load_results = {}
            for event_name in expected_events:
                try:
                    model, dataset = tinyperf.load_benchmark(event_name)

                    # Test model inference
                    inputs = dataset['inputs']
                    outputs = model.predict(inputs)

                    # Verify output shape
                    batch_size = inputs.shape[0]
                    output_shape_correct = outputs.shape[0] == batch_size

                    load_results[event_name] = {
                        'loaded': True,
                        'inference_works': True,
                        'output_shape_correct': output_shape_correct,
                        'input_shape': inputs.shape,
                        'output_shape': outputs.shape
                    }
                except Exception as e:
                    load_results[event_name] = {'loaded': False, 'error': str(e)}

            all_benchmarks_work = all(
                result.get('loaded', False) and
                result.get('inference_works', False) and
                result.get('output_shape_correct', False)
                for result in load_results.values()
            )

            loading_result = {
                'has_all_events': has_all_events,
                'load_results': load_results,
                'all_benchmarks_work': all_benchmarks_work,
                'events_available': list(events.keys()),
                'suite_loading_successful': has_all_events and all_benchmarks_work
            }

            if loading_result['suite_loading_successful']:
                print("✅ TinyMLPerf benchmark suite loaded successfully")
                print(f"   Events: {', '.join(events.keys())}")
            else:
                print("❌ TinyMLPerf benchmark suite loading issues")

            return loading_result

        except Exception as e:
            return f"Benchmark suite loading error: {e}"

    def test_benchmark_reproducibility(self):
        """Test whether benchmarks produce reproducible results."""
        if not BENCHMARKING_AVAILABLE:
            return "Benchmarking module not available"

        print("🔄 Testing benchmark reproducibility")

        try:
            tinyperf = TinyMLPerf(profiler_warmup_runs=2, profiler_timing_runs=5)
            model, dataset = tinyperf.load_benchmark('mlp_sprint')
            inputs = dataset['inputs']

            # Run inference multiple times
            results = []
            for run in range(5):
                outputs = model.predict(inputs)
                results.append(outputs.copy())

            # Check whether all results are identical (they should be, with a
            # deterministic model)
            all_identical = all(np.allclose(results[0], result, rtol=1e-10, atol=1e-10)
                                for result in results[1:])

            # Check output consistency across multiple instantiations
            tinyperf2 = TinyMLPerf(profiler_warmup_runs=2, profiler_timing_runs=5)
            model2, dataset2 = tinyperf2.load_benchmark('mlp_sprint')

            # Same inputs should produce same outputs (models initialized the same way)
            outputs1 = model.predict(inputs)
            outputs2 = model2.predict(inputs)
            cross_instance_identical = np.allclose(outputs1, outputs2, rtol=1e-10, atol=1e-10)

            reproducibility_result = {
                'multiple_runs_identical': all_identical,
                'cross_instance_identical': cross_instance_identical,
                'reproducible': all_identical and cross_instance_identical
            }

            if reproducibility_result['reproducible']:
                print("✅ Benchmarks produce reproducible results")
            else:
                print("❌ Benchmark reproducibility issues")
                if not all_identical:
                    print("   Multiple runs produce different results")
                if not cross_instance_identical:
                    print("   Different instances produce different results")

            return reproducibility_result

        except Exception as e:
            return f"Reproducibility test error: {e}"
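    # Note on the identity checks in this file: np.allclose(a, b, rtol, atol)
    # tests |a - b| <= atol + rtol * |b| elementwise, so rtol = atol = 1e-10
    # demands agreement to roughly ten significant digits, which is
    # effectively exact identity for float64 model outputs.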
    def test_performance_detection(self):
        """Test whether benchmarks can detect performance differences."""
        if not BENCHMARKING_AVAILABLE:
            return "Benchmarking module not available"

        print("🔍 Testing performance difference detection")

        try:
            tinyperf = TinyMLPerf(profiler_warmup_runs=2, profiler_timing_runs=10)
            model, dataset = tinyperf.load_benchmark('mlp_sprint')
            inputs = dataset['inputs']

            # Create fast and slow versions of the same operation
            def fast_inference():
                """Standard model inference."""
                return model.predict(inputs)

            def slow_inference():
                """Artificially slowed model inference."""
                result = model.predict(inputs)
                # Add artificial delay
                time.sleep(0.001)  # 1ms delay
                return result

            # Compare performance
            comparison = self.comparator.compare_implementations(
                slow_inference, fast_inference,
                baseline_name="slow_model",
                optimized_name="fast_model"
            )

            # Should detect the artificial slowdown
            detects_difference = comparison.speedup > 1.5  # Should see significant speedup
            results_identical = np.allclose(
                slow_inference(), fast_inference(),
                rtol=1e-10, atol=1e-10
            )

            detection_result = {
                'speedup_detected': comparison.speedup,
                'detects_performance_difference': detects_difference,
                'results_remain_identical': results_identical,
                'detection_working': detects_difference and results_identical
            }

            if detection_result['detection_working']:
                print(f"✅ Performance difference detected: {comparison.speedup:.1f}× speedup")
            else:
                print(f"❌ Failed to detect performance difference: {comparison.speedup:.1f}× speedup")

            return detection_result

        except Exception as e:
            return f"Performance detection test error: {e}"

    def test_cross_event_fairness(self):
        """Test whether different benchmark events provide fair comparisons."""
        if not BENCHMARKING_AVAILABLE:
            return "Benchmarking module not available"

        print("⚖️ Testing cross-event benchmark fairness")

        try:
            tinyperf = TinyMLPerf(profiler_warmup_runs=1, profiler_timing_runs=3)

            # Test all events
            events = ['mlp_sprint', 'cnn_marathon', 'transformer_decathlon']
            event_metrics = {}

            for event in events:
                try:
                    model, dataset = tinyperf.load_benchmark(event)
                    inputs = dataset['inputs']

                    # Time inference
                    timer = self.comparator.timer
                    timer.measurement_runs = 5
                    result = timer.measure_function(
                        lambda: model.predict(inputs),
                        name=f"{event}_inference"
                    )

                    event_metrics[event] = {
                        'mean_time_ms': result.mean_time_ms,
                        'std_time_ms': result.std_time_ms,
                        'batch_size': inputs.shape[0],
                        'input_size': np.prod(inputs.shape[1:]),
                        'time_per_sample_ms': result.mean_time_ms / inputs.shape[0],
                        'measurement_stable': result.std_time_ms / result.mean_time_ms < 0.2  # CV < 20%
                    }
                except Exception as e:
                    event_metrics[event] = {'error': str(e)}

            # Check measurement stability across events
            all_stable = all(
                metrics.get('measurement_stable', False)
                for metrics in event_metrics.values()
                if 'error' not in metrics
            )

            # Check timing ranges are reasonable (different events should have
            # different performance characteristics)
            timing_ranges_reasonable = len(set(
                int(metrics['mean_time_ms'] // 10) * 10  # Floor to a 10ms bucket
                for metrics in event_metrics.values()
                if 'error' not in metrics
            )) >= 2  # At least 2 different timing buckets

            fairness_result = {
                'event_metrics': event_metrics,
                'all_measurements_stable': all_stable,
                'timing_ranges_reasonable': timing_ranges_reasonable,
                'fairness_good': all_stable and timing_ranges_reasonable
            }

            if fairness_result['fairness_good']:
                print("✅ Cross-event benchmarks provide fair comparisons")
                for event, metrics in event_metrics.items():
                    if 'error' not in metrics:
                        print(f"   {event}: {metrics['mean_time_ms']:.1f}ms ± {metrics['std_time_ms']:.1f}ms")
            else:
                print("❌ Cross-event benchmark fairness issues")

            return fairness_result

        except Exception as e:
            return f"Cross-event fairness test error: {e}"
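    # Note on the stability criterion above: the coefficient of variation is
    # CV = std / mean. For example, a mean of 10.0 ms with a std of 1.5 ms
    # gives CV = 0.15 (stable under the 20% threshold), while a std of
    # 3.0 ms gives CV = 0.30 (flagged as unstable).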
    def test_scaling_measurement(self):
        """Test whether benchmarks measure scaling behavior correctly."""
        if not BENCHMARKING_AVAILABLE:
            return "Benchmarking module not available"

        print("📈 Testing benchmark scaling measurement")

        try:
            tinyperf = TinyMLPerf(profiler_warmup_runs=1, profiler_timing_runs=3)
            model, dataset = tinyperf.load_benchmark('mlp_sprint')

            # Test different batch sizes
            base_inputs = dataset['inputs']
            batch_sizes = [25, 50, 100]

            scaling_results = {}
            for batch_size in batch_sizes:
                if batch_size <= base_inputs.shape[0]:
                    test_inputs = base_inputs[:batch_size]
                else:
                    # Repeat inputs to get a larger batch
                    repeats = (batch_size // base_inputs.shape[0]) + 1
                    test_inputs = np.tile(base_inputs, (repeats, 1))[:batch_size]

                # Time inference at this batch size; the default argument binds
                # the current test_inputs into the lambda
                timer = self.comparator.timer
                timer.measurement_runs = 5
                result = timer.measure_function(
                    lambda inputs=test_inputs: model.predict(inputs),
                    name=f"batch_{batch_size}"
                )

                scaling_results[batch_size] = {
                    'total_time_ms': result.mean_time_ms,
                    'time_per_sample_ms': result.mean_time_ms / batch_size,
                    'throughput_samples_per_sec': 1000 * batch_size / result.mean_time_ms
                }

            # Analyze scaling behavior
            times_per_sample = [scaling_results[bs]['time_per_sample_ms'] for bs in batch_sizes]
            throughputs = [scaling_results[bs]['throughput_samples_per_sec'] for bs in batch_sizes]

            # Throughput should generally increase with batch size (more efficient)
            throughput_scaling_reasonable = throughputs[-1] >= throughputs[0] * 0.8

            # Per-sample time should decrease or stay similar (batch efficiency)
            per_sample_scaling_reasonable = times_per_sample[-1] <= times_per_sample[0] * 1.2

            scaling_measurement_result = {
                'scaling_results': scaling_results,
                'times_per_sample_ms': times_per_sample,
                'throughputs_samples_per_sec': throughputs,
                'throughput_scaling_reasonable': throughput_scaling_reasonable,
                'per_sample_scaling_reasonable': per_sample_scaling_reasonable,
                'scaling_measurement_good': throughput_scaling_reasonable and per_sample_scaling_reasonable
            }

            if scaling_measurement_result['scaling_measurement_good']:
                print("✅ Benchmark scaling measurement working correctly")
                print(f"   Throughput: {throughputs[0]:.0f} → {throughputs[-1]:.0f} samples/sec")
            else:
                print("❌ Benchmark scaling measurement issues")

            return scaling_measurement_result

        except Exception as e:
            return f"Scaling measurement test error: {e}"

    def test_competition_scoring(self):
        """Test whether the competition scoring system works fairly."""
        if not BENCHMARKING_AVAILABLE:
            return "Benchmarking module not available"

        print("🏆 Testing competition scoring system")

        try:
            tinyperf = TinyMLPerf(profiler_warmup_runs=1, profiler_timing_runs=5)

            # Simulate different optimization submissions
            model, dataset = tinyperf.load_benchmark('mlp_sprint')
            inputs = dataset['inputs']

            # Create different "optimization" versions
            def baseline_submission():
                """Baseline unoptimized version."""
                return model.predict(inputs)

            def fast_submission():
                """Optimized version (simulated)."""
                result = model.predict(inputs)
                # Simulate faster execution (no added delay)
                return result

            def slow_submission():
                """Poorly optimized version."""
                result = model.predict(inputs)
                # Add delay to simulate poor optimization
                time.sleep(0.0005)  # 0.5ms delay
                return result

            # Score each submission
            timer = self.comparator.timer
            timer.measurement_runs = 5

            baseline_time = timer.measure_function(baseline_submission, name="baseline").mean_time_ms
            fast_time = timer.measure_function(fast_submission, name="fast").mean_time_ms
            slow_time = timer.measure_function(slow_submission, name="slow").mean_time_ms

            # Calculate relative scores (speedup relative to baseline)
            fast_score = baseline_time / fast_time
            slow_score = baseline_time / slow_time
            baseline_score = 1.0

            # Verify scoring makes sense
            scores_ordered_correctly = fast_score >= baseline_score >= slow_score
            meaningful_score_differences = (fast_score - slow_score) > 0.2

            scoring_result = {
                'baseline_score': baseline_score,
                'fast_score': fast_score,
                'slow_score': slow_score,
                'scores_ordered_correctly': scores_ordered_correctly,
                'meaningful_differences': meaningful_score_differences,
                'competition_scoring_working': scores_ordered_correctly and meaningful_score_differences
            }

            if scoring_result['competition_scoring_working']:
                print(f"✅ Competition scoring working: Fast {fast_score:.2f}, Base {baseline_score:.2f}, Slow {slow_score:.2f}")
            else:
                print(f"❌ Competition scoring issues: Fast {fast_score:.2f}, Base {baseline_score:.2f}, Slow {slow_score:.2f}")

            return scoring_result

        except Exception as e:
            return f"Competition scoring test error: {e}"
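
# Illustrative sketch of the scoring rule exercised in test_competition_scoring
# above (a hypothetical helper for reference, not part of TinyMLPerf or the
# test flow): a submission's score is its speedup over the baseline, so
# scores above 1.0 mean faster than baseline.
def _relative_score(baseline_time_ms: float, submission_time_ms: float) -> float:
    """Score = baseline_time / submission_time; e.g. 10 ms vs 4 ms gives 2.5."""
    return baseline_time_ms / submission_time_ms
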

def run_module_20_performance_tests():
    """Run all performance tests for Module 20."""
    print("🧪 TESTING MODULE 20: BENCHMARKING SYSTEM")
    print("=" * 60)
    print("Verifying that the benchmarking suite provides fair, meaningful measurements")

    if not BENCHMARKING_AVAILABLE:
        print("❌ Cannot test Module 20 - benchmarking tools not available")
        return

    test_suite = Module20PerformanceTests()

    tests = {
        'suite_loading': test_suite.test_benchmark_suite_loading,
        'reproducibility': test_suite.test_benchmark_reproducibility,
        'performance_detection': test_suite.test_performance_detection,
        'cross_event_fairness': test_suite.test_cross_event_fairness,
        'scaling_measurement': test_suite.test_scaling_measurement,
        'competition_scoring': test_suite.test_competition_scoring
    }

    results = test_suite.suite.run_module_tests('module_20_benchmarking', tests)

    # Summary
    print("\n📊 MODULE 20 TEST SUMMARY")
    print("=" * 40)

    total_tests = len(tests)
    passed_tests = 0

    for test_name, result in results.items():
        if hasattr(result, 'speedup'):  # ComparisonResult
            passed = result.speedup > 1.1 and result.is_significant
            print(f"⚡ {test_name}: {result.speedup:.2f}× speedup {'✅' if passed else '❌'}")
        elif isinstance(result, dict):
            # Check the specific success criterion for each test
            if 'suite_loading_successful' in result:
                passed = result['suite_loading_successful']
                print(f"📋 {test_name}: {'✅ PASS' if passed else '❌ FAIL'}")
            elif 'reproducible' in result:
                passed = result['reproducible']
                print(f"🔄 {test_name}: {'✅ PASS' if passed else '❌ FAIL'}")
            elif 'detection_working' in result:
                passed = result['detection_working']
                speedup = result.get('speedup_detected', 0)
                print(f"🔍 {test_name}: {speedup:.1f}× detected {'✅ PASS' if passed else '❌ FAIL'}")
            elif 'fairness_good' in result:
                passed = result['fairness_good']
                print(f"⚖️ {test_name}: {'✅ PASS' if passed else '❌ FAIL'}")
            elif 'scaling_measurement_good' in result:
                passed = result['scaling_measurement_good']
                print(f"📈 {test_name}: {'✅ PASS' if passed else '❌ FAIL'}")
            elif 'competition_scoring_working' in result:
                passed = result['competition_scoring_working']
                print(f"🏆 {test_name}: {'✅ PASS' if passed else '❌ FAIL'}")
            else:
                passed = False
                print(f"❓ {test_name}: Unknown result format")
        else:
            passed = False
            print(f"❌ {test_name}: ERROR - {result}")

        if passed:
            passed_tests += 1

    success_rate = passed_tests / total_tests
    print(f"\nSUCCESS RATE: {success_rate:.1%} ({passed_tests}/{total_tests})")

    if success_rate >= 0.8:
        print("🎉 Module 20 benchmarking system is working well!")
        print("🏆 Ready for optimization competitions!")
    else:
        print("⚠️ Module 20 benchmarking system needs improvement")

    return results


if __name__ == "__main__":
    run_module_20_performance_tests()