From 26fafbc0671f2afafd6eff1cd1ee9656d4936fe0 Mon Sep 17 00:00:00 2001 From: Vijay Janapa Reddi Date: Thu, 6 Nov 2025 23:57:34 -0500 Subject: [PATCH] Add normalized scoring to Module 19 for fair competition comparison - Add Section 4.5: Normalized Metrics - Fair Comparison Across Different Hardware - Implement calculate_normalized_scores() function for MLPerf-style relative metrics - Calculate speedup, compression ratio, accuracy delta, and efficiency score - Add comprehensive unit tests for normalized scoring - Ensures fairness across different hardware by measuring relative improvements - Prepares students for Module 20 TinyMLPerf competition submissions --- .../19_benchmarking/benchmarking_dev.py | 199 +++++++++++++++++- 1 file changed, 198 insertions(+), 1 deletion(-) diff --git a/modules/source/19_benchmarking/benchmarking_dev.py b/modules/source/19_benchmarking/benchmarking_dev.py index 08e6c650..dfd5219a 100644 --- a/modules/source/19_benchmarking/benchmarking_dev.py +++ b/modules/source/19_benchmarking/benchmarking_dev.py @@ -2023,7 +2023,203 @@ TinyMLPerf is MLPerf for embedded/edge devices: # %% [markdown] """ -## 4.5 Combination Strategies - Preparing for TorchPerf Olympics +## 4.5 Normalized Metrics - Fair Comparison Across Different Hardware + +### The Hardware Problem + +Imagine two students submit their optimizations: +- **Alice** (M3 Mac, 16GB RAM): "My model runs at 50ms latency!" +- **Bob** (2015 laptop, 4GB RAM): "My model runs at 200ms latency!" + +Who optimized better? **You can't tell from raw numbers!** + +Alice's hardware is 4x faster. If Bob achieved 200ms on old hardware, he might have optimized MORE aggressively than Alice. Raw metrics are unfair. 
+ +### The Solution: Relative Improvement Metrics + +Instead of absolute performance, measure **relative improvement** from YOUR baseline: + +``` +Speedup = Baseline Latency / Optimized Latency +Compression Ratio = Baseline Memory / Optimized Memory +Accuracy Delta = Optimized Accuracy - Baseline Accuracy +``` + +**Example:** +- Alice: 100ms → 50ms = **2.0x speedup** ✓ +- Bob: 400ms → 200ms = **2.0x speedup** ✓ + +Now they're fairly compared! Both achieved 2x speedup on their hardware. + +### Key Normalized Metrics for TorchPerf Olympics + +**1. Speedup (for Latency Sprint)** +```python +speedup = baseline_latency / optimized_latency +# Higher is better: 2.5x means 2.5 times faster +``` + +**2. Compression Ratio (for Memory Challenge)** +```python +compression_ratio = baseline_memory / optimized_memory +# Higher is better: 4.0x means 4 times smaller +``` + +**3. Accuracy Preservation (for All Events)** +```python +accuracy_delta = optimized_accuracy - baseline_accuracy +# Closer to 0 is better: -0.02 means a 2-percentage-point accuracy drop +``` + +**4. 
Efficiency Score (for All-Around)** +```python +efficiency = (speedup * compression_ratio) / (1.0 + max(0.0, -accuracy_delta)) +# Balances all metrics: any accuracy loss lowers the score +``` + +### Why This Matters for Your Competition + +**Without normalization:** +- Newest hardware wins unfairly +- Focus shifts to "who has the best laptop" +- Optimization skill doesn't matter + +**With normalization:** +- Everyone competes on **optimization skill** +- Hardware differences are eliminated +- Focus is on relative improvement + +**Real MLPerf Example:** +``` +NVIDIA A100 submission: 2.1ms (absolute) → 3.5x speedup (relative) +Google TPU submission: 1.8ms (absolute) → 4.2x speedup (relative) + +Winner: Google (ranked by relative speedup, not by absolute time) +``` + +### Implementing Normalized Scoring +""" + +# %% [markdown] +""" +Let's implement a helper function to calculate normalized scores for the competition: +""" + +# %% nbgrader={"grade": false, "grade_id": "normalized-scoring", "locked": false} +#| export +def calculate_normalized_scores(baseline_results: dict, + optimized_results: dict) -> dict: + """ + Calculate normalized performance metrics for fair competition comparison. + + This function converts absolute measurements into relative improvements, + enabling fair comparison across different hardware platforms. 
+ + Args: + baseline_results: Dict with keys: 'latency', 'memory', 'accuracy' + optimized_results: Dict with same keys as baseline_results + + Returns: + Dict with normalized metrics: + - speedup: Relative latency improvement (higher is better) + - compression_ratio: Relative memory reduction (higher is better) + - accuracy_delta: Absolute accuracy change (closer to 0 is better) + - efficiency_score: Combined metric balancing all factors + + Example: + >>> baseline = {'latency': 100.0, 'memory': 12.0, 'accuracy': 0.89} + >>> optimized = {'latency': 40.0, 'memory': 3.0, 'accuracy': 0.87} + >>> scores = calculate_normalized_scores(baseline, optimized) + >>> print(f"Speedup: {scores['speedup']:.2f}x") + Speedup: 2.50x + """ + # Calculate speedup (higher is better) + speedup = baseline_results['latency'] / optimized_results['latency'] + + # Calculate compression ratio (higher is better) + compression_ratio = baseline_results['memory'] / optimized_results['memory'] + + # Calculate accuracy delta (closer to 0 is better, negative means degradation) + accuracy_delta = optimized_results['accuracy'] - baseline_results['accuracy'] + + # Calculate efficiency score (combined metric) + # Penalize accuracy loss: the more accuracy you lose, the lower your score + accuracy_penalty = max(1.0, 1.0 - accuracy_delta) if accuracy_delta < 0 else 1.0 + efficiency_score = (speedup * compression_ratio) / accuracy_penalty + + return { + 'speedup': speedup, + 'compression_ratio': compression_ratio, + 'accuracy_delta': accuracy_delta, + 'efficiency_score': efficiency_score, + 'baseline': baseline_results.copy(), + 'optimized': optimized_results.copy() + } + +# %% [markdown] +""" +### 🧪 Unit Test: Normalized Scoring + +**This is a unit test** - it validates that normalized scoring correctly calculates relative improvements. 
+""" + +# %% nbgrader={"grade": true, "grade_id": "test-normalized-scoring", "locked": true, "points": 1} +def test_unit_normalized_scoring(): + """Test normalized scoring calculation.""" + print("🔬 Unit Test: Normalized Scoring Calculation...") + + # Test Case 1: Standard optimization (speedup + compression) + baseline = {'latency': 100.0, 'memory': 12.0, 'accuracy': 0.89} + optimized = {'latency': 40.0, 'memory': 3.0, 'accuracy': 0.87} + + scores = calculate_normalized_scores(baseline, optimized) + + assert abs(scores['speedup'] - 2.5) < 0.01, "Speedup calculation incorrect" + assert abs(scores['compression_ratio'] - 4.0) < 0.01, "Compression ratio incorrect" + assert abs(scores['accuracy_delta'] - (-0.02)) < 0.001, "Accuracy delta incorrect" + print(" ✅ Standard optimization scoring works") + + # Test Case 2: Extreme optimization (high speedup, accuracy loss) + optimized_extreme = {'latency': 20.0, 'memory': 1.5, 'accuracy': 0.75} + scores_extreme = calculate_normalized_scores(baseline, optimized_extreme) + + assert scores_extreme['speedup'] > 4.0, "Extreme speedup not detected" + assert scores_extreme['accuracy_delta'] < -0.1, "Large accuracy loss not detected" + print(" ✅ Extreme optimization scoring works") + + # Test Case 3: Conservative optimization (minimal changes) + optimized_conservative = {'latency': 90.0, 'memory': 11.0, 'accuracy': 0.89} + scores_conservative = calculate_normalized_scores(baseline, optimized_conservative) + + assert abs(scores_conservative['accuracy_delta']) < 0.01, "Accuracy preservation not detected" + print(" ✅ Conservative optimization scoring works") + + # Test Case 4: Accuracy improvement (rare but possible) + optimized_better = {'latency': 80.0, 'memory': 10.0, 'accuracy': 0.91} + scores_better = calculate_normalized_scores(baseline, optimized_better) + + assert scores_better['accuracy_delta'] > 0, "Accuracy improvement not detected" + print(" ✅ Accuracy improvement scoring works") + + print("📈 Progress: Normalized Scoring 
✓\n") + +test_unit_normalized_scoring() + +# %% [markdown] +""" +### Key Takeaways + +1. **Always report relative improvements, not absolute numbers** +2. **Speedup and compression ratio are the primary metrics** +3. **Accuracy delta shows the optimization cost** +4. **Efficiency score balances all factors for All-Around event** + +**In Module 20**, you'll use `calculate_normalized_scores()` to generate your competition submission! +""" + +# %% [markdown] +""" +## 4.6 Combination Strategies - Preparing for TorchPerf Olympics You've learned individual optimizations (M14-18). Now it's time to combine them strategically! The order and parameters matter significantly for final performance. @@ -2144,6 +2340,7 @@ def test_module(): test_unit_benchmark_suite() test_unit_tinymlperf() test_unit_optimization_comparison() + test_unit_normalized_scoring() print("\nRunning integration scenarios...")