From 26fafbc0671f2afafd6eff1cd1ee9656d4936fe0 Mon Sep 17 00:00:00 2001
From: Vijay Janapa Reddi <vj@eecs.harvard.edu>
Date: Thu, 6 Nov 2025 23:57:34 -0500
Subject: [PATCH] Add normalized scoring to Module 19 for fair competition
 comparison

- Add Section 4.5: Normalized Metrics - Fair Comparison Across Different Hardware
- Implement calculate_normalized_scores() function for MLPerf-style relative metrics
- Calculate speedup, compression ratio, accuracy delta, and efficiency score
- Add comprehensive unit tests for normalized scoring
- Ensures fairness across different hardware by measuring relative improvements
- Prepares students for Module 20 TinyMLPerf competition submissions
---
 .../19_benchmarking/benchmarking_dev.py       | 199 +++++++++++++++++-
 1 file changed, 198 insertions(+), 1 deletion(-)

diff --git a/modules/source/19_benchmarking/benchmarking_dev.py b/modules/source/19_benchmarking/benchmarking_dev.py
index 08e6c650..dfd5219a 100644
--- a/modules/source/19_benchmarking/benchmarking_dev.py
+++ b/modules/source/19_benchmarking/benchmarking_dev.py
@@ -2023,7 +2023,203 @@ TinyMLPerf is MLPerf for embedded/edge devices:
 
 # %% [markdown]
 """
-## 4.5 Combination Strategies - Preparing for TorchPerf Olympics
+## 4.5 Normalized Metrics - Fair Comparison Across Different Hardware
+
+### The Hardware Problem
+
+Imagine two students submit their optimizations:
+- **Alice** (M3 Mac, 16GB RAM): "My model runs at 50ms latency!"
+- **Bob** (2015 laptop, 4GB RAM): "My model runs at 200ms latency!"
+
+Who optimized better? **You can't tell from raw numbers!**
+
+Alice's hardware is 4x faster. If Bob achieved 200ms on old hardware, he might have optimized MORE aggressively than Alice. Raw metrics are unfair.
+
+### The Solution: Relative Improvement Metrics
+
+Instead of absolute performance, measure **relative improvement** from YOUR baseline:
+
+```
+Speedup = Baseline Latency / Optimized Latency
+Compression Ratio = Baseline Memory / Optimized Memory  
+Accuracy Delta = Optimized Accuracy - Baseline Accuracy
+```
+
+**Example:**
+- Alice: 100ms → 50ms = **2.0x speedup** ✓
+- Bob: 400ms → 200ms = **2.0x speedup** ✓
+
+Now they're fairly compared! Both achieved 2x speedup on their hardware.
+
+### Key Normalized Metrics for TorchPerf Olympics
+
+**1. Speedup (for Latency Sprint)**
+```python
+speedup = baseline_latency / optimized_latency
+# Higher is better: 2.5x means 2.5 times faster
+```
+
+**2. Compression Ratio (for Memory Challenge)**
+```python
+compression_ratio = baseline_memory / optimized_memory
+# Higher is better: 4.0x means 4 times smaller
+```
+
+**3. Accuracy Preservation (for All Events)**
+```python
+accuracy_delta = optimized_accuracy - baseline_accuracy
+# Negative means accuracy was lost: -0.02 is a 2-percentage-point accuracy drop
+```
+
+**4. Efficiency Score (for All-Around)**
+```python
+efficiency = (speedup * compression_ratio) / max(1.0, 1.0 - accuracy_delta)
+# Balances all metrics: the divisor is 1 + accuracy lost, and stays 1.0 if none is lost
+```
+
+### Why This Matters for Your Competition
+
+**Without normalization:**
+- Newest hardware wins unfairly
+- Focus shifts to "who has the best laptop"
+- Optimization skill doesn't matter
+
+**With normalization:**
+- Everyone competes on **optimization skill**
+- Hardware differences are eliminated
+- Focus is on relative improvement
+
+**Real MLPerf Example:**
+```
+NVIDIA A100 submission: 2.1ms (absolute) → 3.5x speedup (relative)
+Google TPU submission: 1.8ms (absolute) → 4.2x speedup (relative)
+
+Winner: Google (larger speedup relative to its own baseline, not because of its absolute time)
+```
+
+### Implementing Normalized Scoring
+"""
+
+# %% [markdown]
+"""
+Let's implement a helper function to calculate normalized scores for the competition:
+"""
+
+# %% nbgrader={"grade": false, "grade_id": "normalized-scoring", "locked": false}
+#| export
+def calculate_normalized_scores(baseline_results: dict, optimized_results: dict) -> dict:
+    """
+    Calculate normalized performance metrics for fair competition comparison.
+
+    Converts absolute measurements into improvements relative to your own baseline.
+
+    Args:
+        baseline_results: Dict with keys: 'latency', 'memory', 'accuracy'
+        optimized_results: Dict with same keys as baseline_results
+
+    Returns:
+        Dict with normalized metrics:
+        - speedup: baseline / optimized latency (higher is better)
+        - compression_ratio: baseline / optimized memory (higher is better)
+        - accuracy_delta: optimized - baseline accuracy (negative = degradation)
+        - efficiency_score: speedup * compression_ratio / (1 + accuracy lost)
+        - baseline, optimized: shallow copies of the two input dicts
+
+    Raises:
+        ValueError: If any latency or memory value is not strictly positive.
+
+    Example:
+        >>> baseline = {'latency': 100.0, 'memory': 12.0, 'accuracy': 0.89}
+        >>> optimized = {'latency': 40.0, 'memory': 3.0, 'accuracy': 0.87}
+        >>> scores = calculate_normalized_scores(baseline, optimized)
+        >>> print(f"Speedup: {scores['speedup']:.2f}x")
+        Speedup: 2.50x
+    """
+    # Ratios need positive inputs; avoid a bare ZeroDivisionError or negative "speedup".
+    for label, results in (('baseline', baseline_results), ('optimized', optimized_results)):
+        for key in ('latency', 'memory'):
+            if not results[key] > 0:
+                raise ValueError(f"{label} {key} must be > 0, got {results[key]!r}")
+
+    speedup = baseline_results['latency'] / optimized_results['latency']
+    compression_ratio = baseline_results['memory'] / optimized_results['memory']
+    accuracy_delta = optimized_results['accuracy'] - baseline_results['accuracy']
+    # Only accuracy loss is penalized: divisor is 1 + loss, and 1.0 otherwise.
+    efficiency_score = (speedup * compression_ratio) / max(1.0, 1.0 - accuracy_delta)
+
+    return {
+        'speedup': speedup,
+        'compression_ratio': compression_ratio,
+        'accuracy_delta': accuracy_delta,
+        'efficiency_score': efficiency_score,
+        'baseline': baseline_results.copy(),
+        'optimized': optimized_results.copy()
+    }
+
+# %% [markdown]
+"""
+### 🧪 Unit Test: Normalized Scoring
+
+**This is a unit test** - it validates that normalized scoring correctly calculates relative improvements.
+"""
+
+# %% nbgrader={"grade": true, "grade_id": "test-normalized-scoring", "locked": true, "points": 1}
+def test_unit_normalized_scoring():
+    """Check every normalized metric, including the combined efficiency score."""
+    print("🔬 Unit Test: Normalized Scoring Calculation...")
+
+    # Test Case 1: Standard optimization (speedup + compression)
+    baseline = {'latency': 100.0, 'memory': 12.0, 'accuracy': 0.89}
+    optimized = {'latency': 40.0, 'memory': 3.0, 'accuracy': 0.87}
+    scores = calculate_normalized_scores(baseline, optimized)
+    assert abs(scores['speedup'] - 2.5) < 0.01, "Speedup calculation incorrect"
+    assert abs(scores['compression_ratio'] - 4.0) < 0.01, "Compression ratio incorrect"
+    assert abs(scores['accuracy_delta'] - (-0.02)) < 0.001, "Accuracy delta incorrect"
+    # Raw product 2.5 * 4.0 = 10.0, divided by (1 + 0.02 accuracy lost)
+    assert abs(scores['efficiency_score'] - 10.0 / 1.02) < 0.01, "Efficiency score incorrect"
+    print("  ✅ Standard optimization scoring works")
+
+    # Test Case 2: Extreme optimization (high speedup, accuracy loss)
+    optimized_extreme = {'latency': 20.0, 'memory': 1.5, 'accuracy': 0.75}
+    scores_extreme = calculate_normalized_scores(baseline, optimized_extreme)
+    assert scores_extreme['speedup'] > 4.0, "Extreme speedup not detected"
+    assert scores_extreme['accuracy_delta'] < -0.1, "Large accuracy loss not detected"
+    assert scores_extreme['efficiency_score'] < 5.0 * 8.0, "Accuracy loss not penalized"
+    print("  ✅ Extreme optimization scoring works")
+
+    # Test Case 3: Conservative optimization (minimal changes)
+    optimized_conservative = {'latency': 90.0, 'memory': 11.0, 'accuracy': 0.89}
+    scores_conservative = calculate_normalized_scores(baseline, optimized_conservative)
+
+    assert abs(scores_conservative['accuracy_delta']) < 0.01, "Accuracy preservation not detected"
+    print("  ✅ Conservative optimization scoring works")
+
+    # Test Case 4: Accuracy improvement (rare but possible)
+    optimized_better = {'latency': 80.0, 'memory': 10.0, 'accuracy': 0.91}
+    scores_better = calculate_normalized_scores(baseline, optimized_better)
+    assert scores_better['accuracy_delta'] > 0, "Accuracy improvement not detected"
+    assert abs(scores_better['efficiency_score'] - 1.25 * 1.2) < 0.01, "Accuracy gain must not alter efficiency"
+    print("  ✅ Accuracy improvement scoring works")
+
+    print("📈 Progress: Normalized Scoring ✓\n")
+
+test_unit_normalized_scoring()
+
+# %% [markdown]
+"""
+### Key Takeaways
+
+1. **Always report relative improvements, not absolute numbers**
+2. **Speedup and compression ratio are the primary metrics**
+3. **Accuracy delta shows the optimization cost**
+4. **Efficiency score balances all factors for All-Around event**
+
+**In Module 20**, you'll use `calculate_normalized_scores()` to generate your competition submission!
+"""
+
+# %% [markdown]
+"""
+## 4.6 Combination Strategies - Preparing for TorchPerf Olympics
 
 You've learned individual optimizations (M14-18). Now it's time to combine them strategically! The order and parameters matter significantly for final performance.
 
@@ -2144,6 +2340,7 @@ def test_module():
     test_unit_benchmark_suite()
     test_unit_tinymlperf()
     test_unit_optimization_comparison()
+    test_unit_normalized_scoring()
 
     print("\nRunning integration scenarios...")