mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-04-29 09:08:54 -05:00
Merge branch 'feature/tinytorch-core' into dev
This commit is contained in:
@@ -162,11 +162,11 @@ ML systems exist on a **Pareto frontier** - you can't simultaneously maximize ac
|
||||
```
|
||||
Accuracy
|
||||
^
|
||||
| A . <- Model A: High accuracy, high latency
|
||||
| A .<- Model A: High accuracy, high latency
|
||||
|
|
||||
| B . <- Model B: Balanced trade-off
|
||||
|
|
||||
| C .<- Model C: Low accuracy, low latency
|
||||
| C . <- Model C: Low accuracy, low latency
|
||||
|__________> Latency (lower is better)
|
||||
```
|
||||
|
||||
@@ -2108,7 +2108,10 @@ if __name__ == "__main__":
|
||||
"""
|
||||
### MLPerf - Standardized Industry Benchmarking
|
||||
|
||||
MLPerf-style benchmarks provide standardized tests that enable fair comparison across different systems, similar to how the official MLPerf suite works for larger models. This is crucial for reproducible research and industry adoption.
|
||||
MLPerf® is a trademark of MLCommons. This module provides MLPerf-style standardized
|
||||
benchmarks that enable fair comparison across different systems, similar to how the
|
||||
official MLPerf suite works for larger models. This is important for reproducible
|
||||
research and industry adoption.
|
||||
|
||||
### Why Standardization Matters
|
||||
|
||||
@@ -2165,6 +2168,11 @@ MLPerf Benchmark Structure:
|
||||
- Task: Binary classification (anomaly/normal)
|
||||
- Target: 85% accuracy, <50ms latency
|
||||
|
||||
**Image Classification**: Tiny image recognition (CIFAR-style)
|
||||
- Input: 32×32 RGB images
|
||||
- Task: Multi-class classification (10 classes)
|
||||
- Target: 75% accuracy, <150ms latency
|
||||
|
||||
### Reproducibility Requirements
|
||||
|
||||
All MLPerf benchmarks use:
|
||||
@@ -2187,7 +2195,7 @@ Standard MLPerf Benchmarks:
|
||||
┌─────────────────────┬──────────────────┬─────────┬──────────┐
|
||||
│ Benchmark │ Input Shape │ Acc Tgt │ Lat Tgt │
|
||||
├─────────────────────┼──────────────────┼─────────┼──────────┤
|
||||
│ keyword_spotting │ (1, 16000) │ 90% │ <100ms │
|
||||
│ keyword_spotting │ (1, 16000) │ 90% │ <100ms │
|
||||
│ visual_wake_words │ (1, 96, 96, 3) │ 80% │ <200ms │
|
||||
│ anomaly_detection │ (1, 640) │ 85% │ <50ms │
|
||||
│ image_classification│ (1, 32, 32, 3) │ 75% │ <150ms │
|
||||
@@ -2201,6 +2209,10 @@ class MLPerf:
|
||||
"""
|
||||
MLPerf-style standardized benchmarking for edge ML systems.
|
||||
|
||||
MLPerf® is a trademark of MLCommons. Used here purely for educational purposes.
|
||||
This module teaches the principles of MLPerf-style benchmarking through a
|
||||
simplified suite inspired by MLPerf Tiny.
|
||||
|
||||
Provides fixed benchmark configurations with target thresholds,
|
||||
standardized measurement protocols, and compliance reporting.
|
||||
|
||||
@@ -2357,7 +2369,7 @@ def _mlperf_run_latency_test(self, model: Any, test_inputs: List[Any],
|
||||
output = model(test_input)
|
||||
else:
|
||||
# Simulate prediction
|
||||
output = np.random.rand(2) if benchmark_name in ['keyword_spotting', 'visual_wake_words'] else np.random.rand(10)
|
||||
output = np.random.rand(2) if benchmark_name in ['keyword_spotting', 'visual_wake_words', 'anomaly_detection'] else np.random.rand(10)
|
||||
|
||||
predictions.append(output)
|
||||
except Exception:
|
||||
@@ -2413,8 +2425,8 @@ if __name__ == "__main__":
|
||||
|
||||
This helper calculates accuracy by comparing model predictions against synthetic
|
||||
ground truth labels. It handles both binary classification (keyword spotting,
|
||||
visual wake words) and multi-class classification (image classification,
|
||||
anomaly detection).
|
||||
visual wake words, anomaly detection) and multi-class classification (image
|
||||
classification).
|
||||
|
||||
We'll build this in two steps: first a helper to extract a clean prediction
|
||||
array from various output formats, then the accuracy calculation itself.
|
||||
@@ -2487,12 +2499,12 @@ def _mlperf_run_accuracy_test(self, model: Any, predictions: List[Any],
|
||||
4. Add realistic noise based on model name
|
||||
|
||||
HINTS:
|
||||
- keyword_spotting and visual_wake_words are binary (2 classes)
|
||||
- image_classification has 10 classes, anomaly_detection has 5
|
||||
- keyword_spotting, visual_wake_words, and anomaly_detection are binary (2 classes)
|
||||
- image_classification has 10 classes
|
||||
"""
|
||||
### BEGIN SOLUTION
|
||||
np.random.seed(self.random_seed)
|
||||
if benchmark_name in ['keyword_spotting', 'visual_wake_words']:
|
||||
if benchmark_name in ['keyword_spotting', 'visual_wake_words', 'anomaly_detection']:
|
||||
# Binary classification
|
||||
true_labels = np.random.randint(0, 2, num_runs)
|
||||
predicted_labels = []
|
||||
@@ -2503,8 +2515,8 @@ def _mlperf_run_accuracy_test(self, model: Any, predictions: List[Any],
|
||||
else:
|
||||
predicted_labels.append(1 if pred_array[0] > 0.5 else 0)
|
||||
else:
|
||||
# Multi-class classification
|
||||
num_classes = 10 if benchmark_name == 'image_classification' else 5
|
||||
# Multi-class classification (image_classification only)
|
||||
num_classes = 10
|
||||
true_labels = np.random.randint(0, num_classes, num_runs)
|
||||
predicted_labels = []
|
||||
for pred in predictions:
|
||||
@@ -2650,17 +2662,19 @@ def mlperf_run_standard_benchmark(self, model: Any, benchmark_name: str,
|
||||
print(f" Target: {config['target_accuracy']:.1%} accuracy, "
|
||||
f"<{config['max_latency_ms']}ms latency")
|
||||
|
||||
# Generate standardized test inputs
|
||||
# Generate standardized test inputs (as Tensors for TinyTorch model compatibility)
|
||||
input_shape = config['input_shape']
|
||||
test_inputs = []
|
||||
for i in range(num_runs):
|
||||
# Use deterministic random generation for reproducibility
|
||||
np.random.seed(self.random_seed + i)
|
||||
if len(input_shape) == 2: # Audio/sequence data
|
||||
test_input = np.random.randn(*input_shape).astype(np.float32)
|
||||
else: # Image data
|
||||
test_input = np.random.randint(0, 256, input_shape).astype(np.float32) / 255.0
|
||||
test_inputs.append(test_input)
|
||||
if len(input_shape) == 2: # Audio/sequence data (keyword_spotting, anomaly_detection)
|
||||
arr = np.random.randn(*input_shape).astype(np.float32)
|
||||
else: # Image data (visual_wake_words, image_classification) - use CHW for Conv2d
|
||||
arr = np.random.randint(0, 256, input_shape).astype(np.float32) / 255.0
|
||||
if arr.ndim == 4 and arr.shape[-1] == 3: # (B,H,W,C) -> (B,C,H,W)
|
||||
arr = np.transpose(arr, (0, 3, 1, 2))
|
||||
test_inputs.append(Tensor(arr))
|
||||
|
||||
# Run latency and accuracy tests using helpers
|
||||
latencies, predictions = self._run_latency_test(model, test_inputs, benchmark_name, num_runs)
|
||||
|
||||
Reference in New Issue
Block a user