Merge branch 'feature/tinytorch-core' into dev

2026-04-29 09:08:54 -05:00 · 2026-02-22 13:22:37 -05:00
parent 94d079b57c 1733596996
commit 85c0cd4c83
2 changed files with 33 additions and 19 deletions
--- a/tinytorch/src/19_benchmarking/19_benchmarking.py
+++ b/tinytorch/src/19_benchmarking/19_benchmarking.py
@@ -162,11 +162,11 @@ ML systems exist on a **Pareto frontier** - you can't simultaneously maximize ac
 ```
 Accuracy
    ^
-    |  A .     <- Model A: High accuracy, high latency
+    |      A .<- Model A: High accuracy, high latency
    |
    |    B .  <- Model B: Balanced trade-off
    |
-    |      C .<- Model C: Low accuracy, low latency
+    |  C .     <- Model C: Low accuracy, low latency
    |__________> Latency (lower is better)
 ```

@@ -2108,7 +2108,10 @@ if __name__ == "__main__":
 """
 ### MLPerf - Standardized Industry Benchmarking

-MLPerf provides standardized benchmarks that enable fair comparison across different systems, similar to how MLPerf works for larger models. This is crucial for reproducible research and industry adoption.
+MLPerf® is a trademark of MLCommons. This module provides MLPerf-style standardized
+benchmarks that enable fair comparison across different systems, similar to how the
+official MLPerf suite works for larger models. This is important for reproducible
+research and industry adoption.

 ### Why Standardization Matters

@@ -2165,6 +2168,11 @@ MLPerf Benchmark Structure:
 - Task: Binary classification (anomaly/normal)
 - Target: 85% accuracy, <50ms latency

+**Image Classification**: Tiny image recognition (CIFAR-style)
+- Input: 32×32 RGB images
+- Task: Multi-class classification (10 classes)
+- Target: 75% accuracy, <150ms latency
+
 ### Reproducibility Requirements

 All MLPerf benchmarks use:
@@ -2187,7 +2195,7 @@ Standard MLPerf Benchmarks:
 ┌─────────────────────┬──────────────────┬─────────┬──────────┐
 │ Benchmark           │ Input Shape      │ Acc Tgt │ Lat Tgt  │
 ├─────────────────────┼──────────────────┼─────────┼──────────┤
-│ keyword_spotting     │ (1, 16000)       │ 90%     │ <100ms   │
+│ keyword_spotting    │ (1, 16000)       │ 90%     │ <100ms   │
 │ visual_wake_words   │ (1, 96, 96, 3)   │ 80%     │ <200ms   │
 │ anomaly_detection   │ (1, 640)         │ 85%     │ <50ms    │
 │ image_classification│ (1, 32, 32, 3)   │ 75%     │ <150ms   │
@@ -2201,6 +2209,10 @@ class MLPerf:
    """
    MLPerf-style standardized benchmarking for edge ML systems.

+    MLPerf® is a trademark of MLCommons. Used here purely for educational purposes.
+    This module teaches the principles of MLPerf-style benchmarking through a
+    simplified suite inspired by MLPerf Tiny.
+
    Provides fixed benchmark configurations with target thresholds,
    standardized measurement protocols, and compliance reporting.

@@ -2357,7 +2369,7 @@ def _mlperf_run_latency_test(self, model: Any, test_inputs: List[Any],
                    output = model(test_input)
                else:
                    # Simulate prediction
-                    output = np.random.rand(2) if benchmark_name in ['keyword_spotting', 'visual_wake_words'] else np.random.rand(10)
+                    output = np.random.rand(2) if benchmark_name in ['keyword_spotting', 'visual_wake_words', 'anomaly_detection'] else np.random.rand(10)

                predictions.append(output)
            except Exception:
@@ -2413,8 +2425,8 @@ if __name__ == "__main__":

 This helper calculates accuracy by comparing model predictions against synthetic
 ground truth labels. It handles both binary classification (keyword spotting,
-visual wake words) and multi-class classification (image classification,
-anomaly detection).
+visual wake words, anomaly detection) and multi-class classification (image
+classification).

 We'll build this in two steps: first a helper to extract a clean prediction
 array from various output formats, then the accuracy calculation itself.
@@ -2487,12 +2499,12 @@ def _mlperf_run_accuracy_test(self, model: Any, predictions: List[Any],
    4. Add realistic noise based on model name

    HINTS:
-    - keyword_spotting and visual_wake_words are binary (2 classes)
-    - image_classification has 10 classes, anomaly_detection has 5
+    - keyword_spotting, visual_wake_words, and anomaly_detection are binary (2 classes)
+    - image_classification has 10 classes
    """
    ### BEGIN SOLUTION
    np.random.seed(self.random_seed)
-    if benchmark_name in ['keyword_spotting', 'visual_wake_words']:
+    if benchmark_name in ['keyword_spotting', 'visual_wake_words', 'anomaly_detection']:
        # Binary classification
        true_labels = np.random.randint(0, 2, num_runs)
        predicted_labels = []
@@ -2503,8 +2515,8 @@ def _mlperf_run_accuracy_test(self, model: Any, predictions: List[Any],
            else:
                predicted_labels.append(1 if pred_array[0] > 0.5 else 0)
    else:
-        # Multi-class classification
-        num_classes = 10 if benchmark_name == 'image_classification' else 5
+        # Multi-class classification (image_classification only)
+        num_classes = 10
        true_labels = np.random.randint(0, num_classes, num_runs)
        predicted_labels = []
        for pred in predictions:
@@ -2650,17 +2662,19 @@ def mlperf_run_standard_benchmark(self, model: Any, benchmark_name: str,
    print(f"   Target: {config['target_accuracy']:.1%} accuracy, "
          f"<{config['max_latency_ms']}ms latency")

-    # Generate standardized test inputs
+    # Generate standardized test inputs (as Tensors for TinyTorch model compatibility)
    input_shape = config['input_shape']
    test_inputs = []
    for i in range(num_runs):
        # Use deterministic random generation for reproducibility
        np.random.seed(self.random_seed + i)
-        if len(input_shape) == 2:  # Audio/sequence data
-            test_input = np.random.randn(*input_shape).astype(np.float32)
-        else:  # Image data
-            test_input = np.random.randint(0, 256, input_shape).astype(np.float32) / 255.0
-        test_inputs.append(test_input)
+        if len(input_shape) == 2:  # Audio/sequence data (keyword_spotting, anomaly_detection)
+            arr = np.random.randn(*input_shape).astype(np.float32)
+        else:  # Image data (visual_wake_words, image_classification) - use CHW for Conv2d
+            arr = np.random.randint(0, 256, input_shape).astype(np.float32) / 255.0
+            if arr.ndim == 4 and arr.shape[-1] == 3:  # (B,H,W,C) -> (B,C,H,W)
+                arr = np.transpose(arr, (0, 3, 1, 2))
+        test_inputs.append(Tensor(arr))

    # Run latency and accuracy tests using helpers
    latencies, predictions = self._run_latency_test(model, test_inputs, benchmark_name, num_runs)