Snapshot of the standalone /Users/VJ/GitHub/mlperf-edu/ repo as of 2026-04-16, brought into MLSysBook as a parked feature branch for backup and iteration. Not for merge to dev.

Contents (88 files, ~2.3 MB):
- 16 reference workloads (cloud / edge / tiny / agent divisions)
- LoadGen proxy harness + SUT plugin protocol
- Compliance checker, autograder, hardware fingerprint
- Paper draft (paper.tex) with TikZ/SVG figure sources
- Three lab examples + practitioner workflow configs
- Workload + dataset YAML registries (single source of truth)

Excluded (per mlperf-edu/.gitignore + size constraints):
- Datasets (6.6 GB), checkpoints (260 MB), gpt2 weights (523 MB)
- Generated PDFs, .venv, build artifacts
44 lines · 2.1 KiB · Python
import torch

from mlperf.sut import SUT_Interface
from reference.edge.resnet_train import ResNet18WhiteBox

class ClosedResNetOptimization(SUT_Interface):
    """
    CLOSED DIVISION: Mathematical Accuracy Retained > 99%.

    Student Optimization Notes:
    In the Closed division, you cannot drop layers or fundamentally change the
    model's underlying mathematical representation. However, you CAN use hardware
    profiling, Torch compilation, and FP16 autocasting to crush the latency
    barriers structurally!
    """
    def __init__(self, config: dict):
        super().__init__(config)
        self.device = torch.device(
            'cuda' if torch.cuda.is_available()
            else 'mps' if torch.backends.mps.is_available()
            else 'cpu'
        )

        # 1. Load the instructor's PyTorch model untouched.
        print("[Submitter:Closed] 🧠 Loading Native ResNet18WhiteBox Parameters...")
        self.model = ResNet18WhiteBox(num_classes=100).to(self.device)
        self.model.eval()

        # 2. STUDENT OPTIMIZATION: math-safe structural speedups!
        print("[Submitter:Closed] ⚡ Engaging FP16 Autocast & Trace JIT!")
        # The model keeps its fresh random init as a stand-in ("fake") state
        # dict for testing; no trained checkpoint is loaded here.

        # We don't change any weight values, but we compile the ops together
        # (op fusion). Note: torch.compile isn't natively supported on MPS yet,
        # so we fall back to JIT tracing, which works on all three device types.
        dummy_input = torch.randn(1, 3, 32, 32).to(self.device)
        self.optimized_model = torch.jit.trace(self.model, dummy_input)
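        # (Illustrative sketch, not part of the original harness.) On a
        # CUDA-only deployment, torch.compile would be the more modern route
        # to the same op fusion; this guarded form is an untested assumption:
        #
        #     if self.device.type == 'cuda':
        #         self.optimized_model = torch.compile(self.model)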
    async def process_queries(self, samples: list):
        batch_size = len(samples)
        # Random tensors stand in for real samples in this proxy harness.
        input_tensor = torch.randn((batch_size, 3, 32, 32)).to(self.device)

        # 3. STUDENT OPTIMIZATION: inference_mode() and autocast.
        # CPU autocast only supports bfloat16; CUDA/MPS use float16.
        autocast_dtype = torch.float16 if self.device.type != 'cpu' else torch.bfloat16
        with torch.inference_mode(), torch.autocast(device_type=self.device.type, dtype=autocast_dtype):
            logits = self.optimized_model(input_tensor)

        return {
            "predictions": logits,
            "targets": torch.randint(0, 100, (batch_size,)).to(self.device)
        }
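
# Usage sketch (an assumption about the harness, not a documented contract:
# the LoadGen proxy presumably instantiates the SUT with a config dict and
# awaits process_queries() with a list of sample IDs; the empty config and
# the sample count of 8 below are hypothetical):
#
#     import asyncio
#     sut = ClosedResNetOptimization(config={})
#     out = asyncio.run(sut.process_queries(samples=list(range(8))))
#     print(out["predictions"].shape)  # torch.Size([8, 100])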