# Source: cs249r_book/book/quarto/mlsys/hardware.py
# (132 lines, 6.2 KiB, Python)

# hardware.py
# Hierarchical Hardware Definitions for MLSys Textbook
import pint
from dataclasses import dataclass
from typing import Optional, Tuple
from .constants import (
ureg, Q_,
V100_MEM_BW, V100_FLOPS_FP16_TENSOR, V100_MEM_CAPACITY, V100_TDP, V100_FLOPS_FP32,
A100_MEM_BW, A100_FLOPS_FP16_TENSOR, A100_MEM_CAPACITY, A100_TDP, A100_FLOPS_FP32, A100_FLOPS_TF32, A100_FLOPS_INT8,
H100_MEM_BW, H100_FLOPS_FP16_TENSOR, H100_MEM_CAPACITY, H100_TDP, H100_FLOPS_TF32, H100_FLOPS_FP8_TENSOR, H100_FLOPS_INT8,
B200_MEM_BW, B200_FLOPS_FP16_TENSOR, B200_MEM_CAPACITY, B200_TDP, B200_FLOPS_FP8_TENSOR,
T4_MEM_BW, T4_FLOPS_FP16_TENSOR, T4_TDP, T4_FLOPS_INT8,
TPUV4_MEM_BW, TPUV4_FLOPS_BF16,
MOBILE_NPU_MEM_BW, MOBILE_NPU_TOPS_INT8,
ESP32_RAM, ESP32_FLASH, ESP32_POWER_MAX,
MCU_RAM_KIB,
NETWORK_10G_BW, NETWORK_100G_BW
)
@dataclass(frozen=True)
class HardwareSpec:
    """Immutable, unit-checked description of one hardware platform.

    The physical fields are pint quantities (``Q_``).  ``__post_init__``
    checks the required quantities, plus ``tdp`` and ``battery_capacity``
    when they are supplied, for the correct dimensionality and a strictly
    positive magnitude, so a malformed entry fails when it is constructed.
    The precision-specific rates (``peak_flops_fp32``, ``tf32_flops``,
    ``fp8_flops``, ``int8_flops``) are stored as given and are not validated.
    """

    name: str
    release_year: int
    memory_bw: Q_
    peak_flops: Q_  # Usually FP16 Tensor for AI accelerators
    memory_capacity: Q_
    tdp: Optional[Q_] = None
    battery_capacity: Optional[Q_] = None
    dispatch_tax: Q_ = 0.01 * ureg.ms  # Default 10us

    # Precision-specific FLOPS
    peak_flops_fp32: Optional[Q_] = None
    tf32_flops: Optional[Q_] = None
    fp8_flops: Optional[Q_] = None
    int8_flops: Optional[Q_] = None

    def __post_init__(self):
        """Validate hardware specs: correct dimension type first, then positive value.

        Raises:
            pint.DimensionalityError: a checked field is not compatible with
                the unit expected for it.
            ValueError: a checked field has a magnitude of zero or less.
        """

        def _validate(qty, field_name, target_unit, dim_desc):
            # Dimensionality is checked before sign so that a wrong-unit value
            # is reported as a unit error rather than a range error.
            if not qty.is_compatible_with(target_unit):
                raise pint.DimensionalityError(
                    qty.units, target_unit,
                    extra_msg=f"{self.name}.{field_name} must be {dim_desc}, got {qty.units}"
                )
            if qty.magnitude <= 0:
                raise ValueError(f"{self.name}.{field_name} must be positive, got {qty}")

        _validate(self.memory_bw, "memory_bw", ureg.byte / ureg.second, "data/time (e.g. GB/s)")
        _validate(self.peak_flops, "peak_flops", ureg.flop / ureg.second, "compute rate (e.g. TFLOPs/s)")
        _validate(self.memory_capacity, "memory_capacity", ureg.byte, "data size (e.g. GiB)")
        _validate(self.dispatch_tax, "dispatch_tax", ureg.second, "time (e.g. ms)")

        # Compare against None explicitly: a zero-magnitude quantity is falsy,
        # so a plain truthiness test would let `0 W` / `0 Wh` skip the
        # positivity check instead of being rejected.
        if self.tdp is not None:
            _validate(self.tdp, "tdp", ureg.watt, "power (e.g. W)")
        if self.battery_capacity is not None:
            _validate(self.battery_capacity, "battery_capacity", ureg.joule, "energy (e.g. Wh or J)")

    def ridge_point(self) -> Q_:
        """Calculates the Roofline ridge point (Intensity threshold).

        Returns:
            ``peak_flops / memory_bw`` converted to ``flop/byte``.
        """
        # FLOPS / BW = Ops/Byte
        return (self.peak_flops / self.memory_bw).to('flop/byte')

    def __repr__(self):
        """Return a short ``Hardware(name, year)`` label instead of the full field dump."""
        return f"Hardware({self.name}, {self.release_year})"
@dataclass(frozen=True)
class NetworkSpec:
    """Immutable record pairing a network link's label with its bandwidth."""

    # Human-readable label for the link.
    name: str
    # Link bandwidth as a pint quantity; no validation is performed here.
    bandwidth: Q_
class Networks:
    """Namespace of predefined network link specifications."""

    Ethernet_10G = NetworkSpec(name="10GbE", bandwidth=NETWORK_10G_BW)
    Ethernet_100G = NetworkSpec(name="100GbE", bandwidth=NETWORK_100G_BW)
class Cloud:
    """Datacenter-scale Accelerators."""

    V100 = HardwareSpec(
        name="NVIDIA V100",
        release_year=2017,
        memory_bw=V100_MEM_BW,
        peak_flops=V100_FLOPS_FP16_TENSOR,
        memory_capacity=V100_MEM_CAPACITY,
        tdp=V100_TDP,
        peak_flops_fp32=V100_FLOPS_FP32,
        dispatch_tax=0.02 * ureg.ms,
    )

    A100 = HardwareSpec(
        name="NVIDIA A100",
        release_year=2020,
        memory_bw=A100_MEM_BW,
        peak_flops=A100_FLOPS_FP16_TENSOR,
        memory_capacity=A100_MEM_CAPACITY,
        tdp=A100_TDP,
        peak_flops_fp32=A100_FLOPS_FP32,
        tf32_flops=A100_FLOPS_TF32,
        int8_flops=A100_FLOPS_INT8,
        dispatch_tax=0.015 * ureg.ms,
    )

    H100 = HardwareSpec(
        name="NVIDIA H100",
        release_year=2022,
        memory_bw=H100_MEM_BW,
        peak_flops=H100_FLOPS_FP16_TENSOR,
        memory_capacity=H100_MEM_CAPACITY,
        tdp=H100_TDP,
        tf32_flops=H100_FLOPS_TF32,
        fp8_flops=H100_FLOPS_FP8_TENSOR,
        int8_flops=H100_FLOPS_INT8,
        dispatch_tax=0.01 * ureg.ms,
    )

    B200 = HardwareSpec(
        name="NVIDIA B200",
        release_year=2024,
        memory_bw=B200_MEM_BW,
        peak_flops=B200_FLOPS_FP16_TENSOR,
        memory_capacity=B200_MEM_CAPACITY,
        tdp=B200_TDP,
        fp8_flops=B200_FLOPS_FP8_TENSOR,
        dispatch_tax=0.008 * ureg.ms,
    )

    # Memory capacity is given inline here rather than via a named constant.
    T4 = HardwareSpec(
        name="NVIDIA T4",
        release_year=2018,
        memory_bw=T4_MEM_BW,
        peak_flops=T4_FLOPS_FP16_TENSOR,
        memory_capacity=16 * ureg.GiB,
        tdp=T4_TDP,
        int8_flops=T4_FLOPS_INT8,
        dispatch_tax=0.03 * ureg.ms,
    )

    # No TDP is supplied for this entry, so `tdp` stays at its None default.
    TPUv4 = HardwareSpec(
        name="Google TPU v4",
        release_year=2021,
        memory_bw=TPUV4_MEM_BW,
        peak_flops=TPUV4_FLOPS_BF16,
        memory_capacity=32 * ureg.GiB,
        dispatch_tax=0.05 * ureg.ms,
    )
class Edge:
    """Mobile and Robotics Hardware."""

    # Battery-powered entry: carries a battery capacity instead of a TDP.
    Generic_Phone = HardwareSpec(
        name="Smartphone",
        release_year=2024,
        memory_bw=MOBILE_NPU_MEM_BW,
        peak_flops=MOBILE_NPU_TOPS_INT8,
        memory_capacity=8 * ureg.GiB,
        battery_capacity=15 * ureg.Wh,
        dispatch_tax=1.0 * ureg.ms,  # High OS overhead
    )

    # Specific Edge Devices
    Coral = HardwareSpec(
        name="Google Coral Dev",
        release_year=2019,
        memory_bw=25 * ureg.GB / ureg.s,
        peak_flops=4 * ureg.TFLOPs / ureg.s,
        memory_capacity=1 * ureg.GB,
        tdp=2 * ureg.W,
        dispatch_tax=0.5 * ureg.ms,
    )

    JetsonOrinNX = HardwareSpec(
        name="NVIDIA Jetson Orin NX",
        release_year=2023,
        memory_bw=102 * ureg.GB / ureg.s,
        peak_flops=100 * ureg.TFLOPs / ureg.s,
        memory_capacity=16 * ureg.GB,
        tdp=25 * ureg.W,
        dispatch_tax=0.2 * ureg.ms,
    )

    NUC_Movidius = HardwareSpec(
        name="Intel NUC + Movidius",
        release_year=2020,
        memory_bw=50 * ureg.GB / ureg.s,
        peak_flops=4 * ureg.TFLOPs / ureg.s,
        memory_capacity=16 * ureg.GB,
        tdp=15 * ureg.W,
        dispatch_tax=2.0 * ureg.ms,
    )

    # Servers
    GenericServer = HardwareSpec(
        name="Edge Server",
        release_year=2024,
        memory_bw=100 * ureg.GB / ureg.s,
        peak_flops=1 * ureg.TFLOPs / ureg.s,
        memory_capacity=128 * ureg.GB,
        tdp=300 * ureg.W,
        dispatch_tax=0.1 * ureg.ms,
    )
class Tiny:
    """Microcontrollers and Embedded Systems."""

    # Original author's estimate: an ESP32 at 240MHz is ~240 MIPS, and for AI
    # math without an FPU roughly 100-200 MFLOPS (0.0001-0.0002 TFLOPS); the
    # upper end of that range is used below.
    ESP32 = HardwareSpec(
        name="ESP32-CAM",
        release_year=2019,
        memory_bw=0.1 * ureg.GB / ureg.second,
        peak_flops=0.0002 * ureg.TFLOPs / ureg.second,
        memory_capacity=ESP32_RAM,
        tdp=ESP32_POWER_MAX,
        dispatch_tax=5.0 * ureg.ms,  # Very high overhead relative to math
    )

    # No TDP is supplied for this entry, so `tdp` stays at its None default.
    Generic_MCU = HardwareSpec(
        name="Cortex-M7",
        release_year=2020,
        memory_bw=0.05 * ureg.GB / ureg.second,
        peak_flops=0.001 * ureg.TFLOPs / ureg.second,
        memory_capacity=MCU_RAM_KIB,
        dispatch_tax=2.0 * ureg.ms,
    )
class Hardware:
    """Single access point exposing the hardware tiers, networks and common aliases."""

    # Tier namespaces, re-exported under their own names.
    Cloud = Cloud
    Edge = Edge
    Tiny = Tiny
    Networks = Networks

    # Aliases for the most common ones
    V100 = Cloud.V100
    A100 = Cloud.A100
    H100 = Cloud.H100
    B200 = Cloud.B200
    TPUv4 = Cloud.TPUv4
    ESP32 = Tiny.ESP32