mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-04-29 17:20:21 -05:00
132 lines
6.2 KiB
Python
132 lines
6.2 KiB
Python
# hardware.py
|
|
# Hierarchical Hardware Definitions for MLSys Textbook
|
|
|
|
import pint
|
|
from dataclasses import dataclass
|
|
from typing import Optional, Tuple
|
|
from .constants import (
|
|
ureg, Q_,
|
|
V100_MEM_BW, V100_FLOPS_FP16_TENSOR, V100_MEM_CAPACITY, V100_TDP, V100_FLOPS_FP32,
|
|
A100_MEM_BW, A100_FLOPS_FP16_TENSOR, A100_MEM_CAPACITY, A100_TDP, A100_FLOPS_FP32, A100_FLOPS_TF32, A100_FLOPS_INT8,
|
|
H100_MEM_BW, H100_FLOPS_FP16_TENSOR, H100_MEM_CAPACITY, H100_TDP, H100_FLOPS_TF32, H100_FLOPS_FP8_TENSOR, H100_FLOPS_INT8,
|
|
B200_MEM_BW, B200_FLOPS_FP16_TENSOR, B200_MEM_CAPACITY, B200_TDP, B200_FLOPS_FP8_TENSOR,
|
|
T4_MEM_BW, T4_FLOPS_FP16_TENSOR, T4_TDP, T4_FLOPS_INT8,
|
|
TPUV4_MEM_BW, TPUV4_FLOPS_BF16,
|
|
MOBILE_NPU_MEM_BW, MOBILE_NPU_TOPS_INT8,
|
|
ESP32_RAM, ESP32_FLASH, ESP32_POWER_MAX,
|
|
MCU_RAM_KIB,
|
|
NETWORK_10G_BW, NETWORK_100G_BW
|
|
)
|
|
|
|
@dataclass(frozen=True)
class HardwareSpec:
    """Immutable description of one hardware platform.

    The required quantities (``memory_bw``, ``peak_flops``,
    ``memory_capacity``) and ``dispatch_tax`` are checked at construction
    time: each must be unit-compatible with its expected unit and have a
    strictly positive magnitude. ``tdp`` and ``battery_capacity`` are
    checked the same way whenever they are provided (i.e. not ``None``).
    The precision-specific FLOPS fields are stored as given and are not
    validated here.

    Raises:
        pint.DimensionalityError: a checked quantity is not compatible
            with its expected unit.
        ValueError: a checked quantity has a magnitude <= 0.
    """

    name: str
    release_year: int
    memory_bw: Q_
    peak_flops: Q_  # Usually FP16 Tensor for AI accelerators
    memory_capacity: Q_
    tdp: Optional[Q_] = None
    battery_capacity: Optional[Q_] = None
    # NOTE(review): presumably a fixed per-dispatch overhead — confirm with callers.
    dispatch_tax: Q_ = 0.01 * ureg.ms  # Default 10us

    # Precision-specific FLOPS
    peak_flops_fp32: Optional[Q_] = None
    tf32_flops: Optional[Q_] = None
    fp8_flops: Optional[Q_] = None
    int8_flops: Optional[Q_] = None

    def __post_init__(self):
        """Validate hardware specs: correct dimension type first, then positive value."""

        def _validate(qty, name, target_unit, dim_desc):
            # Unit compatibility is checked before the sign so that a value
            # in the wrong unit is reported as a unit error, not a range error.
            if not qty.is_compatible_with(target_unit):
                raise pint.DimensionalityError(
                    qty.units, target_unit,
                    extra_msg=f" — {self.name}.{name} must be {dim_desc}, got {qty.units}"
                )
            if qty.magnitude <= 0:
                raise ValueError(f"{self.name}.{name} must be positive, got {qty}")

        _validate(self.memory_bw, "memory_bw", ureg.byte/ureg.second, "data/time (e.g. GB/s)")
        _validate(self.peak_flops, "peak_flops", ureg.flop/ureg.second, "compute rate (e.g. TFLOPs/s)")
        _validate(self.memory_capacity, "memory_capacity", ureg.byte, "data size (e.g. GiB)")
        _validate(self.dispatch_tax, "dispatch_tax", ureg.second, "time (e.g. ms)")

        # Compare against None explicitly: a zero-magnitude quantity is falsy,
        # so a plain truthiness test would let e.g. a 0 W TDP skip the
        # positivity check instead of being rejected.
        if self.tdp is not None:
            _validate(self.tdp, "tdp", ureg.watt, "power (e.g. W)")
        if self.battery_capacity is not None:
            _validate(self.battery_capacity, "battery_capacity", ureg.joule, "energy (e.g. Wh or J)")

    def ridge_point(self) -> Q_:
        """Calculates the Roofline ridge point (Intensity threshold).

        Returns:
            ``peak_flops / memory_bw`` converted to ``flop/byte``.
        """
        # FLOPS / BW = Ops/Byte
        return (self.peak_flops / self.memory_bw).to('flop/byte')

    def __repr__(self):
        """Return a short label showing only the name and release year."""
        return f"Hardware({self.name}, {self.release_year})"
|
|
|
|
@dataclass(frozen=True)
class NetworkSpec:
    """Immutable record pairing a network link's label with its bandwidth.

    Neither field is validated on construction.
    """

    # Human-readable label for the link.
    name: str
    # Link bandwidth as a unit-carrying quantity.
    bandwidth: Q_
|
|
|
|
class Networks:
    """Namespace of predefined network link specifications."""

    Ethernet_10G = NetworkSpec(name="10GbE", bandwidth=NETWORK_10G_BW)
    Ethernet_100G = NetworkSpec(name="100GbE", bandwidth=NETWORK_100G_BW)
|
|
|
|
class Cloud:
    """Datacenter-scale accelerators.

    Each attribute is a HardwareSpec; the value passed as ``peak_flops`` is
    the FP16 tensor figure for the NVIDIA parts and the BF16 figure for the
    TPU, as the constant names indicate.
    """

    V100 = HardwareSpec(
        name="NVIDIA V100",
        release_year=2017,
        memory_bw=V100_MEM_BW,
        peak_flops=V100_FLOPS_FP16_TENSOR,
        memory_capacity=V100_MEM_CAPACITY,
        tdp=V100_TDP,
        dispatch_tax=0.02 * ureg.ms,
        peak_flops_fp32=V100_FLOPS_FP32,
    )
    A100 = HardwareSpec(
        name="NVIDIA A100",
        release_year=2020,
        memory_bw=A100_MEM_BW,
        peak_flops=A100_FLOPS_FP16_TENSOR,
        memory_capacity=A100_MEM_CAPACITY,
        tdp=A100_TDP,
        dispatch_tax=0.015 * ureg.ms,
        peak_flops_fp32=A100_FLOPS_FP32,
        tf32_flops=A100_FLOPS_TF32,
        int8_flops=A100_FLOPS_INT8,
    )
    H100 = HardwareSpec(
        name="NVIDIA H100",
        release_year=2022,
        memory_bw=H100_MEM_BW,
        peak_flops=H100_FLOPS_FP16_TENSOR,
        memory_capacity=H100_MEM_CAPACITY,
        tdp=H100_TDP,
        dispatch_tax=0.01 * ureg.ms,
        tf32_flops=H100_FLOPS_TF32,
        fp8_flops=H100_FLOPS_FP8_TENSOR,
        int8_flops=H100_FLOPS_INT8,
    )
    B200 = HardwareSpec(
        name="NVIDIA B200",
        release_year=2024,
        memory_bw=B200_MEM_BW,
        peak_flops=B200_FLOPS_FP16_TENSOR,
        memory_capacity=B200_MEM_CAPACITY,
        tdp=B200_TDP,
        dispatch_tax=0.008 * ureg.ms,
        fp8_flops=B200_FLOPS_FP8_TENSOR,
    )
    # Memory capacity is written as a literal here rather than taken from an
    # imported constant.
    T4 = HardwareSpec(
        name="NVIDIA T4",
        release_year=2018,
        memory_bw=T4_MEM_BW,
        peak_flops=T4_FLOPS_FP16_TENSOR,
        memory_capacity=16 * ureg.GiB,
        tdp=T4_TDP,
        dispatch_tax=0.03 * ureg.ms,
        int8_flops=T4_FLOPS_INT8,
    )

    # No TDP is supplied for this entry, so ``tdp`` stays None.
    TPUv4 = HardwareSpec(
        name="Google TPU v4",
        release_year=2021,
        memory_bw=TPUV4_MEM_BW,
        peak_flops=TPUV4_FLOPS_BF16,
        memory_capacity=32 * ureg.GiB,
        dispatch_tax=0.05 * ureg.ms,
    )
|
|
|
|
class Edge:
    """Mobile and robotics hardware, plus a generic edge server."""

    # The INT8 TOPS constant is used as this entry's peak compute rate.
    Generic_Phone = HardwareSpec(
        name="Smartphone",
        release_year=2024,
        memory_bw=MOBILE_NPU_MEM_BW,
        peak_flops=MOBILE_NPU_TOPS_INT8,
        memory_capacity=8 * ureg.GiB,
        battery_capacity=15 * ureg.Wh,
        dispatch_tax=1.0 * ureg.ms,  # High OS overhead
    )

    # Specific Edge Devices
    Coral = HardwareSpec(
        name="Google Coral Dev",
        release_year=2019,
        memory_bw=25 * ureg.GB / ureg.s,
        peak_flops=4 * ureg.TFLOPs / ureg.s,
        memory_capacity=1 * ureg.GB,
        tdp=2 * ureg.W,
        dispatch_tax=0.5 * ureg.ms,
    )
    JetsonOrinNX = HardwareSpec(
        name="NVIDIA Jetson Orin NX",
        release_year=2023,
        memory_bw=102 * ureg.GB / ureg.s,
        peak_flops=100 * ureg.TFLOPs / ureg.s,
        memory_capacity=16 * ureg.GB,
        tdp=25 * ureg.W,
        dispatch_tax=0.2 * ureg.ms,
    )
    NUC_Movidius = HardwareSpec(
        name="Intel NUC + Movidius",
        release_year=2020,
        memory_bw=50 * ureg.GB / ureg.s,
        peak_flops=4 * ureg.TFLOPs / ureg.s,
        memory_capacity=16 * ureg.GB,
        tdp=15 * ureg.W,
        dispatch_tax=2.0 * ureg.ms,
    )

    # Servers
    GenericServer = HardwareSpec(
        name="Edge Server",
        release_year=2024,
        memory_bw=100 * ureg.GB / ureg.s,
        peak_flops=1 * ureg.TFLOPs / ureg.s,
        memory_capacity=128 * ureg.GB,
        tdp=300 * ureg.W,
        dispatch_tax=0.1 * ureg.ms,
    )
|
|
|
|
class Tiny:
    """Microcontrollers and embedded systems."""

    # ESP32 at 240MHz is ~240 MIPS, for AI math without FPU it's roughly
    # 100-200 MFLOPS (0.0001-0.0002 TFLOPS); the upper figure is used below.
    ESP32 = HardwareSpec(
        name="ESP32-CAM",
        release_year=2019,
        memory_bw=0.1 * ureg.GB / ureg.second,
        peak_flops=0.0002 * ureg.TFLOPs / ureg.second,
        memory_capacity=ESP32_RAM,
        tdp=ESP32_POWER_MAX,
        dispatch_tax=5.0 * ureg.ms,  # Very high overhead relative to math
    )
    # No TDP is supplied for this entry, so ``tdp`` stays None.
    Generic_MCU = HardwareSpec(
        name="Cortex-M7",
        release_year=2020,
        memory_bw=0.05 * ureg.GB / ureg.second,
        peak_flops=0.001 * ureg.TFLOPs / ureg.second,
        memory_capacity=MCU_RAM_KIB,
        dispatch_tax=2.0 * ureg.ms,
    )
|
|
|
|
class Hardware:
    """Single entry point exposing every hardware and network catalog.

    The category classes are re-exported as attributes, and a few
    frequently used specs are additionally reachable directly, e.g.
    ``Hardware.A100`` is the same object as ``Hardware.Cloud.A100``.
    """

    # Category namespaces
    Cloud = Cloud
    Edge = Edge
    Tiny = Tiny
    Networks = Networks

    # Aliases for the most common ones
    V100 = Cloud.V100
    A100 = Cloud.A100
    H100 = Cloud.H100
    B200 = Cloud.B200
    TPUv4 = Cloud.TPUv4
    ESP32 = Tiny.ESP32
|