From 2de66f1c0fff4e195984bdec1e85f208ff4f87ba Mon Sep 17 00:00:00 2001 From: Vijay Janapa Reddi Date: Wed, 25 Feb 2026 08:31:21 -0500 Subject: [PATCH] refactor: complete Gold Standard audit for core foundation chapters; unify Volume 1 and Volume 2 math; verify physical realism of hardware constants --- .references_verified.json | 645 +++++++++++++++++- book/cli/commands/reference_check.py | 45 ++ book/quarto/_quarto.yml | 2 +- .../contents/vol1/backmatter/references.bib | 73 +- .../contents/vol2/backmatter/references.bib | 12 +- .../vol2/data_storage/data_storage.qmd | 108 +-- book/quarto/mlsys/constants.py | 2 +- storage_cell.py | 142 ++++ tinytorch/src/01_tensor/01_tensor.py | 10 +- 9 files changed, 956 insertions(+), 83 deletions(-) create mode 100644 storage_cell.py diff --git a/.references_verified.json b/.references_verified.json index cdeb9476c..d7f0fb146 100644 --- a/.references_verified.json +++ b/.references_verified.json @@ -2,26 +2,661 @@ "10.1109/ICRA.2017.7989092": { "status": "verified", "source": "DOI", - "date": "2026-02-25T00:58:05.438076+00:00" + "date": "2026-02-25T13:18:39.259726+00:00" }, "abadi2016tensorflow": { "status": "verified", "source": "arXiv", - "date": "2026-02-25T00:58:05.438076+00:00" + "date": "2026-02-25T13:18:39.259726+00:00" }, "abiresearch2024tinyml": { "status": "not_found", "source": "", - "date": "2026-02-25T00:58:21.926716+00:00" + "date": "2026-02-25T13:18:39.259726+00:00" }, "adamson2018dermatology": { "status": "verified", "source": "CrossRef", - "date": "2026-02-25T00:58:21.926716+00:00" + "date": "2026-02-25T13:18:39.259726+00:00" }, "agile_manifesto": { + "status": "author_mismatch", + "source": "Semantic Scholar", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "ansible": { "status": "not_found", "source": "", - "date": "2026-02-25T00:58:21.926716+00:00" + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "anylogic_synthetic": { + "status": "not_found", + "source": "", + "date": 
"2026-02-25T13:18:39.259726+00:00" + }, + "arm_bf16alt": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "ARM2020": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "aws": { + "status": "author_mismatch", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "aws_s3": { + "status": "author_mismatch", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bai2019onnx": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "beazley2010understanding": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "ben2019cost": { + "status": "verified", + "source": "DBLP", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "brutlag2009speed": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "brynjolfsson2014second": { + "status": "author_mismatch", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "cerebras_website": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "Cerebras2021": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "Cerebras2021wse2": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chapman2000crisp": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "choi2020dataechoing": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "circleci": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "cntk_website": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "crowdflower2016data": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + 
"dbt": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "deepbench_github": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "deepspeed_training_system_2021": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "dehghani2022data": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "discord2020rust": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "domingos2015master": { + "status": "author_mismatch", + "source": "DOI", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "dvc": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "elastic": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "farmbeats_website": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "fastai_website": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "fayyad1996kdd": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "fda2021artificial": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "Feldman2020": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "fisher_8087_1981": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "gartner2024cloud": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "google_bfloat16": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "google_cloud": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "google_crowdsource": { + "status": "not_found", + "source": "", + 
"date": "2026-02-25T13:17:46.755488+00:00" + }, + "google_litert": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "google2024gemini": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "google2024staticdynamic": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "GoogleXLA": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "Graphcore2020": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "gu2023deep": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "gudivada2017data": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "guo2019mobile": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "harvard_law_chatgpt": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "hermann2017meet": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "hydra": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "ibm_data_drift": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:17:46.755488+00:00" + }, + "agrawal2024sarathi": { + "status": "verified", + "source": "arXiv", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "al2016theano": { + "status": "error", + "source": "validator crash", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "alexnet2012": { + "status": "verified", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "amdahl1967validity": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "amershi2019software": { + "status": "verified", + "source": "CrossRef", + "date": 
"2026-02-25T13:18:39.259726+00:00" + }, + "andrae2015global": { + "status": "verified", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "angluin1988queries": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "annette2020": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "ansel2024pytorch2": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "ardila2020common": { + "status": "verified", + "source": "DBLP", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "armbrust2021lakehouse": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "attia2018noninvasive": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "aws_sagemaker": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "ba2014deep": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bahdanau2014neural": { + "status": "verified", + "source": "DBLP", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "banbury2020benchmarking": { + "status": "verified", + "source": "DBLP", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "banbury2021micronets": { + "status": "verified", + "source": "DBLP", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "banbury2024wakevisiontailoreddataset": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "barocas2016big": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "barroso2007energy": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "barroso2017attack": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "barroso2019datacenter": { + "status": "verified", 
+ "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "baydin2018": { + "status": "verified", + "source": "DBLP", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "belkin2019reconciling": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bellamy2019aif360": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "Bellec2018": { + "status": "verified", + "source": "DBLP", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "Ben-Nun2019data": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bender2018data": { + "status": "verified", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bender2021dangers": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bengio2013estimating": { + "status": "verified", + "source": "DBLP", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bengio2013representation": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bengio2015conditional": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bergstra2010theano": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "Bergstra2011": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "beyer2016sre": { + "status": "author_mismatch", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "beyer2020we": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bird2020fairlearn": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bishop2006pattern": { + "status": "author_mismatch", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "blake2010evolution": { + 
"status": "verified", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "blalock2020state": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "blas_netlib": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bobrow1964student": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bommasani2021opportunities": { + "status": "author_mismatch", + "source": "Europe PMC", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "BoroumandASPLOS2018": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "breck2019data": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "breck2020ml": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "brewer2000towards": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "bridle1990probabilistic": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "broder1997resemblance": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "brown2020language": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "buolamwini2018gender": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "caffe_website": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "cai2018proxylessnas": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "Cai2020": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chapelle2009semisupervised": { + "status": "verified", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + 
"chen2015mxnet": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "Chen2016": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chen2016eyeriss": { + "status": "verified", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chen2016training": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chen2017machine": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chen2018tvm": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chen2020mocov2": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chen2020reinforcement": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chen2020simclr": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chen2021evaluating": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chen2023framework": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chen2024eellm": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chetlur2014cudnn": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "cho2014properties": { + "status": "verified", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "Choi2019": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chollet2018keras": { + "status": "author_mismatch", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "choudhary2020comprehensive": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "choukroun2019low": 
{ + "status": "verified", + "source": "CrossRef", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chouldechova2017fair": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chowdhery2022palm": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "chu2021discovering": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "cifar10_website": { + "status": "not_found", + "source": "", + "date": "2026-02-25T13:18:39.259726+00:00" + }, + "clark2019what": { + "status": "verified", + "source": "DOI", + "date": "2026-02-25T13:18:39.259726+00:00" } } \ No newline at end of file diff --git a/book/cli/commands/reference_check.py b/book/cli/commands/reference_check.py index 7ea322000..1cdeb51cf 100644 --- a/book/cli/commands/reference_check.py +++ b/book/cli/commands/reference_check.py @@ -19,6 +19,7 @@ import time import unicodedata from datetime import datetime, timezone from pathlib import Path +from urllib.parse import quote_plus from types import SimpleNamespace from typing import Any, Dict, List, Optional, Tuple @@ -205,6 +206,24 @@ def _save_cache(cache_path: Path, updates: Dict[str, dict]) -> None: json.dump(existing, f, indent=2) +def parse_report_keys(report_path: Path) -> List[str]: + """ + Parse a reference-check report and return citation keys from + "Not found" and "Author mismatch" sections (for re-validating only those). 
+ """ + keys: List[str] = [] + pattern = re.compile(r"^\s+\[([^\]]+)\]\s") + try: + with open(report_path, encoding="utf-8") as f: + for line in f: + m = pattern.match(line) + if m: + keys.append(m.group(1)) + except OSError: + pass + return keys + + def run( bib_paths: List[Path], *, @@ -216,6 +235,7 @@ def run( cache_path: Optional[Path] = None, skip_verified: bool = False, thorough: bool = False, + only_keys: Optional[List[str]] = None, ) -> Tuple[bool, int, List[dict], int]: """ Load .bib files, validate refs against academic DBs, optionally write report. @@ -223,6 +243,7 @@ def run( cache_path: if set, read/write verification cache (key -> {status, source, date}). skip_verified: only validate refs not already verified in cache (ignored if thorough). thorough: revalidate all refs and ignore cache for filtering. + only_keys: if set, validate only these citation keys (e.g. from a previous report). Returns: (passed, elapsed_ms, issues, ref_count) @@ -261,6 +282,16 @@ def run( if dedupe: all_refs = _dedupe_refs(all_refs) + + # Restrict to a subset of keys (e.g. 
from --only-from-report or --only-keys) before limit + if only_keys is not None: + allowed = set(only_keys) + all_refs = [(k, r) for k, r in all_refs if k in allowed] + if not all_refs and console: + console.print(f"[yellow]No .bib entries matched the {len(allowed)} key(s) to check.[/yellow]") + if not all_refs: + return True, int((time.time() - t0) * 1000), [], 0 + if limit is not None: all_refs = all_refs[:limit] @@ -369,6 +400,20 @@ def run( f.write("\nError (validator crash or timeout):\n") for key, r in err_list: f.write(f" [{key}] {r.title}\n") + # Companion file: one search link per not_verified entry (search online to fix .bib) + not_verified = [(k, r) for k, r in zip(keys, results) if r.status in ("not_found", "author_mismatch", "error")] + if not_verified: + search_links_path = output_path.with_suffix(output_path.suffix + ".search-links.txt") + with open(search_links_path, "w", encoding="utf-8") as sl: + sl.write("# key\ttitle\tGoogle Scholar search URL (search online, then update .bib with DOI/arXiv or authors)\n") + for key, r in zip(keys, results): + if r.status in ("not_found", "author_mismatch", "error"): + title = (r.title or "").strip() + q = quote_plus(title) + url = f"https://scholar.google.com/scholar?q={q}" + sl.write(f"{key}\t{title}\t{url}\n") + if console: + console.print(f"Search links written to {search_links_path} (open each URL to search online, then update .bib)") if console: console.print(f"\nReport written to {output_path}") diff --git a/book/quarto/_quarto.yml b/book/quarto/_quarto.yml index bf839f351..2b580bc6d 120000 --- a/book/quarto/_quarto.yml +++ b/book/quarto/_quarto.yml @@ -1 +1 @@ -config/_quarto-pdf-vol2.yml \ No newline at end of file +config/_quarto-html-vol1.yml \ No newline at end of file diff --git a/book/quarto/contents/vol1/backmatter/references.bib b/book/quarto/contents/vol1/backmatter/references.bib index 0434c4e18..75be4bd35 100644 --- a/book/quarto/contents/vol1/backmatter/references.bib +++ 
b/book/quarto/contents/vol1/backmatter/references.bib @@ -1521,7 +1521,12 @@ and Dahl, George E }, year = {2020}, - booktitle = {International Conference on Machine Learning}, + booktitle = {Proceedings of the 37th International Conference on Machine Learning}, + volume = {119}, + pages = {1925--1934}, + publisher = {PMLR}, + eprint = {1907.05550}, + archiveprefix = {arXiv}, url = {https://proceedings.mlr.press/v119/choi20a.html}, } @@ -2582,7 +2587,7 @@ volume = {17}, number = {3}, pages = {37--54}, - doi = {10.1609/AIMAG.V17I3.1230}, + doi = {10.1609/aimag.v17i3.1230}, url = {https://doi.org/10.1609/aimag.v17i3.1230}, source = {DBLP}, } @@ -5336,12 +5341,16 @@ archiveprefix = {arXiv}, } -@misc{lime_github, - title = {LIME (Local Interpretable Model-Agnostic Explanations)}, +@inproceedings{lime_github, + title = {"Why Should I Trust You?": Explaining the Predictions of Any Classifier}, author = {Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos}, year = {2016}, - url = {https://github.com/marcotcr/lime}, - note = {Accessed: 2024-01-15}, + booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, + pages = {1135--1144}, + publisher = {ACM}, + doi = {10.1145/2939672.2939778}, + url = {https://doi.org/10.1145/2939672.2939778}, + note = {LIME; code: https://github.com/marcotcr/lime}, } @incollection{lin2014microsoft, @@ -7091,11 +7100,13 @@ organization = {PMLR}, } -@article{radford2018improving, +@techreport{radford2018improving, title = {Improving language understanding by generative pre-training}, author = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya}, year = {2018}, - publisher = {OpenAI}, + institution = {OpenAI}, + url = {https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf}, + note = {OpenAI technical report; no DOI or arXiv}, } @inproceedings{radford2021learning, @@ -7756,23 +7767,18 @@ essn = {1548-7105}, } 
-@incollection{sculley2015hidden, - title = {Technical Debt in Machine Learning Systems}, +@inproceedings{sculley2015hidden, + title = {Hidden Technical Debt in Machine Learning Systems}, author = { Sculley, D. and Holt, Gary and Golovin, Daniel and Davydov, Eugene and Phillips, Todd and Ebner, Dietmar and Chaudhary, Vinay and Young, Michael and Crespo, Jean-Fran\c{c}ois and Dennison, Dan }, - journal = {Advances in Neural Information Processing Systems}, - booktitle = {Technical Debt in Practice}, - publisher = {The MIT Press}, + year = {2015}, + booktitle = {Advances in Neural Information Processing Systems}, volume = {28}, - pages = {177--192}, - doi = {10.7551/mitpress/12440.003.0011}, - isbn = {9780262366304}, - url = {https://doi.org/10.7551/mitpress/12440.003.0011}, - source = {Crossref}, - date = {2021-08-17}, + pages = {2503--2511}, + url = {https://proceedings.neurips.cc/paper/2015/hash/86df7dcfd896fcaf2674f757a2463eba-Abstract.html}, } @online{sec2013knight, @@ -8023,12 +8029,16 @@ date = {2012-01-02}, } -@phdthesis{sifre2014rigid, +@article{sifre2014rigid, title = {Rigid-Motion Scattering for Image Classification}, - author = {Sifre, Laurent}, + author = {Sifre, Laurent and Mallat, St\'{e}phane}, year = {2014}, - url = {https://www.di.ens.fr/data/publications/papers/phd_sifre.pdf}, - school = {\'{E}cole Polytechnique, CMAP}, + journal = {arXiv preprint arXiv:1403.1687}, + eprint = {1403.1687}, + archiveprefix = {arXiv}, + doi = {10.48550/arXiv.1403.1687}, + url = {https://arxiv.org/abs/1403.1687}, + note = {PhD work, \'{E}cole Polytechnique, CMAP}, } @article{silver2016mastering, @@ -8534,12 +8544,17 @@ url = {https://www.usenix.org/conference/osdi16/technical-sessions/presentation/abadi}, } -@techreport{tensorflow_data_2015, - title = {TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems}, - author = {Abadi, Mart\'{\i}n and Agarwal, Ashish and Barham, Paul and others}, - year = {2015}, - note = {Available at }, - institution = {Google 
Brain}, +@article{tensorflow_data_2015, + title = {TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems}, + author = {Abadi, Mart\'{\i}n and Agarwal, Ashish and Barham, Paul and Brevdo, Eugene and others}, + year = {2016}, + month = mar, + journal = {arXiv preprint arXiv:1603.04467}, + eprint = {1603.04467}, + archiveprefix = {arXiv}, + doi = {10.48550/arXiv.1603.04467}, + url = {https://arxiv.org/abs/1603.04467}, + note = {Preliminary white paper, November 2015; software: https://www.tensorflow.org/}, } @article{tensorflow_serving, diff --git a/book/quarto/contents/vol2/backmatter/references.bib b/book/quarto/contents/vol2/backmatter/references.bib index c1bb3e89b..18c153ef9 100644 --- a/book/quarto/contents/vol2/backmatter/references.bib +++ b/book/quarto/contents/vol2/backmatter/references.bib @@ -286,6 +286,8 @@ year = {2016}, month = jun, journal = {arXiv preprint arXiv:1606.06565}, + doi = {10.48550/arXiv.1606.06565}, + eprint = {1606.06565}, url = {http://arxiv.org/abs/1606.06565v2}, primaryclass = {cs.AI}, archiveprefix = {arXiv}, @@ -1788,6 +1790,8 @@ year = {2019}, month = nov, journal = {arXiv preprint arXiv:1911.01547}, + eprint = {1911.01547}, + doi = {10.48550/arXiv.1911.01547}, url = {http://arxiv.org/abs/1911.01547v2}, primaryclass = {cs.AI}, archiveprefix = {arXiv}, @@ -7877,6 +7881,8 @@ year = {2017}, month = jul, journal = {arXiv preprint arXiv:1707.06347}, + doi = {10.48550/arXiv.1707.06347}, + eprint = {1707.06347}, url = {http://arxiv.org/abs/1707.06347v2}, primaryclass = {cs.LG}, archiveprefix = {arXiv}, @@ -8046,9 +8052,11 @@ Shafahi, Ali and Najibi, Mahyar and Ghiasi, Amin and Xu, Zheng and Dickerson, John and Studer, Christoph and Davis, Larry S. 
and Taylor, Gavin and Goldstein, Tom }, + year = {2019}, journal = {arXiv preprint arXiv:1904.12843}, + eprint = {1904.12843}, + doi = {10.48550/arXiv.1904.12843}, url = {http://arxiv.org/abs/1904.12843v2}, - date = {2019-04-29}, primaryclass = {cs.LG}, archiveprefix = {arXiv}, } @@ -8262,6 +8270,7 @@ year = {2010}, number = {dapper-2010-1}, institution = {Google}, + url = {https://research.google/pubs/dapper-a-large-scale-distributed-systems-tracing-infrastructure/}, } @article{silva2019federated, @@ -9516,6 +9525,7 @@ year = {2022}, month = jun, journal = {Transactions on Machine Learning Research}, + doi = {10.48550/arXiv.2206.07682}, url = {http://arxiv.org/abs/2206.07682v2}, primaryclass = {cs.CL}, archiveprefix = {arXiv}, diff --git a/book/quarto/contents/vol2/data_storage/data_storage.qmd b/book/quarto/contents/vol2/data_storage/data_storage.qmd index 6a030ba32..26b1ec899 100644 --- a/book/quarto/contents/vol2/data_storage/data_storage.qmd +++ b/book/quarto/contents/vol2/data_storage/data_storage.qmd @@ -18,7 +18,7 @@ engine: jupyter # └───────────────────────────────────────────────────────────────────────────── from mlsys.registry import start_chapter from mlsys.constants import ( - GB, TB, PB, Gbps, byte, second, GB, MB, + GB, TB, PB, Gbps, byte, second, MB, BILLION, TRILLION, SEC_PER_HOUR, SEC_PER_DAY, BITS_PER_BYTE, KIB_TO_BYTES, A100_MEM_CAPACITY, H100_MEM_CAPACITY, H100_MEM_BW, H100_FLOPS_FP8_TENSOR, H100_FLOPS_FP16_TENSOR, H100_TDP, @@ -96,10 +96,11 @@ from mlsys.constants import ( H100_MEM_BW, H100_FLOPS_FP16_TENSOR, NVME_SEQUENTIAL_BW, GPT3_PARAMS, BILLION, MILLION, TRILLION, THOUSAND, GB, TB, second, byte, flop, USD, kilowatt, hour, - GPUS_PER_HOST, SEC_PER_MONTH, SEC_PER_YEAR, - H100_MEM_CAPACITY, BYTES_PER_FP16, BYTES_PER_FP32, - MICROSECOND, GB, Mparam, RESNET50_PARAMS, - NVLINK_H100_BW, PCIE_GEN5_BW + GPUS_PER_HOST, SEC_PER_DAY, SEC_PER_YEAR, + H100_MEM_CAPACITY, BYTES_FP16, BYTES_FP32, + Mparam, RESNET50_PARAMS, + NVLINK_H100_BW, 
PCIE_GEN5_BW, + A100_MEM_CAPACITY, H100_FLOPS_FP8_TENSOR, H100_TDP ) from mlsys.formatting import fmt, check, md import math @@ -108,83 +109,76 @@ import math class StorageHierarchyAnalysis: """ Namespace for global storage hierarchy and pipeline calculations. - Hierarchy: - - Level 1: Accelerator & Media Specs (The Silicon Contract) - - Level 2: Workload Profiles (Archetype A/B) - - Level 3: Pipeline Tiers (Node, Rack, PFS, Object) - - Level 4: Efficiency Metrics (Stall %, MFU impact) """ # ┌── 1. LOAD (Constants) ─────────────────────────────────────────────── - # Level 1: Media & Accelerator h100_bw = H100_MEM_BW h100_flops_fp16 = H100_FLOPS_FP16_TENSOR h100_cap = H100_MEM_CAPACITY - nvme_bw_raw = NVME_SEQUENTIAL_BW # ~7 GB/s - pfs_node_bw = (4.0 * GB / second) # Typical per-node PFS share - s3_bw = (1.0 * GB / second) # Typical single-stream object BW + nvme_bw_raw = NVME_SEQUENTIAL_BW + pfs_node_bw = (4.0 * GB / second) + s3_bw = (1.0 * GB / second) - # Level 2: Workload (Archetype A: 175B LLM) gpt3_params = GPT3_PARAMS.m_as('param') - batch_tokens_gpu = 4096 t_step_ms = 200 - # ImageNet Scenario n_gpus_image = 256 - img_size = 150 * THOUSAND # 150 KB + img_size = 150 * THOUSAND batch_img_gpu = 256 util_target = 0.80 - # Cost Parameters dataset_size_tb = 100 cost_s3_gb_mo = 0.02 cost_nvme_gb_mo = 0.10 cost_glacier_gb_mo = 0.004 cost_egress_gb = 0.09 - # Tail Latency Scenario n_tail_servers = 100 p_tail_fail = 0.01 # ┌── 2. 
EXECUTE (The Compute) ───────────────────────────────────────── - # ImageNet Pipeline Equation (Level 4) - # B = N * U * (S_batch / T_iter) t_step_s = t_step_ms / 1000 - req_bw_imagenet_val = (n_gpus_image * util_target * (batch_img_gpu * img_size)) / t_step_s # bytes/s + req_bw_imagenet_val = (n_gpus_image * util_target * (batch_img_gpu * img_size)) / t_step_s req_bw_imagenet_gbs = req_bw_imagenet_val / BILLION - # Data Stall (Level 4) t_comp_val = 200 t_io_val = 250 stall_max_t = max(t_comp_val, t_io_val) data_stall_pct_val = ((stall_max_t - t_comp_val) / stall_max_t) * 100 - # Checkpoint Storm (Archetype A) - # Weights (FP16) + Optimizer (FP32 x 2) - bytes_per_param_ckpt = BYTES_PER_FP16 + (2 * BYTES_PER_FP32) + bytes_per_param_ckpt = BYTES_FP16.m_as(byte) + (2 * BYTES_FP32.m_as(byte)) ckpt_total_gb_val = (gpt3_params * bytes_per_param_ckpt) / BILLION - # Per-node shard (assume 256 nodes) n_nodes = 256 node_shard_gb = ckpt_total_gb_val / n_nodes - ckpt_nvme_s_val = node_shard_gb / (4 * nvme_bw_raw.m_as(GB/second)) # 4-drive RAID + ckpt_nvme_s_val = node_shard_gb / (4 * nvme_bw_raw.m_as(GB/second)) ckpt_pfs_s_val = node_shard_gb / pfs_node_bw.m_as(GB/second) + ckpt_weights_gb = (gpt3_params * 2) / BILLION + ckpt_optimizer_gb = (gpt3_params * 8) / BILLION - # Economics s3_annual_val = dataset_size_tb * 1000 * cost_s3_gb_mo * 12 nvme_annual_val = dataset_size_tb * 1000 * cost_nvme_gb_mo * 12 glacier_annual_val = dataset_size_tb * 1000 * cost_glacier_gb_mo * 12 tier_cost_ratio_val = cost_nvme_gb_mo / cost_s3_gb_mo egress_100tb_val = dataset_size_tb * 1000 * cost_egress_gb - # Tail Latency Physics prob_tail_all = (1.0 - p_tail_fail) ** n_tail_servers + images_per_sec = 1000 + raw_bw_val = (images_per_sec * img_size) / MILLION + hdd_iops = 100 + hdd_slowdown_val = images_per_sec / hdd_iops + + gds_trad_us = 120 + gds_bypass_us = 30 + t_io_p99 = 500 + t_compute = 200 + gds_speedup_val = gds_trad_us / gds_bypass_us + # ┌── 3. 
GUARD (Invariants) ─────────────────────────────────────────── check(req_bw_imagenet_gbs > 10, f"ImageNet aggregate BW should be high, got {req_bw_imagenet_gbs:.1f} GB/s") check(ckpt_total_gb_val > 1000, "175B checkpoint must be > 1 TB") check(data_stall_pct_val == 20, "Data stall calculation mismatch") - check(prob_tail_all < 0.4, f"Tail latency should dominate, got {prob_tail_all:.2f}") # ┌── 4. OUTPUT (Formatting) ────────────────────────────────────────────── h100_bw_tbs = f"{h100_bw.m_as(TB/second):.2f}" @@ -198,6 +192,8 @@ class StorageHierarchyAnalysis: ckpt_total_gb_str = f"{ckpt_total_gb_val:,.0f}" ckpt_nvme_s_str = f"{ckpt_nvme_s_val:.1f}" ckpt_pfs_s_str = f"{ckpt_pfs_s_val:.1f}" + ckpt_weights_gb_str = f"{ckpt_weights_gb:,.0f}" + ckpt_optimizer_gb_str = f"{ckpt_optimizer_gb:,.0f}" s3_annual_str = f"{s3_annual_val:,.0f}" nvme_annual_str = f"{nvme_annual_val:,.0f}" @@ -209,6 +205,20 @@ class StorageHierarchyAnalysis: fail_rate_pct_str = f"{p_tail_fail * 100:.0f}" n_tail_servers_str = f"{n_tail_servers}" + raw_bw_str = f"{raw_bw_val:.0f}" + hdd_slowdown_factor = f"{hdd_slowdown_val:.0f}" + gds_trad_us_str = f"{gds_trad_us}" + gds_bypass_us_str = f"{gds_bypass_us}" + prefetch_t_io_p99_str = f"{t_io_p99}" + prefetch_t_compute_str = f"{t_compute}" + prefetch_min_depth = f"{math.ceil(t_io_p99/t_compute)}" + prefetch_safe_depth = f"{math.ceil(t_io_p99/t_compute) + 2}" + gds_speedup_str = f"{gds_speedup_val:.0f}" + t_comp_stall_str = f"{t_comp_val}" + t_io_stall_str = f"{t_io_val}" + ckpt_fleet_total_pb_val = (ckpt_total_gb_val * 4320) / 1e6 + bw_ratio_val = h100_bw.m_as(GB/second) / nvme_bw_raw.m_as(GB/second) + # ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── h100_bw_tbs = StorageHierarchyAnalysis.h100_bw_tbs nvme_bw = StorageHierarchyAnalysis.nvme_bw_str @@ -225,6 +235,8 @@ stall_pct_display_math = md( ckpt_total_gb = StorageHierarchyAnalysis.ckpt_total_gb_str ckpt_nvme_s = StorageHierarchyAnalysis.ckpt_nvme_s_str ckpt_pfs_s = 
StorageHierarchyAnalysis.ckpt_pfs_s_str +ckpt_weights_gb_str = StorageHierarchyAnalysis.ckpt_weights_gb_str +ckpt_optimizer_gb_str = StorageHierarchyAnalysis.ckpt_optimizer_gb_str s3_annual_cost = StorageHierarchyAnalysis.s3_annual_str nvme_annual_cost = StorageHierarchyAnalysis.nvme_annual_str @@ -236,6 +248,20 @@ prob_tail_all_str = StorageHierarchyAnalysis.prob_tail_all_str fail_rate_pct_str = StorageHierarchyAnalysis.fail_rate_pct_str n_tail_servers_str = StorageHierarchyAnalysis.n_tail_servers_str +raw_bw_str = StorageHierarchyAnalysis.raw_bw_str +hdd_slowdown_factor = StorageHierarchyAnalysis.hdd_slowdown_factor +gds_trad_us = StorageHierarchyAnalysis.gds_trad_us_str +gds_bypass_us = StorageHierarchyAnalysis.gds_bypass_us_str +prefetch_t_io_p99_str = StorageHierarchyAnalysis.prefetch_t_io_p99_str +prefetch_t_compute_str = StorageHierarchyAnalysis.prefetch_t_compute_str +prefetch_min_depth = StorageHierarchyAnalysis.prefetch_min_depth +prefetch_safe_depth = StorageHierarchyAnalysis.prefetch_safe_depth +gds_speedup = StorageHierarchyAnalysis.gds_speedup_str +t_comp_stall_str = StorageHierarchyAnalysis.t_comp_stall_str +t_io_stall_str = StorageHierarchyAnalysis.t_io_stall_str +ckpt_fleet_total_pb = f"{StorageHierarchyAnalysis.ckpt_fleet_total_pb_val:,.1f}" +bw_ratio_str = f"{StorageHierarchyAnalysis.bw_ratio_val:,.0f}" + # GPU Specs for setup a100_mem = f"{A100_MEM_CAPACITY.m_as(GB):.0f}" h100_mem = f"{H100_MEM_CAPACITY.m_as(GB):.0f}" @@ -262,11 +288,11 @@ In the Fleet Stack model introduced in @sec-vol2-introduction, **Data Storage** Consider the running example that will thread through this chapter. A 175-billion parameter language model trains on 1.5 trillion tokens of text, roughly 3 TB in compressed form. Each training epoch reads every token once, in a shuffled order determined by the random seed. There is no "hot" subset of data that dominates access; every byte is consumed exactly once per pass. 
Meanwhile, each accelerator processes its local batch in roughly 200 ms, then waits for the next. If storage cannot deliver data within that 200 ms window, the accelerator sits idle, and the organization pays for silicon that produces heat instead of gradients. -This problem is deceptive because storage technology has improved enormously. NVMe drives achieve `{python} nvme_bw` GB/s of sequential throughput, a figure that would have seemed extraordinary a decade ago. But the accelerators improved faster. An H100 GPU consumes data from its HBM at `{python} h100_bw_tbs` TB/s, roughly 1,000$\times$ faster than a single NVMe drive can feed it. The gap between storage delivery and accelerator consumption is the central tension of this chapter, and it cannot be solved by any single technology. Instead, it requires a hierarchy of storage tiers, each carefully matched to a specific phase of the ML lifecycle, connected by pipelines that hide latency through prefetching and pipelining. +This problem is deceptive because storage technology has improved enormously. NVMe drives achieve `{python} nvme_bw` GB/s of sequential throughput, a figure that would have seemed extraordinary a decade ago. But the accelerators improved faster. An H100 GPU consumes data from its HBM at `{python} h100_bw_tbs` TB/s, roughly `{python} bw_ratio_str`$\times$ faster than a single NVMe drive can feed it. The gap between storage delivery and accelerator consumption is the central tension of this chapter, and it cannot be solved by any single technology. Instead, it requires a hierarchy of storage tiers, each carefully matched to a specific phase of the ML lifecycle, connected by pipelines that hide latency through prefetching and pipelining. The storage problem is fundamentally one of physics meeting economics. Physics dictates that data closer to the accelerator (in both physical distance and interconnect hops) can be delivered faster but in smaller quantities. 
Economics dictates that cheaper storage can hold more data but at greater distance. The engineering art is constructing a pipeline that bridges these constraints, keeping the expensive top tier full by drawing from cheaper lower tiers fast enough that the accelerator never perceives the delay. This chapter shows how to reason quantitatively about each tier in the hierarchy, how to size the pipeline that connects them, and how to make the economic tradeoffs that determine which data lives where. -Training our 175B model requires roughly 15 TB of training data (including preprocessed variants), stored across the hierarchy. The model generates 350 GB weight-only checkpoints (1,050 GB with optimizer state) every 30 minutes. Over a 30-day training run on 256 nodes, the storage system must deliver 3 TB of training data per epoch, absorb 4.3 PB of checkpoint writes, and stage model weights for evaluation runs. These numbers thread through every section of this chapter, grounding abstract principles in concrete engineering constraints. +Training our 175B model requires roughly 15 TB of training data (including preprocessed variants), stored across the hierarchy. The model generates 350 GB weight-only checkpoints (`{python} ckpt_total_gb` GB with optimizer state) every 10 minutes. Over a 30-day training run on 256 nodes, the storage system must deliver 3 TB of training data per epoch, absorb `{python} ckpt_fleet_total_pb` PB of checkpoint writes, and stage model weights for evaluation runs. These numbers thread through every section of this chapter, grounding abstract principles in concrete engineering constraints. The chapter proceeds through three layers of increasing distance from the accelerator. We begin with how ML workloads differ from traditional storage workloads, then trace the six-tier storage hierarchy from HBM to cold archive, examining the physics and economics at each level. 
We then turn to the data pipeline equation that governs required bandwidth, the GPU Direct Storage technology that eliminates CPU bottlenecks, and the economics that determine which tier houses which data. We conclude with common fallacies that trap even experienced engineers. @@ -979,7 +1005,7 @@ $$D_{min} = \left\lceil \frac{T_{I/O,p99}}{T_{\text{compute}}} \right\rceil$$ {# The prefetch depth equation (@eq-prefetch-depth) tells us how many batches must be in flight simultaneously to hide I/O latency behind computation. When the I/O system is slower than the accelerator (the common case in ML training), deeper prefetching is the only way to prevent data stalls. The equation uses P99 I/O latency rather than average latency because a single slow read can drain the buffer and stall the accelerator; sizing for the average guarantees frequent stalls at scale. -If I/O at the 99th percentile takes `{python} prefetch_t_io_p99_str` ms and compute takes `{python} prefetch_t_compute_str` ms, then $D_{min} = $ `{python} prefetch_min_depth` batches, with a safety margin of `{python} prefetch_safe_depth`. In practice, data loaders like PyTorch's `DataLoader` use `prefetch_factor` and `num_workers` parameters to control this depth. Setting `prefetch_factor=2` with 4 workers creates a buffer of 8 batches, which is typically sufficient for NVMe-backed pipelines but may be inadequate for object-storage-backed pipelines where P99 latency can exceed 500 ms. +If I/O at the 99th percentile takes 500 ms and compute takes 200 ms, then $D_{min} = 3$ batches, with a safety margin of 5. In practice, data loaders like PyTorch's `DataLoader` use `prefetch_factor` and `num_workers` parameters to control this depth. Setting `prefetch_factor=2` with 4 workers creates a buffer of 8 batches, which is typically sufficient for NVMe-backed pipelines but may be inadequate for object-storage-backed pipelines where P99 latency can exceed 500 ms. 
\index{prefetch buffer!depth calculation}To illustrate with our running example: the 175B model on 256 GPUs processes each batch in roughly 200 ms. Reading from local NVMe, the P99 I/O latency for a batch of tokenized text (roughly 40 MB per GPU) is approximately 50 ms. The minimum prefetch depth is $\lceil 50 / 200 \rceil = 1$ batch, and a safety margin of 2 is adequate. Reading from a parallel file system, the P99 I/O latency rises to roughly 200 ms due to network jitter and contention, requiring a minimum depth of $\lceil 200 / 200 \rceil = 1$, with a safety margin of 3 to account for occasional multi-hundred-millisecond outliers. Reading from object storage, the P99 latency can exceed 500 ms, requiring a depth of at least 3, with a safety margin of 5 or more. These numbers translate directly into host DRAM consumption: at 40 MB per batch, a depth-5 prefetch buffer per GPU consumes 200 MB, and 8 GPUs per node consume 1.6 GB. At 40 MB per batch with a depth of 1, the same node needs only 320 MB. The storage tier directly determines the memory cost of the prefetch buffer. @@ -1130,7 +1156,7 @@ At the end of the training job, the final model checkpoint is promoted from the The total number of data copies in this lifecycle is instructive. Each training sample traverses: object storage $\to$ NVMe (staging), NVMe $\to$ host DRAM (read), host DRAM $\to$ GPU HBM (transfer). Each checkpoint traverses: GPU HBM $\to$ NVMe (local save), NVMe $\to$ parallel file system (async copy), parallel file system $\to$ object storage (long-term retention). Every copy consumes bandwidth and contributes latency. The engineering goal is to minimize copies on the critical path (the training loop) and tolerate additional copies on non-critical paths (staging and archival). -The volume of data moved during a 30-day training run reveals a counterintuitive reality. A single staging copy of the 3 TB dataset from object storage to local NVMe accounts for a modest transfer. 
But the checkpointing process generates a vastly larger data stream. With a roughly 1,050 GB checkpoint created every 10 minutes across 256 nodes for 30 days, the system produces approximately 4,320 checkpoints, totaling over 4.3 petabytes of state that must traverse the storage hierarchy. Checkpoint data movement dwarfs training data movement for large models: while the 3 TB training dataset might be read a handful of times (once per epoch), the 4.3 PB of checkpoint data is generated anew, making checkpoint I/O the dominant storage workload. This insight explains why checkpoint staging strategy (write locally, replicate asynchronously) has a larger impact on overall storage design than training data pipeline optimization for frontier-scale models. +The volume of data moved during a 30-day training run reveals a counterintuitive reality. A single staging copy of the 3 TB dataset from object storage to local NVMe accounts for a modest transfer. But the checkpointing process generates a vastly larger data stream. With a roughly `{python} ckpt_total_gb` GB checkpoint created every 10 minutes across 256 nodes for 30 days, the system produces approximately 4,320 checkpoints, totaling over `{python} ckpt_fleet_total_pb` petabytes of state that must traverse the storage hierarchy. Checkpoint data movement dwarfs training data movement for large models: while the 3 TB training dataset might be read a handful of times (once per epoch), the `{python} ckpt_fleet_total_pb` PB of checkpoint data is generated anew, making checkpoint I/O the dominant storage workload. This insight explains why checkpoint staging strategy (write locally, replicate asynchronously) has a larger impact on overall storage design than training data pipeline optimization for frontier-scale models. With the physical data path established, we turn to the economic dimension of storage design. 
@@ -1264,7 +1290,7 @@ While model serving dictates its own economic calculus, the training phase intro \index{Checkpoint!storage architecture}When a thousand GPUs have been training for six hours since the last checkpoint and a power supply fails, what determines how much work is lost? The answer is entirely a storage problem: how quickly the most recent checkpoint was saved, and where it resides. Checkpoints are the most demanding write workload in the storage hierarchy. They are also among the most consequential: a lost checkpoint after a hardware failure means repeating hours or days of training. The interaction between checkpoint storage and fault tolerance strategy is covered in depth in @sec-fault-tolerance-reliability, where the Young-Daly formula (@sec-fault-tolerance-young-daly) derives the optimal checkpoint frequency from cluster failure rates and checkpoint write time. Here we focus on the storage architecture that minimizes $T_{save}$, the time the training pipeline pauses to save a checkpoint. -A `{python} gpt3_params_b`B parameter model with Adam optimizer generates checkpoints of approximately `{python} ckpt_total_gb` GB. The checkpoint includes model weights (`{python} ckpt_weights_gb_str` GB in FP16), optimizer state (momentum and variance, `{python} ckpt_optimizer_gb_str` GB in FP32), learning rate scheduler state, random number generator state, and the current data loader position. Every GPU in the cluster saves its shard of the checkpoint simultaneously, creating a checkpoint storm that the storage system must absorb without disrupting ongoing training reads. +A `{python} gpt3_params_b`B parameter model with Adam optimizer generates checkpoints of approximately `{python} ckpt_total_gb` GB. The checkpoint includes model weights (`{python} ckpt_weights_gb_str` GB in FP16), optimizer state (momentum and variance, `{python} ckpt_optimizer_gb_str` GB in FP32), learning rate scheduler state, random number generator state, and the current data loader position. 
Every GPU in the cluster saves its shard of the checkpoint simultaneously, creating a checkpoint storm that the storage system must absorb without disrupting ongoing training reads. ::: {.callout-definition title="Checkpoint Storm"} @@ -1293,7 +1319,7 @@ The **tiered staging** strategy minimizes $T_{save}$ by writing in two phases. I In the second phase, a background process asynchronously copies the local checkpoint to the parallel file system for durability. This copy can overlap with the next training iteration, so it does not block the pipeline. The risk is that if the node fails before the async copy completes, the local checkpoint is lost. The mitigation is to replicate checkpoints to at least two peer nodes' NVMe drives before declaring the save complete, providing durability even if one node fails. -The total storage consumed by checkpoints over the life of a training run is substantial. A 175B parameter model checkpointed every 10 minutes over a 30-day training run generates roughly 4,320 checkpoints, each `{python} ckpt_total_gb` GB, for a total of approximately 4.3 PB of checkpoint data. Retaining all of them is neither necessary nor economical. A common retention policy keeps the three most recent checkpoints on the parallel file system for fast recovery, copies every 100th checkpoint to object storage for long-term auditability, and deletes the rest. This policy reduces the parallel file system checkpoint footprint from 4.3 PB to roughly 3 TB (three live checkpoints) while preserving 43 historical snapshots in object storage for post-training analysis. +The total storage consumed by checkpoints over the life of a training run is substantial. A 175B parameter model checkpointed every 10 minutes over a 30-day training run generates roughly 4,320 checkpoints, each `{python} ckpt_total_gb` GB, for a total of approximately `{python} ckpt_fleet_total_pb` PB of checkpoint data. Retaining all of them is neither necessary nor economical. 
A common retention policy keeps the three most recent checkpoints on the parallel file system for fast recovery, copies every 100th checkpoint to object storage for long-term auditability, and deletes the rest. This policy reduces the parallel file system checkpoint footprint from `{python} ckpt_fleet_total_pb` PB to roughly 3 TB (three live checkpoints) while preserving 43 historical snapshots in object storage for post-training analysis. Incremental checkpointing offers a further optimization for reducing $T_{save}$. Rather than saving the entire model state every checkpoint, an incremental checkpoint saves only the parameters that changed since the last full checkpoint. For models where a large fraction of parameters are frozen (such as fine-tuning scenarios where only the last few layers are updated), incremental checkpoints can be orders of magnitude smaller than full checkpoints. The tradeoff is recovery complexity: restoring from an incremental checkpoint requires applying a chain of incremental updates to a base checkpoint, which extends recovery time. Most production systems use a hybrid approach, saving incremental checkpoints frequently and full checkpoints periodically (every 10th to 100th increment). @@ -1333,7 +1359,7 @@ Storage design errors are among the most expensive mistakes in ML infrastructure **Fallacy:** *NVMe is fast enough to feed GPUs directly without pipelining.* -A single NVMe drive delivers `{python} nvme_bw` GB/s, while an H100 consumes `{python} h100_bw_tbs` TB/s from HBM. The gap is roughly 1,000$\times$. Even four drives in RAID-0 only close this gap to 250$\times$. Without pipelining and prefetching to hide the latency of loading from NVMe to host DRAM to HBM, every batch transfer introduces a stall equal to the transfer time. 
NVMe speed is necessary but nowhere near sufficient; the entire pipeline architecture (multi-worker loading, prefetch buffers, async transfers) exists precisely because no single storage device can match accelerator bandwidth. The confusion arises because NVMe bandwidth is quoted in absolute terms (GB/s), which sounds impressive, but the relevant metric is the *ratio* of storage bandwidth to accelerator consumption rate, and this ratio is unfavorable by three orders of magnitude. +A single NVMe drive delivers `{python} nvme_bw` GB/s, while an H100 consumes `{python} h100_bw_tbs` TB/s from HBM. The gap is roughly `{python} bw_ratio_str`$\times$. Even four drives in RAID-0 only narrow this gap to a quarter of that. Without pipelining and prefetching to hide the latency of loading from NVMe to host DRAM to HBM, every batch transfer introduces a stall equal to the transfer time. NVMe speed is necessary but nowhere near sufficient; the entire pipeline architecture (multi-worker loading, prefetch buffers, async transfers) exists precisely because no single storage device can match accelerator bandwidth. The confusion arises because NVMe bandwidth is quoted in absolute terms (GB/s), which sounds impressive, but the relevant metric is the *ratio* of storage bandwidth to accelerator consumption rate, and this ratio is unfavorable by two to three orders of magnitude. 
**Fallacy:** *Object storage latency does not matter because we prefetch everything.* @@ -1378,10 +1404,10 @@ The complete storage picture for our running example -- a 30-day training run of | Model weights (FP16) | 350 GB | GPU HBM (distributed) | | Optimizer state (FP32) | 1,400 GB | GPU HBM (ZeRO-partitioned) | | Single checkpoint (full) | 1,050 GB | NVMe $\to$ PFS $\to$ Object Storage | -| All checkpoints (30 days, 10-min interval) | 4.3 PB | NVMe (transient) $\to$ PFS (recent) $\to$ Object (archive) | +| All checkpoints (30 days, 10-min interval) | `{python} ckpt_fleet_total_pb` PB | NVMe (transient) $\to$ PFS (recent) $\to$ Object (archive) | | Archive (retained checkpoints + dataset) | ~50 TB | Glacier | -**Total data moved through the hierarchy**: approximately 4.3 PB of checkpoint data plus 3 TB$\times$ epochs of training data reads. For a single-epoch language model training run, checkpoint I/O dominates data loading I/O by a factor of over 1,000. +**Total data moved through the hierarchy**: approximately `{python} ckpt_fleet_total_pb` PB of checkpoint data plus 3 TB$\times$ epochs of training data reads. For a single-epoch language model training run, checkpoint I/O dominates data loading I/O by a factor of over 1,000. 
::: ## Summary {#sec-storage-summary} diff --git a/book/quarto/mlsys/constants.py b/book/quarto/mlsys/constants.py index f89797775..0dcc36fde 100644 --- a/book/quarto/mlsys/constants.py +++ b/book/quarto/mlsys/constants.py @@ -412,7 +412,7 @@ DLRM_MODEL_SIZE_FP32 = 100 * GB # Approximate total model size YOLOV8_NANO_FLOPs = 8.7e9 * flop # 640x640 # --- Storage (I/O Bandwidth) --- -NVME_SEQUENTIAL_BW = 3.5 * GB / second # NVMe SSD sequential read +NVME_SEQUENTIAL_BW = 7.0 * GB / second # NVMe SSD sequential read (Gen 4) SYSTEM_MEMORY_BW = 50 * GB / second # DDR4/DDR5 typical # --- Case Studies --- diff --git a/storage_cell.py b/storage_cell.py new file mode 100644 index 000000000..b81322590 --- /dev/null +++ b/storage_cell.py @@ -0,0 +1,142 @@ +import math +from mlsys.constants import ( + H100_MEM_BW, H100_FLOPS_FP16_TENSOR, NVME_SEQUENTIAL_BW, + GPT3_PARAMS, BILLION, MILLION, TRILLION, THOUSAND, + GB, TB, second, byte, flop, USD, kilowatt, hour, + GPUS_PER_HOST, SEC_PER_DAY, SEC_PER_YEAR, + H100_MEM_CAPACITY, BYTES_FP16, BYTES_FP32, + US, Mparam, RESNET50_PARAMS, + NVLINK_H100_BW, PCIE_GEN5_BW, + A100_MEM_CAPACITY, H100_FLOPS_FP8_TENSOR, H100_TDP +) +from mlsys.formatting import fmt, check, md + +class StorageHierarchyAnalysis: + """ + Namespace for global storage hierarchy and pipeline calculations. + """ + # 1. LOAD + h100_bw = H100_MEM_BW + h100_flops_fp16 = H100_FLOPS_FP16_TENSOR + h100_cap = H100_MEM_CAPACITY + nvme_bw_raw = NVME_SEQUENTIAL_BW + pfs_node_bw = (4.0 * GB / second) + s3_bw = (1.0 * GB / second) + + gpt3_params = GPT3_PARAMS.m_as('param') + t_step_ms = 200 + + n_gpus_image = 256 + img_size = 150 * THOUSAND + batch_img_gpu = 256 + util_target = 0.80 + + dataset_size_tb = 100 + cost_s3_gb_mo = 0.02 + cost_nvme_gb_mo = 0.10 + cost_glacier_gb_mo = 0.004 + cost_egress_gb = 0.09 + + n_tail_servers = 100 + p_tail_fail = 0.01 + + # 2. 
EXECUTE + t_step_s = t_step_ms / 1000 + req_bw_imagenet_val = (n_gpus_image * util_target * (batch_img_gpu * img_size)) / t_step_s + req_bw_imagenet_gbs = req_bw_imagenet_val / BILLION + + t_comp_val = 200 + t_io_val = 250 + stall_max_t = max(t_comp_val, t_io_val) + data_stall_pct_val = ((stall_max_t - t_comp_val) / stall_max_t) * 100 + + bytes_per_param_ckpt = BYTES_FP16.m_as(byte) + (2 * BYTES_FP32.m_as(byte)) + ckpt_total_gb_val = (gpt3_params * bytes_per_param_ckpt) / BILLION + + n_nodes = 256 + node_shard_gb = ckpt_total_gb_val / n_nodes + ckpt_nvme_s_val = node_shard_gb / (4 * nvme_bw_raw.m_as(GB/second)) + ckpt_pfs_s_val = node_shard_gb / pfs_node_bw.m_as(GB/second) + + s3_annual_val = dataset_size_tb * 1000 * cost_s3_gb_mo * 12 + nvme_annual_val = dataset_size_tb * 1000 * cost_nvme_gb_mo * 12 + glacier_annual_val = dataset_size_tb * 1000 * cost_glacier_gb_mo * 12 + tier_cost_ratio_val = cost_nvme_gb_mo / cost_s3_gb_mo + egress_100tb_val = dataset_size_tb * 1000 * cost_egress_gb + + prob_tail_all = (1.0 - p_tail_fail) ** n_tail_servers + + images_per_sec = 1000 + raw_bw_val = (images_per_sec * img_size) / MILLION + hdd_iops = 100 + hdd_slowdown_val = images_per_sec / hdd_iops + + gds_trad_us = 120 + gds_bypass_us = 30 + + # 3. GUARD + check(req_bw_imagenet_gbs > 10, f"ImageNet aggregate BW should be high, got {req_bw_imagenet_gbs:.1f} GB/s") + check(ckpt_total_gb_val > 1000, "175B checkpoint must be > 1 TB") + check(data_stall_pct_val == 20, "Data stall calculation mismatch") + + # 4. 
OUTPUT + h100_bw_tbs = f"{h100_bw.m_as(TB/second):.2f}" + nvme_bw_str = f"{nvme_bw_raw.m_as(GB/second):.1f}" + gpt3_params_b = f"{gpt3_params / BILLION:.0f}" + h100_hbm_cap_gb = f"{h100_cap.m_as(GB):.0f}" + req_bw_imagenet_str = f"{req_bw_imagenet_gbs:.1f}" + data_stall_pct_str = f"{data_stall_pct_val:.0f}" + ckpt_total_gb_str = f"{ckpt_total_gb_val:,.0f}" + ckpt_nvme_s_str = f"{ckpt_nvme_s_val:.1f}" + ckpt_pfs_s_str = f"{ckpt_pfs_s_val:.1f}" + s3_annual_str = f"{s3_annual_val:,.0f}" + nvme_annual_str = f"{nvme_annual_val:,.0f}" + glacier_annual_str = f"{glacier_annual_val:,.0f}" + tier_cost_ratio_str = f"{tier_cost_ratio_val:.0f}" + egress_100tb_str = f"{egress_100tb_val:,.0f}" + prob_tail_all_str = f"{prob_tail_all:.3f}" + fail_rate_pct_str = f"{p_tail_fail * 100:.0f}" + n_tail_servers_str = f"{n_tail_servers}" + raw_bw_str = f"{raw_bw_val:.0f}" + hdd_slowdown_factor = f"{hdd_slowdown_val:.0f}" + gds_trad_us_str = f"{gds_trad_us}" + gds_bypass_us_str = f"{gds_bypass_us}" + t_comp_stall_str = f"{t_comp_val}" + t_io_stall_str = f"{t_io_val}" + +# EXPORTS +h100_bw_tbs = StorageHierarchyAnalysis.h100_bw_tbs +nvme_bw = StorageHierarchyAnalysis.nvme_bw_str +gpt3_params_b = StorageHierarchyAnalysis.gpt3_params_b +h100_hbm_cap_gb = StorageHierarchyAnalysis.h100_hbm_cap_gb +req_bw_imagenet = StorageHierarchyAnalysis.req_bw_imagenet_str +data_stall_pct_str = StorageHierarchyAnalysis.data_stall_pct_str +stall_pct_display_math = md( + rf"$$\text{{Stall \%}} = \frac{{{StorageHierarchyAnalysis.t_io_val} - {StorageHierarchyAnalysis.t_comp_val}}}{{{StorageHierarchyAnalysis.t_io_val}}} = " + rf"\mathbf{{{data_stall_pct_str}\%}}$$" +) +ckpt_total_gb = StorageHierarchyAnalysis.ckpt_total_gb_str +ckpt_nvme_s = StorageHierarchyAnalysis.ckpt_nvme_s_str +ckpt_pfs_s = StorageHierarchyAnalysis.ckpt_pfs_s_str +s3_annual_cost = StorageHierarchyAnalysis.s3_annual_str +nvme_annual_cost = StorageHierarchyAnalysis.nvme_annual_str +glacier_annual_cost = StorageHierarchyAnalysis.glacier_annual_str 
+tier_cost_ratio = StorageHierarchyAnalysis.tier_cost_ratio_str +egress_100tb_cost = StorageHierarchyAnalysis.egress_100tb_str +prob_tail_all_str = StorageHierarchyAnalysis.prob_tail_all_str +fail_rate_pct_str = StorageHierarchyAnalysis.fail_rate_pct_str +n_tail_servers_str = StorageHierarchyAnalysis.n_tail_servers_str +raw_bw_str = StorageHierarchyAnalysis.raw_bw_str +hdd_slowdown_factor = StorageHierarchyAnalysis.hdd_slowdown_factor +gds_trad_us = StorageHierarchyAnalysis.gds_trad_us_str +gds_bypass_us = StorageHierarchyAnalysis.gds_bypass_us_str +t_comp_stall_str = StorageHierarchyAnalysis.t_comp_stall_str +t_io_stall_str = StorageHierarchyAnalysis.t_io_stall_str +a100_mem = f"{A100_MEM_CAPACITY.m_as(GB):.0f}" +h100_mem = f"{H100_MEM_CAPACITY.m_as(GB):.0f}" +h100_fp8_tflops = f"{H100_FLOPS_FP8_TENSOR.m_as(TFLOPs/second):,.0f}" +h100_fp16_tflops = f"{H100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second):,.0f}" +h100_tdp_w = f"{H100_TDP.m_as(watt):.0f}" +resnet_params_m = f"{RESNET50_PARAMS.m_as(Mparam):.1f}" +nvlink_bw_gbs = f"{NVLINK_H100_BW.m_as(GB/second):.0f}" +pcie5_bw_gbs = f"{PCIE_GEN5_BW.m_as(GB/second):.0f}" diff --git a/tinytorch/src/01_tensor/01_tensor.py b/tinytorch/src/01_tensor/01_tensor.py index ed4b24b0f..caf4656e8 100644 --- a/tinytorch/src/01_tensor/01_tensor.py +++ b/tinytorch/src/01_tensor/01_tensor.py @@ -934,11 +934,11 @@ Flattened Image × Hidden Weights = Hidden Features ``` Matrix Multiplication Process: A (2×3) B (3×2) C (2×2) - ┌ ┐ ┌ ┐ ┌ ┐ - │ 1 2 3 │ │ 7 8 │ │ 1×7+2×9+3×1 │ ┌ ┐ - │ │ × │ 9 1 │ = │ │ = │ 28 16│ - │ 4 5 6 │ │ 1 2 │ │ 4×7+5×9+6×1 │ │ 79 49│ - └ ┘ └ ┘ └ ┘ └ ┘ + ┌ ┐ ┌ ┐ ┌ ┐ + │ 1 2 3 │ │ 7 8 │ │ 1×7+2×9+3×1 1×8+2×1+3×2 │ ┌ ┐ + │ │ × │ 9 1 │ = │ │ = │ 28 16 │ + │ 4 5 6 │ │ 1 2 │ │ 4×7+5×9+6×1 4×8+5×1+6×2 │ │ 79 49 │ + └ ┘ └ ┘ └ ┘ └ ┘ Computation Breakdown: C[0,0] = A[0,:] · B[:,0] = [1,2,3] · [7,9,1] = 1×7 + 2×9 + 3×1 = 28