cs249r_book/mlsysim/paper/references.bib

@inproceedings{abadi2016,
  author            = {Abadi, Martin and Chu, Andy and Goodfellow, Ian and others},
  title             = {Deep Learning With Differential Privacy},
  booktitle         = {Proceedings of the 2016 ACM SIGSAC Conference on Computer and Communications Security},
  pages             = {308--318},
  publisher         = {ACM},
  year              = {2016},
  doi               = {10.1145/2976749.2978318},
  url               = {https://doi.org/10.1145/2976749.2978318},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1145/2976749.2978318},
}

@inproceedings{agrawal2024vidur,
  author            = {Agrawal, Amey and Kedia, Nitin and Panwar, Ashish and Mohan, Jayashree and
                       Kwatra, Nipun and Gulavani, Bhargav S. and Tumanov, Alexey and
                       Ramjee, Ramachandran},
  title             = {Vidur: A Large-Scale Simulation Framework For {LLM} Inference},
  booktitle         = {Proceedings of Machine Learning and Systems (MLSys)},
  publisher         = {mlsys.org},
  year              = {2024},
  url               = {https://arxiv.org/abs/2405.05465},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://proceedings.mlsys.org/paper\_files/paper/2024/hash/b74a8de47d2b3c928360e0a011f48351-Abstract-Conference.html},
}

@article{agrawal2025,
  title = {Efficient {LLM} Inference via Chunked Prefills},
  author = {
    Agrawal, Amey and Kedia, Nitin and Panwar, Ashish and Mohan, Jayashree and Kwatra, Nipun and
    Gulavani, Bhargav S. and Tumanov, Alexey and Ramjee, Ramachandran
  },
  year = {2025},
  journal = {ACM SIGOPS Operating Systems Review},
  publisher = {Association for Computing Machinery (ACM)},
  volume = {59},
  number = {1},
  pages = {9--16},
  doi = {10.1145/3759441.3759444},
  issn = {0163-5980},
  url = {https://doi.org/10.1145/3759441.3759444},
  source = {Crossref},
  x-verified = {2026-04-08},
  x-verified-by = {pass-16-bib-sweep},
  x-verified-source = {https://www.usenix.org/conference/osdi24/presentation/agrawal},
  x-note = {
    Stray OSDI 2024 booktitle removed: journal, volume, pages and DOI all describe the SIGOPS OSR
    article, while the OSDI paper is a separate work. Year set to 2025 to agree with volume 59 and
    the citation key; TODO confirm against the DOI record.
  },
}

@misc{amodei2018ai,
  author            = {Amodei, Dario and Hernandez, Danny},
  title             = {{AI} and Compute},
  howpublished      = {OpenAI Blog},
  year              = {2018},
  url               = {https://openai.com/research/ai-and-compute},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://openai.com/research/ai-and-compute},
}

@article{bambhaniya2024genz,
  title = {Demystifying Platform Requirements for Diverse {LLM} Inference Use Cases},
  author = {
    Bambhaniya, Abhimanyu and Raj, Ritik and Jeong, Geonhwa and Kundu, Souvik and Srinivasan,
    Sudarshan and Elavazhagan, Midhilesh and Kumar, Madhu and Krishna, Tushar
  },
  year = {2024},
  journal = {arXiv preprint arXiv:2406.01698},
  eprint = {2406.01698},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2406.01698},
  x-verified = {2026-05-03},
  x-verified-by = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
  x-verified-source = {
    https://arxiv.org/abs/2406.01698;
    https://www.semanticscholar.org/paper/Demystifying-Platform-Requirements-for-Diverse-LLM-Bambhaniya-Raj/f82ba56524bd595518729f207f16bd07a11ddeda
  },
}

@article{barroso2007,
  author            = {Barroso, Luiz Andr{\'e} and H{\"o}lzle, Urs},
  title             = {The Case for Energy-Proportional Computing},
  journal           = {Computer},
  volume            = {40},
  number            = {12},
  pages             = {33--37},
  year              = {2007},
  publisher         = {Institute of Electrical and Electronics Engineers (IEEE)},
  issn              = {0018-9162},
  doi               = {10.1109/mc.2007.443},
  url               = {https://doi.org/10.1109/mc.2007.443},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1109/MC.2007.443},
}

@book{barroso2019,
  title = {The Datacenter as a Computer: Designing Warehouse-Scale Machines},
  author = {Barroso, Luiz Andr{\'e} and H{\"o}lzle, Urs and Ranganathan, Parthasarathy},
  year = {2018},
  publisher = {Springer International Publishing},
  series = {Synthesis Lectures on Computer Architecture},
  doi = {10.1007/978-3-031-01761-2},
  isbn = {9783031006333, 9783031017612},
  issn = {1935-3235, 1935-3243},
  url = {https://doi.org/10.1007/978-3-031-01761-2},
  source = {Crossref},
  edition = {3rd},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
  x-note = {Subtitle merged into title so classic BibTeX styles, which ignore the subtitle field, print it.},
}

@article{binkert2011,
  title = {The {gem5} Simulator},
  author = {Binkert, Nathan and Beckmann, Bradford and Black, Gabriel and others},
  year = {2011},
  journal = {ACM SIGARCH Computer Architecture News},
  publisher = {Association for Computing Machinery (ACM)},
  volume = {39},
  number = {2},
  pages = {1--7},
  doi = {10.1145/2024716.2024718},
  issn = {0163-5964},
  url = {https://doi.org/10.1145/2024716.2024718},
  source = {Crossref},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1145/2024716.2024718},
}

@article{box1976,
  title = {Science and Statistics},
  author = {Box, George E. P.},
  year = {1976},
  journal = {Journal of the American Statistical Association},
  publisher = {Informa UK Limited},
  volume = {71},
  number = {356},
  pages = {791--799},
  doi = {10.1080/01621459.1976.10480949},
  issn = {0162-1459, 1537-274X},
  url = {https://doi.org/10.1080/01621459.1976.10480949},
  source = {Crossref},
  x-verified = {2026-05-04},
  x-verified-by = {openai-MODEL},
  x-verified-source = {https://doi.org/10.1080/01621459.1976.10480949},
}

@article{chowdhery2022palm,
  author        = {Chowdhery, Aakanksha and Narang, Sharan and Devlin, Jacob and others},
  title         = {{PaLM}: Scaling Language Modeling with Pathways},
  journal       = {Journal of Machine Learning Research},
  volume        = {24},
  number        = {240},
  pages         = {1--113},
  year          = {2023},
  x-verified    = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@misc{cox2011xv6,
  author            = {Cox, Russ and Kaashoek, M. Frans and Morris, Robert},
  title             = {xv6: A Simple, {Unix}-like Teaching Operating System},
  howpublished      = {MIT PDOS},
  year              = {2011},
  url               = {https://pdos.csail.mit.edu/6.828/xv6},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://pdos.csail.mit.edu/6.828/xv6},
}

@article{daly2006,
  title = {A Higher Order Estimate of the Optimum Checkpoint Interval for Restart Dumps},
  author = {Daly, John T.},
  year = {2006},
  journal = {Future Generation Computer Systems},
  publisher = {Elsevier BV},
  volume = {22},
  number = {3},
  pages = {303--312},
  doi = {10.1016/j.future.2004.11.016},
  issn = {0167-739X},
  url = {https://doi.org/10.1016/j.future.2004.11.016},
  source = {Crossref},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1016/j.future.2004.11.016},
}

@inproceedings{dao2022,
  title = {{FlashAttention}: Fast and Memory-Efficient Exact Attention With {IO}-Awareness},
  author = {Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
  year = {2022},
  booktitle = {Advances in Neural Information Processing Systems 35},
  publisher = {Neural Information Processing Systems Foundation, Inc. (NeurIPS)},
  volume = {35},
  pages = {16344--16359},
  doi = {10.52202/068431-1189},
  url = {https://doi.org/10.52202/068431-1189},
  source = {Crossref},
  x-verified = {2026-04-26},
  x-verified-by = {bib-web-verify},
  x-verified-source = {
    https://proceedings.neurips.cc/paper\_files/paper/2022/hash/67d57c32e20fd0a7a302cb81d36e40d5-Abstract-Conference.html
  },
}

@inproceedings{dean2012large,
  title = {Large Scale Distributed Deep Networks},
  author = {Dean, Jeffrey and Corrado, Greg S. and Monga, Rajat and others},
  year = {2012},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  publisher = {Curran Associates},
  volume = {25},
  pages = {1223--1231},
  url = {https://papers.nips.cc/paper/4687-large-scale-distributed-deep-networks},
  x-verified = {2026-05-03},
  x-verified-by = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
  x-verified-source = {
    https://papers.nips.cc/paper/4687-large-scale-distributed-deep-networks;
    https://dl.acm.org/doi/10.5555/2999134.2999271
  },
  x-note = {Retyped from article to inproceedings (redundant journal field dropped) to match the other NeurIPS entries.},
}

@article{dean2013,
  author            = {Dean, Jeffrey and Barroso, Luiz Andr{\'e}},
  title             = {The Tail at Scale},
  journal           = {Communications of the ACM},
  volume            = {56},
  number            = {2},
  pages             = {74--80},
  year              = {2013},
  publisher         = {Association for Computing Machinery (ACM)},
  issn              = {0001-0782, 1557-7317},
  doi               = {10.1145/2408776.2408794},
  url               = {https://doi.org/10.1145/2408776.2408794},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1145/2408776.2408794},
}

@inproceedings{deepseek2025v3,
  title = {
    Insights Into {DeepSeek-V3}: Scaling Challenges and Reflections on Hardware for {AI}
    Architectures
  },
  author = {{DeepSeek-AI}},
  year = {2025},
  booktitle = {Proceedings of the 52nd Annual International Symposium on Computer Architecture},
  publisher = {ACM},
  pages = {1731--1745},
  doi = {10.1145/3695053.3731412},
  url = {https://doi.org/10.1145/3695053.3731412},
  source = {Crossref},
  x-verified = {2026-04-08},
  x-verified-by = {pass-16-bib-sweep},
  x-verified-source = {https://api.crossref.org/works?query.title=Insights+into+DeepSeek-V3+Scaling+Challenges},
}

@inproceedings{eisenman2022checknrun,
  author            = {Eisenman, Assaf and Matam, Kiran Kumar and Ingram, Steven and
                       Mudigere, Dheevatsa and Krishnamoorthi, Raghuraman and Nair, Krishnakumar and
                       Smelyanskiy, Misha and Annavaram, Murali},
  title             = {Check-N-Run: a Checkpointing System for Training Deep Learning Recommendation Models},
  booktitle         = {Proceedings of the 19th USENIX Symposium on Networked Systems Design and
                       Implementation (NSDI)},
  publisher         = {USENIX Association},
  year              = {2022},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://www.usenix.org/conference/nsdi22/presentation/eisenman},
}

@inproceedings{faiz2024llmcarbon,
  author            = {Faiz, Ahmad and Kaneda, Sotaro and Wang, Ruhan and Osi, Rita and
                       Sharma, Prateek and Chen, Fan and Jiang, Lei},
  title             = {{LLMCarbon}: Modeling the End-to-End Carbon Footprint of Large Language Models},
  booktitle         = {Proceedings of the 12th International Conference on Learning Representations (ICLR)},
  publisher         = {OpenReview.net},
  year              = {2024},
  x-verified        = {2026-05-03},
  x-verified-by     = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
  x-verified-source = {https://arxiv.org/abs/2309.14393; https://openreview.net/forum?id=aIok3ZD9to},
}

@inproceedings{frantar2023gptq,
  author            = {Frantar, Elias and Ashkboos, Saleh and Hoefler, Torsten and Alistarh, Dan},
  title             = {{GPTQ}: Accurate Post-Training Quantization for Generative Pre-trained Transformers},
  booktitle         = {Proceedings of the 11th International Conference on Learning Representations (ICLR)},
  publisher         = {OpenReview.net},
  year              = {2023},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://openreview.net/forum?id=tcbBPnfwxS},
}

@incollection{gholami2022,
  title = {A Survey of Quantization Methods for Efficient Neural Network Inference},
  author = {Gholami, Amir and Kim, Sehoon and Dong, Zhen and others},
  year = {2022},
  booktitle = {Low-Power Computer Vision},
  publisher = {Chapman and Hall/CRC},
  pages = {291--326},
  doi = {10.1201/9781003162810-13},
  isbn = {9781003162810},
  url = {https://doi.org/10.1201/9781003162810-13},
  eprint = {2103.13630},
  archiveprefix = {arXiv},
  source = {Crossref},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
  x-note = {
    Retyped from inbook to incollection: this is a chapter in an edited volume, and classic BibTeX
    ignores booktitle on inbook. The arXiv id moved from journal to eprint. Year set to 2022 (book
    publication, matching the citation key); the arXiv preprint is from 2021. TODO confirm the
    year against the DOI record.
  },
}

@inproceedings{gupta2022,
  title = {
    {ACT}: Designing Sustainable Computer Systems With an Architectural Carbon Modeling Tool
  },
  author = {
    Gupta, Udit and Elgamal, Mariam and Hills, Gage and Wei, Gu-Yeon and Lee, Hsien-Hsin S. and
    Brooks, David and Wu, Carole-Jean
  },
  year = {2022},
  booktitle = {Proceedings of the 49th Annual International Symposium on Computer Architecture},
  publisher = {ACM},
  pages = {784--799},
  doi = {10.1145/3470496.3527408},
  url = {https://doi.org/10.1145/3470496.3527408},
  source = {Crossref},
  x-verified = {2026-04-08},
  x-verified-by = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1145/3470496.3527408},
  x-note = {Subtitle merged into title so classic BibTeX styles, which ignore the subtitle field, print the full title.},
}

@inproceedings{gupta2022chasing,
  title = {Chasing Carbon: The Elusive Environmental Footprint of Computing},
  author = {
    Gupta, Udit and Kim, Young Geun and Lee, Sylvia and Tse, Jordan and Lee, Hsien-Hsin S. and Wei,
    Gu-Yeon and Brooks, David and Wu, Carole-Jean
  },
  year = {2021},
  booktitle = {2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA)},
  publisher = {IEEE},
  pages = {854--867},
  doi = {10.1109/hpca51647.2021.00076},
  url = {https://doi.org/10.1109/hpca51647.2021.00076},
  x-note = {
    NOTE(review): this paper appeared at HPCA 2021. The previous values (year 2022, pages 85--99,
    doi 10.1109/hpca53966.2022.00013) look like a Crossref mismatch with a different HPCA 2022
    paper. This entry has never carried x-verified metadata; confirm the new DOI and pages before
    marking it verified.
  },
}

@inproceedings{han2016deep,
  author            = {Han, Song and Mao, Huizi and Dally, William J.},
  title             = {Deep Compression: Compressing Deep Neural Networks with Pruning, Trained
                       Quantization and Huffman Coding},
  booktitle         = {Proceedings of the 4th International Conference on Learning Representations (ICLR)},
  publisher         = {OpenReview.net},
  year              = {2016},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://arxiv.org/abs/1510.00149},
}

@book{hennessy2024architecture,
  author        = {Hennessy, John L. and Patterson, David A. and Kozyrakis, Christos},
  title         = {Computer Architecture: A Quantitative Approach},
  edition       = {7th},
  publisher     = {Morgan Kaufmann},
  year          = {2024},
  isbn          = {978-0443154065},
  x-verified    = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@inproceedings{hoffmann2022chinchilla,
  author            = {Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur and others},
  title             = {Training Compute-Optimal Large Language Models},
  booktitle         = {Advances in Neural Information Processing Systems 35},
  volume            = {35},
  pages             = {30016--30030},
  publisher         = {Neural Information Processing Systems Foundation, Inc. (NeurIPS)},
  year              = {2022},
  doi               = {10.52202/068431-2176},
  url               = {https://doi.org/10.52202/068431-2176},
  source            = {Crossref},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://proceedings.neurips.cc/paper\_files/paper/2022/hash/c1e2faff6f588870935f114ebe04a3e5-Abstract-Conference.html},
}

@inproceedings{isaev2023,
  author            = {Isaev, Mikhail and McDonald, Nic and Dennison, Larry and Vuduc, Richard},
  title             = {Calculon: A Methodology and Tool for High-Level Co-Design of Systems and Large
                       Language Models},
  booktitle         = {Proceedings of the International Conference for High Performance Computing,
                       Networking, Storage and Analysis},
  pages             = {1--14},
  publisher         = {ACM},
  year              = {2023},
  doi               = {10.1145/3581784.3607102},
  url               = {https://doi.org/10.1145/3581784.3607102},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1145/3581784.3607102},
}

@inproceedings{jia2019flexflow,
  author            = {Jia, Zhihao and Zaharia, Matei and Aiken, Alex},
  title             = {Beyond Data and Model Parallelism for Deep Neural Networks},
  booktitle         = {Proceedings of Machine Learning and Systems (MLSys)},
  publisher         = {mlsys.org},
  year              = {2019},
  url               = {https://arxiv.org/abs/1807.05358},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://proceedings.mlsys.org/paper\_files/paper/2019/hash/b422680f3db0986ddd7f8f126baaf0fa-Abstract.html},
}

@inproceedings{jouppi2017,
  author            = {Jouppi, Norman P. and Young, Cliff and Patil, Nishant and others},
  title             = {In-Datacenter Performance Analysis of a Tensor Processing Unit},
  booktitle         = {Proceedings of the 44th Annual International Symposium on Computer Architecture},
  pages             = {1--12},
  publisher         = {ACM},
  year              = {2017},
  doi               = {10.1145/3079856.3080246},
  url               = {https://doi.org/10.1145/3079856.3080246},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1145/3079856.3080246},
}

@article{kaplan2020scaling,
  title = {Scaling Laws for Neural Language Models},
  author = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and others},
  year = {2020},
  journal = {arXiv preprint arXiv:2001.08361},
  eprint = {2001.08361},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2001.08361},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@misc{kim2023llmanalysis,
  author        = {Li, Cheng},
  title         = {llm-analysis: Latency and Memory Analysis of Transformer Models},
  howpublished  = {\url{https://github.com/cli99/llm-analysis}},
  year          = {2023},
  note          = {Accessed: 2025-01-15},
  x-verified    = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@inproceedings{kwon2023,
  title = {Efficient Memory Management for Large Language Model Serving With {PagedAttention}},
  author = {Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and others},
  year = {2023},
  booktitle = {Proceedings of the 29th Symposium on Operating Systems Principles},
  publisher = {ACM},
  pages = {611--626},
  doi = {10.1145/3600006.3613165},
  url = {https://doi.org/10.1145/3600006.3613165},
  source = {Crossref},
  x-verified = {2026-04-08},
  x-verified-by = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1145/3600006.3613165},
}

@article{leiserson1985,
  author        = {Leiserson, Charles E.},
  title         = {Fat-Trees: Universal Networks for Hardware-Efficient Supercomputing},
  journal       = {IEEE Transactions on Computers},
  volume        = {C-34},
  number        = {10},
  pages         = {892--901},
  year          = {1985},
  publisher     = {Institute of Electrical and Electronics Engineers (IEEE)},
  issn          = {0018-9340},
  doi           = {10.1109/tc.1985.6312192},
  url           = {https://doi.org/10.1109/tc.1985.6312192},
  source        = {Crossref},
  x-verified    = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@inproceedings{leviathan2023fast,
  author            = {Leviathan, Yaniv and Kalman, Matan and Matias, Yossi},
  title             = {Fast Inference from Transformers via Speculative Decoding},
  booktitle         = {Proceedings of the 40th International Conference on Machine Learning (ICML)},
  publisher         = {PMLR},
  year              = {2023},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://proceedings.mlr.press/v202/leviathan23a.html},
}

@inproceedings{liang2025lumos,
  author            = {Liang, Mingyu and Kassa, Hiwot Tadese and Fu, Wenyin and Coutinho, Brian and
                       Feng, Louis and Delimitrou, Christina},
  title             = {Lumos: Efficient Performance Modeling and Estimation for Large-scale {LLM} Training},
  booktitle         = {Proceedings of Machine Learning and Systems (MLSys)},
  publisher         = {mlsys.org},
  year              = {2025},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://mlsys.org/virtual/2025/papers.html},
}

@inproceedings{lie2022cerebras,
  author            = {Lie, Sean},
  title             = {Cerebras Architecture Deep Dive: First Look Inside the {HW/SW} Co-Design for Deep Learning},
  booktitle         = {IEEE Hot Chips 34 Symposium},
  publisher         = {IEEE},
  year              = {2022},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://hc34.hotchips.org/},
}

@article{lin2025,
  title = {
    {AWQ}: Activation-Aware Weight Quantization for On-Device {LLM} Compression and Acceleration
  },
  author = {
    Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Chen, Wei-Ming and Wang,
    Wei-Chen and Xiao, Guangxuan and Dang, Xingyu and Gan, Chuang and Han, Song
  },
  year = {2024},
  journal = {GetMobile: Mobile Computing and Communications},
  publisher = {Association for Computing Machinery (ACM)},
  volume = {28},
  number = {4},
  pages = {12--17},
  doi = {10.1145/3714983.3714987},
  issn = {2375-0529, 2375-0537},
  url = {https://doi.org/10.1145/3714983.3714987},
  source = {Crossref},
  x-verified = {2026-04-26},
  x-verified-by = {bib-web-verify},
  x-verified-source = {
    https://proceedings.mlsys.org/paper\_files/paper/2024/hash/42a452cbafa9dd64e9ba4aa95cc1ef21-Abstract-Conference.html
  },
  x-note = {
    Stray MLSys booktitle removed: journal, volume, pages and DOI all describe the GetMobile
    article. The MLSys 2024 paper (see x-verified-source) is a separate publication; cite it under
    its own key if the conference version is intended.
  },
}

@article{little1961,
  title = {A Proof for the Queuing Formula: {$L = \lambda W$}},
  author = {Little, John D. C.},
  year = {1961},
  journal = {Operations Research},
  publisher = {Institute for Operations Research and the Management Sciences (INFORMS)},
  volume = {9},
  number = {3},
  pages = {383--387},
  doi = {10.1287/opre.9.3.383},
  issn = {0030-364X, 1526-5463},
  url = {https://doi.org/10.1287/opre.9.3.383},
  source = {Crossref},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1287/opre.9.3.383},
}

@article{llama3team2024,
  title = {The {Llama} 3 Herd of Models},
  author = {{Llama Team, AI @ Meta}},
  year = {2024},
  journal = {arXiv preprint arXiv:2407.21783},
  eprint = {2407.21783},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2407.21783},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
  x-note = {
    Corporate author double-braced so BibTeX does not split it at the comma into last and first
    name. The previous backslash-at escape is a LaTeX spacing macro that prints nothing.
  },
}

@misc{lottick2019codecarbon,
  title = {Energy Usage Reports: Environmental Awareness as Part of Algorithmic Accountability},
  author = {Lottick, Kadan and Susai, Silvia and Friedler, Sorelle A. and Wilson, Jonathan P.},
  year = {2019},
  howpublished = {arXiv preprint arXiv:1911.08354},
  eprint = {1911.08354},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/1911.08354},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@article{mattson2020,
  title = {{MLPerf}: An Industry Standard Benchmark Suite for Machine Learning Performance},
  author = {Mattson, Peter and Cheng, Christine and Diamos, Gregory and others},
  year = {2020},
  journal = {IEEE Micro},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  volume = {40},
  number = {2},
  pages = {8--16},
  doi = {10.1109/mm.2020.2974843},
  issn = {0272-1732, 1937-4143},
  url = {https://doi.org/10.1109/mm.2020.2974843},
  source = {Crossref},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1109/MM.2020.2974843},
}

@book{mlsysbook2025,
  author            = {Reddi, Vijay Janapa and others},
  title             = {Machine Learning Systems: Principles and Practices of Engineering Artificially
                       Intelligent Systems},
  publisher         = {Harvard University},
  year              = {2025},
  url               = {https://mlsysbook.ai},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://mlsysbook.ai},
}

@article{mohan2021,
  title = {Analyzing and Mitigating Data Stalls in {DNN} Training},
  author = {Mohan, Jayashree and Phanishayee, Amar and Raniwala, Ashish and Chidambaram, Vijay},
  year = {2021},
  journal = {Proceedings of the VLDB Endowment},
  publisher = {Association for Computing Machinery (ACM)},
  volume = {14},
  number = {5},
  pages = {771--784},
  doi = {10.14778/3446095.3446100},
  issn = {2150-8097},
  url = {https://doi.org/10.14778/3446095.3446100},
  source = {Crossref},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@article{murray2021tf,
  title = {{tf.data}: A Machine Learning Data Processing Framework},
  author = {Murray, Derek G. and Simsa, Jiri and Klimovic, Ana and Indyk, Ihor},
  year = {2021},
  journal = {Proceedings of the VLDB Endowment},
  publisher = {VLDB Endowment},
  volume = {14},
  number = {12},
  pages = {2945--2958},
  x-verified = {2026-04-08},
  x-verified-by = {pass-16-bib-sweep},
  x-verified-source = {https://www.vldb.org/pvldb/vol14/p2945-klimovic.pdf},
  x-note = {Retyped from inproceedings to article: PVLDB is a journal with volume and issue numbers.},
}

@inproceedings{narayanan2021,
  title = {Efficient Large-Scale Language Model Training on {GPU} Clusters Using {Megatron-LM}},
  author = {Narayanan, Deepak and Shoeybi, Mohammad and Casper, Jared and others},
  year = {2021},
  booktitle = {
    Proceedings of the International Conference for High Performance Computing, Networking, Storage
    and Analysis
  },
  publisher = {ACM},
  pages = {1--15},
  doi = {10.1145/3458817.3476209},
  url = {https://doi.org/10.1145/3458817.3476209},
  source = {Crossref},
  x-verified = {2026-04-26},
  x-verified-by = {bib-web-verify},
  x-verified-source = {https://doi.org/10.1145/3458817.3476209},
}

@misc{nvidia2023h100,
  author        = {{NVIDIA Corporation}},
  title         = {{NVIDIA H100 Tensor Core GPU} Datasheet},
  howpublished  = {\url{https://www.nvidia.com/en-us/data-center/h100/}},
  year          = {2023},
  note          = {Accessed: 2024-06-15},
  x-verified    = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@inproceedings{parashar2019,
  title = {Timeloop: A Systematic Approach to {DNN} Accelerator Evaluation},
  author = {
    Parashar, Angshuman and Raina, Priyanka and Shao, Yakun Sophia and Chen, Yu-Hsin and Emer, Joel
    and others
  },
  year = {2019},
  booktitle = {2019 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  publisher = {IEEE},
  pages = {304--315},
  doi = {10.1109/ispass.2019.00042},
  url = {https://doi.org/10.1109/ispass.2019.00042},
  source = {Crossref},
  x-verified = {2026-04-08},
  x-verified-by = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1109/ISPASS.2019.00042},
}

@inproceedings{patel2024,
  title = {Splitwise: Efficient Generative {LLM} Inference Using Phase Splitting},
  author = {
    Patel, Pratyush and Choukse, Esha and Zhang, Chaojie and Shah, Aashaka and Goiri, {\'I}{\~n}igo
    and Maleki, Saeed and Bianchini, Ricardo
  },
  year = {2024},
  booktitle = {2024 ACM/IEEE 51st Annual International Symposium on Computer Architecture (ISCA)},
  publisher = {IEEE},
  pages = {118--132},
  doi = {10.1109/isca59077.2024.00019},
  url = {https://doi.org/10.1109/isca59077.2024.00019},
  source = {Crossref},
  x-verified = {2026-04-26},
  x-verified-by = {bib-web-verify},
  x-verified-source = {https://doi.org/10.1109/ISCA59077.2024.00019},
}

@book{patterson2014organization,
  author        = {Patterson, David A. and Hennessy, John L.},
  title         = {Computer Organization and Design: The Hardware/Software Interface},
  edition       = {5th},
  publisher     = {Morgan Kaufmann},
  year          = {2014},
  isbn          = {978-0124077263},
  x-verified    = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@article{patterson2021carbon,
  title = {Carbon Emissions and Large Neural Network Training},
  author = {Patterson, David and Gonzalez, Joseph and Le, Quoc and others},
  year = {2021},
  journal = {arXiv preprint arXiv:2104.10350},
  eprint = {2104.10350},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2104.10350},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@inproceedings{pope2023llm,
  author            = {Pope, Reiner and Douglas, Sholto and Chowdhery, Aakanksha and others},
  title             = {Efficiently Scaling Transformer Inference},
  booktitle         = {Proceedings of Machine Learning and Systems (MLSys)},
  volume            = {5},
  publisher         = {mlsys.org},
  year              = {2023},
  url               = {https://proceedings.mlsys.org/paper_files/paper/2023/hash/c4be71ab8d24cdfb45e3d06dbfca2780-Abstract-mlsys2023.html},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://arxiv.org/abs/2211.05102},
}

@inproceedings{qi2017paleo,
  author            = {Qi, Hang and Sparks, Evan R. and Talwalkar, Ameet},
  title             = {{PALEO}: A Performance Model for Deep Neural Networks},
  booktitle         = {Proceedings of the 5th International Conference on Learning Representations (ICLR)},
  publisher         = {OpenReview.net},
  year              = {2017},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://openreview.net/forum?id=SyVVJ85lg},
}

@inproceedings{rajbhandari2020,
  title = {{ZeRO}: Memory Optimizations Toward Training Trillion Parameter Models},
  author = {Rajbhandari, Samyam and Rasley, Jeff and Ruwase, Olatunji and He, Yuxiong},
  year = {2020},
  booktitle = {
    SC20: International Conference for High Performance Computing, Networking, Storage and Analysis
  },
  publisher = {IEEE},
  pages = {1--16},
  doi = {10.1109/sc41405.2020.00024},
  url = {https://doi.org/10.1109/sc41405.2020.00024},
  source = {Crossref},
  x-verified = {2026-04-08},
  x-verified-by = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1109/SC41405.2020.00024},
}

@inproceedings{rasley2020,
  title = {
    {DeepSpeed}: System Optimizations Enable Training Deep Learning Models With Over 100 Billion
    Parameters
  },
  author = {Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
  year = {2020},
  booktitle = {
    Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data
    Mining
  },
  publisher = {ACM},
  pages = {3505--3506},
  doi = {10.1145/3394486.3406703},
  url = {https://doi.org/10.1145/3394486.3406703},
  source = {Crossref},
  x-verified = {2026-04-08},
  x-verified-by = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1145/3394486.3406703},
  x-note = {
    Subtitle merged into title so classic BibTeX styles print the full title. The HTML ampersand
    entity in booktitle was replaced with a LaTeX-escaped ampersand.
  },
}

@inproceedings{shazeer2017outrageously,
  author            = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and others},
  title             = {Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
  booktitle         = {Proceedings of the 5th International Conference on Learning Representations (ICLR)},
  publisher         = {OpenReview.net},
  year              = {2017},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://openreview.net/forum?id=B1ckMDqlg},
}

@article{shoeybi2019megatron,
  title = {{Megatron-LM}: Training Multi-Billion Parameter Language Models Using Model Parallelism},
  author = {
    Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared
    and Catanzaro, Bryan
  },
  year = {2019},
  journal = {arXiv preprint arXiv:1909.08053},
  eprint = {1909.08053},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/1909.08053},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@inproceedings{snell2025scaling,
  author            = {Snell, Charlie and Lee, Jaehoon and Xu, Kelvin and Kumar, Aviral},
  title             = {Scaling {LLM} Test-Time Compute Optimally can be More Effective than Scaling Model Parameters},
  booktitle         = {Proceedings of the 13th International Conference on Learning Representations (ICLR)},
  publisher         = {OpenReview.net},
  year              = {2025},
  eprint            = {2408.03314},
  archiveprefix     = {arXiv},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://openreview.net/forum?id=4FWAwZtd2n},
}

@techreport{stephenson1999mco,
  title = {{Mars Climate Orbiter} Mishap Investigation Board {Phase I} Report},
  author = {
    Stephenson, Arthur G. and LaPiana, Lia S. and Mulville, Daniel R. and Rutledge, Peter J. and
    Bauer, Frank H. and Folta, David and Dukeman, Greg A. and Sackheim, Robert and Norvig, Peter
  },
  year = {1999},
  institution = {National Aeronautics and Space Administration},
  url = {https://ntrs.nasa.gov/citations/20000010501},
  x-verified = {2026-05-04},
  x-verified-by = {openai-MODEL},
  x-verified-source = {https://ntrs.nasa.gov/citations/20000010501},
}

@misc{sutton2019bitter,
  author        = {Sutton, Rich},
  title         = {The Bitter Lesson},
  howpublished  = {\url{http://www.incompleteideas.net/IncIdeas/BitterLesson.html}},
  year          = {2019},
  note          = {Accessed: 2024-06-15},
  x-verified    = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@book{tanenbaum2006minix,
  author        = {Tanenbaum, Andrew S. and Woodhull, Albert S.},
  title         = {Operating Systems: Design and Implementation},
  edition       = {3rd},
  publisher     = {Prentice Hall},
  year          = {2006},
  x-verified    = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

@misc{tinytorch2025,
  author            = {Reddi, Vijay Janapa and others},
  title             = {{TinyTorch}: A Progressive Educational Framework for Machine Learning Systems},
  howpublished      = {Harvard University},
  year              = {2025},
  url               = {https://mlsysbook.ai/tinytorch},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://mlsysbook.ai/tinytorch},
}

% SimAI paper, NSDI 2025 (USENIX). The author list is abbreviated with "and others".
@inproceedings{wang2025simai,
  author            = {Wang, Xizheng and others},
  title             = {
    {SimAI}: Unifying Architecture Design and Performance Tuning for Large-Scale Large Language
    Model Training with Scalability and Precision
  },
  booktitle         = {
    Proceedings of the 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI)
  },
  publisher         = {USENIX Association},
  year              = {2025},
  x-verified        = {2026-05-03},
  x-verified-by     = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
  x-verified-source = {
    https://www.usenix.org/conference/nsdi25/presentation/wang-xizheng-simai;
    https://dblp.org/rec/conf/nsdi/WangLXL0CZZZZLZ25.html
  },
}

% Roofline performance-model article (Communications of the ACM, 2009).
% The full title lives in the title field: classic BibTeX styles ignore a separate
% subtitle field, which would leave the reference rendered as just "Roofline".
@article{williams2009,
  title = {Roofline: An Insightful Visual Performance Model for Multicore Architectures},
  author = {Williams, Samuel and Waterman, Andrew and Patterson, David},
  year = {2009},
  journal = {Communications of the ACM},
  publisher = {Association for Computing Machinery (ACM)},
  volume = {52},
  number = {4},
  pages = {65--76},
  doi = {10.1145/1498765.1498785},
  issn = {0001-0782, 1557-7317},
  url = {https://doi.org/10.1145/1498765.1498785},
  source = {Crossref},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1145/1498765.1498785},
}

% ASTRA-Sim2.0 paper (ISPASS 2023). The tool name is braced so that sentence-casing
% bibliography styles do not lowercase it.
@inproceedings{won2023,
  title = {
    {ASTRA-Sim2.0}: Modeling Hierarchical Networks and Disaggregated Systems for Large-Model
    Training at Scale
  },
  author = {
    Won, William and Heo, Taekyung and Rashidi, Saeed and Sridharan, Srinivas and Srinivasan,
    Sudarshan and Krishna, Tushar
  },
  year = {2023},
  booktitle = {2023 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  publisher = {IEEE},
  pages = {283--294},
  doi = {10.1109/ispass57527.2023.00035},
  url = {https://doi.org/10.1109/ispass57527.2023.00035},
  source = {Crossref},
  x-verified = {2026-04-08},
  x-verified-by = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1109/ISPASS57527.2023.00035},
}

% arXiv preprint (2025). The identifier is recorded in eprint/archiveprefix rather than in
% booktitle, which misc entries do not print, and the abstract page is given as the url.
@misc{wongpanich2025fleet,
  title = {
    Machine Learning Fleet Efficiency: Analyzing and Optimizing Large-Scale {Google TPU} Systems
    with {ML} Productivity Goodput
  },
  author = {
    Wongpanich, Arissa and Oguntebi, Tayo and Baiocchi Paredes, Jose and Wang, Yu Emma and
    Phothilimthana, Phitchaya Mangpo and Mitra, Ritwika and Zhou, Zongwei and Kumar, Naveen and
    Reddi, Vijay Janapa
  },
  year = {2025},
  url = {https://arxiv.org/abs/2502.06982},
  eprint = {2502.06982},
  archiveprefix = {arXiv},
  x-verified = {2026-05-03},
  x-verified-by = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
  x-verified-source = {https://arxiv.org/abs/2502.06982},
}

% Accelergy energy-estimation methodology paper, ICCAD 2019 (IEEE).
@inproceedings{wu2019,
  author            = {Wu, Yannan Nellie and Emer, Joel S. and Sze, Vivienne},
  title             = {Accelergy: An Architecture-Level Energy Estimation Methodology for Accelerator Designs},
  booktitle         = {2019 IEEE/ACM International Conference on Computer-Aided Design (ICCAD)},
  publisher         = {IEEE},
  year              = {2019},
  pages             = {1--8},
  doi               = {10.1109/iccad45719.2019.8942149},
  url               = {https://doi.org/10.1109/iccad45719.2019.8942149},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1109/ICCAD45719.2019.8942149},
}

% Young's note on the optimum checkpoint interval (Communications of the ACM, 1974).
@article{young1974,
  author            = {Young, John W.},
  title             = {A First Order Approximation to the Optimum Checkpoint Interval},
  journal           = {Communications of the ACM},
  publisher         = {Association for Computing Machinery (ACM)},
  year              = {1974},
  volume            = {17},
  number            = {9},
  pages             = {530--531},
  issn              = {0001-0782, 1557-7317},
  doi               = {10.1145/361147.361115},
  url               = {https://doi.org/10.1145/361147.361115},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1145/361147.361115},
}

% Habitat performance-predictor paper, USENIX ATC 2021.
@inproceedings{yu2021habitat,
  author            = {Yu, Geoffrey X. and Gao, Yubo and Golikov, Pavel and Pekhimenko, Gennady},
  title             = {Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training},
  booktitle         = {Proceedings of the 2021 USENIX Annual Technical Conference (ATC)},
  publisher         = {USENIX Association},
  year              = {2021},
  x-verified        = {2026-05-03},
  x-verified-by     = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
  x-verified-source = {https://www.usenix.org/conference/atc21/presentation/yu},
}

% arXiv preprint (2024). Typed as misc with eprint/archiveprefix instead of placing the
% arXiv identifier in a journal field; the citation key is unchanged.
@misc{yuan2024llmviewer,
  title = {{LLM} Inference Unveiled: Survey and Roofline Model Insights},
  author = {Yuan, Zhihang and Shang, Yuzhang and Zhou, Yang and others},
  year = {2024},
  url = {https://arxiv.org/abs/2402.16363},
  eprint = {2402.16363},
  archiveprefix = {arXiv},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

% LLMCompass paper (ISCA 2024). The tool name is braced so that sentence-casing
% bibliography styles do not turn it into "Llmcompass".
@inproceedings{zhang2024,
  title = {{LLMCompass}: Enabling Efficient Hardware Design for Large Language Model Inference},
  author = {Zhang, Hengrui and Ning, August and Prabhakar, Rohan Baskar and Wentzlaff, David},
  year = {2024},
  booktitle = {2024 ACM/IEEE 51st Annual International Symposium on Computer Architecture (ISCA)},
  publisher = {IEEE},
  pages = {1080--1096},
  doi = {10.1109/isca59077.2024.00082},
  url = {https://doi.org/10.1109/isca59077.2024.00082},
  source = {Crossref},
  x-verified = {2026-05-03},
  x-verified-by = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
}

% SGLang paper (Advances in Neural Information Processing Systems 37).
% The arXiv identifier is kept in eprint/archiveprefix rather than in a journal field,
% and the system name is braced so that sentence-casing styles do not lowercase it.
@inproceedings{zheng2024sglang,
  title = {{SGLang}: Efficient Execution of Structured Language Model Programs},
  author = {Zheng, Lianmin and Yin, Liangsheng and Xie, Zhiqiang and others},
  year = {2024},
  booktitle = {Advances in Neural Information Processing Systems 37},
  publisher = {Neural Information Processing Systems Foundation, Inc. (NeurIPS)},
  pages = {62557--62583},
  doi = {10.52202/079017-2000},
  url = {https://doi.org/10.52202/079017-2000},
  eprint = {2312.07104},
  archiveprefix = {arXiv},
  source = {Crossref},
  x-verified = {2026-04-09},
  x-verified-by = {pass-17-bib-hygiene},
}

% DistServe paper on disaggregated prefill and decoding, OSDI 2024 (USENIX).
@inproceedings{zhong2024distserve,
  author            = {
    Zhong, Yinmin and Liu, Shengyu and Chen, Junda and Hu, Jianbo and Zhu, Yibo and Liu, Xuanzhe
    and Jin, Xin and Zhang, Hao
  },
  title             = {
    {DistServe}: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model
    Serving
  },
  booktitle         = {
    Proceedings of the 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI)
  },
  publisher         = {USENIX Association},
  year              = {2024},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://www.usenix.org/conference/osdi24/presentation/zhong-yinmin},
}