@article{williams2009roofline,
  author    = {Williams, Samuel and Waterman, Andrew and Patterson, David},
  title     = {{Roofline}: An Insightful Visual Performance Model for Multicore Architectures},
  journal   = {Communications of the ACM},
  volume    = {52},
  number    = {4},
  pages     = {65--76},
  year      = {2009},
  publisher = {ACM},
  doi       = {10.1145/1498765.1498785},
}

@inproceedings{mlperf2020,
  author    = {Mattson, Peter and Cheng, Christine and Diamos, Gregory and others},
  title     = {{MLPerf}: An Industry Standard Benchmark Suite for Machine Learning Performance},
  booktitle = {{IEEE/ACM} International Symposium on Microarchitecture ({MICRO})},
  year      = {2020},
  doi       = {10.1109/MICRO50266.2020.00045},
}

@inproceedings{rasley2020deepspeed,
  author    = {Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
  title     = {{DeepSpeed}: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters},
  booktitle = {{ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
  year      = {2020},
  doi       = {10.1145/3394486.3406703},
}

@misc{shoeybi2019megatron,
  author        = {Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
  title         = {{Megatron-LM}: Training Multi-Billion Parameter Language Models Using Model Parallelism},
  year          = {2019},
  eprint        = {1909.08053},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}

@article{young1974first,
  author  = {Young, John W.},
  title   = {A First Order Approximation to the Optimum Checkpoint Interval},
  journal = {Communications of the ACM},
  volume  = {17},
  number  = {9},
  pages   = {530--531},
  year    = {1974},
  doi     = {10.1145/361147.361115},
}

@article{daly2006higher,
  author  = {Daly, John T.},
  title   = {A Higher Order Estimate of the Optimum Checkpoint Interval for Restart Dumps},
  journal = {Future Generation Computer Systems},
  volume  = {22},
  number  = {3},
  pages   = {303--312},
  year    = {2006},
  doi     = {10.1016/j.future.2004.11.016},
}

@book{mlsysbook2024,
  author    = {Reddi, Vijay Janapa and others},
  title     = {Machine Learning Systems: Principles and Practices of Engineering Artificially Intelligent Systems},
  publisher = {Harvard University},
  year      = {2024},
  url       = {https://mlsysbook.ai},
}

@book{hennessy2019architecture,
  author    = {Hennessy, John L. and Patterson, David A.},
  title     = {Computer Architecture: A Quantitative Approach},
  edition   = {Sixth},
  publisher = {Morgan Kaufmann},
  year      = {2019},
  isbn      = {978-0128119051},
}

@inproceedings{jouppi2017datacenter,
  author    = {Jouppi, Norman P. and Young, Cliff and Patil, Nishant and others},
  title     = {In-Datacenter Performance Analysis of a {Tensor Processing Unit}},
  booktitle = {Proceedings of the 44th Annual International Symposium on Computer Architecture ({ISCA})},
  pages     = {1--12},
  year      = {2017},
  doi       = {10.1145/3079856.3080246},
}

@inproceedings{dean2012large,
  author    = {Dean, Jeffrey and Corrado, Greg S. and Monga, Rajat and others},
  title     = {Large Scale Distributed Deep Networks},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {25},
  year      = {2012},
}

@misc{amodei2018ai,
  author       = {Amodei, Dario and Hernandez, Danny},
  title        = {{AI} and Compute},
  howpublished = {OpenAI Blog},
  year         = {2018},
  url          = {https://openai.com/blog/ai-and-compute},
}

@misc{patterson2022carbon,
  author        = {Patterson, David and Gonzalez, Joseph and Le, Quoc and others},
  title         = {Carbon Emissions and Large Neural Network Training},
  year          = {2021},
  eprint        = {2104.10350},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
}

@inproceedings{rajbhandari2020zero,
  author    = {Rajbhandari, Samyam and Rasley, Jeff and Ruwase, Olatunji and He, Yuxiong},
  title     = {{ZeRO}: Memory Optimizations Toward Training Trillion Parameter Models},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis ({SC})},
  year      = {2020},
  doi       = {10.1109/SC41405.2020.00024},
}

@misc{kaplan2020scaling,
  author        = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and others},
  title         = {Scaling Laws for Neural Language Models},
  year          = {2020},
  eprint        = {2001.08361},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
}

@inproceedings{kwon2023efficient,
  author    = {Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and others},
  title     = {Efficient Memory Management for Large Language Model Serving with {PagedAttention}},
  booktitle = {Proceedings of the 29th {ACM} Symposium on Operating Systems Principles ({SOSP})},
  year      = {2023},
  doi       = {10.1145/3600006.3613165},
}

@misc{nvidia2023h100,
  author = {{NVIDIA Corporation}},
  title  = {{NVIDIA H100 Tensor Core GPU} Datasheet},
  year   = {2023},
  url    = {https://www.nvidia.com/en-us/data-center/h100/},
  note   = {Accessed: 2024-06-15},
}

@inproceedings{won2023astrasim2,
  author    = {Won, William and Heo, Taekyung and Rashidi, Saeed and Sridharan, Srinivas and Srinivasan, Sudarshan and Krishna, Tushar},
  title     = {{ASTRA-sim2.0}: Modeling Hierarchical Networks and Disaggregated Systems for Large-model Training at Scale},
  booktitle = {{IEEE} International Symposium on Performance Analysis of Systems and Software ({ISPASS})},
  year      = {2023},
  doi       = {10.1109/ISPASS57527.2023.00035},
}

@inproceedings{calculon2023,
  author    = {Isaev, Mikhail and McDonald, Nic and Dennison, Larry and Vuduc, Richard},
  title     = {{Calculon}: A Methodology and Tool for High-Level Co-Design of Systems and Large Language Models},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis ({SC})},
  year      = {2023},
  doi       = {10.1145/3581784.3607102},
}

@inproceedings{parashar2019timeloop,
  author    = {Parashar, Angshuman and Raina, Priyanka and Shao, Yakun Sophia and Chen, Yu-Hsin and Emer, Joel and others},
  title     = {{Timeloop}: A Systematic Approach to {DNN} Accelerator Evaluation},
  booktitle = {{IEEE} International Symposium on Performance Analysis of Systems and Software ({ISPASS})},
  year      = {2019},
  doi       = {10.1109/ISPASS.2019.00042},
}

@inproceedings{wu2019accelergy,
  author    = {Wu, Yannan Nellie and Emer, Joel S. and Sze, Vivienne},
  title     = {{Accelergy}: An Architecture-Level Energy Estimation Methodology for Accelerator Designs},
  booktitle = {{IEEE/ACM} International Conference on Computer-Aided Design ({ICCAD})},
  year      = {2019},
  doi       = {10.1109/ICCAD45719.2019.8942149},
}

@inproceedings{narayanan2021efficient,
  author    = {Narayanan, Deepak and Shoeybi, Mohammad and Casper, Jared and others},
  title     = {Efficient Large-Scale Language Model Training on {GPU} Clusters Using {Megatron-LM}},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis ({SC})},
  year      = {2021},
}

@misc{shazeer2017outrageously,
  author        = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and others},
  title         = {Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
  year          = {2017},
  eprint        = {1701.06538},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
}