@article{williams2009roofline,
  author    = {Williams, Samuel and Waterman, Andrew and Patterson, David},
  title     = {{Roofline}: An Insightful Visual Performance Model for Multicore Architectures},
  journal   = {Communications of the ACM},
  volume    = {52},
  number    = {4},
  pages     = {65--76},
  year      = {2009},
  publisher = {ACM},
  doi       = {10.1145/1498765.1498785},
}

@inproceedings{mlperf2020,
  author    = {Mattson, Peter and Cheng, Christine and Diamos, Gregory and others},
  title     = {{MLPerf}: An Industry Standard Benchmark Suite for Machine Learning Performance},
  booktitle = {{IEEE/ACM} International Symposium on Microarchitecture ({MICRO})},
  year      = {2020},
  doi       = {10.1109/MICRO50266.2020.00045},
}

@inproceedings{rasley2020deepspeed,
  author    = {Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
  title     = {{DeepSpeed}: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters},
  booktitle = {{ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
  year      = {2020},
  doi       = {10.1145/3394486.3406703},
}

@misc{shoeybi2019megatron,
  author        = {Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
  title         = {{Megatron-LM}: Training Multi-Billion Parameter Language Models Using Model Parallelism},
  year          = {2019},
  eprint        = {1909.08053},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}

@article{young1974first,
  author  = {Young, John W.},
  title   = {A First Order Approximation to the Optimum Checkpoint Interval},
  journal = {Communications of the ACM},
  volume  = {17},
  number  = {9},
  pages   = {530--531},
  year    = {1974},
  doi     = {10.1145/361147.361115},
}

@article{daly2006higher,
  author  = {Daly, John T.},
  title   = {A Higher Order Estimate of the Optimum Checkpoint Interval for Restart Dumps},
  journal = {Future Generation Computer Systems},
  volume  = {22},
  number  = {3},
  pages   = {303--312},
  year    = {2006},
  doi     = {10.1016/j.future.2004.11.016},
}

@book{mlsysbook2024,
  author    = {Reddi, Vijay Janapa and others},
  title     = {Machine Learning Systems: Principles and Practices of Engineering Artificially Intelligent Systems},
  publisher = {Harvard University},
  year      = {2024},
  url       = {https://mlsysbook.ai},
}

@book{hennessy2019architecture,
  author    = {Hennessy, John L. and Patterson, David A.},
  title     = {Computer Architecture: A Quantitative Approach},
  edition   = {Sixth},
  publisher = {Morgan Kaufmann},
  year      = {2019},
  isbn      = {978-0128119051},
}

@inproceedings{jouppi2017datacenter,
  author    = {Jouppi, Norman P. and Young, Cliff and Patil, Nishant and others},
  title     = {In-Datacenter Performance Analysis of a {Tensor Processing Unit}},
  booktitle = {Proceedings of the 44th Annual International Symposium on Computer Architecture ({ISCA})},
  pages     = {1--12},
  year      = {2017},
  doi       = {10.1145/3079856.3080246},
}

@inproceedings{dean2012large,
  author    = {Dean, Jeffrey and Corrado, Greg S. and Monga, Rajat and others},
  title     = {Large Scale Distributed Deep Networks},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {25},
  year      = {2012},
}

@misc{amodei2018ai,
  author       = {Amodei, Dario and Hernandez, Danny},
  title        = {{AI} and Compute},
  howpublished = {OpenAI Blog},
  year         = {2018},
  url          = {https://openai.com/blog/ai-and-compute},
}

@misc{patterson2022carbon,
  author        = {Patterson, David and Gonzalez, Joseph and Le, Quoc and others},
  title         = {Carbon Emissions and Large Neural Network Training},
  year          = {2021},
  eprint        = {2104.10350},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
}

@inproceedings{rajbhandari2020zero,
  author    = {Rajbhandari, Samyam and Rasley, Jeff and Ruwase, Olatunji and He, Yuxiong},
  title     = {{ZeRO}: Memory Optimizations Toward Training Trillion Parameter Models},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis ({SC})},
  year      = {2020},
  doi       = {10.1109/SC41405.2020.00024},
}

@misc{kaplan2020scaling,
  author        = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and others},
  title         = {Scaling Laws for Neural Language Models},
  year          = {2020},
  eprint        = {2001.08361},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
}

@inproceedings{kwon2023efficient,
  author    = {Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and others},
  title     = {Efficient Memory Management for Large Language Model Serving with {PagedAttention}},
  booktitle = {Proceedings of the 29th {ACM} Symposium on Operating Systems Principles ({SOSP})},
  year      = {2023},
  doi       = {10.1145/3600006.3613165},
}

@misc{nvidia2023h100,
  author = {{NVIDIA Corporation}},
  title  = {{NVIDIA H100 Tensor Core GPU} Datasheet},
  year   = {2023},
  url    = {https://www.nvidia.com/en-us/data-center/h100/},
  note   = {Accessed: 2024-06-15},
}

@inproceedings{won2023astrasim2,
  author    = {Won, William and Heo, Taekyung and Rashidi, Saeed and Sridharan, Srinivas and Srinivasan, Sudarshan and Krishna, Tushar},
  title     = {{ASTRA-sim2.0}: Modeling Hierarchical Networks and Disaggregated Systems for Large-model Training at Scale},
  booktitle = {{IEEE} International Symposium on Performance Analysis of Systems and Software ({ISPASS})},
  year      = {2023},
  doi       = {10.1109/ISPASS57527.2023.00035},
}

@inproceedings{calculon2023,
  author    = {Isaev, Mikhail and McDonald, Nic and Dennison, Larry and Vuduc, Richard},
  title     = {{Calculon}: A Methodology and Tool for High-Level Co-Design of Systems and Large Language Models},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis ({SC})},
  year      = {2023},
  doi       = {10.1145/3581784.3607102},
}

@inproceedings{parashar2019timeloop,
  author    = {Parashar, Angshuman and Raina, Priyanka and Shao, Yakun Sophia and Chen, Yu-Hsin and Emer, Joel and others},
  title     = {{Timeloop}: A Systematic Approach to {DNN} Accelerator Evaluation},
  booktitle = {{IEEE} International Symposium on Performance Analysis of Systems and Software ({ISPASS})},
  year      = {2019},
  doi       = {10.1109/ISPASS.2019.00042},
}

@inproceedings{wu2019accelergy,
  author    = {Wu, Yannan Nellie and Emer, Joel S. and Sze, Vivienne},
  title     = {{Accelergy}: An Architecture-Level Energy Estimation Methodology for Accelerator Designs},
  booktitle = {{IEEE/ACM} International Conference on Computer-Aided Design ({ICCAD})},
  year      = {2019},
  doi       = {10.1109/ICCAD45719.2019.8942149},
}

@inproceedings{narayanan2021efficient,
  author    = {Narayanan, Deepak and Shoeybi, Mohammad and Casper, Jared and others},
  title     = {Efficient Large-Scale Language Model Training on {GPU} Clusters Using {Megatron-LM}},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis ({SC})},
  year      = {2021},
}

@misc{shazeer2017outrageously,
  author        = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and others},
  title         = {Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
  year          = {2017},
  eprint        = {1701.06538},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
}