mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-07 10:08:50 -05:00
Round 2 of the bib audit, covering paper subprojects (mlsysim,
tinytorch, periodic-table, mlperf-edu) that the textbook-focused first
pass deferred. Same pattern as round 1: surname/year prefixes did not
match the entry's actual paper, plus several corrupt entries from
Crossref misidentification.
Renames:
- mlsysim/{docs,paper}: barrett2024 -> zheng2024sglang (SGLang paper,
Zheng is first author).
- mlsysim/paper: zhao2025 -> deepseek2025v3 (DeepSeek-V3 ISCA paper,
corporate author DeepSeek-AI).
- tinytorch: key499f5624 -> tanenbaum1987os (hash-fallback for
Tanenbaum OS textbook); fry1985 -> abelson1996sicp (SICP 2nd ed,
Fry is not in author list); wooster1982 -> papert1980mindstorms
(Mindstorms by Papert, Wooster not in author list); collins2018 ->
collins1989apprenticeship (Cognitive Apprenticeship paper is 1989).
- tinytorch + periodic-table: vaswani2025 -> vaswani2017attention
(Attention paper is 2017; entries had a corrupt publisher and bogus
DOI from Crossref misidentification).
Body fixes accompanying renames:
- tanenbaum1987os, abelson1996sicp, papert1980mindstorms: rebuilt as
@book entries (were @article with stale review/journal DOIs).
- vaswani2017attention: rebuilt with canonical NeurIPS 2017 metadata
(Curran Associates, vol 30, pp 5998-6008); dropped corrupt DOI.
Orphan deletions:
- tinytorch keybe9561f4 (hash-fallback, no cite sites).
- mlperf-edu vaswani2017attention (orphan).
21 cite-site updates across 4 paper subprojects. bib_lint reports 0
errors across all 5 modified bibs.
1071 lines
38 KiB
BibTeX
1071 lines
38 KiB
BibTeX
@inproceedings{abadi2016,
  author            = {Abadi, Martin and Chu, Andy and Goodfellow, Ian and others},
  title             = {Deep Learning With Differential Privacy},
  booktitle         = {Proceedings of the 2016 ACM SIGSAC Conference on Computer and Communications Security},
  publisher         = {ACM},
  year              = {2016},
  pages             = {308--318},
  doi               = {10.1145/2976749.2978318},
  url               = {https://doi.org/10.1145/2976749.2978318},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1145/2976749.2978318},
}
|
|
|
|
@inproceedings{agrawal2024vidur,
  author            = {Agrawal, Amey and Kedia, Nitin and Panwar, Ashish and Mohan, Jayashree and Kwatra, Nipun and Gulavani, Bhargav S. and Tumanov, Alexey and Ramjee, Ramachandran},
  title             = {Vidur: A Large-Scale Simulation Framework For {LLM} Inference},
  booktitle         = {Proceedings of Machine Learning and Systems (MLSys)},
  publisher         = {mlsys.org},
  year              = {2024},
  url               = {https://arxiv.org/abs/2405.05465},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://proceedings.mlsys.org/paper\_files/paper/2024/hash/b74a8de47d2b3c928360e0a011f48351-Abstract-Conference.html},
}
|
|
|
|
@article{agrawal2025,
  author            = {Agrawal, Amey and Kedia, Nitin and Panwar, Ashish and Mohan, Jayashree and Kwatra, Nipun and Gulavani, Bhargav S. and Tumanov, Alexey and Ramjee, Ramachandran},
  title             = {Efficient {LLM} Inference via Chunked Prefills},
  journal           = {ACM SIGOPS Operating Systems Review},
  publisher         = {Association for Computing Machinery (ACM)},
  year              = {2025},
  volume            = {59},
  number            = {1},
  pages             = {9--16},
  doi               = {10.1145/3759441.3759444},
  issn              = {0163-5980},
  url               = {https://doi.org/10.1145/3759441.3759444},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://www.usenix.org/conference/osdi24/presentation/agrawal},
  x-note            = {This entry previously mixed two works: journal, volume, number, pages and DOI identify the SIGOPS Operating Systems Review article, while the removed booktitle (OSDI) and the x-verified-source URL point at an OSDI 2024 presentation. The DOI-identified article is kept; year set to match the key and volume 59 -- TODO confirm against the DOI record and confirm the cite sites intend this work.},
}
|
|
|
|
@misc{amodei2018ai,
  author            = {Amodei, Dario and Hernandez, Danny},
  title             = {{AI} and Compute},
  howpublished      = {OpenAI Blog},
  year              = {2018},
  url               = {https://openai.com/research/ai-and-compute},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://openai.com/research/ai-and-compute},
}
|
|
|
|
@article{bambhaniya2024genz,
  author            = {Bambhaniya, Abhimanyu and Raj, Ritik and Jeong, Geonhwa and Kundu, Souvik and Srinivasan, Sudarshan and Elavazhagan, Midhilesh and Kumar, Madhu and Krishna, Tushar},
  title             = {Demystifying Platform Requirements for Diverse {LLM} Inference Use Cases},
  journal           = {arXiv preprint arXiv:2406.01698},
  year              = {2024},
  x-verified        = {2026-05-03},
  x-verified-by     = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
  x-verified-source = {https://arxiv.org/abs/2406.01698; https://www.semanticscholar.org/paper/Demystifying-Platform-Requirements-for-Diverse-LLM-Bambhaniya-Raj/f82ba56524bd595518729f207f16bd07a11ddeda},
}
|
|
|
|
@article{barroso2007,
  author            = {Barroso, Luiz Andr{\'e} and H{\"o}lzle, Urs},
  title             = {The Case for Energy-Proportional Computing},
  journal           = {Computer},
  publisher         = {Institute of Electrical and Electronics Engineers (IEEE)},
  year              = {2007},
  volume            = {40},
  number            = {12},
  pages             = {33--37},
  doi               = {10.1109/mc.2007.443},
  issn              = {0018-9162},
  url               = {https://doi.org/10.1109/mc.2007.443},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1109/MC.2007.443},
}
|
|
|
|
@book{barroso2019,
  author            = {Barroso, Luiz Andr{\'e} and H{\"o}lzle, Urs and Ranganathan, Parthasarathy},
  title             = {The Datacenter as a Computer: Designing Warehouse-Scale Machines},
  publisher         = {Springer International Publishing},
  series            = {Synthesis Lectures on Computer Architecture},
  edition           = {3rd},
  year              = {2018},
  doi               = {10.1007/978-3-031-01761-2},
  isbn              = {9783031006333, 9783031017612},
  issn              = {1935-3235, 1935-3243},
  url               = {https://doi.org/10.1007/978-3-031-01761-2},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-note            = {The former subtitle field was folded into title so the full title prints under styles that do not know a subtitle field.},
}
|
|
|
|
@article{binkert2011,
  author            = {Binkert, Nathan and Beckmann, Bradford and Black, Gabriel and others},
  title             = {The {gem5} Simulator},
  journal           = {ACM SIGARCH Computer Architecture News},
  publisher         = {Association for Computing Machinery (ACM)},
  year              = {2011},
  volume            = {39},
  number            = {2},
  pages             = {1--7},
  doi               = {10.1145/2024716.2024718},
  issn              = {0163-5964},
  url               = {https://doi.org/10.1145/2024716.2024718},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1145/2024716.2024718},
  x-note            = {The simulator name is styled in lowercase; it is brace-protected so title-casing styles do not recapitalise it.},
}
|
|
|
|
@article{box1976,
  author            = {Box, George E. P.},
  title             = {Science and Statistics},
  journal           = {Journal of the American Statistical Association},
  publisher         = {Informa UK Limited},
  year              = {1976},
  volume            = {71},
  number            = {356},
  pages             = {791--799},
  doi               = {10.1080/01621459.1976.10480949},
  issn              = {0162-1459, 1537-274X},
  url               = {https://doi.org/10.1080/01621459.1976.10480949},
  source            = {Crossref},
  x-verified        = {2026-05-04},
  x-verified-by     = {openai-MODEL},
  x-verified-source = {https://doi.org/10.1080/01621459.1976.10480949},
  x-note            = {Removed a month field whose value was the literal string None (it would have been printed as a month name); journal name expanded from its abbreviation.},
}
|
|
|
|
@article{chowdhery2022palm,
  author            = {Chowdhery, Aakanksha and Narang, Sharan and Devlin, Jacob and others},
  title             = {{PaLM}: Scaling Language Modeling with Pathways},
  journal           = {Journal of Machine Learning Research},
  year              = {2023},
  volume            = {24},
  number            = {240},
  pages             = {1--113},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
}
|
|
|
|
@misc{cox2011xv6,
  author            = {Cox, Russ and Kaashoek, M. Frans and Morris, Robert},
  title             = {xv6: A Simple, {Unix}-like Teaching Operating System},
  howpublished      = {MIT PDOS},
  year              = {2011},
  url               = {https://pdos.csail.mit.edu/6.828/xv6},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://pdos.csail.mit.edu/6.828/xv6},
}
|
|
|
|
@article{daly2006,
  author            = {Daly, John T.},
  title             = {A Higher Order Estimate of the Optimum Checkpoint Interval for Restart Dumps},
  journal           = {Future Generation Computer Systems},
  publisher         = {Elsevier BV},
  year              = {2006},
  volume            = {22},
  number            = {3},
  pages             = {303--312},
  doi               = {10.1016/j.future.2004.11.016},
  issn              = {0167-739X},
  url               = {https://doi.org/10.1016/j.future.2004.11.016},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1016/j.future.2004.11.016},
  x-note            = {Journal name expanded from its abbreviation; abbreviating is left to the bibliography style.},
}
|
|
|
|
@inproceedings{dao2022,
  author            = {Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
  title             = {{FlashAttention}: Fast and Memory-Efficient Exact Attention With {IO}-Awareness},
  booktitle         = {Advances in Neural Information Processing Systems 35},
  publisher         = {Neural Information Processing Systems Foundation, Inc. (NeurIPS)},
  year              = {2022},
  volume            = {35},
  pages             = {16344--16359},
  doi               = {10.52202/068431-1189},
  url               = {https://doi.org/10.52202/068431-1189},
  source            = {Crossref},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://proceedings.neurips.cc/paper\_files/paper/2022/hash/67d57c32e20fd0a7a302cb81d36e40d5-Abstract-Conference.html},
  x-note            = {Mixed-case name and acronym in the title are brace-protected against sentence-casing styles.},
}
|
|
|
|
@inproceedings{dean2012large,
  author            = {Dean, Jeffrey and Corrado, Greg S. and Monga, Rajat and others},
  title             = {Large Scale Distributed Deep Networks},
  booktitle         = {Advances in Neural Information Processing Systems (NeurIPS)},
  publisher         = {Curran Associates},
  year              = {2012},
  volume            = {25},
  pages             = {1223--1231},
  x-verified        = {2026-05-03},
  x-verified-by     = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
  x-verified-source = {https://papers.nips.cc/paper/4687-large-scale-distributed-deep-networks; https://dl.acm.org/doi/10.5555/2999134.2999271},
  x-note            = {Retyped from article to inproceedings and dropped the duplicate journal field, matching how the other NeurIPS papers in this file are recorded; the entry carried both journal and booktitle for the same venue.},
}
|
|
|
|
@article{dean2013,
  author            = {Dean, Jeffrey and Barroso, Luiz Andr{\'e}},
  title             = {The Tail at Scale},
  journal           = {Communications of the ACM},
  publisher         = {Association for Computing Machinery (ACM)},
  year              = {2013},
  volume            = {56},
  number            = {2},
  pages             = {74--80},
  doi               = {10.1145/2408776.2408794},
  issn              = {0001-0782, 1557-7317},
  url               = {https://doi.org/10.1145/2408776.2408794},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1145/2408776.2408794},
}
|
|
|
|
@inproceedings{deepseek2025v3,
  author            = {{DeepSeek-AI}},
  title             = {Insights Into {DeepSeek-V3}: Scaling Challenges and Reflections on Hardware for {AI} Architectures},
  booktitle         = {Proceedings of the 52nd Annual International Symposium on Computer Architecture},
  publisher         = {ACM},
  year              = {2025},
  pages             = {1731--1745},
  doi               = {10.1145/3695053.3731412},
  url               = {https://doi.org/10.1145/3695053.3731412},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://api.crossref.org/works?query.title=Insights+into+DeepSeek-V3+Scaling+Challenges},
  x-note            = {Model name and acronym in the title are brace-protected against sentence-casing styles.},
}
|
|
|
|
@inproceedings{eisenman2022checknrun,
  author            = {Eisenman, Assaf and Matam, Kiran Kumar and Ingram, Steven and Mudigere, Dheevatsa and Krishnamoorthi, Raghuraman and Nair, Krishnakumar and Smelyanskiy, Misha and Annavaram, Murali},
  title             = {{Check-N-Run}: a Checkpointing System for Training Deep Learning Recommendation Models},
  booktitle         = {Proceedings of the 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI)},
  publisher         = {USENIX Association},
  year              = {2022},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://www.usenix.org/conference/nsdi22/presentation/eisenman},
  x-note            = {System name is brace-protected so its interior capitals survive sentence-casing styles.},
}
|
|
|
|
@inproceedings{faiz2024llmcarbon,
  author            = {Faiz, Ahmad and Kaneda, Sotaro and Wang, Ruhan and Osi, Rita and Sharma, Prateek and Chen, Fan and Jiang, Lei},
  title             = {{LLMCarbon}: Modeling the End-to-End Carbon Footprint of Large Language Models},
  booktitle         = {Proceedings of the 12th International Conference on Learning Representations (ICLR)},
  publisher         = {OpenReview.net},
  year              = {2024},
  x-verified        = {2026-05-03},
  x-verified-by     = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
  x-verified-source = {https://arxiv.org/abs/2309.14393; https://openreview.net/forum?id=aIok3ZD9to},
}
|
|
|
|
@inproceedings{frantar2023gptq,
  author            = {Frantar, Elias and Ashkboos, Saleh and Hoefler, Torsten and Alistarh, Dan},
  title             = {{GPTQ}: Accurate Post-Training Quantization for Generative Pre-trained Transformers},
  booktitle         = {Proceedings of the 11th International Conference on Learning Representations (ICLR)},
  publisher         = {OpenReview.net},
  year              = {2023},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://openreview.net/forum?id=tcbBPnfwxS},
}
|
|
|
|
@incollection{gholami2022,
  author            = {Gholami, Amir and Kim, Sehoon and Dong, Zhen and others},
  title             = {A Survey of Quantization Methods for Efficient Neural Network Inference},
  booktitle         = {Low-Power Computer Vision},
  publisher         = {Chapman and Hall/CRC},
  year              = {2022},
  pages             = {291--326},
  doi               = {10.1201/9781003162810-13},
  isbn              = {9781003162810},
  url               = {https://doi.org/10.1201/9781003162810-13},
  eprint            = {2103.13630},
  archiveprefix     = {arXiv},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-note            = {Retyped from inbook to incollection: the entry has a chapter title plus a separate booktitle, which is the incollection shape (classic inbook treats title as the book title and ignores booktitle). The arXiv identifier formerly stored in a journal field is now in eprint/archiveprefix. Year changed from 2021 (the preprint year) to the book year implied by the key -- TODO confirm against the DOI record.},
}
|
|
|
|
@inproceedings{gupta2022,
  author            = {Gupta, Udit and Elgamal, Mariam and Hills, Gage and Wei, Gu-Yeon and Lee, Hsien-Hsin S. and Brooks, David and Wu, Carole-Jean},
  title             = {{ACT}: Designing Sustainable Computer Systems With an Architectural Carbon Modeling Tool},
  booktitle         = {Proceedings of the 49th Annual International Symposium on Computer Architecture},
  publisher         = {ACM},
  year              = {2022},
  pages             = {784--799},
  doi               = {10.1145/3470496.3527408},
  url               = {https://doi.org/10.1145/3470496.3527408},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1145/3470496.3527408},
  x-note            = {Title was split as title=Act plus a subtitle field; styles that ignore subtitle printed only the word Act. The two parts are merged and the tool acronym is restored to capitals and brace-protected.},
}
|
|
|
|
@inproceedings{gupta2022chasing,
  author            = {Gupta, Udit and Kim, Young Geun and Lee, Sylvia and Tse, Jordan and Lee, Hsien-Hsin S. and Wei, Gu-Yeon and Brooks, David and Wu, Carole-Jean},
  title             = {Chasing Carbon: The Elusive Environmental Footprint of Computing},
  booktitle         = {IEEE International Symposium on High-Performance Computer Architecture (HPCA)},
  publisher         = {IEEE},
  year              = {2021},
  pages             = {854--867},
  doi               = {10.1109/hpca51647.2021.00076},
  url               = {https://doi.org/10.1109/hpca51647.2021.00076},
  source            = {Crossref},
  x-note            = {NOTE(review): this entry carried no x-verified stamp. Its previous year (2022), pages (85--99) and DOI (10.1109/hpca53966.2022.00013) appear to belong to a different HPCA 2022 paper; they were replaced with the HPCA 2021 record for this title. TODO confirm year, pages and DOI against the publisher record before stamping x-verified. Also added the missing period after the initial in Lee, Hsien-Hsin S. to match the gupta2022 entry.},
}
|
|
|
|
@inproceedings{han2016deep,
  author            = {Han, Song and Mao, Huizi and Dally, William J.},
  title             = {Deep Compression: Compressing Deep Neural Networks with Pruning, Trained Quantization and {Huffman} Coding},
  booktitle         = {Proceedings of the 4th International Conference on Learning Representations (ICLR)},
  publisher         = {OpenReview.net},
  year              = {2016},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://arxiv.org/abs/1510.00149},
  x-note            = {Proper noun in the title is brace-protected against sentence-casing styles.},
}
|
|
|
|
@book{hennessy2024architecture,
  author            = {Hennessy, John L. and Patterson, David A. and Kozyrakis, Christos},
  title             = {Computer Architecture: A Quantitative Approach},
  publisher         = {Morgan Kaufmann},
  edition           = {7th},
  year              = {2024},
  isbn              = {978-0443154065},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
}
|
|
|
|
@inproceedings{hoffmann2022chinchilla,
  author            = {Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur and others},
  title             = {Training Compute-Optimal Large Language Models},
  booktitle         = {Advances in Neural Information Processing Systems 35},
  publisher         = {Neural Information Processing Systems Foundation, Inc. (NeurIPS)},
  year              = {2022},
  volume            = {35},
  pages             = {30016--30030},
  doi               = {10.52202/068431-2176},
  url               = {https://doi.org/10.52202/068431-2176},
  source            = {Crossref},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://proceedings.neurips.cc/paper\_files/paper/2022/hash/c1e2faff6f588870935f114ebe04a3e5-Abstract-Conference.html},
}
|
|
|
|
@inproceedings{isaev2023,
  author            = {Isaev, Mikhail and McDonald, Nic and Dennison, Larry and Vuduc, Richard},
  title             = {Calculon: A Methodology and Tool for High-Level Co-Design of Systems and Large Language Models},
  booktitle         = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  publisher         = {ACM},
  year              = {2023},
  pages             = {1--14},
  doi               = {10.1145/3581784.3607102},
  url               = {https://doi.org/10.1145/3581784.3607102},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1145/3581784.3607102},
}
|
|
|
|
@inproceedings{jia2019flexflow,
  author            = {Jia, Zhihao and Zaharia, Matei and Aiken, Alex},
  title             = {Beyond Data and Model Parallelism for Deep Neural Networks},
  booktitle         = {Proceedings of Machine Learning and Systems (MLSys)},
  publisher         = {mlsys.org},
  year              = {2019},
  url               = {https://arxiv.org/abs/1807.05358},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://proceedings.mlsys.org/paper\_files/paper/2019/hash/b422680f3db0986ddd7f8f126baaf0fa-Abstract.html},
}
|
|
|
|
@inproceedings{jouppi2017,
  author            = {Jouppi, Norman P. and Young, Cliff and Patil, Nishant and others},
  title             = {In-Datacenter Performance Analysis of a Tensor Processing Unit},
  booktitle         = {Proceedings of the 44th Annual International Symposium on Computer Architecture},
  publisher         = {ACM},
  year              = {2017},
  pages             = {1--12},
  doi               = {10.1145/3079856.3080246},
  url               = {https://doi.org/10.1145/3079856.3080246},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1145/3079856.3080246},
}
|
|
|
|
@article{kaplan2020scaling,
  author            = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and others},
  title             = {Scaling Laws for Neural Language Models},
  journal           = {arXiv preprint arXiv:2001.08361},
  year              = {2020},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
}
|
|
|
|
@misc{kim2023llmanalysis,
  author            = {Li, Cheng},
  title             = {{llm-analysis}: Latency and Memory Analysis of Transformer Models},
  howpublished      = {GitHub repository},
  year              = {2023},
  url               = {https://github.com/cli99/llm-analysis},
  note              = {Accessed: 2025-01-15},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-note            = {The link moved from a url macro inside howpublished to the url field, matching the other misc entries in this file; the lowercase project name is brace-protected so styles do not capitalise it.},
}
|
|
|
|
@inproceedings{kwon2023,
  author            = {Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and others},
  title             = {Efficient Memory Management for Large Language Model Serving With {PagedAttention}},
  booktitle         = {Proceedings of the 29th Symposium on Operating Systems Principles},
  publisher         = {ACM},
  year              = {2023},
  pages             = {611--626},
  doi               = {10.1145/3600006.3613165},
  url               = {https://doi.org/10.1145/3600006.3613165},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1145/3600006.3613165},
  x-note            = {Mixed-case technique name in the title is brace-protected against sentence-casing styles.},
}
|
|
|
|
@article{leiserson1985,
  author            = {Leiserson, Charles E.},
  title             = {Fat-Trees: Universal Networks for Hardware-Efficient Supercomputing},
  journal           = {IEEE Transactions on Computers},
  publisher         = {Institute of Electrical and Electronics Engineers (IEEE)},
  year              = {1985},
  volume            = {C-34},
  number            = {10},
  pages             = {892--901},
  doi               = {10.1109/tc.1985.6312192},
  issn              = {0018-9340},
  url               = {https://doi.org/10.1109/tc.1985.6312192},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
}
|
|
|
|
@inproceedings{leviathan2023fast,
  author            = {Leviathan, Yaniv and Kalman, Matan and Matias, Yossi},
  title             = {Fast Inference from Transformers via Speculative Decoding},
  booktitle         = {Proceedings of the 40th International Conference on Machine Learning (ICML)},
  publisher         = {PMLR},
  year              = {2023},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://proceedings.mlr.press/v202/leviathan23a.html},
}
|
|
|
|
@inproceedings{liang2025lumos,
  author            = {Liang, Mingyu and Kassa, Hiwot Tadese and Fu, Wenyin and Coutinho, Brian and Feng, Louis and Delimitrou, Christina},
  title             = {Lumos: Efficient Performance Modeling and Estimation for Large-scale {LLM} Training},
  booktitle         = {Proceedings of Machine Learning and Systems (MLSys)},
  publisher         = {mlsys.org},
  year              = {2025},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://mlsys.org/virtual/2025/papers.html},
}
|
|
|
|
@inproceedings{lie2022cerebras,
  author            = {Lie, Sean},
  title             = {Cerebras Architecture Deep Dive: First Look Inside the {HW/SW} Co-Design for Deep Learning},
  booktitle         = {IEEE Hot Chips 34 Symposium},
  publisher         = {IEEE},
  year              = {2022},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://hc34.hotchips.org/},
}
|
|
|
|
@article{lin2025,
  author            = {Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Chen, Wei-Ming and Wang, Wei-Chen and Xiao, Guangxuan and Dang, Xingyu and Gan, Chuang and Han, Song},
  title             = {{AWQ}: Activation-Aware Weight Quantization for On-Device {LLM} Compression and Acceleration},
  journal           = {GetMobile: Mobile Computing and Communications},
  publisher         = {Association for Computing Machinery (ACM)},
  year              = {2024},
  volume            = {28},
  number            = {4},
  pages             = {12--17},
  doi               = {10.1145/3714983.3714987},
  issn              = {2375-0529, 2375-0537},
  url               = {https://doi.org/10.1145/3714983.3714987},
  source            = {Crossref},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://proceedings.mlsys.org/paper\_files/paper/2024/hash/42a452cbafa9dd64e9ba4aa95cc1ef21-Abstract-Conference.html},
  x-note            = {This entry previously mixed two venues: journal, volume, number, pages and DOI identify the GetMobile article, while the removed booktitle (MLSys) and the x-verified-source URL point at the MLSys 2024 proceedings page. The DOI-identified article is kept -- TODO confirm the cite sites intend this version. Acronyms in the title are brace-protected.},
}
|
|
|
|
@article{little1961,
  author            = {Little, John D. C.},
  title             = {A Proof for the Queuing Formula: {$L = \lambda W$}},
  journal           = {Operations Research},
  publisher         = {Institute for Operations Research and the Management Sciences (INFORMS)},
  year              = {1961},
  volume            = {9},
  number            = {3},
  pages             = {383--387},
  doi               = {10.1287/opre.9.3.383},
  issn              = {0030-364X, 1526-5463},
  url               = {https://doi.org/10.1287/opre.9.3.383},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1287/opre.9.3.383},
  x-note            = {The title contained HTML italic tags and an upper-case Lambda left over from the metadata import; the formula is now a brace-protected math expression with the lower-case lambda of the queuing formula. Journal name expanded from its abbreviation.},
}
|
|
|
|
@article{llama3team2024,
  author            = {{Llama Team, AI @ Meta}},
  title             = {The {Llama} 3 Herd of Models},
  journal           = {arXiv preprint arXiv:2407.21783},
  year              = {2024},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-note            = {Corporate author is wrapped in a second pair of braces, as the other corporate authors in this file are; with a single pair the comma made BibTeX parse it as Last=Llama Team, First=AI Meta. The backslash-at macro was replaced by a literal at sign because that macro is a LaTeX spacing command and prints nothing.},
}
|
|
|
|
@misc{lottick2019codecarbon,
  author            = {Lottick, Kadan and Susai, Silvia and Friedler, Sorelle A. and Wilson, Jonathan P.},
  title             = {Energy Usage Reports: Environmental Awareness as Part of Algorithmic Accountability},
  howpublished      = {arXiv preprint arXiv:1911.08354},
  year              = {2019},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
}
|
|
|
|
@article{mattson2020,
  author            = {Mattson, Peter and Cheng, Christine and Diamos, Gregory and others},
  title             = {{MLPerf}: An Industry Standard Benchmark Suite for Machine Learning Performance},
  journal           = {IEEE Micro},
  publisher         = {Institute of Electrical and Electronics Engineers (IEEE)},
  year              = {2020},
  volume            = {40},
  number            = {2},
  pages             = {8--16},
  doi               = {10.1109/mm.2020.2974843},
  issn              = {0272-1732, 1937-4143},
  url               = {https://doi.org/10.1109/mm.2020.2974843},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1109/MM.2020.2974843},
  x-note            = {Benchmark name in the title is brace-protected against sentence-casing styles.},
}
|
|
|
|
@book{mlsysbook2025,
  author            = {Reddi, Vijay Janapa and others},
  title             = {Machine Learning Systems: Principles and Practices of Engineering Artificially Intelligent Systems},
  publisher         = {Harvard University},
  year              = {2025},
  url               = {https://mlsysbook.ai},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://mlsysbook.ai},
}
|
|
|
|
@article{mohan2021,
  author            = {Mohan, Jayashree and Phanishayee, Amar and Raniwala, Ashish and Chidambaram, Vijay},
  title             = {Analyzing and Mitigating Data Stalls in {DNN} Training},
  journal           = {Proceedings of the VLDB Endowment},
  publisher         = {Association for Computing Machinery (ACM)},
  year              = {2021},
  volume            = {14},
  number            = {5},
  pages             = {771--784},
  doi               = {10.14778/3446095.3446100},
  issn              = {2150-8097},
  url               = {https://doi.org/10.14778/3446095.3446100},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-note            = {Acronym in the title is brace-protected; journal name expanded from its abbreviation so the venue is spelled the same way as in murray2021tf.},
}
|
|
|
|
@article{murray2021tf,
  author            = {Murray, Derek G. and Simsa, Jiri and Klimovic, Ana and Indyk, Ihor},
  title             = {{tf.data}: A Machine Learning Data Processing Framework},
  journal           = {Proceedings of the VLDB Endowment},
  publisher         = {VLDB Endowment},
  year              = {2021},
  volume            = {14},
  number            = {12},
  pages             = {2945--2958},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://www.vldb.org/pvldb/vol14/p2945-klimovic.pdf},
  x-note            = {Retyped from inproceedings to article (booktitle moved to journal): the entry has volume and issue numbers and the same venue is recorded as a journal in mohan2021. The lowercase library name in the title is brace-protected so styles do not capitalise it.},
}
|
|
|
|
@inproceedings{narayanan2021,
  author            = {Narayanan, Deepak and Shoeybi, Mohammad and Casper, Jared and others},
  title             = {Efficient Large-Scale Language Model Training on {GPU} Clusters Using {Megatron-LM}},
  booktitle         = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  publisher         = {ACM},
  year              = {2021},
  pages             = {1--15},
  doi               = {10.1145/3458817.3476209},
  url               = {https://doi.org/10.1145/3458817.3476209},
  source            = {Crossref},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://doi.org/10.1145/3458817.3476209},
  x-note            = {Acronym and system name in the title are brace-protected, matching the protection already used in shoeybi2019megatron.},
}
|
|
|
|
@misc{nvidia2023h100,
  author            = {{NVIDIA Corporation}},
  title             = {{NVIDIA H100 Tensor Core GPU} Datasheet},
  howpublished      = {NVIDIA product page},
  year              = {2023},
  url               = {https://www.nvidia.com/en-us/data-center/h100/},
  note              = {Accessed: 2024-06-15},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-note            = {The link moved from a url macro inside howpublished to the url field, matching the other misc entries in this file; howpublished now describes the medium.},
}
|
|
|
|
@inproceedings{parashar2019,
  author            = {Parashar, Angshuman and Raina, Priyanka and Shao, Yakun Sophia and Chen, Yu-Hsin and Emer, Joel and others},
  title             = {Timeloop: A Systematic Approach to {DNN} Accelerator Evaluation},
  booktitle         = {2019 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  publisher         = {IEEE},
  year              = {2019},
  pages             = {304--315},
  doi               = {10.1109/ispass.2019.00042},
  url               = {https://doi.org/10.1109/ispass.2019.00042},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1109/ISPASS.2019.00042},
  x-note            = {Acronym in the title is brace-protected against sentence-casing styles.},
}
|
|
|
|
@inproceedings{patel2024,
  author            = {Patel, Pratyush and Choukse, Esha and Zhang, Chaojie and Shah, Aashaka and Goiri, {\'I}{\~n}igo and Maleki, Saeed and Bianchini, Ricardo},
  title             = {Splitwise: Efficient Generative {LLM} Inference Using Phase Splitting},
  booktitle         = {2024 ACM/IEEE 51st Annual International Symposium on Computer Architecture (ISCA)},
  publisher         = {IEEE},
  year              = {2024},
  pages             = {118--132},
  doi               = {10.1109/isca59077.2024.00019},
  url               = {https://doi.org/10.1109/isca59077.2024.00019},
  source            = {Crossref},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://doi.org/10.1109/ISCA59077.2024.00019},
  x-note            = {Acronym in the title is brace-protected against sentence-casing styles.},
}
|
|
|
|
@book{patterson2014organization,
  author            = {Patterson, David A. and Hennessy, John L.},
  title             = {Computer Organization and Design: The Hardware/Software Interface},
  publisher         = {Morgan Kaufmann},
  edition           = {5th},
  year              = {2014},
  isbn              = {978-0124077263},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
}
|
|
|
|
@article{patterson2021carbon,
  author            = {Patterson, David and Gonzalez, Joseph and Le, Quoc and others},
  title             = {Carbon Emissions and Large Neural Network Training},
  journal           = {arXiv preprint arXiv:2104.10350},
  year              = {2021},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
}
|
|
|
|
@inproceedings{pope2023llm,
  author            = {Pope, Reiner and Douglas, Sholto and Chowdhery, Aakanksha and others},
  title             = {Efficiently Scaling Transformer Inference},
  booktitle         = {Proceedings of Machine Learning and Systems (MLSys)},
  publisher         = {mlsys.org},
  year              = {2023},
  volume            = {5},
  url               = {https://proceedings.mlsys.org/paper_files/paper/2023/hash/c4be71ab8d24cdfb45e3d06dbfca2780-Abstract-mlsys2023.html},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://arxiv.org/abs/2211.05102},
}
|
|
|
|
@inproceedings{qi2017paleo,
  author            = {Qi, Hang and Sparks, Evan R. and Talwalkar, Ameet},
  title             = {{PALEO}: A Performance Model for Deep Neural Networks},
  booktitle         = {Proceedings of the 5th International Conference on Learning Representations (ICLR)},
  publisher         = {OpenReview.net},
  year              = {2017},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://openreview.net/forum?id=SyVVJ85lg},
}
|
|
|
|
@inproceedings{rajbhandari2020,
  author            = {Rajbhandari, Samyam and Rasley, Jeff and Ruwase, Olatunji and He, Yuxiong},
  title             = {{ZeRO}: Memory Optimizations Toward Training Trillion Parameter Models},
  booktitle         = {SC20: International Conference for High Performance Computing, Networking, Storage and Analysis},
  publisher         = {IEEE},
  year              = {2020},
  pages             = {1--16},
  doi               = {10.1109/sc41405.2020.00024},
  url               = {https://doi.org/10.1109/sc41405.2020.00024},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1109/SC41405.2020.00024},
  x-note            = {Mixed-case system name in the title is brace-protected against sentence-casing styles.},
}
|
|
|
|
@inproceedings{rasley2020,
  author            = {Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
  title             = {{DeepSpeed}: System Optimizations Enable Training Deep Learning Models With Over 100 Billion Parameters},
  booktitle         = {Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
  publisher         = {ACM},
  year              = {2020},
  pages             = {3505--3506},
  doi               = {10.1145/3394486.3406703},
  url               = {https://doi.org/10.1145/3394486.3406703},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1145/3394486.3406703},
  x-note            = {Title was split as title=DeepSpeed plus a subtitle field; styles that ignore subtitle printed only the one word. The two parts are merged and the system name is brace-protected.},
}
|
|
|
|
@inproceedings{shazeer2017outrageously,
  author            = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and others},
  title             = {Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
  booktitle         = {Proceedings of the 5th International Conference on Learning Representations (ICLR)},
  publisher         = {OpenReview.net},
  year              = {2017},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://openreview.net/forum?id=B1ckMDqlg},
}
|
|
|
|
@article{shoeybi2019megatron,
  author            = {Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
  title             = {{Megatron-LM}: Training Multi-Billion Parameter Language Models Using Model Parallelism},
  journal           = {arXiv preprint arXiv:1909.08053},
  year              = {2019},
  eprint            = {1909.08053},
  archiveprefix     = {arXiv},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
}
|
|
|
|
@inproceedings{snell2025scaling,
  author            = {Snell, Charlie and Lee, Jaehoon and Xu, Kelvin and Kumar, Aviral},
  title             = {Scaling {LLM} Test-Time Compute Optimally can be More Effective than Scaling Model Parameters},
  booktitle         = {Proceedings of the 13th International Conference on Learning Representations (ICLR)},
  publisher         = {OpenReview.net},
  year              = {2025},
  eprint            = {2408.03314},
  archiveprefix     = {arXiv},
  x-verified        = {2026-04-26},
  x-verified-by     = {bib-web-verify},
  x-verified-source = {https://openreview.net/forum?id=4FWAwZtd2n},
}
|
|
|
|
@techreport{stephenson1999mco,
  author            = {Stephenson, Arthur G. and LaPiana, Lia S. and Mulville, Daniel R. and Rutledge, Peter J. and Bauer, Frank H. and Folta, David and Dukeman, Greg A. and Sackheim, Robert and Norvig, Peter},
  title             = {{Mars Climate Orbiter Mishap Investigation Board} Phase {I} Report},
  institution       = {National Aeronautics and Space Administration},
  year              = {1999},
  month             = nov,
  x-verified        = {2026-05-04},
  x-verified-by     = {openai-MODEL},
  x-verified-source = {https://ntrs.nasa.gov/citations/20000010501},
}
|
|
|
|
@misc{sutton2019bitter,
  author            = {Sutton, Rich},
  title             = {The Bitter Lesson},
  howpublished      = {\url{http://www.incompleteideas.net/IncIdeas/BitterLesson.html}},
  year              = {2019},
  note              = {Accessed: 2024-06-15},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
}
|
|
|
|
@book{tanenbaum2006minix,
  author            = {Tanenbaum, Andrew S. and Woodhull, Albert S.},
  title             = {Operating Systems: Design and Implementation},
  edition           = {3rd},
  publisher         = {Prentice Hall},
  year              = {2006},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
}
|
|
|
|
@misc{tinytorch2025,
  author            = {Reddi, Vijay Janapa and others},
  title             = {{TinyTorch}: A Progressive Educational Framework for Machine Learning Systems},
  howpublished      = {Harvard University},
  year              = {2025},
  url               = {https://mlsysbook.ai/tinytorch},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://mlsysbook.ai/tinytorch},
}
|
|
|
|
@inproceedings{wang2025simai,
  author            = {Wang, Xizheng and others},
  title             = {{SimAI}: Unifying Architecture Design and Performance Tuning for Large-Scale Large Language Model Training with Scalability and Precision},
  booktitle         = {Proceedings of the 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI)},
  publisher         = {USENIX Association},
  year              = {2025},
  x-verified        = {2026-05-03},
  x-verified-by     = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
  x-verified-source = {https://www.usenix.org/conference/nsdi25/presentation/wang-xizheng-simai; https://dblp.org/rec/conf/nsdi/WangLXL0CZZZZLZ25.html},
}
|
|
|
|
@article{williams2009,
  author            = {Williams, Samuel and Waterman, Andrew and Patterson, David},
  title             = {Roofline: An Insightful Visual Performance Model for Multicore Architectures},
  journal           = {Communications of the ACM},
  publisher         = {Association for Computing Machinery (ACM)},
  year              = {2009},
  volume            = {52},
  number            = {4},
  pages             = {65--76},
  doi               = {10.1145/1498765.1498785},
  issn              = {0001-0782, 1557-7317},
  url               = {https://doi.org/10.1145/1498765.1498785},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1145/1498765.1498785},
}
|
|
|
|
@inproceedings{won2023,
  author            = {Won, William and Heo, Taekyung and Rashidi, Saeed and Sridharan, Srinivas and Srinivasan, Sudarshan and Krishna, Tushar},
  title             = {{ASTRA-Sim2.0}: Modeling Hierarchical Networks and Disaggregated Systems for Large-Model Training at Scale},
  booktitle         = {2023 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  publisher         = {IEEE},
  year              = {2023},
  pages             = {283--294},
  doi               = {10.1109/ispass57527.2023.00035},
  url               = {https://doi.org/10.1109/ispass57527.2023.00035},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1109/ISPASS57527.2023.00035},
}
|
|
|
|
@misc{wongpanich2025fleet,
  author            = {Wongpanich, Arissa and Oguntebi, Tayo and Baiocchi Paredes, Jose and Wang, Yu Emma and Phothilimthana, Phitchaya Mangpo and Mitra, Ritwika and Zhou, Zongwei and Kumar, Naveen and Reddi, Vijay Janapa},
  title             = {Machine Learning Fleet Efficiency: Analyzing and Optimizing Large-Scale {Google TPU} Systems with {ML} Productivity Goodput},
  year              = {2025},
  eprint            = {2502.06982},
  archiveprefix     = {arXiv},
  url               = {https://arxiv.org/abs/2502.06982},
  note              = {arXiv:2502.06982},
  x-verified        = {2026-05-03},
  x-verified-by     = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
  x-verified-source = {https://arxiv.org/abs/2502.06982},
}
|
|
|
|
@inproceedings{wu2019,
  author            = {Wu, Yannan Nellie and Emer, Joel S. and Sze, Vivienne},
  title             = {Accelergy: An Architecture-Level Energy Estimation Methodology for Accelerator Designs},
  booktitle         = {2019 IEEE/ACM International Conference on Computer-Aided Design (ICCAD)},
  publisher         = {IEEE},
  year              = {2019},
  pages             = {1--8},
  doi               = {10.1109/iccad45719.2019.8942149},
  url               = {https://doi.org/10.1109/iccad45719.2019.8942149},
  source            = {Crossref},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://doi.org/10.1109/ICCAD45719.2019.8942149},
}
|
|
|
|
@article{young1974,
  author            = {Young, John W.},
  title             = {A First Order Approximation to the Optimum Checkpoint Interval},
  journal           = {Communications of the ACM},
  publisher         = {Association for Computing Machinery (ACM)},
  year              = {1974},
  volume            = {17},
  number            = {9},
  pages             = {530--531},
  doi               = {10.1145/361147.361115},
  issn              = {0001-0782, 1557-7317},
  url               = {https://doi.org/10.1145/361147.361115},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
  x-verified-source = {https://doi.org/10.1145/361147.361115},
}
|
|
|
|
@inproceedings{yu2021habitat,
  author            = {Yu, Geoffrey X. and Gao, Yubo and Golikov, Pavel and Pekhimenko, Gennady},
  title             = {Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training},
  booktitle         = {Proceedings of the 2021 USENIX Annual Technical Conference (ATC)},
  publisher         = {USENIX Association},
  year              = {2021},
  x-verified        = {2026-05-03},
  x-verified-by     = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
  x-verified-source = {https://www.usenix.org/conference/atc21/presentation/yu},
}
|
|
|
|
@article{yuan2024llmviewer,
  author            = {Yuan, Zhihang and Shang, Yuzhang and Zhou, Yang and others},
  title             = {{LLM} Inference Unveiled: Survey and Roofline Model Insights},
  journal           = {arXiv preprint arXiv:2402.16363},
  year              = {2024},
  eprint            = {2402.16363},
  archiveprefix     = {arXiv},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
}
|
|
|
|
@inproceedings{zhang2024,
  author            = {Zhang, Hengrui and Ning, August and Prabhakar, Rohan Baskar and Wentzlaff, David},
  title             = {{LLMCompass}: Enabling Efficient Hardware Design for Large Language Model Inference},
  booktitle         = {2024 ACM/IEEE 51st Annual International Symposium on Computer Architecture (ISCA)},
  publisher         = {IEEE},
  year              = {2024},
  pages             = {1080--1096},
  doi               = {10.1109/isca59077.2024.00082},
  url               = {https://doi.org/10.1109/isca59077.2024.00082},
  source            = {Crossref},
  x-verified        = {2026-05-03},
  x-verified-by     = {claude-bib-audit-2026-05},
  x-verified-status = {verified},
}
|
|
|
|
@inproceedings{zheng2024sglang,
  author            = {Zheng, Lianmin and Yin, Liangsheng and Xie, Zhiqiang and others},
  title             = {{SGLang}: Efficient Execution of Structured Language Model Programs},
  booktitle         = {Advances in Neural Information Processing Systems 37},
  publisher         = {Neural Information Processing Systems Foundation, Inc. (NeurIPS)},
  year              = {2024},
  pages             = {62557--62583},
  doi               = {10.52202/079017-2000},
  url               = {https://doi.org/10.52202/079017-2000},
  eprint            = {2312.07104},
  archiveprefix     = {arXiv},
  source            = {Crossref},
  x-verified        = {2026-04-09},
  x-verified-by     = {pass-17-bib-hygiene},
}
|
|
|
|
@inproceedings{zhong2024distserve,
  author            = {Zhong, Yinmin and Liu, Shengyu and Chen, Junda and Hu, Jianbo and Zhu, Yibo and Liu, Xuanzhe and Jin, Xin and Zhang, Hao},
  title             = {{DistServe}: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving},
  booktitle         = {Proceedings of the 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI)},
  publisher         = {USENIX Association},
  year              = {2024},
  x-verified        = {2026-04-08},
  x-verified-by     = {pass-16-bib-sweep},
  x-verified-source = {https://www.usenix.org/conference/osdi24/presentation/zhong-yinmin},
}
|