cs249r_book/contents/hw_acceleration/hw_acceleration.bib

% Same reference data as the gwennapcertusnx entry; no journal or year is recorded.
@article{gwennap_certus-nx_nodate,
 author = {Gwennap, Linley},
 title = {Certus-{NX} Innovates General-Purpose {FPGAs}},
 language = {en}
}

@inproceedings{adolf2016fathom,
 author = {Adolf, Robert and Rama, Saketh and Reagen, Brandon and Wei, Gu-Yeon and Brooks, David},
 booktitle = {2016 IEEE International Symposium on Workload Characterization (IISWC)},
 doi = {10.1109/iiswc.2016.7581275},
 organization = {IEEE},
 pages = {1--10},
 publisher = {IEEE},
 source = {Crossref},
 title = {Fathom: {Reference} workloads for modern deep learning methods},
 url = {https://doi.org/10.1109/iiswc.2016.7581275},
 year = {2016}
}

@inproceedings{agnesina2023autodmp,
 author = {Agnesina, Anthony and Rajvanshi, Puranjay and Yang, Tian and Pradipta, Geraldo and Jiao, Austin and Keller, Ben and Khailany, Brucek and Ren, Haoxing},
 booktitle = {Proceedings of the 2023 International Symposium on Physical Design},
 pages = {149--157},
 title = {{AutoDMP}: Automated {DREAMPlace}-based macro placement},
 year = {2023}
}

@article{asit2021accelerating,
  author     = {Asit K. Mishra and Jorge Albericio Latorre and Jeff Pool and Darko Stosic and Dusan Stosic and Ganesh Venkatesh and Chong Yu and Paulius Micikevicius},
  title      = {Accelerating Sparse Deep Neural Networks},
  journal    = {CoRR},
  volume     = {abs/2104.08378},
  year       = {2021},
  eprint     = {2104.08378},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2104.08378},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2104-08378.bib},
  timestamp  = {Mon, 26 Apr 2021 17:25:10 +0200}
}

@article{bains2020business,
  author    = {Bains, Sunny},
  title     = {The business of building brains},
  journal   = {Nature Electronics},
  volume    = {3},
  number    = {7},
  pages     = {348--351},
  year      = {2020},
  publisher = {Springer Science and Business Media LLC},
  issn      = {2520-1131},
  doi       = {10.1038/s41928-020-0449-1},
  url       = {https://doi.org/10.1038/s41928-020-0449-1},
  source    = {Crossref}
}

@inproceedings{bhardwaj2020comprehensive,
 author = {Bhardwaj, Kshitij and Havasi, Marton and Yao, Yuan and Brooks, David M and Hern{\'a}ndez-Lobato, Jos{\'e} Miguel and Wei, Gu-Yeon},
 booktitle = {Proceedings of the ACM/IEEE International Symposium on Low Power Electronics and Design},
 pages = {145--150},
 title = {A comprehensive methodology to determine optimal coherence interfaces for many-accelerator {SoCs}},
 year = {2020}
}

@article{biggs2021natively,
 author = {Biggs, John and Myers, James and Kufel, Jedrzej and Ozer, Emre and Craske, Simon and Sou, Antony and Ramsdale, Catherine and Williamson, Ken and Price, Richard and White, Scott},
 doi = {10.1038/s41586-021-03625-w},
 issn = {0028-0836, 1476-4687},
 journal = {Nature},
 number = {7868},
 pages = {532--536},
 publisher = {Springer Science and Business Media LLC},
 source = {Crossref},
 title = {A natively flexible 32-bit {Arm} microprocessor},
 url = {https://doi.org/10.1038/s41586-021-03625-w},
 volume = {595},
 year = {2021}
}

@article{binkert2011gem5,
  author    = {Binkert, Nathan and Beckmann, Bradford and Black, Gabriel and Reinhardt, Steven K. and Saidi, Ali and Basu, Arkaprava and Hestness, Joel and Hower, Derek R. and Krishna, Tushar and Sardashti, Somayeh and Sen, Rathijit and Sewell, Korey and Shoaib, Muhammad and Vaish, Nilay and Hill, Mark D. and Wood, David A.},
  title     = {The gem5 simulator},
  journal   = {ACM SIGARCH Computer Architecture News},
  volume    = {39},
  number    = {2},
  pages     = {1--7},
  year      = {2011},
  publisher = {Association for Computing Machinery (ACM)},
  issn      = {0163-5964},
  doi       = {10.1145/2024716.2024718},
  url       = {https://doi.org/10.1145/2024716.2024718},
  source    = {Crossref}
}

@inproceedings{brown2020language,
  author    = {Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert{-}Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei},
  title     = {Language Models are Few-Shot Learners},
  booktitle = {Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual},
  editor    = {Hugo Larochelle and Marc'Aurelio Ranzato and Raia Hadsell and Maria{-}Florina Balcan and Hsuan{-}Tien Lin},
  year      = {2020},
  url       = {https://proceedings.neurips.cc/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://dblp.org/rec/conf/nips/BrownMRSKDNSSAA20.bib},
  timestamp = {Tue, 19 Jan 2021 00:00:00 +0100}
}

@article{burr2016recent,
 author = {Burr, Geoffrey W. and BrightSky, Matthew J. and Sebastian, Abu and Cheng, Huai-Yu and Wu, Jau-Yi and Kim, Sangbum and Sosa, Norma E. and Papandreou, Nikolaos and Lung, Hsiang-Lan and Pozidis, Haralampos and Eleftheriou, Evangelos and Lam, Chung H.},
 doi = {10.1109/jetcas.2016.2547718},
 issn = {2156-3357, 2156-3365},
 journal = {IEEE Journal on Emerging and Selected Topics in Circuits and Systems},
 number = {2},
 pages = {146--162},
 publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
 source = {Crossref},
 title = {Recent Progress in Phase-Change Memory Technology},
 url = {https://doi.org/10.1109/jetcas.2016.2547718},
 volume = {6},
 year = {2016}
}

@inproceedings{chen2018tvm,
  author    = {Chen, Tianqi and Moreau, Thierry and Jiang, Ziheng and Zheng, Lianmin and Yan, Eddie and Shen, Haichen and Cowan, Meghan and Wang, Leyuan and Hu, Yuwei and Ceze, Luis and others},
  title     = {{TVM:} {An} automated End-to-End optimizing compiler for deep learning},
  booktitle = {13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)},
  pages     = {578--594},
  year      = {2018}
}

@article{cheng2017survey,
 author = {Cheng, Yu and Wang, Duo and Zhou, Pan and Zhang, Tao},
 doi = {10.1109/msp.2017.2765695},
 issn = {1053-5888},
 journal = {IEEE Signal Processing Magazine},
 number = {1},
 pages = {126--136},
 publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
 source = {Crossref},
 title = {Model Compression and Acceleration for Deep Neural Networks: {The} Principles, Progress, and Challenges},
 url = {https://doi.org/10.1109/msp.2017.2765695},
 volume = {35},
 year = {2018}
}

@article{chi2016prime,
 author = {Chi, Ping and Li, Shuangchen and Xu, Cong and Zhang, Tao and Zhao, Jishen and Liu, Yongpan and Wang, Yu and Xie, Yuan},
 doi = {10.1145/3007787.3001140},
 issn = {0163-5964},
 journal = {ACM SIGARCH Computer Architecture News},
 number = {3},
 pages = {27--39},
 publisher = {Association for Computing Machinery (ACM)},
 source = {Crossref},
 title = {{PRIME}: {A} novel processing-in-memory architecture for neural network computation in {ReRAM}-based main memory},
 url = {https://doi.org/10.1145/3007787.3001140},
 volume = {44},
 year = {2016}
}

@article{chua1971memristor,
 author = {Chua, L.},
 doi = {10.1109/tct.1971.1083337},
 issn = {0018-9324},
 journal = {IEEE Transactions on Circuit Theory},
 number = {5},
 pages = {507--519},
 publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
 source = {Crossref},
 title = {Memristor---{The} missing circuit element},
 url = {https://doi.org/10.1109/tct.1971.1083337},
 volume = {18},
 year = {1971}
}

@article{davies2018loihi,
  author    = {Davies, Mike and Srinivasa, Narayan and Lin, Tsung-Han and Chinya, Gautham and Cao, Yongqiang and Choday, Sri Harsha and Dimou, Georgios and Joshi, Prasad and Imam, Nabil and Jain, Shweta and Liao, Yuyun and Lin, Chit-Kwan and Lines, Andrew and Liu, Ruokun and Mathaikutty, Deepak and McCoy, Steven and Paul, Arnab and Tse, Jonathan and Venkataramanan, Guruguhanathan and Weng, Yi-Hsin and Wild, Andreas and Yang, Yoonseok and Wang, Hong},
  title     = {Loihi: {A} Neuromorphic Manycore Processor with On-Chip Learning},
  journal   = {IEEE Micro},
  volume    = {38},
  number    = {1},
  pages     = {82--99},
  year      = {2018},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  issn      = {0272-1732, 1937-4143},
  doi       = {10.1109/mm.2018.112130359},
  url       = {https://doi.org/10.1109/mm.2018.112130359},
  source    = {Crossref}
}

@article{davies2021advancing,
  author    = {Davies, Mike and Wild, Andreas and Orchard, Garrick and Sandamirskaya, Yulia and Guerra, Gabriel A. Fonseca and Joshi, Prasad and Plank, Philipp and Risbud, Sumedh R.},
  title     = {Advancing Neuromorphic Computing With Loihi: {A} Survey of Results and Outlook},
  journal   = {Proc. IEEE},
  volume    = {109},
  number    = {5},
  pages     = {911--934},
  year      = {2021},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  issn      = {0018-9219, 1558-2256},
  doi       = {10.1109/jproc.2021.3067593},
  url       = {https://doi.org/10.1109/jproc.2021.3067593},
  source    = {Crossref}
}

@article{dongarra2009evolution,
 author = {Dongarra, Jack J},
 journal = {IBM J. Res. Dev.},
 pages = {3--4},
 title = {The evolution of high performance computing on {System} z},
 volume = {53},
 year = {2009}
}

@article{duarte2022fastml,
  author  = {Duarte, Javier and Tran, Nhan and Hawks, Ben and Herwig, Christian and Muhizi, Jules and Prakash, Shvetank and Reddi, Vijay Janapa},
  title   = {{FastML} Science Benchmarks: {Accelerating} Real-Time Scientific Edge Machine Learning},
  journal = {ArXiv preprint},
  volume  = {abs/2207.07958},
  year    = {2022},
  url     = {https://arxiv.org/abs/2207.07958}
}

@article{eshraghian2023training,
  author     = {Eshraghian, Jason K. and Ward, Max and Neftci, Emre O. and Wang, Xinxin and Lenz, Gregor and Dwivedi, Girish and Bennamoun, Mohammed and Jeong, Doo Seok and Lu, Wei D.},
  title      = {Training Spiking Neural Networks Using Lessons From Deep Learning},
  journal    = {Proc. IEEE},
  volume     = {111},
  number     = {9},
  pages      = {1016--1054},
  year       = {2023},
  publisher  = {Institute of Electrical and Electronics Engineers (IEEE)},
  issn       = {0018-9219, 1558-2256},
  doi        = {10.1109/jproc.2023.3308088},
  url        = {https://doi.org/10.1109/jproc.2023.3308088},
  bdsk-url-1 = {https://doi.org/10.1109/JPROC.2023.3308088},
  source     = {Crossref}
}

@article{farah2005neuroethics,
  author    = {Farah, Martha J.},
  title     = {Neuroethics: {The} practical and the philosophical},
  journal   = {Trends Cogn. Sci.},
  volume    = {9},
  number    = {1},
  pages     = {34--40},
  year      = {2005},
  publisher = {Elsevier BV},
  issn      = {1364-6613},
  doi       = {10.1016/j.tics.2004.12.001},
  url       = {https://doi.org/10.1016/j.tics.2004.12.001},
  source    = {Crossref}
}

@inproceedings{fowers2018configurable,
  author       = {Fowers, Jeremy and Ovtcharov, Kalin and Papamichael, Michael and Massengill, Todd and Liu, Ming and Lo, Daniel and Alkalay, Shlomi and Haselman, Michael and Adams, Logan and Ghandi, Mahdi and Heil, Stephen and Patel, Prerak and Sapek, Adam and Weisz, Gabriel and Woods, Lisa and Lanka, Sitaram and Reinhardt, Steven K. and Caulfield, Adrian M. and Chung, Eric S. and Burger, Doug},
  title        = {A Configurable Cloud-Scale {DNN} Processor for Real-Time {AI}},
  booktitle    = {2018 ACM/IEEE 45th Annual International Symposium on Computer Architecture (ISCA)},
  pages        = {1--14},
  year         = {2018},
  organization = {IEEE},
  publisher    = {IEEE},
  doi          = {10.1109/isca.2018.00012},
  url          = {https://doi.org/10.1109/isca.2018.00012},
  source       = {Crossref}
}

@article{furber2016large,
  author    = {Furber, Steve},
  title     = {Large-scale neuromorphic computing systems},
  journal   = {J. Neural Eng.},
  volume    = {13},
  number    = {5},
  pages     = {051001},
  year      = {2016},
  publisher = {IOP Publishing},
  issn      = {1741-2560, 1741-2552},
  doi       = {10.1088/1741-2560/13/5/051001},
  url       = {https://doi.org/10.1088/1741-2560/13/5/051001},
  source    = {Crossref}
}

@article{gale2019state,
  author  = {Gale, Trevor and Elsen, Erich and Hooker, Sara},
  title   = {The state of sparsity in deep neural networks},
  journal = {ArXiv preprint},
  volume  = {abs/1902.09574},
  year    = {2019},
  url     = {https://arxiv.org/abs/1902.09574}
}

@inproceedings{gannot1994verilog,
 author = {Gannot, G. and Ligthart, M.},
 bdsk-url-1 = {https://doi.org/10.1109/IVC.1994.323743},
 booktitle = {International Verilog HDL Conference},
 doi = {10.1109/ivc.1994.323743},
 pages = {86--92},
 publisher = {IEEE},
 source = {Crossref},
 title = {Verilog {HDL} based {FPGA} design},
 url = {https://doi.org/10.1109/ivc.1994.323743},
 year = {1994}
}

@article{gates2009flexible,
  author    = {Gates, Byron D.},
  title     = {Flexible Electronics},
  journal   = {Science},
  volume    = {323},
  number    = {5921},
  pages     = {1566--1567},
  year      = {2009},
  publisher = {American Association for the Advancement of Science (AAAS)},
  issn      = {0036-8075, 1095-9203},
  doi       = {10.1126/science.1171230},
  url       = {https://doi.org/10.1126/science.1171230},
  source    = {Crossref}
}

@article{goodyear2017social,
  author    = {Goodyear, Victoria A.},
  title     = {Social media, apps and wearable technologies: {Navigating} ethical dilemmas and procedures},
  journal   = {Qualitative Research in Sport, Exercise and Health},
  volume    = {9},
  number    = {3},
  pages     = {285--302},
  year      = {2017},
  publisher = {Informa UK Limited},
  issn      = {2159-676X, 2159-6778},
  doi       = {10.1080/2159676x.2017.1303790},
  url       = {https://doi.org/10.1080/2159676x.2017.1303790},
  source    = {Crossref}
}

% Same reference data as the gwennap_certus-nx_nodate entry; no journal or year is recorded.
@article{gwennapcertusnx,
  author   = {Gwennap, Linley},
  title    = {Certus-{NX} Innovates General-Purpose {FPGAs}},
  language = {en}
}

@article{haensch2018next,
  author    = {Haensch, Wilfried and Gokmen, Tayfun and Puri, Ruchir},
  title     = {The Next Generation of Deep Learning Hardware: {Analog} Computing},
  journal   = {Proc. IEEE},
  volume    = {107},
  number    = {1},
  pages     = {108--122},
  year      = {2019},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  issn      = {0018-9219, 1558-2256},
  doi       = {10.1109/jproc.2018.2871057},
  url       = {https://doi.org/10.1109/jproc.2018.2871057},
  source    = {Crossref}
}

@article{hazan2021neuromorphic,
  author    = {Hazan, Avi and Ezra Tsur, Elishai},
  title     = {Neuromorphic Analog Implementation of Neural Engineering Framework-Inspired Spiking Neuron for High-Dimensional Representation},
  journal   = {Front. Neurosci.},
  volume    = {15},
  pages     = {627221},
  year      = {2021},
  publisher = {Frontiers Media SA},
  issn      = {1662-453X},
  doi       = {10.3389/fnins.2021.627221},
  url       = {https://doi.org/10.3389/fnins.2021.627221},
  source    = {Crossref}
}

@article{hennessy2019golden,
  author    = {Hennessy, John L. and Patterson, David A.},
  title     = {A new golden age for computer architecture},
  journal   = {Commun. ACM},
  volume    = {62},
  number    = {2},
  pages     = {48--60},
  year      = {2019},
  publisher = {Association for Computing Machinery (ACM)},
  issn      = {0001-0782, 1557-7317},
  doi       = {10.1145/3282307},
  url       = {https://doi.org/10.1145/3282307},
  language  = {en},
  copyright = {http://www.acm.org/publications/policies/copyright\_policy\#Background},
  source    = {Crossref},
  abstract  = {Innovations like domain-specific hardware, enhanced security, open instruction sets, and agile chip development will lead the way.}
}

@misc{howard2017mobilenets,
  author  = {Howard, Andrew G. and Zhu, Menglong and Chen, Bo and Kalenichenko, Dmitry and Wang, Weijun and Weyand, Tobias and Andreetto, Marco and Adam, Hartwig},
  title   = {{MobileNets:} {Efficient} Convolutional Neural Networks for Mobile Vision Applications},
  journal = {ArXiv preprint},
  volume  = {abs/1704.04861},
  year    = {2017},
  url     = {https://arxiv.org/abs/1704.04861}
}

@article{huang2010pseudo,
  author    = {Huang, Tsung-Ching and Fukuda, Kenjiro and Lo, Chun-Ming and Yeh, Yung-Hui and Sekitani, Tsuyoshi and Someya, Takao and Cheng, Kwang-Ting},
  title     = {Pseudo-{CMOS:} {A} Design Style for Low-Cost and Robust Flexible Electronics},
  journal   = {IEEE Trans. Electron Devices},
  volume    = {58},
  number    = {1},
  pages     = {141--150},
  year      = {2011},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  issn      = {0018-9383, 1557-9646},
  doi       = {10.1109/ted.2010.2088127},
  url       = {https://doi.org/10.1109/ted.2010.2088127},
  source    = {Crossref}
}

% The key says 2018 but the fields describe the 2019 ICCVW paper; the key is kept so existing citations still resolve.
@inproceedings{ignatov2018ai,
 author = {Ignatov, Andrey and Timofte, Radu and Kulik, Andrei and Yang, Seungsoo and Wang, Ke and Baum, Felix and Wu, Max and Xu, Lirong and Van Gool, Luc},
 booktitle = {2019 IEEE/CVF International Conference on Computer Vision Workshop (ICCVW)},
 doi = {10.1109/iccvw.2019.00447},
 publisher = {IEEE},
 source = {Crossref},
 title = {{AI} Benchmark: {All} About Deep Learning on Smartphones in 2019},
 url = {https://doi.org/10.1109/iccvw.2019.00447},
 year = {2019}
}

% Previously shared its citation key with the 2019 ICCVW entry; given its own key so both works can be cited.
@inproceedings{ignatov2018running,
 abstract = {Over the last years, the computational power of mobile devices such as smartphones and tablets has grown dramatically, reaching the level of desktop computers available not long ago. While standard smartphone apps are no longer a problem for them, there is still a group of tasks that can easily challenge even high-end devices, namely running artificial intelligence algorithms. In this paper, we present a study of the current state of deep learning in the Android ecosystem and describe available frameworks, programming models and the limitations of running AI on smartphones. We give an overview of the hardware acceleration resources available on four main mobile chipset platforms: Qualcomm, HiSilicon, MediaTek and Samsung. Additionally, we present the real-world performance results of different mobile SoCs collected with AI Benchmark that are covering all main existing hardware configurations.},
 author = {Ignatov, Andrey and Timofte, Radu and Chou, William and Wang, Ke and Wu, Max and Hartley, Tim and Van Gool, Luc},
 booktitle = {Proceedings of the European Conference on Computer Vision (ECCV) Workshops},
 title = {{AI} Benchmark: {Running} deep neural networks on Android smartphones},
 year = {2018}
}

@inproceedings{imani2016resistive,
 author = {Imani, Mohsen and Rahimi, Abbas and Rosing, Tajana S.},
 booktitle = {Proceedings of the 2016 Design, Automation \& Test in Europe Conference \& Exhibition (DATE)},
 doi = {10.3850/9783981537079\_0454},
 organization = {IEEE},
 pages = {1327--1332},
 publisher = {Research Publishing Services},
 source = {Crossref},
 title = {Resistive Configurable Associative Memory for Approximate Computing},
 url = {https://doi.org/10.3850/9783981537079\_0454},
 year = {2016}
}

@inproceedings{jacob2018quantization,
  author    = {Benoit Jacob and Skirmantas Kligys and Bo Chen and Menglong Zhu and Matthew Tang and Andrew G. Howard and Hartwig Adam and Dmitry Kalenichenko},
  title     = {Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference},
  booktitle = {2018 {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2018, Salt Lake City, UT, USA, June 18-22, 2018},
  pages     = {2704--2713},
  year      = {2018},
  publisher = {{IEEE} Computer Society},
  doi       = {10.1109/CVPR.2018.00286},
  url       = {http://openaccess.thecvf.com/content\_cvpr\_2018/html/Jacob\_Quantization\_and\_Training\_CVPR\_2018\_paper.html},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://dblp.org/rec/conf/cvpr/JacobKCZTHAK18.bib},
  timestamp = {Wed, 06 Feb 2019 00:00:00 +0100}
}

@misc{jia2018dissecting,
  author  = {Jia, Zhe and Maggioni, Marco and Staiger, Benjamin and Scarpazza, Daniele P.},
  title   = {Dissecting the {NVIDIA} {Volta} {GPU} Architecture via Microbenchmarking},
  journal = {ArXiv preprint},
  volume  = {abs/1804.06826},
  year    = {2018},
  url     = {https://arxiv.org/abs/1804.06826}
}

@inproceedings{jia2019beyond,
  author    = {Zhihao Jia and Matei Zaharia and Alex Aiken},
  title     = {Beyond Data and Model Parallelism for Deep Neural Networks},
  booktitle = {Proceedings of Machine Learning and Systems 2019, MLSys 2019, Stanford, CA, USA, March 31 - April 2, 2019},
  editor    = {Ameet Talwalkar and Virginia Smith and Matei Zaharia},
  year      = {2019},
  publisher = {mlsys.org},
  url       = {https://proceedings.mlsys.org/book/265.pdf},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://dblp.org/rec/conf/mlsys/JiaZA19.bib},
  timestamp = {Thu, 18 Jun 2020 01:00:00 +0200}
}

% Same paper as the jouppi2017indatacenter entry; both keys are kept because either may be cited.
@inproceedings{jouppi2017datacenter,
  author     = {Jouppi, Norman P. and Young, Cliff and Patil, Nishant and Patterson, David and Agrawal, Gaurav and Bajwa, Raminder and Bates, Sarah and Bhatia, Suresh and Boden, Nan and Borchers, Al and Boyle, Rick and Cantin, Pierre-luc and Chao, Clifford and Clark, Chris and Coriell, Jeremy and Daley, Mike and Dau, Matt and Dean, Jeffrey and Gelb, Ben and Ghaemmaghami, Tara Vazir and Gottipati, Rajendra and Gulland, William and Hagmann, Robert and Ho, C. Richard and Hogberg, Doug and Hu, John and Hundt, Robert and Hurt, Dan and Ibarz, Julian and Jaffey, Aaron and Jaworski, Alek and Kaplan, Alexander and Khaitan, Harshit and Killebrew, Daniel and Koch, Andy and Kumar, Naveen and Lacy, Steve and Laudon, James and Law, James and Le, Diemthu and Leary, Chris and Liu, Zhuyuan and Lucke, Kyle and Lundin, Alan and MacKean, Gordon and Maggiore, Adriana and Mahony, Maire and Miller, Kieran and Nagarajan, Rahul and Narayanaswami, Ravi and Ni, Ray and Nix, Kathy and Norrie, Thomas and Omernick, Mark and Penukonda, Narayana and Phelps, Andy and Ross, Jonathan and Ross, Matt and Salek, Amir and Samadiani, Emad and Severn, Chris and Sizikov, Gregory and Snelham, Matthew and Souter, Jed and Steinberg, Dan and Swing, Andy and Tan, Mercedes and Thorson, Gregory and Tian, Bo and Toma, Horia and Tuttle, Erick and Vasudevan, Vijay and Walter, Richard and Wang, Walter and Wilcox, Eric and Yoon, Doe Hyun},
  title      = {In-Datacenter Performance Analysis of a Tensor Processing Unit},
  booktitle  = {Proceedings of the 44th Annual International Symposium on Computer Architecture},
  series     = {ISCA '17},
  pages      = {1--12},
  numpages   = {12},
  year       = {2017},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  location   = {Toronto, ON, Canada},
  isbn       = {9781450348928},
  doi        = {10.1145/3079856.3080246},
  url        = {https://doi.org/10.1145/3079856.3080246},
  bdsk-url-1 = {https://doi.org/10.1145/3079856.3080246},
  source     = {Crossref},
  keywords   = {accelerator, neural network, MLP, TPU, CNN, deep learning, domain-specific architecture, GPU, TensorFlow, DNN, RNN, LSTM},
  abstract   = {Many architects believe that major improvements in cost-energy-performance must now come from domain-specific hardware. This paper evaluates a custom ASIC{\textemdash}called a Tensor Processing Unit (TPU) {\textemdash} deployed in datacenters since 2015 that accelerates the inference phase of neural networks (NN). The heart of the TPU is a 65,536 8-bit MAC matrix multiply unit that offers a peak throughput of 92 TeraOps/second (TOPS) and a large (28 MiB) software-managed on-chip memory. The TPU's deterministic execution model is a better match to the 99th-percentile response-time requirement of our NN applications than are the time-varying optimizations of CPUs and GPUs that help average throughput more than guaranteed latency. The lack of such features helps explain why, despite having myriad MACs and a big memory, the TPU is relatively small and low power. We compare the TPU to a server-class Intel Haswell CPU and an Nvidia K80 GPU, which are contemporaries deployed in the same datacenters. Our workload, written in the high-level TensorFlow framework, uses production NN applications (MLPs, CNNs, and LSTMs) that represent 95\% of our datacenters' NN inference demand. Despite low utilization for some applications, the TPU is on average about 15X {\textendash} 30X faster than its contemporary GPU or CPU, with TOPS/Watt about 30X {\textendash} 80X higher. Moreover, using the CPU's GDDR5 memory in the TPU would triple achieved TOPS and raise TOPS/Watt to nearly 70X the GPU and 200X the CPU.}
}

% Same paper as the jouppi2017datacenter entry; both keys are kept because either may be cited.
@inproceedings{jouppi2017indatacenter,
  author     = {Jouppi, Norman P. and Young, Cliff and Patil, Nishant and Patterson, David and Agrawal, Gaurav and Bajwa, Raminder and Bates, Sarah and Bhatia, Suresh and Boden, Nan and Borchers, Al and Boyle, Rick and Cantin, Pierre-luc and Chao, Clifford and Clark, Chris and Coriell, Jeremy and Daley, Mike and Dau, Matt and Dean, Jeffrey and Gelb, Ben and Ghaemmaghami, Tara Vazir and Gottipati, Rajendra and Gulland, William and Hagmann, Robert and Ho, C. Richard and Hogberg, Doug and Hu, John and Hundt, Robert and Hurt, Dan and Ibarz, Julian and Jaffey, Aaron and Jaworski, Alek and Kaplan, Alexander and Khaitan, Harshit and Killebrew, Daniel and Koch, Andy and Kumar, Naveen and Lacy, Steve and Laudon, James and Law, James and Le, Diemthu and Leary, Chris and Liu, Zhuyuan and Lucke, Kyle and Lundin, Alan and MacKean, Gordon and Maggiore, Adriana and Mahony, Maire and Miller, Kieran and Nagarajan, Rahul and Narayanaswami, Ravi and Ni, Ray and Nix, Kathy and Norrie, Thomas and Omernick, Mark and Penukonda, Narayana and Phelps, Andy and Ross, Jonathan and Ross, Matt and Salek, Amir and Samadiani, Emad and Severn, Chris and Sizikov, Gregory and Snelham, Matthew and Souter, Jed and Steinberg, Dan and Swing, Andy and Tan, Mercedes and Thorson, Gregory and Tian, Bo and Toma, Horia and Tuttle, Erick and Vasudevan, Vijay and Walter, Richard and Wang, Walter and Wilcox, Eric and Yoon, Doe Hyun},
  title      = {In-Datacenter Performance Analysis of a Tensor Processing Unit},
  booktitle  = {Proceedings of the 44th Annual International Symposium on Computer Architecture},
  series     = {ISCA '17},
  pages      = {1--12},
  numpages   = {12},
  year       = {2017},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  location   = {Toronto, ON, Canada},
  isbn       = {9781450348928},
  doi        = {10.1145/3079856.3080246},
  url        = {https://doi.org/10.1145/3079856.3080246},
  bdsk-url-1 = {https://doi.org/10.1145/3079856.3080246},
  source     = {Crossref},
  keywords   = {accelerator, neural network, MLP, TPU, CNN, deep learning, domain-specific architecture, GPU, TensorFlow, DNN, RNN, LSTM},
  abstract   = {Many architects believe that major improvements in cost-energy-performance must now come from domain-specific hardware. This paper evaluates a custom ASIC{\textemdash}called a Tensor Processing Unit (TPU) {\textemdash} deployed in datacenters since 2015 that accelerates the inference phase of neural networks (NN). The heart of the TPU is a 65,536 8-bit MAC matrix multiply unit that offers a peak throughput of 92 TeraOps/second (TOPS) and a large (28 MiB) software-managed on-chip memory. The TPU's deterministic execution model is a better match to the 99th-percentile response-time requirement of our NN applications than are the time-varying optimizations of CPUs and GPUs that help average throughput more than guaranteed latency. The lack of such features helps explain why, despite having myriad MACs and a big memory, the TPU is relatively small and low power. We compare the TPU to a server-class Intel Haswell CPU and an Nvidia K80 GPU, which are contemporaries deployed in the same datacenters. Our workload, written in the high-level TensorFlow framework, uses production NN applications (MLPs, CNNs, and LSTMs) that represent 95\% of our datacenters' NN inference demand. Despite low utilization for some applications, the TPU is on average about 15X {\textendash} 30X faster than its contemporary GPU or CPU, with TOPS/Watt about 30X {\textendash} 80X higher. Moreover, using the CPU's GDDR5 memory in the TPU would triple achieved TOPS and raise TOPS/Watt to nearly 70X the GPU and 200X the CPU.}
}

@inproceedings{jouppi2023tpu,
 abstract = {In response to innovations in machine learning (ML) models, production workloads changed radically and rapidly. TPU v4 is the fifth Google domain specific architecture (DSA) and its third supercomputer for such ML models. Optical circuit switches (OCSes) dynamically reconfigure its interconnect topology to improve scale, availability, utilization, modularity, deployment, security, power, and performance; users can pick a twisted 3D torus topology if desired. Much cheaper, lower power, and faster than Infiniband, OCSes and underlying optical components are {\textless}5\% of system cost and {\textless}3\% of system power. Each TPU v4 includes SparseCores, dataflow processors that accelerate models that rely on embeddings by 5x{\textendash}7x yet use only 5\% of die area and power. Deployed since 2020, TPU v4 outperforms TPU v3 by 2.1x and improves performance/Watt by 2.7x. The TPU v4 supercomputer is 4x larger at 4096 chips and thus nearly 10x faster overall, which along with OCS flexibility and availability allows a large language model to train at an average of {\textasciitilde}60\% of peak FLOPS/second. For similar sized systems, it is {\textasciitilde}4.3x{\textendash}4.5x faster than the Graphcore IPU Bow and is 1.2x{\textendash}1.7x faster and uses 1.3x{\textendash}1.9x less power than the Nvidia A100. TPU v4s inside the energy-optimized warehouse scale computers of Google Cloud use {\textasciitilde}2{\textendash}6x less energy and produce {\textasciitilde}20x less CO2e than contemporary DSAs in typical on-premise data centers.},
 address = {New York, NY, USA},
 articleno = {82},
 author = {Jouppi, Norm and Kurian, George and Li, Sheng and Ma, Peter and Nagarajan, Rahul and Nai, Lifeng and Patil, Nishant and Subramanian, Suvinay and Swing, Andy and Towles, Brian and Young, Clifford and Zhou, Xiang and Zhou, Zongwei and Patterson, David A},
 bdsk-url-1 = {https://doi.org/10.1145/3579371.3589350},
 booktitle = {Proceedings of the 50th Annual International Symposium on Computer Architecture},
 doi = {10.1145/3579371.3589350},
 isbn = {9798400700958},
 keywords = {warehouse scale computer, embeddings, supercomputer, domain specific architecture, reconfigurable, TPU, large language model, power usage effectiveness, CO2 equivalent emissions, energy, optical interconnect, IPU, machine learning, GPU, carbon emissions},
 location = {Orlando, FL, USA},
 numpages = {14},
 publisher = {ACM},
 series = {ISCA '23},
 source = {Crossref},
 title = {{TPU} v4: {An} Optically Reconfigurable Supercomputer for Machine Learning with Hardware Support for Embeddings},
 url = {https://doi.org/10.1145/3579371.3589350},
 year = {2023}
}

@inproceedings{kao2020confuciux,
 author = {Kao, Sheng-Chun and Jeong, Geonhwa and Krishna, Tushar},
 booktitle = {2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
 organization = {IEEE},
 pages = {622--636},
 title = {{ConfuciuX}: Autonomous hardware resource assignment for {DNN} accelerators using reinforcement learning},
 year = {2020}
}

@inproceedings{kao2020gamma,
 author = {Kao, Sheng-Chun and Krishna, Tushar},
 booktitle = {Proceedings of the 39th International Conference on Computer-Aided Design},
 pages = {1--9},
 title = {{GAMMA}: Automating the {HW} mapping of {DNN} models on accelerators via genetic algorithm},
 year = {2020}
}
@misc{krishnan2022multiagent,
  author        = {Srivatsan Krishnan and Natasha Jaques and Shayegan Omidshafiei and Dan Zhang and Izzeddin Gur and Vijay Janapa Reddi and Aleksandra Faust},
  title         = {Multi-Agent Reinforcement Learning for Microprocessor Design Space Exploration},
  year          = {2022},
  eprint        = {2211.16385},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AR}
}

@inproceedings{krishnan2023archgym,
  author    = {Krishnan, Srivatsan and Yazdanbakhsh, Amir and Prakash, Shvetank and Jabbour, Jason and Uchendu, Ikechukwu and Ghosh, Susobhan and Boroujerdian, Behzad and Richins, Daniel and Tripathy, Devashree and Faust, Aleksandra and Janapa Reddi, Vijay},
  title     = {{ArchGym:} {An} Open-Source Gymnasium for Machine Learning Assisted Architecture Design},
  booktitle = {Proceedings of the 50th Annual International Symposium on Computer Architecture},
  pages     = {1--16},
  year      = {2023},
  publisher = {ACM},
  doi       = {10.1145/3579371.3589049},
  url       = {https://doi.org/10.1145/3579371.3589049},
  source    = {Crossref}
}

@article{kwon2022flexible,
  author = {Kwon, Sun Hwa and Dong, Lin},
  title = {Flexible sensors and machine learning for heart monitoring},
  journal = {Nano Energy},
  volume = {102},
  pages = {107632},
  publisher = {Elsevier BV},
  year = {2022},
  issn = {2211-2855},
  doi = {10.1016/j.nanoen.2022.107632},
  url = {https://doi.org/10.1016/j.nanoen.2022.107632},
  source = {Crossref},
}

@inproceedings{Li2020Additive,
  author = {Li, Yuhang and Dong, Xin and Wang, Wei},
  title = {Additive Powers-of-Two Quantization: An Efficient Non-uniform Discretization for Neural Networks},
  booktitle = {8th International Conference on Learning Representations, {ICLR} 2020, Addis Ababa, Ethiopia, April 26-30, 2020},
  publisher = {OpenReview.net},
  year = {2020},
  url = {https://openreview.net/forum?id=BkgXT24tDS},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl = {https://dblp.org/rec/conf/iclr/LiDW20.bib},
  timestamp = {Tue, 18 Aug 2020 01:00:00 +0200},
}

@inproceedings{lin2022ondevice,
  author = {Zhu, Ligeng and Hu, Lanxiang and Lin, Ji and Chen, Wei-Ming and Wang, Wei-Chen and Gan, Chuang and Han, Song},
  title = {{PockEngine:} {Sparse} and Efficient Fine-tuning in a Pocket},
  booktitle = {56th Annual IEEE/ACM International Symposium on Microarchitecture},
  publisher = {ACM},
  year = {2023},
  doi = {10.1145/3613424.3614307},
  url = {https://doi.org/10.1145/3613424.3614307},
  source = {Crossref},
}

@article{lin2023awq,
 archiveprefix = {arXiv},
 author = {Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song},
 eprint = {2306.00978},
 journal = {ArXiv preprint},
 title = {{AWQ:} {Activation-aware} Weight Quantization for {LLM} Compression and Acceleration},
 url = {https://arxiv.org/abs/2306.00978},
 volume = {abs/2306.00978},
 year = {2023}
}

@article{lindholm2008nvidia,
 abstract = {To enable flexible, programmable graphics and high-performance computing, NVIDIA has developed the Tesla scalable unified graphics and parallel computing architecture. Its scalable parallel array of processors is massively multithreaded and programmable in C or via graphics APIs.},
 author = {Lindholm, Erik and Nickolls, John and Oberman, Stuart and Montrym, John},
 bdsk-url-1 = {https://ieeexplore.ieee.org/document/4523358},
 bdsk-url-2 = {https://doi.org/10.1109/MM.2008.31},
 doi = {10.1109/mm.2008.31},
 issn = {0272-1732},
 journal = {IEEE Micro},
 number = {2},
 pages = {39--55},
 publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
 shorttitle = {NVIDIA Tesla},
 source = {Crossref},
 title = {{NVIDIA} Tesla: {A} Unified Graphics and Computing Architecture},
 url = {https://doi.org/10.1109/mm.2008.31},
 urldate = {2023-11-07},
 volume = {28},
 year = {2008}
}

@article{loh20083dstacked,
  author = {Loh, Gabriel H.},
  title = {{3D}-Stacked Memory Architectures for Multi-core Processors},
  journal = {ACM SIGARCH Computer Architecture News},
  volume = {36},
  number = {3},
  pages = {453--464},
  publisher = {Association for Computing Machinery (ACM)},
  year = {2008},
  issn = {0163-5964},
  doi = {10.1145/1394608.1382159},
  url = {https://doi.org/10.1145/1394608.1382159},
  source = {Crossref},
}

@inproceedings{luebke2008cuda,
 author = {Luebke, David},
 bdsk-url-1 = {https://doi.org/10.1109/ISBI.2008.4541126},
 booktitle = {2008 5th IEEE International Symposium on Biomedical Imaging: From Nano to Macro},
 doi = {10.1109/isbi.2008.4541126},
 pages = {836--838},
 publisher = {IEEE},
 source = {Crossref},
 title = {{CUDA:} {Scalable} parallel programming for high-performance scientific computing},
 url = {https://doi.org/10.1109/isbi.2008.4541126},
 year = {2008}
}

@article{maass1997networks,
  author = {Maass, Wolfgang},
  title = {Networks of spiking neurons: {The} third generation of neural network models},
  journal = {Neural Networks},
  volume = {10},
  number = {9},
  pages = {1659--1671},
  publisher = {Elsevier BV},
  year = {1997},
  issn = {0893-6080},
  doi = {10.1016/s0893-6080(97)00011-7},
  url = {https://doi.org/10.1016/s0893-6080(97)00011-7},
  source = {Crossref},
}

@article{markovic2020physics,
 author = {Markovi{\'c}, Danijela and Mizrahi, Alice and Querlioz, Damien and Grollier, Julie},
 doi = {10.1038/s42254-020-0208-2},
 issn = {2522-5820},
 journal = {Nature Reviews Physics},
 number = {9},
 pages = {499--510},
 publisher = {Springer Science and Business Media LLC},
 source = {Crossref},
 title = {Physics for neuromorphic computing},
 url = {https://doi.org/10.1038/s42254-020-0208-2},
 volume = {2},
 year = {2020}
}

@article{mattson2020mlperf,
  author = {Mattson, Peter and Reddi, Vijay Janapa and Cheng, Christine and Coleman, Cody and Diamos, Greg and Kanter, David and Micikevicius, Paulius and Patterson, David and Schmuelling, Guenther and Tang, Hanlin and Wei, Gu-Yeon and Wu, Carole-Jean},
  title = {{MLPerf:} {An} Industry Standard Benchmark Suite for Machine Learning Performance},
  journal = {IEEE Micro},
  volume = {40},
  number = {2},
  pages = {8--16},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  year = {2020},
  issn = {0272-1732, 1937-4143},
  doi = {10.1109/mm.2020.2974843},
  url = {https://doi.org/10.1109/mm.2020.2974843},
  source = {Crossref},
}

@article{miller2000optical,
 author = {Miller, David A. B.},
 doi = {10.1109/2944.902184},
 issn = {1077-260X, 1558-4542},
 journal = {IEEE Journal of Selected Topics in Quantum Electronics},
 number = {6},
 pages = {1312--1317},
 publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
 source = {Crossref},
 title = {Optical interconnects to silicon},
 url = {https://doi.org/10.1109/2944.902184},
 volume = {6},
 year = {2000}
}

@article{mirhoseini2021graph,
  author = {Mirhoseini, Azalia and Goldie, Anna and Yazgan, Mustafa and Jiang, Joe Wenjie and Songhori, Ebrahim and Wang, Shen and Lee, Young-Joon and Johnson, Eric and Pathak, Omkar and Nazi, Azade and others},
  title = {A graph placement methodology for fast chip design},
  journal = {Nature},
  volume = {594},
  number = {7862},
  pages = {207--212},
  publisher = {Nature Publishing Group},
  year = {2021},
}

@article{mittal2021survey,
  author = {Mittal, Sparsh and Verma, Gaurav and Kaushik, Brajesh and Khanday, Farooq A.},
  title = {A survey of {SRAM}-based in-memory computing techniques and applications},
  journal = {J. Syst. Architect.},
  volume = {119},
  pages = {102276},
  publisher = {Elsevier BV},
  year = {2021},
  issn = {1383-7621},
  doi = {10.1016/j.sysarc.2021.102276},
  url = {https://doi.org/10.1016/j.sysarc.2021.102276},
  source = {Crossref},
}

@article{modha2023neural,
  author = {Modha, Dharmendra S. and Akopyan, Filipp and Andreopoulos, Alexander and Appuswamy, Rathinakumar and Arthur, John V. and Cassidy, Andrew S. and Datta, Pallab and DeBole, Michael V. and Esser, Steven K. and Otero, Carlos Ortega and Sawada, Jun and Taba, Brian and Amir, Arnon and Bablani, Deepika and Carlson, Peter J. and Flickner, Myron D. and Gandhasri, Rajamohan and Garreau, Guillaume J. and Ito, Megumi and Klamo, Jennifer L. and Kusnitz, Jeffrey A. and McClatchey, Nathaniel J. and McKinstry, Jeffrey L. and Nakamura, Yutaka and Nayak, Tapan K. and Risk, William P. and Schleupen, Kai and Shaw, Ben and Sivagnaname, Jay and Smith, Daniel F. and Terrizzano, Ignacio and Ueda, Takanori},
  title = {Neural inference at the frontier of energy, space, and time},
  journal = {Science},
  volume = {382},
  number = {6668},
  pages = {329--335},
  publisher = {American Association for the Advancement of Science (AAAS)},
  year = {2023},
  issn = {0036-8075, 1095-9203},
  doi = {10.1126/science.adh1174},
  url = {https://doi.org/10.1126/science.adh1174},
  source = {Crossref},
}

@inproceedings{munshi2009opencl,
 author = {Munshi, Aaftab},
 bdsk-url-1 = {https://doi.org/10.1109/HOTCHIPS.2009.7478342},
 booktitle = {2009 IEEE Hot Chips 21 Symposium (HCS)},
 doi = {10.1109/hotchips.2009.7478342},
 pages = {1--314},
 publisher = {IEEE},
 source = {Crossref},
 title = {The {OpenCL} specification},
 url = {https://doi.org/10.1109/hotchips.2009.7478342},
 year = {2009}
}

@article{musk2019integrated,
  author = {Musk, Elon and others},
  title = {An Integrated Brain-Machine Interface Platform With Thousands of Channels},
  journal = {J. Med. Internet Res.},
  volume = {21},
  number = {10},
  pages = {e16194},
  publisher = {JMIR Publications Inc.},
  year = {2019},
  issn = {1438-8871},
  doi = {10.2196/16194},
  url = {https://doi.org/10.2196/16194},
  source = {Crossref},
}

@article{norrie2021design,
 author = {Norrie, Thomas and Patil, Nishant and Yoon, Doe Hyun and Kurian, George and Li, Sheng and Laudon, James and Young, Cliff and Jouppi, Norman and Patterson, David},
 bdsk-url-1 = {https://doi.org/10.1109/MM.2021.3058217},
 doi = {10.1109/mm.2021.3058217},
 issn = {0272-1732, 1937-4143},
 journal = {IEEE Micro},
 number = {2},
 pages = {56--63},
 publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
 source = {Crossref},
 title = {The Design Process for {Google's} Training Chips: {TPUv2} and {TPUv3}},
 url = {https://doi.org/10.1109/mm.2021.3058217},
 volume = {41},
 year = {2021}
}

@book{patterson2016computer,
 author = {Patterson, David A. and Hennessy, John L.},
 publisher = {Morgan Kaufmann},
 title = {Computer Organization and Design {ARM} Edition: {The} Hardware Software Interface},
 year = {2016}
}

@article{putnam2014reconfigurable,
 abstract = {Datacenter workloads demand high computational capabilities, flexibility, power efficiency, and low cost. It is challenging to improve all of these factors simultaneously. To advance datacenter capabilities beyond what commodity server designs can provide, we have designed and built a composable, reconfigurable fabric to accelerate portions of large-scale software services. Each instantiation of the fabric consists of a 6x8 2-D torus of high-end Stratix V FPGAs embedded into a half-rack of 48 machines. One FPGA is placed into each server, accessible through PCIe, and wired directly to other FPGAs with pairs of 10 Gb SAS cables. In this paper, we describe a medium-scale deployment of this fabric on a bed of 1,632 servers, and measure its efficacy in accelerating the Bing web search engine. We describe the requirements and architecture of the system, detail the critical engineering challenges and solutions needed to make the system robust in the presence of failures, and measure the performance, power, and resilience of the system when ranking candidate documents. Under high load, the large-scale reconfigurable fabric improves the ranking throughput of each server by a factor of 95\% for a fixed latency distribution{\textemdash}or, while maintaining equivalent throughput, reduces the tail latency by 29\%.},
 author = {Putnam, Andrew and Caulfield, Adrian M. and Chung, Eric S. and Chiou, Derek and Constantinides, Kypros and Demme, John and Esmaeilzadeh, Hadi and Fowers, Jeremy and Gopal, Gopi Prashanth and Gray, Jan and Haselman, Michael and Hauck, Scott and Heil, Stephen and Hormati, Amir and Kim, Joo-Young and Lanka, Sitaram and Larus, James and Peterson, Eric and Pope, Simon and Smith, Aaron and Thong, Jason and Xiao, Phillip Yi and Burger, Doug},
 bdsk-url-1 = {https://dl.acm.org/doi/10.1145/2678373.2665678},
 bdsk-url-2 = {https://doi.org/10.1145/2678373.2665678},
 doi = {10.1145/2678373.2665678},
 issn = {0163-5964},
 journal = {ACM SIGARCH Computer Architecture News},
 language = {en},
 number = {3},
 pages = {13--24},
 publisher = {Association for Computing Machinery (ACM)},
 source = {Crossref},
 title = {A reconfigurable fabric for accelerating large-scale datacenter services},
 url = {https://doi.org/10.1145/2678373.2665678},
 urldate = {2023-11-07},
 volume = {42},
 year = {2014}
}

@inproceedings{rajat2009largescale,
  author = {Raina, Rajat and Madhavan, Anand and Ng, Andrew Y.},
  editor = {Danyluk, Andrea Pohoreckyj and Bottou, L{\'{e}}on and Littman, Michael L.},
  title = {Large-scale deep unsupervised learning using graphics processors},
  booktitle = {Proceedings of the 26th Annual International Conference on Machine Learning, {ICML} 2009, Montreal, Quebec, Canada, June 14-18, 2009},
  series = {{ACM} International Conference Proceeding Series},
  volume = {382},
  pages = {873--880},
  publisher = {{ACM}},
  year = {2009},
  doi = {10.1145/1553374.1553486},
  url = {https://doi.org/10.1145/1553374.1553486},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl = {https://dblp.org/rec/conf/icml/RainaMN09.bib},
  timestamp = {Wed, 14 Nov 2018 00:00:00 +0100},
}

@article{ranganathan2011from,
  author = {Ranganathan, Parthasarathy},
  title = {From Microprocessors to Nanostores: {Rethinking} Data-Centric Systems},
  journal = {Computer},
  volume = {44},
  number = {1},
  pages = {39--48},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  year = {2011},
  issn = {0018-9162},
  doi = {10.1109/mc.2011.18},
  url = {https://doi.org/10.1109/mc.2011.18},
  source = {Crossref},
}

@inproceedings{reagen2017case,
 author = {Reagen, Brandon and Hern{\'a}ndez-Lobato, Jos{\'e} Miguel and Adolf, Robert and Gelbart, Michael and Whatmough, Paul and Wei, Gu-Yeon and Brooks, David},
 booktitle = {2017 IEEE/ACM International Symposium on Low Power Electronics and Design (ISLPED)},
 organization = {IEEE},
 pages = {1--6},
 title = {A Case for Efficient Accelerator Design Space Exploration via {Bayesian} Optimization},
 year = {2017}
}

@inproceedings{reddi2020mlperf,
 author = {Reddi, Vijay Janapa and Cheng, Christine and Kanter, David and Mattson, Peter and Schmuelling, Guenther and Wu, Carole-Jean and Anderson, Brian and Breughe, Maximilien and Charlebois, Mark and Chou, William and Chukka, Ramesh and Coleman, Cody and Davis, Sam and Deng, Pan and Diamos, Greg and Duke, Jared and Fick, Dave and Gardner, J. Scott and Hubara, Itay and Idgunji, Sachin and Jablin, Thomas B. and Jiao, Jeff and St. John, Tom and Kanwar, Pankaj and Lee, David and Liao, Jeffery and Lokhmotov, Anton and Massa, Francisco and Meng, Peng and Micikevicius, Paulius and Osborne, Colin and Pekhimenko, Gennady and Rajan, Arun Tejusve Raghunath and Sequeira, Dilip and Sirasao, Ashish and Sun, Fei and Tang, Hanlin and Thomson, Michael and Wei, Frank and Wu, Ephrem and Xu, Lingjie and Yamada, Koichi and Yu, Bing and Yuan, George and Zhong, Aaron and Zhang, Peizhao and Zhou, Yuchen},
 booktitle = {2020 ACM/IEEE 47th Annual International Symposium on Computer Architecture (ISCA)},
 doi = {10.1109/isca45697.2020.00045},
 organization = {IEEE},
 pages = {446--459},
 publisher = {IEEE},
 source = {Crossref},
 title = {{MLPerf} Inference Benchmark},
 url = {https://doi.org/10.1109/isca45697.2020.00045},
 year = {2020}
}

@article{roskies2002neuroethics,
  author = {Roskies, Adina},
  title = {Neuroethics for the New Millenium},
  journal = {Neuron},
  volume = {35},
  number = {1},
  pages = {21--23},
  publisher = {Elsevier BV},
  year = {2002},
  issn = {0896-6273},
  doi = {10.1016/s0896-6273(02)00763-8},
  url = {https://doi.org/10.1016/s0896-6273(02)00763-8},
  source = {Crossref},
}

@article{samajdar2018scale,
 author = {Samajdar, Ananda and Zhu, Yuhao and Whatmough, Paul and Mattina, Matthew and Krishna, Tushar},
 journal = {ArXiv preprint},
 title = {{SCALE-Sim}: {Systolic} {CNN} Accelerator Simulator},
 url = {https://arxiv.org/abs/1811.02883},
 volume = {abs/1811.02883},
 year = {2018}
}

@article{schuman2022opportunities,
  author = {Schuman, Catherine D. and Kulkarni, Shruti R. and Parsa, Maryam and Mitchell, J. Parker and Date, Prasanna and Kay, Bill},
  title = {Opportunities for neuromorphic computing algorithms and applications},
  journal = {Nature Computational Science},
  volume = {2},
  number = {1},
  pages = {10--19},
  publisher = {Springer Science and Business Media LLC},
  year = {2022},
  issn = {2662-8457},
  doi = {10.1038/s43588-021-00184-y},
  url = {https://doi.org/10.1038/s43588-021-00184-y},
  source = {Crossref},
}

@misc{segal1999opengl,
  title = {The {OpenGL} graphics system: {A} specification (version 1.1)},
  author = {Segal, Mark and Akeley, Kurt},
  year = {1999},
}

@article{segura2018ethical,
 author = {Segura Anaya, L. H. and Alsadoon, Abeer and Costadopoulos, N. and Prasad, P. W. C.},
 doi = {10.1007/s11948-017-9872-8},
 issn = {1353-3452, 1471-5546},
 journal = {Sci. Eng. Ethics},
 number = {1},
 pages = {1--28},
 publisher = {Springer Science and Business Media LLC},
 source = {Crossref},
 title = {Ethical Implications of User Perceptions of Wearable Devices},
 url = {https://doi.org/10.1007/s11948-017-9872-8},
 volume = {24},
 year = {2018}
}

@article{shastri2021photonics,
  author = {Shastri, Bhavin J. and Tait, Alexander N. and Ferreira de Lima, T. and Pernice, Wolfram H. P. and Bhaskaran, Harish and Wright, C. D. and Prucnal, Paul R.},
  title = {Photonics for artificial intelligence and neuromorphic computing},
  journal = {Nat. Photonics},
  volume = {15},
  number = {2},
  pages = {102--114},
  publisher = {Springer Science and Business Media LLC},
  year = {2021},
  issn = {1749-4885, 1749-4893},
  doi = {10.1038/s41566-020-00754-y},
  url = {https://doi.org/10.1038/s41566-020-00754-y},
  source = {Crossref},
}

@inproceedings{suda2016throughput,
  author = {Suda, Naveen and Chandra, Vikas and Dasika, Ganesh and Mohanty, Abinash and Ma, Yufei and Vrudhula, Sarma and Seo, Jae-sun and Cao, Yu},
  title = {Throughput-Optimized {OpenCL}-based {FPGA} Accelerator for Large-Scale Convolutional Neural Networks},
  booktitle = {Proceedings of the 2016 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
  pages = {16--25},
  publisher = {ACM},
  year = {2016},
  doi = {10.1145/2847263.2847276},
  url = {https://doi.org/10.1145/2847263.2847276},
  source = {Crossref},
}

@article{sze2017efficient,
 abstract = {Deep neural networks (DNNs) are currently widely used for many artificial intelligence (AI) applications including computer vision, speech recognition, and robotics. While DNNs deliver state-of-the-art accuracy on many AI tasks, it comes at the cost of high computational complexity. Accordingly, techniques that enable efficient processing of DNNs to improve energy efficiency and throughput without sacrificing application accuracy or increasing hardware cost are critical to the wide deployment of DNNs in AI systems. This article aims to provide a comprehensive tutorial and survey about the recent advances towards the goal of enabling efficient processing of DNNs. Specifically, it will provide an overview of DNNs, discuss various hardware platforms and architectures that support DNNs, and highlight key trends in reducing the computation cost of DNNs either solely via hardware design changes or via joint hardware design and DNN algorithm changes. It will also summarize various development resources that enable researchers and practitioners to quickly get started in this field, and highlight important benchmarking metrics and design considerations that should be used for evaluating the rapidly growing number of DNN hardware designs, optionally including algorithmic co-designs, being proposed in academia and industry. The reader will take away the following concepts from this article: understand the key design considerations for DNNs; be able to evaluate different DNN hardware implementations with benchmarks and comparison metrics; understand the trade-offs between various hardware architectures and platforms; be able to evaluate the utility of various DNN design techniques for efficient processing; and understand recent implementation trends and opportunities.},
 archiveprefix = {arXiv},
 author = {Sze, Vivienne and Chen, Yu-Hsin and Yang, Tien-Ju and Emer, Joel S.},
 copyright = {http://arxiv.org/licenses/nonexclusive-distrib/1.0/},
 doi = {10.1109/jproc.2017.2761740},
 eprint = {1703.09039},
 issn = {0018-9219, 1558-2256},
 journal = {Proc. IEEE},
 number = {12},
 pages = {2295--2329},
 primaryclass = {cs.CV},
 publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
 source = {Crossref},
 title = {Efficient Processing of Deep Neural Networks: {A} Tutorial and Survey},
 url = {https://doi.org/10.1109/jproc.2017.2761740},
 volume = {105},
 year = {2017}
}

@article{tang2022soft,
  author = {Tang, Xin and He, Yichun and Liu, Jia},
  title = {Soft bioelectronics for cardiac interfaces},
  journal = {Biophysics Reviews},
  volume = {3},
  number = {1},
  publisher = {AIP Publishing},
  year = {2022},
  issn = {2688-4089},
  doi = {10.1063/5.0069516},
  url = {https://doi.org/10.1063/5.0069516},
  source = {Crossref},
}

@article{tang2023flexible,
  author = {Tang, Xin and Shen, Hao and Zhao, Siyuan and Li, Na and Liu, Jia},
  title = {Flexible brain{\textendash}computer interfaces},
  journal = {Nature Electronics},
  volume = {6},
  number = {2},
  pages = {109--118},
  publisher = {Springer Science and Business Media LLC},
  year = {2023},
  issn = {2520-1131},
  doi = {10.1038/s41928-022-00913-9},
  url = {https://doi.org/10.1038/s41928-022-00913-9},
  source = {Crossref},
}

@inproceedings{valenzuela2000genetic,
 author = {Valenzuela, Christine L. and Wang, Pearl Y.},
 booktitle = {Parallel Problem Solving from Nature {PPSN VI}: 6th International Conference, Paris, France, September 18--20, 2000, Proceedings},
 pages = {671--680},
 publisher = {Springer},
 title = {A Genetic Algorithm for {VLSI} Floorplanning},
 year = {2000}
}

@article{verma2019memory,
  author = {Verma, Naveen and Jia, Hongyang and Valavi, Hossein and Tang, Yinqi and Ozatay, Murat and Chen, Lung-Yen and Zhang, Bonan and Deaville, Peter},
  title = {In-Memory Computing: {Advances} and Prospects},
  journal = {IEEE Solid-State Circuits Mag.},
  volume = {11},
  number = {3},
  pages = {43--55},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  year = {2019},
  issn = {1943-0582, 1943-0590},
  doi = {10.1109/mssc.2019.2922889},
  url = {https://doi.org/10.1109/mssc.2019.2922889},
  source = {Crossref},
}

@article{vivet2021intact,
  author = {Vivet, Pascal and Guthmuller, Eric and Thonnart, Yvain and Pillonnet, Gael and Fuguet, Cesar and Miro-Panades, Ivan and Moritz, Guillaume and Durupt, Jean and Bernard, Christian and Varreau, Didier and Pontes, Julian and Thuries, Sebastien and Coriat, David and Harrand, Michel and Dutoit, Denis and Lattard, Didier and Arnaud, Lucile and Charbonnier, Jean and Coudrain, Perceval and Garnier, Arnaud and Berger, Frederic and Gueugnot, Alain and Greiner, Alain and Meunier, Quentin L. and Farcy, Alexis and Arriordaz, Alexandre and Cheramy, Severine and Clermidy, Fabien},
  title = {{IntAct:} {A} 96-Core Processor With Six Chiplets {3D}-Stacked on an Active Interposer With Distributed Interconnects and Integrated Power Management},
  journal = {IEEE J. Solid-State Circuits},
  volume = {56},
  number = {1},
  pages = {79--97},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  year = {2021},
  issn = {0018-9200, 1558-173X},
  doi = {10.1109/jssc.2020.3036341},
  url = {https://doi.org/10.1109/jssc.2020.3036341},
  bdsk-url-1 = {https://doi.org/10.1109/JSSC.2020.3036341},
  source = {Crossref},
}

@inproceedings{wang2020apq,
  author = {Wang, Tianzhe and Wang, Kuan and Cai, Han and Lin, Ji and Liu, Zhijian and Wang, Hanrui and Lin, Yujun and Han, Song},
  title = {{APQ:} Joint Search for Network Architecture, Pruning and Quantization Policy},
  booktitle = {2020 {IEEE/CVF} Conference on Computer Vision and Pattern Recognition, {CVPR} 2020, Seattle, WA, USA, June 13-19, 2020},
  pages = {2075--2084},
  publisher = {{IEEE}},
  year = {2020},
  doi = {10.1109/CVPR42600.2020.00215},
  url = {https://doi.org/10.1109/CVPR42600.2020.00215},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl = {https://dblp.org/rec/conf/cvpr/WangWCLL0LH20.bib},
  timestamp = {Tue, 22 Dec 2020 00:00:00 +0100},
}

@book{weik1955survey,
  author = {Weik, Martin H.},
  title = {A Survey of Domestic Electronic Digital Computing Systems},
  publisher = {Ballistic Research Laboratories},
  year = {1955},
  language = {en},
}

@article{wong2012metal,
  author = {Wong, H.-S. Philip and Lee, Heng-Yuan and Yu, Shimeng and Chen, Yu-Sheng and Wu, Yi and Chen, Pang-Shiu and Lee, Byoungil and Chen, Frederick T. and Tsai, Ming-Jinn},
  title = {{Metal{\textendash}Oxide} {RRAM}},
  journal = {Proc. IEEE},
  volume = {100},
  number = {6},
  pages = {1951--1970},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  year = {2012},
  issn = {0018-9219, 1558-2256},
  doi = {10.1109/jproc.2012.2190369},
  url = {https://doi.org/10.1109/jproc.2012.2190369},
  source = {Crossref},
}

@article{xiong2021mribased,
  author = {Xiong, Siyu and Wu, Guoqing and Fan, Xitian and Feng, Xuan and Huang, Zhongcheng and Cao, Wei and Zhou, Xuegong and Ding, Shijin and Yu, Jinhua and Wang, Lingli and Shi, Zhifeng},
  title = {{MRI}-based brain tumor segmentation using {FPGA}-accelerated neural network},
  journal = {BMC Bioinf.},
  volume = {22},
  number = {1},
  pages = {421},
  publisher = {Springer Science and Business Media LLC},
  year = {2021},
  issn = {1471-2105},
  doi = {10.1186/s12859-021-04347-6},
  url = {https://doi.org/10.1186/s12859-021-04347-6},
  urldate = {2023-11-07},
  bdsk-url-1 = {https://doi.org/10.1186/s12859-021-04347-6},
  source = {Crossref},
  keywords = {Brain tumor segmatation, FPGA acceleration, Neural network},
  abstract = {Brain tumor segmentation is a challenging problem in medical image processing and analysis. It is a very time-consuming and error-prone task. In order to reduce the burden on physicians and improve the segmentation accuracy, the computer-aided detection (CAD) systems need to be developed. Due to the powerful feature learning ability of the deep learning technology, many deep learning-based methods have been applied to the brain tumor segmentation CAD systems and achieved satisfactory accuracy. However, deep learning neural networks have high computational complexity, and the brain tumor segmentation process consumes significant time. Therefore, in order to achieve the high segmentation accuracy of brain tumors and obtain the segmentation results efficiently, it is very demanding to speed up the segmentation process of brain tumors.},
}

@article{xiu2019time,
  author = {Xiu, Liming},
  title = {Time Moore: {Exploiting} {Moore's} Law From The Perspective of Time},
  journal = {IEEE Solid-State Circuits Mag.},
  volume = {11},
  number = {1},
  pages = {39--55},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  year = {2019},
  issn = {1943-0582, 1943-0590},
  doi = {10.1109/mssc.2018.2882285},
  url = {https://doi.org/10.1109/mssc.2018.2882285},
  source = {Crossref},
}

@article{young2018recent,
 author = {Young, Tom and Hazarika, Devamanyu and Poria, Soujanya and Cambria, Erik},
 doi = {10.1109/mci.2018.2840738},
 issn = {1556-603X, 1556-6048},
 journal = {IEEE Comput. Intell. Mag.},
 number = {3},
 pages = {55--75},
 publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
 source = {Crossref},
 title = {Recent Trends in Deep Learning Based Natural Language Processing {[Review Article]}},
 url = {https://doi.org/10.1109/mci.2018.2840738},
 volume = {13},
 year = {2018}
}

@article{yu2023rl,
 abstract = {Logic synthesis is a crucial step in electronic design automation tools. The rapid developments of reinforcement learning (RL) have enabled the automated exploration of logic synthesis. Existing RL based methods may lead to data inefficiency, and the exploration approaches for FPGA and ASIC technology mapping in recent works lack the flexibility of the learning process. This work proposes ESE, a reinforcement learning based framework to efficiently learn the logic synthesis process. The framework supports the modeling of logic optimization and technology mapping for FPGA and ASIC. The optimization for the execution time of the synthesis script is also considered. For the modeling of FPGA mapping, the logic optimization and technology mapping are combined to be learned in a flexible way. For the modeling of ASIC mapping, the standard cell based optimization and LUT optimization operations are incorporated into the ASIC synthesis flow. To improve the utilization of samples, the Proximal Policy Optimization model is adopted. Furthermore, the framework is enhanced by supporting MIG based synthesis exploration. Experiments show that for FPGA technology mapping on the VTR benchmark, the average LUT-Level-Product and script runtime are improved by more than 18.3\% and 12.4\% respectively than previous works. For ASIC mapping on the EPFL benchmark, the average Area-Delay-Product is improved by 14.5\%.},
 address = {New York, NY, USA},
 author = {Qian, Yu and Zhou, Xuegong and Zhou, Hao and Wang, Lingli},
 doi = {10.1145/3632174},
 issn = {1084-4309},
 journal = {ACM Trans. Des. Autom. Electron. Syst.},
 keywords = {technology mapping, Majority-Inverter Graph, And-Inverter Graph, Reinforcement learning, logic optimization},
 month = nov,
 note = {Just Accepted},
 publisher = {Association for Computing Machinery},
 title = {An Efficient Reinforcement Learning Based Framework for Exploring Logic Synthesis},
 url = {https://doi.org/10.1145/3632174},
 year = {2023}
}

@inproceedings{zhang2015fpga,
 author = {Zhang, Chen and Li, Peng and Sun, Guangyu and Guan, Yijin and Xiao, Bingjun and Cong, Jason},
 booktitle = {Proceedings of the 2015 {ACM/SIGDA} International Symposium on Field-Programmable Gate Arrays},
 doi = {10.1145/2684746.2689060},
 pages = {161--170},
 publisher = {ACM},
 series = {FPGA '15},
 title = {Optimizing {FPGA}-based Accelerator Design for Deep Convolutional Neural Networks},
 url = {https://doi.org/10.1145/2684746.2689060},
 year = {2015}
}

@inproceedings{zhang2022fullstack,
 abstract = {The rapidly-changing deep learning landscape presents a unique opportunity for building inference accelerators optimized for specific datacenter-scale workloads. We propose Full-stack Accelerator Search Technique (FAST), a hardware accelerator search framework that defines a broad optimization environment covering key design decisions within the hardware-software stack, including hardware datapath, software scheduling, and compiler passes such as operation fusion and tensor padding. In this paper, we analyze bottlenecks in state-of-the-art vision and natural language processing (NLP) models, including EfficientNet and BERT, and use FAST to design accelerators capable of addressing these bottlenecks. FAST-generated accelerators optimized for single workloads improve Perf/TDP by 3.7\texttimes{} on average across all benchmarks compared to TPU-v3. A FAST-generated accelerator optimized for serving a suite of workloads improves Perf/TDP by 2.4\texttimes{} on average compared to TPU-v3. Our return on investment analysis shows that FAST-generated accelerators can potentially be practical for moderate-sized datacenter deployments.},
 address = {New York, NY, USA},
 author = {Zhang, Dan and Huda, Safeen and Songhori, Ebrahim and Prabhu, Kartik and Le, Quoc and Goldie, Anna and Mirhoseini, Azalia},
 booktitle = {Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems},
 doi = {10.1145/3503222.3507767},
 isbn = {9781450392051},
 keywords = {design space exploration, hardware-software codesign, tensor processing unit, machine learning, operation fusion},
 location = {Lausanne, Switzerland},
 numpages = {16},
 pages = {27--42},
 publisher = {Association for Computing Machinery},
 series = {ASPLOS '22},
 title = {A Full-Stack Search Technique for Domain Optimized Deep Learning Accelerators},
 url = {https://doi.org/10.1145/3503222.3507767},
 year = {2022}
}

% Journal name uses a plain LaTeX ampersand escape; the export had left an
% HTML entity in its place.
@article{zhou2022photonic,
 author = {Zhou, Hailong and Dong, Jianji and Cheng, Junwei and Dong, Wenchan and Huang, Chaoran and Shen, Yichen and Zhang, Qiming and Gu, Min and Qian, Chao and Chen, Hongsheng and Ruan, Zhichao and Zhang, Xinliang},
 doi = {10.1038/s41377-022-00717-8},
 issn = {2047-7538},
 journal = {Light: Science \& Applications},
 number = {1},
 pages = {30},
 publisher = {Springer Science and Business Media LLC},
 source = {Crossref},
 title = {Photonic matrix multiplication lights up photonic accelerator and beyond},
 url = {https://doi.org/10.1038/s41377-022-00717-8},
 volume = {11},
 year = {2022}
}

% The acronym in the title is brace-protected so sentence-casing styles keep
% it uppercase.
@inproceedings{zhou2023area,
 author = {Zhou, Guanglei and Anderson, Jason H.},
 booktitle = {Proceedings of the 28th Asia and South Pacific Design Automation Conference},
 pages = {159--165},
 title = {Area-Driven {FPGA} Logic Synthesis Using Reinforcement Learning},
 year = {2023}
}

% Conference paper from IISWC 2018; identifiers grouped at the end.
@inproceedings{zhu2018benchmarking,
 author       = {Zhu, Hongyu and Akrout, Mohamed and Zheng, Bojian and Pelegris, Andrew and Jayarajan, Anand and Phanishayee, Amar and Schroeder, Bianca and Pekhimenko, Gennady},
 title        = {Benchmarking and Analyzing Deep Neural Network Training},
 booktitle    = {2018 IEEE International Symposium on Workload Characterization (IISWC)},
 pages        = {88--100},
 year         = {2018},
 organization = {IEEE},
 publisher    = {IEEE},
 doi          = {10.1109/iiswc.2018.8573476},
 url          = {https://doi.org/10.1109/iiswc.2018.8573476},
 source       = {Crossref}
}

% Same work as the entry keyed zhang2022fullstack (identical DOI). Kept under
% this key because it may be cited elsewhere -- TODO confirm and consolidate.
@inproceedings{zhangfast,
 author = {Zhang, Dan and Huda, Safeen and Songhori, Ebrahim and Prabhu, Kartik and Le, Quoc and Goldie, Anna and Mirhoseini, Azalia},
 title = {A Full-Stack Search Technique for Domain Optimized Deep Learning Accelerators},
 year = {2022},
 isbn = {9781450392051},
 publisher = {Association for Computing Machinery},
 address = {New York, NY, USA},
 url = {https://doi.org/10.1145/3503222.3507767},
 doi = {10.1145/3503222.3507767},
 abstract = {The rapidly-changing deep learning landscape presents a unique opportunity for building inference accelerators optimized for specific datacenter-scale workloads. We propose Full-stack Accelerator Search Technique (FAST), a hardware accelerator search framework that defines a broad optimization environment covering key design decisions within the hardware-software stack, including hardware datapath, software scheduling, and compiler passes such as operation fusion and tensor padding. In this paper, we analyze bottlenecks in state-of-the-art vision and natural language processing (NLP) models, including EfficientNet and BERT, and use FAST to design accelerators capable of addressing these bottlenecks. FAST-generated accelerators optimized for single workloads improve Perf/TDP by 3.7\texttimes{} on average across all benchmarks compared to TPU-v3. A FAST-generated accelerator optimized for serving a suite of workloads improves Perf/TDP by 2.4\texttimes{} on average compared to TPU-v3. Our return on investment analysis shows that FAST-generated accelerators can potentially be practical for moderate-sized datacenter deployments.},
 booktitle = {Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems},
 pages = {27--42},
 numpages = {16},
 keywords = {design space exploration, hardware-software codesign, tensor processing unit, machine learning, operation fusion},
 location = {Lausanne, Switzerland},
 series = {ASPLOS '22}
}

% Publisher and its city are stored in separate fields, and the journal is
% given by its plain name without a hard-coded abbreviation.
@article{huang2022flexible,
 author = {Huang, Shihua and Waeijen, Luc and Corporaal, Henk},
 title = {How Flexible is Your Computing System?},
 journal = {ACM Transactions on Embedded Computing Systems},
 volume = {21},
 number = {4},
 pages = {1--41},
 year = {2022},
 publisher = {Association for Computing Machinery},
 address = {New York, NY, USA}
}