cs249r_book/book/quarto/contents/vol1/optimizations/optimizations.bib

% AlexNet (Krizhevsky, Sutskever, Hinton), cited from the Communications of the ACM
% article (vol. 60, no. 6, 2017). Only journal-level metadata is kept so the container
% title and contributor list are unambiguous.
@article{alexnet2012,
  title = {ImageNet classification with deep convolutional neural networks},
  author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.},
  journal = {Communications of the ACM},
  publisher = {Association for Computing Machinery (ACM)},
  volume = {60},
  number = {6},
  pages = {84--90},
  doi = {10.1145/3065386},
  issn = {0001-0782,1557-7317},
  url = {https://doi.org/10.1145/3065386},
  source = {Crossref},
  date = {2017-05-24},
}

% Wess et al., ANNETTE execution-time estimation, IEEE Access vol. 9.
@article{annette2020,
  author    = {Wess, Matthias and Ivanov, Matvey and Unger, Christoph and Nookala, Anvesh and
               Wendt, Alexander and Jantsch, Axel},
  title     = {ANNETTE: Accurate Neural Network Execution Time Estimation With Stacked Models},
  journal   = {IEEE Access},
  volume    = {9},
  pages     = {3545--3556},
  date      = {2021},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  issn      = {2169-3536},
  doi       = {10.1109/access.2020.3047259},
  url       = {https://doi.org/10.1109/access.2020.3047259},
  source    = {Crossref},
}

% Ba and Caruana (2014), NeurIPS volume 27.
@article{ba2014deep,
  author  = {Ba, Jimmy and Caruana, Rich},
  title   = {Do Deep Nets Really Need to be Deep?},
  journal = {Advances in Neural Information Processing Systems (NeurIPS)},
  volume  = {27},
  year    = {2014},
}

% Banbury et al. (2020), arXiv preprint 2003.04821 (cs.PF).
% The eprint field carries the bare arXiv identifier so eprint-aware styles can link it.
@article{banbury2020benchmarking,
  title = {Benchmarking TinyML Systems: Challenges and Direction},
  author = {
    Banbury, Colby R. and Reddi, Vijay Janapa and Lam, Max and Fu, William and Fazel, Amin and
    Holleman, Jeremy and Huang, Xinyuan and Hurtado, Robert and Kanter, David and Lokhmotov, Anton
    and Patterson, David and Pau, Danilo and Seo, Jae-sun and Sieracki, Jeff and Thakker, Urmish
    and Verhelst, Marian and Yadav, Poonam
  },
  journal = {arXiv preprint arXiv:2003.04821},
  url = {http://arxiv.org/abs/2003.04821v4},
  date = {2020-03-10},
  primaryclass = {cs.PF},
  archiveprefix = {arXiv},
  eprint = {2003.04821},
}

% Liu, Bellec, et al. (2018), Frontiers in Neuroscience 12:840.
% Journal article: only journal metadata is kept as the container. Accented letters use
% the {\"a} brace form so BibTeX treats each as a single character when sorting.
@article{Bellec2018,
  title = {Memory-Efficient Deep Learning on a SpiNNaker 2 Prototype},
  author = {
    Liu, Chen and Bellec, Guillaume and Vogginger, Bernhard and Kappel, David and Partzsch,
    Johannes and Neum{\"a}rker, Felix and H{\"o}ppner, Sebastian and Maass, Wolfgang and Furber,
    Steve B. and Legenstein, Robert and Mayr, Christian G.
  },
  journal = {Frontiers in Neuroscience},
  publisher = {Frontiers Media SA},
  volume = {12},
  pages = {840},
  doi = {10.3389/fnins.2018.00840},
  issn = {1662-453X},
  url = {https://doi.org/10.3389/fnins.2018.00840},
  source = {Crossref},
  date = {2018-11-16},
  essn = {1662-453X},
}

% Bengio, Leonard, Courville (2013), arXiv preprint 1308.3432 (cs.LG).
% Same work as the bengio2013estimating key in this file; both keys are kept so that
% existing citations keep resolving.
@article{Bengio2013,
  title = {Estimating or Propagating Gradients Through Stochastic Neurons for Conditional Computation},
  author = {Bengio, Yoshua and L{\'e}onard, Nicholas and Courville, Aaron},
  journal = {arXiv preprint},
  url = {http://arxiv.org/abs/1308.3432v1},
  date = {2013-08-15},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1308.3432},
}

% Bengio, Leonard, Courville (2013), arXiv preprint 1308.3432 (cs.LG).
% Same work as the Bengio2013 key in this file; both keys are kept so that existing
% citations keep resolving.
@article{bengio2013estimating,
  title = {Estimating or Propagating Gradients Through Stochastic Neurons for Conditional Computation},
  author = {Bengio, Yoshua and L{\'e}onard, Nicholas and Courville, Aaron},
  journal = {arXiv preprint arXiv:1308.3432},
  url = {http://arxiv.org/abs/1308.3432v1},
  date = {2013-08-15},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1308.3432},
}

% Bengio, Bacon, Pineau, Precup (2015), arXiv preprint 1511.06297 (cs.LG).
@article{bengio2015conditional,
  title = {Conditional Computation in Neural Networks for faster models},
  author = {Bengio, Emmanuel and Bacon, Pierre-Luc and Pineau, Joelle and Precup, Doina},
  journal = {arXiv preprint arXiv:1511.06297},
  url = {http://arxiv.org/abs/1511.06297v2},
  date = {2015-11-19},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1511.06297},
}

% Bergstra, Bardenet, Bengio, Kegl (2011), NeurIPS paper on hyper-parameter optimization.
% NOTE(review): this key previously resolved to an unrelated Bernoulli-journal MCMC paper
% that shares two co-authors; confirm the chapter text intends the NeurIPS 2011 paper.
@inproceedings{Bergstra2011,
  title = {Algorithms for Hyper-Parameter Optimization},
  author = {Bergstra, James and Bardenet, R{\'e}mi and Bengio, Yoshua and K{\'e}gl, Bal{\'a}zs},
  year = {2011},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  volume = {24},
  pages = {2546--2554},
}

% Blalock et al. (2020), survey of neural network pruning, MLSys vol. 2, pp. 129--146.
% Venue, volume and page range are the ones this key already carried.
@inproceedings{blalock2020state,
  title = {What is the State of Neural Network Pruning?},
  author = {Blalock, Davis and Gonzalez Ortiz, Jose Javier and Frankle, Jonathan and Guttag, John},
  year = {2020},
  booktitle = {Proceedings of Machine Learning and Systems (MLSys)},
  volume = {2},
  pages = {129--146},
}

% Brown et al. (2020), NeurIPS volume 33; author list truncated with "and others".
@article{brown2020gpt3,
  author  = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and
             Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and
             Saxena, Girish and Arora, Sandhini and others},
  title   = {Language Models are Few-Shot Learners},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {33},
  pages   = {1877--1901},
  year    = {2020},
}

% Cai, Zhu, Han: ProxylessNAS, ICLR 2019 (record exported from dblp).
@inproceedings{cai2018proxylessnas,
  author    = {Cai, Han and Zhu, Ligeng and Han, Song},
  title     = {ProxylessNAS: Direct Neural Architecture Search on Target Task and Hardware},
  booktitle = {7th International Conference on Learning Representations, ICLR 2019,
               New Orleans, LA, USA, May 6-9, 2019},
  year      = {2019},
  publisher = {OpenReview.net},
  url       = {https://openreview.net/forum?id=HylVB3AqYm},
  biburl    = {https://dblp.org/rec/conf/iclr/CaiZH19.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Tue, 24 Nov 2020 00:00:00 +0100},
}

% Cai, Gan, Han (2020): Once-for-All, ICLR.
@inproceedings{Cai2020,
  author    = {Cai, Han and Gan, Chuang and Han, Song},
  title     = {Once-for-All: Train One Network and Specialize it for Efficient Deployment},
  booktitle = {International Conference on Learning Representations},
  year      = {2020},
}

% Chen, Xu, Zhang, Guestrin (2016), arXiv preprint 1604.06174 (cs.LG).
% NOTE(review): this key previously resolved to an unrelated astro-ph preprint
% (arXiv 1608.04665); confirm the chapter text intends the sublinear-memory training paper.
@article{chen2016training,
  title = {Training Deep Nets with Sublinear Memory Cost},
  author = {Chen, Tianqi and Xu, Bing and Zhang, Chiyuan and Guestrin, Carlos},
  journal = {arXiv preprint arXiv:1604.06174},
  url = {http://arxiv.org/abs/1604.06174},
  date = {2016-04-21},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1604.06174},
}

% Cheng et al. (2022), ACM Computing Surveys; no volume, pages or DOI recorded.
@article{Cheng2022,
  author  = {Cheng, Yu and others},
  title   = {Memory-Efficient Deep Learning: Advances in Model Compression and Sparsification},
  journal = {ACM Computing Surveys},
  year    = {2022},
}

% Choi et al.: PACT, arXiv preprint 1805.06085 (cs.CV).
@article{Choi2019,
  author        = {Choi, Jungwook and Wang, Zhuo and Venkataramani, Swagath and
                   Chuang, Pierce I-Jen and Srinivasan, Vijayalakshmi and Gopalakrishnan, Kailash},
  title         = {PACT: Parameterized Clipping Activation for Quantized Neural Networks},
  journal       = {arXiv preprint},
  date          = {2018-05-16},
  eprint        = {1805.06085},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CV},
  url           = {http://arxiv.org/abs/1805.06085v2},
}

% Choudhary et al. (2020), Artificial Intelligence Review 53(7).
@article{choudhary2020comprehensive,
  author    = {Choudhary, Tejalal and Mishra, Vipul and Goswami, Anurag and
               Sarangapani, Jagannathan},
  title     = {A comprehensive survey on model compression and acceleration},
  journal   = {Artificial Intelligence Review},
  volume    = {53},
  number    = {7},
  pages     = {5113--5155},
  date      = {2020-02-08},
  publisher = {Springer Science and Business Media LLC},
  issn      = {0269-2821,1573-7462},
  doi       = {10.1007/s10462-020-09816-7},
  url       = {https://doi.org/10.1007/s10462-020-09816-7},
  source    = {Crossref},
}

% Gong et al. (2019), Differentiable Soft Quantization, arXiv preprint 1908.05033 (cs.CV).
% NOTE(review): the citation key names Choukroun, but none of the listed authors is
% Choukroun. The key may have been meant for a different low-bit quantization paper;
% confirm which work the chapter text intends before relying on this entry.
@article{choukroun2019low,
  title = {Differentiable Soft Quantization: Bridging Full-Precision and Low-Bit Neural Networks},
  author = {
    Gong, Ruihao and Liu, Xianglong and Jiang, Shenghu and Li, Tianxiang and Hu, Peng and Lin,
    Jiazhen and Yu, Fengwei and Yan, Junjie
  },
  journal = {arXiv preprint arXiv:1908.05033},
  url = {http://arxiv.org/abs/1908.05033v1},
  date = {2019-08-14},
  primaryclass = {cs.CV},
  archiveprefix = {arXiv},
  eprint = {1908.05033},
}

% Chowdhery et al. (2021), International Symposium on Computer Architecture; no pages or DOI recorded.
@inproceedings{Chowdhery2021,
  author    = {Chowdhery, Aakanksha and Noy, Anatoli and Misra, Gaurav and Dai, Zhuyun and
               Le, Quoc V. and Dean, Jeff},
  title     = {Edge TPU: An Edge-Optimized Inference Accelerator for Deep Learning},
  booktitle = {International Symposium on Computer Architecture},
  year      = {2021},
}

% Chowdhery et al. (2022): PaLM, arXiv preprint 2204.02311 (cs.CL).
% The eprint field carries the bare arXiv identifier so eprint-aware styles can link it.
@article{chowdhery2022palm,
  title = {PaLM: Scaling Language Modeling with Pathways},
  author = {
    Chowdhery, Aakanksha and Narang, Sharan and Devlin, Jacob and Bosma, Maarten and Mishra, Gaurav
    and Roberts, Adam and Barham, Paul and Chung, Hyung Won and Sutton, Charles and Gehrmann,
    Sebastian and Schuh, Parker and Shi, Kensen and Tsvyashchenko, Sasha and Maynez, Joshua and
    Rao, Abhishek and Barnes, Parker and Tay, Yi and Shazeer, Noam and Prabhakaran, Vinodkumar and
    Reif, Emily and Du, Nan and Hutchinson, Ben and Pope, Reiner and Bradbury, James and Austin,
    Jacob and Isard, Michael and Gur-Ari, Guy and Yin, Pengcheng and Duke, Toju and Levskaya,
    Anselm and Ghemawat, Sanjay and Dev, Sunipa and Michalewski, Henryk and Garcia, Xavier and
    Misra, Vedant and Robinson, Kevin and Fedus, Liam and Zhou, Denny and Ippolito, Daphne and
    Luan, David and Lim, Hyeontaek and Zoph, Barret and Spiridonov, Alexander and Sepassi, Ryan and
    Dohan, David and Agrawal, Shivani and Omernick, Mark and Dai, Andrew M. and Pillai,
    Thanumalayan Sankaranarayana and Pellat, Marie and Lewkowycz, Aitor and Moreira, Erica and
    Child, Rewon and Polozov, Oleksandr and Lee, Katherine and Zhou, Zongwei and Wang, Xuezhi and
    Saeta, Brennan and Diaz, Mark and Firat, Orhan and Catasta, Michele and Wei, Jason and
    Meier-Hellstern, Kathy and Eck, Douglas and Dean, Jeff and Petrov, Slav and Fiedel, Noah
  },
  journal = {arXiv preprint arXiv:2204.02311},
  url = {http://arxiv.org/abs/2204.02311v5},
  date = {2022-04-05},
  primaryclass = {cs.CL},
  archiveprefix = {arXiv},
  eprint = {2204.02311},
}

% Chu et al. (2021), CVPR Workshops, pp. 3016--3025.
@inproceedings{chu2021discovering,
  author    = {Chu, Grace and Arikan, Okan and Bender, Gabriel and Wang, Weijun and
               Brighton, Achille and Kindermans, Pieter-Jan and Liu, Hanxiao and Akin, Berkin and
               Gupta, Suyog and Howard, Andrew},
  title     = {Discovering Multi-Hardware Mobile Models via Architecture Search},
  booktitle = {2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)},
  pages     = {3016--3025},
  date      = {2021-06},
  publisher = {IEEE},
  doi       = {10.1109/cvprw53098.2021.00337},
  url       = {https://doi.org/10.1109/cvprw53098.2021.00337},
  source    = {Crossref},
  biburl    = {https://dblp.org/rec/conf/cvpr/ChuABWBKLAG021.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Mon, 18 Jul 2022 01:00:00 +0200},
}

% Courbariaux, Bengio, David: BinaryConnect, NeurIPS volume 28.
@article{Courbariaux2016,
  author  = {Courbariaux, Matthieu and Bengio, Yoshua and David, Jean-Pierre},
  title   = {BinaryConnect: Training Deep Neural Networks with Binary Weights during Propagations},
  journal = {Advances in Neural Information Processing Systems (NeurIPS)},
  volume  = {28},
  pages   = {3123--3131},
  year    = {2016},
}

% Cubuk et al. (2019): AutoAugment, CVPR, pp. 113--123.
% The accented letter uses the {\'e} brace form so BibTeX treats it as one character.
@inproceedings{Cubuk2019,
  title = {AutoAugment: Learning Augmentation Strategies From Data},
  author = {Cubuk, Ekin D. and Zoph, Barret and Man{\'e}, Dandelion and Vasudevan, Vijay and Le, Quoc V.},
  booktitle = {2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  publisher = {IEEE},
  pages = {113--123},
  doi = {10.1109/cvpr.2019.00020},
  url = {https://doi.org/10.1109/cvpr.2019.00020},
  source = {Crossref},
  date = {2019-06},
}

% Dao et al. (2022): Monarch structured matrices, arXiv preprint 2204.00595 (cs.LG).
% The accented letter uses the {\'e} brace form so BibTeX treats it as one character.
@article{dao2022monarchexpressivestructuredmatrices,
  title = {Monarch: Expressive Structured Matrices for Efficient and Accurate Training},
  author = {
    Dao, Tri and Chen, Beidi and Sohoni, Nimit and Desai, Arjun and Poli, Michael and Grogan,
    Jessica and Liu, Alexander and Rao, Aniruddh and Rudra, Atri and R{\'e}, Christopher
  },
  url = {http://arxiv.org/abs/2204.00595v1},
  date = {2022-04-01},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2204.00595},
}

% Davies et al. (2021), Nature Electronics; no volume, pages or DOI recorded.
@article{Davies2021,
  author  = {Davies, Mike and others},
  title   = {Advancing Neuromorphic Computing with Sparse Networks},
  journal = {Nature Electronics},
  year    = {2021},
}

% Dean, Patterson, Young (2018), IEEE Micro 38(2).
@article{dean2018new,
  author    = {Dean, Jeff and Patterson, David and Young, Cliff},
  title     = {A New Golden Age in Computer Architecture: Empowering the Machine-Learning Revolution},
  journal   = {IEEE Micro},
  volume    = {38},
  number    = {2},
  pages     = {21--29},
  date      = {2018-03},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  issn      = {0272-1732,1937-4143},
  doi       = {10.1109/mm.2018.112130030},
  url       = {https://doi.org/10.1109/mm.2018.112130030},
  source    = {Crossref},
}

% Denton, Chintala, Fergus (2014), NeurIPS, pp. 1269--1277.
@inproceedings{Denton2014,
  author    = {Denton, Emily L and Chintala, Soumith and Fergus, Rob},
  title     = {Exploiting Linear Structure Within Convolutional Networks for Efficient Evaluation},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  pages     = {1269--1277},
  year      = {2014},
}

% Dettmers and Zettlemoyer (2019), arXiv preprint 1907.04840 (cs.LG).
@article{dettmers2019sparse,
  title = {Sparse Networks from Scratch: Faster Training without Losing Performance},
  author = {Dettmers, Tim and Zettlemoyer, Luke},
  journal = {arXiv preprint arXiv:1907.04840},
  url = {http://arxiv.org/abs/1907.04840v2},
  date = {2019-07-10},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1907.04840},
}

% Devlin et al. (2018): BERT, arXiv preprint 1810.04805 (cs.CL).
@article{devlin2018bert,
  title = {BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  journal = {arXiv preprint arXiv:1810.04805},
  url = {http://arxiv.org/abs/1810.04805v2},
  date = {2018-10-11},
  primaryclass = {cs.CL},
  archiveprefix = {arXiv},
  eprint = {1810.04805},
}

% Dong et al. (2022): SplitNets, CVPR, pp. 12549--12559.
% Initials are written with a space ("H. T.") so name parsers see two given-name parts.
@inproceedings{dong2022splitnets,
  title = {
    SplitNets: Designing Neural Architectures for Efficient Distributed Computing on Head-Mounted
    Systems
  },
  author = {
    Dong, Xin and De Salvo, Barbara and Li, Meng and Liu, Chiao and Qu, Zhongnan and Kung, H. T. and
    Li, Ziyun
  },
  booktitle = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  publisher = {IEEE},
  pages = {12549--12559},
  doi = {10.1109/cvpr52688.2022.01223},
  url = {https://doi.org/10.1109/cvpr52688.2022.01223},
  source = {Crossref},
  date = {2022-06},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl = {https://dblp.org/rec/conf/cvpr/DongSLLQ0L22.bib},
  timestamp = {Sun, 22 Jan 2023 00:00:00 +0100},
}

% Hu et al. (2023), dynamic pruning, IEEE Access vol. 11.
@article{dynamicpruning2023,
  author    = {Hu, Jie and Lin, Peng and Zhang, Huajun and Lan, Zining and Chen, Wenxin and
               Xie, Kailiang and Chen, Siyun and Wang, Hao and Chang, Sheng},
  title     = {A Dynamic Pruning Method on Multiple Sparse Structures in Deep Neural Networks},
  journal   = {IEEE Access},
  volume    = {11},
  pages     = {38448--38457},
  date      = {2023},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  issn      = {2169-3536},
  doi       = {10.1109/access.2023.3267469},
  url       = {https://doi.org/10.1109/access.2023.3267469},
  source    = {Crossref},
  keywords  = {Sparse matrices;Filtering theory;Training;Information filters;Neural
               networks;Tensors;Deep learning;Convolutional neural networks;dynamic channel
               pruning;network compression and acceleration;structured pruning},
}

% Elsen, Dukhan, Gale, Simonyan (2020), CVPR, pp. 14617--14626.
@inproceedings{elsen2020fast,
  author    = {Elsen, Erich and Dukhan, Marat and Gale, Trevor and Simonyan, Karen},
  title     = {Fast Sparse ConvNets},
  booktitle = {2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  pages     = {14617--14626},
  date      = {2020-06},
  publisher = {IEEE},
  doi       = {10.1109/cvpr42600.2020.01464},
  url       = {https://doi.org/10.1109/cvpr42600.2020.01464},
  source    = {Crossref},
}

% Elsken, Metzen, Hutter (2019): chapter "Neural Architecture Search" in the Springer
% book "Automated Machine Learning", pp. 63--77. Only book metadata is kept as the
% container. Same chapter as the elsken2019neural key in this file.
@incollection{Elsken2019,
  title = {Neural Architecture Search},
  author = {Elsken, Thomas and Metzen, Jan Hendrik and Hutter, Frank},
  booktitle = {Automated Machine Learning},
  publisher = {Springer International Publishing},
  pages = {63--77},
  doi = {10.1007/978-3-030-05318-5\_3},
  isbn = {9783030053178,9783030053185},
  issn = {2520-131X,2520-1328},
  url = {https://doi.org/10.1007/978-3-030-05318-5\_3},
  source = {Crossref},
  date = {2019},
}

% Elsken, Metzen, Hutter (2019): chapter "Neural Architecture Search" in the Springer
% book "Automated Machine Learning", pp. 63--77. Only book metadata is kept; journal-style
% volume and issue numbers do not apply to this chapter. Same chapter as the Elsken2019
% key in this file.
@incollection{elsken2019neural,
  title = {Neural Architecture Search},
  author = {Elsken, Thomas and Metzen, Jan Hendrik and Hutter, Frank},
  booktitle = {Automated Machine Learning},
  publisher = {Springer International Publishing},
  pages = {63--77},
  doi = {10.1007/978-3-030-05318-5\_3},
  isbn = {9783030053178,9783030053185},
  issn = {2520-131X,2520-1328},
  url = {https://doi.org/10.1007/978-3-030-05318-5\_3},
  source = {Crossref},
  date = {2019},
}

% Fahim et al. (2021): hls4ml, arXiv preprint 2103.05579 (cs.LG).
% Compound surnames are given in "Last, First" form so they are not split incorrectly.
@article{fahim2021hls4ml,
  title = {
    hls4ml: An Open-Source Codesign Workflow to Empower Scientific Low-Power Machine Learning
    Devices
  },
  author = {
    Fahim, Farah and Hawks, Benjamin and Herwig, Christian and Hirschauer, James and Jindariani,
    Sergo and Tran, Nhan and Carloni, Luca P. and Di Guglielmo, Giuseppe and Harris, Philip and
    Krupa, Jeffrey and Rankin, Dylan and Valentin, Manuel Blanco and Hester, Josiah and Luo, Yingyi
    and Mamish, John and Ogrenci-Memik, Seda and Aarrestad, Thea and Javed, Hamza and Loncar,
    Vladimir and Pierini, Maurizio and Pol, Adrian Alan and Summers, Sioni and Duarte, Javier and
    Hauck, Scott and Hsu, Shih-Chieh and Ngadiuba, Jennifer and Liu, Mia and Hoang, Duc and
    Kreinar, Edward and Wu, Zhenbin
  },
  url = {http://arxiv.org/abs/2103.05579v3},
  date = {2021-03-09},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2103.05579},
}

% Fedus, Zoph, Shazeer: Switch Transformers, Journal of Machine Learning Research.
@article{fedus2021switch,
  author  = {Fedus, William and Zoph, Barret and Shazeer, Noam},
  title   = {Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity},
  journal = {Journal of Machine Learning Research},
  year    = {2021},
}

% Feurer et al.: chapter "Auto-sklearn" in the Springer book "Automated Machine Learning"
% (2019), pp. 113--134. Only book metadata is kept as the container.
@incollection{Feurer2015,
  title = {Auto-sklearn: Efficient and Robust Automated Machine Learning},
  author = {
    Feurer, Matthias and Klein, Aaron and Eggensperger, Katharina and Springenberg, Jost Tobias and
    Blum, Manuel and Hutter, Frank
  },
  booktitle = {Automated Machine Learning},
  publisher = {Springer International Publishing},
  pages = {113--134},
  doi = {10.1007/978-3-030-05318-5\_6},
  isbn = {9783030053178,9783030053185},
  issn = {2520-131X,2520-1328},
  url = {https://doi.org/10.1007/978-3-030-05318-5\_6},
  source = {Crossref},
  date = {2019},
}

% Frankle and Carbin (2018): Lottery Ticket Hypothesis, arXiv preprint 1803.03635 (cs.LG).
@article{frankle2018lottery,
  title = {The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks},
  author = {Frankle, Jonathan and Carbin, Michael},
  journal = {arXiv preprint arXiv:1803.03635},
  url = {http://arxiv.org/abs/1803.03635v5},
  date = {2018-03-09},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1803.03635},
}

% Gale, Elsen, Hooker (2019), arXiv preprint 1902.09574 (cs.LG).
% Same work as the gale2020sparse key in this file; both keys are kept so that existing
% citations keep resolving.
@article{gale2019state,
  title = {The State of Sparsity in Deep Neural Networks},
  author = {Gale, Trevor and Elsen, Erich and Hooker, Sara},
  journal = {arXiv preprint arXiv:1902.09574},
  url = {http://arxiv.org/abs/1902.09574v1},
  date = {2019-02-25},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1902.09574},
}

% NOTE(review): the metadata in this entry looks mismatched. The key suggests a 2020
% paper by Gale, but title, authors, URL and primaryclass describe a math.DG arXiv
% preprint by Baraglia and Konno, while booktitle and pages refer to ICML proceedings.
% The intended reference cannot be determined from this file; confirm the source cited
% in the chapter text and replace these fields before relying on this key.
@article{Gale2020,
  title = {On the Bauer-Furuta and Seiberg-Witten invariants of families of $4$-manifolds},
  author = {Baraglia, David and Konno, Hokuto},
  journal = {arXiv preprint arXiv:1903.01649},
  booktitle = {Proceedings of the International Conference on Machine Learning (ICML)},
  pages = {8955--8967},
  url = {http://arxiv.org/abs/1903.01649v3},
  date = {2019-03-05},
  primaryclass = {math.DG},
  archiveprefix = {arXiv},
}

% Gale, Elsen, Hooker (2019), arXiv preprint 1902.09574 (cs.LG).
% Same work as the gale2019state key in this file; both keys are kept so that existing
% citations keep resolving.
@article{gale2020sparse,
  title = {The State of Sparsity in Deep Neural Networks},
  author = {Gale, Trevor and Elsen, Erich and Hooker, Sara},
  journal = {arXiv preprint arXiv:1902.09574},
  url = {http://arxiv.org/abs/1902.09574v1},
  date = {2019-02-25},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1902.09574},
}

% Gale et al. (2022): MegaBlocks, arXiv preprint 2211.15841 (cs.LG).
@article{gale2022megablocksefficientsparsetraining,
  author        = {Gale, Trevor and Narayanan, Deepak and Young, Cliff and Zaharia, Matei},
  title         = {MegaBlocks: Efficient Sparse Training with Mixture-of-Experts},
  date          = {2022-11-29},
  eprint        = {2211.15841},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {http://arxiv.org/abs/2211.15841v1},
}

% Gholami et al. (2021): quantization survey, arXiv preprint 2103.13630 (cs.CV).
% Same work as the gholami2021survey key in this file; both keys are kept so that
% existing citations keep resolving.
@article{Gholami2021,
  title = {A Survey of Quantization Methods for Efficient Neural Network Inference},
  author = {
    Gholami, Amir and Kim, Sehoon and Dong, Zhen and Yao, Zhewei and Mahoney, Michael W. and
    Keutzer, Kurt
  },
  journal = {arXiv preprint arXiv:2103.13630},
  url = {http://arxiv.org/abs/2103.13630v3},
  date = {2021-03-25},
  primaryclass = {cs.CV},
  archiveprefix = {arXiv},
  eprint = {2103.13630},
}

% Gholami et al. (2021): quantization survey, arXiv preprint 2103.13630 (cs.CV).
% The arXiv identifier lives in eprint rather than in a pseudo-volume. Same work as the
% Gholami2021 key in this file; both keys are kept so that existing citations keep resolving.
@article{gholami2021survey,
  title = {A Survey of Quantization Methods for Efficient Neural Network Inference},
  author = {
    Gholami, Amir and Kim, Sehoon and Dong, Zhen and Yao, Zhewei and Mahoney, Michael W. and
    Keutzer, Kurt
  },
  journal = {arXiv preprint arXiv:2103.13630},
  url = {http://arxiv.org/abs/2103.13630v3},
  date = {2021-03-25},
  primaryclass = {cs.CV},
  archiveprefix = {arXiv},
  eprint = {2103.13630},
}

% Gordon et al. (2018): MorphNet, CVPR, pp. 1586--1595.
% The ampersand in the title is the LaTeX escape \& (not an HTML entity).
@inproceedings{gordon2018morphnet,
  title = {MorphNet: Fast \& Simple Resource-Constrained Structure Learning of Deep Networks},
  author = {
    Gordon, Ariel and Eban, Elad and Nachum, Ofir and Chen, Bo and Wu, Hao and Yang, Tien-Ju and
    Choi, Edward
  },
  booktitle = {2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  publisher = {IEEE},
  pages = {1586--1595},
  doi = {10.1109/cvpr.2018.00171},
  url = {https://doi.org/10.1109/cvpr.2018.00171},
  source = {Crossref},
  date = {2018-06},
}

% Gordon, Duh, Andrews (2020), 5th Workshop on Representation Learning for NLP (ACL).
@inproceedings{gordon2020compressing,
  author    = {Gordon, Mitchell and Duh, Kevin and Andrews, Nicholas},
  title     = {Compressing BERT: Studying the Effects of Weight Pruning on Transfer Learning},
  booktitle = {Proceedings of the 5th Workshop on Representation Learning for NLP},
  date      = {2020},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2020.repl4nlp-1.18},
  url       = {https://doi.org/10.18653/v1/2020.repl4nlp-1.18},
  source    = {Crossref},
}

% Gou et al. (2021), International Journal of Computer Vision 129(6).
@article{gou2021knowledge,
  author    = {Gou, Jianping and Yu, Baosheng and Maybank, Stephen J. and Tao, Dacheng},
  title     = {Knowledge Distillation: A Survey},
  journal   = {International Journal of Computer Vision},
  volume    = {129},
  number    = {6},
  pages     = {1789--1819},
  date      = {2021-03-22},
  publisher = {Springer Science and Business Media LLC},
  issn      = {0920-5691,1573-1405},
  doi       = {10.1007/s11263-021-01453-z},
  url       = {https://doi.org/10.1007/s11263-021-01453-z},
  source    = {Crossref},
}

% Ivy Gu (2023), Medium blog post, accessed 2023-10-20.
% The title holds only the post title; the author is carried by the author field and
% the hosting site by the URL.
@misc{gu2023deep,
  title = {Deep Learning Model Compression (ii)},
  author = {Gu, Ivy},
  year = {2023},
  url = {https://ivygdy.medium.com/deep-learning-model-compression-ii-546352ea9453},
  urldate = {2023-10-20},
  bdsk-url-1 = {https://ivygdy.medium.com/deep-learning-model-compression-ii-546352ea9453},
}

% Gupta et al. (2015), International Conference on Machine Learning, pp. 1737--1746.
@inproceedings{gupta2015deep,
  author       = {Gupta, Suyog and Agrawal, Ankur and Gopalakrishnan, Kailash and
                  Narayanan, Pritish},
  title        = {Deep learning with limited numerical precision},
  booktitle    = {International conference on machine learning},
  pages        = {1737--1746},
  year         = {2015},
  organization = {PMLR},
}

% Han, Pool, Tran, Dally (2015): NeurIPS paper, pp. 1135--1143, with arXiv eprint
% 1506.02626 (cs.NE). Recorded as a proceedings paper; the arXiv identifier lives in
% eprint rather than in a CoRR pseudo-journal and volume.
@inproceedings{Han2015,
  title = {Learning both Weights and Connections for Efficient Neural Networks},
  author = {Han, Song and Pool, Jeff and Tran, John and Dally, William J.},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  pages = {1135--1143},
  url = {http://arxiv.org/abs/1506.02626v3},
  date = {2015-06-08},
  primaryclass = {cs.NE},
  archiveprefix = {arXiv},
  eprint = {1506.02626},
  source = {DBLP},
}

% Han, Mao, Dally (2015): Deep Compression, arXiv preprint 1510.00149 (cs.CV).
% The Han2016 key in this file cites the same title with ICLR as the venue.
@article{han2015deep,
  title = {
    Deep Compression: Compressing Deep Neural Networks with Pruning, Trained Quantization and
    Huffman Coding
  },
  author = {Han, Song and Mao, Huizi and Dally, William J.},
  journal = {arXiv preprint arXiv:1510.00149},
  url = {http://arxiv.org/abs/1510.00149v5},
  date = {2015-10-01},
  primaryclass = {cs.CV},
  archiveprefix = {arXiv},
  eprint = {1510.00149},
}

% Han, Mao, Dally (2016): Deep Compression, cited with ICLR as the venue.
@article{Han2016,
  author  = {Han, Song and Mao, Huizi and Dally, William J.},
  title   = {Deep Compression: Compressing Deep Neural Networks with Pruning, Trained
             Quantization and Huffman Coding},
  journal = {International Conference on Learning Representations (ICLR)},
  year    = {2016},
}

% Hawks et al. (2021), Frontiers in Artificial Intelligence vol. 4.
@article{hawks2021psandqs,
  author    = {Hawks, Benjamin and Duarte, Javier and Fraser, Nicholas J. and
               Pappalardo, Alessandro and Tran, Nhan and Umuroglu, Yaman},
  title     = {Ps and Qs: Quantization-Aware Pruning for Efficient Low Latency Neural Network Inference},
  journal   = {Frontiers in Artificial Intelligence},
  volume    = {4},
  date      = {2021-07-09},
  publisher = {Frontiers Media SA},
  issn      = {2624-8212},
  doi       = {10.3389/frai.2021.676564},
  url       = {https://doi.org/10.3389/frai.2021.676564},
  source    = {Crossref},
}

% He, Zhang, Ren, Sun: ResNet, CVPR 2016 proceedings paper (pp. 770--778) with arXiv
% eprint 1512.03385 (cs.CV). Recorded as a proceedings paper because the container is a
% conference booktitle; the date follows the proceedings year given by the DOI.
@inproceedings{he2016deep,
  title = {Deep Residual Learning for Image Recognition},
  author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  pages = {770--778},
  doi = {10.1109/CVPR.2016.90},
  url = {http://arxiv.org/abs/1512.03385v1},
  date = {2016},
  primaryclass = {cs.CV},
  archiveprefix = {arXiv},
  eprint = {1512.03385},
}

% He et al. (2018): AMC, in the Springer ECCV 2018 proceedings volume, pp. 815--832.
% Only the proceedings volume is kept as the container; the dash in the booktitle is
% written as ASCII "--" to match the rest of this file.
@incollection{He2018,
  title = {AMC: AutoML for Model Compression and Acceleration on Mobile Devices},
  author = {He, Yihui and Lin, Ji and Liu, Zhijian and Wang, Hanrui and Li, Li-Jia and Han, Song},
  booktitle = {Computer Vision -- ECCV 2018},
  publisher = {Springer International Publishing},
  pages = {815--832},
  doi = {10.1007/978-3-030-01234-2\_48},
  isbn = {9783030012335,9783030012342},
  issn = {0302-9743,1611-3349},
  url = {https://doi.org/10.1007/978-3-030-01234-2\_48},
  source = {Crossref},
  date = {2018},
}

% NOTE(review): the metadata in this entry looks mismatched. The key and the bdsk-url-1
% field point to a blog post on separable convolutions accessed on 2023-10-20, while
% title, author, booktitle and DOI describe a 2017 IntelliSys conference paper by Ghosh.
% Presumably an automated DOI lookup replaced the original web reference; confirm which
% source the chapter text intends and correct the fields accordingly.
@inproceedings{hegde2023introduction,
  title = {Towards a new interpretation of separable convolutions},
  author = {Ghosh, Tapabrata},
  booktitle = {2017 Intelligent Systems Conference (IntelliSys)},
  publisher = {IEEE},
  pages = {112--116},
  doi = {10.1109/intellisys.2017.8324241},
  url = {https://doi.org/10.1109/intellisys.2017.8324241},
  urldate = {2023-10-20},
  source = {Crossref},
  date = {2017-09},
  bdsk-url-1 = {https://www.analyticsvidhya.com/blog/2021/11/an-introduction-to-separable-convolutions/},
}

% Hinton, Vinyals, Dean: knowledge distillation, NIPS workshop paper with arXiv eprint
% 1503.02531 (stat.ML). Recorded as a proceedings paper because the container is a
% workshop booktitle; identifiers are limited to the arXiv record.
@inproceedings{hinton2015distilling,
  title = {Distilling the Knowledge in a Neural Network},
  author = {Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeff},
  booktitle = {NIPS Deep Learning and Representation Learning Workshop},
  url = {http://arxiv.org/abs/1503.02531v1},
  date = {2015-03-09},
  primaryclass = {stat.ML},
  archiveprefix = {arXiv},
  eprint = {1503.02531},
}

% Hoefler et al. (2021): sparsity survey, arXiv preprint 2102.00554 (cs.LG).
% This key cites the preprint, so it carries no journal volume or page range; the
% journal version is cited under the hoefler2021sparsity key in this file.
@article{Hoefler2021,
  title = {
    Sparsity in Deep Learning: Pruning and growth for efficient inference and training in neural
    networks
  },
  author = {Hoefler, Torsten and Alistarh, Dan and Ben-Nun, Tal and Dryden, Nikoli and Peste, Alexandra},
  journal = {arXiv preprint arXiv:2102.00554},
  url = {http://arxiv.org/abs/2102.00554v1},
  date = {2021-01-31},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2102.00554},
}

% Hoefler et al. (2021): sparsity survey, Journal of Machine Learning Research 22(241).
% The author list matches the arXiv record of the same work cited under the Hoefler2021
% key in this file.
@article{hoefler2021sparsity,
  title = {
    Sparsity in Deep Learning: Pruning and growth for efficient inference and training in neural
    networks
  },
  author = {Hoefler, Torsten and Alistarh, Dan and Ben-Nun, Tal and Dryden, Nikoli and Peste, Alexandra},
  year = {2021},
  journal = {Journal of Machine Learning Research},
  volume = {22},
  number = {241},
  pages = {1--124},
}

% Howard et al. (2017): MobileNets, arXiv preprint 1704.04861 (cs.CV).
% The arXiv identifier lives in eprint rather than in a booktitle or pseudo-volume.
@article{howard2017mobilenets,
  title = {MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications},
  author = {
    Howard, Andrew G. and Zhu, Menglong and Chen, Bo and Kalenichenko, Dmitry and Wang, Weijun and
    Weyand, Tobias and Andreetto, Marco and Adam, Hartwig
  },
  journal = {ArXiv preprint},
  url = {http://arxiv.org/abs/1704.04861v1},
  date = {2017-04-17},
  primaryclass = {cs.CV},
  archiveprefix = {arXiv},
  eprint = {1704.04861},
}

% Hu et al. (2021): LoRA, arXiv preprint 2106.09685 (cs.CL).
@article{hu2021lora,
  title = {LoRA: Low-Rank Adaptation of Large Language Models},
  author = {
    Hu, Edward J. and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and
    Wang, Shean and Wang, Lu and Chen, Weizhu
  },
  journal = {arXiv preprint arXiv:2106.09685},
  url = {http://arxiv.org/abs/2106.09685v2},
  date = {2021-06-17},
  primaryclass = {cs.CL},
  archiveprefix = {arXiv},
  eprint = {2106.09685},
}

% Hu, Chen, Wang, Wang: "Triple Wins" input-adaptive inference paper, ICLR 2020.
% NOTE(review): the authors and venue previously recorded under this key did not match
% this title; verify the corrected metadata against the published paper.
@inproceedings{hu2021triple,
  title = {
    Triple Wins: Boosting Accuracy, Robustness and Efficiency Together by Enabling Input-Adaptive
    Inference
  },
  author = {Hu, Ting-Kuei and Chen, Tianlong and Wang, Haotao and Wang, Zhangyang},
  year = {2020},
  booktitle = {International Conference on Learning Representations (ICLR)},
}

% Huang, Chen, Zhang (2023), IEEE Transactions on Intelligent Transportation Systems;
% no volume, pages or DOI recorded.
@article{huang2023adaptive,
  author    = {Huang, Wei and Chen, Jie and Zhang, Lei},
  title     = {Adaptive Neural Networks for Real-Time Processing in Autonomous Systems},
  journal   = {IEEE Transactions on Intelligent Transportation Systems},
  year      = {2023},
  publisher = {IEEE},
}

% Hubara et al.: Quantized Neural Networks, JMLR vol. 18, pp. 1--30.
@article{Hubara2018,
  author  = {Hubara, Itay and Courbariaux, Matthieu and Soudry, Daniel and El-Yaniv, Ran and
             Bengio, Yoshua},
  title   = {Quantized Neural Networks: Training Neural Networks with Low Precision Weights and Activations},
  journal = {Journal of Machine Learning Research (JMLR)},
  volume  = {18},
  pages   = {1--30},
  year    = {2018},
}

% "Automated Machine Learning: Methods, Systems, Challenges" (Springer, 2019).
% This is the edited volume whose chapters are cited elsewhere in this file as
% incollection entries, so Hutter, Kotthoff and Vanschoren are recorded as editors.
@book{Hutter2019,
  title = {Automated Machine Learning},
  editor = {Hutter, Frank and Kotthoff, Lars and Vanschoren, Joaquin},
  publisher = {Springer International Publishing},
  doi = {10.1007/978-3-030-05318-5},
  isbn = {9783030053178,9783030053185},
  issn = {2520-131X,2520-1328},
  url = {https://doi.org/10.1007/978-3-030-05318-5},
  source = {Crossref},
  subtitle = {Methods, Systems, Challenges},
  date = {2019},
}

% Iandola et al. (2016): SqueezeNet, arXiv preprint 1602.07360 (cs.CV).
% The arXiv identifier lives in eprint rather than in a pseudo-volume.
@article{iandola2016squeezenet,
  title = {SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size},
  author = {
    Iandola, Forrest N. and Han, Song and Moskewicz, Matthew W. and Ashraf, Khalid and Dally,
    William J. and Keutzer, Kurt
  },
  journal = {ArXiv preprint},
  url = {http://arxiv.org/abs/1602.07360v4},
  date = {2016-02-24},
  primaryclass = {cs.CV},
  archiveprefix = {arXiv},
  eprint = {1602.07360},
}

% Intel Labs Distiller documentation page on knowledge distillation, accessed 2023-10-20.
% NOTE(review): this key previously resolved to an unrelated preprint dated after the
% access date; the page title below is presumed from the documentation site, so confirm it.
@misc{intellabs2023knowledge,
  title = {Knowledge Distillation - Neural Network Distiller},
  author = {{Intel Labs}},
  year = {2023},
  url = {https://intellabs.github.io/distiller/knowledge\_distillation.html},
  urldate = {2023-10-20},
  bdsk-url-1 = {https://intellabs.github.io/distiller/knowledge\_distillation.html},
}

% Horowitz (2014), ISSCC Digest of Technical Papers.
@inproceedings{isscc2014computings,
  author     = {Horowitz, Mark},
  title      = {1.1 Computing's energy problem (and what we can do about it)},
  booktitle  = {2014 IEEE International Solid-State Circuits Conference Digest of Technical Papers (ISSCC)},
  date       = {2014-02},
  publisher  = {IEEE},
  doi        = {10.1109/isscc.2014.6757323},
  url        = {https://doi.org/10.1109/isscc.2014.6757323},
  urldate    = {2014-03-06},
  source     = {Crossref},
  bdsk-url-1 = {https://ieeexplore.ieee.org/document/6757323},
}

@inproceedings{Jacob2018,
  title = {Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference},
  author = {
    Jacob, Benoit and Kligys, Skirmantas and Chen, Bo and Zhu, Menglong and Tang, Matthew and
    Howard, Andrew and Adam, Hartwig and Kalenichenko, Dmitry
  },
  booktitle = {2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  publisher = {IEEE},
  pages = {2704--2713},
  doi = {10.1109/cvpr.2018.00286},
  url = {https://doi.org/10.1109/cvpr.2018.00286},
  source = {Crossref},
  date = {2018-06},
}

@inproceedings{jacob2018quantization,
  title = {Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference},
  author = {
    Jacob, Benoit and Kligys, Skirmantas and Chen, Bo and Zhu, Menglong and Tang, Matthew and
    Howard, Andrew and Adam, Hartwig and Kalenichenko, Dmitry
  },
  booktitle = {2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  publisher = {IEEE},
  pages = {2704--2713},
  doi = {10.1109/cvpr.2018.00286},
  url = {https://doi.org/10.1109/cvpr.2018.00286},
  source = {Crossref},
  date = {2018-06},
}

@inproceedings{jia2016dynamic,
  title = {Dynamic Filter Networks},
  author = {Jia, Xu and De Brabandere, Bert and Tuytelaars, Tinne and Van Gool, Luc},
  year = {2016},
  booktitle = {Advances in Neural Information Processing Systems},
  volume = {29},
}

@article{jiang2019accuracy,
  title = {
    Accuracy vs. Efficiency: Achieving Both through FPGA-Implementation Aware Neural Architecture
    Search
  },
  author = {
    Jiang, Weiwen and Zhang, Xinyi and Sha, Edwin H. -M. and Yang, Lei and Zhuge, Qingfeng and Shi,
    Yiyu and Hu, Jingtong
  },
  url = {http://arxiv.org/abs/1901.11211v1},
  date = {2019-01-31},
  primaryclass = {cs.DC},
  archiveprefix = {arXiv},
  eprint = {1901.11211},
}

@inproceedings{jiao2020tinybert,
  author    = {
    Jiao, Xiaoqi and Yin, Yichun and Shang, Lifeng and Jiang, Xin and Chen, Xiao and Li, Linlin and
    Wang, Fang and Liu, Qun
  },
  title     = {TinyBERT: Distilling BERT for Natural Language Understanding},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2020},
  publisher = {Association for Computational Linguistics},
  date      = {2020},
  doi       = {10.18653/v1/2020.findings-emnlp.372},
  url       = {https://doi.org/10.18653/v1/2020.findings-emnlp.372},
  source    = {Crossref},
}

@inproceedings{jonathan2019lottery,
  title = {The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks},
  author = {Frankle, Jonathan and Carbin, Michael},
  booktitle = {
    7th International Conference on Learning Representations, ICLR 2019, New Orleans, LA, USA, May
    6-9, 2019
  },
  publisher = {OpenReview.net},
  url = {http://arxiv.org/abs/1803.03635v5},
  date = {2019},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1803.03635},
  source = {DBLP},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl = {https://dblp.org/rec/conf/iclr/FrankleC19.bib},
  timestamp = {Thu, 25 Jul 2019 01:00:00 +0200},
}

@inproceedings{Joulin2017,
  title = {Bag of Tricks for Efficient Text Classification},
  author = {Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
  booktitle = {
    Proceedings of the 15th Conference of the European Chapter of the Association for Computational
    Linguistics: Volume 2, Short Papers
  },
  publisher = {Association for Computational Linguistics},
  pages = {427--431},
  doi = {10.18653/v1/e17-2068},
  url = {https://doi.org/10.18653/v1/e17-2068},
  source = {Crossref},
  date = {2017},
}

@inproceedings{jouppi2017datacenter,
  title = {In-Datacenter Performance Analysis of a Tensor Processing Unit},
  author = {
    Jouppi, Norman P. and Young, Cliff and Patil, Nishant and Patterson, David and Agrawal, Gaurav
    and Bajwa, Raminder and Bates, Sarah and Bhatia, Suresh and Boden, Nan and Borchers, Al and
    Boyle, Rick and Cantin, Pierre-luc and Chao, Clifford and Clark, Chris and Coriell, Jeremy and
    Daley, Mike and Dau, Matt and Dean, Jeffrey and Gelb, Ben and Ghaemmaghami, Tara Vazir and
    Gottipati, Rajendra and Gulland, William and Hagmann, Robert and Ho, C. Richard and Hogberg,
    Doug and Hu, John and Hundt, Robert and Hurt, Dan and Ibarz, Julian and Jaffey, Aaron and
    Jaworski, Alek and Kaplan, Alexander and Khaitan, Harshit and Killebrew, Daniel and Koch, Andy
    and Kumar, Naveen and Lacy, Steve and Laudon, James and Law, James and Le, Diemthu and Leary,
    Chris and Liu, Zhuyuan and Lucke, Kyle and Lundin, Alan and MacKean, Gordon and Maggiore,
    Adriana and Mahony, Maire and Miller, Kieran and Nagarajan, Rahul and Narayanaswami, Ravi and
    Ni, Ray and Nix, Kathy and Norrie, Thomas and Omernick, Mark and Penukonda, Narayana and
    Phelps, Andy and Ross, Jonathan and Ross, Matt and Salek, Amir and Samadiani, Emad and Severn,
    Chris and Sizikov, Gregory and Snelham, Matthew and Souter, Jed and Steinberg, Dan and Swing,
    Andy and Tan, Mercedes and Thorson, Gregory and Tian, Bo and Toma, Horia and Tuttle, Erick and
    Vasudevan, Vijay and Walter, Richard and Wang, Walter and Wilcox, Eric and Yoon, Doe Hyun
  },
  booktitle = {Proceedings of the 44th Annual International Symposium on Computer Architecture},
  publisher = {ACM},
  pages = {1--12},
  doi = {10.1145/3079856.3080246},
  url = {https://doi.org/10.1145/3079856.3080246},
  source = {Crossref},
  date = {2017-06-24},
}

@inproceedings{Jouppi2021,
  title = {Ten Lessons From Three Generations Shaped Google's TPUv4i: Industrial Product},
  author = {
    Jouppi, Norman P. and Yoon, Doe Hyun and Ashcraft, Matthew and Gottscho, Mark and Jablin,
    Thomas B. and Kurian, George and Laudon, James and Li, Sheng and Ma, Peter and Ma, Xiaoyu and
    Norrie, Thomas and Patil, Nishant and Prasad, Sushma and Young, Cliff and Zhou, Zongwei and
    Patterson, David
  },
  booktitle = {2021 ACM/IEEE 48th Annual International Symposium on Computer Architecture (ISCA)},
  publisher = {IEEE},
  pages = {1--14},
  doi = {10.1109/isca52012.2021.00010},
  url = {https://doi.org/10.1109/isca52012.2021.00010},
  source = {Crossref},
  date = {2021-06},
}

@article{koren2009matrix,
  author    = {Koren, Yehuda and Bell, Robert and Volinsky, Chris},
  title     = {Matrix Factorization Techniques for Recommender Systems},
  journal   = {Computer},
  publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
  volume    = {42},
  number    = {8},
  pages     = {30--37},
  date      = {2009-08},
  doi       = {10.1109/mc.2009.263},
  issn      = {0018-9162},
  url       = {https://doi.org/10.1109/mc.2009.263},
  source    = {Crossref},
}

@article{krishna2023raman,
  title = {RAMAN: A Re-configurable and Sparse tinyML Accelerator for Inference on Edge},
  author = {
    Krishna, Adithya and Nudurupati, Srikanth Rohit and G, Chandana D and Dwivedi, Pritesh and van
    Schaik, Andr{\'e} and Mehendale, Mahesh and Thakur, Chetan Singh
  },
  url = {http://arxiv.org/abs/2306.06493v1},
  date = {2023-06-10},
  primaryclass = {cs.NE},
  archiveprefix = {arXiv},
  eprint = {2306.06493},
}

@article{krishnamoorthi2018quantizing,
  title = {Quantizing deep convolutional networks for efficient inference: A whitepaper},
  author = {Krishnamoorthi, Raghuraman},
  journal = {arXiv preprint arXiv:1806.08342},
  url = {http://arxiv.org/abs/1806.08342v1},
  date = {2018-06-21},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1806.08342},
}

@article{kullback1951information,
  author    = {Kullback, S. and Leibler, R. A.},
  title     = {On Information and Sufficiency},
  journal   = {The Annals of Mathematical Statistics},
  publisher = {Institute of Mathematical Statistics},
  volume    = {22},
  number    = {1},
  pages     = {79--86},
  date      = {1951-03},
  doi       = {10.1214/aoms/1177729694},
  issn      = {0003-4851},
  url       = {https://doi.org/10.1214/aoms/1177729694},
  source    = {Crossref},
}

@article{kuzmin2022fp8,
  title = {FP8 Quantization: The Power of the Exponent},
  author = {
    Kuzmin, Andrey and van Baalen, Mart and Ren, Yuwei and Nagel, Markus and Peters, Jorn and
    Blankevoort, Tijmen
  },
  url = {http://arxiv.org/abs/2208.09225v2},
  date = {2022-08-19},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2208.09225},
}

@article{kwon2021hardwaresoftware,
  author         = {Kwon, Jisu and Park, Daejin},
  title          = {
    Hardware/Software Co-Design for TinyML Voice-Recognition Application on Resource Frugal Edge
    Devices
  },
  journal        = {Applied Sciences},
  publisher      = {MDPI AG},
  volume         = {11},
  number         = {22},
  pages          = {11073},
  article-number = {11073},
  date           = {2021-11-22},
  doi            = {10.3390/app112211073},
  issn           = {2076-3417},
  url            = {https://doi.org/10.3390/app112211073},
  source         = {Crossref},
  bdsk-url-1     = {https://www.mdpi.com/2076-3417/11/22/11073},
  bdsk-url-2     = {https://doi.org/10.3390/app112211073},
}

@article{lai2018cmsisnn,
  author        = {Lai, Liangzhen and Suda, Naveen and Chandra, Vikas},
  title         = {CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs},
  date          = {2018-01-19},
  eprint        = {1801.06601},
  archiveprefix = {arXiv},
  primaryclass  = {cs.NE},
  url           = {http://arxiv.org/abs/1801.06601v1},
}

@inproceedings{lecun1990optimal,
  title = {Optimal Brain Damage},
  author = {LeCun, Yann and Denker, John S. and Solla, Sara A.},
  year = {1989},
  booktitle = {Advances in Neural Information Processing Systems 2 (NIPS 1989)},
  pages = {598--605},
  url = {http://papers.nips.cc/paper/250-optimal-brain-damage},
  source = {DBLP},
}

@inproceedings{lepikhin2020gshard,
  author    = {Lepikhin, Dmitry and others},
  title     = {GShard: Scaling Giant Models with Conditional Computation},
  booktitle = {Proceedings of the International Conference on Learning Representations},
  year      = {2020},
}

@article{Li2016,
  author        = {Li, Fengfu and Liu, Bin and Wang, Xiaoxing and Zhang, Bo and Yan, Junchi},
  title         = {Ternary Weight Networks},
  journal       = {arXiv preprint},
  date          = {2016-05-16},
  eprint        = {1605.04711},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CV},
  url           = {http://arxiv.org/abs/1605.04711v3},
}

@article{Li2021,
  title = {Hyperband: A Novel Bandit-Based Approach to Hyperparameter Optimization},
  author = {
    Li, Lisha and Jamieson, Kevin G. and DeSalvo, Giulia and Rostamizadeh, Afshin and Talwalkar,
    Ameet
  },
  year = {2017},
  journal = {Journal of Machine Learning Research},
  volume = {18},
  number = {185},
  pages = {1--52},
  url = {https://jmlr.org/papers/v18/16-558.html},
  source = {DBLP},
}

@inproceedings{lin2020mcunet,
  author    = {Lin, Ji and Chen, Wei-Ming and Lin, Yujun and Cohn, John and Gan, Chuang and Han, Song},
  title     = {MCUNet: Tiny Deep Learning on IoT Devices},
  booktitle = {
    Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information
    Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual
  },
  editor    = {
    Larochelle, Hugo and Ranzato, Marc'Aurelio and Hadsell, Raia and Balcan, Maria-Florina and Lin,
    Hsuan-Tien
  },
  year      = {2020},
  url       = {https://proceedings.neurips.cc/paper/2020/hash/86c51678350f656dcc7f490a43946ee5-Abstract.html},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://dblp.org/rec/conf/nips/LinCLCG020.bib},
  timestamp = {Thu, 11 Feb 2021 00:00:00 +0100},
}

@article{lin2023awq,
  title = {AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration},
  author = {
    Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Chen, Wei-Ming and Wang,
    Wei-Chen and Xiao, Guangxuan and Dang, Xingyu and Gan, Chuang and Han, Song
  },
  journal = {arXiv preprint arXiv:2306.00978},
  url = {http://arxiv.org/abs/2306.00978v5},
  date = {2023-06-01},
  primaryclass = {cs.CL},
  archiveprefix = {arXiv},
  eprint = {2306.00978},
}

@article{lu2023steplearningnmstructured,
  title = {STEP: Learning N:M Structured Sparsity Masks from Scratch with Precondition},
  author = {
    Lu, Yucheng and Agrawal, Shivani and Subramanian, Suvinay and Rybakov, Oleg and De Sa,
    Christopher and Yazdanbakhsh, Amir
  },
  url = {http://arxiv.org/abs/2302.01172v1},
  date = {2023-02-02},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2302.01172},
}

@article{lubana2020gradient,
  title = {A Gradient Flow Framework For Analyzing Network Pruning},
  author = {Lubana, Ekdeep Singh and Dick, Robert P.},
  journal = {arXiv preprint arXiv:2009.11839},
  url = {http://arxiv.org/abs/2009.11839v4},
  date = {2020-09-24},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2009.11839},
}

@article{mellempudi2019mixed,
  title = {Mixed Precision Training With 8-bit Floating Point},
  author = {Mellempudi, Naveen and Srinivasan, Sudarshan and Das, Dipankar and Kaul, Bharat},
  journal = {arXiv preprint arXiv:1905.12334},
  url = {http://arxiv.org/abs/1905.12334v1},
  date = {2019-05-29},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1905.12334},
}

@article{micikevicius2018mixed,
  title = {Mixed Precision Training},
  author = {
    Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich
    and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaiev, Oleksii and Venkatesh,
    Ganesh and Wu, Hao
  },
  journal = {arXiv preprint arXiv:1710.03740},
  url = {http://arxiv.org/abs/1710.03740v3},
  date = {2017-10-10},
  primaryclass = {cs.AI},
  archiveprefix = {arXiv},
  eprint = {1710.03740},
}

@article{micikevicius2022fp8,
  title = {FP8 Formats for Deep Learning},
  author = {
    Micikevicius, Paulius and Stosic, Dusan and Burgess, Neil and Cornea, Marius and Dubey, Pradeep
    and Grisenthwaite, Richard and Ha, Sangwon and Heinecke, Alexander and Judd, Patrick and
    Kamalu, John and Mellempudi, Naveen and Oberman, Stuart and Shoeybi, Mohammad and Siu, Michael
    and Wu, Hao
  },
  journal = {arXiv preprint arXiv:2209.05433},
  url = {http://arxiv.org/abs/2209.05433v2},
  date = {2022-09-12},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2209.05433},
}

@article{nagel2021white,
  title = {A White Paper on Neural Network Quantization},
  author = {
    Nagel, Markus and Fournarakis, Marios and Amjad, Rana Ali and Bondarenko, Yelysei and van
    Baalen, Mart and Blankevoort, Tijmen
  },
  journal = {arXiv preprint arXiv:2106.08295},
  url = {http://arxiv.org/abs/2106.08295v1},
  date = {2021-06-15},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2106.08295},
}

@article{nagel2021whitepaper,
  title = {A White Paper on Neural Network Quantization},
  author = {
    Nagel, Markus and Fournarakis, Marios and Amjad, Rana Ali and Bondarenko, Yelysei and van
    Baalen, Mart and Blankevoort, Tijmen
  },
  journal = {arXiv preprint arXiv:2106.08295},
  url = {http://arxiv.org/abs/2106.08295v1},
  date = {2021-06-15},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2106.08295},
}

@inproceedings{NVIDIA2020,
  author    = {Abdelkhalik, Hamdy and Arafa, Yehia and Santhi, Nandakishore and Badawy, Abdel-Hameed A.},
  title     = {
    Demystifying the Nvidia Ampere Architecture through Microbenchmarking and Instruction-level
    Analysis
  },
  booktitle = {2022 IEEE High Performance Extreme Computing Conference (HPEC)},
  publisher = {IEEE},
  date      = {2022-09-19},
  doi       = {10.1109/hpec55821.2022.9926299},
  url       = {https://doi.org/10.1109/hpec55821.2022.9926299},
  source    = {Crossref},
}

@article{patterson2021carbon,
  title = {Carbon Emissions and Large Neural Network Training},
  author = {
    Patterson, David and Gonzalez, Joseph and Le, Quoc and Liang, Chen and Munguia, Lluis-Miquel
    and Rothchild, Daniel and So, David and Texier, Maud and Dean, Jeff
  },
  journal = {arXiv preprint arXiv:2104.10350},
  url = {http://arxiv.org/abs/2104.10350v3},
  date = {2021-04-21},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2104.10350},
}

@inproceedings{prakash2022cfu,
  title = {
    CFU Playground: Full-Stack Open-Source Framework for Tiny Machine Learning (TinyML)
    Acceleration on FPGAs
  },
  author = {
    Prakash, Shvetank and Callahan, Tim and Bushagour, Joseph and Banbury, Colby and Green, Alan V.
    and Warden, Pete and Ansell, Tim and Reddi, Vijay Janapa
  },
  booktitle = {2023 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  publisher = {IEEE},
  pages = {157--167},
  doi = {10.1109/ispass57527.2023.00024},
  url = {https://doi.org/10.1109/ispass57527.2023.00024},
  source = {Crossref},
  date = {2023-04},
  archiveprefix = {arXiv},
  eprint = {2201.01863},
}

@article{qi2021efficient,
  author    = {
    Qi, Chen and Shen, Shibo and Li, Rongpeng and Zhao, Zhifeng and Liu, Qing and Liang, Jing and
    Zhang, Honggang
  },
  title     = {An efficient pruning scheme of deep neural networks for Internet of Things applications},
  journal   = {EURASIP Journal on Advances in Signal Processing},
  publisher = {Springer Science and Business Media LLC},
  volume    = {2021},
  number    = {1},
  pages     = {31},
  date      = {2021-06-29},
  doi       = {10.1186/s13634-021-00744-4},
  issn      = {1687-6180},
  url       = {https://doi.org/10.1186/s13634-021-00744-4},
  source    = {Crossref},
}

@inproceedings{rachwan2022winning,
  title = {Winning the lottery ahead of time: Efficient early network pruning},
  author = {
    Rachwan, John and Z{\"u}gner, Daniel and Charpentier, Bertrand and Geisler, Simon and Ayle,
    Morgane and G{\"u}nnemann, Stephan
  },
  year = {2022},
  booktitle = {International Conference on Machine Learning},
  series = {Proceedings of Machine Learning Research},
  volume = {162},
  pages = {18293--18309},
  publisher = {PMLR},
}

@inproceedings{radosavovic2020designing,
  title = {Designing Network Design Spaces},
  author = {
    Radosavovic, Ilija and Kosaraju, Raj Prateek and Girshick, Ross and He, Kaiming and
    Doll{\'a}r, Piotr
  },
  booktitle = {2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  publisher = {IEEE},
  pages = {10428--10436},
  doi = {10.1109/cvpr42600.2020.01044},
  url = {https://doi.org/10.1109/cvpr42600.2020.01044},
  source = {Crossref},
  date = {2020-06},
}

@incollection{Rastegari2016,
  title = {XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks},
  author = {Rastegari, Mohammad and Ordonez, Vicente and Redmon, Joseph and Farhadi, Ali},
  booktitle = {Computer Vision – ECCV 2016},
  publisher = {Springer International Publishing},
  pages = {525--542},
  doi = {10.1007/978-3-319-46493-0_32},
  isbn = {9783319464923,9783319464930},
  issn = {0302-9743,1611-3349},
  url = {https://doi.org/10.1007/978-3-319-46493-0_32},
  source = {Crossref},
  date = {2016},
}

@article{Real2019,
  author    = {Real, Esteban and Aggarwal, Alok and Huang, Yanping and Le, Quoc V.},
  title     = {Regularized Evolution for Image Classifier Architecture Search},
  journal   = {Proceedings of the AAAI Conference on Artificial Intelligence},
  publisher = {Association for the Advancement of Artificial Intelligence (AAAI)},
  volume    = {33},
  number    = {01},
  pages     = {4780--4789},
  date      = {2019-07-17},
  doi       = {10.1609/aaai.v33i01.33014780},
  issn      = {2374-3468,2159-5399},
  url       = {https://doi.org/10.1609/aaai.v33i01.33014780},
  source    = {Crossref},
}

@article{real2019regularized,
  title = {Regularized Evolution for Image Classifier Architecture Search},
  author = {Real, Esteban and Aggarwal, Alok and Huang, Yanping and Le, Quoc V.},
  journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
  publisher = {Association for the Advancement of Artificial Intelligence (AAAI)},
  volume = {33},
  number = {01},
  pages = {4780--4789},
  doi = {10.1609/aaai.v33i01.33014780},
  issn = {2374-3468,2159-5399},
  url = {https://doi.org/10.1609/aaai.v33i01.33014780},
  source = {Crossref},
  date = {2019-07-17},
}

@inproceedings{sabour2017dynamic,
  title = {Dynamic Routing Between Capsules},
  author = {Sabour, Sara and Frosst, Nicholas and Hinton, Geoffrey E.},
  year = {2017},
  booktitle = {Advances in Neural Information Processing Systems},
  volume = {30},
}

@article{sanh2019distilbert,
  title = {DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter},
  author = {Sanh, Victor and Debut, Lysandre and Chaumond, Julien and Wolf, Thomas},
  journal = {arXiv preprint arXiv:1910.01108},
  url = {http://arxiv.org/abs/1910.01108v4},
  date = {2019-10-02},
  primaryclass = {cs.CL},
  archiveprefix = {arXiv},
  eprint = {1910.01108},
}

@article{scardapane2020should,
  title = {Why Should We Add Early Exits to Neural Networks?},
  author = {Scardapane, Simone and Scarpiniti, Michele and Baccarelli, Enzo and Uncini, Aurelio},
  year = {2020},
  journal = {Cognitive Computation},
  publisher = {Springer Science and Business Media LLC},
  volume = {12},
  number = {5},
  pages = {954--966},
  doi = {10.1007/s12559-020-09734-4},
  url = {https://doi.org/10.1007/s12559-020-09734-4},
}

@inproceedings{shazeer2017outrageously,
  title = {Outrageously large neural networks: The sparsely-gated mixture-of-experts layer},
  author = {
    Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and
    Hinton, Geoffrey and Dean, Jeff
  },
  year = {2017},
  booktitle = {International Conference on Learning Representations},
}

@article{sheng2019qbert,
  title = {Q-BERT: Hessian Based Ultra Low Precision Quantization of BERT},
  author = {
    Shen, Sheng and Dong, Zhen and Ye, Jiayu and Ma, Linjian and Yao, Zhewei and Gholami, Amir and
    Mahoney, Michael W. and Keutzer, Kurt
  },
  journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
  publisher = {Association for the Advancement of Artificial Intelligence (AAAI)},
  volume = {34},
  number = {05},
  pages = {8815--8821},
  doi = {10.1609/aaai.v34i05.6409},
  issn = {2374-3468,2159-5399},
  url = {http://arxiv.org/abs/1909.05840v2},
  date = {2019-09-12},
  primaryclass = {cs.CL},
  archiveprefix = {arXiv},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl = {https://dblp.org/rec/journals/corr/abs-1909-05840.bib},
  eprint = {1909.05840},
  timestamp = {Wed, 18 Sep 2019 10:38:36 +0200},
  source = {Crossref},
}

@inproceedings{sun2019patient,
  author    = {Sun, Siqi and Cheng, Yu and Gan, Zhe and Liu, Jingjing},
  title     = {Patient Knowledge Distillation for BERT Model Compression},
  booktitle = {
    Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the
    9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)
  },
  publisher = {Association for Computational Linguistics},
  date      = {2019},
  doi       = {10.18653/v1/d19-1441},
  url       = {https://doi.org/10.18653/v1/d19-1441},
  source    = {Crossref},
}

@inproceedings{Tan2019,
  title = {EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks},
  author = {Tan, Mingxing and Le, Quoc V.},
  year = {2019},
  booktitle = {International Conference on Machine Learning},
  pages = {6105--6114},
}

@inproceedings{tan2019efficientnet,
  title = {EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks},
  author = {Tan, Mingxing and Le, Quoc V.},
  year = {2019},
  booktitle = {International Conference on Machine Learning (ICML)},
  pages = {6105--6114},
}

@inproceedings{tan2019mnasnet,
  author    = {
    Tan, Mingxing and Chen, Bo and Pang, Ruoming and Vasudevan, Vijay and Sandler, Mark and Howard,
    Andrew and Le, Quoc V.
  },
  title     = {MnasNet: Platform-Aware Neural Architecture Search for Mobile},
  booktitle = {2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  publisher = {IEEE},
  pages     = {2815--2823},
  date      = {2019-06},
  doi       = {10.1109/cvpr.2019.00293},
  url       = {https://doi.org/10.1109/cvpr.2019.00293},
  source    = {Crossref},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://dblp.org/rec/conf/cvpr/TanCPVSHL19.bib},
  timestamp = {Tue, 12 Jan 2021 00:00:00 +0100},
}

@article{tan2020efficientnet,
  title = {EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks},
  author = {Tan, Mingxing and Le, Quoc V.},
  url = {http://arxiv.org/abs/1905.11946v5},
  date = {2019-05-28},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1905.11946},
}

@article{tang2020understanding,
  title = {Understanding and Improving Knowledge Distillation},
  author = {
    Tang, Jiaxi and Shivanna, Rakesh and Zhao, Zhe and Lin, Dong and Singh, Anima and Chi, Ed H.
    and Jain, Sagar
  },
  journal = {arXiv preprint arXiv:2002.03532},
  url = {http://arxiv.org/abs/2002.03532},
  date = {2020},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2002.03532},
}

@inproceedings{teerapittayanon2016branchynet,
  title = {BranchyNet: Fast Inference via Early Exiting from Deep Neural Networks},
  author = {Teerapittayanon, Surat and McDanel, Bradley and Kung, H. T.},
  booktitle = {2016 23rd International Conference on Pattern Recognition (ICPR)},
  publisher = {IEEE},
  pages = {2464--2469},
  doi = {10.1109/icpr.2016.7900006},
  url = {http://arxiv.org/abs/1709.01686v1},
  date = {2016-12},
  primaryclass = {cs.NE},
  archiveprefix = {arXiv},
  eprint = {1709.01686},
  source = {Crossref},
}

@inproceedings{Umuroglu2017,
  title = {FINN},
  author = {
    Umuroglu, Yaman and Fraser, Nicholas J. and Gambardella, Giulio and Blott, Michaela and Leong,
    Philip and Jahre, Magnus and Vissers, Kees
  },
  booktitle = {Proceedings of the 2017 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
  publisher = {ACM},
  pages = {65--74},
  doi = {10.1145/3020078.3021744},
  url = {https://doi.org/10.1145/3020078.3021744},
  source = {Crossref},
  subtitle = {A Framework for Fast, Scalable Binarized Neural Network Inference},
  date = {2017-02-22},
}

@article{Vanschoren2019,
  title = {Meta-Learning: A Survey},
  author = {Vanschoren, Joaquin},
  journal = {arXiv preprint arXiv:1810.03548},
  url = {http://arxiv.org/abs/1810.03548v1},
  date = {2018-10-08},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1810.03548},
}

@inproceedings{vaswani2017attention,
  title = {Attention Is All You Need},
  author = {
    Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and
    Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia
  },
  year = {2017},
  booktitle = {Advances in Neural Information Processing Systems},
  volume = {30},
  pages = {5998--6008},
  url = {https://arxiv.org/abs/1706.03762},
  archiveprefix = {arXiv},
  eprint = {1706.03762},
}

@incollection{wang2018skipnet,
  title = {SkipNet: Learning Dynamic Routing in Convolutional Networks},
  author = {Wang, Xin and Yu, Fisher and Dou, Zi-Yi and Darrell, Trevor and Gonzalez, Joseph E.},
  booktitle = {Computer Vision – ECCV 2018},
  publisher = {Springer International Publishing},
  pages = {420--436},
  doi = {10.1007/978-3-030-01261-8_25},
  isbn = {9783030012601,9783030012618},
  issn = {0302-9743,1611-3349},
  url = {https://doi.org/10.1007/978-3-030-01261-8_25},
  source = {Crossref},
  date = {2018},
}

@article{wang2019benchmarking,
  title = {Benchmarking TPU, GPU, and CPU Platforms for Deep Learning},
  author = {Wang, Yu Emma and Wei, Gu-Yeon and Brooks, David},
  journal = {arXiv preprint arXiv:1907.10701},
  url = {http://arxiv.org/abs/1907.10701v4},
  date = {2019-07-24},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {1907.10701},
}

@article{wang2021glam,
  title = {GLaM: Efficient Scaling of Language Models with Mixture-of-Experts},
  author = {
    Du, Nan and Huang, Yanping and Dai, Andrew M. and Tong, Simon and Lepikhin, Dmitry and Xu,
    Yuanzhong and Krikun, Maxim and Zhou, Yanqi and Yu, Adams Wei and Firat, Orhan and others
  },
  journal = {arXiv preprint arXiv:2112.06905},
  url = {http://arxiv.org/abs/2112.06905},
  date = {2021},
  primaryclass = {cs.CL},
  archiveprefix = {arXiv},
  eprint = {2112.06905},
}

@inproceedings{Wu2016,
  title = {Quantized Convolutional Neural Networks for Mobile Devices},
  author = {Wu, Jiaxiang and Leng, Cong and Wang, Yuhang and Hu, Qinghao and Cheng, Jian},
  booktitle = {2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  publisher = {IEEE},
  pages = {4820--4828},
  doi = {10.1109/cvpr.2016.521},
  url = {https://doi.org/10.1109/cvpr.2016.521},
  source = {Crossref},
  date = {2016-06},
}

% NOTE(review): this entry records no DOI, URL, volume, or page range, so the
% reference cannot be confirmed from this file alone. Verify title and authors
% against the NeurIPS 2019 proceedings before relying on it. TODO confirm.
@inproceedings{wu2019fast,
  title = {Fast Neural Networks: Efficient and Adaptive Computation for Inference},
  author = {Wu, Jian and Cheng, Hao and Zhang, Yifan},
  year = {2019},
  booktitle = {Advances in Neural Information Processing Systems},
}

@inproceedings{wu2019fbnet,
  title = {FBNet: Hardware-Aware Efficient ConvNet Design via Differentiable Neural Architecture Search},
  author = {
    Wu, Bichen and Dai, Xiaoliang and Zhang, Peizhao and Wang, Yanghan and Sun, Fei and Wu, Yiming
    and Tian, Yuandong and Vajda, Peter and Jia, Yangqing and Keutzer, Kurt
  },
  booktitle = {2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  publisher = {IEEE},
  pages = {10726--10734},
  doi = {10.1109/cvpr.2019.01099},
  url = {https://doi.org/10.1109/cvpr.2019.01099},
  source = {Crossref},
  date = {2019-06},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl = {https://dblp.org/rec/conf/cvpr/WuDZWSWTVJK19.bib},
  timestamp = {Mon, 20 Jan 2020 00:00:00 +0100},
}

@article{wu2020integer,
  title = {Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation},
  author = {Wu, Hao and Judd, Patrick and Zhang, Xiaojie and Isaev, Mikhail and Micikevicius, Paulius},
  journal = {arXiv preprint arXiv:2004.09602},
  url = {http://arxiv.org/abs/2004.09602v1},
  date = {2020-04-20},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2004.09602},
}

@article{xiao2022smoothquant,
  title = {SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models},
  author = {Xiao, Guangxuan and Lin, Ji and Seznec, Mickael and Wu, Hao and Demouth, Julien and Han, Song},
  journal = {arXiv preprint arXiv:2211.10438},
  url = {http://arxiv.org/abs/2211.10438v7},
  date = {2022-11-18},
  primaryclass = {cs.CL},
  archiveprefix = {arXiv},
  eprint = {2211.10438},
}

% BERxiT: early exiting for BERT (EACL 2021 main volume). The source field marks the record as
% Crossref-derived; the abstract is kept verbatim.
@inproceedings{xin-etal-2021-berxit,
  author    = {Xin, Ji and Tang, Raphael and Yu, Yaoliang and Lin, Jimmy},
  title     = {BERxiT: Early Exiting for BERT with Better Fine-Tuning and Extension to Regression},
  editor    = {Merlo, Paola and Tiedemann, Jorg and Tsarfaty, Reut},
  booktitle = {
    Proceedings of the 16th Conference of the European Chapter of the Association for Computational
    Linguistics: Main Volume
  },
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  pages     = {91--104},
  date      = {2021},
  doi       = {10.18653/v1/2021.eacl-main.8},
  url       = {https://doi.org/10.18653/v1/2021.eacl-main.8},
  source    = {Crossref},
  abstract  = {
    The slow speed of BERT has motivated much research on accelerating its inference, and the early
    exiting idea has been proposed to make trade-offs between model quality and efficiency. This
    paper aims to address two weaknesses of previous work: (1) existing fine-tuning strategies for
    early exiting models fail to take full advantage of BERT; (2) methods to make exiting decisions
    are limited to classification tasks. We propose a more advanced fine-tuning strategy and a
    learning-to-exit module that extends early exiting to tasks other than classification.
    Experiments demonstrate improved early exiting for BERT, with better trade-offs obtained by the
    proposed fine-tuning strategy, successful application to regression tasks, and the possibility
    to combine it with other acceleration methods. Source code can be found at
    <https://github.com/castorini/berxit>.
  },
}

% Medium blog post on reconstructing fluid-dynamics data with tensor decomposition.
% The previous record had been overwritten with Crossref metadata for an unrelated neuroscience
% review (journal, DOI, ISSN, pages), which contradicted both the abstract and the medium.com
% link kept below. Those fields are removed and the entry is typed as a web resource.
% NOTE(review): title, author, year and the full URL were restored from the blog post itself
% and are not derivable from the old record -- confirm them before publishing.
@misc{xinyu,
  title = {Inpainting Fluid Dynamics with Tensor Decomposition (NumPy)},
  author = {Chen, Xinyu},
  year = {2022},
  howpublished = {Medium},
  url = {https://medium.com/@xinyu.chen/inpainting-fluid-dynamics-with-tensor-decomposition-numpy-d84065fead4d},
  abstract = {
    Some simple examples for showing how to use tensor decomposition to reconstruct fluid dynamics
  },
  bdsk-url-1 = {https://medium.com/},
}

% Alternating multi-bit quantization for RNNs (ICLR 2018); record imported from dblp, as shown
% by the bibsource/biburl fields.
@inproceedings{xu2018alternating,
  author    = {
    Xu, Chen and Yao, Jianqiang and Lin, Zhouchen and Ou, Wenwu and Cao, Yuanbin and Wang, Zhirong
    and Zha, Hongbin
  },
  title     = {Alternating Multi-bit Quantization for Recurrent Neural Networks},
  booktitle = {
    6th International Conference on Learning Representations, ICLR 2018, Vancouver, BC, Canada,
    April 30 - May 3, 2018, Conference Track Proceedings
  },
  publisher = {OpenReview.net},
  year      = {2018},
  url       = {https://openreview.net/forum?id=S19dR9x0b},
  timestamp = {Thu, 25 Jul 2019 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/iclr/XuYLOCWZ18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% arXiv preprint (cs.LG) on NAS / ASIC accelerator co-exploration.
% The publisher, pages, doi, isbn and source fields previously attached to this record belonged
% to a Wiley book chapter (DOI prefix 10.1002/9783527667703) and not to this paper, so they are
% removed; the arXiv identifiers are the only metadata that match the title and authors.
@article{yang2020coexploration,
  title = {
    Co-Exploration of Neural Architectures and Heterogeneous ASIC Accelerator Designs Targeting
    Multiple Tasks
  },
  author = {
    Yang, Lei and Yan, Zheyu and Li, Meng and Kwon, Hyoukjun and Lai, Liangzhen and Krishna, Tushar
    and Chandra, Vikas and Jiang, Weiwen and Shi, Yiyu
  },
  url = {http://arxiv.org/abs/2002.04116v1},
  date = {2020-02-10},
  primaryclass = {cs.LG},
  archiveprefix = {arXiv},
  eprint = {2002.04116},
}

% Resolution Adaptive Networks (CVPR 2020). The DOI and page range identify the conference
% paper, so the stray journal field naming a different venue is removed; an inproceedings
% record takes its venue from booktitle only.
@inproceedings{yang2020resolution,
  title = {Resolution Adaptive Networks for Efficient Inference},
  author = {Yang, Le and Han, Yizeng and Chen, Xi and Song, Shiji and Dai, Jifeng and Huang, Gao},
  booktitle = {2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  publisher = {IEEE},
  pages = {2366--2375},
  doi = {10.1109/cvpr42600.2020.00244},
  url = {https://doi.org/10.1109/cvpr42600.2020.00244},
  source = {Crossref},
  date = {2020-06},
}

% HAWQ-V3 (ICML 2021, PMLR volume 139).
% PMLR is the publisher of the proceedings, so it moves from organization (a sponsor field) to
% publisher, with the series and volume recorded separately. The author list is corrected to the
% one on the published paper; the earlier list named a different set of authors.
% NOTE(review): verify the author list against the PMLR v139 proceedings page.
@inproceedings{yao2021hawq,
  title = {HAWQ-V3: Dyadic Neural Network Quantization},
  author = {
    Yao, Zhewei and Dong, Zhen and Zheng, Zhangcheng and Gholami, Amir and Yu, Jiali and Tan, Eric
    and Wang, Leyuan and Huang, Qijing and Wang, Yida and Mahoney, Michael W. and Keutzer, Kurt
  },
  year = {2021},
  booktitle = {Proceedings of the 38th International Conference on Machine Learning (ICML)},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  volume = {139},
  pages = {11875--11886},
}

% Journal article on early-exit strategies, cited under the key yu2023efficient.
% NOTE(review): the record has no volume, pages, DOI, or URL, so it cannot be cross-checked from
% this file -- confirm that the article exists as described before relying on it.
@article{yu2023efficient,
  author    = {Yu, Jun and Li, Peng and Wang, Zhenhua},
  title     = {Efficient Early Exiting Strategies for Neural Network Acceleration},
  journal   = {IEEE Transactions on Neural Networks and Learning Systems},
  publisher = {IEEE},
  year      = {2023},
}

% AutoShrink, published in the Proceedings of the AAAI Conference on Artificial Intelligence
% (volume 34, issue 04). The record is typed as an article with journal/volume/number, so the
% leftover dblp booktitle describing the same venue a second time is removed.
@article{zhang2019autoshrink,
  title = {AutoShrink: A Topology-Aware NAS for Discovering Efficient Neural Architecture},
  author = {
    Zhang, Tunhou and Cheng, Hsin-Pai and Li, Zhenwen and Yan, Feng and Huang, Chengyu and Li, Hai
    and Chen, Yiran
  },
  journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
  publisher = {Association for the Advancement of Artificial Intelligence (AAAI)},
  volume = {34},
  number = {04},
  pages = {6829--6836},
  doi = {10.1609/aaai.v34i04.6163},
  issn = {2374-3468,2159-5399},
  url = {https://doi.org/10.1609/aaai.v34i04.6163},
  source = {Crossref},
  date = {2020-04-03},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl = {https://dblp.org/rec/conf/aaai/ZhangCL0HLC20.bib},
  timestamp = {Tue, 02 Feb 2021 00:00:00 +0100},
}

% Hardware-aware NAS workshop paper (CVPRW 2020); the source field marks the record as
% Crossref-derived. No page range is recorded.
@inproceedings{zhang2020fast,
  author    = {Zhang, Li Lyna and Yang, Yuqing and Jiang, Yuhang and Zhu, Wenwu and Liu, Yunxin},
  title     = {Fast Hardware-Aware Neural Architecture Search},
  booktitle = {2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)},
  publisher = {IEEE},
  date      = {2020-06},
  doi       = {10.1109/cvprw50498.2020.00354},
  url       = {https://doi.org/10.1109/cvprw50498.2020.00354},
  source    = {Crossref},
}

% Journal article on learned sparsity and quantization, cited under the key zhang2021learning.
% NOTE(review): volume, issue and pages are given but there is no DOI or URL to check them
% against -- confirm the reference before relying on it.
@article{zhang2021learning,
  author  = {Zhang, Yi and Yang, Jianlei and Song, Linghao and Shi, Yiyu and Wang, Yu and Xie, Yuan},
  title   = {Learning-based Efficient Sparsity and Quantization for Neural Network Compression},
  journal = {IEEE Transactions on Neural Networks and Learning Systems},
  year    = {2021},
  volume  = {32},
  number  = {9},
  pages   = {3980--3994},
}

% AnalogNets arXiv preprint (cs.AR).
% Compound surnames are written in "Last, First" form so they are not split: Garcia Redondo,
% Timoneda Comas and Le Gallo were previously parsed with part of the family name as a given
% name. The umlaut uses the {\"u} special-character form so sorting and labels treat it as one
% letter.
@article{zhou2021analognets,
  title = {
    AnalogNets: ML-HW Co-Design of Noise-robust TinyML Models and Always-On Analog
    Compute-in-Memory Accelerator
  },
  author = {
    Zhou, Chuteng and Garcia Redondo, Fernando and B{\"u}chel, Julian and Boybat, Irem and
    Timoneda Comas, Xavier and Nandakumar, S. R. and Das, Shidhartha and Sebastian, Abu and
    Le Gallo, Manuel and Whatmough, Paul N.
  },
  url = {http://arxiv.org/abs/2111.06503v1},
  date = {2021-11-10},
  primaryclass = {cs.AR},
  archiveprefix = {arXiv},
  eprint = {2111.06503},
}

% N:M fine-grained structured sparsity, recorded as an arXiv preprint (cs.CV) and identified
% through the eprint/archiveprefix/primaryclass fields.
@article{zhou2021learningnmfinegrainedstructured,
  author        = {
    Zhou, Aojun and Ma, Yukun and Zhu, Junnan and Liu, Jianbo and Zhang, Zhijie and Yuan, Kun and
    Sun, Wenxiu and Li, Hongsheng
  },
  title         = {Learning N:M Fine-grained Structured Sparse Neural Networks From Scratch},
  date          = {2021-02-08},
  eprint        = {2102.04010},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CV},
  url           = {http://arxiv.org/abs/2102.04010v2},
}

% Trained Ternary Quantization (ICLR 2017). ICLR is a conference, so the record is typed as
% inproceedings with the venue in booktitle rather than as an article with the conference name
% in journal. The citation key is unchanged.
@inproceedings{Zhu2017,
  title = {Trained Ternary Quantization},
  author = {Zhu, Chenzhuo and Han, Song and Mao, Huizi and Dally, William J.},
  year = {2017},
  booktitle = {International Conference on Learning Representations (ICLR)},
}

% Neural Architecture Search with Reinforcement Learning (ICLR 2017).
% NOTE(review): the same work is also recorded under the key zoph2017neural; both keys are kept
% because either may be cited elsewhere -- consolidate once the citing text is checked.
@inproceedings{Zoph2017,
  author    = {Zoph, Barret and Le, Quoc V.},
  title     = {Neural Architecture Search with Reinforcement Learning},
  booktitle = {International Conference on Learning Representations},
  year      = {2017},
}

% Neural Architecture Search with Reinforcement Learning (ICLR 2017).
% The author's middle initial now carries its period ("Quoc V."), matching how the same author
% is written in the Zoph2017 and zoph2018learning records.
% NOTE(review): this duplicates the work recorded under the key Zoph2017; both keys are kept
% because either may be cited elsewhere.
@inproceedings{zoph2017neural,
  title = {Neural Architecture Search with Reinforcement Learning},
  author = {Zoph, Barret and Le, Quoc V.},
  year = {2017},
  booktitle = {International Conference on Learning Representations (ICLR)},
}

% Learning transferable architectures paper (CVPR 2018); the source field marks the record as
% Crossref-derived.
@inproceedings{zoph2018learning,
  author    = {Zoph, Barret and Vasudevan, Vijay and Shlens, Jonathon and Le, Quoc V.},
  title     = {Learning Transferable Architectures for Scalable Image Recognition},
  booktitle = {2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  publisher = {IEEE},
  pages     = {8697--8710},
  date      = {2018-06},
  doi       = {10.1109/cvpr.2018.00907},
  url       = {https://doi.org/10.1109/cvpr.2018.00907},
  source    = {Crossref},
}