@inproceedings{flash4,
title={{FlashAttention}-4: Algorithm and Kernel Pipelining Co-design for Asymmetric Hardware Scaling},
author={Zadouri*, Ted and Shah*, Jay and Hohnerbach*, Markus and Liu, Timmy and Thakkar, Vijay and Dao, Tri},
booktitle={Machine Learning and Systems (MLSys)},
year={2026}
}
@inproceedings{pan2025marconi,
title={Marconi: Prefix Caching for the Era of Hybrid LLMs},
author={Pan, Rui and Wang, Zhuang and Jia, Zhen and Karakus, Can and Zancato, Luca and Dao, Tri and Netravali, Ravi and Wang, Yida},
booktitle={Machine Learning and Systems (MLSys)},
year={2025},
code={https://github.com/ruipeterpan/marconi},
arxiv={2411.19379},
award={Outstanding Paper Honorable Mention},
award_name={Outstanding Paper Honorable Mention},
selected={true}
}
@inproceedings{flash3,
title={{FlashAttention}-3: Fast and Accurate Attention with Asynchrony and Low-precision},
author={Shah*, Jay and Bikshandi*, Ganesh and Zhang, Ying and Thakkar, Vijay and Ramani, Pradeep and Dao, Tri},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2024},
arxiv={2407.08608},
code={https://github.com/Dao-AILab/flash-attention/},
selected={true}
}
@inproceedings{mamba2,
title={Transformers are {SSM}s: Generalized Models and Efficient Algorithms Through Structured State Space Duality},
author={Dao*, Tri and Gu*, Albert},
booktitle={International Conference on Machine Learning (ICML)},
year={2024},
arxiv={2405.21060},
code={https://github.com/state-spaces/mamba/},
selected={true}
}
@inproceedings{mamba,
title={Mamba: Linear-Time Sequence Modeling with Selective State Spaces},
author={Gu*, Albert and Dao*, Tri},
booktitle={Conference on Language Modeling (COLM)},
year={2024},
arxiv={2312.00752},
code={https://github.com/state-spaces/mamba/},
award={Outstanding Paper},
award_name={Outstanding Paper},
selected={true}
}
@inproceedings{dao2022flashattention,
title={Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2022},
arxiv={2205.14135},
code={https://github.com/Dao-AILab/flash-attention/},
selected={true},
award={Best Paper award at the ICML Hardware Aware Efficient Training Workshop 2022, Inaugural Stanford Open Source Software Prize 2024},
award_name={Awards}
}
@inproceedings{dao2022monarch,
title={Monarch: Expressive Structured Matrices for Efficient and Accurate Training},
author={Dao, Tri and Chen, Beidi and Sohoni, Nimit and Desai, Arjun and Poli, Michael and Grogan, Jessica and Liu, Alexander and Rao, Aniruddh and Rudra, Atri and R{\'e}, Christopher},
booktitle={International Conference on Machine Learning (ICML)},
year={2022},
arxiv={2204.00595},
code={https://github.com/HazyResearch/fly},
award={Outstanding Paper runner-up},
award_name={Outstanding Paper runner-up},
selected={true}
}
@inproceedings{lahoti2025mamba3,
title={Mamba-3: Improved Sequence Modeling using State Space Principles},
author={Lahoti, Aakash and Li, Kevin and Chen, Berlin and Wang, Caitlin and Bick, Aviv and Kolter, Zico and Dao, Tri and Gu, Albert},
booktitle={International Conference on Learning Representations (ICLR)},
year={2026},
award={Oral},
award_name={Oral},
}
@inproceedings{guo2025sonicmoe,
title={{SonicMoE}: Accelerating MoE with IO and Tile-aware Optimizations},
author={Guo, Wentao and Mishra, Mayank and Cheng, Xinle and Stoica, Ion and Dao, Tri},
booktitle={International Conference on Learning Representations (ICLR)},
arxiv={2512.14080},
year={2026}
}
@inproceedings{shao2025beat,
title={Beat the Long Tail: Distribution-Aware Speculative Decoding for {RL} Training},
author={Shao, Zelei and Srivatsa, Vikranth and Srivastava, Sanjana and Wu, Qingyang and Ariyak, Alpay and Wu, Xiaoxia and Patel, Ameen and Wang, Jue and Liang, Percy and Dao, Tri and others},
booktitle={Machine Learning and Systems (MLSys)},
arxiv={2511.13841},
year={2026}
}
@inproceedings{kumar2026speculative,
title={Speculative Speculative Decoding},
author={Kumar, Tanishq and Dao, Tri and May, Avner},
booktitle={International Conference on Learning Representations (ICLR)},
year={2026}
}
@inproceedings{guo2025log,
title={Log-Linear Attention},
author={Guo, Han and Yang, Songlin and Goel, Tarushii and Xing, Eric P and Dao, Tri and Kim, Yoon},
booktitle={International Conference on Learning Representations (ICLR)},
arxiv={2506.04761},
year={2026}
}
@inproceedings{zadouri2025hardware,
title={Hardware-Efficient Attention for Fast Decoding},
author={Zadouri, Ted and Strauss, Hubert and Dao, Tri},
booktitle={Conference on Language Modeling (COLM)},
arxiv={2505.21487},
year={2025}
}
@inproceedings{po2025long,
title={Long-Context State-Space Video World Models},
author={Po, Ryan and Nitzan, Yotam and Zhang, Richard and Chen, Berlin and Dao, Tri and Shechtman, Eli and Wetzstein, Gordon and Huang, Xun},
booktitle={International Conference on Computer Vision (ICCV)},
arxiv={2505.20171},
year={2025}
}
@article{wang2025m1,
title={{M1}: Towards Scalable Test-Time Compute with {Mamba} Reasoning Models},
author={Wang, Junxiong and Li, Wen-Ding and Paliotta, Daniele and Ritter, Daniel and Rush, Alexander M and Dao, Tri},
journal={arXiv preprint arXiv:2504.10449},
year={2025}
}
@article{ma2025hybridna,
title={{HybriDNA}: A Hybrid Transformer-{Mamba2} Long-Range {DNA} Language Model},
author={Ma, Mingqian and Liu, Guoqing and Cao, Chuan and Deng, Pan and Dao, Tri and Gu, Albert and Jin, Peiran and Yang, Zhao and Xia, Yingce and Luo, Renqian and others},
journal={arXiv preprint arXiv:2502.10807},
year={2025}
}
@article{paliotta2025thinking,
title={Thinking Slow, Fast: Scaling Inference Compute with Distilled Reasoners},
author={Paliotta, Daniele and Wang, Junxiong and Pagliardini, Matteo and Li, Kevin Y and Bick, Aviv and Kolter, J Zico and Gu, Albert and Fleuret, Fran{\c{c}}ois and Dao, Tri},
journal={arXiv preprint arXiv:2502.20339},
year={2025}
}
@inproceedings{zhang2025ladder,
title={Ladder-Residual: Parallelism-Aware Architecture for Accelerating Large Model Inference with Communication Overlapping},
author={Zhang, Muru and Mishra, Mayank and Zhou, Zhongzhu and Brandon, William and Wang, Jue and Kim, Yoon and Ragan-Kelley, Jonathan and Song, Shuaiwen Leon and Athiwaratkun, Ben and Dao, Tri},
booktitle={International Conference on Machine Learning (ICML)},
year={2025}
}
@inproceedings{weber2024redpajama,
title={{RedPajama}: an Open Dataset for Training Large Language Models},
author={Weber, Maurice and Fu, Daniel and Anthony, Quentin and Oren, Yonatan and Adams, Shane and Alexandrov, Anton and Lyu, Xiaozhong and Nguyen, Huu and Yao, Xiaozhe and Adams, Virginia and Athiwaratkun, Ben and Chalamala, Rahul and Chen, Kezhen and Ryabinin, Max and Dao, Tri and Liang, Percy and R{\'e}, Christopher and Rish, Irina and Zhang, Ce},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2024}
}
@inproceedings{wang2024mamba,
title={The {Mamba} in the {Llama}: Distilling and Accelerating Hybrid Models},
author={Wang, Junxiong and Paliotta, Daniele and May, Avner and Rush, Alexander M and Dao, Tri},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2024}
}
@inproceedings{hwang2024hydra,
title={{Hydra}: Bidirectional State Space Models Through Generalized Matrix Mixers},
author={Hwang, Sukjun and Lahoti, Aakash and Dao, Tri and Gu, Albert},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2024}
}
@inproceedings{liu2024bitdelta,
title={{BitDelta}: Your Fine-Tune May Only Be Worth One Bit},
author={Liu, James and Xiao, Guangxuan and Li, Kai and Lee, Jason D and Han, Song and Dao, Tri and Cai, Tianle},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2024}
}
@article{waleffe2024empirical,
title={An Empirical Study of {Mamba}-based Language Models},
author={Waleffe, Roger and Byeon, Wonmin and Riach, Duncan and Norick, Brandon and Korthikanti, Vijay and Dao, Tri and Gu, Albert and Hatamizadeh, Ali and Singh, Sudhakar and Narayanan, Deepak and others},
journal={arXiv preprint arXiv:2406.07887},
year={2024}
}
@article{lozhkov2024starcoder,
title={{StarCoder} 2 and {The Stack} v2: The Next Generation},
author={Lozhkov, Anton and Li, Raymond and Allal, Loubna Ben and Cassano, Federico and Lamy-Poirier, Joel and Tazi, Nouamane and Tang, Ao and Pykhtar, Dmytro and Liu, Jiawei and Wei, Yuxiang and others},
journal={arXiv preprint arXiv:2402.19173},
year={2024}
}
@inproceedings{schiff2024caduceus,
title={Caduceus: Bi-Directional Equivariant Long-Range {DNA} Sequence Modeling},
author={Schiff, Yair and Kao, Chia-Hsiang and Gokaslan, Aaron and Dao, Tri and Gu, Albert and Kuleshov, Volodymyr},
booktitle={International Conference on Machine Learning (ICML)},
year={2024}
}
@inproceedings{cai2024medusa,
title={Medusa: Simple {LLM} Inference Acceleration Framework with Multiple Decoding Heads},
author={Cai, Tianle and Li, Yuhong and Geng, Zhengyang and Peng, Hongwu and Lee, Jason D and Chen, Deming and Dao, Tri},
booktitle={International Conference on Machine Learning (ICML)},
year={2024}
}
@inproceedings{dao2023flashattention2,
title={Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
author={Dao, Tri},
booktitle={International Conference on Learning Representations (ICLR)},
year={2024},
arxiv={2307.08691},
code={https://github.com/Dao-AILab/flash-attention/}
}
@inproceedings{liu2023deja,
title={Deja Vu: Contextual Sparsity for Efficient {LLM}s at Inference Time},
author={Liu, Zichang and Wang, Jue and Dao, Tri and Zhou, Tianyi and Yuan, Binhang and Song, Zhao and Shrivastava, Anshumali and Zhang, Ce and Tian, Yuandong and R{\'e}, Christopher and Chen, Beidi},
booktitle={International Conference on Machine Learning (ICML)},
year={2023},
award={Oral},
award_name={Oral},
}
@inproceedings{zhang2023effectively,
title={Effectively Modeling Time Series with Simple Discrete State Spaces},
author={Zhang, Michael and Saab, Khaled K and Poli, Michael and Dao, Tri and Goel, Karan and R{\'e}, Christopher},
booktitle={International Conference on Learning Representations (ICLR)},
year={2023}
}
@inproceedings{poli2023hyena,
title={Hyena Hierarchy: Towards Larger Convolutional Language Models},
author={Poli*, Michael and Massaroli*, Stefano and Nguyen, Eric and Fu, Daniel Y and Dao, Tri and Baccus, Stephen and Bengio, Yoshua and Ermon, Stefano and R{\'e}, Christopher},
booktitle={International Conference on Machine Learning (ICML)},
year={2023},
award={Oral},
award_name={Oral},
}
@inproceedings{fu2023simple,
title={Simple Hardware-Efficient Long Convolutions for Sequence Modeling},
author={Fu*, Daniel Y. and Epstein*, Elliot L and Nguyen, Eric and Thomas, Armin W and Zhang, Michael and Dao, Tri and Rudra, Atri and R{\'e}, Christopher},
booktitle={International Conference on Machine Learning (ICML)},
year={2023}
}
@inproceedings{dao2023hungry,
title={Hungry Hungry Hippos: Towards Language Modeling with State Space Models},
author={Dao*, Tri and Fu*, Daniel Y. and Saab, Khaled K. and Thomas, Armin W. and Rudra, Atri and R{\'e}, Christopher},
booktitle={International Conference on Learning Representations (ICLR)},
year={2023},
award={Spotlight},
award_name={Spotlight},
}
@inproceedings{yuan2022decentralized,
title={Decentralized Training of Foundation Models in Heterogeneous Environments},
author={Yuan, Binhang and He, Yongjun and Davis, Jared Quincy and Zhang, Tianyi and Dao, Tri and Chen, Beidi and Liang, Percy and R{\'e}, Christopher and Zhang, Ce},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2022},
award={Oral},
award_name={Oral},
}
@inproceedings{wang2022finetuning,
title={Fine-tuning Language Models over Slow Networks using Activation Compression with Guarantees},
author={Wang, Jue and Yuan, Binhang and Rimanic, Luka and He, Yongjun and Dao, Tri and Chen, Beidi and Liang, Percy and R{\'e}, Christopher and Zhang, Ce},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2022}
}
@inproceedings{poli2022transform,
title={Transform Once: Efficient Operator Learning in Frequency Domain},
author={Poli, Michael and Massaroli, Stefano and Berto, Federico and Park, Jinkyoo and Dao, Tri and R{\'e}, Christopher and Ermon, Stefano},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2022}
}
@inproceedings{nguyen2022s4nd,
title={{S4ND}: Modeling Images and Videos as Multidimensional Signals with State Spaces},
author={Nguyen, Eric and Goel, Karan and Gu, Albert and Downs, Gordon and Shah, Preey and Dao, Tri and Baccus, Stephen and R{\'e}, Christopher},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2022}
}
@inproceedings{meng2022butterflyflow,
title={Butterfly{F}low: Building Invertible Layers with Butterfly Matrices},
author={Meng, Chenlin and Zhou, Linqi and Choi, Kristy and Dao, Tri and Ermon, Stefano},
booktitle={International Conference on Machine Learning (ICML)},
year={2022}
}
@inproceedings{dao2021pixelated,
title={Pixelated Butterfly: Simple and Efficient Sparse Training for Neural Network Models},
author={Dao*, Tri and Chen*, Beidi and Liang, Kaizhao and Yang, Jiaming and Song, Zhao and Rudra, Atri and R{\'e}, Christopher},
booktitle={International Conference on Learning Representations (ICLR)},
year={2022},
award={Spotlight},
award_name={Spotlight},
}
@inproceedings{chen2021scatterbrain,
title={Scatterbrain: Unifying Sparse and Low-rank Attention},
author={Chen*, Beidi and Dao*, Tri and Winsor, Eric and Song, Zhao and Rudra, Atri and R{\'e}, Christopher},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2021}
}
@inproceedings{gu2021combining,
title={Combining Recurrent, Convolutional, and Continuous-time Models with Linear State Space Layers},
author={Gu, Albert and Johnson, Isys and Goel, Karan and Saab, Khaled and Dao, Tri and Rudra, Atri and R{\'e}, Christopher},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2021}
}
@inproceedings{roberts2021rethinking,
title={Rethinking Neural Operations for Diverse Tasks},
author={Roberts, Nicholas and Khodak, Mikhail and Dao, Tri and Li, Liam and R{\'e}, Christopher and Talwalkar, Ameet},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2021}
}
@inproceedings{davis2021catformer,
title={Catformer: Designing Stable Transformers via Sensitivity Analysis},
author={Davis*, Jared Q and Gu*, Albert and Choromanski, Krzysztof and Dao, Tri and R{\'e}, Christopher and Finn, Chelsea and Liang, Percy},
booktitle={International Conference on Machine Learning (ICML)},
year={2021}
}
@inproceedings{dao2021knowledge,
title={Knowledge Distillation as Semiparametric Inference},
author={Dao, Tri and Kamath, Govinda M and Syrgkanis, Vasilis and Mackey, Lester},
booktitle={International Conference on Learning Representations (ICLR)},
year={2021}
}
@inproceedings{chen2021mongoose,
title={MONGOOSE: A Learnable {LSH} Framework for Efficient Neural Network Training},
author={Chen, Beidi and Liu, Zichang and Peng, Binghui and Xu, Zhaozhuo and Li, Jonathan Lingjie and Dao, Tri and Song, Zhao and Shrivastava, Anshumali and R{\'e}, Christopher},
booktitle={International Conference on Learning Representations (ICLR)},
year={2021},
award={Oral},
award_name={Oral},
}
@inproceedings{gu2020hippo,
title={{HiPPO}: Recurrent Memory with Optimal Polynomial Projections},
author={Gu*, Albert and Dao*, Tri and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2020},
award={Spotlight},
award_name={Spotlight},
}
@inproceedings{dao2019kaleidoscope,
title={Kaleidoscope: An Efficient, Learnable Representation For All Structured Linear Maps},
author={Dao, Tri and Sohoni, Nimit and Gu, Albert and Eichhorn, Matthew and Blonder, Amit and Leszczynski, Megan and Rudra, Atri and R{\'e}, Christopher},
booktitle={International Conference on Learning Representations (ICLR)},
year={2020},
award={Spotlight},
award_name={Spotlight},
}
@inproceedings{may2019downstream,
title={On the Downstream Performance of Compressed Word Embeddings},
author={May, Avner and Zhang, Jian and Dao, Tri and R{\'e}, Christopher},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2019},
award={Spotlight},
award_name={Spotlight},
}
@inproceedings{kuck2019approximating,
title={Approximating the Permanent by Sampling from Adaptive Partitions},
author={Kuck, Jonathan and Dao, Tri and Rezatofighi, Hamid and Sabharwal, Ashish and Ermon, Stefano},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2019}
}
@inproceedings{kuck2019adaptive,
title={Adaptive Hashing for Model Counting},
author={Kuck, Jonathan and Dao, Tri and Zhao, Shengjia and Bartan, Burak and Sabharwal, Ashish and Ermon, Stefano},
booktitle={Conference on Uncertainty in Artificial Intelligence (UAI)},
year={2019},
}
@inproceedings{dao2019learning,
title={Learning Fast Algorithms for Linear Transforms Using Butterfly Factorizations},
author={Dao, Tri and Gu, Albert and Eichhorn, Matthew and Rudra, Atri and R{\'e}, Christopher},
booktitle={International Conference on Machine Learning (ICML)},
year={2019},
award={Oral},
award_name={Oral},
}
@inproceedings{dao2019kernel,
title={A Kernel Theory of Modern Data Augmentation},
author={Dao, Tri and Gu, Albert and Ratner, Alexander J and Smith, Virginia and De Sa, Christopher and R{\'e}, Christopher},
booktitle={International Conference on Machine Learning (ICML)},
year={2019},
}
@inproceedings{zhang2019low,
title={Low-Precision Random {F}ourier Features for Memory-Constrained Kernel Approximation},
author={Zhang, Jian and May, Avner and Dao, Tri and R{\'e}, Christopher},
booktitle={International Conference on Artificial Intelligence and Statistics (AISTATS)},
year = {2019},
}
@inproceedings{thomas2018learning,
title={Learning Compressed Transforms with Low Displacement Rank},
author={Thomas, Anna T and Gu, Albert and Dao, Tri and Rudra, Atri and R{\'e}, Christopher},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year = {2018},
}
@inproceedings{dao2017gaussian,
title={Gaussian Quadrature for Kernel Features},
author={Dao, Tri and De Sa, Christopher M and R{\'e}, Christopher},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year = {2017},
award={Spotlight},
award_name={Spotlight},
}
@article{li2023starcoder,
title={{StarCoder}: may the source be with you!},
author={Li, Raymond and Ben Allal, Loubna and Zi, Yangtian and Muennighoff, Niklas and Kocetkov, Denis and Mou, Chenghao and Marone, Marc and Akiki, Christopher and Li, Jia and Chim, Jenny and Liu, Qian and Zheltonozhskii, Evgenii and Zhuo, Terry Yue and Wang, Thomas and Dehaene, Olivier and Davaadorj, Mishig and Lamy-Poirier, Joel and Monteiro, Joao and Shliazhko, Oleh and Gontier, Nicolas and Meade, Nicholas and Zebaze, Armel and Yee, Ming-Ho and Umapathi, Logesh Kumar and Zhu, Jian and Lipkin, Benjamin and Oblokulov, Muhtasham and Wang, Zhiruo and Murthy, Rudra and Stillerman, Jason and Patel, Siva Sankalp and Abulkhanov, Dmitry and Zocca, Marco and Dey, Manan and Zhang, Zhihan and Fahmy, Nour and Bhattacharyya, Urvashi and Yu, Wenhao and Singh, Swayam and Luccioni, Sasha and Villegas, Paulo and Kunakov, Maxim and Zhdanov, Fedor and Romero, Manuel and Lee, Tony and Timor, Nadav and Ding, Jennifer and Schlesinger, Claire and Schoelkopf, Hailey and Ebert, Jan and Dao, Tri and Mishra, Mayank and Gu, Alex and Robinson, Jennifer and Anderson, Carolyn Jane and Dolan-Gavitt, Brendan and Contractor, Danish and Reddy, Siva and Fried, Daniel and Bahdanau, Dzmitry and Jernite, Yacine and Ferrandis, Carlos Munoz and Hughes, Sean and Wolf, Thomas and Guha, Arjun and von Werra, Leandro and de Vries, Harm},
journal={Transactions on Machine Learning Research (TMLR)},
year={2023},
arxiv={2305.06161},
}
@article{dao2023flashdecoding,
title={Flash-Decoding for long-context inference},
author={Dao, Tri and Haziza, Daniel and Massa, Francisco and Sizov, Grigory},
journal={PyTorch Blog},
year={2023},
html={https://pytorch.org/blog/flash-decoding/},
}
@article{oncescu2025opportunistic,
title={Opportunistic Expert Activation: Batch-Aware Expert Routing for Faster Decode Without Retraining},
author={Oncescu, Costin-Andrei and Wu, Qingyang and Chung, Wai Tong and Wu, Robert and Gopal, Bryan and Wang, Junxiong and Dao, Tri and Athiwaratkun, Ben},
journal={arXiv preprint arXiv:2511.02237},
year={2025},
arxiv={2511.02237},
}
@article{mishra2026mrnn,
title={{M$^2$RNN}: Non-Linear RNNs with Matrix-Valued States for Scalable Language Modeling},
author={Mishra, Mayank and Tan, Shawn and Stoica, Ion and Gonzalez, Joseph and Dao, Tri},
journal={arXiv preprint arXiv:2603.14360},
year={2026},
arxiv={2603.14360},
}
@article{chen2026aihw2035,
title={{AI+HW} 2035: Shaping the Next Decade},
author={Chen, Deming and Cong, Jason and Mirhoseini, Azalia and Kozyrakis, Christos and Mitra, Subhasish and Xiong, Jinjun and Young, Cliff and Anandkumar, Anima and Littman, Michael and Kirschen, Aron and Shao, Sophia and Leef, Serge and Shanbhag, Naresh and Milojicic, Dejan and Schulte, Michael and Cauwenberghs, Gert and Chow, Jerry M. and Dao, Tri and Gopalakrishnan, Kailash and Ho, Richard and Kim, Hoshik and Olukotun, Kunle and Pan, David Z. and Ren, Mark and Roth, Dan and Singh, Aarti and Sun, Yizhou and Wang, Yusu and LeCun, Yann and Puri, Ruchir},
journal={arXiv preprint arXiv:2603.05225},
year={2026},
arxiv={2603.05225},
}
@article{wang2026speculative,
title={When {RL} Meets Adaptive Speculative Training: A Unified Training-Serving System},
author={Wang, Junxiong and Bie, Fengxiang and Li, Jisen and Zhou, Zhongzhu and Shao, Zelei and Wang, Yubo and Liu, Yinghui and Wu, Qingyang and May, Avner and Yanamandra, Sri and Zhang, Yineng and Zhang, Ce and Dao, Tri and Liang, Percy and Athiwaratkun, Ben and Song, Shuaiwen Leon and Xu, Chenfeng and Wu, Xiaoxia},
journal={arXiv preprint arXiv:2602.06932},
year={2026},
arxiv={2602.06932},
}