@inproceedings{a88a94f3428342388108f2a90a024824,
title = "MUX-PLMs: Data Multiplexing for High-throughput Language Models",
abstract = "The widespread adoption of large language models such as ChatGPT and Bard has led to unprecedented demand for these technologies. The burgeoning cost of inference for ever-increasing model sizes, coupled with hardware shortages, has limited affordable access and poses a pressing need for efficiency approaches geared towards high throughput and performance. Multi-input multi-output (MIMO) algorithms such as data multiplexing offer a promising solution, with a many-fold increase in throughput by performing inference for multiple inputs at the cost of a single input. Yet these approaches are not currently performant enough to be deployed in modern systems. We change that by developing MUX-PLMs, a class of deployable high-throughput pre-trained language models (PLMs) trained with data multiplexing that can be fine-tuned on any downstream task. Our novel multiplexing and demultiplexing modules proficiently entangle and disentangle inputs, and enable high-performance, high-throughput MUX-PLMs that are competitive with vanilla PLMs while achieving 2x/5x inference speedup with only a 1-4% performance drop on a broad suite of tasks.",
author = "Vishvak Murahari and Ameet Deshpande and Carlos E. Jimenez and Izhak Shafran and Mingqiu Wang and Yuan Cao and Karthik Narasimhan",
note = "Publisher Copyright: {\textcopyright} 2023 Association for Computational Linguistics; 2023 Findings of the Association for Computational Linguistics: EMNLP 2023; Conference date: 06-12-2023 through 10-12-2023",
year = "2023",
language = "English (US)",
series = "Findings of the Association for Computational Linguistics: EMNLP 2023",
publisher = "Association for Computational Linguistics (ACL)",
pages = "4540--4554",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
}