% Conference paper on RUSH, a robust job scheduler for YARN in Hadoop
% (ICDCS 2016 proceedings, pp. 242--251, DOI 10.1109/ICDCS.2016.95).
% The citation key is left exactly as exported so existing citations keep resolving.
% Title words whose capitalisation is significant (the RUSH acronym and the
% mixed-case words it is derived from) are braced so sentence-casing styles
% cannot lowercase them.
% Authors are in the unambiguous "Last, First" form.
% NOTE(review): address holds a country, not a publisher city -- looks like an
% export artefact; confirm the intended value before relying on it.
@inproceedings{61475a35dabd440dbb13ec37b7400c4a,
  author    = {Huang, Zhe and Balasubramanian, Bharath and Wang, Michael and Lan, Tian and Chiang, Mung and Tsang, Danny H. K.},
  title     = {{RUSH}: A {RobUst} {ScHeduler} to Manage Uncertain Completion-Times in Shared Clouds},
  booktitle = {Proceedings - 2016 IEEE 36th International Conference on Distributed Computing Systems, ICDCS 2016},
  series    = {Proceedings - International Conference on Distributed Computing Systems},
  pages     = {242--251},
  year      = {2016},
  month     = aug,
  day       = {8},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  doi       = {10.1109/ICDCS.2016.95},
  language  = {English (US)},
  keywords  = {Hadoop, Robust Scheduling, Runtime Estimation},
  abstract  = {We address the problem of scheduling jobs with utilities that depend solely upon their completion-times in a shared cloud that imposes considerable uncertainty on the jobs' runtime. However, it is very hard to estimate the jobs' runtime in a shared cloud where jobs are often delayed due to reasons such as slow I/O performance and variations in memory availability. Unlike prior works, we acknowledge that runtime estimates are often erroneous and instead shift the burden of robustness to the job scheduler. Specifically, we present a scheduling problem that jointly accounts for: (i) job utilities specified as functions of their completion-time, and (ii) uncertainty in the jobs' runtime. Our proposed solution to this problem achieves lexicographic max-min fairness among the job utilities. We implement this as a robust scheduler, named RUSH, for YARN in Hadoop. Our experiments, using real-world data sets, illustrate RUSH's efficacy when compared with other commonly used schedulers.},
  note      = {Publisher Copyright: {\textcopyright} 2016 IEEE.; 36th IEEE International Conference on Distributed Computing Systems, ICDCS 2016 ; Conference date: 27-06-2016 Through 30-06-2016},
}