% Schapire and Warmuth, conference paper in the ICML 1994 proceedings.
% The citation key is kept verbatim so that existing \cite commands still resolve.
% Conference dates (10-13 July 1994) are stored in month/eventdate rather than in a
% printed note; eventdate is read by biblatex and ignored by classic BibTeX styles.
% NOTE(review): address holds a country, not the publisher's city -- confirm and replace.
@inproceedings{8e14c2fabb7648f1b0b25cf0467d9ed2,
  author    = {Schapire, Robert E. and Warmuth, Manfred K.},
  title     = {On the Worst-case Analysis of Temporal-difference Learning Algorithms},
  booktitle = {Proceedings of the 11th International Conference on Machine Learning, {ICML} 1994},
  editor    = {Cohen, William W. and Hirsh, Haym},
  pages     = {266--274},
  publisher = {Morgan Kaufmann Publishers, Inc.},
  address   = {United States},
  year      = {1994},
  month     = jul,
  eventdate = {1994-07-10/1994-07-13},
  doi       = {10.1016/B978-1-55860-335-6.50040-4},
  language  = {English (US)},
  abstract  = {We study the worst-case behavior of a family of learning algorithms based on Sutton's [7] method of temporal differences. In our on-line learning framework, learning takes place in a sequence of trials, and the goal of the learning algorithm is to estimate a discounted sum of all the reinforcements that will be received in the future. In this setting, we are able to prove general upper bounds on the performance of a slightly modified version of Sutton's so-called TD($\lambda$) algorithm. These bounds are stated in terms of the performance of the best linear predictor on the given training sequence, and are proved without making any statistical assumptions of any kind about the process producing the learner's observed training sequence. We also prove lower bounds on the performance of any algorithm for this learning problem, and give a similar analysis of the closely related problem of learning to predict in a model in which the learner must produce predictions for a whole batch of observations before receiving reinforcement.},
}