@article{76065dafe1b0476786594ec04be9f073,
title = "On Nonconvex Optimization for Machine Learning: Gradients, Stochasticity, and Saddle Points",
abstract = "Gradient descent (GD) and stochastic gradient descent (SGD) are the workhorses of large-scale machine learning. While classical theory focused on analyzing the performance of these methods in convex optimization problems, the most notable successes in machine learning have involved nonconvex optimization, and a gap has arisen between theory and practice. Indeed, traditional analyses of GD and SGD show that both algorithms converge to stationary points efficiently. But these analyses do not take into account the possibility of converging to saddle points. More recent theory has shown that GD and SGD can avoid saddle points, but the dependence on dimension in these analyses is polynomial. For modern machine learning, where the dimension can be in the millions, such dependence would be catastrophic. We analyze perturbed versions of GD and SGD and show that they are truly efficient: their dimension dependence is only polylogarithmic. Indeed, these algorithms converge to second-order stationary points in essentially the same time as they take to converge to classical first-order stationary points.",
keywords = "(stochastic) gradient descent, saddle points, efficiency, perturbations",
author = "Jin, Chi and Netrapalli, Praneeth and Ge, Rong and Kakade, {Sham M.} and Jordan, {Michael I.}",
note = "Funding Information: A preliminary version of this article, with a subset of the results that are presented here, was presented at ICML 2017 and appeared in the proceedings as Jin et al. [27]. We thank Yair Carmon, Tongyang Li and Quanquan Gu for valuable discussions. This work was supported in part by the Mathematical Data Science program of the Office of Naval Research under grant number N00014-18-1-2764. Rong Ge also acknowledges the funding support from NSF CCF-1704656, NSF CCF-1845171 (CAREER), Sloan Fellowship and Google Faculty Research Award. Publisher Copyright: {\textcopyright} 2021 ACM.",
year = "2021",
month = mar,
doi = "10.1145/3418526",
language = "English (US)",
volume = "68",
journal = "Journal of the ACM",
issn = "0004-5411",
publisher = "Association for Computing Machinery (ACM)",
number = "2",
}
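
The abstract summarizes perturbed gradient descent, which adds an occasional random perturbation whenever the gradient is small so the iterates can escape strict saddle points. Below is a minimal Python sketch of that idea, assuming illustrative hyperparameters (step size, perturbation radius, gradient and time thresholds) and a toy objective; it is not the paper's exact algorithm or its recommended constants.

```python
import numpy as np

def perturbed_gradient_descent(grad, x0, eta=0.01, r=0.1, g_thres=1e-2,
                               t_thres=10, max_iter=1000, seed=0):
    """Sketch of perturbed gradient descent: plain GD steps, plus a small
    random perturbation whenever the gradient is small, so the iterate can
    move off strict saddle points. Hyperparameter names and values here are
    illustrative assumptions, not the paper's constants."""
    rng = np.random.default_rng(seed)
    x = np.asarray(x0, dtype=float)
    d = x.size
    t_last = -(t_thres + 1)  # allow a perturbation on the first small-gradient step
    for t in range(max_iter):
        if np.linalg.norm(grad(x)) <= g_thres and t - t_last > t_thres:
            # Add a point sampled uniformly from a ball of radius r.
            u = rng.normal(size=d)
            x = x + (r * rng.uniform() ** (1.0 / d)) * u / np.linalg.norm(u)
            t_last = t
        x = x - eta * grad(x)  # standard gradient step
    return x

# Toy objective f(x, y) = (x^2 - 1)^2 + y^2: strict saddle at the origin,
# minima at (+/-1, 0). Plain GD started exactly at the origin stays stuck there.
grad_f = lambda v: np.array([4.0 * v[0] * (v[0] ** 2 - 1.0), 2.0 * v[1]])
print(perturbed_gradient_descent(grad_f, np.zeros(2)))
```

In this sketch the perturbation is drawn uniformly from a small ball (Gaussian direction, radius scaled by U^{1/d}), which keeps the injected noise bounded while giving it, with high probability, a component along a descent direction of negative curvature at the saddle.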