  1. """
  2. Our own implementation of the Newton algorithm
  3. Unlike the scipy.optimize version, this version of the Newton conjugate
  4. gradient solver uses only one function call to retrieve the
  5. func value, the gradient value and a callable for the Hessian matvec
  6. product. If the function call is very expensive (e.g. for logistic
  7. regression with large design matrix), this approach gives very
  8. significant speedups.
  9. """
  10. # This is a modified file from scipy.optimize
  11. # Original authors: Travis Oliphant, Eric Jones
  12. # Modifications by Gael Varoquaux, Mathieu Blondel and Tom Dupre la Tour
  13. # License: BSD
  14. import warnings
  15. import numpy as np
  16. from ..exceptions import ConvergenceWarning
  17. from .fixes import line_search_wolfe1, line_search_wolfe2
  18. class _LineSearchError(RuntimeError):
  19. pass


def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs):
    """
    Same as line_search_wolfe1, but fall back to line_search_wolfe2 if a
    suitable step length is not found, and raise an exception if neither
    line search finds one.

    Raises
    ------
    _LineSearchError
        If no suitable step size is found.
    """
    ret = line_search_wolfe1(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs)

    if ret[0] is None:
        # line search failed: try a different one
        ret = line_search_wolfe2(
            f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs
        )

        if ret[0] is None:
            raise _LineSearchError()

    return ret
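

# Illustrative usage sketch, not part of the original module: for the strictly
# convex quadratic f(x) = 0.5 * ||x||^2 and the direction pk = -grad f(xk),
# the Wolfe line search is expected to return a step length close to 1.0.
# The helper name below (_example_line_search_wolfe12) is hypothetical.
def _example_line_search_wolfe12():
    def f(x):
        return 0.5 * np.dot(x, x)

    def fprime(x):
        return x

    xk = np.array([3.0, -2.0])
    gfk = fprime(xk)
    pk = -gfk  # steepest descent direction
    alphak = _line_search_wolfe12(f, fprime, xk, pk, gfk, f(xk), None)[0]
    return alphak  # expected to be approximately 1.0 for this quadratic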


def _cg(fhess_p, fgrad, maxiter, tol):
    """
    Solve iteratively the linear system 'fhess_p . xsupi = -fgrad'
    with the conjugate gradient method.

    Parameters
    ----------
    fhess_p : callable
        Function that takes a vector as a parameter and returns the
        matrix product of the Hessian and that vector.

    fgrad : ndarray of shape (n_features,) or (n_features + 1,)
        Gradient vector.

    maxiter : int
        Maximum number of CG iterations.

    tol : float
        Stopping criterion: the iterations stop once the sum of the absolute
        residuals drops below ``tol``.

    Returns
    -------
    xsupi : ndarray of shape (n_features,) or (n_features + 1,)
        Estimated solution.
    """
    xsupi = np.zeros(len(fgrad), dtype=fgrad.dtype)
    ri = fgrad
    psupi = -ri
    i = 0
    dri0 = np.dot(ri, ri)

    while i <= maxiter:
        if np.sum(np.abs(ri)) <= tol:
            break

        Ap = fhess_p(psupi)
        # check curvature
        curv = np.dot(psupi, Ap)
        if 0 <= curv <= 3 * np.finfo(np.float64).eps:
            break
        elif curv < 0:
            if i > 0:
                break
            else:
                # fall back to steepest descent direction
                # (curv < 0 here, so negate it to keep the step a positive
                # multiple of psupi, i.e. of -fgrad)
                xsupi += dri0 / (-curv) * psupi
                break
        alphai = dri0 / curv
        xsupi += alphai * psupi
        ri = ri + alphai * Ap
        dri1 = np.dot(ri, ri)
        betai = dri1 / dri0
        psupi = -ri + betai * psupi
        i = i + 1
        dri0 = dri1  # update np.dot(ri,ri) for next time.

    return xsupi
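

# Illustrative usage sketch, not part of the original module: for a quadratic
# model with symmetric positive definite Hessian A and gradient fgrad, _cg
# approximately solves A @ xsupi = -fgrad, i.e. it returns the Newton step.
# The names below (_example_cg, A, b) are hypothetical.
def _example_cg():
    A = np.array([[3.0, 1.0], [1.0, 2.0]])
    b = np.array([1.0, 1.0])
    fgrad = -b  # gradient of 0.5 * x @ A @ x - b @ x at x = 0

    def fhess_p(p):
        return A @ p  # Hessian-vector product

    step = _cg(fhess_p, fgrad, maxiter=10, tol=1e-12)
    # step should be close to np.linalg.solve(A, b), here roughly [0.2, 0.4]
    return step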


def _newton_cg(
    grad_hess,
    func,
    grad,
    x0,
    args=(),
    tol=1e-4,
    maxiter=100,
    maxinner=200,
    line_search=True,
    warn=True,
):
    """
    Minimization of a scalar function of one or more variables using the
    Newton-CG algorithm.

    Parameters
    ----------
    grad_hess : callable
        Should return the gradient and a callable returning the matvec product
        of the Hessian.

    func : callable
        Should return the value of the function.

    grad : callable
        Should return the gradient of the function. This is used by the
        line search functions.

    x0 : array of float
        Initial guess.

    args : tuple, default=()
        Arguments passed to grad_hess, func and grad.

    tol : float, default=1e-4
        Stopping criterion. The iteration will stop when
        ``max{|g_i | i = 1, ..., n} <= tol``
        where ``g_i`` is the i-th component of the gradient.

    maxiter : int, default=100
        Maximum number of Newton iterations.

    maxinner : int, default=200
        Maximum number of CG iterations per Newton step.

    line_search : bool, default=True
        Whether to use a line search or not.

    warn : bool, default=True
        Whether to warn when the algorithm did not converge.

    Returns
    -------
    xk : ndarray of float
        Estimated minimum.

    k : int
        Number of Newton iterations performed.
    """
    x0 = np.asarray(x0).flatten()
    xk = x0
    k = 0

    if line_search:
        old_fval = func(x0, *args)
        old_old_fval = None

    # Outer loop: our Newton iteration
    while k < maxiter:
        # Compute a search direction pk by applying the CG method to
        #  del2 f(xk) p = - fgrad f(xk) starting from 0.
        fgrad, fhess_p = grad_hess(xk, *args)

        absgrad = np.abs(fgrad)
        if np.max(absgrad) <= tol:
            break

        maggrad = np.sum(absgrad)
        eta = min([0.5, np.sqrt(maggrad)])
        termcond = eta * maggrad

        # Inner loop: solve the Newton update by conjugate gradient, to
        # avoid inverting the Hessian
        xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond)

        alphak = 1.0

        if line_search:
            try:
                alphak, fc, gc, old_fval, old_old_fval, gfkp1 = _line_search_wolfe12(
                    func, grad, xk, xsupi, fgrad, old_fval, old_old_fval, args=args
                )
            except _LineSearchError:
                warnings.warn("Line Search failed")
                break

        xk = xk + alphak * xsupi  # upcast if necessary
        k += 1

    if warn and k >= maxiter:
        warnings.warn(
            "newton-cg failed to converge. Increase the number of iterations.",
            ConvergenceWarning,
        )

    return xk, k
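

# Illustrative usage sketch, not part of the original module: minimizing the
# strictly convex quadratic f(w) = 0.5 * w @ A @ w - b @ w, whose unique
# minimum is A^{-1} b, so the solver should converge in a few Newton steps.
# The names below (_example_newton_cg, A, b) are hypothetical.
def _example_newton_cg():
    A = np.array([[3.0, 1.0], [1.0, 2.0]])
    b = np.array([1.0, 1.0])

    def func(w):
        return 0.5 * w @ A @ w - b @ w

    def grad(w):
        return A @ w - b

    def grad_hess(w):
        # gradient and a callable for the Hessian-vector product
        return grad(w), lambda p: A @ p

    w_min, n_iter = _newton_cg(grad_hess, func, grad, x0=np.zeros(2), tol=1e-8)
    # w_min should be close to np.linalg.solve(A, b), here roughly [0.2, 0.4]
    return w_min, n_iter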


def _check_optimize_result(solver, result, max_iter=None, extra_warning_msg=None):
    """Check the OptimizeResult for successful convergence

    Parameters
    ----------
    solver : str
        Solver name. Currently only `lbfgs` is supported.

    result : OptimizeResult
        Result of the scipy.optimize.minimize function.

    max_iter : int, default=None
        Expected maximum number of iterations.

    extra_warning_msg : str, default=None
        Extra warning message.

    Returns
    -------
    n_iter : int
        Number of iterations.
    """
    # handle both scipy and scikit-learn solver names
    if solver == "lbfgs":
        if result.status != 0:
            try:
                # The message is already decoded in scipy>=1.6.0
                result_message = result.message.decode("latin1")
            except AttributeError:
                result_message = result.message
            warning_msg = (
                "{} failed to converge (status={}):\n{}.\n\n"
                "Increase the number of iterations (max_iter) "
                "or scale the data as shown in:\n"
                "    https://scikit-learn.org/stable/modules/"
                "preprocessing.html"
            ).format(solver, result.status, result_message)
            if extra_warning_msg is not None:
                warning_msg += "\n" + extra_warning_msg
            warnings.warn(warning_msg, ConvergenceWarning, stacklevel=2)
        if max_iter is not None:
            # In scipy <= 1.0.0, nit may exceed maxiter for lbfgs.
            # See https://github.com/scipy/scipy/issues/7854
            n_iter_i = min(result.nit, max_iter)
        else:
            n_iter_i = result.nit
    else:
        raise NotImplementedError

    return n_iter_i
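

# Illustrative usage sketch, not part of the original module: checking the
# result of an L-BFGS run from scipy.optimize.minimize. On this well-behaved
# problem the solver converges, so no ConvergenceWarning is expected and the
# returned iteration count is capped at max_iter. The helper name
# (_example_check_lbfgs) is hypothetical.
def _example_check_lbfgs():
    from scipy.optimize import minimize

    result = minimize(
        fun=lambda w: np.sum((w - 1.0) ** 2),
        x0=np.zeros(3),
        jac=lambda w: 2.0 * (w - 1.0),
        method="L-BFGS-B",
        options={"maxiter": 100},
    )
    n_iter = _check_optimize_result("lbfgs", result, max_iter=100)
    return n_iter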