# _stochastic_gradient.py
  1. # Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com> (main author)
  2. # Mathieu Blondel (partial_fit support)
  3. #
  4. # License: BSD 3 clause
  5. """Classification, regression and One-Class SVM using Stochastic Gradient
  6. Descent (SGD).
  7. """
  8. import warnings
  9. from abc import ABCMeta, abstractmethod
  10. from numbers import Integral, Real
  11. import numpy as np
  12. from ..base import (
  13. BaseEstimator,
  14. OutlierMixin,
  15. RegressorMixin,
  16. _fit_context,
  17. clone,
  18. is_classifier,
  19. )
  20. from ..exceptions import ConvergenceWarning
  21. from ..model_selection import ShuffleSplit, StratifiedShuffleSplit
  22. from ..utils import check_random_state, compute_class_weight
  23. from ..utils._param_validation import Hidden, Interval, StrOptions
  24. from ..utils.extmath import safe_sparse_dot
  25. from ..utils.metaestimators import available_if
  26. from ..utils.multiclass import _check_partial_fit_first_call
  27. from ..utils.parallel import Parallel, delayed
  28. from ..utils.validation import _check_sample_weight, check_is_fitted
  29. from ._base import LinearClassifierMixin, SparseCoefMixin, make_dataset
  30. from ._sgd_fast import (
  31. EpsilonInsensitive,
  32. Hinge,
  33. Huber,
  34. Log,
  35. ModifiedHuber,
  36. SquaredEpsilonInsensitive,
  37. SquaredHinge,
  38. SquaredLoss,
  39. _plain_sgd32,
  40. _plain_sgd64,
  41. )
# Integer codes for each learning-rate schedule, as understood by the
# Cython SGD routines (`_plain_sgd32`/`_plain_sgd64`).
LEARNING_RATE_TYPES = {
    "constant": 1,
    "optimal": 2,
    "invscaling": 3,
    "adaptive": 4,
    "pa1": 5,
    "pa2": 6,
}

# Integer codes for each penalty, as understood by the Cython SGD routines.
PENALTY_TYPES = {"none": 0, "l2": 2, "l1": 1, "elasticnet": 3}

DEFAULT_EPSILON = 0.1
# Default value of ``epsilon`` parameter.

# Largest value accepted as a seed by numpy's legacy RandomState on all
# platforms (a C long is 32-bit signed on Windows); see `fit_binary`.
MAX_INT = np.iinfo(np.int32).max
  54. class _ValidationScoreCallback:
  55. """Callback for early stopping based on validation score"""
  56. def __init__(self, estimator, X_val, y_val, sample_weight_val, classes=None):
  57. self.estimator = clone(estimator)
  58. self.estimator.t_ = 1 # to pass check_is_fitted
  59. if classes is not None:
  60. self.estimator.classes_ = classes
  61. self.X_val = X_val
  62. self.y_val = y_val
  63. self.sample_weight_val = sample_weight_val
  64. def __call__(self, coef, intercept):
  65. est = self.estimator
  66. est.coef_ = coef.reshape(1, -1)
  67. est.intercept_ = np.atleast_1d(intercept)
  68. return est.score(self.X_val, self.y_val, self.sample_weight_val)
class BaseSGD(SparseCoefMixin, BaseEstimator, metaclass=ABCMeta):
    """Base class for SGD classification and regression."""

    _parameter_constraints: dict = {
        "fit_intercept": ["boolean"],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "tol": [Interval(Real, 0, None, closed="left"), None],
        "shuffle": ["boolean"],
        "verbose": ["verbose"],
        "random_state": ["random_state"],
        "warm_start": ["boolean"],
        "average": [Interval(Integral, 0, None, closed="left"), bool, np.bool_],
    }

    def __init__(
        self,
        loss,
        *,
        penalty="l2",
        alpha=0.0001,
        C=1.0,
        l1_ratio=0.15,
        fit_intercept=True,
        max_iter=1000,
        tol=1e-3,
        shuffle=True,
        verbose=0,
        epsilon=0.1,
        random_state=None,
        learning_rate="optimal",
        eta0=0.0,
        power_t=0.5,
        early_stopping=False,
        validation_fraction=0.1,
        n_iter_no_change=5,
        warm_start=False,
        average=False,
    ):
        # Store every constructor argument unmodified, per the scikit-learn
        # estimator API; validation happens later in _more_validate_params.
        self.loss = loss
        self.penalty = penalty
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.alpha = alpha
        self.C = C
        self.l1_ratio = l1_ratio
        self.fit_intercept = fit_intercept
        self.shuffle = shuffle
        self.random_state = random_state
        self.verbose = verbose
        self.eta0 = eta0
        self.power_t = power_t
        self.early_stopping = early_stopping
        self.validation_fraction = validation_fraction
        self.n_iter_no_change = n_iter_no_change
        self.warm_start = warm_start
        self.average = average
        self.max_iter = max_iter
        self.tol = tol

    @abstractmethod
    def fit(self, X, y):
        """Fit model."""

    def _more_validate_params(self, for_partial_fit=False):
        """Validate input params."""
        if self.early_stopping and for_partial_fit:
            raise ValueError("early_stopping should be False with partial_fit")
        if (
            self.learning_rate in ("constant", "invscaling", "adaptive")
            and self.eta0 <= 0.0
        ):
            raise ValueError("eta0 must be > 0")
        if self.learning_rate == "optimal" and self.alpha == 0:
            raise ValueError(
                "alpha must be > 0 since "
                "learning_rate is 'optimal'. alpha is used "
                "to compute the optimal learning rate."
            )
        # raises ValueError if not registered
        self._get_penalty_type(self.penalty)
        self._get_learning_rate_type(self.learning_rate)

    def _get_loss_function(self, loss):
        """Get concrete ``LossFunction`` object for str ``loss``."""
        loss_ = self.loss_functions[loss]
        loss_class, args = loss_[0], loss_[1:]
        if loss in ("huber", "epsilon_insensitive", "squared_epsilon_insensitive"):
            # epsilon-parameterized losses take the user-supplied epsilon
            # instead of the table default.
            args = (self.epsilon,)
        return loss_class(*args)

    def _get_learning_rate_type(self, learning_rate):
        # Map schedule name -> integer code; raises KeyError if unknown.
        return LEARNING_RATE_TYPES[learning_rate]

    def _get_penalty_type(self, penalty):
        # Map penalty name (case-insensitive) -> integer code; raises
        # KeyError if unknown.
        penalty = str(penalty).lower()
        return PENALTY_TYPES[penalty]

    def _allocate_parameter_mem(
        self,
        n_classes,
        n_features,
        input_dtype,
        coef_init=None,
        intercept_init=None,
        one_class=0,
    ):
        """Allocate mem for parameters; initialize if provided."""
        if n_classes > 2:
            # allocate coef_ for multi-class
            if coef_init is not None:
                coef_init = np.asarray(coef_init, dtype=input_dtype, order="C")
                if coef_init.shape != (n_classes, n_features):
                    raise ValueError("Provided ``coef_`` does not match dataset. ")
                self.coef_ = coef_init
            else:
                self.coef_ = np.zeros(
                    (n_classes, n_features), dtype=input_dtype, order="C"
                )
            # allocate intercept_ for multi-class
            if intercept_init is not None:
                intercept_init = np.asarray(
                    intercept_init, order="C", dtype=input_dtype
                )
                if intercept_init.shape != (n_classes,):
                    raise ValueError("Provided intercept_init does not match dataset.")
                self.intercept_ = intercept_init
            else:
                self.intercept_ = np.zeros(n_classes, dtype=input_dtype, order="C")
        else:
            # allocate coef_
            if coef_init is not None:
                coef_init = np.asarray(coef_init, dtype=input_dtype, order="C")
                coef_init = coef_init.ravel()
                if coef_init.shape != (n_features,):
                    raise ValueError("Provided coef_init does not match dataset.")
                self.coef_ = coef_init
            else:
                self.coef_ = np.zeros(n_features, dtype=input_dtype, order="C")
            # allocate intercept_
            if intercept_init is not None:
                intercept_init = np.asarray(intercept_init, dtype=input_dtype)
                if intercept_init.shape != (1,) and intercept_init.shape != ():
                    raise ValueError("Provided intercept_init does not match dataset.")
                if one_class:
                    # One-Class SVM stores its bias in ``offset_`` rather
                    # than ``intercept_``.
                    self.offset_ = intercept_init.reshape(
                        1,
                    )
                else:
                    self.intercept_ = intercept_init.reshape(
                        1,
                    )
            else:
                if one_class:
                    self.offset_ = np.zeros(1, dtype=input_dtype, order="C")
                else:
                    self.intercept_ = np.zeros(1, dtype=input_dtype, order="C")
        # initialize average parameters
        if self.average > 0:
            self._standard_coef = self.coef_
            self._average_coef = np.zeros(
                self.coef_.shape, dtype=input_dtype, order="C"
            )
            if one_class:
                self._standard_intercept = 1 - self.offset_
            else:
                self._standard_intercept = self.intercept_
            self._average_intercept = np.zeros(
                self._standard_intercept.shape, dtype=input_dtype, order="C"
            )

    def _make_validation_split(self, y, sample_mask):
        """Split the dataset between training set and validation set.

        Parameters
        ----------
        y : ndarray of shape (n_samples, )
            Target values.

        sample_mask : ndarray of shape (n_samples, )
            A boolean array indicating whether each sample should be included
            for validation set.

        Returns
        -------
        validation_mask : ndarray of shape (n_samples, )
            Equal to True on the validation set, False on the training set.
        """
        n_samples = y.shape[0]
        validation_mask = np.zeros(n_samples, dtype=np.bool_)
        if not self.early_stopping:
            # use the full set for training, with an empty validation set
            return validation_mask
        if is_classifier(self):
            # stratified split so class proportions are preserved
            splitter_type = StratifiedShuffleSplit
        else:
            splitter_type = ShuffleSplit
        cv = splitter_type(
            test_size=self.validation_fraction, random_state=self.random_state
        )
        idx_train, idx_val = next(cv.split(np.zeros(shape=(y.shape[0], 1)), y))
        if not np.any(sample_mask[idx_val]):
            raise ValueError(
                "The sample weights for validation set are all zero, consider using a"
                " different random state."
            )
        if idx_train.shape[0] == 0 or idx_val.shape[0] == 0:
            raise ValueError(
                "Splitting %d samples into a train set and a validation set "
                "with validation_fraction=%r led to an empty set (%d and %d "
                "samples). Please either change validation_fraction, increase "
                "number of samples, or disable early_stopping."
                % (
                    n_samples,
                    self.validation_fraction,
                    idx_train.shape[0],
                    idx_val.shape[0],
                )
            )
        validation_mask[idx_val] = True
        return validation_mask

    def _make_validation_score_cb(
        self, validation_mask, X, y, sample_weight, classes=None
    ):
        # Build the early-stopping score callback over the validation subset;
        # returns None when early stopping is disabled.
        if not self.early_stopping:
            return None
        return _ValidationScoreCallback(
            self,
            X[validation_mask],
            y[validation_mask],
            sample_weight[validation_mask],
            classes=classes,
        )
  289. def _prepare_fit_binary(est, y, i, input_dtye):
  290. """Initialization for fit_binary.
  291. Returns y, coef, intercept, average_coef, average_intercept.
  292. """
  293. y_i = np.ones(y.shape, dtype=input_dtye, order="C")
  294. y_i[y != est.classes_[i]] = -1.0
  295. average_intercept = 0
  296. average_coef = None
  297. if len(est.classes_) == 2:
  298. if not est.average:
  299. coef = est.coef_.ravel()
  300. intercept = est.intercept_[0]
  301. else:
  302. coef = est._standard_coef.ravel()
  303. intercept = est._standard_intercept[0]
  304. average_coef = est._average_coef.ravel()
  305. average_intercept = est._average_intercept[0]
  306. else:
  307. if not est.average:
  308. coef = est.coef_[i]
  309. intercept = est.intercept_[i]
  310. else:
  311. coef = est._standard_coef[i]
  312. intercept = est._standard_intercept[i]
  313. average_coef = est._average_coef[i]
  314. average_intercept = est._average_intercept[i]
  315. return y_i, coef, intercept, average_coef, average_intercept
def fit_binary(
    est,
    i,
    X,
    y,
    alpha,
    C,
    learning_rate,
    max_iter,
    pos_weight,
    neg_weight,
    sample_weight,
    validation_mask=None,
    random_state=None,
):
    """Fit a single binary classifier.

    The i'th class is considered the "positive" class.

    Parameters
    ----------
    est : Estimator object
        The estimator to fit
    i : int
        Index of the positive class
    X : numpy array or sparse matrix of shape [n_samples,n_features]
        Training data
    y : numpy array of shape [n_samples, ]
        Target values
    alpha : float
        The regularization parameter
    C : float
        Maximum step size for passive aggressive
    learning_rate : str
        The learning rate. Accepted values are 'constant', 'optimal',
        'invscaling', 'pa1' and 'pa2'.
    max_iter : int
        The maximum number of iterations (epochs)
    pos_weight : float
        The weight of the positive class
    neg_weight : float
        The weight of the negative class
    sample_weight : numpy array of shape [n_samples, ]
        The weight of each sample
    validation_mask : numpy array of shape [n_samples, ], default=None
        Precomputed validation mask in case _fit_binary is called in the
        context of a one-vs-rest reduction.
    random_state : int, RandomState instance, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    coef, intercept, n_iter_ :
        The fitted weight vector, the fitted intercept, and the number of
        epochs actually run by the SGD routine.
    """
    # if average is not true, average_coef, and average_intercept will be
    # unused
    # NOTE(review): ``input_dtye`` is a pre-existing typo in the helper's
    # keyword; kept as-is so this call keeps matching its signature.
    y_i, coef, intercept, average_coef, average_intercept = _prepare_fit_binary(
        est, y, i, input_dtye=X.dtype
    )
    assert y_i.shape[0] == y.shape[0] == sample_weight.shape[0]
    random_state = check_random_state(random_state)
    dataset, intercept_decay = make_dataset(
        X, y_i, sample_weight, random_state=random_state
    )
    penalty_type = est._get_penalty_type(est.penalty)
    learning_rate_type = est._get_learning_rate_type(learning_rate)
    if validation_mask is None:
        # only samples with a positive weight may land in the validation set
        validation_mask = est._make_validation_split(y_i, sample_mask=sample_weight > 0)
    classes = np.array([-1, 1], dtype=y_i.dtype)
    validation_score_cb = est._make_validation_score_cb(
        validation_mask, X, y_i, sample_weight, classes=classes
    )
    # numpy mtrand expects a C long which is a signed 32 bit integer under
    # Windows
    seed = random_state.randint(MAX_INT)
    tol = est.tol if est.tol is not None else -np.inf
    # Dispatch to the 32- or 64-bit Cython routine matching the coef dtype.
    _plain_sgd = _get_plain_sgd_function(input_dtype=coef.dtype)
    coef, intercept, average_coef, average_intercept, n_iter_ = _plain_sgd(
        coef,
        intercept,
        average_coef,
        average_intercept,
        est.loss_function_,
        penalty_type,
        alpha,
        C,
        est.l1_ratio,
        dataset,
        validation_mask,
        est.early_stopping,
        validation_score_cb,
        int(est.n_iter_no_change),
        max_iter,
        tol,
        int(est.fit_intercept),
        int(est.verbose),
        int(est.shuffle),
        seed,
        pos_weight,
        neg_weight,
        learning_rate_type,
        est.eta0,
        est.power_t,
        0,
        est.t_,
        intercept_decay,
        est.average,
    )
    if est.average:
        # Write the averaged intercept back onto the estimator.
        # NOTE(review): the averaged coefficients appear to be updated in
        # place through the views handed out by _prepare_fit_binary — the
        # multiclass caller never writes coef back — confirm against the
        # Cython implementation.
        if len(est.classes_) == 2:
            est._average_intercept[0] = average_intercept
        else:
            est._average_intercept[i] = average_intercept
    return coef, intercept, n_iter_
  427. def _get_plain_sgd_function(input_dtype):
  428. return _plain_sgd32 if input_dtype == np.float32 else _plain_sgd64
  429. class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta):
    # Map from loss name to (LossFunction class, *default constructor args).
    # Epsilon-parameterized entries are overridden with the user-supplied
    # epsilon in BaseSGD._get_loss_function.
    loss_functions = {
        "hinge": (Hinge, 1.0),
        "squared_hinge": (SquaredHinge, 1.0),
        "perceptron": (Hinge, 0.0),
        "log_loss": (Log,),
        "modified_huber": (ModifiedHuber,),
        "squared_error": (SquaredLoss,),
        "huber": (Huber, DEFAULT_EPSILON),
        "epsilon_insensitive": (EpsilonInsensitive, DEFAULT_EPSILON),
        "squared_epsilon_insensitive": (SquaredEpsilonInsensitive, DEFAULT_EPSILON),
    }

    # Extend the shared BaseSGD constraints with classifier-only parameters.
    _parameter_constraints: dict = {
        **BaseSGD._parameter_constraints,
        "loss": [StrOptions(set(loss_functions))],
        "early_stopping": ["boolean"],
        "validation_fraction": [Interval(Real, 0, 1, closed="neither")],
        "n_iter_no_change": [Interval(Integral, 1, None, closed="left")],
        "n_jobs": [Integral, None],
        "class_weight": [StrOptions({"balanced"}), dict, None],
    }
  450. @abstractmethod
  451. def __init__(
  452. self,
  453. loss="hinge",
  454. *,
  455. penalty="l2",
  456. alpha=0.0001,
  457. l1_ratio=0.15,
  458. fit_intercept=True,
  459. max_iter=1000,
  460. tol=1e-3,
  461. shuffle=True,
  462. verbose=0,
  463. epsilon=DEFAULT_EPSILON,
  464. n_jobs=None,
  465. random_state=None,
  466. learning_rate="optimal",
  467. eta0=0.0,
  468. power_t=0.5,
  469. early_stopping=False,
  470. validation_fraction=0.1,
  471. n_iter_no_change=5,
  472. class_weight=None,
  473. warm_start=False,
  474. average=False,
  475. ):
  476. super().__init__(
  477. loss=loss,
  478. penalty=penalty,
  479. alpha=alpha,
  480. l1_ratio=l1_ratio,
  481. fit_intercept=fit_intercept,
  482. max_iter=max_iter,
  483. tol=tol,
  484. shuffle=shuffle,
  485. verbose=verbose,
  486. epsilon=epsilon,
  487. random_state=random_state,
  488. learning_rate=learning_rate,
  489. eta0=eta0,
  490. power_t=power_t,
  491. early_stopping=early_stopping,
  492. validation_fraction=validation_fraction,
  493. n_iter_no_change=n_iter_no_change,
  494. warm_start=warm_start,
  495. average=average,
  496. )
  497. self.class_weight = class_weight
  498. self.n_jobs = n_jobs
    def _partial_fit(
        self,
        X,
        y,
        alpha,
        C,
        loss,
        learning_rate,
        max_iter,
        classes,
        sample_weight,
        coef_init,
        intercept_init,
    ):
        """Shared implementation behind ``fit`` and ``partial_fit``.

        Validates the input, (re)allocates ``coef_``/``intercept_`` on the
        first call (or when explicit initial values are given), then runs
        the binary or one-vs-all multiclass training loop for ``max_iter``
        epochs.
        """
        first_call = not hasattr(self, "classes_")
        X, y = self._validate_data(
            X,
            y,
            accept_sparse="csr",
            dtype=[np.float64, np.float32],
            order="C",
            accept_large_sparse=False,
            reset=first_call,
        )
        n_samples, n_features = X.shape
        # Validates ``classes`` against any previous partial_fit call.
        _check_partial_fit_first_call(self, classes)
        n_classes = self.classes_.shape[0]
        # Allocate datastructures from input arguments
        self._expanded_class_weight = compute_class_weight(
            self.class_weight, classes=self.classes_, y=y
        )
        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
        if getattr(self, "coef_", None) is None or coef_init is not None:
            self._allocate_parameter_mem(
                n_classes=n_classes,
                n_features=n_features,
                input_dtype=X.dtype,
                coef_init=coef_init,
                intercept_init=intercept_init,
            )
        elif n_features != self.coef_.shape[-1]:
            raise ValueError(
                "Number of features %d does not match previous data %d."
                % (n_features, self.coef_.shape[-1])
            )
        self.loss_function_ = self._get_loss_function(loss)
        if not hasattr(self, "t_"):
            # iteration counter used by the learning-rate schedules
            self.t_ = 1.0
        # delegate to concrete training procedure
        if n_classes > 2:
            self._fit_multiclass(
                X,
                y,
                alpha=alpha,
                C=C,
                learning_rate=learning_rate,
                sample_weight=sample_weight,
                max_iter=max_iter,
            )
        elif n_classes == 2:
            self._fit_binary(
                X,
                y,
                alpha=alpha,
                C=C,
                learning_rate=learning_rate,
                sample_weight=sample_weight,
                max_iter=max_iter,
            )
        else:
            raise ValueError(
                "The number of classes has to be greater than one; got %d class"
                % n_classes
            )
        return self
    def _fit(
        self,
        X,
        y,
        alpha,
        C,
        loss,
        learning_rate,
        coef_init=None,
        intercept_init=None,
        sample_weight=None,
    ):
        """Full-fit entry point: reset state, train, and warn on non-convergence."""
        if hasattr(self, "classes_"):
            # delete the attribute otherwise _partial_fit thinks it's not the first call
            delattr(self, "classes_")
        # labels can be encoded as float, int, or string literals
        # np.unique sorts in asc order; largest class id is positive class
        y = self._validate_data(y=y)
        classes = np.unique(y)
        if self.warm_start and hasattr(self, "coef_"):
            # continue from the previous solution unless explicit initial
            # values were provided
            if coef_init is None:
                coef_init = self.coef_
            if intercept_init is None:
                intercept_init = self.intercept_
        else:
            self.coef_ = None
            self.intercept_ = None
        if self.average > 0:
            self._standard_coef = self.coef_
            self._standard_intercept = self.intercept_
            self._average_coef = None
            self._average_intercept = None
        # Clear iteration count for multiple call to fit.
        self.t_ = 1.0
        self._partial_fit(
            X,
            y,
            alpha,
            C,
            loss,
            learning_rate,
            self.max_iter,
            classes,
            sample_weight,
            coef_init,
            intercept_init,
        )
        # Hitting max_iter with a finite tolerance means the stopping
        # criterion was never met.
        if (
            self.tol is not None
            and self.tol > -np.inf
            and self.n_iter_ == self.max_iter
        ):
            warnings.warn(
                (
                    "Maximum number of iteration reached before "
                    "convergence. Consider increasing max_iter to "
                    "improve the fit."
                ),
                ConvergenceWarning,
            )
        return self
    def _fit_binary(self, X, y, alpha, C, sample_weight, learning_rate, max_iter):
        """Fit a binary classifier on X and y."""
        coef, intercept, n_iter_ = fit_binary(
            self,
            1,  # classes_ is sorted; index 1 is the positive class
            X,
            y,
            alpha,
            C,
            learning_rate,
            max_iter,
            self._expanded_class_weight[1],
            self._expanded_class_weight[0],
            sample_weight,
            random_state=self.random_state,
        )
        self.t_ += n_iter_ * X.shape[0]
        self.n_iter_ = n_iter_
        # need to be 2d
        if self.average > 0:
            # ``average`` may be an int delaying when averaging starts;
            # expose the averaged weights only once it has kicked in.
            if self.average <= self.t_ - 1:
                self.coef_ = self._average_coef.reshape(1, -1)
                self.intercept_ = self._average_intercept
            else:
                self.coef_ = self._standard_coef.reshape(1, -1)
                self._standard_intercept = np.atleast_1d(intercept)
                self.intercept_ = self._standard_intercept
        else:
            self.coef_ = coef.reshape(1, -1)
            # intercept is a float, need to convert it to an array of length 1
            self.intercept_ = np.atleast_1d(intercept)
    def _fit_multiclass(self, X, y, alpha, C, learning_rate, sample_weight, max_iter):
        """Fit a multi-class classifier by combining binary classifiers

        Each binary classifier predicts one class versus all others. This
        strategy is called OvA (One versus All) or OvR (One versus Rest).
        """
        # Precompute the validation split using the multiclass labels
        # to ensure proper balancing of the classes.
        validation_mask = self._make_validation_split(y, sample_mask=sample_weight > 0)
        # Use joblib to fit OvA in parallel.
        # Pick the random seed for each job outside of fit_binary to avoid
        # sharing the estimator random state between threads which could lead
        # to non-deterministic behavior
        random_state = check_random_state(self.random_state)
        seeds = random_state.randint(MAX_INT, size=len(self.classes_))
        result = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose, require="sharedmem"
        )(
            delayed(fit_binary)(
                self,
                i,
                X,
                y,
                alpha,
                C,
                learning_rate,
                max_iter,
                self._expanded_class_weight[i],
                1.0,  # negative class keeps unit weight in one-vs-rest
                sample_weight,
                validation_mask=validation_mask,
                random_state=seed,
            )
            for i, seed in enumerate(seeds)
        )
        # take the maximum of n_iter_ over every binary fit
        n_iter_ = 0.0
        for i, (_, intercept, n_iter_i) in enumerate(result):
            self.intercept_[i] = intercept
            n_iter_ = max(n_iter_, n_iter_i)
        self.t_ += n_iter_ * X.shape[0]
        self.n_iter_ = n_iter_
        if self.average > 0:
            # ``average`` may be an int delaying when averaging starts;
            # expose the averaged parameters only once it has kicked in.
            if self.average <= self.t_ - 1.0:
                self.coef_ = self._average_coef
                self.intercept_ = self._average_intercept
            else:
                self.coef_ = self._standard_coef
                self._standard_intercept = np.atleast_1d(self.intercept_)
                self.intercept_ = self._standard_intercept
  715. @_fit_context(prefer_skip_nested_validation=True)
  716. def partial_fit(self, X, y, classes=None, sample_weight=None):
  717. """Perform one epoch of stochastic gradient descent on given samples.
  718. Internally, this method uses ``max_iter = 1``. Therefore, it is not
  719. guaranteed that a minimum of the cost function is reached after calling
  720. it once. Matters such as objective convergence, early stopping, and
  721. learning rate adjustments should be handled by the user.
  722. Parameters
  723. ----------
  724. X : {array-like, sparse matrix}, shape (n_samples, n_features)
  725. Subset of the training data.
  726. y : ndarray of shape (n_samples,)
  727. Subset of the target values.
  728. classes : ndarray of shape (n_classes,), default=None
  729. Classes across all calls to partial_fit.
  730. Can be obtained by via `np.unique(y_all)`, where y_all is the
  731. target vector of the entire dataset.
  732. This argument is required for the first call to partial_fit
  733. and can be omitted in the subsequent calls.
  734. Note that y doesn't need to contain all labels in `classes`.
  735. sample_weight : array-like, shape (n_samples,), default=None
  736. Weights applied to individual samples.
  737. If not provided, uniform weights are assumed.
  738. Returns
  739. -------
  740. self : object
  741. Returns an instance of self.
  742. """
  743. if not hasattr(self, "classes_"):
  744. self._more_validate_params(for_partial_fit=True)
  745. if self.class_weight == "balanced":
  746. raise ValueError(
  747. "class_weight '{0}' is not supported for "
  748. "partial_fit. In order to use 'balanced' weights,"
  749. " use compute_class_weight('{0}', "
  750. "classes=classes, y=y). "
  751. "In place of y you can use a large enough sample "
  752. "of the full training set target to properly "
  753. "estimate the class frequency distributions. "
  754. "Pass the resulting weights as the class_weight "
  755. "parameter.".format(self.class_weight)
  756. )
  757. return self._partial_fit(
  758. X,
  759. y,
  760. alpha=self.alpha,
  761. C=1.0,
  762. loss=self.loss,
  763. learning_rate=self.learning_rate,
  764. max_iter=1,
  765. classes=classes,
  766. sample_weight=sample_weight,
  767. coef_init=None,
  768. intercept_init=None,
  769. )
  770. @_fit_context(prefer_skip_nested_validation=True)
  771. def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None):
  772. """Fit linear model with Stochastic Gradient Descent.
  773. Parameters
  774. ----------
  775. X : {array-like, sparse matrix}, shape (n_samples, n_features)
  776. Training data.
  777. y : ndarray of shape (n_samples,)
  778. Target values.
  779. coef_init : ndarray of shape (n_classes, n_features), default=None
  780. The initial coefficients to warm-start the optimization.
  781. intercept_init : ndarray of shape (n_classes,), default=None
  782. The initial intercept to warm-start the optimization.
  783. sample_weight : array-like, shape (n_samples,), default=None
  784. Weights applied to individual samples.
  785. If not provided, uniform weights are assumed. These weights will
  786. be multiplied with class_weight (passed through the
  787. constructor) if class_weight is specified.
  788. Returns
  789. -------
  790. self : object
  791. Returns an instance of self.
  792. """
  793. self._more_validate_params()
  794. return self._fit(
  795. X,
  796. y,
  797. alpha=self.alpha,
  798. C=1.0,
  799. loss=self.loss,
  800. learning_rate=self.learning_rate,
  801. coef_init=coef_init,
  802. intercept_init=intercept_init,
  803. sample_weight=sample_weight,
  804. )
class SGDClassifier(BaseSGDClassifier):
    """Linear classifiers (SVM, logistic regression, etc.) with SGD training.

    This estimator implements regularized linear models with stochastic
    gradient descent (SGD) learning: the gradient of the loss is estimated
    each sample at a time and the model is updated along the way with a
    decreasing strength schedule (aka learning rate). SGD allows minibatch
    (online/out-of-core) learning via the `partial_fit` method.
    For best results using the default learning rate schedule, the data should
    have zero mean and unit variance.

    This implementation works with data represented as dense or sparse arrays
    of floating point values for the features. The model it fits can be
    controlled with the loss parameter; by default, it fits a linear support
    vector machine (SVM).

    The regularizer is a penalty added to the loss function that shrinks model
    parameters towards the zero vector using either the squared euclidean norm
    L2 or the absolute norm L1 or a combination of both (Elastic Net). If the
    parameter update crosses the 0.0 value because of the regularizer, the
    update is truncated to 0.0 to allow for learning sparse models and achieve
    online feature selection.

    Read more in the :ref:`User Guide <sgd>`.

    Parameters
    ----------
    loss : {'hinge', 'log_loss', 'modified_huber', 'squared_hinge',\
            'perceptron', 'squared_error', 'huber', 'epsilon_insensitive',\
            'squared_epsilon_insensitive'}, default='hinge'
        The loss function to be used.

        - 'hinge' gives a linear SVM.
        - 'log_loss' gives logistic regression, a probabilistic classifier.
        - 'modified_huber' is another smooth loss that brings tolerance to
          outliers as well as probability estimates.
        - 'squared_hinge' is like hinge but is quadratically penalized.
        - 'perceptron' is the linear loss used by the perceptron algorithm.
        - The other losses, 'squared_error', 'huber', 'epsilon_insensitive' and
          'squared_epsilon_insensitive' are designed for regression but can be
          useful in classification as well; see
          :class:`~sklearn.linear_model.SGDRegressor` for a description.

        More details about the losses formulas can be found in the
        :ref:`User Guide <sgd_mathematical_formulation>`.

    penalty : {'l2', 'l1', 'elasticnet', None}, default='l2'
        The penalty (aka regularization term) to be used. Defaults to 'l2'
        which is the standard regularizer for linear SVM models. 'l1' and
        'elasticnet' might bring sparsity to the model (feature selection)
        not achievable with 'l2'. No penalty is added when set to `None`.

    alpha : float, default=0.0001
        Constant that multiplies the regularization term. The higher the
        value, the stronger the regularization. Also used to compute the
        learning rate when `learning_rate` is set to 'optimal'.
        Values must be in the range `[0.0, inf)`.

    l1_ratio : float, default=0.15
        The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
        l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
        Only used if `penalty` is 'elasticnet'.
        Values must be in the range `[0.0, 1.0]`.

    fit_intercept : bool, default=True
        Whether the intercept should be estimated or not. If False, the
        data is assumed to be already centered.

    max_iter : int, default=1000
        The maximum number of passes over the training data (aka epochs).
        It only impacts the behavior in the ``fit`` method, and not the
        :meth:`partial_fit` method.
        Values must be in the range `[1, inf)`.

        .. versionadded:: 0.19

    tol : float or None, default=1e-3
        The stopping criterion. If it is not None, training will stop
        when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive
        epochs.
        Convergence is checked against the training loss or the
        validation loss depending on the `early_stopping` parameter.
        Values must be in the range `[0.0, inf)`.

        .. versionadded:: 0.19

    shuffle : bool, default=True
        Whether or not the training data should be shuffled after each epoch.

    verbose : int, default=0
        The verbosity level.
        Values must be in the range `[0, inf)`.

    epsilon : float, default=0.1
        Epsilon in the epsilon-insensitive loss functions; only if `loss` is
        'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.
        For 'huber', determines the threshold at which it becomes less
        important to get the prediction exactly right.
        For epsilon-insensitive, any differences between the current prediction
        and the correct label are ignored if they are less than this threshold.
        Values must be in the range `[0.0, inf)`.

    n_jobs : int, default=None
        The number of CPUs to use to do the OVA (One Versus All, for
        multi-class problems) computation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    random_state : int, RandomState instance, default=None
        Used for shuffling the data, when ``shuffle`` is set to ``True``.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.
        Integer values must be in the range `[0, 2**32 - 1]`.

    learning_rate : str, default='optimal'
        The learning rate schedule:

        - 'constant': `eta = eta0`
        - 'optimal': `eta = 1.0 / (alpha * (t + t0))`
          where `t0` is chosen by a heuristic proposed by Leon Bottou.
        - 'invscaling': `eta = eta0 / pow(t, power_t)`
        - 'adaptive': `eta = eta0`, as long as the training keeps decreasing.
          Each time n_iter_no_change consecutive epochs fail to decrease the
          training loss by tol or fail to increase validation score by tol if
          `early_stopping` is `True`, the current learning rate is divided by 5.

        .. versionadded:: 0.20
            Added 'adaptive' option

    eta0 : float, default=0.0
        The initial learning rate for the 'constant', 'invscaling' or
        'adaptive' schedules. The default value is 0.0 as eta0 is not used by
        the default schedule 'optimal'.
        Values must be in the range `(0.0, inf)`.

    power_t : float, default=0.5
        The exponent for inverse scaling learning rate [default 0.5].
        Values must be in the range `(-inf, inf)`.

    early_stopping : bool, default=False
        Whether to use early stopping to terminate training when validation
        score is not improving. If set to `True`, it will automatically set aside
        a stratified fraction of training data as validation and terminate
        training when validation score returned by the `score` method is not
        improving by at least tol for n_iter_no_change consecutive epochs.

        .. versionadded:: 0.20
            Added 'early_stopping' option

    validation_fraction : float, default=0.1
        The proportion of training data to set aside as validation set for
        early stopping. Must be between 0 and 1.
        Only used if `early_stopping` is True.
        Values must be in the range `(0.0, 1.0)`.

        .. versionadded:: 0.20
            Added 'validation_fraction' option

    n_iter_no_change : int, default=5
        Number of iterations with no improvement to wait before stopping
        fitting.
        Convergence is checked against the training loss or the
        validation loss depending on the `early_stopping` parameter.
        Integer values must be in the range `[1, max_iter)`.

        .. versionadded:: 0.20
            Added 'n_iter_no_change' option

    class_weight : dict, {class_label: weight} or "balanced", default=None
        Preset for the class_weight fit parameter.

        Weights associated with classes. If not given, all classes
        are supposed to have weight one.

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``.

    warm_start : bool, default=False
        When set to True, reuse the solution of the previous call to fit as
        initialization, otherwise, just erase the previous solution.
        See :term:`the Glossary <warm_start>`.

        Repeatedly calling fit or partial_fit when warm_start is True can
        result in a different solution than when calling fit a single time
        because of the way the data is shuffled.
        If a dynamic learning rate is used, the learning rate is adapted
        depending on the number of samples already seen. Calling ``fit`` resets
        this counter, while ``partial_fit`` will result in increasing the
        existing counter.

    average : bool or int, default=False
        When set to `True`, computes the averaged SGD weights across all
        updates and stores the result in the ``coef_`` attribute. If set to
        an int greater than 1, averaging will begin once the total number of
        samples seen reaches `average`. So ``average=10`` will begin
        averaging after seeing 10 samples.
        Integer values must be in the range `[1, n_samples]`.

    Attributes
    ----------
    coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \
            (n_classes, n_features)
        Weights assigned to the features.

    intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)
        Constants in decision function.

    n_iter_ : int
        The actual number of iterations before reaching the stopping criterion.
        For multiclass fits, it is the maximum over every binary fit.

    loss_function_ : concrete ``LossFunction``

    classes_ : array of shape (n_classes,)

    t_ : int
        Number of weight updates performed during training.
        Same as ``(n_iter_ * n_samples + 1)``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.svm.LinearSVC : Linear support vector classification.
    LogisticRegression : Logistic regression.
    Perceptron : Inherits from SGDClassifier. ``Perceptron()`` is equivalent to
        ``SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant",
        penalty=None)``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import SGDClassifier
    >>> from sklearn.preprocessing import StandardScaler
    >>> from sklearn.pipeline import make_pipeline
    >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    >>> Y = np.array([1, 1, 2, 2])
    >>> # Always scale the input. The most convenient way is to use a pipeline.
    >>> clf = make_pipeline(StandardScaler(),
    ...                     SGDClassifier(max_iter=1000, tol=1e-3))
    >>> clf.fit(X, Y)
    Pipeline(steps=[('standardscaler', StandardScaler()),
                    ('sgdclassifier', SGDClassifier())])
    >>> print(clf.predict([[-0.8, -1]]))
    [1]
    """

    # Parameter-validation constraints; extends the base-class constraints
    # with the ranges/options specific to this public estimator.
    _parameter_constraints: dict = {
        **BaseSGDClassifier._parameter_constraints,
        "penalty": [StrOptions({"l2", "l1", "elasticnet"}), None],
        "alpha": [Interval(Real, 0, None, closed="left")],
        "l1_ratio": [Interval(Real, 0, 1, closed="both")],
        "power_t": [Interval(Real, None, None, closed="neither")],
        "epsilon": [Interval(Real, 0, None, closed="left")],
        "learning_rate": [
            StrOptions({"constant", "optimal", "invscaling", "adaptive"}),
            # "pa1"/"pa2" are accepted but not documented (hidden options).
            Hidden(StrOptions({"pa1", "pa2"})),
        ],
        "eta0": [Interval(Real, 0, None, closed="left")],
    }

    def __init__(
        self,
        loss="hinge",
        *,
        penalty="l2",
        alpha=0.0001,
        l1_ratio=0.15,
        fit_intercept=True,
        max_iter=1000,
        tol=1e-3,
        shuffle=True,
        verbose=0,
        epsilon=DEFAULT_EPSILON,
        n_jobs=None,
        random_state=None,
        learning_rate="optimal",
        eta0=0.0,
        power_t=0.5,
        early_stopping=False,
        validation_fraction=0.1,
        n_iter_no_change=5,
        class_weight=None,
        warm_start=False,
        average=False,
    ):
        # All hyper-parameters are forwarded unchanged to the base class;
        # per scikit-learn convention no validation happens at __init__ time.
        super().__init__(
            loss=loss,
            penalty=penalty,
            alpha=alpha,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            tol=tol,
            shuffle=shuffle,
            verbose=verbose,
            epsilon=epsilon,
            n_jobs=n_jobs,
            random_state=random_state,
            learning_rate=learning_rate,
            eta0=eta0,
            power_t=power_t,
            early_stopping=early_stopping,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            class_weight=class_weight,
            warm_start=warm_start,
            average=average,
        )

    def _check_proba(self):
        """Guard used by ``available_if``: probability estimates exist only
        for the 'log_loss' and 'modified_huber' losses.

        Raising AttributeError (rather than returning False) makes the
        probability methods appear absent for other losses.
        """
        if self.loss not in ("log_loss", "modified_huber"):
            raise AttributeError(
                "probability estimates are not available for loss=%r" % self.loss
            )
        return True

    @available_if(_check_proba)
    def predict_proba(self, X):
        """Probability estimates.

        This method is only available for log loss and modified Huber loss.

        Multiclass probability estimates are derived from binary (one-vs.-rest)
        estimates by simple normalization, as recommended by Zadrozny and
        Elkan.

        Binary probability estimates for loss="modified_huber" are given by
        (clip(decision_function(X), -1, 1) + 1) / 2. For other loss functions
        it is necessary to perform proper probability calibration by wrapping
        the classifier with
        :class:`~sklearn.calibration.CalibratedClassifierCV` instead.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data for prediction.

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Returns the probability of the sample for each class in the model,
            where classes are ordered as they are in `self.classes_`.

        References
        ----------
        Zadrozny and Elkan, "Transforming classifier scores into multiclass
        probability estimates", SIGKDD'02,
        https://dl.acm.org/doi/pdf/10.1145/775047.775151

        The justification for the formula in the loss="modified_huber"
        case is in the appendix B in:
        http://jmlr.csail.mit.edu/papers/volume2/zhang02c/zhang02c.pdf
        """
        check_is_fitted(self)

        if self.loss == "log_loss":
            return self._predict_proba_lr(X)

        elif self.loss == "modified_huber":
            binary = len(self.classes_) == 2
            scores = self.decision_function(X)

            if binary:
                # prob is a view into column 1 of prob2, so the in-place
                # transform below fills prob2 at the same time.
                prob2 = np.ones((scores.shape[0], 2))
                prob = prob2[:, 1]
            else:
                prob = scores

            # (clip(score, -1, 1) + 1) / 2, computed in place.
            np.clip(scores, -1, 1, prob)
            prob += 1.0
            prob /= 2.0

            if binary:
                # P(class 0) = 1 - P(class 1).
                prob2[:, 0] -= prob
                prob = prob2
            else:
                # the above might assign zero to all classes, which doesn't
                # normalize neatly; work around this to produce uniform
                # probabilities
                prob_sum = prob.sum(axis=1)
                all_zero = prob_sum == 0
                if np.any(all_zero):
                    prob[all_zero, :] = 1
                    prob_sum[all_zero] = len(self.classes_)

                # normalize
                prob /= prob_sum.reshape((prob.shape[0], -1))

            return prob

        else:
            # Unreachable via public API: _check_proba already rejects other
            # losses, but keep a defensive error for direct calls.
            raise NotImplementedError(
                "predict_(log_)proba only supported when"
                " loss='log_loss' or loss='modified_huber' "
                "(%r given)"
                % self.loss
            )

    @available_if(_check_proba)
    def predict_log_proba(self, X):
        """Log of probability estimates.

        This method is only available for log loss and modified Huber loss.

        When loss="modified_huber", probability estimates may be hard zeros
        and ones, so taking the logarithm is not possible.

        See ``predict_proba`` for details.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data for prediction.

        Returns
        -------
        T : array-like, shape (n_samples, n_classes)
            Returns the log-probability of the sample for each class in the
            model, where classes are ordered as they are in
            `self.classes_`.
        """
        return np.log(self.predict_proba(X))

    def _more_tags(self):
        # Estimator tags consumed by scikit-learn's common test machinery.
        return {
            "_xfail_checks": {
                "check_sample_weights_invariance": (
                    "zero sample_weight is not equivalent to removing samples"
                ),
            },
            "preserves_dtype": [np.float64, np.float32],
        }
class BaseSGDRegressor(RegressorMixin, BaseSGD):
    """Base class for regression models fitted with plain SGD.

    Concrete subclasses (e.g. ``SGDRegressor``) provide the public
    constructor; this class implements the shared fitting machinery.
    """

    # Map from loss name to (LossFunction class, *constructor args).
    loss_functions = {
        "squared_error": (SquaredLoss,),
        "huber": (Huber, DEFAULT_EPSILON),
        "epsilon_insensitive": (EpsilonInsensitive, DEFAULT_EPSILON),
        "squared_epsilon_insensitive": (SquaredEpsilonInsensitive, DEFAULT_EPSILON),
    }

    _parameter_constraints: dict = {
        **BaseSGD._parameter_constraints,
        "loss": [StrOptions(set(loss_functions))],
        "early_stopping": ["boolean"],
        "validation_fraction": [Interval(Real, 0, 1, closed="neither")],
        "n_iter_no_change": [Interval(Integral, 1, None, closed="left")],
    }

    @abstractmethod
    def __init__(
        self,
        loss="squared_error",
        *,
        penalty="l2",
        alpha=0.0001,
        l1_ratio=0.15,
        fit_intercept=True,
        max_iter=1000,
        tol=1e-3,
        shuffle=True,
        verbose=0,
        epsilon=DEFAULT_EPSILON,
        random_state=None,
        learning_rate="invscaling",
        eta0=0.01,
        power_t=0.25,
        early_stopping=False,
        validation_fraction=0.1,
        n_iter_no_change=5,
        warm_start=False,
        average=False,
    ):
        # Hyper-parameters are forwarded unchanged to BaseSGD; validation
        # is deferred to fit time per scikit-learn convention.
        super().__init__(
            loss=loss,
            penalty=penalty,
            alpha=alpha,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            tol=tol,
            shuffle=shuffle,
            verbose=verbose,
            epsilon=epsilon,
            random_state=random_state,
            learning_rate=learning_rate,
            eta0=eta0,
            power_t=power_t,
            early_stopping=early_stopping,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            warm_start=warm_start,
            average=average,
        )

    def _partial_fit(
        self,
        X,
        y,
        alpha,
        C,
        loss,
        learning_rate,
        max_iter,
        sample_weight,
        coef_init,
        intercept_init,
    ):
        """Validate inputs, allocate parameter buffers on first call, and run
        the SGD optimizer for ``max_iter`` epochs."""
        # First call = no coefficients yet; also resets n_features_in_
        # via reset=first_call below.
        first_call = getattr(self, "coef_", None) is None
        X, y = self._validate_data(
            X,
            y,
            accept_sparse="csr",
            copy=False,
            order="C",
            dtype=[np.float64, np.float32],
            accept_large_sparse=False,
            reset=first_call,
        )
        # Keep y in the same dtype as X for the Cython routine.
        y = y.astype(X.dtype, copy=False)

        n_samples, n_features = X.shape

        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        # Allocate datastructures from input arguments
        if first_call:
            self._allocate_parameter_mem(
                n_classes=1,
                n_features=n_features,
                input_dtype=X.dtype,
                coef_init=coef_init,
                intercept_init=intercept_init,
            )
        # Lazily allocate the averaging buffers the first time averaging
        # is requested.
        if self.average > 0 and getattr(self, "_average_coef", None) is None:
            self._average_coef = np.zeros(n_features, dtype=X.dtype, order="C")
            self._average_intercept = np.zeros(1, dtype=X.dtype, order="C")

        self._fit_regressor(
            X, y, alpha, C, loss, learning_rate, sample_weight, max_iter
        )

        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X, y, sample_weight=None):
        """Perform one epoch of stochastic gradient descent on given samples.

        Internally, this method uses ``max_iter = 1``. Therefore, it is not
        guaranteed that a minimum of the cost function is reached after calling
        it once. Matters such as objective convergence and early stopping
        should be handled by the user.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Subset of training data.

        y : numpy array of shape (n_samples,)
            Subset of target values.

        sample_weight : array-like, shape (n_samples,), default=None
            Weights applied to individual samples.
            If not provided, uniform weights are assumed.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        if not hasattr(self, "coef_"):
            self._more_validate_params(for_partial_fit=True)

        return self._partial_fit(
            X,
            y,
            self.alpha,
            C=1.0,
            loss=self.loss,
            learning_rate=self.learning_rate,
            max_iter=1,
            sample_weight=sample_weight,
            coef_init=None,
            intercept_init=None,
        )

    def _fit(
        self,
        X,
        y,
        alpha,
        C,
        loss,
        learning_rate,
        coef_init=None,
        intercept_init=None,
        sample_weight=None,
    ):
        """Full fit: optionally warm-start from the previous solution, then
        run ``_partial_fit`` for ``self.max_iter`` epochs and warn if the
        iteration budget was exhausted."""
        if self.warm_start and getattr(self, "coef_", None) is not None:
            # Reuse the previous solution unless explicit inits were given.
            if coef_init is None:
                coef_init = self.coef_
            if intercept_init is None:
                intercept_init = self.intercept_
        else:
            self.coef_ = None
            self.intercept_ = None

        # Clear iteration count for multiple call to fit.
        self.t_ = 1.0

        self._partial_fit(
            X,
            y,
            alpha,
            C,
            loss,
            learning_rate,
            self.max_iter,
            sample_weight,
            coef_init,
            intercept_init,
        )

        # Only warn when a stopping criterion was actually in effect
        # (tol=None disables early stopping on the training objective).
        if (
            self.tol is not None
            and self.tol > -np.inf
            and self.n_iter_ == self.max_iter
        ):
            warnings.warn(
                (
                    "Maximum number of iteration reached before "
                    "convergence. Consider increasing max_iter to "
                    "improve the fit."
                ),
                ConvergenceWarning,
            )

        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None):
        """Fit linear model with Stochastic Gradient Descent.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data.

        y : ndarray of shape (n_samples,)
            Target values.

        coef_init : ndarray of shape (n_features,), default=None
            The initial coefficients to warm-start the optimization.

        intercept_init : ndarray of shape (1,), default=None
            The initial intercept to warm-start the optimization.

        sample_weight : array-like, shape (n_samples,), default=None
            Weights applied to individual samples (1. for unweighted).

        Returns
        -------
        self : object
            Fitted `SGDRegressor` estimator.
        """
        self._more_validate_params()

        return self._fit(
            X,
            y,
            alpha=self.alpha,
            C=1.0,
            loss=self.loss,
            learning_rate=self.learning_rate,
            coef_init=coef_init,
            intercept_init=intercept_init,
            sample_weight=sample_weight,
        )

    def _decision_function(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted target values per element in X.
        """
        check_is_fitted(self)

        X = self._validate_data(X, accept_sparse="csr", reset=False)

        # X @ coef.T + intercept, works for dense and sparse X alike.
        scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_
        return scores.ravel()

    def predict(self, X):
        """Predict using the linear model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data.

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted target values per element in X.
        """
        return self._decision_function(X)

    def _fit_regressor(
        self, X, y, alpha, C, loss, learning_rate, sample_weight, max_iter
    ):
        """Run the Cython plain-SGD optimizer and store the fitted weights."""
        loss_function = self._get_loss_function(loss)
        penalty_type = self._get_penalty_type(self.penalty)
        learning_rate_type = self._get_learning_rate_type(learning_rate)

        if not hasattr(self, "t_"):
            self.t_ = 1.0

        validation_mask = self._make_validation_split(y, sample_mask=sample_weight > 0)
        validation_score_cb = self._make_validation_score_cb(
            validation_mask, X, y, sample_weight
        )

        random_state = check_random_state(self.random_state)
        # numpy mtrand expects a C long which is a signed 32 bit integer under
        # Windows
        seed = random_state.randint(0, MAX_INT)

        dataset, intercept_decay = make_dataset(
            X, y, sample_weight, random_state=random_state
        )

        # tol=None means "never stop early on the objective".
        tol = self.tol if self.tol is not None else -np.inf

        if self.average:
            # Optimizer updates the standard buffers in place and maintains
            # the averaged buffers alongside them.
            coef = self._standard_coef
            intercept = self._standard_intercept
            average_coef = self._average_coef
            average_intercept = self._average_intercept
        else:
            coef = self.coef_
            intercept = self.intercept_
            average_coef = None  # Not used
            average_intercept = [0]  # Not used

        _plain_sgd = _get_plain_sgd_function(input_dtype=coef.dtype)
        coef, intercept, average_coef, average_intercept, self.n_iter_ = _plain_sgd(
            coef,
            intercept[0],
            average_coef,
            average_intercept[0],
            loss_function,
            penalty_type,
            alpha,
            C,
            self.l1_ratio,
            dataset,
            validation_mask,
            self.early_stopping,
            validation_score_cb,
            int(self.n_iter_no_change),
            max_iter,
            tol,
            int(self.fit_intercept),
            int(self.verbose),
            int(self.shuffle),
            seed,
            1.0,  # NOTE(review): positional args shared with the classifier
            1.0,  # path — presumably pos/neg class weights; confirm signature
            learning_rate_type,
            self.eta0,
            self.power_t,
            0,  # NOTE(review): presumably a one-class/SVM flag; confirm
            self.t_,
            intercept_decay,
            self.average,
        )

        # t_ counts weight updates: one update per sample per epoch.
        self.t_ += self.n_iter_ * X.shape[0]

        if self.average > 0:
            self._average_intercept = np.atleast_1d(average_intercept)
            self._standard_intercept = np.atleast_1d(intercept)

            if self.average <= self.t_ - 1.0:
                # made enough updates for averaging to be taken into account
                self.coef_ = average_coef
                self.intercept_ = np.atleast_1d(average_intercept)
            else:
                self.coef_ = coef
                self.intercept_ = np.atleast_1d(intercept)

        else:
            # intercept comes back as a scalar; expose it as a length-1 array.
            self.intercept_ = np.atleast_1d(intercept)
  1492. class SGDRegressor(BaseSGDRegressor):
  1493. """Linear model fitted by minimizing a regularized empirical loss with SGD.
  1494. SGD stands for Stochastic Gradient Descent: the gradient of the loss is
  1495. estimated each sample at a time and the model is updated along the way with
  1496. a decreasing strength schedule (aka learning rate).
  1497. The regularizer is a penalty added to the loss function that shrinks model
  1498. parameters towards the zero vector using either the squared euclidean norm
  1499. L2 or the absolute norm L1 or a combination of both (Elastic Net). If the
  1500. parameter update crosses the 0.0 value because of the regularizer, the
  1501. update is truncated to 0.0 to allow for learning sparse models and achieve
  1502. online feature selection.
  1503. This implementation works with data represented as dense numpy arrays of
  1504. floating point values for the features.
  1505. Read more in the :ref:`User Guide <sgd>`.
  1506. Parameters
  1507. ----------
  1508. loss : str, default='squared_error'
  1509. The loss function to be used. The possible values are 'squared_error',
  1510. 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'
  1511. The 'squared_error' refers to the ordinary least squares fit.
  1512. 'huber' modifies 'squared_error' to focus less on getting outliers
  1513. correct by switching from squared to linear loss past a distance of
  1514. epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is
  1515. linear past that; this is the loss function used in SVR.
  1516. 'squared_epsilon_insensitive' is the same but becomes squared loss past
  1517. a tolerance of epsilon.
  1518. More details about the losses formulas can be found in the
  1519. :ref:`User Guide <sgd_mathematical_formulation>`.
  1520. penalty : {'l2', 'l1', 'elasticnet', None}, default='l2'
  1521. The penalty (aka regularization term) to be used. Defaults to 'l2'
  1522. which is the standard regularizer for linear SVM models. 'l1' and
  1523. 'elasticnet' might bring sparsity to the model (feature selection)
  1524. not achievable with 'l2'. No penalty is added when set to `None`.
  1525. alpha : float, default=0.0001
  1526. Constant that multiplies the regularization term. The higher the
  1527. value, the stronger the regularization.
  1528. Also used to compute the learning rate when set to `learning_rate` is
  1529. set to 'optimal'.
  1530. l1_ratio : float, default=0.15
  1531. The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
  1532. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
  1533. Only used if `penalty` is 'elasticnet'.
  1534. fit_intercept : bool, default=True
  1535. Whether the intercept should be estimated or not. If False, the
  1536. data is assumed to be already centered.
  1537. max_iter : int, default=1000
  1538. The maximum number of passes over the training data (aka epochs).
  1539. It only impacts the behavior in the ``fit`` method, and not the
  1540. :meth:`partial_fit` method.
  1541. .. versionadded:: 0.19
  1542. tol : float or None, default=1e-3
  1543. The stopping criterion. If it is not None, training will stop
  1544. when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive
  1545. epochs.
  1546. Convergence is checked against the training loss or the
  1547. validation loss depending on the `early_stopping` parameter.
  1548. .. versionadded:: 0.19
  1549. shuffle : bool, default=True
  1550. Whether or not the training data should be shuffled after each epoch.
  1551. verbose : int, default=0
  1552. The verbosity level.
  1553. epsilon : float, default=0.1
  1554. Epsilon in the epsilon-insensitive loss functions; only if `loss` is
  1555. 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.
  1556. For 'huber', determines the threshold at which it becomes less
  1557. important to get the prediction exactly right.
  1558. For epsilon-insensitive, any differences between the current prediction
  1559. and the correct label are ignored if they are less than this threshold.
  1560. random_state : int, RandomState instance, default=None
  1561. Used for shuffling the data, when ``shuffle`` is set to ``True``.
  1562. Pass an int for reproducible output across multiple function calls.
  1563. See :term:`Glossary <random_state>`.
  1564. learning_rate : str, default='invscaling'
  1565. The learning rate schedule:
  1566. - 'constant': `eta = eta0`
  1567. - 'optimal': `eta = 1.0 / (alpha * (t + t0))`
  1568. where t0 is chosen by a heuristic proposed by Leon Bottou.
  1569. - 'invscaling': `eta = eta0 / pow(t, power_t)`
  1570. - 'adaptive': eta = eta0, as long as the training keeps decreasing.
  1571. Each time n_iter_no_change consecutive epochs fail to decrease the
  1572. training loss by tol or fail to increase validation score by tol if
  1573. early_stopping is True, the current learning rate is divided by 5.
  1574. .. versionadded:: 0.20
  1575. Added 'adaptive' option
  1576. eta0 : float, default=0.01
  1577. The initial learning rate for the 'constant', 'invscaling' or
  1578. 'adaptive' schedules. The default value is 0.01.
  1579. power_t : float, default=0.25
  1580. The exponent for inverse scaling learning rate.
  1581. early_stopping : bool, default=False
  1582. Whether to use early stopping to terminate training when validation
  1583. score is not improving. If set to True, it will automatically set aside
  1584. a fraction of training data as validation and terminate
  1585. training when validation score returned by the `score` method is not
  1586. improving by at least `tol` for `n_iter_no_change` consecutive
  1587. epochs.
  1588. .. versionadded:: 0.20
  1589. Added 'early_stopping' option
  1590. validation_fraction : float, default=0.1
  1591. The proportion of training data to set aside as validation set for
  1592. early stopping. Must be between 0 and 1.
  1593. Only used if `early_stopping` is True.
  1594. .. versionadded:: 0.20
  1595. Added 'validation_fraction' option
  1596. n_iter_no_change : int, default=5
  1597. Number of iterations with no improvement to wait before stopping
  1598. fitting.
  1599. Convergence is checked against the training loss or the
  1600. validation loss depending on the `early_stopping` parameter.
  1601. .. versionadded:: 0.20
  1602. Added 'n_iter_no_change' option
  1603. warm_start : bool, default=False
  1604. When set to True, reuse the solution of the previous call to fit as
  1605. initialization, otherwise, just erase the previous solution.
  1606. See :term:`the Glossary <warm_start>`.
  1607. Repeatedly calling fit or partial_fit when warm_start is True can
  1608. result in a different solution than when calling fit a single time
  1609. because of the way the data is shuffled.
  1610. If a dynamic learning rate is used, the learning rate is adapted
  1611. depending on the number of samples already seen. Calling ``fit`` resets
  1612. this counter, while ``partial_fit`` will result in increasing the
  1613. existing counter.
  1614. average : bool or int, default=False
  1615. When set to True, computes the averaged SGD weights across all
  1616. updates and stores the result in the ``coef_`` attribute. If set to
  1617. an int greater than 1, averaging will begin once the total number of
  1618. samples seen reaches `average`. So ``average=10`` will begin
  1619. averaging after seeing 10 samples.
  1620. Attributes
  1621. ----------
  1622. coef_ : ndarray of shape (n_features,)
  1623. Weights assigned to the features.
  1624. intercept_ : ndarray of shape (1,)
  1625. The intercept term.
  1626. n_iter_ : int
  1627. The actual number of iterations before reaching the stopping criterion.
  1628. t_ : int
  1629. Number of weight updates performed during training.
  1630. Same as ``(n_iter_ * n_samples + 1)``.
  1631. n_features_in_ : int
  1632. Number of features seen during :term:`fit`.
  1633. .. versionadded:: 0.24
  1634. feature_names_in_ : ndarray of shape (`n_features_in_`,)
  1635. Names of features seen during :term:`fit`. Defined only when `X`
  1636. has feature names that are all strings.
  1637. .. versionadded:: 1.0
  1638. See Also
  1639. --------
  1640. HuberRegressor : Linear regression model that is robust to outliers.
  1641. Lars : Least Angle Regression model.
  1642. Lasso : Linear Model trained with L1 prior as regularizer.
  1643. RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.
  1644. Ridge : Linear least squares with l2 regularization.
  1645. sklearn.svm.SVR : Epsilon-Support Vector Regression.
  1646. TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.
  1647. Examples
  1648. --------
  1649. >>> import numpy as np
  1650. >>> from sklearn.linear_model import SGDRegressor
  1651. >>> from sklearn.pipeline import make_pipeline
  1652. >>> from sklearn.preprocessing import StandardScaler
  1653. >>> n_samples, n_features = 10, 5
  1654. >>> rng = np.random.RandomState(0)
  1655. >>> y = rng.randn(n_samples)
  1656. >>> X = rng.randn(n_samples, n_features)
  1657. >>> # Always scale the input. The most convenient way is to use a pipeline.
  1658. >>> reg = make_pipeline(StandardScaler(),
  1659. ... SGDRegressor(max_iter=1000, tol=1e-3))
  1660. >>> reg.fit(X, y)
  1661. Pipeline(steps=[('standardscaler', StandardScaler()),
  1662. ('sgdregressor', SGDRegressor())])
  1663. """
  1664. _parameter_constraints: dict = {
  1665. **BaseSGDRegressor._parameter_constraints,
  1666. "penalty": [StrOptions({"l2", "l1", "elasticnet"}), None],
  1667. "alpha": [Interval(Real, 0, None, closed="left")],
  1668. "l1_ratio": [Interval(Real, 0, 1, closed="both")],
  1669. "power_t": [Interval(Real, None, None, closed="neither")],
  1670. "learning_rate": [
  1671. StrOptions({"constant", "optimal", "invscaling", "adaptive"}),
  1672. Hidden(StrOptions({"pa1", "pa2"})),
  1673. ],
  1674. "epsilon": [Interval(Real, 0, None, closed="left")],
  1675. "eta0": [Interval(Real, 0, None, closed="left")],
  1676. }
  1677. def __init__(
  1678. self,
  1679. loss="squared_error",
  1680. *,
  1681. penalty="l2",
  1682. alpha=0.0001,
  1683. l1_ratio=0.15,
  1684. fit_intercept=True,
  1685. max_iter=1000,
  1686. tol=1e-3,
  1687. shuffle=True,
  1688. verbose=0,
  1689. epsilon=DEFAULT_EPSILON,
  1690. random_state=None,
  1691. learning_rate="invscaling",
  1692. eta0=0.01,
  1693. power_t=0.25,
  1694. early_stopping=False,
  1695. validation_fraction=0.1,
  1696. n_iter_no_change=5,
  1697. warm_start=False,
  1698. average=False,
  1699. ):
  1700. super().__init__(
  1701. loss=loss,
  1702. penalty=penalty,
  1703. alpha=alpha,
  1704. l1_ratio=l1_ratio,
  1705. fit_intercept=fit_intercept,
  1706. max_iter=max_iter,
  1707. tol=tol,
  1708. shuffle=shuffle,
  1709. verbose=verbose,
  1710. epsilon=epsilon,
  1711. random_state=random_state,
  1712. learning_rate=learning_rate,
  1713. eta0=eta0,
  1714. power_t=power_t,
  1715. early_stopping=early_stopping,
  1716. validation_fraction=validation_fraction,
  1717. n_iter_no_change=n_iter_no_change,
  1718. warm_start=warm_start,
  1719. average=average,
  1720. )
  1721. def _more_tags(self):
  1722. return {
  1723. "_xfail_checks": {
  1724. "check_sample_weights_invariance": (
  1725. "zero sample_weight is not equivalent to removing samples"
  1726. ),
  1727. },
  1728. "preserves_dtype": [np.float64, np.float32],
  1729. }
class SGDOneClassSVM(BaseSGD, OutlierMixin):
    """Solves linear One-Class SVM using Stochastic Gradient Descent.

    This implementation is meant to be used with a kernel approximation
    technique (e.g. `sklearn.kernel_approximation.Nystroem`) to obtain results
    similar to `sklearn.svm.OneClassSVM` which uses a Gaussian kernel by
    default.

    Read more in the :ref:`User Guide <sgd_online_one_class_svm>`.

    .. versionadded:: 1.0

    Parameters
    ----------
    nu : float, default=0.5
        The nu parameter of the One Class SVM: an upper bound on the
        fraction of training errors and a lower bound of the fraction of
        support vectors. Should be in the interval (0, 1]. By default 0.5
        will be taken.

    fit_intercept : bool, default=True
        Whether the intercept should be estimated or not. Defaults to True.

    max_iter : int, default=1000
        The maximum number of passes over the training data (aka epochs).
        It only impacts the behavior in the ``fit`` method, and not the
        `partial_fit`. Defaults to 1000.

    tol : float or None, default=1e-3
        The stopping criterion. If it is not None, the iterations will stop
        when (loss > previous_loss - tol). Defaults to 1e-3.

    shuffle : bool, default=True
        Whether or not the training data should be shuffled after each epoch.
        Defaults to True.

    verbose : int, default=0
        The verbosity level.

    random_state : int, RandomState instance or None, default=None
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by `np.random`.

    learning_rate : {'constant', 'optimal', 'invscaling', 'adaptive'}, default='optimal'
        The learning rate schedule to use with `fit`. (If using `partial_fit`,
        learning rate must be controlled directly).

        - 'constant': `eta = eta0`
        - 'optimal': `eta = 1.0 / (alpha * (t + t0))`
          where t0 is chosen by a heuristic proposed by Leon Bottou.
        - 'invscaling': `eta = eta0 / pow(t, power_t)`
        - 'adaptive': eta = eta0, as long as the training keeps decreasing.
          Each time n_iter_no_change consecutive epochs fail to decrease the
          training loss by tol or fail to increase validation score by tol if
          early_stopping is True, the current learning rate is divided by 5.

    eta0 : float, default=0.0
        The initial learning rate for the 'constant', 'invscaling' or
        'adaptive' schedules. The default value is 0.0 as eta0 is not used by
        the default schedule 'optimal'.

    power_t : float, default=0.5
        The exponent for inverse scaling learning rate [default 0.5].

    warm_start : bool, default=False
        When set to True, reuse the solution of the previous call to fit as
        initialization, otherwise, just erase the previous solution.
        See :term:`the Glossary <warm_start>`.

        Repeatedly calling fit or partial_fit when warm_start is True can
        result in a different solution than when calling fit a single time
        because of the way the data is shuffled.
        If a dynamic learning rate is used, the learning rate is adapted
        depending on the number of samples already seen. Calling ``fit`` resets
        this counter, while ``partial_fit`` will result in increasing the
        existing counter.

    average : bool or int, default=False
        When set to True, computes the averaged SGD weights and stores the
        result in the ``coef_`` attribute. If set to an int greater than 1,
        averaging will begin once the total number of samples seen reaches
        average. So ``average=10`` will begin averaging after seeing 10
        samples.

    Attributes
    ----------
    coef_ : ndarray of shape (1, n_features)
        Weights assigned to the features.

    offset_ : ndarray of shape (1,)
        Offset used to define the decision function from the raw scores.
        We have the relation: decision_function = score_samples - offset.

    n_iter_ : int
        The actual number of iterations to reach the stopping criterion.

    t_ : int
        Number of weight updates performed during training.
        Same as ``(n_iter_ * n_samples + 1)``.

    loss_function_ : concrete ``LossFunction``

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.svm.OneClassSVM : Unsupervised Outlier Detection.

    Notes
    -----
    This estimator has a linear complexity in the number of training samples
    and is thus better suited than the `sklearn.svm.OneClassSVM`
    implementation for datasets with a large number of training samples (say
    > 10,000).

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import linear_model
    >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    >>> clf = linear_model.SGDOneClassSVM(random_state=42)
    >>> clf.fit(X)
    SGDOneClassSVM(random_state=42)
    >>> print(clf.predict([[4, 4]]))
    [1]
    """

    # Only the hinge loss is supported for the One-Class SVM formulation.
    loss_functions = {"hinge": (Hinge, 1.0)}

    _parameter_constraints: dict = {
        **BaseSGD._parameter_constraints,
        "nu": [Interval(Real, 0.0, 1.0, closed="right")],
        "learning_rate": [
            StrOptions({"constant", "optimal", "invscaling", "adaptive"}),
            Hidden(StrOptions({"pa1", "pa2"})),
        ],
        "eta0": [Interval(Real, 0, None, closed="left")],
        "power_t": [Interval(Real, None, None, closed="neither")],
    }

    def __init__(
        self,
        nu=0.5,
        fit_intercept=True,
        max_iter=1000,
        tol=1e-3,
        shuffle=True,
        verbose=0,
        random_state=None,
        learning_rate="optimal",
        eta0=0.0,
        power_t=0.5,
        warm_start=False,
        average=False,
    ):
        self.nu = nu
        # Loss, penalty and related parameters are fixed by the One-Class SVM
        # formulation; only the remaining SGD knobs are exposed to the user.
        super(SGDOneClassSVM, self).__init__(
            loss="hinge",
            penalty="l2",
            C=1.0,
            l1_ratio=0,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            tol=tol,
            shuffle=shuffle,
            verbose=verbose,
            epsilon=DEFAULT_EPSILON,
            random_state=random_state,
            learning_rate=learning_rate,
            eta0=eta0,
            power_t=power_t,
            early_stopping=False,
            validation_fraction=0.1,
            n_iter_no_change=5,
            warm_start=warm_start,
            average=average,
        )

    def _fit_one_class(self, X, alpha, C, sample_weight, learning_rate, max_iter):
        """Uses SGD implementation with X and y=np.ones(n_samples)."""

        # The One-Class SVM uses the SGD implementation with
        # y=np.ones(n_samples).
        n_samples = X.shape[0]
        y = np.ones(n_samples, dtype=X.dtype, order="C")

        dataset, offset_decay = make_dataset(X, y, sample_weight)

        penalty_type = self._get_penalty_type(self.penalty)
        learning_rate_type = self._get_learning_rate_type(learning_rate)

        # early stopping is set to False for the One-Class SVM. thus
        # validation_mask and validation_score_cb will be set to values
        # associated to early_stopping=False in _make_validation_split and
        # _make_validation_score_cb respectively.
        validation_mask = self._make_validation_split(y, sample_mask=sample_weight > 0)
        validation_score_cb = self._make_validation_score_cb(
            validation_mask, X, y, sample_weight
        )

        random_state = check_random_state(self.random_state)
        # numpy mtrand expects a C long which is a signed 32 bit integer under
        # Windows
        seed = random_state.randint(0, np.iinfo(np.int32).max)

        # A tol of None disables the convergence check inside _plain_sgd.
        tol = self.tol if self.tol is not None else -np.inf

        one_class = 1
        # There are no class weights for the One-Class SVM and they are
        # therefore set to 1.
        pos_weight = 1
        neg_weight = 1

        if self.average:
            # With averaging enabled, the running (standard) parameters and the
            # averaged parameters are tracked separately in private attributes.
            coef = self._standard_coef
            intercept = self._standard_intercept
            average_coef = self._average_coef
            average_intercept = self._average_intercept
        else:
            coef = self.coef_
            # The SGD solver works on intercept = 1 - offset (see _partial_fit).
            intercept = 1 - self.offset_
            average_coef = None  # Not used
            average_intercept = [0]  # Not used

        # Select the float32/float64 Cython implementation matching coef.
        _plain_sgd = _get_plain_sgd_function(input_dtype=coef.dtype)
        coef, intercept, average_coef, average_intercept, self.n_iter_ = _plain_sgd(
            coef,
            intercept[0],
            average_coef,
            average_intercept[0],
            self.loss_function_,
            penalty_type,
            alpha,
            C,
            self.l1_ratio,
            dataset,
            validation_mask,
            self.early_stopping,
            validation_score_cb,
            int(self.n_iter_no_change),
            max_iter,
            tol,
            int(self.fit_intercept),
            int(self.verbose),
            int(self.shuffle),
            seed,
            neg_weight,
            pos_weight,
            learning_rate_type,
            self.eta0,
            self.power_t,
            one_class,
            self.t_,
            offset_decay,
            self.average,
        )

        # t_ counts the total number of weight updates across calls.
        self.t_ += self.n_iter_ * n_samples

        if self.average > 0:
            self._average_intercept = np.atleast_1d(average_intercept)
            self._standard_intercept = np.atleast_1d(intercept)

            if self.average <= self.t_ - 1.0:
                # made enough updates for averaging to be taken into account
                self.coef_ = average_coef
                self.offset_ = 1 - np.atleast_1d(average_intercept)
            else:
                self.coef_ = coef
                self.offset_ = 1 - np.atleast_1d(intercept)
        else:
            # Convert back from the internal intercept to the public offset_.
            self.offset_ = 1 - np.atleast_1d(intercept)

    def _partial_fit(
        self,
        X,
        alpha,
        C,
        loss,
        learning_rate,
        max_iter,
        sample_weight,
        coef_init,
        offset_init,
    ):
        # First call (no coef_ yet) triggers feature-count validation reset.
        first_call = getattr(self, "coef_", None) is None
        X = self._validate_data(
            X,
            None,
            accept_sparse="csr",
            dtype=[np.float64, np.float32],
            order="C",
            accept_large_sparse=False,
            reset=first_call,
        )

        n_features = X.shape[1]

        # Allocate datastructures from input arguments
        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        # We use intercept = 1 - offset where intercept is the intercept of
        # the SGD implementation and offset is the offset of the One-Class SVM
        # optimization problem.
        if getattr(self, "coef_", None) is None or coef_init is not None:
            self._allocate_parameter_mem(
                n_classes=1,
                n_features=n_features,
                input_dtype=X.dtype,
                coef_init=coef_init,
                intercept_init=offset_init,
                one_class=1,
            )
        elif n_features != self.coef_.shape[-1]:
            raise ValueError(
                "Number of features %d does not match previous data %d."
                % (n_features, self.coef_.shape[-1])
            )

        if self.average and getattr(self, "_average_coef", None) is None:
            self._average_coef = np.zeros(n_features, dtype=X.dtype, order="C")
            self._average_intercept = np.zeros(1, dtype=X.dtype, order="C")

        self.loss_function_ = self._get_loss_function(loss)
        if not hasattr(self, "t_"):
            self.t_ = 1.0

        # delegate to concrete training procedure
        self._fit_one_class(
            X,
            alpha=alpha,
            C=C,
            learning_rate=learning_rate,
            sample_weight=sample_weight,
            max_iter=max_iter,
        )

        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X, y=None, sample_weight=None):
        """Fit linear One-Class SVM with Stochastic Gradient Descent.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Subset of the training data.

        y : Ignored
            Not used, present for API consistency by convention.

        sample_weight : array-like, shape (n_samples,), optional
            Weights applied to individual samples.
            If not provided, uniform weights are assumed.

        Returns
        -------
        self : object
            Returns a fitted instance of self.
        """
        if not hasattr(self, "coef_"):
            self._more_validate_params(for_partial_fit=True)

        # Regularization strength of the equivalent SGD problem.
        alpha = self.nu / 2
        # One incremental call performs a single epoch (max_iter=1).
        return self._partial_fit(
            X,
            alpha,
            C=1.0,
            loss=self.loss,
            learning_rate=self.learning_rate,
            max_iter=1,
            sample_weight=sample_weight,
            coef_init=None,
            offset_init=None,
        )

    def _fit(
        self,
        X,
        alpha,
        C,
        loss,
        learning_rate,
        coef_init=None,
        offset_init=None,
        sample_weight=None,
    ):
        if self.warm_start and hasattr(self, "coef_"):
            # Warm start: reuse previous solution unless explicit inits given.
            if coef_init is None:
                coef_init = self.coef_
            if offset_init is None:
                offset_init = self.offset_
        else:
            self.coef_ = None
            self.offset_ = None

        # Clear iteration count for multiple call to fit.
        self.t_ = 1.0

        self._partial_fit(
            X,
            alpha,
            C,
            loss,
            learning_rate,
            self.max_iter,
            sample_weight,
            coef_init,
            offset_init,
        )

        if (
            self.tol is not None
            and self.tol > -np.inf
            and self.n_iter_ == self.max_iter
        ):
            # Reaching max_iter with an active tol means the stopping
            # criterion was never satisfied.
            warnings.warn(
                (
                    "Maximum number of iteration reached before "
                    "convergence. Consider increasing max_iter to "
                    "improve the fit."
                ),
                ConvergenceWarning,
            )

        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None):
        """Fit linear One-Class SVM with Stochastic Gradient Descent.

        This solves an equivalent optimization problem of the
        One-Class SVM primal optimization problem and returns a weight vector
        w and an offset rho such that the decision function is given by
        <w, x> - rho.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.

        coef_init : array, shape (n_classes, n_features)
            The initial coefficients to warm-start the optimization.

        offset_init : array, shape (n_classes,)
            The initial offset to warm-start the optimization.

        sample_weight : array-like, shape (n_samples,), optional
            Weights applied to individual samples.
            If not provided, uniform weights are assumed. These weights will
            be multiplied with class_weight (passed through the
            constructor) if class_weight is specified.

        Returns
        -------
        self : object
            Returns a fitted instance of self.
        """
        self._more_validate_params()

        # Regularization strength of the equivalent SGD problem.
        alpha = self.nu / 2
        self._fit(
            X,
            alpha=alpha,
            C=1.0,
            loss=self.loss,
            learning_rate=self.learning_rate,
            coef_init=coef_init,
            offset_init=offset_init,
            sample_weight=sample_weight,
        )

        return self

    def decision_function(self, X):
        """Signed distance to the separating hyperplane.

        Signed distance is positive for an inlier and negative for an
        outlier.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Testing data.

        Returns
        -------
        dec : array-like, shape (n_samples,)
            Decision function values of the samples.
        """
        check_is_fitted(self, "coef_")

        X = self._validate_data(X, accept_sparse="csr", reset=False)
        decisions = safe_sparse_dot(X, self.coef_.T, dense_output=True) - self.offset_

        return decisions.ravel()

    def score_samples(self, X):
        """Raw scoring function of the samples.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Testing data.

        Returns
        -------
        score_samples : array-like, shape (n_samples,)
            Unshifted scoring function values of the samples.
        """
        # Undo the offset shift applied in decision_function.
        score_samples = self.decision_function(X) + self.offset_
        return score_samples

    def predict(self, X):
        """Return labels (1 inlier, -1 outlier) of the samples.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Testing data.

        Returns
        -------
        y : array, shape (n_samples,)
            Labels of the samples.
        """
        y = (self.decision_function(X) >= 0).astype(np.int32)
        y[y == 0] = -1  # for consistency with outlier detectors
        return y

    def _more_tags(self):
        return {
            "_xfail_checks": {
                "check_sample_weights_invariance": (
                    "zero sample_weight is not equivalent to removing samples"
                )
            },
            "preserves_dtype": [np.float64, np.float32],
        }