- """
- Generalized Linear Models with Exponential Dispersion Family
- """
- # Author: Christian Lorentzen <lorentzen.ch@gmail.com>
- # some parts and tricks stolen from other sklearn files.
- # License: BSD 3 clause
- from numbers import Integral, Real
- import numpy as np
- import scipy.optimize
- from ..._loss.loss import (
- HalfGammaLoss,
- HalfPoissonLoss,
- HalfSquaredError,
- HalfTweedieLoss,
- HalfTweedieLossIdentity,
- )
- from ...base import BaseEstimator, RegressorMixin, _fit_context
- from ...utils import check_array
- from ...utils._openmp_helpers import _openmp_effective_n_threads
- from ...utils._param_validation import Hidden, Interval, StrOptions
- from ...utils.optimize import _check_optimize_result
- from ...utils.validation import _check_sample_weight, check_is_fitted
- from .._linear_loss import LinearModelLoss
- from ._newton_solver import NewtonCholeskySolver, NewtonSolver
- class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator):
- """Regression via a penalized Generalized Linear Model (GLM).
- GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and
- predicting the mean of the target y as y_pred=h(X*w) with coefficients w.
- Therefore, the fit minimizes the following objective function with an L2
- penalty as regularizer::
- 1/(2*sum(s_i)) * sum(s_i * deviance(y_i, h(x_i*w))) + 1/2 * alpha * ||w||_2^2
- with inverse link function h, s=sample_weight and per observation (unit) deviance
- deviance(y_i, h(x_i*w)). Note that for an EDM, 1/2 * deviance is the negative
- log-likelihood up to a constant (in w) term.
- The parameter ``alpha`` corresponds to the lambda parameter in glmnet.
- Instead of implementing the EDM family and a link function separately, we directly
- use the loss functions `from sklearn._loss` which have the link functions included
- in them for performance reasons. We pick the loss functions that implement
- (1/2 times) EDM deviances.
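- For illustration only (a sketch under the assumption of squared error with
- the identity link; `objective` is a hypothetical helper, the actual
- computation is delegated to `LinearModelLoss`), the objective above reads::
- import numpy as np
- def objective(w, X, y, s, alpha):
- # identity link: h(X @ w) = X @ w, and (y - X @ w)**2 is the unit
- # deviance of the Normal distribution
- dev = (y - X @ w) ** 2
- return np.sum(s * dev) / (2 * np.sum(s)) + 0.5 * alpha * w @ w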
- Read more in the :ref:`User Guide <Generalized_linear_models>`.
- .. versionadded:: 0.23
- Parameters
- ----------
- alpha : float, default=1
- Constant that multiplies the penalty term and thus determines the
- regularization strength. ``alpha = 0`` is equivalent to unpenalized
- GLMs. In this case, the design matrix `X` must have full column rank
- (no collinearities).
- Values must be in the range `[0.0, inf)`.
- fit_intercept : bool, default=True
- Specifies if a constant (a.k.a. bias or intercept) should be
- added to the linear predictor (X @ coef + intercept).
- solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs'
- Algorithm to use in the optimization problem:
- 'lbfgs'
- Calls scipy's L-BFGS-B optimizer.
- 'newton-cholesky'
- Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to
- iteratively reweighted least squares) with an inner Cholesky based solver.
- This solver is a good choice for `n_samples` >> `n_features`, especially
- with one-hot encoded categorical features with rare categories. Be aware
- that the memory usage of this solver has a quadratic dependency on
- `n_features` because it explicitly computes the Hessian matrix.
- .. versionadded:: 1.2
- max_iter : int, default=100
- The maximal number of iterations for the solver.
- Values must be in the range `[1, inf)`.
- tol : float, default=1e-4
- Stopping criterion. For the lbfgs solver,
- the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
- where ``g_j`` is the j-th component of the gradient (derivative) of
- the objective function.
- Values must be in the range `(0.0, inf)`.
- warm_start : bool, default=False
- If set to ``True``, reuse the solution of the previous call to ``fit``
- as initialization for ``coef_`` and ``intercept_``.
- verbose : int, default=0
- For the lbfgs solver, set verbose to any positive number for verbosity.
- Values must be in the range `[0, inf)`.
- Attributes
- ----------
- coef_ : array of shape (n_features,)
- Estimated coefficients for the linear predictor (`X @ coef_ +
- intercept_`) in the GLM.
- intercept_ : float
- Intercept (a.k.a. bias) added to linear predictor.
- n_iter_ : int
- Actual number of iterations used in the solver.
- _base_loss : BaseLoss, default=HalfSquaredError()
- This is set during fit via `self._get_loss()`.
- A `_base_loss` contains a specific loss function as well as the link
- function. The loss to be minimized specifies the distributional assumption of
- the GLM, i.e. the distribution from the EDM. Here are some examples:
- ======================= ======== ==========================
- _base_loss Link Target Domain
- ======================= ======== ==========================
- HalfSquaredError identity y any real number
- HalfPoissonLoss log 0 <= y
- HalfGammaLoss log 0 < y
- HalfTweedieLoss log dependent on tweedie power
- HalfTweedieLossIdentity identity dependent on tweedie power
- ======================= ======== ==========================
- The link function of the GLM, i.e. the mapping from the linear predictor
- `X @ coef + intercept` to the prediction `y_pred`. For instance, with a log
- link, we have `y_pred = exp(X @ coef + intercept)`.
- """
- # We allow for NewtonSolver classes for the "solver" parameter but do not
- # make them public in the docstrings. This facilitates testing and
- # benchmarking.
- _parameter_constraints: dict = {
- "alpha": [Interval(Real, 0.0, None, closed="left")],
- "fit_intercept": ["boolean"],
- "solver": [
- StrOptions({"lbfgs", "newton-cholesky"}),
- Hidden(type),
- ],
- "max_iter": [Interval(Integral, 1, None, closed="left")],
- "tol": [Interval(Real, 0.0, None, closed="neither")],
- "warm_start": ["boolean"],
- "verbose": ["verbose"],
- }
- def __init__(
- self,
- *,
- alpha=1.0,
- fit_intercept=True,
- solver="lbfgs",
- max_iter=100,
- tol=1e-4,
- warm_start=False,
- verbose=0,
- ):
- self.alpha = alpha
- self.fit_intercept = fit_intercept
- self.solver = solver
- self.max_iter = max_iter
- self.tol = tol
- self.warm_start = warm_start
- self.verbose = verbose
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y, sample_weight=None):
- """Fit a Generalized Linear Model.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Training data.
- y : array-like of shape (n_samples,)
- Target values.
- sample_weight : array-like of shape (n_samples,), default=None
- Sample weights.
- Returns
- -------
- self : object
- Fitted model.
- """
- X, y = self._validate_data(
- X,
- y,
- accept_sparse=["csc", "csr"],
- dtype=[np.float64, np.float32],
- y_numeric=True,
- multi_output=False,
- )
- # required by losses
- if self.solver == "lbfgs":
- # lbfgs will force coef and therefore raw_prediction to be float64. The
- # base_loss needs y, X @ coef and sample_weight all of same dtype
- # (and contiguous).
- loss_dtype = np.float64
- else:
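- # Use the wider of the X and y dtypes but cap at float64; e.g. two
- # float32 inputs keep float32 to save memory.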
- loss_dtype = min(max(y.dtype, X.dtype), np.float64)
- y = check_array(y, dtype=loss_dtype, order="C", ensure_2d=False)
- # TODO: We could support sample_weight=None as the losses support it.
- # Note that _check_sample_weight calls check_array(order="C") required by
- # losses.
- sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype)
- n_samples, n_features = X.shape
- self._base_loss = self._get_loss()
- linear_loss = LinearModelLoss(
- base_loss=self._base_loss,
- fit_intercept=self.fit_intercept,
- )
- if not linear_loss.base_loss.in_y_true_range(y):
- raise ValueError(
- "Some value(s) of y are out of the valid range of the loss"
- f" {self._base_loss.__class__.__name__!r}."
- )
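- # For example, HalfPoissonLoss requires y >= 0 and HalfGammaLoss y > 0.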
- # TODO: if alpha=0 check that X is not rank deficient
- # IMPORTANT NOTE: Rescaling of sample_weight:
- # We want to minimize
- # obj = 1/(2*sum(sample_weight)) * sum(sample_weight * deviance)
- # + 1/2 * alpha * L2,
- # with
- # deviance = 2 * loss.
- # The objective is invariant to multiplying sample_weight by a constant. We
- # choose this constant such that sum(sample_weight) = 1. Thus, we end up with
- # obj = sum(sample_weight * loss) + 1/2 * alpha * L2.
- # Note that LinearModelLoss.loss() computes sum(sample_weight * loss).
- sample_weight = sample_weight / sample_weight.sum()
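- # e.g. sample_weight = [1, 3] becomes [0.25, 0.75]; the minimizer of the
- # objective is unchanged by this rescaling.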
- if self.warm_start and hasattr(self, "coef_"):
- if self.fit_intercept:
- # LinearModelLoss needs intercept at the end of coefficient array.
- coef = np.concatenate((self.coef_, np.array([self.intercept_])))
- else:
- coef = self.coef_
- coef = coef.astype(loss_dtype, copy=False)
- else:
- coef = linear_loss.init_zero_coef(X, dtype=loss_dtype)
- if self.fit_intercept:
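- # Start the intercept at g(weighted mean of y) for link g, i.e. at
- # the optimal constant-only (null) model.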
- coef[-1] = linear_loss.base_loss.link.link(
- np.average(y, weights=sample_weight)
- )
- l2_reg_strength = self.alpha
- n_threads = _openmp_effective_n_threads()
- # Algorithms for optimization:
- # Note again that our losses implement 1/2 * deviance.
- if self.solver == "lbfgs":
- func = linear_loss.loss_gradient
- opt_res = scipy.optimize.minimize(
- func,
- coef,
- method="L-BFGS-B",
- jac=True,
- options={
- "maxiter": self.max_iter,
- "maxls": 50, # default is 20
- "iprint": self.verbose - 1,
- "gtol": self.tol,
- # The constant 64 was found empirically to pass the test suite.
- # The point is that ftol is very small, but a bit larger than
- # machine precision for float64, which is the dtype used by lbfgs.
- "ftol": 64 * np.finfo(float).eps,
- },
- args=(X, y, sample_weight, l2_reg_strength, n_threads),
- )
- self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
- coef = opt_res.x
- elif self.solver == "newton-cholesky":
- sol = NewtonCholeskySolver(
- coef=coef,
- linear_loss=linear_loss,
- l2_reg_strength=l2_reg_strength,
- tol=self.tol,
- max_iter=self.max_iter,
- n_threads=n_threads,
- verbose=self.verbose,
- )
- coef = sol.solve(X, y, sample_weight)
- self.n_iter_ = sol.iteration
- elif issubclass(self.solver, NewtonSolver):
- sol = self.solver(
- coef=coef,
- linear_loss=linear_loss,
- l2_reg_strength=l2_reg_strength,
- tol=self.tol,
- max_iter=self.max_iter,
- n_threads=n_threads,
- )
- coef = sol.solve(X, y, sample_weight)
- self.n_iter_ = sol.iteration
- else:
- raise ValueError(f"Invalid solver={self.solver}.")
- if self.fit_intercept:
- self.intercept_ = coef[-1]
- self.coef_ = coef[:-1]
- else:
- # set intercept to zero as the other linear models do
- self.intercept_ = 0.0
- self.coef_ = coef
- return self
- def _linear_predictor(self, X):
- """Compute the linear_predictor = `X @ coef_ + intercept_`.
- Note that we often use the term raw_prediction instead of linear predictor.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Samples.
- Returns
- -------
- y_pred : array of shape (n_samples,)
- Returns predicted values of linear predictor.
- """
- check_is_fitted(self)
- X = self._validate_data(
- X,
- accept_sparse=["csr", "csc", "coo"],
- dtype=[np.float64, np.float32],
- ensure_2d=True,
- allow_nd=False,
- reset=False,
- )
- return X @ self.coef_ + self.intercept_
- def predict(self, X):
- """Predict using GLM with feature matrix X.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Samples.
- Returns
- -------
- y_pred : array of shape (n_samples,)
- Returns predicted values.
- """
- # check_array is done in _linear_predictor
- raw_prediction = self._linear_predictor(X)
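- # Map through the inverse link, e.g. exp(raw_prediction) for the log link.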
- y_pred = self._base_loss.link.inverse(raw_prediction)
- return y_pred
- def score(self, X, y, sample_weight=None):
- """Compute D^2, the percentage of deviance explained.
- D^2 is a generalization of the coefficient of determination R^2.
- R^2 uses squared error and D^2 uses the deviance of this GLM, see the
- :ref:`User Guide <regression_metrics>`.
- D^2 is defined as
- :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`,
- where :math:`D_{null}` is the null deviance, i.e. the deviance of a model
- with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`.
- The mean :math:`\\bar{y}` is averaged by sample_weight.
- Best possible score is 1.0 and it can be negative (because the model
- can be arbitrarily worse).
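- A constant model that always predicts the (weighted) mean of `y` gets a
- D^2 score of 0.0.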
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Test samples.
- y : array-like of shape (n_samples,)
- True values of target.
- sample_weight : array-like of shape (n_samples,), default=None
- Sample weights.
- Returns
- -------
- score : float
- D^2 of self.predict(X) w.r.t. y.
- """
- # TODO: Adapt link to User Guide in the docstring, once
- # https://github.com/scikit-learn/scikit-learn/pull/22118 is merged.
- #
- # Note, default score defined in RegressorMixin is R^2 score.
- # TODO: make D^2 a score function in module metrics (and thereby get
- # input validation and so on)
- raw_prediction = self._linear_predictor(X) # validates X
- # required by losses
- y = check_array(y, dtype=raw_prediction.dtype, order="C", ensure_2d=False)
- if sample_weight is not None:
- # Note that _check_sample_weight calls check_array(order="C") required by
- # losses.
- sample_weight = _check_sample_weight(sample_weight, X, dtype=y.dtype)
- base_loss = self._base_loss
- if not base_loss.in_y_true_range(y):
- raise ValueError(
- "Some value(s) of y are out of the valid range of the loss"
- f" {base_loss.__name__}."
- )
- # constant_to_optimal_zero is unweighted here; average it with
- # sample_weight to be consistent with base_loss.__call__, which returns
- # the weighted average loss.
- constant = np.average(
- base_loss.constant_to_optimal_zero(y_true=y), weights=sample_weight
- )
- # Missing factor of 2 in deviance cancels out.
- deviance = base_loss(
- y_true=y,
- raw_prediction=raw_prediction,
- sample_weight=sample_weight,
- n_threads=1,
- )
- y_mean = base_loss.link.link(np.average(y, weights=sample_weight))
- deviance_null = base_loss(
- y_true=y,
- raw_prediction=np.tile(y_mean, y.shape[0]),
- sample_weight=sample_weight,
- n_threads=1,
- )
- return 1 - (deviance + constant) / (deviance_null + constant)
- def _more_tags(self):
- try:
- # Create an instance of BaseLoss if fit wasn't called yet. This is
- # necessary as TweedieRegressor might set a loss during fit that
- # differs from self._base_loss.
- base_loss = self._get_loss()
- return {"requires_positive_y": not base_loss.in_y_true_range(-1.0)}
- except (ValueError, AttributeError, TypeError):
- # This happens when the link or power parameter of TweedieRegressor is
- # invalid. We fallback on the default tags in that case.
- return {}
- def _get_loss(self):
- """This is only necessary because of the link and power arguments of the
- TweedieRegressor.
- Note that we do not need to pass sample_weight to the loss class as this is
- only needed to set loss.constant_hessian on which GLMs do not rely.
- """
- return HalfSquaredError()
- class PoissonRegressor(_GeneralizedLinearRegressor):
- """Generalized Linear Model with a Poisson distribution.
- This regressor uses the 'log' link function.
- Read more in the :ref:`User Guide <Generalized_linear_models>`.
- .. versionadded:: 0.23
- Parameters
- ----------
- alpha : float, default=1
- Constant that multiplies the L2 penalty term and determines the
- regularization strength. ``alpha = 0`` is equivalent to unpenalized
- GLMs. In this case, the design matrix `X` must have full column rank
- (no collinearities).
- Values of `alpha` must be in the range `[0.0, inf)`.
- fit_intercept : bool, default=True
- Specifies if a constant (a.k.a. bias or intercept) should be
- added to the linear predictor (`X @ coef + intercept`).
- solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs'
- Algorithm to use in the optimization problem:
- 'lbfgs'
- Calls scipy's L-BFGS-B optimizer.
- 'newton-cholesky'
- Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to
- iteratively reweighted least squares) with an inner Cholesky based solver.
- This solver is a good choice for `n_samples` >> `n_features`, especially
- with one-hot encoded categorical features with rare categories. Be aware
- that the memory usage of this solver has a quadratic dependency on
- `n_features` because it explicitly computes the Hessian matrix.
- .. versionadded:: 1.2
- max_iter : int, default=100
- The maximal number of iterations for the solver.
- Values must be in the range `[1, inf)`.
- tol : float, default=1e-4
- Stopping criterion. For the lbfgs solver,
- the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
- where ``g_j`` is the j-th component of the gradient (derivative) of
- the objective function.
- Values must be in the range `(0.0, inf)`.
- warm_start : bool, default=False
- If set to ``True``, reuse the solution of the previous call to ``fit``
- as initialization for ``coef_`` and ``intercept_``.
- verbose : int, default=0
- For the lbfgs solver, set verbose to any positive number for verbosity.
- Values must be in the range `[0, inf)`.
- Attributes
- ----------
- coef_ : array of shape (n_features,)
- Estimated coefficients for the linear predictor (`X @ coef_ +
- intercept_`) in the GLM.
- intercept_ : float
- Intercept (a.k.a. bias) added to linear predictor.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- n_iter_ : int
- Actual number of iterations used in the solver.
- See Also
- --------
- TweedieRegressor : Generalized Linear Model with a Tweedie distribution.
- Examples
- --------
- >>> from sklearn import linear_model
- >>> clf = linear_model.PoissonRegressor()
- >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]
- >>> y = [12, 17, 22, 21]
- >>> clf.fit(X, y)
- PoissonRegressor()
- >>> clf.score(X, y)
- 0.990...
- >>> clf.coef_
- array([0.121..., 0.158...])
- >>> clf.intercept_
- 2.088...
- >>> clf.predict([[1, 1], [3, 4]])
- array([10.676..., 21.875...])
- """
- _parameter_constraints: dict = {
- **_GeneralizedLinearRegressor._parameter_constraints
- }
- def __init__(
- self,
- *,
- alpha=1.0,
- fit_intercept=True,
- solver="lbfgs",
- max_iter=100,
- tol=1e-4,
- warm_start=False,
- verbose=0,
- ):
- super().__init__(
- alpha=alpha,
- fit_intercept=fit_intercept,
- solver=solver,
- max_iter=max_iter,
- tol=tol,
- warm_start=warm_start,
- verbose=verbose,
- )
- def _get_loss(self):
- return HalfPoissonLoss()
- class GammaRegressor(_GeneralizedLinearRegressor):
- """Generalized Linear Model with a Gamma distribution.
- This regressor uses the 'log' link function.
- Read more in the :ref:`User Guide <Generalized_linear_models>`.
- .. versionadded:: 0.23
- Parameters
- ----------
- alpha : float, default=1
- Constant that multiplies the L2 penalty term and determines the
- regularization strength. ``alpha = 0`` is equivalent to unpenalized
- GLMs. In this case, the design matrix `X` must have full column rank
- (no collinearities).
- Values of `alpha` must be in the range `[0.0, inf)`.
- fit_intercept : bool, default=True
- Specifies if a constant (a.k.a. bias or intercept) should be
- added to the linear predictor `X @ coef_ + intercept_`.
- solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs'
- Algorithm to use in the optimization problem:
- 'lbfgs'
- Calls scipy's L-BFGS-B optimizer.
- 'newton-cholesky'
- Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to
- iteratively reweighted least squares) with an inner Cholesky based solver.
- This solver is a good choice for `n_samples` >> `n_features`, especially
- with one-hot encoded categorical features with rare categories. Be aware
- that the memory usage of this solver has a quadratic dependency on
- `n_features` because it explicitly computes the Hessian matrix.
- .. versionadded:: 1.2
- max_iter : int, default=100
- The maximal number of iterations for the solver.
- Values must be in the range `[1, inf)`.
- tol : float, default=1e-4
- Stopping criterion. For the lbfgs solver,
- the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
- where ``g_j`` is the j-th component of the gradient (derivative) of
- the objective function.
- Values must be in the range `(0.0, inf)`.
- warm_start : bool, default=False
- If set to ``True``, reuse the solution of the previous call to ``fit``
- as initialization for ``coef_`` and ``intercept_``.
- verbose : int, default=0
- For the lbfgs solver, set verbose to any positive number for verbosity.
- Values must be in the range `[0, inf)`.
- Attributes
- ----------
- coef_ : array of shape (n_features,)
- Estimated coefficients for the linear predictor (`X @ coef_ +
- intercept_`) in the GLM.
- intercept_ : float
- Intercept (a.k.a. bias) added to linear predictor.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- n_iter_ : int
- Actual number of iterations used in the solver.
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- PoissonRegressor : Generalized Linear Model with a Poisson distribution.
- TweedieRegressor : Generalized Linear Model with a Tweedie distribution.
- Examples
- --------
- >>> from sklearn import linear_model
- >>> clf = linear_model.GammaRegressor()
- >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]
- >>> y = [19, 26, 33, 30]
- >>> clf.fit(X, y)
- GammaRegressor()
- >>> clf.score(X, y)
- 0.773...
- >>> clf.coef_
- array([0.072..., 0.066...])
- >>> clf.intercept_
- 2.896...
- >>> clf.predict([[1, 0], [2, 8]])
- array([19.483..., 35.795...])
- """
- _parameter_constraints: dict = {
- **_GeneralizedLinearRegressor._parameter_constraints
- }
- def __init__(
- self,
- *,
- alpha=1.0,
- fit_intercept=True,
- solver="lbfgs",
- max_iter=100,
- tol=1e-4,
- warm_start=False,
- verbose=0,
- ):
- super().__init__(
- alpha=alpha,
- fit_intercept=fit_intercept,
- solver=solver,
- max_iter=max_iter,
- tol=tol,
- warm_start=warm_start,
- verbose=verbose,
- )
- def _get_loss(self):
- return HalfGammaLoss()
- class TweedieRegressor(_GeneralizedLinearRegressor):
- """Generalized Linear Model with a Tweedie distribution.
- This estimator can be used to model different GLMs depending on the
- ``power`` parameter, which determines the underlying distribution.
- Read more in the :ref:`User Guide <Generalized_linear_models>`.
- .. versionadded:: 0.23
- Parameters
- ----------
- power : float, default=0
- The power determines the underlying target distribution according
- to the following table:
- +-------+------------------------+
- | Power | Distribution |
- +=======+========================+
- | 0 | Normal |
- +-------+------------------------+
- | 1 | Poisson |
- +-------+------------------------+
- | (1,2) | Compound Poisson Gamma |
- +-------+------------------------+
- | 2 | Gamma |
- +-------+------------------------+
- | 3 | Inverse Gaussian |
- +-------+------------------------+
- For ``0 < power < 1``, no distribution exists.
- alpha : float, default=1
- Constant that multiplies the L2 penalty term and determines the
- regularization strength. ``alpha = 0`` is equivalent to unpenalized
- GLMs. In this case, the design matrix `X` must have full column rank
- (no collinearities).
- Values of `alpha` must be in the range `[0.0, inf)`.
- fit_intercept : bool, default=True
- Specifies if a constant (a.k.a. bias or intercept) should be
- added to the linear predictor (`X @ coef + intercept`).
- link : {'auto', 'identity', 'log'}, default='auto'
- The link function of the GLM, i.e. the mapping from the linear predictor
- `X @ coef + intercept` to the prediction `y_pred`. Option 'auto' sets
- the link depending on the chosen `power` parameter as follows:
- - 'identity' for ``power <= 0``, e.g. for the Normal distribution
- - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian
- distributions
- solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs'
- Algorithm to use in the optimization problem:
- 'lbfgs'
- Calls scipy's L-BFGS-B optimizer.
- 'newton-cholesky'
- Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to
- iteratively reweighted least squares) with an inner Cholesky based solver.
- This solver is a good choice for `n_samples` >> `n_features`, especially
- with one-hot encoded categorical features with rare categories. Be aware
- that the memory usage of this solver has a quadratic dependency on
- `n_features` because it explicitly computes the Hessian matrix.
- .. versionadded:: 1.2
- max_iter : int, default=100
- The maximal number of iterations for the solver.
- Values must be in the range `[1, inf)`.
- tol : float, default=1e-4
- Stopping criterion. For the lbfgs solver,
- the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
- where ``g_j`` is the j-th component of the gradient (derivative) of
- the objective function.
- Values must be in the range `(0.0, inf)`.
- warm_start : bool, default=False
- If set to ``True``, reuse the solution of the previous call to ``fit``
- as initialization for ``coef_`` and ``intercept_``.
- verbose : int, default=0
- For the lbfgs solver, set verbose to any positive number for verbosity.
- Values must be in the range `[0, inf)`.
- Attributes
- ----------
- coef_ : array of shape (n_features,)
- Estimated coefficients for the linear predictor (`X @ coef_ +
- intercept_`) in the GLM.
- intercept_ : float
- Intercept (a.k.a. bias) added to linear predictor.
- n_iter_ : int
- Actual number of iterations used in the solver.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- PoissonRegressor : Generalized Linear Model with a Poisson distribution.
- GammaRegressor : Generalized Linear Model with a Gamma distribution.
- Examples
- --------
- >>> from sklearn import linear_model
- >>> clf = linear_model.TweedieRegressor()
- >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]
- >>> y = [2, 3.5, 5, 5.5]
- >>> clf.fit(X, y)
- TweedieRegressor()
- >>> clf.score(X, y)
- 0.839...
- >>> clf.coef_
- array([0.599..., 0.299...])
- >>> clf.intercept_
- 1.600...
- >>> clf.predict([[1, 1], [3, 4]])
- array([2.500..., 4.599...])
- """
- _parameter_constraints: dict = {
- **_GeneralizedLinearRegressor._parameter_constraints,
- "power": [Interval(Real, None, None, closed="neither")],
- "link": [StrOptions({"auto", "identity", "log"})],
- }
- def __init__(
- self,
- *,
- power=0.0,
- alpha=1.0,
- fit_intercept=True,
- link="auto",
- solver="lbfgs",
- max_iter=100,
- tol=1e-4,
- warm_start=False,
- verbose=0,
- ):
- super().__init__(
- alpha=alpha,
- fit_intercept=fit_intercept,
- solver=solver,
- max_iter=max_iter,
- tol=tol,
- warm_start=warm_start,
- verbose=verbose,
- )
- self.link = link
- self.power = power
- def _get_loss(self):
- if self.link == "auto":
- if self.power <= 0:
- # identity link
- return HalfTweedieLossIdentity(power=self.power)
- else:
- # log link
- return HalfTweedieLoss(power=self.power)
- if self.link == "log":
- return HalfTweedieLoss(power=self.power)
- if self.link == "identity":
- return HalfTweedieLossIdentity(power=self.power)
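- # Illustration of the mapping above (assumed behavior, comment only):
- # TweedieRegressor(power=1.5)._get_loss() -> HalfTweedieLoss(power=1.5)
- # TweedieRegressor(power=0.0, link="auto")._get_loss()
- # -> HalfTweedieLossIdentity(power=0.0)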
|