  1. """ Principal Component Analysis.
  2. """
  3. # Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
  4. # Olivier Grisel <olivier.grisel@ensta.org>
  5. # Mathieu Blondel <mathieu@mblondel.org>
  6. # Denis A. Engemann <denis-alexander.engemann@inria.fr>
  7. # Michael Eickenberg <michael.eickenberg@inria.fr>
  8. # Giorgio Patrini <giorgio.patrini@anu.edu.au>
  9. #
  10. # License: BSD 3 clause
  11. from math import log, sqrt
  12. from numbers import Integral, Real
  13. import numpy as np
  14. from scipy import linalg
  15. from scipy.sparse import issparse
  16. from scipy.sparse.linalg import svds
  17. from scipy.special import gammaln
  18. from ..base import _fit_context
  19. from ..utils import check_random_state
  20. from ..utils._arpack import _init_arpack_v0
  21. from ..utils._param_validation import Interval, RealNotInt, StrOptions
  22. from ..utils.deprecation import deprecated
  23. from ..utils.extmath import fast_logdet, randomized_svd, stable_cumsum, svd_flip
  24. from ..utils.validation import check_is_fitted
  25. from ._base import _BasePCA


def _assess_dimension(spectrum, rank, n_samples):
    """Compute the log-likelihood of a rank ``rank`` dataset.

    The dataset is assumed to be embedded in gaussian noise of shape (n,
    dimf) having spectrum ``spectrum``. This implements the method of
    T. P. Minka.

    Parameters
    ----------
    spectrum : ndarray of shape (n_features,)
        Data spectrum.
    rank : int
        Tested rank value. It should be strictly lower than n_features,
        otherwise the method isn't specified (division by zero in equation
        (31) from the paper).
    n_samples : int
        Number of samples.

    Returns
    -------
    ll : float
        The log-likelihood.

    References
    ----------
    This implements the method of `Thomas P. Minka:
    Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604
    <https://proceedings.neurips.cc/paper/2000/file/7503cfacd12053d309b6bed5c89de212-Paper.pdf>`_
    """
    n_features = spectrum.shape[0]
    if not 1 <= rank < n_features:
        raise ValueError("the tested rank should be in [1, n_features - 1]")

    eps = 1e-15

    if spectrum[rank - 1] < eps:
        # When the tested rank is associated with a small eigenvalue, there's
        # no point in computing the log-likelihood: it's going to be very
        # small and won't be the max anyway. Also, it can lead to numerical
        # issues below when computing pa, in particular in log((spectrum[i] -
        # spectrum[j]) because this will take the log of something very small.
        return -np.inf

    pu = -rank * log(2.0)
    for i in range(1, rank + 1):
        pu += (
            gammaln((n_features - i + 1) / 2.0)
            - log(np.pi) * (n_features - i + 1) / 2.0
        )

    pl = np.sum(np.log(spectrum[:rank]))
    pl = -pl * n_samples / 2.0

    v = max(eps, np.sum(spectrum[rank:]) / (n_features - rank))
    pv = -np.log(v) * n_samples * (n_features - rank) / 2.0

    m = n_features * rank - rank * (rank + 1.0) / 2.0
    pp = log(2.0 * np.pi) * (m + rank) / 2.0

    pa = 0.0
    spectrum_ = spectrum.copy()
    spectrum_[rank:n_features] = v
    for i in range(rank):
        for j in range(i + 1, len(spectrum)):
            pa += log(
                (spectrum[i] - spectrum[j]) * (1.0 / spectrum_[j] - 1.0 / spectrum_[i])
            ) + log(n_samples)

    ll = pu + pl + pv + pp - pa / 2.0 - rank * log(n_samples) / 2.0

    return ll
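
# A hedged usage sketch of `_assess_dimension` (the spectrum below is made up
# for illustration): evaluate the Minka log-likelihood at every candidate rank
# and pick the argmax, which is exactly what `_infer_dimension` below
# automates.
#
#     spectrum = np.array([10.0, 5.0, 1.0, 0.01, 0.009, 0.008])
#     lls = [_assess_dimension(spectrum, rank, n_samples=100)
#            for rank in range(1, spectrum.shape[0])]
#     # With such a sharp drop after the third eigenvalue, rank 3 is the
#     # likely maximizer.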


def _infer_dimension(spectrum, n_samples):
    """Infers the dimension of a dataset with a given spectrum.

    The returned value will be in [1, n_features - 1].
    """
    ll = np.empty_like(spectrum)
    ll[0] = -np.inf  # we don't want to return n_components = 0
    for rank in range(1, spectrum.shape[0]):
        ll[rank] = _assess_dimension(spectrum, rank, n_samples)
    return ll.argmax()
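
# Usage sketch (hypothetical numbers, continuing the example above):
#
#     n_components = _infer_dimension(spectrum, n_samples=100)
#
# returns an int in [1, n_features - 1]; `_fit_full` calls this on the
# explained variances when `n_components='mle'`.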


class PCA(_BasePCA):
    """Principal component analysis (PCA).

    Linear dimensionality reduction using Singular Value Decomposition of the
    data to project it to a lower dimensional space. The input data is centered
    but not scaled for each feature before applying the SVD.

    It uses the LAPACK implementation of the full SVD or a randomized truncated
    SVD by the method of Halko et al. 2009, depending on the shape of the input
    data and the number of components to extract.

    It can also use the scipy.sparse.linalg ARPACK implementation of the
    truncated SVD.

    Notice that this class does not support sparse input. See
    :class:`TruncatedSVD` for an alternative with sparse data.

    Read more in the :ref:`User Guide <PCA>`.

    Parameters
    ----------
    n_components : int, float or 'mle', default=None
        Number of components to keep.
        if n_components is not set all components are kept::

            n_components == min(n_samples, n_features)

        If ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's
        MLE is used to guess the dimension. Use of ``n_components == 'mle'``
        will interpret ``svd_solver == 'auto'`` as ``svd_solver == 'full'``.

        If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the
        number of components such that the amount of variance that needs to be
        explained is greater than the percentage specified by n_components.

        If ``svd_solver == 'arpack'``, the number of components must be
        strictly less than the minimum of n_features and n_samples.

        Hence, the None case results in::

            n_components == min(n_samples, n_features) - 1

    copy : bool, default=True
        If False, data passed to fit are overwritten and running
        fit(X).transform(X) will not yield the expected results,
        use fit_transform(X) instead.

    whiten : bool, default=False
        When True (False by default) the `components_` vectors are multiplied
        by the square root of n_samples and then divided by the singular values
        to ensure uncorrelated outputs with unit component-wise variances.

        Whitening will remove some information from the transformed signal
        (the relative variance scales of the components) but can sometimes
        improve the predictive accuracy of the downstream estimators by
        making their data respect some hard-wired assumptions.

    svd_solver : {'auto', 'full', 'arpack', 'randomized'}, default='auto'
        If auto :
            The solver is selected by a default policy based on `X.shape` and
            `n_components`: if the input data is larger than 500x500 and the
            number of components to extract is lower than 80% of the smallest
            dimension of the data, then the more efficient 'randomized'
            method is enabled. Otherwise the exact full SVD is computed and
            optionally truncated afterwards.
        If full :
            run exact full SVD calling the standard LAPACK solver via
            `scipy.linalg.svd` and select the components by postprocessing
        If arpack :
            run SVD truncated to n_components calling ARPACK solver via
            `scipy.sparse.linalg.svds`. It requires strictly
            0 < n_components < min(X.shape)
        If randomized :
            run randomized SVD by the method of Halko et al.

        .. versionadded:: 0.18.0

    tol : float, default=0.0
        Tolerance for singular values computed by svd_solver == 'arpack'.
        Must be of range [0.0, infinity).

        .. versionadded:: 0.18.0

    iterated_power : int or 'auto', default='auto'
        Number of iterations for the power method computed by
        svd_solver == 'randomized'.
        Must be of range [0, infinity).

        .. versionadded:: 0.18.0

    n_oversamples : int, default=10
        This parameter is only relevant when `svd_solver="randomized"`.
        It corresponds to the additional number of random vectors to sample the
        range of `X` so as to ensure proper conditioning. See
        :func:`~sklearn.utils.extmath.randomized_svd` for more details.

        .. versionadded:: 1.1

    power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
        Power iteration normalizer for randomized SVD solver.
        Not used by ARPACK. See :func:`~sklearn.utils.extmath.randomized_svd`
        for more details.

        .. versionadded:: 1.1

    random_state : int, RandomState instance or None, default=None
        Used when the 'arpack' or 'randomized' solvers are used. Pass an int
        for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

        .. versionadded:: 0.18.0

    Attributes
    ----------
    components_ : ndarray of shape (n_components, n_features)
        Principal axes in feature space, representing the directions of
        maximum variance in the data. Equivalently, the right singular
        vectors of the centered input data, parallel to its eigenvectors.
        The components are sorted by decreasing ``explained_variance_``.

    explained_variance_ : ndarray of shape (n_components,)
        The amount of variance explained by each of the selected components.
        The variance estimation uses `n_samples - 1` degrees of freedom.

        Equal to n_components largest eigenvalues
        of the covariance matrix of X.

        .. versionadded:: 0.18

    explained_variance_ratio_ : ndarray of shape (n_components,)
        Percentage of variance explained by each of the selected components.

        If ``n_components`` is not set then all components are stored and the
        sum of the ratios is equal to 1.0.

    singular_values_ : ndarray of shape (n_components,)
        The singular values corresponding to each of the selected components.
        The singular values are equal to the 2-norms of the ``n_components``
        variables in the lower-dimensional space.

        .. versionadded:: 0.19

    mean_ : ndarray of shape (n_features,)
        Per-feature empirical mean, estimated from the training set.

        Equal to `X.mean(axis=0)`.

    n_components_ : int
        The estimated number of components. When n_components is set
        to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this
        number is estimated from input data. Otherwise it equals the parameter
        n_components, or the lesser value of n_features and n_samples
        if n_components is None.

    n_features_ : int
        Number of features in the training data.

    n_samples_ : int
        Number of samples in the training data.

    noise_variance_ : float
        The estimated noise covariance following the Probabilistic PCA model
        from Tipping and Bishop 1999. See "Pattern Recognition and
        Machine Learning" by C. Bishop, 12.2.1 p. 574 or
        http://www.miketipping.com/papers/met-mppca.pdf. It is required to
        compute the estimated data covariance and score samples.

        Equal to the average of (min(n_features, n_samples) - n_components)
        smallest eigenvalues of the covariance matrix of X.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    KernelPCA : Kernel Principal Component Analysis.
    SparsePCA : Sparse Principal Component Analysis.
    TruncatedSVD : Dimensionality reduction using truncated SVD.
    IncrementalPCA : Incremental Principal Component Analysis.

    References
    ----------
    For n_components == 'mle', this class uses the method from:
    `Minka, T. P.. "Automatic choice of dimensionality for PCA".
    In NIPS, pp. 598-604 <https://tminka.github.io/papers/pca/minka-pca.pdf>`_

    Implements the probabilistic PCA model from:
    `Tipping, M. E., and Bishop, C. M. (1999). "Probabilistic principal
    component analysis". Journal of the Royal Statistical Society:
    Series B (Statistical Methodology), 61(3), 611-622.
    <http://www.miketipping.com/papers/met-mppca.pdf>`_
    via the score and score_samples methods.

    For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`.

    For svd_solver == 'randomized', see:
    :doi:`Halko, N., Martinsson, P. G., and Tropp, J. A. (2011).
    "Finding structure with randomness: Probabilistic algorithms for
    constructing approximate matrix decompositions".
    SIAM review, 53(2), 217-288.
    <10.1137/090771806>`
    and also
    :doi:`Martinsson, P. G., Rokhlin, V., and Tygert, M. (2011).
    "A randomized algorithm for the decomposition of matrices".
    Applied and Computational Harmonic Analysis, 30(1), 47-68.
    <10.1016/j.acha.2010.02.003>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.decomposition import PCA
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> pca = PCA(n_components=2)
    >>> pca.fit(X)
    PCA(n_components=2)
    >>> print(pca.explained_variance_ratio_)
    [0.9924... 0.0075...]
    >>> print(pca.singular_values_)
    [6.30061... 0.54980...]

    >>> pca = PCA(n_components=2, svd_solver='full')
    >>> pca.fit(X)
    PCA(n_components=2, svd_solver='full')
    >>> print(pca.explained_variance_ratio_)
    [0.9924... 0.00755...]
    >>> print(pca.singular_values_)
    [6.30061... 0.54980...]

    >>> pca = PCA(n_components=1, svd_solver='arpack')
    >>> pca.fit(X)
    PCA(n_components=1, svd_solver='arpack')
    >>> print(pca.explained_variance_ratio_)
    [0.99244...]
    >>> print(pca.singular_values_)
    [6.30061...]
    """

    _parameter_constraints: dict = {
        "n_components": [
            Interval(Integral, 0, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="neither"),
            StrOptions({"mle"}),
            None,
        ],
        "copy": ["boolean"],
        "whiten": ["boolean"],
        "svd_solver": [StrOptions({"auto", "full", "arpack", "randomized"})],
        "tol": [Interval(Real, 0, None, closed="left")],
        "iterated_power": [
            StrOptions({"auto"}),
            Interval(Integral, 0, None, closed="left"),
        ],
        "n_oversamples": [Interval(Integral, 1, None, closed="left")],
        "power_iteration_normalizer": [StrOptions({"auto", "QR", "LU", "none"})],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        n_components=None,
        *,
        copy=True,
        whiten=False,
        svd_solver="auto",
        tol=0.0,
        iterated_power="auto",
        n_oversamples=10,
        power_iteration_normalizer="auto",
        random_state=None,
    ):
        self.n_components = n_components
        self.copy = copy
        self.whiten = whiten
        self.svd_solver = svd_solver
        self.tol = tol
        self.iterated_power = iterated_power
        self.n_oversamples = n_oversamples
        self.power_iteration_normalizer = power_iteration_normalizer
        self.random_state = random_state

    # TODO(1.4): remove in 1.4
    # mypy error: Decorated property not supported
    @deprecated(  # type: ignore
        "Attribute `n_features_` was deprecated in version 1.2 and will be "
        "removed in 1.4. Use `n_features_in_` instead."
    )
    @property
    def n_features_(self):
        return self.n_features_in_

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the model with X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Ignored.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        self._fit(X)
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y=None):
        """Fit the model with X and apply the dimensionality reduction on X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Ignored.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            Transformed values.

        Notes
        -----
        This method returns a Fortran-ordered array. To convert it to a
        C-ordered array, use 'np.ascontiguousarray'.
        """
        U, S, Vt = self._fit(X)
        U = U[:, : self.n_components_]

        if self.whiten:
            # X_new = X * V / S * sqrt(n_samples - 1) = U * sqrt(n_samples - 1)
            U *= sqrt(X.shape[0] - 1)
        else:
            # X_new = X * V = U * S * Vt * V = U * S
            U *= S[: self.n_components_]

        return U
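
    # Sketch of the algebra used in `fit_transform` (assuming the SVD of the
    # centered data is X = U @ np.diag(S) @ Vt): the projection X @ Vt.T
    # equals U @ np.diag(S), so U scaled by S (or by sqrt(n_samples - 1) when
    # whitening) is returned without a second pass over X. Up to floating
    # point error and for the deterministic solvers:
    #
    #     pca = PCA(n_components=2, svd_solver="full")
    #     np.allclose(pca.fit_transform(X), pca.fit(X).transform(X))  # True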

    def _fit(self, X):
        """Dispatch to the right submethod depending on the chosen solver."""
        # Raise an error for sparse input.
        # This is more informative than the generic one raised by check_array.
        if issparse(X):
            raise TypeError(
                "PCA does not support sparse input. See "
                "TruncatedSVD for a possible alternative."
            )

        X = self._validate_data(
            X, dtype=[np.float64, np.float32], ensure_2d=True, copy=self.copy
        )

        # Handle n_components==None
        if self.n_components is None:
            if self.svd_solver != "arpack":
                n_components = min(X.shape)
            else:
                n_components = min(X.shape) - 1
        else:
            n_components = self.n_components

        # Handle svd_solver
        self._fit_svd_solver = self.svd_solver
        if self._fit_svd_solver == "auto":
            # Small problem or n_components == 'mle', just call full PCA
            if max(X.shape) <= 500 or n_components == "mle":
                self._fit_svd_solver = "full"
            elif 1 <= n_components < 0.8 * min(X.shape):
                self._fit_svd_solver = "randomized"
            # This is also the case of n_components in (0,1)
            else:
                self._fit_svd_solver = "full"

        # Call different fits for either full or truncated SVD
        if self._fit_svd_solver == "full":
            return self._fit_full(X, n_components)
        elif self._fit_svd_solver in ["arpack", "randomized"]:
            return self._fit_truncated(X, n_components, self._fit_svd_solver)
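
    # Dispatch sketch for the 'auto' policy above (hypothetical shapes):
    #
    #     X.shape == (1000, 600), n_components == 10  -> 'randomized'
    #     X.shape == (400, 50),   n_components == 10  -> 'full' (small data)
    #     X.shape == (1000, 600), n_components == 590 -> 'full' (>= 80% of
    #                                                    min(X.shape))
    #
    # n_components == 'mle' or a float in (0, 1) also falls through to 'full'.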

    def _fit_full(self, X, n_components):
        """Fit the model by computing full SVD on X."""
        n_samples, n_features = X.shape

        if n_components == "mle":
            if n_samples < n_features:
                raise ValueError(
                    "n_components='mle' is only supported if n_samples >= n_features"
                )
        elif not 0 <= n_components <= min(n_samples, n_features):
            raise ValueError(
                "n_components=%r must be between 0 and "
                "min(n_samples, n_features)=%r with "
                "svd_solver='full'" % (n_components, min(n_samples, n_features))
            )

        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        U, S, Vt = linalg.svd(X, full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        U, Vt = svd_flip(U, Vt)

        components_ = Vt

        # Get variance explained by singular values
        explained_variance_ = (S**2) / (n_samples - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = S.copy()  # Store the singular values.

        # Postprocess the number of components required
        if n_components == "mle":
            n_components = _infer_dimension(explained_variance_, n_samples)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold.
            # side='right' ensures that the number of components selected
            # has a cumulative explained variance strictly greater than the
            # float passed as n_components. More discussion in issue: #15669
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components, side="right") + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.0

        self.n_samples_ = n_samples
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        return U, S, Vt
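
    # Worked example for the float `n_components` branch above (hypothetical
    # numbers): if explained_variance_ratio_ is [0.6, 0.3, 0.08, 0.02] and
    # n_components=0.9, stable_cumsum gives [0.6, 0.9, 0.98, 1.0] and
    # np.searchsorted(..., 0.9, side="right") returns 2, so 2 + 1 = 3
    # components are kept; side="right" is what forces the kept cumulative
    # variance (0.98) strictly above the requested 0.9.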

    def _fit_truncated(self, X, n_components, svd_solver):
        """Fit the model by computing truncated SVD (by ARPACK or randomized)
        on X.
        """
        n_samples, n_features = X.shape

        if isinstance(n_components, str):
            raise ValueError(
                "n_components=%r cannot be a string with svd_solver='%s'"
                % (n_components, svd_solver)
            )
        elif not 1 <= n_components <= min(n_samples, n_features):
            raise ValueError(
                "n_components=%r must be between 1 and "
                "min(n_samples, n_features)=%r with "
                "svd_solver='%s'"
                % (n_components, min(n_samples, n_features), svd_solver)
            )
        elif svd_solver == "arpack" and n_components == min(n_samples, n_features):
            raise ValueError(
                "n_components=%r must be strictly less than "
                "min(n_samples, n_features)=%r with "
                "svd_solver='%s'"
                % (n_components, min(n_samples, n_features), svd_solver)
            )

        random_state = check_random_state(self.random_state)

        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        if svd_solver == "arpack":
            v0 = _init_arpack_v0(min(X.shape), random_state)
            U, S, Vt = svds(X, k=n_components, tol=self.tol, v0=v0)
            # svds doesn't abide by scipy.linalg.svd/randomized_svd
            # conventions, so reverse its outputs.
            S = S[::-1]
            # flip eigenvectors' sign to enforce deterministic output
            U, Vt = svd_flip(U[:, ::-1], Vt[::-1])

        elif svd_solver == "randomized":
            # sign flipping is done inside
            U, S, Vt = randomized_svd(
                X,
                n_components=n_components,
                n_oversamples=self.n_oversamples,
                n_iter=self.iterated_power,
                power_iteration_normalizer=self.power_iteration_normalizer,
                flip_sign=True,
                random_state=random_state,
            )

        self.n_samples_ = n_samples
        self.components_ = Vt
        self.n_components_ = n_components

        # Get variance explained by singular values
        self.explained_variance_ = (S**2) / (n_samples - 1)

        # Work around in-place variance calculation since at the time numpy
        # did not have a way to calculate variance in-place.
        N = X.shape[0] - 1
        np.square(X, out=X)
        np.sum(X, axis=0, out=X[0])
        total_var = (X[0] / N).sum()

        self.explained_variance_ratio_ = self.explained_variance_ / total_var
        self.singular_values_ = S.copy()  # Store the singular values.

        if self.n_components_ < min(n_features, n_samples):
            self.noise_variance_ = total_var - self.explained_variance_.sum()
            self.noise_variance_ /= min(n_features, n_samples) - n_components
        else:
            self.noise_variance_ = 0.0

        return U, S, Vt
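
    # Ordering note for the ARPACK branch above (hypothetical values): svds
    # returns singular values in ascending order, e.g.
    #
    #     S == array([0.54, 6.30])        # ascending, as returned by svds
    #     S[::-1] == array([6.30, 0.54])  # descending, matching linalg.svd
    #
    # and U[:, ::-1], Vt[::-1] apply the same reordering to the singular
    # vectors before `svd_flip` makes the signs deterministic.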

    def score_samples(self, X):
        """Return the log-likelihood of each sample.

        See "Pattern Recognition and Machine Learning"
        by C. Bishop, 12.2.1 p. 574
        or http://www.miketipping.com/papers/met-mppca.pdf

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data.

        Returns
        -------
        ll : ndarray of shape (n_samples,)
            Log-likelihood of each sample under the current model.
        """
        check_is_fitted(self)
        X = self._validate_data(X, dtype=[np.float64, np.float32], reset=False)
        Xr = X - self.mean_
        n_features = X.shape[1]
        precision = self.get_precision()
        log_like = -0.5 * (Xr * (np.dot(Xr, precision))).sum(axis=1)
        log_like -= 0.5 * (n_features * log(2.0 * np.pi) - fast_logdet(precision))
        return log_like

    def score(self, X, y=None):
        """Return the average log-likelihood of all samples.

        See "Pattern Recognition and Machine Learning"
        by C. Bishop, 12.2.1 p. 574
        or http://www.miketipping.com/papers/met-mppca.pdf

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data.

        y : Ignored
            Ignored.

        Returns
        -------
        ll : float
            Average log-likelihood of the samples under the current model.
        """
        return np.mean(self.score_samples(X))
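
    # Usage sketch: `score` is simply the mean of `score_samples`, so for a
    # fitted model the following holds up to floating point error:
    #
    #     pca = PCA(n_components=2).fit(X)
    #     np.isclose(pca.score(X), pca.score_samples(X).mean())  # True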

    def _more_tags(self):
        return {"preserves_dtype": [np.float64, np.float32]}