  1. """Matrix factorization with Sparse PCA."""
  2. # Author: Vlad Niculae, Gael Varoquaux, Alexandre Gramfort
  3. # License: BSD 3 clause
  4. from numbers import Integral, Real
  5. import numpy as np
  6. from ..base import (
  7. BaseEstimator,
  8. ClassNamePrefixFeaturesOutMixin,
  9. TransformerMixin,
  10. _fit_context,
  11. )
  12. from ..linear_model import ridge_regression
  13. from ..utils import check_random_state
  14. from ..utils._param_validation import Hidden, Interval, StrOptions
  15. from ..utils.extmath import svd_flip
  16. from ..utils.validation import check_array, check_is_fitted
  17. from ._dict_learning import MiniBatchDictionaryLearning, dict_learning


class _BaseSparsePCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
    """Base class for SparsePCA and MiniBatchSparsePCA."""

    _parameter_constraints: dict = {
        "n_components": [None, Interval(Integral, 1, None, closed="left")],
        "alpha": [Interval(Real, 0.0, None, closed="left")],
        "ridge_alpha": [Interval(Real, 0.0, None, closed="left")],
        "max_iter": [Interval(Integral, 0, None, closed="left")],
        "tol": [Interval(Real, 0.0, None, closed="left")],
        "method": [StrOptions({"lars", "cd"})],
        "n_jobs": [Integral, None],
        "verbose": ["verbose"],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        n_components=None,
        *,
        alpha=1,
        ridge_alpha=0.01,
        max_iter=1000,
        tol=1e-8,
        method="lars",
        n_jobs=None,
        verbose=False,
        random_state=None,
    ):
        self.n_components = n_components
        self.alpha = alpha
        self.ridge_alpha = ridge_alpha
        self.max_iter = max_iter
        self.tol = tol
        self.method = method
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.random_state = random_state
    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the model from data in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        random_state = check_random_state(self.random_state)
        X = self._validate_data(X)

        self.mean_ = X.mean(axis=0)
        X = X - self.mean_

        if self.n_components is None:
            n_components = X.shape[1]
        else:
            n_components = self.n_components

        return self._fit(X, n_components, random_state)
    def transform(self, X):
        """Least Squares projection of the data onto the sparse components.

        To avoid instability issues in case the system is under-determined,
        regularization can be applied (Ridge regression) via the
        `ridge_alpha` parameter.

        Note that the orthogonality of the sparse components is not enforced
        as it is in PCA, hence one cannot use a simple linear projection.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Test data to be transformed, must have the same number of
            features as the data used to train the model.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            Transformed data.
        """
        check_is_fitted(self)

        X = self._validate_data(X, reset=False)
        X = X - self.mean_

        U = ridge_regression(
            self.components_.T, X.T, self.ridge_alpha, solver="cholesky"
        )

        return U
    def inverse_transform(self, X):
        """Transform data from the latent space to the original space.

        This inversion is an approximation due to the loss of information
        induced by the forward decomposition.

        .. versionadded:: 1.2

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_components)
            Data in the latent space.

        Returns
        -------
        X_original : ndarray of shape (n_samples, n_features)
            Reconstructed data in the original space.
        """
        check_is_fitted(self)
        X = check_array(X)

        return (X @ self.components_) + self.mean_

    @property
    def _n_features_out(self):
        """Number of transformed output features."""
        return self.components_.shape[0]

    def _more_tags(self):
        return {
            "preserves_dtype": [np.float64, np.float32],
        }
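

# The following standalone sketch (illustrative only, not part of the library
# API) spells out the ridge projection performed by `_BaseSparsePCA.transform`
# above: because the sparse components V are not orthogonal, U is obtained by
# solving the regularized least-squares system (V V^T + alpha * I) U^T = V X_c^T
# rather than by a plain matrix product. The function name is hypothetical.
def _ridge_projection_sketch(X, components, mean, ridge_alpha=0.01):
    """NumPy-only equivalent of ``transform`` under the assumptions above."""
    X_centered = X - mean
    V = components  # shape (n_components, n_features)
    gram = V @ V.T + ridge_alpha * np.eye(V.shape[0])
    # Solve for U^T, then transpose to get shape (n_samples, n_components).
    return np.linalg.solve(gram, V @ X_centered.T).T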


class SparsePCA(_BaseSparsePCA):
    """Sparse Principal Components Analysis (SparsePCA).

    Finds the set of sparse components that can optimally reconstruct
    the data. The amount of sparseness is controllable by the coefficient
    of the L1 penalty, given by the parameter alpha.

    Read more in the :ref:`User Guide <SparsePCA>`.

    Parameters
    ----------
    n_components : int, default=None
        Number of sparse atoms to extract. If None, then ``n_components``
        is set to ``n_features``.

    alpha : float, default=1
        Sparsity controlling parameter. Higher values lead to sparser
        components.

    ridge_alpha : float, default=0.01
        Amount of ridge shrinkage to apply in order to improve
        conditioning when calling the transform method.

    max_iter : int, default=1000
        Maximum number of iterations to perform.

    tol : float, default=1e-8
        Tolerance for the stopping condition.

    method : {'lars', 'cd'}, default='lars'
        Method to be used for optimization.
        lars: uses the least angle regression method to solve the lasso problem
        (linear_model.lars_path)
        cd: uses the coordinate descent method to compute the
        Lasso solution (linear_model.Lasso). Lars will be faster if
        the estimated components are sparse.

    n_jobs : int, default=None
        Number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    U_init : ndarray of shape (n_samples, n_components), default=None
        Initial values for the loadings for warm restart scenarios. Only used
        if `U_init` and `V_init` are not None.

    V_init : ndarray of shape (n_components, n_features), default=None
        Initial values for the components for warm restart scenarios. Only used
        if `U_init` and `V_init` are not None.

    verbose : int or bool, default=False
        Controls the verbosity; the higher, the more messages. Defaults to 0.

    random_state : int, RandomState instance or None, default=None
        Used during dictionary learning. Pass an int for reproducible results
        across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    components_ : ndarray of shape (n_components, n_features)
        Sparse components extracted from the data.

    error_ : ndarray
        Vector of errors at each iteration.

    n_components_ : int
        Estimated number of components.

        .. versionadded:: 0.23

    n_iter_ : int
        Number of iterations run.

    mean_ : ndarray of shape (n_features,)
        Per-feature empirical mean, estimated from the training set.
        Equal to ``X.mean(axis=0)``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    PCA : Principal Component Analysis implementation.
    MiniBatchSparsePCA : Mini batch variant of `SparsePCA` that is faster but less
        accurate.
    DictionaryLearning : Generic dictionary learning problem using a sparse code.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.decomposition import SparsePCA
    >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)
    >>> transformer = SparsePCA(n_components=5, random_state=0)
    >>> transformer.fit(X)
    SparsePCA(...)
    >>> X_transformed = transformer.transform(X)
    >>> X_transformed.shape
    (200, 5)
    >>> # most values in the components_ are zero (sparsity)
    >>> np.mean(transformer.components_ == 0)
    0.9666...
    """
    _parameter_constraints: dict = {
        **_BaseSparsePCA._parameter_constraints,
        "U_init": [None, np.ndarray],
        "V_init": [None, np.ndarray],
    }

    def __init__(
        self,
        n_components=None,
        *,
        alpha=1,
        ridge_alpha=0.01,
        max_iter=1000,
        tol=1e-8,
        method="lars",
        n_jobs=None,
        U_init=None,
        V_init=None,
        verbose=False,
        random_state=None,
    ):
        super().__init__(
            n_components=n_components,
            alpha=alpha,
            ridge_alpha=ridge_alpha,
            max_iter=max_iter,
            tol=tol,
            method=method,
            n_jobs=n_jobs,
            verbose=verbose,
            random_state=random_state,
        )
        self.U_init = U_init
        self.V_init = V_init
    def _fit(self, X, n_components, random_state):
        """Specialized `fit` for SparsePCA."""
        code_init = self.V_init.T if self.V_init is not None else None
        dict_init = self.U_init.T if self.U_init is not None else None
        code, dictionary, E, self.n_iter_ = dict_learning(
            X.T,
            n_components,
            alpha=self.alpha,
            tol=self.tol,
            max_iter=self.max_iter,
            method=self.method,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=random_state,
            code_init=code_init,
            dict_init=dict_init,
            return_n_iter=True,
        )
        # flip eigenvectors' sign to enforce deterministic output
        code, dictionary = svd_flip(code, dictionary, u_based_decision=False)
        self.components_ = code.T
        components_norm = np.linalg.norm(self.components_, axis=1)[:, np.newaxis]
        components_norm[components_norm == 0] = 1
        self.components_ /= components_norm
        self.n_components_ = len(self.components_)

        self.error_ = E
        return self
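

# Sketch (illustrative only, not part of the library API): `SparsePCA._fit`
# above is dictionary learning applied to the transposed data matrix. The
# sparse "code" learned for `X.T` plays the role of the components, which is
# why `X.T` is passed to `dict_learning` and `code.T` is stored as
# `components_`. The function name is hypothetical; it reuses the module-level
# `dict_learning` import.
def _sparse_pca_via_dict_learning_sketch(X, n_components, alpha=1.0):
    """Recover sparse components with `dict_learning`, mirroring `_fit`."""
    X_centered = X - X.mean(axis=0)
    code, dictionary, errors = dict_learning(
        X_centered.T, n_components, alpha=alpha, random_state=0
    )
    components = code.T  # shape (n_components, n_features), mostly zeros
    # Normalize each component to unit norm, guarding against all-zero rows.
    norms = np.linalg.norm(components, axis=1, keepdims=True)
    norms[norms == 0] = 1
    return components / norms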


class MiniBatchSparsePCA(_BaseSparsePCA):
    """Mini-batch Sparse Principal Components Analysis.

    Finds the set of sparse components that can optimally reconstruct
    the data. The amount of sparseness is controllable by the coefficient
    of the L1 penalty, given by the parameter alpha.

    Read more in the :ref:`User Guide <SparsePCA>`.

    Parameters
    ----------
    n_components : int, default=None
        Number of sparse atoms to extract. If None, then ``n_components``
        is set to ``n_features``.

    alpha : float, default=1
        Sparsity controlling parameter. Higher values lead to sparser
        components.

    ridge_alpha : float, default=0.01
        Amount of ridge shrinkage to apply in order to improve
        conditioning when calling the transform method.

    n_iter : int, default=100
        Number of iterations to perform for each mini batch.

        .. deprecated:: 1.2
           `n_iter` is deprecated in 1.2 and will be removed in 1.4. Use
           `max_iter` instead.

    max_iter : int, default=None
        Maximum number of iterations over the complete dataset before
        stopping independently of any early stopping criterion heuristics.
        If `max_iter` is not `None`, `n_iter` is ignored.

        .. versionadded:: 1.2

    callback : callable, default=None
        Callable that gets invoked every five iterations.

    batch_size : int, default=3
        The number of features to take in each mini batch.

    verbose : int or bool, default=False
        Controls the verbosity; the higher, the more messages. Defaults to 0.

    shuffle : bool, default=True
        Whether to shuffle the data before splitting it in batches.

    n_jobs : int, default=None
        Number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    method : {'lars', 'cd'}, default='lars'
        Method to be used for optimization.
        lars: uses the least angle regression method to solve the lasso problem
        (linear_model.lars_path)
        cd: uses the coordinate descent method to compute the
        Lasso solution (linear_model.Lasso). Lars will be faster if
        the estimated components are sparse.

    random_state : int, RandomState instance or None, default=None
        Used for random shuffling when ``shuffle`` is set to ``True``,
        during online dictionary learning. Pass an int for reproducible results
        across multiple function calls.
        See :term:`Glossary <random_state>`.

    tol : float, default=1e-3
        Control early stopping based on the norm of the differences in the
        dictionary between 2 steps. Used only if `max_iter` is not None.

        To disable early stopping based on changes in the dictionary, set
        `tol` to 0.0.

        .. versionadded:: 1.1

    max_no_improvement : int or None, default=10
        Control early stopping based on the consecutive number of mini batches
        that do not yield an improvement on the smoothed cost function. Used
        only if `max_iter` is not None.

        To disable convergence detection based on cost function, set
        `max_no_improvement` to `None`.

        .. versionadded:: 1.1

    Attributes
    ----------
    components_ : ndarray of shape (n_components, n_features)
        Sparse components extracted from the data.

    n_components_ : int
        Estimated number of components.

        .. versionadded:: 0.23

    n_iter_ : int
        Number of iterations run.

    mean_ : ndarray of shape (n_features,)
        Per-feature empirical mean, estimated from the training set.
        Equal to ``X.mean(axis=0)``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    DictionaryLearning : Find a dictionary that sparsely encodes data.
    IncrementalPCA : Incremental principal components analysis.
    PCA : Principal component analysis.
    SparsePCA : Sparse Principal Components Analysis.
    TruncatedSVD : Dimensionality reduction using truncated SVD.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.decomposition import MiniBatchSparsePCA
    >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)
    >>> transformer = MiniBatchSparsePCA(n_components=5, batch_size=50,
    ...                                  max_iter=10, random_state=0)
    >>> transformer.fit(X)
    MiniBatchSparsePCA(...)
    >>> X_transformed = transformer.transform(X)
    >>> X_transformed.shape
    (200, 5)
    >>> # most values in the components_ are zero (sparsity)
    >>> np.mean(transformer.components_ == 0)
    0.9...
    """
    _parameter_constraints: dict = {
        **_BaseSparsePCA._parameter_constraints,
        "max_iter": [Interval(Integral, 0, None, closed="left"), None],
        "n_iter": [
            Interval(Integral, 0, None, closed="left"),
            Hidden(StrOptions({"deprecated"})),
        ],
        "callback": [None, callable],
        "batch_size": [Interval(Integral, 1, None, closed="left")],
        "shuffle": ["boolean"],
        "max_no_improvement": [Interval(Integral, 0, None, closed="left"), None],
    }

    def __init__(
        self,
        n_components=None,
        *,
        alpha=1,
        ridge_alpha=0.01,
        n_iter="deprecated",
        max_iter=None,
        callback=None,
        batch_size=3,
        verbose=False,
        shuffle=True,
        n_jobs=None,
        method="lars",
        random_state=None,
        tol=1e-3,
        max_no_improvement=10,
    ):
        super().__init__(
            n_components=n_components,
            alpha=alpha,
            ridge_alpha=ridge_alpha,
            max_iter=max_iter,
            tol=tol,
            method=method,
            n_jobs=n_jobs,
            verbose=verbose,
            random_state=random_state,
        )
        self.n_iter = n_iter
        self.callback = callback
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.max_no_improvement = max_no_improvement
    def _fit(self, X, n_components, random_state):
        """Specialized `fit` for MiniBatchSparsePCA."""
        transform_algorithm = "lasso_" + self.method
        est = MiniBatchDictionaryLearning(
            n_components=n_components,
            alpha=self.alpha,
            n_iter=self.n_iter,
            max_iter=self.max_iter,
            dict_init=None,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            n_jobs=self.n_jobs,
            fit_algorithm=self.method,
            random_state=random_state,
            transform_algorithm=transform_algorithm,
            transform_alpha=self.alpha,
            verbose=self.verbose,
            callback=self.callback,
            tol=self.tol,
            max_no_improvement=self.max_no_improvement,
        ).fit(X.T)

        self.components_, self.n_iter_ = est.transform(X.T).T, est.n_iter_

        components_norm = np.linalg.norm(self.components_, axis=1)[:, np.newaxis]
        components_norm[components_norm == 0] = 1
        self.components_ /= components_norm
        self.n_components_ = len(self.components_)

        return self
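

# End-to-end sketch (illustrative only, not part of the library API): fit,
# project, and approximately reconstruct data, exercising the `transform` /
# `inverse_transform` pair defined on `_BaseSparsePCA` above. The function
# name is hypothetical; the dataset helper comes from scikit-learn itself.
def _round_trip_sketch():
    """Return the mean squared reconstruction error of a fitted model."""
    from sklearn.datasets import make_friedman1

    X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)
    est = MiniBatchSparsePCA(
        n_components=5, batch_size=50, max_iter=10, random_state=0
    )
    codes = est.fit_transform(X)  # ridge projection onto the sparse components
    X_hat = est.inverse_transform(codes)  # codes @ components_ + mean_
    # The reconstruction is approximate: sparsity discards part of the variance.
    return np.mean((X - X_hat) ** 2)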