  1. """Factor Analysis.
  2. A latent linear variable model.
  3. FactorAnalysis is similar to probabilistic PCA implemented by PCA.score
  4. While PCA assumes Gaussian noise with the same variance for each
  5. feature, the FactorAnalysis model assumes different variances for
  6. each of them.
  7. This implementation is based on David Barber's Book,
  8. Bayesian Reasoning and Machine Learning,
  9. http://www.cs.ucl.ac.uk/staff/d.barber/brml,
  10. Algorithm 21.1
  11. """
# Author: Christian Osendorfer <osendorf@gmail.com>
#         Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Denis A. Engemann <denis-alexander.engemann@inria.fr>
# License: BSD3
import warnings
from math import log, sqrt
from numbers import Integral, Real

import numpy as np
from scipy import linalg

from ..base import (
    BaseEstimator,
    ClassNamePrefixFeaturesOutMixin,
    TransformerMixin,
    _fit_context,
)
from ..exceptions import ConvergenceWarning
from ..utils import check_random_state
from ..utils._param_validation import Interval, StrOptions
from ..utils.extmath import fast_logdet, randomized_svd, squared_norm
from ..utils.validation import check_is_fitted


class FactorAnalysis(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
    """Factor Analysis (FA).

    A simple linear generative model with Gaussian latent variables.

    The observations are assumed to be caused by a linear transformation of
    lower dimensional latent factors and added Gaussian noise.
    Without loss of generality the factors are distributed according to a
    Gaussian with zero mean and unit covariance. The noise is also zero mean
    and has an arbitrary diagonal covariance matrix.

    If we were to restrict the model further by assuming that the Gaussian
    noise is isotropic (all diagonal entries are the same), we would obtain
    :class:`PCA`.

    FactorAnalysis performs a maximum likelihood estimate of the so-called
    `loading` matrix, the transformation of the latent variables to the
    observed ones, using an SVD-based approach.

    Read more in the :ref:`User Guide <FA>`.

    .. versionadded:: 0.13

    Parameters
    ----------
    n_components : int, default=None
        Dimensionality of latent space, the number of components
        of ``X`` that are obtained after ``transform``.
        If None, n_components is set to the number of features.

    tol : float, default=1e-2
        Stopping tolerance for log-likelihood increase.

    copy : bool, default=True
        Whether to make a copy of X. If ``False``, the input X gets overwritten
        during fitting.

    max_iter : int, default=1000
        Maximum number of iterations.

    noise_variance_init : array-like of shape (n_features,), default=None
        The initial guess of the noise variance for each feature.
        If None, it defaults to np.ones(n_features).

    svd_method : {'lapack', 'randomized'}, default='randomized'
        Which SVD method to use. If 'lapack' use standard SVD from
        scipy.linalg, if 'randomized' use fast ``randomized_svd`` function.
        Defaults to 'randomized'. For most applications 'randomized' will
        be sufficiently precise while providing significant speed gains.
        Accuracy can also be improved by setting higher values for
        `iterated_power`. If this is not sufficient, for maximum precision
        you should choose 'lapack'.

    iterated_power : int, default=3
        Number of iterations for the power method. 3 by default. Only used
        if ``svd_method`` equals 'randomized'.

    rotation : {'varimax', 'quartimax'}, default=None
        If not None, apply the indicated rotation. Currently, varimax and
        quartimax are implemented. See
        `"The varimax criterion for analytic rotation in factor analysis"
        <https://link.springer.com/article/10.1007%2FBF02289233>`_
        H. F. Kaiser, 1958.

        .. versionadded:: 0.24

    random_state : int or RandomState instance, default=0
        Only used when ``svd_method`` equals 'randomized'. Pass an int for
        reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    components_ : ndarray of shape (n_components, n_features)
        Components with maximum variance.

    loglike_ : list of shape (n_iterations,)
        The log likelihood at each iteration.

    noise_variance_ : ndarray of shape (n_features,)
        The estimated noise variance for each feature.

    n_iter_ : int
        Number of iterations run.

    mean_ : ndarray of shape (n_features,)
        Per-feature empirical mean, estimated from the training set.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    PCA : Principal component analysis is also a latent linear variable
        model, but one that assumes equal noise variance for each feature.
        This extra assumption makes probabilistic PCA faster as it can be
        computed in closed form.
    FastICA : Independent component analysis, a latent variable model with
        non-Gaussian latent variables.

    References
    ----------
    - David Barber, Bayesian Reasoning and Machine Learning,
      Algorithm 21.1.
    - Christopher M. Bishop: Pattern Recognition and Machine Learning,
      Chapter 12.2.4.

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.decomposition import FactorAnalysis
    >>> X, _ = load_digits(return_X_y=True)
    >>> transformer = FactorAnalysis(n_components=7, random_state=0)
    >>> X_transformed = transformer.fit_transform(X)
    >>> X_transformed.shape
    (1797, 7)
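
    An orthogonal rotation can make the fitted components easier to
    interpret; the output shape is unchanged:

    >>> transformer = FactorAnalysis(n_components=7, rotation="varimax",
    ...                              random_state=0)
    >>> transformer.fit_transform(X).shape
    (1797, 7)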
  128. """
    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 0, None, closed="left"), None],
        "tol": [Interval(Real, 0.0, None, closed="left")],
        "copy": ["boolean"],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "noise_variance_init": ["array-like", None],
        "svd_method": [StrOptions({"randomized", "lapack"})],
        "iterated_power": [Interval(Integral, 0, None, closed="left")],
        "rotation": [StrOptions({"varimax", "quartimax"}), None],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        n_components=None,
        *,
        tol=1e-2,
        copy=True,
        max_iter=1000,
        noise_variance_init=None,
        svd_method="randomized",
        iterated_power=3,
        rotation=None,
        random_state=0,
    ):
        self.n_components = n_components
        self.copy = copy
        self.tol = tol
        self.max_iter = max_iter
        self.svd_method = svd_method
        self.noise_variance_init = noise_variance_init
        self.iterated_power = iterated_power
        self.random_state = random_state
        self.rotation = rotation

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the FactorAnalysis model to X using an SVD-based approach.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : Ignored
            Ignored parameter.

        Returns
        -------
        self : object
            FactorAnalysis class instance.
        """
        X = self._validate_data(X, copy=self.copy, dtype=np.float64)

        n_samples, n_features = X.shape
        n_components = self.n_components
        if n_components is None:
            n_components = n_features

        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        # some constant terms
        nsqrt = sqrt(n_samples)
        llconst = n_features * log(2.0 * np.pi) + n_components
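        # llconst collects the per-sample log-likelihood terms that stay
        # constant across iterations: n_features * log(2 * pi) from the
        # Gaussian normalization, plus n_components which, given the W
        # update in the loop below, is the trace contribution of the
        # retained directions.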
        var = np.var(X, axis=0)

        if self.noise_variance_init is None:
            psi = np.ones(n_features, dtype=X.dtype)
        else:
            if len(self.noise_variance_init) != n_features:
                raise ValueError(
                    "noise_variance_init dimension does not match "
                    "number of features: %d != %d"
                    % (len(self.noise_variance_init), n_features)
                )
            psi = np.array(self.noise_variance_init)

        loglike = []
        old_ll = -np.inf
        SMALL = 1e-12

        # we'll modify svd outputs to return unexplained variance
        # to allow for unified computation of loglikelihood
        if self.svd_method == "lapack":

            def my_svd(X):
                _, s, Vt = linalg.svd(X, full_matrices=False, check_finite=False)
                return (
                    s[:n_components],
                    Vt[:n_components],
                    squared_norm(s[n_components:]),
                )

        else:  # svd_method == "randomized"
            random_state = check_random_state(self.random_state)

            def my_svd(X):
                _, s, Vt = randomized_svd(
                    X,
                    n_components,
                    random_state=random_state,
                    n_iter=self.iterated_power,
                )
                return s, Vt, squared_norm(X) - squared_norm(s)
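
        # Each iteration below follows Barber, Algorithm 21.1: rescale the
        # centered data by the current noise estimate, take a truncated SVD
        # of X / (sqrt_psi * nsqrt), then update
        #     W   = sqrt(maximum(s**2 - 1, 0)) * Vt * sqrt_psi
        #     psi = maximum(var - sum(W**2, axis=0), SMALL)
        # The log-likelihood is assembled from the retained singular values
        # and the unexplained variance reported by my_svd.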
        for i in range(self.max_iter):
            # SMALL helps numerics
            sqrt_psi = np.sqrt(psi) + SMALL
            s, Vt, unexp_var = my_svd(X / (sqrt_psi * nsqrt))
            s **= 2
            # Use 'maximum' here to avoid sqrt problems.
            W = np.sqrt(np.maximum(s - 1.0, 0.0))[:, np.newaxis] * Vt
            del Vt
            W *= sqrt_psi

            # loglikelihood
            ll = llconst + np.sum(np.log(s))
            ll += unexp_var + np.sum(np.log(psi))
            ll *= -n_samples / 2.0
            loglike.append(ll)
            if (ll - old_ll) < self.tol:
                break
            old_ll = ll

            psi = np.maximum(var - np.sum(W**2, axis=0), SMALL)
        else:
            warnings.warn(
                "FactorAnalysis did not converge. You might want "
                "to increase the number of iterations.",
                ConvergenceWarning,
            )

        self.components_ = W
        if self.rotation is not None:
            self.components_ = self._rotate(W)
        self.noise_variance_ = psi
        self.loglike_ = loglike
        self.n_iter_ = i + 1
        return self

    def transform(self, X):
        """Apply dimensionality reduction to X using the model.

        Compute the expected mean of the latent variables.
        See Barber, 21.2.33 (or Bishop, 12.66).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            The latent variables of X.
        """
        check_is_fitted(self)
        X = self._validate_data(X, reset=False)

        Ih = np.eye(len(self.components_))

        X_transformed = X - self.mean_

        Wpsi = self.components_ / self.noise_variance_
        cov_z = linalg.inv(Ih + np.dot(Wpsi, self.components_.T))
        tmp = np.dot(X_transformed, Wpsi.T)
        X_transformed = np.dot(tmp, cov_z)

        return X_transformed

    def get_covariance(self):
        """Compute data covariance with the FactorAnalysis model.

        ``cov = components_.T * components_ + diag(noise_variance)``

        Returns
        -------
        cov : ndarray of shape (n_features, n_features)
            Estimated covariance of data.
        """
        check_is_fitted(self)

        cov = np.dot(self.components_.T, self.components_)
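        # cov.flat[:: len(cov) + 1] addresses every (n_features + 1)-th entry
        # of the flattened array, i.e. the main diagonal, so the noise
        # variances are added without allocating a full diagonal matrix.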
        cov.flat[:: len(cov) + 1] += self.noise_variance_  # modify diag inplace
        return cov

    def get_precision(self):
        """Compute data precision matrix with the FactorAnalysis model.

        Returns
        -------
        precision : ndarray of shape (n_features, n_features)
            Estimated precision of data.
        """
        check_is_fitted(self)

        n_features = self.components_.shape[1]

        # handle corner cases first
        if self.n_components == 0:
            return np.diag(1.0 / self.noise_variance_)
        if self.n_components == n_features:
            return linalg.inv(self.get_covariance())

        # Get precision using matrix inversion lemma
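        # With cov = W^T W + diag(psi), where W = components_ and
        # psi = noise_variance_, the Woodbury identity gives
        #     cov^-1 = diag(psi^-1)
        #              - psi^-1 W^T (I + W diag(psi^-1) W^T)^-1 W psi^-1
        # so only an (n_components, n_components) matrix has to be inverted.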
        components_ = self.components_
        precision = np.dot(components_ / self.noise_variance_, components_.T)
        precision.flat[:: len(precision) + 1] += 1.0
        precision = np.dot(components_.T, np.dot(linalg.inv(precision), components_))
        precision /= self.noise_variance_[:, np.newaxis]
        precision /= -self.noise_variance_[np.newaxis, :]
        precision.flat[:: len(precision) + 1] += 1.0 / self.noise_variance_
        return precision

    def score_samples(self, X):
        """Compute the log-likelihood of each sample.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            The data.

        Returns
        -------
        ll : ndarray of shape (n_samples,)
            Log-likelihood of each sample under the current model.
        """
        check_is_fitted(self)
        X = self._validate_data(X, reset=False)
        Xr = X - self.mean_
        precision = self.get_precision()
        n_features = X.shape[1]
        log_like = -0.5 * (Xr * (np.dot(Xr, precision))).sum(axis=1)
        log_like -= 0.5 * (n_features * log(2.0 * np.pi) - fast_logdet(precision))
        return log_like

    def score(self, X, y=None):
        """Compute the average log-likelihood of the samples.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            The data.

        y : Ignored
            Ignored parameter.

        Returns
        -------
        ll : float
            Average log-likelihood of the samples under the current model.
        """
        return np.mean(self.score_samples(X))

    def _rotate(self, components, n_components=None, tol=1e-6):
        "Rotate the factor analysis solution."
        # note that tol is not exposed
        return _ortho_rotation(components.T, method=self.rotation, tol=tol)[
            : self.n_components
        ]

    @property
    def _n_features_out(self):
        """Number of transformed output features."""
        return self.components_.shape[0]


def _ortho_rotation(components, method="varimax", tol=1e-6, max_iter=100):
    """Return rotated components."""
    nrow, ncol = components.shape
    rotation_matrix = np.eye(ncol)
    var = 0

    for _ in range(max_iter):
        comp_rot = np.dot(components, rotation_matrix)
        if method == "varimax":
            tmp = comp_rot * np.transpose((comp_rot**2).sum(axis=0) / nrow)
        elif method == "quartimax":
            tmp = 0
        u, s, v = np.linalg.svd(np.dot(components.T, comp_rot**3 - tmp))
        rotation_matrix = np.dot(u, v)
        var_new = np.sum(s)
        if var != 0 and var_new < var * (1 + tol):
            break
        var = var_new

    return np.dot(components, rotation_matrix).T
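

# Usage sketch (kept as comments: this module relies on relative imports and
# is not meant to be run directly). A minimal check, assuming scikit-learn is
# installed, that the fitted model reproduces its own covariance formula
# ``cov = components_.T * components_ + diag(noise_variance)``:
#
#     import numpy as np
#     from sklearn.datasets import load_iris
#     from sklearn.decomposition import FactorAnalysis
#
#     X, _ = load_iris(return_X_y=True)
#     fa = FactorAnalysis(n_components=2, random_state=0).fit(X)
#     expected = fa.components_.T @ fa.components_ + np.diag(fa.noise_variance_)
#     assert np.allclose(fa.get_covariance(), expected)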