| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929 |
- """
- =============================================================
- Online Latent Dirichlet Allocation with variational inference
- =============================================================
- This implementation is modified from Matthew D. Hoffman's onlineldavb code
- Link: https://github.com/blei-lab/onlineldavb
- """
- # Author: Chyi-Kwei Yau
- # Author: Matthew D. Hoffman (original onlineldavb implementation)
- from numbers import Integral, Real
- import numpy as np
- import scipy.sparse as sp
- from joblib import effective_n_jobs
- from scipy.special import gammaln, logsumexp
- from ..base import (
- BaseEstimator,
- ClassNamePrefixFeaturesOutMixin,
- TransformerMixin,
- _fit_context,
- )
- from ..utils import check_random_state, gen_batches, gen_even_slices
- from ..utils._param_validation import Interval, StrOptions
- from ..utils.parallel import Parallel, delayed
- from ..utils.validation import check_is_fitted, check_non_negative
- from ._online_lda_fast import (
- _dirichlet_expectation_1d as cy_dirichlet_expectation_1d,
- )
- from ._online_lda_fast import (
- _dirichlet_expectation_2d,
- )
- from ._online_lda_fast import (
- mean_change as cy_mean_change,
- )
- EPS = np.finfo(float).eps
- def _update_doc_distribution(
- X,
- exp_topic_word_distr,
- doc_topic_prior,
- max_doc_update_iter,
- mean_change_tol,
- cal_sstats,
- random_state,
- ):
- """E-step: update document-topic distribution.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Document word matrix.
- exp_topic_word_distr : ndarray of shape (n_topics, n_features)
- Exponential value of expectation of log topic word distribution.
- In the literature, this is `exp(E[log(beta)])`.
- doc_topic_prior : float
- Prior of document topic distribution `theta`.
- max_doc_update_iter : int
- Max number of iterations for updating document topic distribution in
- the E-step.
- mean_change_tol : float
- Stopping tolerance for updating document topic distribution in E-step.
- cal_sstats : bool
- Parameter that indicate to calculate sufficient statistics or not.
- Set `cal_sstats` to `True` when we need to run M-step.
- random_state : RandomState instance or None
- Parameter that indicate how to initialize document topic distribution.
- Set `random_state` to None will initialize document topic distribution
- to a constant number.
- Returns
- -------
- (doc_topic_distr, suff_stats) :
- `doc_topic_distr` is unnormalized topic distribution for each document.
- In the literature, this is `gamma`. we can calculate `E[log(theta)]`
- from it.
- `suff_stats` is expected sufficient statistics for the M-step.
- When `cal_sstats == False`, this will be None.
- """
- is_sparse_x = sp.issparse(X)
- n_samples, n_features = X.shape
- n_topics = exp_topic_word_distr.shape[0]
- if random_state:
- doc_topic_distr = random_state.gamma(100.0, 0.01, (n_samples, n_topics)).astype(
- X.dtype, copy=False
- )
- else:
- doc_topic_distr = np.ones((n_samples, n_topics), dtype=X.dtype)
- # In the literature, this is `exp(E[log(theta)])`
- exp_doc_topic = np.exp(_dirichlet_expectation_2d(doc_topic_distr))
- # diff on `component_` (only calculate it when `cal_diff` is True)
- suff_stats = (
- np.zeros(exp_topic_word_distr.shape, dtype=X.dtype) if cal_sstats else None
- )
- if is_sparse_x:
- X_data = X.data
- X_indices = X.indices
- X_indptr = X.indptr
- # These cython functions are called in a nested loop on usually very small arrays
- # (length=n_topics). In that case, finding the appropriate signature of the
- # fused-typed function can be more costly than its execution, hence the dispatch
- # is done outside of the loop.
- ctype = "float" if X.dtype == np.float32 else "double"
- mean_change = cy_mean_change[ctype]
- dirichlet_expectation_1d = cy_dirichlet_expectation_1d[ctype]
- eps = np.finfo(X.dtype).eps
- for idx_d in range(n_samples):
- if is_sparse_x:
- ids = X_indices[X_indptr[idx_d] : X_indptr[idx_d + 1]]
- cnts = X_data[X_indptr[idx_d] : X_indptr[idx_d + 1]]
- else:
- ids = np.nonzero(X[idx_d, :])[0]
- cnts = X[idx_d, ids]
- doc_topic_d = doc_topic_distr[idx_d, :]
- # The next one is a copy, since the inner loop overwrites it.
- exp_doc_topic_d = exp_doc_topic[idx_d, :].copy()
- exp_topic_word_d = exp_topic_word_distr[:, ids]
- # Iterate between `doc_topic_d` and `norm_phi` until convergence
- for _ in range(0, max_doc_update_iter):
- last_d = doc_topic_d
- # The optimal phi_{dwk} is proportional to
- # exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]).
- norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + eps
- doc_topic_d = exp_doc_topic_d * np.dot(cnts / norm_phi, exp_topic_word_d.T)
- # Note: adds doc_topic_prior to doc_topic_d, in-place.
- dirichlet_expectation_1d(doc_topic_d, doc_topic_prior, exp_doc_topic_d)
- if mean_change(last_d, doc_topic_d) < mean_change_tol:
- break
- doc_topic_distr[idx_d, :] = doc_topic_d
- # Contribution of document d to the expected sufficient
- # statistics for the M step.
- if cal_sstats:
- norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + eps
- suff_stats[:, ids] += np.outer(exp_doc_topic_d, cnts / norm_phi)
- return (doc_topic_distr, suff_stats)
- class LatentDirichletAllocation(
- ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator
- ):
- """Latent Dirichlet Allocation with online variational Bayes algorithm.
- The implementation is based on [1]_ and [2]_.
- .. versionadded:: 0.17
- Read more in the :ref:`User Guide <LatentDirichletAllocation>`.
- Parameters
- ----------
- n_components : int, default=10
- Number of topics.
- .. versionchanged:: 0.19
- ``n_topics`` was renamed to ``n_components``
- doc_topic_prior : float, default=None
- Prior of document topic distribution `theta`. If the value is None,
- defaults to `1 / n_components`.
- In [1]_, this is called `alpha`.
- topic_word_prior : float, default=None
- Prior of topic word distribution `beta`. If the value is None, defaults
- to `1 / n_components`.
- In [1]_, this is called `eta`.
- learning_method : {'batch', 'online'}, default='batch'
- Method used to update `_component`. Only used in :meth:`fit` method.
- In general, if the data size is large, the online update will be much
- faster than the batch update.
- Valid options::
- 'batch': Batch variational Bayes method. Use all training data in
- each EM update.
- Old `components_` will be overwritten in each iteration.
- 'online': Online variational Bayes method. In each EM update, use
- mini-batch of training data to update the ``components_``
- variable incrementally. The learning rate is controlled by the
- ``learning_decay`` and the ``learning_offset`` parameters.
- .. versionchanged:: 0.20
- The default learning method is now ``"batch"``.
- learning_decay : float, default=0.7
- It is a parameter that control learning rate in the online learning
- method. The value should be set between (0.5, 1.0] to guarantee
- asymptotic convergence. When the value is 0.0 and batch_size is
- ``n_samples``, the update method is same as batch learning. In the
- literature, this is called kappa.
- learning_offset : float, default=10.0
- A (positive) parameter that downweights early iterations in online
- learning. It should be greater than 1.0. In the literature, this is
- called tau_0.
- max_iter : int, default=10
- The maximum number of passes over the training data (aka epochs).
- It only impacts the behavior in the :meth:`fit` method, and not the
- :meth:`partial_fit` method.
- batch_size : int, default=128
- Number of documents to use in each EM iteration. Only used in online
- learning.
- evaluate_every : int, default=-1
- How often to evaluate perplexity. Only used in `fit` method.
- set it to 0 or negative number to not evaluate perplexity in
- training at all. Evaluating perplexity can help you check convergence
- in training process, but it will also increase total training time.
- Evaluating perplexity in every iteration might increase training time
- up to two-fold.
- total_samples : int, default=1e6
- Total number of documents. Only used in the :meth:`partial_fit` method.
- perp_tol : float, default=1e-1
- Perplexity tolerance in batch learning. Only used when
- ``evaluate_every`` is greater than 0.
- mean_change_tol : float, default=1e-3
- Stopping tolerance for updating document topic distribution in E-step.
- max_doc_update_iter : int, default=100
- Max number of iterations for updating document topic distribution in
- the E-step.
- n_jobs : int, default=None
- The number of jobs to use in the E-step.
- ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
- ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
- for more details.
- verbose : int, default=0
- Verbosity level.
- random_state : int, RandomState instance or None, default=None
- Pass an int for reproducible results across multiple function calls.
- See :term:`Glossary <random_state>`.
- Attributes
- ----------
- components_ : ndarray of shape (n_components, n_features)
- Variational parameters for topic word distribution. Since the complete
- conditional for topic word distribution is a Dirichlet,
- ``components_[i, j]`` can be viewed as pseudocount that represents the
- number of times word `j` was assigned to topic `i`.
- It can also be viewed as distribution over the words for each topic
- after normalization:
- ``model.components_ / model.components_.sum(axis=1)[:, np.newaxis]``.
- exp_dirichlet_component_ : ndarray of shape (n_components, n_features)
- Exponential value of expectation of log topic word distribution.
- In the literature, this is `exp(E[log(beta)])`.
- n_batch_iter_ : int
- Number of iterations of the EM step.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- n_iter_ : int
- Number of passes over the dataset.
- bound_ : float
- Final perplexity score on training set.
- doc_topic_prior_ : float
- Prior of document topic distribution `theta`. If the value is None,
- it is `1 / n_components`.
- random_state_ : RandomState instance
- RandomState instance that is generated either from a seed, the random
- number generator or by `np.random`.
- topic_word_prior_ : float
- Prior of topic word distribution `beta`. If the value is None, it is
- `1 / n_components`.
- See Also
- --------
- sklearn.discriminant_analysis.LinearDiscriminantAnalysis:
- A classifier with a linear decision boundary, generated by fitting
- class conditional densities to the data and using Bayes' rule.
- References
- ----------
- .. [1] "Online Learning for Latent Dirichlet Allocation", Matthew D.
- Hoffman, David M. Blei, Francis Bach, 2010
- https://github.com/blei-lab/onlineldavb
- .. [2] "Stochastic Variational Inference", Matthew D. Hoffman,
- David M. Blei, Chong Wang, John Paisley, 2013
- Examples
- --------
- >>> from sklearn.decomposition import LatentDirichletAllocation
- >>> from sklearn.datasets import make_multilabel_classification
- >>> # This produces a feature matrix of token counts, similar to what
- >>> # CountVectorizer would produce on text.
- >>> X, _ = make_multilabel_classification(random_state=0)
- >>> lda = LatentDirichletAllocation(n_components=5,
- ... random_state=0)
- >>> lda.fit(X)
- LatentDirichletAllocation(...)
- >>> # get topics for some given samples:
- >>> lda.transform(X[-2:])
- array([[0.00360392, 0.25499205, 0.0036211 , 0.64236448, 0.09541846],
- [0.15297572, 0.00362644, 0.44412786, 0.39568399, 0.003586 ]])
- """
- _parameter_constraints: dict = {
- "n_components": [Interval(Integral, 0, None, closed="neither")],
- "doc_topic_prior": [None, Interval(Real, 0, 1, closed="both")],
- "topic_word_prior": [None, Interval(Real, 0, 1, closed="both")],
- "learning_method": [StrOptions({"batch", "online"})],
- "learning_decay": [Interval(Real, 0, 1, closed="both")],
- "learning_offset": [Interval(Real, 1.0, None, closed="left")],
- "max_iter": [Interval(Integral, 0, None, closed="left")],
- "batch_size": [Interval(Integral, 0, None, closed="neither")],
- "evaluate_every": [Interval(Integral, None, None, closed="neither")],
- "total_samples": [Interval(Real, 0, None, closed="neither")],
- "perp_tol": [Interval(Real, 0, None, closed="left")],
- "mean_change_tol": [Interval(Real, 0, None, closed="left")],
- "max_doc_update_iter": [Interval(Integral, 0, None, closed="left")],
- "n_jobs": [None, Integral],
- "verbose": ["verbose"],
- "random_state": ["random_state"],
- }
- def __init__(
- self,
- n_components=10,
- *,
- doc_topic_prior=None,
- topic_word_prior=None,
- learning_method="batch",
- learning_decay=0.7,
- learning_offset=10.0,
- max_iter=10,
- batch_size=128,
- evaluate_every=-1,
- total_samples=1e6,
- perp_tol=1e-1,
- mean_change_tol=1e-3,
- max_doc_update_iter=100,
- n_jobs=None,
- verbose=0,
- random_state=None,
- ):
- self.n_components = n_components
- self.doc_topic_prior = doc_topic_prior
- self.topic_word_prior = topic_word_prior
- self.learning_method = learning_method
- self.learning_decay = learning_decay
- self.learning_offset = learning_offset
- self.max_iter = max_iter
- self.batch_size = batch_size
- self.evaluate_every = evaluate_every
- self.total_samples = total_samples
- self.perp_tol = perp_tol
- self.mean_change_tol = mean_change_tol
- self.max_doc_update_iter = max_doc_update_iter
- self.n_jobs = n_jobs
- self.verbose = verbose
- self.random_state = random_state
- def _init_latent_vars(self, n_features, dtype=np.float64):
- """Initialize latent variables."""
- self.random_state_ = check_random_state(self.random_state)
- self.n_batch_iter_ = 1
- self.n_iter_ = 0
- if self.doc_topic_prior is None:
- self.doc_topic_prior_ = 1.0 / self.n_components
- else:
- self.doc_topic_prior_ = self.doc_topic_prior
- if self.topic_word_prior is None:
- self.topic_word_prior_ = 1.0 / self.n_components
- else:
- self.topic_word_prior_ = self.topic_word_prior
- init_gamma = 100.0
- init_var = 1.0 / init_gamma
- # In the literature, this is called `lambda`
- self.components_ = self.random_state_.gamma(
- init_gamma, init_var, (self.n_components, n_features)
- ).astype(dtype, copy=False)
- # In the literature, this is `exp(E[log(beta)])`
- self.exp_dirichlet_component_ = np.exp(
- _dirichlet_expectation_2d(self.components_)
- )
- def _e_step(self, X, cal_sstats, random_init, parallel=None):
- """E-step in EM update.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Document word matrix.
- cal_sstats : bool
- Parameter that indicate whether to calculate sufficient statistics
- or not. Set ``cal_sstats`` to True when we need to run M-step.
- random_init : bool
- Parameter that indicate whether to initialize document topic
- distribution randomly in the E-step. Set it to True in training
- steps.
- parallel : joblib.Parallel, default=None
- Pre-initialized instance of joblib.Parallel.
- Returns
- -------
- (doc_topic_distr, suff_stats) :
- `doc_topic_distr` is unnormalized topic distribution for each
- document. In the literature, this is called `gamma`.
- `suff_stats` is expected sufficient statistics for the M-step.
- When `cal_sstats == False`, it will be None.
- """
- # Run e-step in parallel
- random_state = self.random_state_ if random_init else None
- # TODO: make Parallel._effective_n_jobs public instead?
- n_jobs = effective_n_jobs(self.n_jobs)
- if parallel is None:
- parallel = Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1))
- results = parallel(
- delayed(_update_doc_distribution)(
- X[idx_slice, :],
- self.exp_dirichlet_component_,
- self.doc_topic_prior_,
- self.max_doc_update_iter,
- self.mean_change_tol,
- cal_sstats,
- random_state,
- )
- for idx_slice in gen_even_slices(X.shape[0], n_jobs)
- )
- # merge result
- doc_topics, sstats_list = zip(*results)
- doc_topic_distr = np.vstack(doc_topics)
- if cal_sstats:
- # This step finishes computing the sufficient statistics for the
- # M-step.
- suff_stats = np.zeros(self.components_.shape, dtype=self.components_.dtype)
- for sstats in sstats_list:
- suff_stats += sstats
- suff_stats *= self.exp_dirichlet_component_
- else:
- suff_stats = None
- return (doc_topic_distr, suff_stats)
- def _em_step(self, X, total_samples, batch_update, parallel=None):
- """EM update for 1 iteration.
- update `_component` by batch VB or online VB.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Document word matrix.
- total_samples : int
- Total number of documents. It is only used when
- batch_update is `False`.
- batch_update : bool
- Parameter that controls updating method.
- `True` for batch learning, `False` for online learning.
- parallel : joblib.Parallel, default=None
- Pre-initialized instance of joblib.Parallel
- Returns
- -------
- doc_topic_distr : ndarray of shape (n_samples, n_components)
- Unnormalized document topic distribution.
- """
- # E-step
- _, suff_stats = self._e_step(
- X, cal_sstats=True, random_init=True, parallel=parallel
- )
- # M-step
- if batch_update:
- self.components_ = self.topic_word_prior_ + suff_stats
- else:
- # online update
- # In the literature, the weight is `rho`
- weight = np.power(
- self.learning_offset + self.n_batch_iter_, -self.learning_decay
- )
- doc_ratio = float(total_samples) / X.shape[0]
- self.components_ *= 1 - weight
- self.components_ += weight * (
- self.topic_word_prior_ + doc_ratio * suff_stats
- )
- # update `component_` related variables
- self.exp_dirichlet_component_ = np.exp(
- _dirichlet_expectation_2d(self.components_)
- )
- self.n_batch_iter_ += 1
- return
- def _more_tags(self):
- return {
- "preserves_dtype": [np.float64, np.float32],
- "requires_positive_X": True,
- }
- def _check_non_neg_array(self, X, reset_n_features, whom):
- """check X format
- check X format and make sure no negative value in X.
- Parameters
- ----------
- X : array-like or sparse matrix
- """
- dtype = [np.float64, np.float32] if reset_n_features else self.components_.dtype
- X = self._validate_data(
- X,
- reset=reset_n_features,
- accept_sparse="csr",
- dtype=dtype,
- )
- check_non_negative(X, whom)
- return X
- @_fit_context(prefer_skip_nested_validation=True)
- def partial_fit(self, X, y=None):
- """Online VB with Mini-Batch update.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Document word matrix.
- y : Ignored
- Not used, present here for API consistency by convention.
- Returns
- -------
- self
- Partially fitted estimator.
- """
- first_time = not hasattr(self, "components_")
- X = self._check_non_neg_array(
- X, reset_n_features=first_time, whom="LatentDirichletAllocation.partial_fit"
- )
- n_samples, n_features = X.shape
- batch_size = self.batch_size
- # initialize parameters or check
- if first_time:
- self._init_latent_vars(n_features, dtype=X.dtype)
- if n_features != self.components_.shape[1]:
- raise ValueError(
- "The provided data has %d dimensions while "
- "the model was trained with feature size %d."
- % (n_features, self.components_.shape[1])
- )
- n_jobs = effective_n_jobs(self.n_jobs)
- with Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) as parallel:
- for idx_slice in gen_batches(n_samples, batch_size):
- self._em_step(
- X[idx_slice, :],
- total_samples=self.total_samples,
- batch_update=False,
- parallel=parallel,
- )
- return self
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y=None):
- """Learn model for the data X with variational Bayes method.
- When `learning_method` is 'online', use mini-batch update.
- Otherwise, use batch update.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Document word matrix.
- y : Ignored
- Not used, present here for API consistency by convention.
- Returns
- -------
- self
- Fitted estimator.
- """
- X = self._check_non_neg_array(
- X, reset_n_features=True, whom="LatentDirichletAllocation.fit"
- )
- n_samples, n_features = X.shape
- max_iter = self.max_iter
- evaluate_every = self.evaluate_every
- learning_method = self.learning_method
- batch_size = self.batch_size
- # initialize parameters
- self._init_latent_vars(n_features, dtype=X.dtype)
- # change to perplexity later
- last_bound = None
- n_jobs = effective_n_jobs(self.n_jobs)
- with Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) as parallel:
- for i in range(max_iter):
- if learning_method == "online":
- for idx_slice in gen_batches(n_samples, batch_size):
- self._em_step(
- X[idx_slice, :],
- total_samples=n_samples,
- batch_update=False,
- parallel=parallel,
- )
- else:
- # batch update
- self._em_step(
- X, total_samples=n_samples, batch_update=True, parallel=parallel
- )
- # check perplexity
- if evaluate_every > 0 and (i + 1) % evaluate_every == 0:
- doc_topics_distr, _ = self._e_step(
- X, cal_sstats=False, random_init=False, parallel=parallel
- )
- bound = self._perplexity_precomp_distr(
- X, doc_topics_distr, sub_sampling=False
- )
- if self.verbose:
- print(
- "iteration: %d of max_iter: %d, perplexity: %.4f"
- % (i + 1, max_iter, bound)
- )
- if last_bound and abs(last_bound - bound) < self.perp_tol:
- break
- last_bound = bound
- elif self.verbose:
- print("iteration: %d of max_iter: %d" % (i + 1, max_iter))
- self.n_iter_ += 1
- # calculate final perplexity value on train set
- doc_topics_distr, _ = self._e_step(
- X, cal_sstats=False, random_init=False, parallel=parallel
- )
- self.bound_ = self._perplexity_precomp_distr(
- X, doc_topics_distr, sub_sampling=False
- )
- return self
- def _unnormalized_transform(self, X):
- """Transform data X according to fitted model.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Document word matrix.
- Returns
- -------
- doc_topic_distr : ndarray of shape (n_samples, n_components)
- Document topic distribution for X.
- """
- doc_topic_distr, _ = self._e_step(X, cal_sstats=False, random_init=False)
- return doc_topic_distr
- def transform(self, X):
- """Transform data X according to the fitted model.
- .. versionchanged:: 0.18
- *doc_topic_distr* is now normalized
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Document word matrix.
- Returns
- -------
- doc_topic_distr : ndarray of shape (n_samples, n_components)
- Document topic distribution for X.
- """
- check_is_fitted(self)
- X = self._check_non_neg_array(
- X, reset_n_features=False, whom="LatentDirichletAllocation.transform"
- )
- doc_topic_distr = self._unnormalized_transform(X)
- doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis]
- return doc_topic_distr
- def _approx_bound(self, X, doc_topic_distr, sub_sampling):
- """Estimate the variational bound.
- Estimate the variational bound over "all documents" using only the
- documents passed in as X. Since log-likelihood of each word cannot
- be computed directly, we use this bound to estimate it.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Document word matrix.
- doc_topic_distr : ndarray of shape (n_samples, n_components)
- Document topic distribution. In the literature, this is called
- gamma.
- sub_sampling : bool, default=False
- Compensate for subsampling of documents.
- It is used in calculate bound in online learning.
- Returns
- -------
- score : float
- """
- def _loglikelihood(prior, distr, dirichlet_distr, size):
- # calculate log-likelihood
- score = np.sum((prior - distr) * dirichlet_distr)
- score += np.sum(gammaln(distr) - gammaln(prior))
- score += np.sum(gammaln(prior * size) - gammaln(np.sum(distr, 1)))
- return score
- is_sparse_x = sp.issparse(X)
- n_samples, n_components = doc_topic_distr.shape
- n_features = self.components_.shape[1]
- score = 0
- dirichlet_doc_topic = _dirichlet_expectation_2d(doc_topic_distr)
- dirichlet_component_ = _dirichlet_expectation_2d(self.components_)
- doc_topic_prior = self.doc_topic_prior_
- topic_word_prior = self.topic_word_prior_
- if is_sparse_x:
- X_data = X.data
- X_indices = X.indices
- X_indptr = X.indptr
- # E[log p(docs | theta, beta)]
- for idx_d in range(0, n_samples):
- if is_sparse_x:
- ids = X_indices[X_indptr[idx_d] : X_indptr[idx_d + 1]]
- cnts = X_data[X_indptr[idx_d] : X_indptr[idx_d + 1]]
- else:
- ids = np.nonzero(X[idx_d, :])[0]
- cnts = X[idx_d, ids]
- temp = (
- dirichlet_doc_topic[idx_d, :, np.newaxis] + dirichlet_component_[:, ids]
- )
- norm_phi = logsumexp(temp, axis=0)
- score += np.dot(cnts, norm_phi)
- # compute E[log p(theta | alpha) - log q(theta | gamma)]
- score += _loglikelihood(
- doc_topic_prior, doc_topic_distr, dirichlet_doc_topic, self.n_components
- )
- # Compensate for the subsampling of the population of documents
- if sub_sampling:
- doc_ratio = float(self.total_samples) / n_samples
- score *= doc_ratio
- # E[log p(beta | eta) - log q (beta | lambda)]
- score += _loglikelihood(
- topic_word_prior, self.components_, dirichlet_component_, n_features
- )
- return score
- def score(self, X, y=None):
- """Calculate approximate log-likelihood as score.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Document word matrix.
- y : Ignored
- Not used, present here for API consistency by convention.
- Returns
- -------
- score : float
- Use approximate bound as score.
- """
- check_is_fitted(self)
- X = self._check_non_neg_array(
- X, reset_n_features=False, whom="LatentDirichletAllocation.score"
- )
- doc_topic_distr = self._unnormalized_transform(X)
- score = self._approx_bound(X, doc_topic_distr, sub_sampling=False)
- return score
- def _perplexity_precomp_distr(self, X, doc_topic_distr=None, sub_sampling=False):
- """Calculate approximate perplexity for data X with ability to accept
- precomputed doc_topic_distr
- Perplexity is defined as exp(-1. * log-likelihood per word)
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Document word matrix.
- doc_topic_distr : ndarray of shape (n_samples, n_components), \
- default=None
- Document topic distribution.
- If it is None, it will be generated by applying transform on X.
- Returns
- -------
- score : float
- Perplexity score.
- """
- if doc_topic_distr is None:
- doc_topic_distr = self._unnormalized_transform(X)
- else:
- n_samples, n_components = doc_topic_distr.shape
- if n_samples != X.shape[0]:
- raise ValueError(
- "Number of samples in X and doc_topic_distr do not match."
- )
- if n_components != self.n_components:
- raise ValueError("Number of topics does not match.")
- current_samples = X.shape[0]
- bound = self._approx_bound(X, doc_topic_distr, sub_sampling)
- if sub_sampling:
- word_cnt = X.sum() * (float(self.total_samples) / current_samples)
- else:
- word_cnt = X.sum()
- perword_bound = bound / word_cnt
- return np.exp(-1.0 * perword_bound)
- def perplexity(self, X, sub_sampling=False):
- """Calculate approximate perplexity for data X.
- Perplexity is defined as exp(-1. * log-likelihood per word)
- .. versionchanged:: 0.19
- *doc_topic_distr* argument has been deprecated and is ignored
- because user no longer has access to unnormalized distribution
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Document word matrix.
- sub_sampling : bool
- Do sub-sampling or not.
- Returns
- -------
- score : float
- Perplexity score.
- """
- check_is_fitted(self)
- X = self._check_non_neg_array(
- X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity"
- )
- return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling)
- @property
- def _n_features_out(self):
- """Number of transformed output features."""
- return self.components_.shape[0]
|