| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590 |
- """Affinity Propagation clustering algorithm."""
- # Author: Alexandre Gramfort alexandre.gramfort@inria.fr
- # Gael Varoquaux gael.varoquaux@normalesup.org
- # License: BSD 3 clause
- import warnings
- from numbers import Integral, Real
- import numpy as np
- from .._config import config_context
- from ..base import BaseEstimator, ClusterMixin, _fit_context
- from ..exceptions import ConvergenceWarning
- from ..metrics import euclidean_distances, pairwise_distances_argmin
- from ..utils import check_random_state
- from ..utils._param_validation import Interval, StrOptions, validate_params
- from ..utils.validation import check_is_fitted
- def _equal_similarities_and_preferences(S, preference):
- def all_equal_preferences():
- return np.all(preference == preference.flat[0])
- def all_equal_similarities():
- # Create mask to ignore diagonal of S
- mask = np.ones(S.shape, dtype=bool)
- np.fill_diagonal(mask, 0)
- return np.all(S[mask].flat == S[mask].flat[0])
- return all_equal_preferences() and all_equal_similarities()
- def _affinity_propagation(
- S,
- *,
- preference,
- convergence_iter,
- max_iter,
- damping,
- verbose,
- return_n_iter,
- random_state,
- ):
- """Main affinity propagation algorithm."""
- n_samples = S.shape[0]
- if n_samples == 1 or _equal_similarities_and_preferences(S, preference):
- # It makes no sense to run the algorithm in this case, so return 1 or
- # n_samples clusters, depending on preferences
- warnings.warn(
- "All samples have mutually equal similarities. "
- "Returning arbitrary cluster center(s)."
- )
- if preference.flat[0] >= S.flat[n_samples - 1]:
- return (
- (np.arange(n_samples), np.arange(n_samples), 0)
- if return_n_iter
- else (np.arange(n_samples), np.arange(n_samples))
- )
- else:
- return (
- (np.array([0]), np.array([0] * n_samples), 0)
- if return_n_iter
- else (np.array([0]), np.array([0] * n_samples))
- )
- # Place preference on the diagonal of S
- S.flat[:: (n_samples + 1)] = preference
- A = np.zeros((n_samples, n_samples))
- R = np.zeros((n_samples, n_samples)) # Initialize messages
- # Intermediate results
- tmp = np.zeros((n_samples, n_samples))
- # Remove degeneracies
- S += (
- np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100
- ) * random_state.standard_normal(size=(n_samples, n_samples))
- # Execute parallel affinity propagation updates
- e = np.zeros((n_samples, convergence_iter))
- ind = np.arange(n_samples)
- for it in range(max_iter):
- # tmp = A + S; compute responsibilities
- np.add(A, S, tmp)
- I = np.argmax(tmp, axis=1)
- Y = tmp[ind, I] # np.max(A + S, axis=1)
- tmp[ind, I] = -np.inf
- Y2 = np.max(tmp, axis=1)
- # tmp = Rnew
- np.subtract(S, Y[:, None], tmp)
- tmp[ind, I] = S[ind, I] - Y2
- # Damping
- tmp *= 1 - damping
- R *= damping
- R += tmp
- # tmp = Rp; compute availabilities
- np.maximum(R, 0, tmp)
- tmp.flat[:: n_samples + 1] = R.flat[:: n_samples + 1]
- # tmp = -Anew
- tmp -= np.sum(tmp, axis=0)
- dA = np.diag(tmp).copy()
- tmp.clip(0, np.inf, tmp)
- tmp.flat[:: n_samples + 1] = dA
- # Damping
- tmp *= 1 - damping
- A *= damping
- A -= tmp
- # Check for convergence
- E = (np.diag(A) + np.diag(R)) > 0
- e[:, it % convergence_iter] = E
- K = np.sum(E, axis=0)
- if it >= convergence_iter:
- se = np.sum(e, axis=1)
- unconverged = np.sum((se == convergence_iter) + (se == 0)) != n_samples
- if (not unconverged and (K > 0)) or (it == max_iter):
- never_converged = False
- if verbose:
- print("Converged after %d iterations." % it)
- break
- else:
- never_converged = True
- if verbose:
- print("Did not converge")
- I = np.flatnonzero(E)
- K = I.size # Identify exemplars
- if K > 0:
- if never_converged:
- warnings.warn(
- (
- "Affinity propagation did not converge, this model "
- "may return degenerate cluster centers and labels."
- ),
- ConvergenceWarning,
- )
- c = np.argmax(S[:, I], axis=1)
- c[I] = np.arange(K) # Identify clusters
- # Refine the final set of exemplars and clusters and return results
- for k in range(K):
- ii = np.where(c == k)[0]
- j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
- I[k] = ii[j]
- c = np.argmax(S[:, I], axis=1)
- c[I] = np.arange(K)
- labels = I[c]
- # Reduce labels to a sorted, gapless, list
- cluster_centers_indices = np.unique(labels)
- labels = np.searchsorted(cluster_centers_indices, labels)
- else:
- warnings.warn(
- (
- "Affinity propagation did not converge and this model "
- "will not have any cluster centers."
- ),
- ConvergenceWarning,
- )
- labels = np.array([-1] * n_samples)
- cluster_centers_indices = []
- if return_n_iter:
- return cluster_centers_indices, labels, it + 1
- else:
- return cluster_centers_indices, labels
- ###############################################################################
- # Public API
- @validate_params(
- {
- "S": ["array-like"],
- "return_n_iter": ["boolean"],
- },
- prefer_skip_nested_validation=False,
- )
- def affinity_propagation(
- S,
- *,
- preference=None,
- convergence_iter=15,
- max_iter=200,
- damping=0.5,
- copy=True,
- verbose=False,
- return_n_iter=False,
- random_state=None,
- ):
- """Perform Affinity Propagation Clustering of data.
- Read more in the :ref:`User Guide <affinity_propagation>`.
- Parameters
- ----------
- S : array-like of shape (n_samples, n_samples)
- Matrix of similarities between points.
- preference : array-like of shape (n_samples,) or float, default=None
- Preferences for each point - points with larger values of
- preferences are more likely to be chosen as exemplars. The number of
- exemplars, i.e. of clusters, is influenced by the input preferences
- value. If the preferences are not passed as arguments, they will be
- set to the median of the input similarities (resulting in a moderate
- number of clusters). For a smaller amount of clusters, this can be set
- to the minimum value of the similarities.
- convergence_iter : int, default=15
- Number of iterations with no change in the number
- of estimated clusters that stops the convergence.
- max_iter : int, default=200
- Maximum number of iterations.
- damping : float, default=0.5
- Damping factor between 0.5 and 1.
- copy : bool, default=True
- If copy is False, the affinity matrix is modified inplace by the
- algorithm, for memory efficiency.
- verbose : bool, default=False
- The verbosity level.
- return_n_iter : bool, default=False
- Whether or not to return the number of iterations.
- random_state : int, RandomState instance or None, default=None
- Pseudo-random number generator to control the starting state.
- Use an int for reproducible results across function calls.
- See the :term:`Glossary <random_state>`.
- .. versionadded:: 0.23
- this parameter was previously hardcoded as 0.
- Returns
- -------
- cluster_centers_indices : ndarray of shape (n_clusters,)
- Index of clusters centers.
- labels : ndarray of shape (n_samples,)
- Cluster labels for each point.
- n_iter : int
- Number of iterations run. Returned only if `return_n_iter` is
- set to True.
- Notes
- -----
- For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
- <sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
- When the algorithm does not converge, it will still return a arrays of
- ``cluster_center_indices`` and labels if there are any exemplars/clusters,
- however they may be degenerate and should be used with caution.
- When all training samples have equal similarities and equal preferences,
- the assignment of cluster centers and labels depends on the preference.
- If the preference is smaller than the similarities, a single cluster center
- and label ``0`` for every sample will be returned. Otherwise, every
- training sample becomes its own cluster center and is assigned a unique
- label.
- References
- ----------
- Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
- Between Data Points", Science Feb. 2007
- """
- estimator = AffinityPropagation(
- damping=damping,
- max_iter=max_iter,
- convergence_iter=convergence_iter,
- copy=copy,
- preference=preference,
- affinity="precomputed",
- verbose=verbose,
- random_state=random_state,
- ).fit(S)
- if return_n_iter:
- return estimator.cluster_centers_indices_, estimator.labels_, estimator.n_iter_
- return estimator.cluster_centers_indices_, estimator.labels_
- class AffinityPropagation(ClusterMixin, BaseEstimator):
- """Perform Affinity Propagation Clustering of data.
- Read more in the :ref:`User Guide <affinity_propagation>`.
- Parameters
- ----------
- damping : float, default=0.5
- Damping factor in the range `[0.5, 1.0)` is the extent to
- which the current value is maintained relative to
- incoming values (weighted 1 - damping). This in order
- to avoid numerical oscillations when updating these
- values (messages).
- max_iter : int, default=200
- Maximum number of iterations.
- convergence_iter : int, default=15
- Number of iterations with no change in the number
- of estimated clusters that stops the convergence.
- copy : bool, default=True
- Make a copy of input data.
- preference : array-like of shape (n_samples,) or float, default=None
- Preferences for each point - points with larger values of
- preferences are more likely to be chosen as exemplars. The number
- of exemplars, ie of clusters, is influenced by the input
- preferences value. If the preferences are not passed as arguments,
- they will be set to the median of the input similarities.
- affinity : {'euclidean', 'precomputed'}, default='euclidean'
- Which affinity to use. At the moment 'precomputed' and
- ``euclidean`` are supported. 'euclidean' uses the
- negative squared euclidean distance between points.
- verbose : bool, default=False
- Whether to be verbose.
- random_state : int, RandomState instance or None, default=None
- Pseudo-random number generator to control the starting state.
- Use an int for reproducible results across function calls.
- See the :term:`Glossary <random_state>`.
- .. versionadded:: 0.23
- this parameter was previously hardcoded as 0.
- Attributes
- ----------
- cluster_centers_indices_ : ndarray of shape (n_clusters,)
- Indices of cluster centers.
- cluster_centers_ : ndarray of shape (n_clusters, n_features)
- Cluster centers (if affinity != ``precomputed``).
- labels_ : ndarray of shape (n_samples,)
- Labels of each point.
- affinity_matrix_ : ndarray of shape (n_samples, n_samples)
- Stores the affinity matrix used in ``fit``.
- n_iter_ : int
- Number of iterations taken to converge.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- AgglomerativeClustering : Recursively merges the pair of
- clusters that minimally increases a given linkage distance.
- FeatureAgglomeration : Similar to AgglomerativeClustering,
- but recursively merges features instead of samples.
- KMeans : K-Means clustering.
- MiniBatchKMeans : Mini-Batch K-Means clustering.
- MeanShift : Mean shift clustering using a flat kernel.
- SpectralClustering : Apply clustering to a projection
- of the normalized Laplacian.
- Notes
- -----
- For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
- <sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
- The algorithmic complexity of affinity propagation is quadratic
- in the number of points.
- When the algorithm does not converge, it will still return a arrays of
- ``cluster_center_indices`` and labels if there are any exemplars/clusters,
- however they may be degenerate and should be used with caution.
- When ``fit`` does not converge, ``cluster_centers_`` is still populated
- however it may be degenerate. In such a case, proceed with caution.
- If ``fit`` does not converge and fails to produce any ``cluster_centers_``
- then ``predict`` will label every sample as ``-1``.
- When all training samples have equal similarities and equal preferences,
- the assignment of cluster centers and labels depends on the preference.
- If the preference is smaller than the similarities, ``fit`` will result in
- a single cluster center and label ``0`` for every sample. Otherwise, every
- training sample becomes its own cluster center and is assigned a unique
- label.
- References
- ----------
- Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
- Between Data Points", Science Feb. 2007
- Examples
- --------
- >>> from sklearn.cluster import AffinityPropagation
- >>> import numpy as np
- >>> X = np.array([[1, 2], [1, 4], [1, 0],
- ... [4, 2], [4, 4], [4, 0]])
- >>> clustering = AffinityPropagation(random_state=5).fit(X)
- >>> clustering
- AffinityPropagation(random_state=5)
- >>> clustering.labels_
- array([0, 0, 0, 1, 1, 1])
- >>> clustering.predict([[0, 0], [4, 4]])
- array([0, 1])
- >>> clustering.cluster_centers_
- array([[1, 2],
- [4, 2]])
- """
- _parameter_constraints: dict = {
- "damping": [Interval(Real, 0.5, 1.0, closed="left")],
- "max_iter": [Interval(Integral, 1, None, closed="left")],
- "convergence_iter": [Interval(Integral, 1, None, closed="left")],
- "copy": ["boolean"],
- "preference": [
- "array-like",
- Interval(Real, None, None, closed="neither"),
- None,
- ],
- "affinity": [StrOptions({"euclidean", "precomputed"})],
- "verbose": ["verbose"],
- "random_state": ["random_state"],
- }
- def __init__(
- self,
- *,
- damping=0.5,
- max_iter=200,
- convergence_iter=15,
- copy=True,
- preference=None,
- affinity="euclidean",
- verbose=False,
- random_state=None,
- ):
- self.damping = damping
- self.max_iter = max_iter
- self.convergence_iter = convergence_iter
- self.copy = copy
- self.verbose = verbose
- self.preference = preference
- self.affinity = affinity
- self.random_state = random_state
- def _more_tags(self):
- return {"pairwise": self.affinity == "precomputed"}
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y=None):
- """Fit the clustering from features, or affinity matrix.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
- array-like of shape (n_samples, n_samples)
- Training instances to cluster, or similarities / affinities between
- instances if ``affinity='precomputed'``. If a sparse feature matrix
- is provided, it will be converted into a sparse ``csr_matrix``.
- y : Ignored
- Not used, present here for API consistency by convention.
- Returns
- -------
- self
- Returns the instance itself.
- """
- if self.affinity == "precomputed":
- accept_sparse = False
- else:
- accept_sparse = "csr"
- X = self._validate_data(X, accept_sparse=accept_sparse)
- if self.affinity == "precomputed":
- self.affinity_matrix_ = X.copy() if self.copy else X
- else: # self.affinity == "euclidean"
- self.affinity_matrix_ = -euclidean_distances(X, squared=True)
- if self.affinity_matrix_.shape[0] != self.affinity_matrix_.shape[1]:
- raise ValueError(
- "The matrix of similarities must be a square array. "
- f"Got {self.affinity_matrix_.shape} instead."
- )
- if self.preference is None:
- preference = np.median(self.affinity_matrix_)
- else:
- preference = self.preference
- preference = np.array(preference, copy=False)
- random_state = check_random_state(self.random_state)
- (
- self.cluster_centers_indices_,
- self.labels_,
- self.n_iter_,
- ) = _affinity_propagation(
- self.affinity_matrix_,
- max_iter=self.max_iter,
- convergence_iter=self.convergence_iter,
- preference=preference,
- damping=self.damping,
- verbose=self.verbose,
- return_n_iter=True,
- random_state=random_state,
- )
- if self.affinity != "precomputed":
- self.cluster_centers_ = X[self.cluster_centers_indices_].copy()
- return self
- def predict(self, X):
- """Predict the closest cluster each sample in X belongs to.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- New data to predict. If a sparse matrix is provided, it will be
- converted into a sparse ``csr_matrix``.
- Returns
- -------
- labels : ndarray of shape (n_samples,)
- Cluster labels.
- """
- check_is_fitted(self)
- X = self._validate_data(X, reset=False, accept_sparse="csr")
- if not hasattr(self, "cluster_centers_"):
- raise ValueError(
- "Predict method is not supported when affinity='precomputed'."
- )
- if self.cluster_centers_.shape[0] > 0:
- with config_context(assume_finite=True):
- return pairwise_distances_argmin(X, self.cluster_centers_)
- else:
- warnings.warn(
- (
- "This model does not have any cluster centers "
- "because affinity propagation did not converge. "
- "Labeling every sample as '-1'."
- ),
- ConvergenceWarning,
- )
- return np.array([-1] * X.shape[0])
- def fit_predict(self, X, y=None):
- """Fit clustering from features/affinity matrix; return cluster labels.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
- array-like of shape (n_samples, n_samples)
- Training instances to cluster, or similarities / affinities between
- instances if ``affinity='precomputed'``. If a sparse feature matrix
- is provided, it will be converted into a sparse ``csr_matrix``.
- y : Ignored
- Not used, present here for API consistency by convention.
- Returns
- -------
- labels : ndarray of shape (n_samples,)
- Cluster labels.
- """
- return super().fit_predict(X, y)
|