| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113 |
- """Ordering Points To Identify the Clustering Structure (OPTICS)
- These routines execute the OPTICS algorithm, and implement various
- cluster extraction methods of the ordered list.
- Authors: Shane Grigsby <refuge@rocktalus.com>
- Adrin Jalali <adrinjalali@gmail.com>
- Erich Schubert <erich@debian.org>
- Hanmin Qin <qinhanmin2005@sina.com>
- License: BSD 3 clause
- """
- import warnings
- from numbers import Integral, Real
- import numpy as np
- from scipy.sparse import SparseEfficiencyWarning, issparse
- from ..base import BaseEstimator, ClusterMixin, _fit_context
- from ..exceptions import DataConversionWarning
- from ..metrics import pairwise_distances
- from ..metrics.pairwise import _VALID_METRICS, PAIRWISE_BOOLEAN_FUNCTIONS
- from ..neighbors import NearestNeighbors
- from ..utils import gen_batches, get_chunk_n_rows
- from ..utils._param_validation import (
- HasMethods,
- Interval,
- RealNotInt,
- StrOptions,
- validate_params,
- )
- from ..utils.validation import check_memory
- class OPTICS(ClusterMixin, BaseEstimator):
- """Estimate clustering structure from vector array.
- OPTICS (Ordering Points To Identify the Clustering Structure), closely
- related to DBSCAN, finds core sample of high density and expands clusters
- from them [1]_. Unlike DBSCAN, keeps cluster hierarchy for a variable
- neighborhood radius. Better suited for usage on large datasets than the
- current sklearn implementation of DBSCAN.
- Clusters are then extracted using a DBSCAN-like method
- (cluster_method = 'dbscan') or an automatic
- technique proposed in [1]_ (cluster_method = 'xi').
- This implementation deviates from the original OPTICS by first performing
- k-nearest-neighborhood searches on all points to identify core sizes, then
- computing only the distances to unprocessed points when constructing the
- cluster order. Note that we do not employ a heap to manage the expansion
- candidates, so the time complexity will be O(n^2).
- Read more in the :ref:`User Guide <optics>`.
- Parameters
- ----------
- min_samples : int > 1 or float between 0 and 1, default=5
- The number of samples in a neighborhood for a point to be considered as
- a core point. Also, up and down steep regions can't have more than
- ``min_samples`` consecutive non-steep points. Expressed as an absolute
- number or a fraction of the number of samples (rounded to be at least
- 2).
- max_eps : float, default=np.inf
- The maximum distance between two samples for one to be considered as
- in the neighborhood of the other. Default value of ``np.inf`` will
- identify clusters across all scales; reducing ``max_eps`` will result
- in shorter run times.
- metric : str or callable, default='minkowski'
- Metric to use for distance computation. Any metric from scikit-learn
- or scipy.spatial.distance can be used.
- If metric is a callable function, it is called on each
- pair of instances (rows) and the resulting value recorded. The callable
- should take two arrays as input and return one value indicating the
- distance between them. This works for Scipy's metrics, but is less
- efficient than passing the metric name as a string. If metric is
- "precomputed", `X` is assumed to be a distance matrix and must be
- square.
- Valid values for metric are:
- - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
- 'manhattan']
- - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
- 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
- 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
- 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
- 'yule']
- Sparse matrices are only supported by scikit-learn metrics.
- See the documentation for scipy.spatial.distance for details on these
- metrics.
- .. note::
- `'kulsinski'` is deprecated from SciPy 1.9 and will removed in SciPy 1.11.
- p : float, default=2
- Parameter for the Minkowski metric from
- :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is
- equivalent to using manhattan_distance (l1), and euclidean_distance
- (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
- metric_params : dict, default=None
- Additional keyword arguments for the metric function.
- cluster_method : str, default='xi'
- The extraction method used to extract clusters using the calculated
- reachability and ordering. Possible values are "xi" and "dbscan".
- eps : float, default=None
- The maximum distance between two samples for one to be considered as
- in the neighborhood of the other. By default it assumes the same value
- as ``max_eps``.
- Used only when ``cluster_method='dbscan'``.
- xi : float between 0 and 1, default=0.05
- Determines the minimum steepness on the reachability plot that
- constitutes a cluster boundary. For example, an upwards point in the
- reachability plot is defined by the ratio from one point to its
- successor being at most 1-xi.
- Used only when ``cluster_method='xi'``.
- predecessor_correction : bool, default=True
- Correct clusters according to the predecessors calculated by OPTICS
- [2]_. This parameter has minimal effect on most datasets.
- Used only when ``cluster_method='xi'``.
- min_cluster_size : int > 1 or float between 0 and 1, default=None
- Minimum number of samples in an OPTICS cluster, expressed as an
- absolute number or a fraction of the number of samples (rounded to be
- at least 2). If ``None``, the value of ``min_samples`` is used instead.
- Used only when ``cluster_method='xi'``.
- algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
- Algorithm used to compute the nearest neighbors:
- - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`.
- - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`.
- - 'brute' will use a brute-force search.
- - 'auto' (default) will attempt to decide the most appropriate
- algorithm based on the values passed to :meth:`fit` method.
- Note: fitting on sparse input will override the setting of
- this parameter, using brute force.
- leaf_size : int, default=30
- Leaf size passed to :class:`~sklearn.neighbors.BallTree` or
- :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the
- construction and query, as well as the memory required to store the
- tree. The optimal value depends on the nature of the problem.
- memory : str or object with the joblib.Memory interface, default=None
- Used to cache the output of the computation of the tree.
- By default, no caching is done. If a string is given, it is the
- path to the caching directory.
- n_jobs : int, default=None
- The number of parallel jobs to run for neighbors search.
- ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
- ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
- for more details.
- Attributes
- ----------
- labels_ : ndarray of shape (n_samples,)
- Cluster labels for each point in the dataset given to fit().
- Noisy samples and points which are not included in a leaf cluster
- of ``cluster_hierarchy_`` are labeled as -1.
- reachability_ : ndarray of shape (n_samples,)
- Reachability distances per sample, indexed by object order. Use
- ``clust.reachability_[clust.ordering_]`` to access in cluster order.
- ordering_ : ndarray of shape (n_samples,)
- The cluster ordered list of sample indices.
- core_distances_ : ndarray of shape (n_samples,)
- Distance at which each sample becomes a core point, indexed by object
- order. Points which will never be core have a distance of inf. Use
- ``clust.core_distances_[clust.ordering_]`` to access in cluster order.
- predecessor_ : ndarray of shape (n_samples,)
- Point that a sample was reached from, indexed by object order.
- Seed points have a predecessor of -1.
- cluster_hierarchy_ : ndarray of shape (n_clusters, 2)
- The list of clusters in the form of ``[start, end]`` in each row, with
- all indices inclusive. The clusters are ordered according to
- ``(end, -start)`` (ascending) so that larger clusters encompassing
- smaller clusters come after those smaller ones. Since ``labels_`` does
- not reflect the hierarchy, usually
- ``len(cluster_hierarchy_) > np.unique(optics.labels_)``. Please also
- note that these indices are of the ``ordering_``, i.e.
- ``X[ordering_][start:end + 1]`` form a cluster.
- Only available when ``cluster_method='xi'``.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- DBSCAN : A similar clustering for a specified neighborhood radius (eps).
- Our implementation is optimized for runtime.
- References
- ----------
- .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,
- and Jörg Sander. "OPTICS: ordering points to identify the clustering
- structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.
- .. [2] Schubert, Erich, Michael Gertz.
- "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of
- the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.
- Examples
- --------
- >>> from sklearn.cluster import OPTICS
- >>> import numpy as np
- >>> X = np.array([[1, 2], [2, 5], [3, 6],
- ... [8, 7], [8, 8], [7, 3]])
- >>> clustering = OPTICS(min_samples=2).fit(X)
- >>> clustering.labels_
- array([0, 0, 0, 1, 1, 1])
- For a more detailed example see
- :ref:`sphx_glr_auto_examples_cluster_plot_optics.py`.
- """
- _parameter_constraints: dict = {
- "min_samples": [
- Interval(Integral, 2, None, closed="left"),
- Interval(RealNotInt, 0, 1, closed="both"),
- ],
- "max_eps": [Interval(Real, 0, None, closed="both")],
- "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
- "p": [Interval(Real, 1, None, closed="left")],
- "metric_params": [dict, None],
- "cluster_method": [StrOptions({"dbscan", "xi"})],
- "eps": [Interval(Real, 0, None, closed="both"), None],
- "xi": [Interval(Real, 0, 1, closed="both")],
- "predecessor_correction": ["boolean"],
- "min_cluster_size": [
- Interval(Integral, 2, None, closed="left"),
- Interval(RealNotInt, 0, 1, closed="right"),
- None,
- ],
- "algorithm": [StrOptions({"auto", "brute", "ball_tree", "kd_tree"})],
- "leaf_size": [Interval(Integral, 1, None, closed="left")],
- "memory": [str, HasMethods("cache"), None],
- "n_jobs": [Integral, None],
- }
- def __init__(
- self,
- *,
- min_samples=5,
- max_eps=np.inf,
- metric="minkowski",
- p=2,
- metric_params=None,
- cluster_method="xi",
- eps=None,
- xi=0.05,
- predecessor_correction=True,
- min_cluster_size=None,
- algorithm="auto",
- leaf_size=30,
- memory=None,
- n_jobs=None,
- ):
- self.max_eps = max_eps
- self.min_samples = min_samples
- self.min_cluster_size = min_cluster_size
- self.algorithm = algorithm
- self.metric = metric
- self.metric_params = metric_params
- self.p = p
- self.leaf_size = leaf_size
- self.cluster_method = cluster_method
- self.eps = eps
- self.xi = xi
- self.predecessor_correction = predecessor_correction
- self.memory = memory
- self.n_jobs = n_jobs
- @_fit_context(
- # Optics.metric is not validated yet
- prefer_skip_nested_validation=False
- )
- def fit(self, X, y=None):
- """Perform OPTICS clustering.
- Extracts an ordered list of points and reachability distances, and
- performs initial clustering using ``max_eps`` distance specified at
- OPTICS object instantiation.
- Parameters
- ----------
- X : {ndarray, sparse matrix} of shape (n_samples, n_features), or \
- (n_samples, n_samples) if metric='precomputed'
- A feature array, or array of distances between samples if
- metric='precomputed'. If a sparse matrix is provided, it will be
- converted into CSR format.
- y : Ignored
- Not used, present for API consistency by convention.
- Returns
- -------
- self : object
- Returns a fitted instance of self.
- """
- dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float
- if dtype == bool and X.dtype != bool:
- msg = (
- "Data will be converted to boolean for"
- f" metric {self.metric}, to avoid this warning,"
- " you may convert the data prior to calling fit."
- )
- warnings.warn(msg, DataConversionWarning)
- X = self._validate_data(X, dtype=dtype, accept_sparse="csr")
- if self.metric == "precomputed" and issparse(X):
- with warnings.catch_warnings():
- warnings.simplefilter("ignore", SparseEfficiencyWarning)
- # Set each diagonal to an explicit value so each point is its
- # own neighbor
- X.setdiag(X.diagonal())
- memory = check_memory(self.memory)
- (
- self.ordering_,
- self.core_distances_,
- self.reachability_,
- self.predecessor_,
- ) = memory.cache(compute_optics_graph)(
- X=X,
- min_samples=self.min_samples,
- algorithm=self.algorithm,
- leaf_size=self.leaf_size,
- metric=self.metric,
- metric_params=self.metric_params,
- p=self.p,
- n_jobs=self.n_jobs,
- max_eps=self.max_eps,
- )
- # Extract clusters from the calculated orders and reachability
- if self.cluster_method == "xi":
- labels_, clusters_ = cluster_optics_xi(
- reachability=self.reachability_,
- predecessor=self.predecessor_,
- ordering=self.ordering_,
- min_samples=self.min_samples,
- min_cluster_size=self.min_cluster_size,
- xi=self.xi,
- predecessor_correction=self.predecessor_correction,
- )
- self.cluster_hierarchy_ = clusters_
- elif self.cluster_method == "dbscan":
- if self.eps is None:
- eps = self.max_eps
- else:
- eps = self.eps
- if eps > self.max_eps:
- raise ValueError(
- "Specify an epsilon smaller than %s. Got %s." % (self.max_eps, eps)
- )
- labels_ = cluster_optics_dbscan(
- reachability=self.reachability_,
- core_distances=self.core_distances_,
- ordering=self.ordering_,
- eps=eps,
- )
- self.labels_ = labels_
- return self
- def _validate_size(size, n_samples, param_name):
- if size > n_samples:
- raise ValueError(
- "%s must be no greater than the number of samples (%d). Got %d"
- % (param_name, n_samples, size)
- )
- # OPTICS helper functions
- def _compute_core_distances_(X, neighbors, min_samples, working_memory):
- """Compute the k-th nearest neighbor of each sample.
- Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1]
- but with more memory efficiency.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- The data.
- neighbors : NearestNeighbors instance
- The fitted nearest neighbors estimator.
- working_memory : int, default=None
- The sought maximum memory for temporary distance matrix chunks.
- When None (default), the value of
- ``sklearn.get_config()['working_memory']`` is used.
- Returns
- -------
- core_distances : ndarray of shape (n_samples,)
- Distance at which each sample becomes a core point.
- Points which will never be core have a distance of inf.
- """
- n_samples = X.shape[0]
- core_distances = np.empty(n_samples)
- core_distances.fill(np.nan)
- chunk_n_rows = get_chunk_n_rows(
- row_bytes=16 * min_samples, max_n_rows=n_samples, working_memory=working_memory
- )
- slices = gen_batches(n_samples, chunk_n_rows)
- for sl in slices:
- core_distances[sl] = neighbors.kneighbors(X[sl], min_samples)[0][:, -1]
- return core_distances
- @validate_params(
- {
- "X": [np.ndarray, "sparse matrix"],
- "min_samples": [
- Interval(Integral, 2, None, closed="left"),
- Interval(RealNotInt, 0, 1, closed="both"),
- ],
- "max_eps": [Interval(Real, 0, None, closed="both")],
- "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
- "p": [Interval(Real, 0, None, closed="right"), None],
- "metric_params": [dict, None],
- "algorithm": [StrOptions({"auto", "brute", "ball_tree", "kd_tree"})],
- "leaf_size": [Interval(Integral, 1, None, closed="left")],
- "n_jobs": [Integral, None],
- },
- prefer_skip_nested_validation=False, # metric is not validated yet
- )
- def compute_optics_graph(
- X, *, min_samples, max_eps, metric, p, metric_params, algorithm, leaf_size, n_jobs
- ):
- """Compute the OPTICS reachability graph.
- Read more in the :ref:`User Guide <optics>`.
- Parameters
- ----------
- X : {ndarray, sparse matrix} of shape (n_samples, n_features), or \
- (n_samples, n_samples) if metric='precomputed'
- A feature array, or array of distances between samples if
- metric='precomputed'.
- min_samples : int > 1 or float between 0 and 1
- The number of samples in a neighborhood for a point to be considered
- as a core point. Expressed as an absolute number or a fraction of the
- number of samples (rounded to be at least 2).
- max_eps : float, default=np.inf
- The maximum distance between two samples for one to be considered as
- in the neighborhood of the other. Default value of ``np.inf`` will
- identify clusters across all scales; reducing ``max_eps`` will result
- in shorter run times.
- metric : str or callable, default='minkowski'
- Metric to use for distance computation. Any metric from scikit-learn
- or scipy.spatial.distance can be used.
- If metric is a callable function, it is called on each
- pair of instances (rows) and the resulting value recorded. The callable
- should take two arrays as input and return one value indicating the
- distance between them. This works for Scipy's metrics, but is less
- efficient than passing the metric name as a string. If metric is
- "precomputed", X is assumed to be a distance matrix and must be square.
- Valid values for metric are:
- - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
- 'manhattan']
- - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
- 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
- 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
- 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
- 'yule']
- See the documentation for scipy.spatial.distance for details on these
- metrics.
- .. note::
- `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11.
- p : float, default=2
- Parameter for the Minkowski metric from
- :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is
- equivalent to using manhattan_distance (l1), and euclidean_distance
- (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
- metric_params : dict, default=None
- Additional keyword arguments for the metric function.
- algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
- Algorithm used to compute the nearest neighbors:
- - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`.
- - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`.
- - 'brute' will use a brute-force search.
- - 'auto' will attempt to decide the most appropriate algorithm
- based on the values passed to `fit` method. (default)
- Note: fitting on sparse input will override the setting of
- this parameter, using brute force.
- leaf_size : int, default=30
- Leaf size passed to :class:`~sklearn.neighbors.BallTree` or
- :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the
- construction and query, as well as the memory required to store the
- tree. The optimal value depends on the nature of the problem.
- n_jobs : int, default=None
- The number of parallel jobs to run for neighbors search.
- ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
- ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
- for more details.
- Returns
- -------
- ordering_ : array of shape (n_samples,)
- The cluster ordered list of sample indices.
- core_distances_ : array of shape (n_samples,)
- Distance at which each sample becomes a core point, indexed by object
- order. Points which will never be core have a distance of inf. Use
- ``clust.core_distances_[clust.ordering_]`` to access in cluster order.
- reachability_ : array of shape (n_samples,)
- Reachability distances per sample, indexed by object order. Use
- ``clust.reachability_[clust.ordering_]`` to access in cluster order.
- predecessor_ : array of shape (n_samples,)
- Point that a sample was reached from, indexed by object order.
- Seed points have a predecessor of -1.
- References
- ----------
- .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,
- and Jörg Sander. "OPTICS: ordering points to identify the clustering
- structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.
- """
- n_samples = X.shape[0]
- _validate_size(min_samples, n_samples, "min_samples")
- if min_samples <= 1:
- min_samples = max(2, int(min_samples * n_samples))
- # Start all points as 'unprocessed' ##
- reachability_ = np.empty(n_samples)
- reachability_.fill(np.inf)
- predecessor_ = np.empty(n_samples, dtype=int)
- predecessor_.fill(-1)
- nbrs = NearestNeighbors(
- n_neighbors=min_samples,
- algorithm=algorithm,
- leaf_size=leaf_size,
- metric=metric,
- metric_params=metric_params,
- p=p,
- n_jobs=n_jobs,
- )
- nbrs.fit(X)
- # Here we first do a kNN query for each point, this differs from
- # the original OPTICS that only used epsilon range queries.
- # TODO: handle working_memory somehow?
- core_distances_ = _compute_core_distances_(
- X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None
- )
- # OPTICS puts an upper limit on these, use inf for undefined.
- core_distances_[core_distances_ > max_eps] = np.inf
- np.around(
- core_distances_,
- decimals=np.finfo(core_distances_.dtype).precision,
- out=core_distances_,
- )
- # Main OPTICS loop. Not parallelizable. The order that entries are
- # written to the 'ordering_' list is important!
- # Note that this implementation is O(n^2) theoretically, but
- # supposedly with very low constant factors.
- processed = np.zeros(X.shape[0], dtype=bool)
- ordering = np.zeros(X.shape[0], dtype=int)
- for ordering_idx in range(X.shape[0]):
- # Choose next based on smallest reachability distance
- # (And prefer smaller ids on ties, possibly np.inf!)
- index = np.where(processed == 0)[0]
- point = index[np.argmin(reachability_[index])]
- processed[point] = True
- ordering[ordering_idx] = point
- if core_distances_[point] != np.inf:
- _set_reach_dist(
- core_distances_=core_distances_,
- reachability_=reachability_,
- predecessor_=predecessor_,
- point_index=point,
- processed=processed,
- X=X,
- nbrs=nbrs,
- metric=metric,
- metric_params=metric_params,
- p=p,
- max_eps=max_eps,
- )
- if np.all(np.isinf(reachability_)):
- warnings.warn(
- (
- "All reachability values are inf. Set a larger"
- " max_eps or all data will be considered outliers."
- ),
- UserWarning,
- )
- return ordering, core_distances_, reachability_, predecessor_
- def _set_reach_dist(
- core_distances_,
- reachability_,
- predecessor_,
- point_index,
- processed,
- X,
- nbrs,
- metric,
- metric_params,
- p,
- max_eps,
- ):
- P = X[point_index : point_index + 1]
- # Assume that radius_neighbors is faster without distances
- # and we don't need all distances, nevertheless, this means
- # we may be doing some work twice.
- indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0]
- # Getting indices of neighbors that have not been processed
- unproc = np.compress(~np.take(processed, indices), indices)
- # Neighbors of current point are already processed.
- if not unproc.size:
- return
- # Only compute distances to unprocessed neighbors:
- if metric == "precomputed":
- dists = X[point_index, unproc]
- if issparse(dists):
- dists.sort_indices()
- dists = dists.data
- else:
- _params = dict() if metric_params is None else metric_params.copy()
- if metric == "minkowski" and "p" not in _params:
- # the same logic as neighbors, p is ignored if explicitly set
- # in the dict params
- _params["p"] = p
- dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel()
- rdists = np.maximum(dists, core_distances_[point_index])
- np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists)
- improved = np.where(rdists < np.take(reachability_, unproc))
- reachability_[unproc[improved]] = rdists[improved]
- predecessor_[unproc[improved]] = point_index
- @validate_params(
- {
- "reachability": [np.ndarray],
- "core_distances": [np.ndarray],
- "ordering": [np.ndarray],
- "eps": [Interval(Real, 0, None, closed="both")],
- },
- prefer_skip_nested_validation=True,
- )
- def cluster_optics_dbscan(*, reachability, core_distances, ordering, eps):
- """Perform DBSCAN extraction for an arbitrary epsilon.
- Extracting the clusters runs in linear time. Note that this results in
- ``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with
- similar settings and ``eps``, only if ``eps`` is close to ``max_eps``.
- Parameters
- ----------
- reachability : ndarray of shape (n_samples,)
- Reachability distances calculated by OPTICS (``reachability_``).
- core_distances : ndarray of shape (n_samples,)
- Distances at which points become core (``core_distances_``).
- ordering : ndarray of shape (n_samples,)
- OPTICS ordered point indices (``ordering_``).
- eps : float
- DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results
- will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close
- to one another.
- Returns
- -------
- labels_ : array of shape (n_samples,)
- The estimated labels.
- """
- n_samples = len(core_distances)
- labels = np.zeros(n_samples, dtype=int)
- far_reach = reachability > eps
- near_core = core_distances <= eps
- labels[ordering] = np.cumsum(far_reach[ordering] & near_core[ordering]) - 1
- labels[far_reach & ~near_core] = -1
- return labels
- @validate_params(
- {
- "reachability": [np.ndarray],
- "predecessor": [np.ndarray],
- "ordering": [np.ndarray],
- "min_samples": [
- Interval(Integral, 2, None, closed="left"),
- Interval(RealNotInt, 0, 1, closed="both"),
- ],
- "min_cluster_size": [
- Interval(Integral, 2, None, closed="left"),
- Interval(RealNotInt, 0, 1, closed="both"),
- None,
- ],
- "xi": [Interval(Real, 0, 1, closed="both")],
- "predecessor_correction": ["boolean"],
- },
- prefer_skip_nested_validation=True,
- )
- def cluster_optics_xi(
- *,
- reachability,
- predecessor,
- ordering,
- min_samples,
- min_cluster_size=None,
- xi=0.05,
- predecessor_correction=True,
- ):
- """Automatically extract clusters according to the Xi-steep method.
- Parameters
- ----------
- reachability : ndarray of shape (n_samples,)
- Reachability distances calculated by OPTICS (`reachability_`).
- predecessor : ndarray of shape (n_samples,)
- Predecessors calculated by OPTICS.
- ordering : ndarray of shape (n_samples,)
- OPTICS ordered point indices (`ordering_`).
- min_samples : int > 1 or float between 0 and 1
- The same as the min_samples given to OPTICS. Up and down steep regions
- can't have more then ``min_samples`` consecutive non-steep points.
- Expressed as an absolute number or a fraction of the number of samples
- (rounded to be at least 2).
- min_cluster_size : int > 1 or float between 0 and 1, default=None
- Minimum number of samples in an OPTICS cluster, expressed as an
- absolute number or a fraction of the number of samples (rounded to be
- at least 2). If ``None``, the value of ``min_samples`` is used instead.
- xi : float between 0 and 1, default=0.05
- Determines the minimum steepness on the reachability plot that
- constitutes a cluster boundary. For example, an upwards point in the
- reachability plot is defined by the ratio from one point to its
- successor being at most 1-xi.
- predecessor_correction : bool, default=True
- Correct clusters based on the calculated predecessors.
- Returns
- -------
- labels : ndarray of shape (n_samples,)
- The labels assigned to samples. Points which are not included
- in any cluster are labeled as -1.
- clusters : ndarray of shape (n_clusters, 2)
- The list of clusters in the form of ``[start, end]`` in each row, with
- all indices inclusive. The clusters are ordered according to ``(end,
- -start)`` (ascending) so that larger clusters encompassing smaller
- clusters come after such nested smaller clusters. Since ``labels`` does
- not reflect the hierarchy, usually ``len(clusters) >
- np.unique(labels)``.
- """
- n_samples = len(reachability)
- _validate_size(min_samples, n_samples, "min_samples")
- if min_samples <= 1:
- min_samples = max(2, int(min_samples * n_samples))
- if min_cluster_size is None:
- min_cluster_size = min_samples
- _validate_size(min_cluster_size, n_samples, "min_cluster_size")
- if min_cluster_size <= 1:
- min_cluster_size = max(2, int(min_cluster_size * n_samples))
- clusters = _xi_cluster(
- reachability[ordering],
- predecessor[ordering],
- ordering,
- xi,
- min_samples,
- min_cluster_size,
- predecessor_correction,
- )
- labels = _extract_xi_labels(ordering, clusters)
- return labels, clusters
- def _extend_region(steep_point, xward_point, start, min_samples):
- """Extend the area until it's maximal.
- It's the same function for both upward and downward reagions, depending on
- the given input parameters. Assuming:
- - steep_{upward/downward}: bool array indicating whether a point is a
- steep {upward/downward};
- - upward/downward: bool array indicating whether a point is
- upward/downward;
- To extend an upward reagion, ``steep_point=steep_upward`` and
- ``xward_point=downward`` are expected, and to extend a downward region,
- ``steep_point=steep_downward`` and ``xward_point=upward``.
- Parameters
- ----------
- steep_point : ndarray of shape (n_samples,), dtype=bool
- True if the point is steep downward (upward).
- xward_point : ndarray of shape (n_samples,), dtype=bool
- True if the point is an upward (respectively downward) point.
- start : int
- The start of the xward region.
- min_samples : int
- The same as the min_samples given to OPTICS. Up and down steep
- regions can't have more then ``min_samples`` consecutive non-steep
- points.
- Returns
- -------
- index : int
- The current index iterating over all the samples, i.e. where we are up
- to in our search.
- end : int
- The end of the region, which can be behind the index. The region
- includes the ``end`` index.
- """
- n_samples = len(steep_point)
- non_xward_points = 0
- index = start
- end = start
- # find a maximal area
- while index < n_samples:
- if steep_point[index]:
- non_xward_points = 0
- end = index
- elif not xward_point[index]:
- # it's not a steep point, but still goes up.
- non_xward_points += 1
- # region should include no more than min_samples consecutive
- # non steep xward points.
- if non_xward_points > min_samples:
- break
- else:
- return end
- index += 1
- return end
- def _update_filter_sdas(sdas, mib, xi_complement, reachability_plot):
- """Update steep down areas (SDAs) using the new maximum in between (mib)
- value, and the given complement of xi, i.e. ``1 - xi``.
- """
- if np.isinf(mib):
- return []
- res = [
- sda for sda in sdas if mib <= reachability_plot[sda["start"]] * xi_complement
- ]
- for sda in res:
- sda["mib"] = max(sda["mib"], mib)
- return res
- def _correct_predecessor(reachability_plot, predecessor_plot, ordering, s, e):
- """Correct for predecessors.
- Applies Algorithm 2 of [1]_.
- Input parameters are ordered by the computer OPTICS ordering.
- .. [1] Schubert, Erich, Michael Gertz.
- "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of
- the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.
- """
- while s < e:
- if reachability_plot[s] > reachability_plot[e]:
- return s, e
- p_e = ordering[predecessor_plot[e]]
- for i in range(s, e):
- if p_e == ordering[i]:
- return s, e
- e -= 1
- return None, None
- def _xi_cluster(
- reachability_plot,
- predecessor_plot,
- ordering,
- xi,
- min_samples,
- min_cluster_size,
- predecessor_correction,
- ):
- """Automatically extract clusters according to the Xi-steep method.
- This is rouphly an implementation of Figure 19 of the OPTICS paper.
- Parameters
- ----------
- reachability_plot : array-like of shape (n_samples,)
- The reachability plot, i.e. reachability ordered according to
- the calculated ordering, all computed by OPTICS.
- predecessor_plot : array-like of shape (n_samples,)
- Predecessors ordered according to the calculated ordering.
- xi : float, between 0 and 1
- Determines the minimum steepness on the reachability plot that
- constitutes a cluster boundary. For example, an upwards point in the
- reachability plot is defined by the ratio from one point to its
- successor being at most 1-xi.
- min_samples : int > 1
- The same as the min_samples given to OPTICS. Up and down steep regions
- can't have more then ``min_samples`` consecutive non-steep points.
- min_cluster_size : int > 1
- Minimum number of samples in an OPTICS cluster.
- predecessor_correction : bool
- Correct clusters based on the calculated predecessors.
- Returns
- -------
- clusters : ndarray of shape (n_clusters, 2)
- The list of clusters in the form of [start, end] in each row, with all
- indices inclusive. The clusters are ordered in a way that larger
- clusters encompassing smaller clusters come after those smaller
- clusters.
- """
- # Our implementation adds an inf to the end of reachability plot
- # this helps to find potential clusters at the end of the
- # reachability plot even if there's no upward region at the end of it.
- reachability_plot = np.hstack((reachability_plot, np.inf))
- xi_complement = 1 - xi
- sdas = [] # steep down areas, introduced in section 4.3.2 of the paper
- clusters = []
- index = 0
- mib = 0.0 # maximum in between, section 4.3.2
- # Our implementation corrects a mistake in the original
- # paper, i.e., in Definition 9 steep downward point,
- # r(p) * (1 - x1) <= r(p + 1) should be
- # r(p) * (1 - x1) >= r(p + 1)
- with np.errstate(invalid="ignore"):
- ratio = reachability_plot[:-1] / reachability_plot[1:]
- steep_upward = ratio <= xi_complement
- steep_downward = ratio >= 1 / xi_complement
- downward = ratio > 1
- upward = ratio < 1
- # the following loop is almost exactly as Figure 19 of the paper.
- # it jumps over the areas which are not either steep down or up areas
- for steep_index in iter(np.flatnonzero(steep_upward | steep_downward)):
- # just continue if steep_index has been a part of a discovered xward
- # area.
- if steep_index < index:
- continue
- mib = max(mib, np.max(reachability_plot[index : steep_index + 1]))
- # steep downward areas
- if steep_downward[steep_index]:
- sdas = _update_filter_sdas(sdas, mib, xi_complement, reachability_plot)
- D_start = steep_index
- D_end = _extend_region(steep_downward, upward, D_start, min_samples)
- D = {"start": D_start, "end": D_end, "mib": 0.0}
- sdas.append(D)
- index = D_end + 1
- mib = reachability_plot[index]
- # steep upward areas
- else:
- sdas = _update_filter_sdas(sdas, mib, xi_complement, reachability_plot)
- U_start = steep_index
- U_end = _extend_region(steep_upward, downward, U_start, min_samples)
- index = U_end + 1
- mib = reachability_plot[index]
- U_clusters = []
- for D in sdas:
- c_start = D["start"]
- c_end = U_end
- # line (**), sc2*
- if reachability_plot[c_end + 1] * xi_complement < D["mib"]:
- continue
- # Definition 11: criterion 4
- D_max = reachability_plot[D["start"]]
- if D_max * xi_complement >= reachability_plot[c_end + 1]:
- # Find the first index from the left side which is almost
- # at the same level as the end of the detected cluster.
- while (
- reachability_plot[c_start + 1] > reachability_plot[c_end + 1]
- and c_start < D["end"]
- ):
- c_start += 1
- elif reachability_plot[c_end + 1] * xi_complement >= D_max:
- # Find the first index from the right side which is almost
- # at the same level as the beginning of the detected
- # cluster.
- # Our implementation corrects a mistake in the original
- # paper, i.e., in Definition 11 4c, r(x) < r(sD) should be
- # r(x) > r(sD).
- while reachability_plot[c_end - 1] > D_max and c_end > U_start:
- c_end -= 1
- # predecessor correction
- if predecessor_correction:
- c_start, c_end = _correct_predecessor(
- reachability_plot, predecessor_plot, ordering, c_start, c_end
- )
- if c_start is None:
- continue
- # Definition 11: criterion 3.a
- if c_end - c_start + 1 < min_cluster_size:
- continue
- # Definition 11: criterion 1
- if c_start > D["end"]:
- continue
- # Definition 11: criterion 2
- if c_end < U_start:
- continue
- U_clusters.append((c_start, c_end))
- # add smaller clusters first.
- U_clusters.reverse()
- clusters.extend(U_clusters)
- return np.array(clusters)
- def _extract_xi_labels(ordering, clusters):
- """Extracts the labels from the clusters returned by `_xi_cluster`.
- We rely on the fact that clusters are stored
- with the smaller clusters coming before the larger ones.
- Parameters
- ----------
- ordering : array-like of shape (n_samples,)
- The ordering of points calculated by OPTICS
- clusters : array-like of shape (n_clusters, 2)
- List of clusters i.e. (start, end) tuples,
- as returned by `_xi_cluster`.
- Returns
- -------
- labels : ndarray of shape (n_samples,)
- """
- labels = np.full(len(ordering), -1, dtype=int)
- label = 0
- for c in clusters:
- if not np.any(labels[c[0] : (c[1] + 1)] != -1):
- labels[c[0] : (c[1] + 1)] = label
- label += 1
- labels[ordering] = labels.copy()
- return labels
|