| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458 |
- """
- DBSCAN: Density-Based Spatial Clustering of Applications with Noise
- """
- # Author: Robert Layton <robertlayton@gmail.com>
- # Joel Nothman <joel.nothman@gmail.com>
- # Lars Buitinck
- #
- # License: BSD 3 clause
- import warnings
- from numbers import Integral, Real
- import numpy as np
- from scipy import sparse
- from ..base import BaseEstimator, ClusterMixin, _fit_context
- from ..metrics.pairwise import _VALID_METRICS
- from ..neighbors import NearestNeighbors
- from ..utils._param_validation import Interval, StrOptions
- from ..utils.validation import _check_sample_weight
- from ._dbscan_inner import dbscan_inner
- def dbscan(
- X,
- eps=0.5,
- *,
- min_samples=5,
- metric="minkowski",
- metric_params=None,
- algorithm="auto",
- leaf_size=30,
- p=2,
- sample_weight=None,
- n_jobs=None,
- ):
- """Perform DBSCAN clustering from vector array or distance matrix.
- Read more in the :ref:`User Guide <dbscan>`.
- Parameters
- ----------
- X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or \
- (n_samples, n_samples)
- A feature array, or array of distances between samples if
- ``metric='precomputed'``.
- eps : float, default=0.5
- The maximum distance between two samples for one to be considered
- as in the neighborhood of the other. This is not a maximum bound
- on the distances of points within a cluster. This is the most
- important DBSCAN parameter to choose appropriately for your data set
- and distance function.
- min_samples : int, default=5
- The number of samples (or total weight) in a neighborhood for a point
- to be considered as a core point. This includes the point itself.
- metric : str or callable, default='minkowski'
- The metric to use when calculating distance between instances in a
- feature array. If metric is a string or callable, it must be one of
- the options allowed by :func:`sklearn.metrics.pairwise_distances` for
- its metric parameter.
- If metric is "precomputed", X is assumed to be a distance matrix and
- must be square during fit.
- X may be a :term:`sparse graph <sparse graph>`,
- in which case only "nonzero" elements may be considered neighbors.
- metric_params : dict, default=None
- Additional keyword arguments for the metric function.
- .. versionadded:: 0.19
- algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
- The algorithm to be used by the NearestNeighbors module
- to compute pointwise distances and find nearest neighbors.
- See NearestNeighbors module documentation for details.
- leaf_size : int, default=30
- Leaf size passed to BallTree or cKDTree. This can affect the speed
- of the construction and query, as well as the memory required
- to store the tree. The optimal value depends
- on the nature of the problem.
- p : float, default=2
- The power of the Minkowski metric to be used to calculate distance
- between points.
- sample_weight : array-like of shape (n_samples,), default=None
- Weight of each sample, such that a sample with a weight of at least
- ``min_samples`` is by itself a core sample; a sample with negative
- weight may inhibit its eps-neighbor from being core.
- Note that weights are absolute, and default to 1.
- n_jobs : int, default=None
- The number of parallel jobs to run for neighbors search. ``None`` means
- 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
- using all processors. See :term:`Glossary <n_jobs>` for more details.
- If precomputed distance are used, parallel execution is not available
- and thus n_jobs will have no effect.
- Returns
- -------
- core_samples : ndarray of shape (n_core_samples,)
- Indices of core samples.
- labels : ndarray of shape (n_samples,)
- Cluster labels for each point. Noisy samples are given the label -1.
- See Also
- --------
- DBSCAN : An estimator interface for this clustering algorithm.
- OPTICS : A similar estimator interface clustering at multiple values of
- eps. Our implementation is optimized for memory usage.
- Notes
- -----
- For an example, see :ref:`examples/cluster/plot_dbscan.py
- <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.
- This implementation bulk-computes all neighborhood queries, which increases
- the memory complexity to O(n.d) where d is the average number of neighbors,
- while original DBSCAN had memory complexity O(n). It may attract a higher
- memory complexity when querying these nearest neighborhoods, depending
- on the ``algorithm``.
- One way to avoid the query complexity is to pre-compute sparse
- neighborhoods in chunks using
- :func:`NearestNeighbors.radius_neighbors_graph
- <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
- ``mode='distance'``, then using ``metric='precomputed'`` here.
- Another way to reduce memory and computation time is to remove
- (near-)duplicate points and use ``sample_weight`` instead.
- :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower
- memory usage.
- References
- ----------
- Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
- Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
- <https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
- In: Proceedings of the 2nd International Conference on Knowledge Discovery
- and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
- Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
- :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
- <10.1145/3068335>`
- ACM Transactions on Database Systems (TODS), 42(3), 19.
- """
- est = DBSCAN(
- eps=eps,
- min_samples=min_samples,
- metric=metric,
- metric_params=metric_params,
- algorithm=algorithm,
- leaf_size=leaf_size,
- p=p,
- n_jobs=n_jobs,
- )
- est.fit(X, sample_weight=sample_weight)
- return est.core_sample_indices_, est.labels_
- class DBSCAN(ClusterMixin, BaseEstimator):
- """Perform DBSCAN clustering from vector array or distance matrix.
- DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
- Finds core samples of high density and expands clusters from them.
- Good for data which contains clusters of similar density.
- The worst case memory complexity of DBSCAN is :math:`O({n}^2)`, which can
- occur when the `eps` param is large and `min_samples` is low.
- Read more in the :ref:`User Guide <dbscan>`.
- Parameters
- ----------
- eps : float, default=0.5
- The maximum distance between two samples for one to be considered
- as in the neighborhood of the other. This is not a maximum bound
- on the distances of points within a cluster. This is the most
- important DBSCAN parameter to choose appropriately for your data set
- and distance function.
- min_samples : int, default=5
- The number of samples (or total weight) in a neighborhood for a point to
- be considered as a core point. This includes the point itself. If
- `min_samples` is set to a higher value, DBSCAN will find denser clusters,
- whereas if it is set to a lower value, the found clusters will be more
- sparse.
- metric : str, or callable, default='euclidean'
- The metric to use when calculating distance between instances in a
- feature array. If metric is a string or callable, it must be one of
- the options allowed by :func:`sklearn.metrics.pairwise_distances` for
- its metric parameter.
- If metric is "precomputed", X is assumed to be a distance matrix and
- must be square. X may be a :term:`sparse graph`, in which
- case only "nonzero" elements may be considered neighbors for DBSCAN.
- .. versionadded:: 0.17
- metric *precomputed* to accept precomputed sparse matrix.
- metric_params : dict, default=None
- Additional keyword arguments for the metric function.
- .. versionadded:: 0.19
- algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
- The algorithm to be used by the NearestNeighbors module
- to compute pointwise distances and find nearest neighbors.
- See NearestNeighbors module documentation for details.
- leaf_size : int, default=30
- Leaf size passed to BallTree or cKDTree. This can affect the speed
- of the construction and query, as well as the memory required
- to store the tree. The optimal value depends
- on the nature of the problem.
- p : float, default=None
- The power of the Minkowski metric to be used to calculate distance
- between points. If None, then ``p=2`` (equivalent to the Euclidean
- distance).
- n_jobs : int, default=None
- The number of parallel jobs to run.
- ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
- ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
- for more details.
- Attributes
- ----------
- core_sample_indices_ : ndarray of shape (n_core_samples,)
- Indices of core samples.
- components_ : ndarray of shape (n_core_samples, n_features)
- Copy of each core sample found by training.
- labels_ : ndarray of shape (n_samples)
- Cluster labels for each point in the dataset given to fit().
- Noisy samples are given the label -1.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- OPTICS : A similar clustering at multiple values of eps. Our implementation
- is optimized for memory usage.
- Notes
- -----
- For an example, see :ref:`examples/cluster/plot_dbscan.py
- <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.
- This implementation bulk-computes all neighborhood queries, which increases
- the memory complexity to O(n.d) where d is the average number of neighbors,
- while original DBSCAN had memory complexity O(n). It may attract a higher
- memory complexity when querying these nearest neighborhoods, depending
- on the ``algorithm``.
- One way to avoid the query complexity is to pre-compute sparse
- neighborhoods in chunks using
- :func:`NearestNeighbors.radius_neighbors_graph
- <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
- ``mode='distance'``, then using ``metric='precomputed'`` here.
- Another way to reduce memory and computation time is to remove
- (near-)duplicate points and use ``sample_weight`` instead.
- :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower memory
- usage.
- References
- ----------
- Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
- Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
- <https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
- In: Proceedings of the 2nd International Conference on Knowledge Discovery
- and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
- Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
- :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
- <10.1145/3068335>`
- ACM Transactions on Database Systems (TODS), 42(3), 19.
- Examples
- --------
- >>> from sklearn.cluster import DBSCAN
- >>> import numpy as np
- >>> X = np.array([[1, 2], [2, 2], [2, 3],
- ... [8, 7], [8, 8], [25, 80]])
- >>> clustering = DBSCAN(eps=3, min_samples=2).fit(X)
- >>> clustering.labels_
- array([ 0, 0, 0, 1, 1, -1])
- >>> clustering
- DBSCAN(eps=3, min_samples=2)
- """
- _parameter_constraints: dict = {
- "eps": [Interval(Real, 0.0, None, closed="neither")],
- "min_samples": [Interval(Integral, 1, None, closed="left")],
- "metric": [
- StrOptions(set(_VALID_METRICS) | {"precomputed"}),
- callable,
- ],
- "metric_params": [dict, None],
- "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})],
- "leaf_size": [Interval(Integral, 1, None, closed="left")],
- "p": [Interval(Real, 0.0, None, closed="left"), None],
- "n_jobs": [Integral, None],
- }
- def __init__(
- self,
- eps=0.5,
- *,
- min_samples=5,
- metric="euclidean",
- metric_params=None,
- algorithm="auto",
- leaf_size=30,
- p=None,
- n_jobs=None,
- ):
- self.eps = eps
- self.min_samples = min_samples
- self.metric = metric
- self.metric_params = metric_params
- self.algorithm = algorithm
- self.leaf_size = leaf_size
- self.p = p
- self.n_jobs = n_jobs
- @_fit_context(
- # DBSCAN.metric is not validated yet
- prefer_skip_nested_validation=False
- )
- def fit(self, X, y=None, sample_weight=None):
- """Perform DBSCAN clustering from features, or distance matrix.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
- (n_samples, n_samples)
- Training instances to cluster, or distances between instances if
- ``metric='precomputed'``. If a sparse matrix is provided, it will
- be converted into a sparse ``csr_matrix``.
- y : Ignored
- Not used, present here for API consistency by convention.
- sample_weight : array-like of shape (n_samples,), default=None
- Weight of each sample, such that a sample with a weight of at least
- ``min_samples`` is by itself a core sample; a sample with a
- negative weight may inhibit its eps-neighbor from being core.
- Note that weights are absolute, and default to 1.
- Returns
- -------
- self : object
- Returns a fitted instance of self.
- """
- X = self._validate_data(X, accept_sparse="csr")
- if sample_weight is not None:
- sample_weight = _check_sample_weight(sample_weight, X)
- # Calculate neighborhood for all samples. This leaves the original
- # point in, which needs to be considered later (i.e. point i is in the
- # neighborhood of point i. While True, its useless information)
- if self.metric == "precomputed" and sparse.issparse(X):
- # set the diagonal to explicit values, as a point is its own
- # neighbor
- with warnings.catch_warnings():
- warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning)
- X.setdiag(X.diagonal()) # XXX: modifies X's internals in-place
- neighbors_model = NearestNeighbors(
- radius=self.eps,
- algorithm=self.algorithm,
- leaf_size=self.leaf_size,
- metric=self.metric,
- metric_params=self.metric_params,
- p=self.p,
- n_jobs=self.n_jobs,
- )
- neighbors_model.fit(X)
- # This has worst case O(n^2) memory complexity
- neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)
- if sample_weight is None:
- n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
- else:
- n_neighbors = np.array(
- [np.sum(sample_weight[neighbors]) for neighbors in neighborhoods]
- )
- # Initially, all samples are noise.
- labels = np.full(X.shape[0], -1, dtype=np.intp)
- # A list of all core samples found.
- core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8)
- dbscan_inner(core_samples, neighborhoods, labels)
- self.core_sample_indices_ = np.where(core_samples)[0]
- self.labels_ = labels
- if len(self.core_sample_indices_):
- # fix for scipy sparse indexing issue
- self.components_ = X[self.core_sample_indices_].copy()
- else:
- # no core samples
- self.components_ = np.empty((0, X.shape[1]))
- return self
- def fit_predict(self, X, y=None, sample_weight=None):
- """Compute clusters from a data or distance matrix and predict labels.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
- (n_samples, n_samples)
- Training instances to cluster, or distances between instances if
- ``metric='precomputed'``. If a sparse matrix is provided, it will
- be converted into a sparse ``csr_matrix``.
- y : Ignored
- Not used, present here for API consistency by convention.
- sample_weight : array-like of shape (n_samples,), default=None
- Weight of each sample, such that a sample with a weight of at least
- ``min_samples`` is by itself a core sample; a sample with a
- negative weight may inhibit its eps-neighbor from being core.
- Note that weights are absolute, and default to 1.
- Returns
- -------
- labels : ndarray of shape (n_samples,)
- Cluster labels. Noisy samples are given the label -1.
- """
- self.fit(X, sample_weight=sample_weight)
- return self.labels_
- def _more_tags(self):
- return {"pairwise": self.metric == "precomputed"}
|