_optics.py 41 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113
  1. """Ordering Points To Identify the Clustering Structure (OPTICS)
  2. These routines execute the OPTICS algorithm, and implement various
  3. cluster extraction methods of the ordered list.
  4. Authors: Shane Grigsby <refuge@rocktalus.com>
  5. Adrin Jalali <adrinjalali@gmail.com>
  6. Erich Schubert <erich@debian.org>
  7. Hanmin Qin <qinhanmin2005@sina.com>
  8. License: BSD 3 clause
  9. """
  10. import warnings
  11. from numbers import Integral, Real
  12. import numpy as np
  13. from scipy.sparse import SparseEfficiencyWarning, issparse
  14. from ..base import BaseEstimator, ClusterMixin, _fit_context
  15. from ..exceptions import DataConversionWarning
  16. from ..metrics import pairwise_distances
  17. from ..metrics.pairwise import _VALID_METRICS, PAIRWISE_BOOLEAN_FUNCTIONS
  18. from ..neighbors import NearestNeighbors
  19. from ..utils import gen_batches, get_chunk_n_rows
  20. from ..utils._param_validation import (
  21. HasMethods,
  22. Interval,
  23. RealNotInt,
  24. StrOptions,
  25. validate_params,
  26. )
  27. from ..utils.validation import check_memory
  28. class OPTICS(ClusterMixin, BaseEstimator):
  29. """Estimate clustering structure from vector array.
  30. OPTICS (Ordering Points To Identify the Clustering Structure), closely
  31. related to DBSCAN, finds core sample of high density and expands clusters
  32. from them [1]_. Unlike DBSCAN, keeps cluster hierarchy for a variable
  33. neighborhood radius. Better suited for usage on large datasets than the
  34. current sklearn implementation of DBSCAN.
  35. Clusters are then extracted using a DBSCAN-like method
  36. (cluster_method = 'dbscan') or an automatic
  37. technique proposed in [1]_ (cluster_method = 'xi').
  38. This implementation deviates from the original OPTICS by first performing
  39. k-nearest-neighborhood searches on all points to identify core sizes, then
  40. computing only the distances to unprocessed points when constructing the
  41. cluster order. Note that we do not employ a heap to manage the expansion
  42. candidates, so the time complexity will be O(n^2).
  43. Read more in the :ref:`User Guide <optics>`.
  44. Parameters
  45. ----------
  46. min_samples : int > 1 or float between 0 and 1, default=5
  47. The number of samples in a neighborhood for a point to be considered as
  48. a core point. Also, up and down steep regions can't have more than
  49. ``min_samples`` consecutive non-steep points. Expressed as an absolute
  50. number or a fraction of the number of samples (rounded to be at least
  51. 2).
  52. max_eps : float, default=np.inf
  53. The maximum distance between two samples for one to be considered as
  54. in the neighborhood of the other. Default value of ``np.inf`` will
  55. identify clusters across all scales; reducing ``max_eps`` will result
  56. in shorter run times.
  57. metric : str or callable, default='minkowski'
  58. Metric to use for distance computation. Any metric from scikit-learn
  59. or scipy.spatial.distance can be used.
  60. If metric is a callable function, it is called on each
  61. pair of instances (rows) and the resulting value recorded. The callable
  62. should take two arrays as input and return one value indicating the
  63. distance between them. This works for Scipy's metrics, but is less
  64. efficient than passing the metric name as a string. If metric is
  65. "precomputed", `X` is assumed to be a distance matrix and must be
  66. square.
  67. Valid values for metric are:
  68. - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
  69. 'manhattan']
  70. - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
  71. 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
  72. 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
  73. 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
  74. 'yule']
  75. Sparse matrices are only supported by scikit-learn metrics.
  76. See the documentation for scipy.spatial.distance for details on these
  77. metrics.
  78. .. note::
  79. `'kulsinski'` is deprecated from SciPy 1.9 and will removed in SciPy 1.11.
  80. p : float, default=2
  81. Parameter for the Minkowski metric from
  82. :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is
  83. equivalent to using manhattan_distance (l1), and euclidean_distance
  84. (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
  85. metric_params : dict, default=None
  86. Additional keyword arguments for the metric function.
  87. cluster_method : str, default='xi'
  88. The extraction method used to extract clusters using the calculated
  89. reachability and ordering. Possible values are "xi" and "dbscan".
  90. eps : float, default=None
  91. The maximum distance between two samples for one to be considered as
  92. in the neighborhood of the other. By default it assumes the same value
  93. as ``max_eps``.
  94. Used only when ``cluster_method='dbscan'``.
  95. xi : float between 0 and 1, default=0.05
  96. Determines the minimum steepness on the reachability plot that
  97. constitutes a cluster boundary. For example, an upwards point in the
  98. reachability plot is defined by the ratio from one point to its
  99. successor being at most 1-xi.
  100. Used only when ``cluster_method='xi'``.
  101. predecessor_correction : bool, default=True
  102. Correct clusters according to the predecessors calculated by OPTICS
  103. [2]_. This parameter has minimal effect on most datasets.
  104. Used only when ``cluster_method='xi'``.
  105. min_cluster_size : int > 1 or float between 0 and 1, default=None
  106. Minimum number of samples in an OPTICS cluster, expressed as an
  107. absolute number or a fraction of the number of samples (rounded to be
  108. at least 2). If ``None``, the value of ``min_samples`` is used instead.
  109. Used only when ``cluster_method='xi'``.
  110. algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
  111. Algorithm used to compute the nearest neighbors:
  112. - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`.
  113. - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`.
  114. - 'brute' will use a brute-force search.
  115. - 'auto' (default) will attempt to decide the most appropriate
  116. algorithm based on the values passed to :meth:`fit` method.
  117. Note: fitting on sparse input will override the setting of
  118. this parameter, using brute force.
  119. leaf_size : int, default=30
  120. Leaf size passed to :class:`~sklearn.neighbors.BallTree` or
  121. :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the
  122. construction and query, as well as the memory required to store the
  123. tree. The optimal value depends on the nature of the problem.
  124. memory : str or object with the joblib.Memory interface, default=None
  125. Used to cache the output of the computation of the tree.
  126. By default, no caching is done. If a string is given, it is the
  127. path to the caching directory.
  128. n_jobs : int, default=None
  129. The number of parallel jobs to run for neighbors search.
  130. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
  131. ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
  132. for more details.
  133. Attributes
  134. ----------
  135. labels_ : ndarray of shape (n_samples,)
  136. Cluster labels for each point in the dataset given to fit().
  137. Noisy samples and points which are not included in a leaf cluster
  138. of ``cluster_hierarchy_`` are labeled as -1.
  139. reachability_ : ndarray of shape (n_samples,)
  140. Reachability distances per sample, indexed by object order. Use
  141. ``clust.reachability_[clust.ordering_]`` to access in cluster order.
  142. ordering_ : ndarray of shape (n_samples,)
  143. The cluster ordered list of sample indices.
  144. core_distances_ : ndarray of shape (n_samples,)
  145. Distance at which each sample becomes a core point, indexed by object
  146. order. Points which will never be core have a distance of inf. Use
  147. ``clust.core_distances_[clust.ordering_]`` to access in cluster order.
  148. predecessor_ : ndarray of shape (n_samples,)
  149. Point that a sample was reached from, indexed by object order.
  150. Seed points have a predecessor of -1.
  151. cluster_hierarchy_ : ndarray of shape (n_clusters, 2)
  152. The list of clusters in the form of ``[start, end]`` in each row, with
  153. all indices inclusive. The clusters are ordered according to
  154. ``(end, -start)`` (ascending) so that larger clusters encompassing
  155. smaller clusters come after those smaller ones. Since ``labels_`` does
  156. not reflect the hierarchy, usually
  157. ``len(cluster_hierarchy_) > np.unique(optics.labels_)``. Please also
  158. note that these indices are of the ``ordering_``, i.e.
  159. ``X[ordering_][start:end + 1]`` form a cluster.
  160. Only available when ``cluster_method='xi'``.
  161. n_features_in_ : int
  162. Number of features seen during :term:`fit`.
  163. .. versionadded:: 0.24
  164. feature_names_in_ : ndarray of shape (`n_features_in_`,)
  165. Names of features seen during :term:`fit`. Defined only when `X`
  166. has feature names that are all strings.
  167. .. versionadded:: 1.0
  168. See Also
  169. --------
  170. DBSCAN : A similar clustering for a specified neighborhood radius (eps).
  171. Our implementation is optimized for runtime.
  172. References
  173. ----------
  174. .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,
  175. and Jörg Sander. "OPTICS: ordering points to identify the clustering
  176. structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.
  177. .. [2] Schubert, Erich, Michael Gertz.
  178. "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of
  179. the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.
  180. Examples
  181. --------
  182. >>> from sklearn.cluster import OPTICS
  183. >>> import numpy as np
  184. >>> X = np.array([[1, 2], [2, 5], [3, 6],
  185. ... [8, 7], [8, 8], [7, 3]])
  186. >>> clustering = OPTICS(min_samples=2).fit(X)
  187. >>> clustering.labels_
  188. array([0, 0, 0, 1, 1, 1])
  189. For a more detailed example see
  190. :ref:`sphx_glr_auto_examples_cluster_plot_optics.py`.
  191. """
  192. _parameter_constraints: dict = {
  193. "min_samples": [
  194. Interval(Integral, 2, None, closed="left"),
  195. Interval(RealNotInt, 0, 1, closed="both"),
  196. ],
  197. "max_eps": [Interval(Real, 0, None, closed="both")],
  198. "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
  199. "p": [Interval(Real, 1, None, closed="left")],
  200. "metric_params": [dict, None],
  201. "cluster_method": [StrOptions({"dbscan", "xi"})],
  202. "eps": [Interval(Real, 0, None, closed="both"), None],
  203. "xi": [Interval(Real, 0, 1, closed="both")],
  204. "predecessor_correction": ["boolean"],
  205. "min_cluster_size": [
  206. Interval(Integral, 2, None, closed="left"),
  207. Interval(RealNotInt, 0, 1, closed="right"),
  208. None,
  209. ],
  210. "algorithm": [StrOptions({"auto", "brute", "ball_tree", "kd_tree"})],
  211. "leaf_size": [Interval(Integral, 1, None, closed="left")],
  212. "memory": [str, HasMethods("cache"), None],
  213. "n_jobs": [Integral, None],
  214. }
  215. def __init__(
  216. self,
  217. *,
  218. min_samples=5,
  219. max_eps=np.inf,
  220. metric="minkowski",
  221. p=2,
  222. metric_params=None,
  223. cluster_method="xi",
  224. eps=None,
  225. xi=0.05,
  226. predecessor_correction=True,
  227. min_cluster_size=None,
  228. algorithm="auto",
  229. leaf_size=30,
  230. memory=None,
  231. n_jobs=None,
  232. ):
  233. self.max_eps = max_eps
  234. self.min_samples = min_samples
  235. self.min_cluster_size = min_cluster_size
  236. self.algorithm = algorithm
  237. self.metric = metric
  238. self.metric_params = metric_params
  239. self.p = p
  240. self.leaf_size = leaf_size
  241. self.cluster_method = cluster_method
  242. self.eps = eps
  243. self.xi = xi
  244. self.predecessor_correction = predecessor_correction
  245. self.memory = memory
  246. self.n_jobs = n_jobs
  247. @_fit_context(
  248. # Optics.metric is not validated yet
  249. prefer_skip_nested_validation=False
  250. )
  251. def fit(self, X, y=None):
  252. """Perform OPTICS clustering.
  253. Extracts an ordered list of points and reachability distances, and
  254. performs initial clustering using ``max_eps`` distance specified at
  255. OPTICS object instantiation.
  256. Parameters
  257. ----------
  258. X : {ndarray, sparse matrix} of shape (n_samples, n_features), or \
  259. (n_samples, n_samples) if metric='precomputed'
  260. A feature array, or array of distances between samples if
  261. metric='precomputed'. If a sparse matrix is provided, it will be
  262. converted into CSR format.
  263. y : Ignored
  264. Not used, present for API consistency by convention.
  265. Returns
  266. -------
  267. self : object
  268. Returns a fitted instance of self.
  269. """
  270. dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float
  271. if dtype == bool and X.dtype != bool:
  272. msg = (
  273. "Data will be converted to boolean for"
  274. f" metric {self.metric}, to avoid this warning,"
  275. " you may convert the data prior to calling fit."
  276. )
  277. warnings.warn(msg, DataConversionWarning)
  278. X = self._validate_data(X, dtype=dtype, accept_sparse="csr")
  279. if self.metric == "precomputed" and issparse(X):
  280. with warnings.catch_warnings():
  281. warnings.simplefilter("ignore", SparseEfficiencyWarning)
  282. # Set each diagonal to an explicit value so each point is its
  283. # own neighbor
  284. X.setdiag(X.diagonal())
  285. memory = check_memory(self.memory)
  286. (
  287. self.ordering_,
  288. self.core_distances_,
  289. self.reachability_,
  290. self.predecessor_,
  291. ) = memory.cache(compute_optics_graph)(
  292. X=X,
  293. min_samples=self.min_samples,
  294. algorithm=self.algorithm,
  295. leaf_size=self.leaf_size,
  296. metric=self.metric,
  297. metric_params=self.metric_params,
  298. p=self.p,
  299. n_jobs=self.n_jobs,
  300. max_eps=self.max_eps,
  301. )
  302. # Extract clusters from the calculated orders and reachability
  303. if self.cluster_method == "xi":
  304. labels_, clusters_ = cluster_optics_xi(
  305. reachability=self.reachability_,
  306. predecessor=self.predecessor_,
  307. ordering=self.ordering_,
  308. min_samples=self.min_samples,
  309. min_cluster_size=self.min_cluster_size,
  310. xi=self.xi,
  311. predecessor_correction=self.predecessor_correction,
  312. )
  313. self.cluster_hierarchy_ = clusters_
  314. elif self.cluster_method == "dbscan":
  315. if self.eps is None:
  316. eps = self.max_eps
  317. else:
  318. eps = self.eps
  319. if eps > self.max_eps:
  320. raise ValueError(
  321. "Specify an epsilon smaller than %s. Got %s." % (self.max_eps, eps)
  322. )
  323. labels_ = cluster_optics_dbscan(
  324. reachability=self.reachability_,
  325. core_distances=self.core_distances_,
  326. ordering=self.ordering_,
  327. eps=eps,
  328. )
  329. self.labels_ = labels_
  330. return self
  331. def _validate_size(size, n_samples, param_name):
  332. if size > n_samples:
  333. raise ValueError(
  334. "%s must be no greater than the number of samples (%d). Got %d"
  335. % (param_name, n_samples, size)
  336. )
  337. # OPTICS helper functions
  338. def _compute_core_distances_(X, neighbors, min_samples, working_memory):
  339. """Compute the k-th nearest neighbor of each sample.
  340. Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1]
  341. but with more memory efficiency.
  342. Parameters
  343. ----------
  344. X : array-like of shape (n_samples, n_features)
  345. The data.
  346. neighbors : NearestNeighbors instance
  347. The fitted nearest neighbors estimator.
  348. working_memory : int, default=None
  349. The sought maximum memory for temporary distance matrix chunks.
  350. When None (default), the value of
  351. ``sklearn.get_config()['working_memory']`` is used.
  352. Returns
  353. -------
  354. core_distances : ndarray of shape (n_samples,)
  355. Distance at which each sample becomes a core point.
  356. Points which will never be core have a distance of inf.
  357. """
  358. n_samples = X.shape[0]
  359. core_distances = np.empty(n_samples)
  360. core_distances.fill(np.nan)
  361. chunk_n_rows = get_chunk_n_rows(
  362. row_bytes=16 * min_samples, max_n_rows=n_samples, working_memory=working_memory
  363. )
  364. slices = gen_batches(n_samples, chunk_n_rows)
  365. for sl in slices:
  366. core_distances[sl] = neighbors.kneighbors(X[sl], min_samples)[0][:, -1]
  367. return core_distances
  368. @validate_params(
  369. {
  370. "X": [np.ndarray, "sparse matrix"],
  371. "min_samples": [
  372. Interval(Integral, 2, None, closed="left"),
  373. Interval(RealNotInt, 0, 1, closed="both"),
  374. ],
  375. "max_eps": [Interval(Real, 0, None, closed="both")],
  376. "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
  377. "p": [Interval(Real, 0, None, closed="right"), None],
  378. "metric_params": [dict, None],
  379. "algorithm": [StrOptions({"auto", "brute", "ball_tree", "kd_tree"})],
  380. "leaf_size": [Interval(Integral, 1, None, closed="left")],
  381. "n_jobs": [Integral, None],
  382. },
  383. prefer_skip_nested_validation=False, # metric is not validated yet
  384. )
  385. def compute_optics_graph(
  386. X, *, min_samples, max_eps, metric, p, metric_params, algorithm, leaf_size, n_jobs
  387. ):
  388. """Compute the OPTICS reachability graph.
  389. Read more in the :ref:`User Guide <optics>`.
  390. Parameters
  391. ----------
  392. X : {ndarray, sparse matrix} of shape (n_samples, n_features), or \
  393. (n_samples, n_samples) if metric='precomputed'
  394. A feature array, or array of distances between samples if
  395. metric='precomputed'.
  396. min_samples : int > 1 or float between 0 and 1
  397. The number of samples in a neighborhood for a point to be considered
  398. as a core point. Expressed as an absolute number or a fraction of the
  399. number of samples (rounded to be at least 2).
  400. max_eps : float, default=np.inf
  401. The maximum distance between two samples for one to be considered as
  402. in the neighborhood of the other. Default value of ``np.inf`` will
  403. identify clusters across all scales; reducing ``max_eps`` will result
  404. in shorter run times.
  405. metric : str or callable, default='minkowski'
  406. Metric to use for distance computation. Any metric from scikit-learn
  407. or scipy.spatial.distance can be used.
  408. If metric is a callable function, it is called on each
  409. pair of instances (rows) and the resulting value recorded. The callable
  410. should take two arrays as input and return one value indicating the
  411. distance between them. This works for Scipy's metrics, but is less
  412. efficient than passing the metric name as a string. If metric is
  413. "precomputed", X is assumed to be a distance matrix and must be square.
  414. Valid values for metric are:
  415. - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
  416. 'manhattan']
  417. - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
  418. 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
  419. 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
  420. 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
  421. 'yule']
  422. See the documentation for scipy.spatial.distance for details on these
  423. metrics.
  424. .. note::
  425. `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11.
  426. p : float, default=2
  427. Parameter for the Minkowski metric from
  428. :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is
  429. equivalent to using manhattan_distance (l1), and euclidean_distance
  430. (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
  431. metric_params : dict, default=None
  432. Additional keyword arguments for the metric function.
  433. algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
  434. Algorithm used to compute the nearest neighbors:
  435. - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`.
  436. - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`.
  437. - 'brute' will use a brute-force search.
  438. - 'auto' will attempt to decide the most appropriate algorithm
  439. based on the values passed to `fit` method. (default)
  440. Note: fitting on sparse input will override the setting of
  441. this parameter, using brute force.
  442. leaf_size : int, default=30
  443. Leaf size passed to :class:`~sklearn.neighbors.BallTree` or
  444. :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the
  445. construction and query, as well as the memory required to store the
  446. tree. The optimal value depends on the nature of the problem.
  447. n_jobs : int, default=None
  448. The number of parallel jobs to run for neighbors search.
  449. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
  450. ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
  451. for more details.
  452. Returns
  453. -------
  454. ordering_ : array of shape (n_samples,)
  455. The cluster ordered list of sample indices.
  456. core_distances_ : array of shape (n_samples,)
  457. Distance at which each sample becomes a core point, indexed by object
  458. order. Points which will never be core have a distance of inf. Use
  459. ``clust.core_distances_[clust.ordering_]`` to access in cluster order.
  460. reachability_ : array of shape (n_samples,)
  461. Reachability distances per sample, indexed by object order. Use
  462. ``clust.reachability_[clust.ordering_]`` to access in cluster order.
  463. predecessor_ : array of shape (n_samples,)
  464. Point that a sample was reached from, indexed by object order.
  465. Seed points have a predecessor of -1.
  466. References
  467. ----------
  468. .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,
  469. and Jörg Sander. "OPTICS: ordering points to identify the clustering
  470. structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.
  471. """
  472. n_samples = X.shape[0]
  473. _validate_size(min_samples, n_samples, "min_samples")
  474. if min_samples <= 1:
  475. min_samples = max(2, int(min_samples * n_samples))
  476. # Start all points as 'unprocessed' ##
  477. reachability_ = np.empty(n_samples)
  478. reachability_.fill(np.inf)
  479. predecessor_ = np.empty(n_samples, dtype=int)
  480. predecessor_.fill(-1)
  481. nbrs = NearestNeighbors(
  482. n_neighbors=min_samples,
  483. algorithm=algorithm,
  484. leaf_size=leaf_size,
  485. metric=metric,
  486. metric_params=metric_params,
  487. p=p,
  488. n_jobs=n_jobs,
  489. )
  490. nbrs.fit(X)
  491. # Here we first do a kNN query for each point, this differs from
  492. # the original OPTICS that only used epsilon range queries.
  493. # TODO: handle working_memory somehow?
  494. core_distances_ = _compute_core_distances_(
  495. X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None
  496. )
  497. # OPTICS puts an upper limit on these, use inf for undefined.
  498. core_distances_[core_distances_ > max_eps] = np.inf
  499. np.around(
  500. core_distances_,
  501. decimals=np.finfo(core_distances_.dtype).precision,
  502. out=core_distances_,
  503. )
  504. # Main OPTICS loop. Not parallelizable. The order that entries are
  505. # written to the 'ordering_' list is important!
  506. # Note that this implementation is O(n^2) theoretically, but
  507. # supposedly with very low constant factors.
  508. processed = np.zeros(X.shape[0], dtype=bool)
  509. ordering = np.zeros(X.shape[0], dtype=int)
  510. for ordering_idx in range(X.shape[0]):
  511. # Choose next based on smallest reachability distance
  512. # (And prefer smaller ids on ties, possibly np.inf!)
  513. index = np.where(processed == 0)[0]
  514. point = index[np.argmin(reachability_[index])]
  515. processed[point] = True
  516. ordering[ordering_idx] = point
  517. if core_distances_[point] != np.inf:
  518. _set_reach_dist(
  519. core_distances_=core_distances_,
  520. reachability_=reachability_,
  521. predecessor_=predecessor_,
  522. point_index=point,
  523. processed=processed,
  524. X=X,
  525. nbrs=nbrs,
  526. metric=metric,
  527. metric_params=metric_params,
  528. p=p,
  529. max_eps=max_eps,
  530. )
  531. if np.all(np.isinf(reachability_)):
  532. warnings.warn(
  533. (
  534. "All reachability values are inf. Set a larger"
  535. " max_eps or all data will be considered outliers."
  536. ),
  537. UserWarning,
  538. )
  539. return ordering, core_distances_, reachability_, predecessor_
  540. def _set_reach_dist(
  541. core_distances_,
  542. reachability_,
  543. predecessor_,
  544. point_index,
  545. processed,
  546. X,
  547. nbrs,
  548. metric,
  549. metric_params,
  550. p,
  551. max_eps,
  552. ):
  553. P = X[point_index : point_index + 1]
  554. # Assume that radius_neighbors is faster without distances
  555. # and we don't need all distances, nevertheless, this means
  556. # we may be doing some work twice.
  557. indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0]
  558. # Getting indices of neighbors that have not been processed
  559. unproc = np.compress(~np.take(processed, indices), indices)
  560. # Neighbors of current point are already processed.
  561. if not unproc.size:
  562. return
  563. # Only compute distances to unprocessed neighbors:
  564. if metric == "precomputed":
  565. dists = X[point_index, unproc]
  566. if issparse(dists):
  567. dists.sort_indices()
  568. dists = dists.data
  569. else:
  570. _params = dict() if metric_params is None else metric_params.copy()
  571. if metric == "minkowski" and "p" not in _params:
  572. # the same logic as neighbors, p is ignored if explicitly set
  573. # in the dict params
  574. _params["p"] = p
  575. dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel()
  576. rdists = np.maximum(dists, core_distances_[point_index])
  577. np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists)
  578. improved = np.where(rdists < np.take(reachability_, unproc))
  579. reachability_[unproc[improved]] = rdists[improved]
  580. predecessor_[unproc[improved]] = point_index
  581. @validate_params(
  582. {
  583. "reachability": [np.ndarray],
  584. "core_distances": [np.ndarray],
  585. "ordering": [np.ndarray],
  586. "eps": [Interval(Real, 0, None, closed="both")],
  587. },
  588. prefer_skip_nested_validation=True,
  589. )
  590. def cluster_optics_dbscan(*, reachability, core_distances, ordering, eps):
  591. """Perform DBSCAN extraction for an arbitrary epsilon.
  592. Extracting the clusters runs in linear time. Note that this results in
  593. ``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with
  594. similar settings and ``eps``, only if ``eps`` is close to ``max_eps``.
  595. Parameters
  596. ----------
  597. reachability : ndarray of shape (n_samples,)
  598. Reachability distances calculated by OPTICS (``reachability_``).
  599. core_distances : ndarray of shape (n_samples,)
  600. Distances at which points become core (``core_distances_``).
  601. ordering : ndarray of shape (n_samples,)
  602. OPTICS ordered point indices (``ordering_``).
  603. eps : float
  604. DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results
  605. will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close
  606. to one another.
  607. Returns
  608. -------
  609. labels_ : array of shape (n_samples,)
  610. The estimated labels.
  611. """
  612. n_samples = len(core_distances)
  613. labels = np.zeros(n_samples, dtype=int)
  614. far_reach = reachability > eps
  615. near_core = core_distances <= eps
  616. labels[ordering] = np.cumsum(far_reach[ordering] & near_core[ordering]) - 1
  617. labels[far_reach & ~near_core] = -1
  618. return labels
  619. @validate_params(
  620. {
  621. "reachability": [np.ndarray],
  622. "predecessor": [np.ndarray],
  623. "ordering": [np.ndarray],
  624. "min_samples": [
  625. Interval(Integral, 2, None, closed="left"),
  626. Interval(RealNotInt, 0, 1, closed="both"),
  627. ],
  628. "min_cluster_size": [
  629. Interval(Integral, 2, None, closed="left"),
  630. Interval(RealNotInt, 0, 1, closed="both"),
  631. None,
  632. ],
  633. "xi": [Interval(Real, 0, 1, closed="both")],
  634. "predecessor_correction": ["boolean"],
  635. },
  636. prefer_skip_nested_validation=True,
  637. )
  638. def cluster_optics_xi(
  639. *,
  640. reachability,
  641. predecessor,
  642. ordering,
  643. min_samples,
  644. min_cluster_size=None,
  645. xi=0.05,
  646. predecessor_correction=True,
  647. ):
  648. """Automatically extract clusters according to the Xi-steep method.
  649. Parameters
  650. ----------
  651. reachability : ndarray of shape (n_samples,)
  652. Reachability distances calculated by OPTICS (`reachability_`).
  653. predecessor : ndarray of shape (n_samples,)
  654. Predecessors calculated by OPTICS.
  655. ordering : ndarray of shape (n_samples,)
  656. OPTICS ordered point indices (`ordering_`).
  657. min_samples : int > 1 or float between 0 and 1
  658. The same as the min_samples given to OPTICS. Up and down steep regions
  659. can't have more then ``min_samples`` consecutive non-steep points.
  660. Expressed as an absolute number or a fraction of the number of samples
  661. (rounded to be at least 2).
  662. min_cluster_size : int > 1 or float between 0 and 1, default=None
  663. Minimum number of samples in an OPTICS cluster, expressed as an
  664. absolute number or a fraction of the number of samples (rounded to be
  665. at least 2). If ``None``, the value of ``min_samples`` is used instead.
  666. xi : float between 0 and 1, default=0.05
  667. Determines the minimum steepness on the reachability plot that
  668. constitutes a cluster boundary. For example, an upwards point in the
  669. reachability plot is defined by the ratio from one point to its
  670. successor being at most 1-xi.
  671. predecessor_correction : bool, default=True
  672. Correct clusters based on the calculated predecessors.
  673. Returns
  674. -------
  675. labels : ndarray of shape (n_samples,)
  676. The labels assigned to samples. Points which are not included
  677. in any cluster are labeled as -1.
  678. clusters : ndarray of shape (n_clusters, 2)
  679. The list of clusters in the form of ``[start, end]`` in each row, with
  680. all indices inclusive. The clusters are ordered according to ``(end,
  681. -start)`` (ascending) so that larger clusters encompassing smaller
  682. clusters come after such nested smaller clusters. Since ``labels`` does
  683. not reflect the hierarchy, usually ``len(clusters) >
  684. np.unique(labels)``.
  685. """
  686. n_samples = len(reachability)
  687. _validate_size(min_samples, n_samples, "min_samples")
  688. if min_samples <= 1:
  689. min_samples = max(2, int(min_samples * n_samples))
  690. if min_cluster_size is None:
  691. min_cluster_size = min_samples
  692. _validate_size(min_cluster_size, n_samples, "min_cluster_size")
  693. if min_cluster_size <= 1:
  694. min_cluster_size = max(2, int(min_cluster_size * n_samples))
  695. clusters = _xi_cluster(
  696. reachability[ordering],
  697. predecessor[ordering],
  698. ordering,
  699. xi,
  700. min_samples,
  701. min_cluster_size,
  702. predecessor_correction,
  703. )
  704. labels = _extract_xi_labels(ordering, clusters)
  705. return labels, clusters
  706. def _extend_region(steep_point, xward_point, start, min_samples):
  707. """Extend the area until it's maximal.
  708. It's the same function for both upward and downward reagions, depending on
  709. the given input parameters. Assuming:
  710. - steep_{upward/downward}: bool array indicating whether a point is a
  711. steep {upward/downward};
  712. - upward/downward: bool array indicating whether a point is
  713. upward/downward;
  714. To extend an upward reagion, ``steep_point=steep_upward`` and
  715. ``xward_point=downward`` are expected, and to extend a downward region,
  716. ``steep_point=steep_downward`` and ``xward_point=upward``.
  717. Parameters
  718. ----------
  719. steep_point : ndarray of shape (n_samples,), dtype=bool
  720. True if the point is steep downward (upward).
  721. xward_point : ndarray of shape (n_samples,), dtype=bool
  722. True if the point is an upward (respectively downward) point.
  723. start : int
  724. The start of the xward region.
  725. min_samples : int
  726. The same as the min_samples given to OPTICS. Up and down steep
  727. regions can't have more then ``min_samples`` consecutive non-steep
  728. points.
  729. Returns
  730. -------
  731. index : int
  732. The current index iterating over all the samples, i.e. where we are up
  733. to in our search.
  734. end : int
  735. The end of the region, which can be behind the index. The region
  736. includes the ``end`` index.
  737. """
  738. n_samples = len(steep_point)
  739. non_xward_points = 0
  740. index = start
  741. end = start
  742. # find a maximal area
  743. while index < n_samples:
  744. if steep_point[index]:
  745. non_xward_points = 0
  746. end = index
  747. elif not xward_point[index]:
  748. # it's not a steep point, but still goes up.
  749. non_xward_points += 1
  750. # region should include no more than min_samples consecutive
  751. # non steep xward points.
  752. if non_xward_points > min_samples:
  753. break
  754. else:
  755. return end
  756. index += 1
  757. return end
  758. def _update_filter_sdas(sdas, mib, xi_complement, reachability_plot):
  759. """Update steep down areas (SDAs) using the new maximum in between (mib)
  760. value, and the given complement of xi, i.e. ``1 - xi``.
  761. """
  762. if np.isinf(mib):
  763. return []
  764. res = [
  765. sda for sda in sdas if mib <= reachability_plot[sda["start"]] * xi_complement
  766. ]
  767. for sda in res:
  768. sda["mib"] = max(sda["mib"], mib)
  769. return res
  770. def _correct_predecessor(reachability_plot, predecessor_plot, ordering, s, e):
  771. """Correct for predecessors.
  772. Applies Algorithm 2 of [1]_.
  773. Input parameters are ordered by the computer OPTICS ordering.
  774. .. [1] Schubert, Erich, Michael Gertz.
  775. "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of
  776. the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.
  777. """
  778. while s < e:
  779. if reachability_plot[s] > reachability_plot[e]:
  780. return s, e
  781. p_e = ordering[predecessor_plot[e]]
  782. for i in range(s, e):
  783. if p_e == ordering[i]:
  784. return s, e
  785. e -= 1
  786. return None, None
  787. def _xi_cluster(
  788. reachability_plot,
  789. predecessor_plot,
  790. ordering,
  791. xi,
  792. min_samples,
  793. min_cluster_size,
  794. predecessor_correction,
  795. ):
  796. """Automatically extract clusters according to the Xi-steep method.
  797. This is rouphly an implementation of Figure 19 of the OPTICS paper.
  798. Parameters
  799. ----------
  800. reachability_plot : array-like of shape (n_samples,)
  801. The reachability plot, i.e. reachability ordered according to
  802. the calculated ordering, all computed by OPTICS.
  803. predecessor_plot : array-like of shape (n_samples,)
  804. Predecessors ordered according to the calculated ordering.
  805. xi : float, between 0 and 1
  806. Determines the minimum steepness on the reachability plot that
  807. constitutes a cluster boundary. For example, an upwards point in the
  808. reachability plot is defined by the ratio from one point to its
  809. successor being at most 1-xi.
  810. min_samples : int > 1
  811. The same as the min_samples given to OPTICS. Up and down steep regions
  812. can't have more then ``min_samples`` consecutive non-steep points.
  813. min_cluster_size : int > 1
  814. Minimum number of samples in an OPTICS cluster.
  815. predecessor_correction : bool
  816. Correct clusters based on the calculated predecessors.
  817. Returns
  818. -------
  819. clusters : ndarray of shape (n_clusters, 2)
  820. The list of clusters in the form of [start, end] in each row, with all
  821. indices inclusive. The clusters are ordered in a way that larger
  822. clusters encompassing smaller clusters come after those smaller
  823. clusters.
  824. """
  825. # Our implementation adds an inf to the end of reachability plot
  826. # this helps to find potential clusters at the end of the
  827. # reachability plot even if there's no upward region at the end of it.
  828. reachability_plot = np.hstack((reachability_plot, np.inf))
  829. xi_complement = 1 - xi
  830. sdas = [] # steep down areas, introduced in section 4.3.2 of the paper
  831. clusters = []
  832. index = 0
  833. mib = 0.0 # maximum in between, section 4.3.2
  834. # Our implementation corrects a mistake in the original
  835. # paper, i.e., in Definition 9 steep downward point,
  836. # r(p) * (1 - x1) <= r(p + 1) should be
  837. # r(p) * (1 - x1) >= r(p + 1)
  838. with np.errstate(invalid="ignore"):
  839. ratio = reachability_plot[:-1] / reachability_plot[1:]
  840. steep_upward = ratio <= xi_complement
  841. steep_downward = ratio >= 1 / xi_complement
  842. downward = ratio > 1
  843. upward = ratio < 1
  844. # the following loop is almost exactly as Figure 19 of the paper.
  845. # it jumps over the areas which are not either steep down or up areas
  846. for steep_index in iter(np.flatnonzero(steep_upward | steep_downward)):
  847. # just continue if steep_index has been a part of a discovered xward
  848. # area.
  849. if steep_index < index:
  850. continue
  851. mib = max(mib, np.max(reachability_plot[index : steep_index + 1]))
  852. # steep downward areas
  853. if steep_downward[steep_index]:
  854. sdas = _update_filter_sdas(sdas, mib, xi_complement, reachability_plot)
  855. D_start = steep_index
  856. D_end = _extend_region(steep_downward, upward, D_start, min_samples)
  857. D = {"start": D_start, "end": D_end, "mib": 0.0}
  858. sdas.append(D)
  859. index = D_end + 1
  860. mib = reachability_plot[index]
  861. # steep upward areas
  862. else:
  863. sdas = _update_filter_sdas(sdas, mib, xi_complement, reachability_plot)
  864. U_start = steep_index
  865. U_end = _extend_region(steep_upward, downward, U_start, min_samples)
  866. index = U_end + 1
  867. mib = reachability_plot[index]
  868. U_clusters = []
  869. for D in sdas:
  870. c_start = D["start"]
  871. c_end = U_end
  872. # line (**), sc2*
  873. if reachability_plot[c_end + 1] * xi_complement < D["mib"]:
  874. continue
  875. # Definition 11: criterion 4
  876. D_max = reachability_plot[D["start"]]
  877. if D_max * xi_complement >= reachability_plot[c_end + 1]:
  878. # Find the first index from the left side which is almost
  879. # at the same level as the end of the detected cluster.
  880. while (
  881. reachability_plot[c_start + 1] > reachability_plot[c_end + 1]
  882. and c_start < D["end"]
  883. ):
  884. c_start += 1
  885. elif reachability_plot[c_end + 1] * xi_complement >= D_max:
  886. # Find the first index from the right side which is almost
  887. # at the same level as the beginning of the detected
  888. # cluster.
  889. # Our implementation corrects a mistake in the original
  890. # paper, i.e., in Definition 11 4c, r(x) < r(sD) should be
  891. # r(x) > r(sD).
  892. while reachability_plot[c_end - 1] > D_max and c_end > U_start:
  893. c_end -= 1
  894. # predecessor correction
  895. if predecessor_correction:
  896. c_start, c_end = _correct_predecessor(
  897. reachability_plot, predecessor_plot, ordering, c_start, c_end
  898. )
  899. if c_start is None:
  900. continue
  901. # Definition 11: criterion 3.a
  902. if c_end - c_start + 1 < min_cluster_size:
  903. continue
  904. # Definition 11: criterion 1
  905. if c_start > D["end"]:
  906. continue
  907. # Definition 11: criterion 2
  908. if c_end < U_start:
  909. continue
  910. U_clusters.append((c_start, c_end))
  911. # add smaller clusters first.
  912. U_clusters.reverse()
  913. clusters.extend(U_clusters)
  914. return np.array(clusters)
  915. def _extract_xi_labels(ordering, clusters):
  916. """Extracts the labels from the clusters returned by `_xi_cluster`.
  917. We rely on the fact that clusters are stored
  918. with the smaller clusters coming before the larger ones.
  919. Parameters
  920. ----------
  921. ordering : array-like of shape (n_samples,)
  922. The ordering of points calculated by OPTICS
  923. clusters : array-like of shape (n_clusters, 2)
  924. List of clusters i.e. (start, end) tuples,
  925. as returned by `_xi_cluster`.
  926. Returns
  927. -------
  928. labels : ndarray of shape (n_samples,)
  929. """
  930. labels = np.full(len(ordering), -1, dtype=int)
  931. label = 0
  932. for c in clusters:
  933. if not np.any(labels[c[0] : (c[1] + 1)] != -1):
  934. labels[c[0] : (c[1] + 1)] = label
  935. label += 1
  936. labels[ordering] = labels.copy()
  937. return labels