| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358 |
- """Hierarchical Agglomerative Clustering
- These routines perform some hierarchical agglomerative clustering of some
- input data.
- Authors : Vincent Michel, Bertrand Thirion, Alexandre Gramfort,
- Gael Varoquaux
- License: BSD 3 clause
- """
- import warnings
- from heapq import heapify, heappop, heappush, heappushpop
- from numbers import Integral, Real
- import numpy as np
- from scipy import sparse
- from scipy.sparse.csgraph import connected_components
- from ..base import (
- BaseEstimator,
- ClassNamePrefixFeaturesOutMixin,
- ClusterMixin,
- _fit_context,
- )
- from ..metrics import DistanceMetric
- from ..metrics._dist_metrics import METRIC_MAPPING64
- from ..metrics.pairwise import _VALID_METRICS, paired_distances
- from ..utils import check_array
- from ..utils._fast_dict import IntFloatDict
- from ..utils._param_validation import (
- HasMethods,
- Hidden,
- Interval,
- StrOptions,
- validate_params,
- )
- from ..utils.graph import _fix_connected_components
- from ..utils.validation import check_memory
- # mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast'
- from . import _hierarchical_fast as _hierarchical # type: ignore
- from ._feature_agglomeration import AgglomerationTransform
- ###############################################################################
- # For non fully-connected graphs
- def _fix_connectivity(X, connectivity, affinity):
- """
- Fixes the connectivity matrix.
- The different steps are:
- - copies it
- - makes it symmetric
- - converts it to LIL if necessary
- - completes it if necessary.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- Feature matrix representing `n_samples` samples to be clustered.
- connectivity : sparse matrix, default=None
- Connectivity matrix. Defines for each sample the neighboring samples
- following a given structure of the data. The matrix is assumed to
- be symmetric and only the upper triangular half is used.
- Default is `None`, i.e, the Ward algorithm is unstructured.
- affinity : {"euclidean", "precomputed"}, default="euclidean"
- Which affinity to use. At the moment `precomputed` and
- ``euclidean`` are supported. `euclidean` uses the
- negative squared Euclidean distance between points.
- Returns
- -------
- connectivity : sparse matrix
- The fixed connectivity matrix.
- n_connected_components : int
- The number of connected components in the graph.
- """
- n_samples = X.shape[0]
- if connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples:
- raise ValueError(
- "Wrong shape for connectivity matrix: %s when X is %s"
- % (connectivity.shape, X.shape)
- )
- # Make the connectivity matrix symmetric:
- connectivity = connectivity + connectivity.T
- # Convert connectivity matrix to LIL
- if not sparse.issparse(connectivity):
- connectivity = sparse.lil_matrix(connectivity)
- # `connectivity` is a sparse matrix at this point
- if connectivity.format != "lil":
- connectivity = connectivity.tolil()
- # Compute the number of nodes
- n_connected_components, labels = connected_components(connectivity)
- if n_connected_components > 1:
- warnings.warn(
- "the number of connected components of the "
- "connectivity matrix is %d > 1. Completing it to avoid "
- "stopping the tree early." % n_connected_components,
- stacklevel=2,
- )
- # XXX: Can we do without completing the matrix?
- connectivity = _fix_connected_components(
- X=X,
- graph=connectivity,
- n_connected_components=n_connected_components,
- component_labels=labels,
- metric=affinity,
- mode="connectivity",
- )
- return connectivity, n_connected_components
- def _single_linkage_tree(
- connectivity,
- n_samples,
- n_nodes,
- n_clusters,
- n_connected_components,
- return_distance,
- ):
- """
- Perform single linkage clustering on sparse data via the minimum
- spanning tree from scipy.sparse.csgraph, then using union-find to label.
- The parent array is then generated by walking through the tree.
- """
- from scipy.sparse.csgraph import minimum_spanning_tree
- # explicitly cast connectivity to ensure safety
- connectivity = connectivity.astype(np.float64, copy=False)
- # Ensure zero distances aren't ignored by setting them to "epsilon"
- epsilon_value = np.finfo(dtype=connectivity.data.dtype).eps
- connectivity.data[connectivity.data == 0] = epsilon_value
- # Use scipy.sparse.csgraph to generate a minimum spanning tree
- mst = minimum_spanning_tree(connectivity.tocsr())
- # Convert the graph to scipy.cluster.hierarchy array format
- mst = mst.tocoo()
- # Undo the epsilon values
- mst.data[mst.data == epsilon_value] = 0
- mst_array = np.vstack([mst.row, mst.col, mst.data]).T
- # Sort edges of the min_spanning_tree by weight
- mst_array = mst_array[np.argsort(mst_array.T[2], kind="mergesort"), :]
- # Convert edge list into standard hierarchical clustering format
- single_linkage_tree = _hierarchical._single_linkage_label(mst_array)
- children_ = single_linkage_tree[:, :2].astype(int)
- # Compute parents
- parent = np.arange(n_nodes, dtype=np.intp)
- for i, (left, right) in enumerate(children_, n_samples):
- if n_clusters is not None and i >= n_nodes:
- break
- if left < n_nodes:
- parent[left] = i
- if right < n_nodes:
- parent[right] = i
- if return_distance:
- distances = single_linkage_tree[:, 2]
- return children_, n_connected_components, n_samples, parent, distances
- return children_, n_connected_components, n_samples, parent
- ###############################################################################
- # Hierarchical tree building functions
- @validate_params(
- {
- "X": ["array-like"],
- "connectivity": ["array-like", "sparse matrix", None],
- "n_clusters": [Interval(Integral, 1, None, closed="left"), None],
- "return_distance": ["boolean"],
- },
- prefer_skip_nested_validation=True,
- )
- def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False):
- """Ward clustering based on a Feature matrix.
- Recursively merges the pair of clusters that minimally increases
- within-cluster variance.
- The inertia matrix uses a Heapq-based representation.
- This is the structured version, that takes into account some topological
- structure between samples.
- Read more in the :ref:`User Guide <hierarchical_clustering>`.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- Feature matrix representing `n_samples` samples to be clustered.
- connectivity : {array-like, sparse matrix}, default=None
- Connectivity matrix. Defines for each sample the neighboring samples
- following a given structure of the data. The matrix is assumed to
- be symmetric and only the upper triangular half is used.
- Default is None, i.e, the Ward algorithm is unstructured.
- n_clusters : int, default=None
- `n_clusters` should be less than `n_samples`. Stop early the
- construction of the tree at `n_clusters.` This is useful to decrease
- computation time if the number of clusters is not small compared to the
- number of samples. In this case, the complete tree is not computed, thus
- the 'children' output is of limited use, and the 'parents' output should
- rather be used. This option is valid only when specifying a connectivity
- matrix.
- return_distance : bool, default=False
- If `True`, return the distance between the clusters.
- Returns
- -------
- children : ndarray of shape (n_nodes-1, 2)
- The children of each non-leaf node. Values less than `n_samples`
- correspond to leaves of the tree which are the original samples.
- A node `i` greater than or equal to `n_samples` is a non-leaf
- node and has children `children_[i - n_samples]`. Alternatively
- at the i-th iteration, children[i][0] and children[i][1]
- are merged to form node `n_samples + i`.
- n_connected_components : int
- The number of connected components in the graph.
- n_leaves : int
- The number of leaves in the tree.
- parents : ndarray of shape (n_nodes,) or None
- The parent of each node. Only returned when a connectivity matrix
- is specified, elsewhere 'None' is returned.
- distances : ndarray of shape (n_nodes-1,)
- Only returned if `return_distance` is set to `True` (for compatibility).
- The distances between the centers of the nodes. `distances[i]`
- corresponds to a weighted Euclidean distance between
- the nodes `children[i, 1]` and `children[i, 2]`. If the nodes refer to
- leaves of the tree, then `distances[i]` is their unweighted Euclidean
- distance. Distances are updated in the following way
- (from scipy.hierarchy.linkage):
- The new entry :math:`d(u,v)` is computed as follows,
- .. math::
- d(u,v) = \\sqrt{\\frac{|v|+|s|}
- {T}d(v,s)^2
- + \\frac{|v|+|t|}
- {T}d(v,t)^2
- - \\frac{|v|}
- {T}d(s,t)^2}
- where :math:`u` is the newly joined cluster consisting of
- clusters :math:`s` and :math:`t`, :math:`v` is an unused
- cluster in the forest, :math:`T=|v|+|s|+|t|`, and
- :math:`|*|` is the cardinality of its argument. This is also
- known as the incremental algorithm.
- """
- X = np.asarray(X)
- if X.ndim == 1:
- X = np.reshape(X, (-1, 1))
- n_samples, n_features = X.shape
- if connectivity is None:
- from scipy.cluster import hierarchy # imports PIL
- if n_clusters is not None:
- warnings.warn(
- (
- "Partial build of the tree is implemented "
- "only for structured clustering (i.e. with "
- "explicit connectivity). The algorithm "
- "will build the full tree and only "
- "retain the lower branches required "
- "for the specified number of clusters"
- ),
- stacklevel=2,
- )
- X = np.require(X, requirements="W")
- out = hierarchy.ward(X)
- children_ = out[:, :2].astype(np.intp)
- if return_distance:
- distances = out[:, 2]
- return children_, 1, n_samples, None, distances
- else:
- return children_, 1, n_samples, None
- connectivity, n_connected_components = _fix_connectivity(
- X, connectivity, affinity="euclidean"
- )
- if n_clusters is None:
- n_nodes = 2 * n_samples - 1
- else:
- if n_clusters > n_samples:
- raise ValueError(
- "Cannot provide more clusters than samples. "
- "%i n_clusters was asked, and there are %i "
- "samples." % (n_clusters, n_samples)
- )
- n_nodes = 2 * n_samples - n_clusters
- # create inertia matrix
- coord_row = []
- coord_col = []
- A = []
- for ind, row in enumerate(connectivity.rows):
- A.append(row)
- # We keep only the upper triangular for the moments
- # Generator expressions are faster than arrays on the following
- row = [i for i in row if i < ind]
- coord_row.extend(
- len(row)
- * [
- ind,
- ]
- )
- coord_col.extend(row)
- coord_row = np.array(coord_row, dtype=np.intp, order="C")
- coord_col = np.array(coord_col, dtype=np.intp, order="C")
- # build moments as a list
- moments_1 = np.zeros(n_nodes, order="C")
- moments_1[:n_samples] = 1
- moments_2 = np.zeros((n_nodes, n_features), order="C")
- moments_2[:n_samples] = X
- inertia = np.empty(len(coord_row), dtype=np.float64, order="C")
- _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, inertia)
- inertia = list(zip(inertia, coord_row, coord_col))
- heapify(inertia)
- # prepare the main fields
- parent = np.arange(n_nodes, dtype=np.intp)
- used_node = np.ones(n_nodes, dtype=bool)
- children = []
- if return_distance:
- distances = np.empty(n_nodes - n_samples)
- not_visited = np.empty(n_nodes, dtype=bool, order="C")
- # recursive merge loop
- for k in range(n_samples, n_nodes):
- # identify the merge
- while True:
- inert, i, j = heappop(inertia)
- if used_node[i] and used_node[j]:
- break
- parent[i], parent[j] = k, k
- children.append((i, j))
- used_node[i] = used_node[j] = False
- if return_distance: # store inertia value
- distances[k - n_samples] = inert
- # update the moments
- moments_1[k] = moments_1[i] + moments_1[j]
- moments_2[k] = moments_2[i] + moments_2[j]
- # update the structure matrix A and the inertia matrix
- coord_col = []
- not_visited.fill(1)
- not_visited[k] = 0
- _hierarchical._get_parents(A[i], coord_col, parent, not_visited)
- _hierarchical._get_parents(A[j], coord_col, parent, not_visited)
- # List comprehension is faster than a for loop
- [A[col].append(k) for col in coord_col]
- A.append(coord_col)
- coord_col = np.array(coord_col, dtype=np.intp, order="C")
- coord_row = np.empty(coord_col.shape, dtype=np.intp, order="C")
- coord_row.fill(k)
- n_additions = len(coord_row)
- ini = np.empty(n_additions, dtype=np.float64, order="C")
- _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, ini)
- # List comprehension is faster than a for loop
- [heappush(inertia, (ini[idx], k, coord_col[idx])) for idx in range(n_additions)]
- # Separate leaves in children (empty lists up to now)
- n_leaves = n_samples
- # sort children to get consistent output with unstructured version
- children = [c[::-1] for c in children]
- children = np.array(children) # return numpy array for efficient caching
- if return_distance:
- # 2 is scaling factor to compare w/ unstructured version
- distances = np.sqrt(2.0 * distances)
- return children, n_connected_components, n_leaves, parent, distances
- else:
- return children, n_connected_components, n_leaves, parent
- # single average and complete linkage
- def linkage_tree(
- X,
- connectivity=None,
- n_clusters=None,
- linkage="complete",
- affinity="euclidean",
- return_distance=False,
- ):
- """Linkage agglomerative clustering based on a Feature matrix.
- The inertia matrix uses a Heapq-based representation.
- This is the structured version, that takes into account some topological
- structure between samples.
- Read more in the :ref:`User Guide <hierarchical_clustering>`.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- Feature matrix representing `n_samples` samples to be clustered.
- connectivity : sparse matrix, default=None
- Connectivity matrix. Defines for each sample the neighboring samples
- following a given structure of the data. The matrix is assumed to
- be symmetric and only the upper triangular half is used.
- Default is `None`, i.e, the Ward algorithm is unstructured.
- n_clusters : int, default=None
- Stop early the construction of the tree at `n_clusters`. This is
- useful to decrease computation time if the number of clusters is
- not small compared to the number of samples. In this case, the
- complete tree is not computed, thus the 'children' output is of
- limited use, and the 'parents' output should rather be used.
- This option is valid only when specifying a connectivity matrix.
- linkage : {"average", "complete", "single"}, default="complete"
- Which linkage criteria to use. The linkage criterion determines which
- distance to use between sets of observation.
- - "average" uses the average of the distances of each observation of
- the two sets.
- - "complete" or maximum linkage uses the maximum distances between
- all observations of the two sets.
- - "single" uses the minimum of the distances between all
- observations of the two sets.
- affinity : str or callable, default='euclidean'
- Which metric to use. Can be 'euclidean', 'manhattan', or any
- distance known to paired distance (see metric.pairwise).
- return_distance : bool, default=False
- Whether or not to return the distances between the clusters.
- Returns
- -------
- children : ndarray of shape (n_nodes-1, 2)
- The children of each non-leaf node. Values less than `n_samples`
- correspond to leaves of the tree which are the original samples.
- A node `i` greater than or equal to `n_samples` is a non-leaf
- node and has children `children_[i - n_samples]`. Alternatively
- at the i-th iteration, children[i][0] and children[i][1]
- are merged to form node `n_samples + i`.
- n_connected_components : int
- The number of connected components in the graph.
- n_leaves : int
- The number of leaves in the tree.
- parents : ndarray of shape (n_nodes, ) or None
- The parent of each node. Only returned when a connectivity matrix
- is specified, elsewhere 'None' is returned.
- distances : ndarray of shape (n_nodes-1,)
- Returned when `return_distance` is set to `True`.
- distances[i] refers to the distance between children[i][0] and
- children[i][1] when they are merged.
- See Also
- --------
- ward_tree : Hierarchical clustering with ward linkage.
- """
- X = np.asarray(X)
- if X.ndim == 1:
- X = np.reshape(X, (-1, 1))
- n_samples, n_features = X.shape
- linkage_choices = {
- "complete": _hierarchical.max_merge,
- "average": _hierarchical.average_merge,
- "single": None,
- } # Single linkage is handled differently
- try:
- join_func = linkage_choices[linkage]
- except KeyError as e:
- raise ValueError(
- "Unknown linkage option, linkage should be one of %s, but %s was given"
- % (linkage_choices.keys(), linkage)
- ) from e
- if affinity == "cosine" and np.any(~np.any(X, axis=1)):
- raise ValueError("Cosine affinity cannot be used when X contains zero vectors")
- if connectivity is None:
- from scipy.cluster import hierarchy # imports PIL
- if n_clusters is not None:
- warnings.warn(
- (
- "Partial build of the tree is implemented "
- "only for structured clustering (i.e. with "
- "explicit connectivity). The algorithm "
- "will build the full tree and only "
- "retain the lower branches required "
- "for the specified number of clusters"
- ),
- stacklevel=2,
- )
- if affinity == "precomputed":
- # for the linkage function of hierarchy to work on precomputed
- # data, provide as first argument an ndarray of the shape returned
- # by sklearn.metrics.pairwise_distances.
- if X.shape[0] != X.shape[1]:
- raise ValueError(
- f"Distance matrix should be square, got matrix of shape {X.shape}"
- )
- i, j = np.triu_indices(X.shape[0], k=1)
- X = X[i, j]
- elif affinity == "l2":
- # Translate to something understood by scipy
- affinity = "euclidean"
- elif affinity in ("l1", "manhattan"):
- affinity = "cityblock"
- elif callable(affinity):
- X = affinity(X)
- i, j = np.triu_indices(X.shape[0], k=1)
- X = X[i, j]
- if (
- linkage == "single"
- and affinity != "precomputed"
- and not callable(affinity)
- and affinity in METRIC_MAPPING64
- ):
- # We need the fast cythonized metric from neighbors
- dist_metric = DistanceMetric.get_metric(affinity)
- # The Cython routines used require contiguous arrays
- X = np.ascontiguousarray(X, dtype=np.double)
- mst = _hierarchical.mst_linkage_core(X, dist_metric)
- # Sort edges of the min_spanning_tree by weight
- mst = mst[np.argsort(mst.T[2], kind="mergesort"), :]
- # Convert edge list into standard hierarchical clustering format
- out = _hierarchical.single_linkage_label(mst)
- else:
- out = hierarchy.linkage(X, method=linkage, metric=affinity)
- children_ = out[:, :2].astype(int, copy=False)
- if return_distance:
- distances = out[:, 2]
- return children_, 1, n_samples, None, distances
- return children_, 1, n_samples, None
- connectivity, n_connected_components = _fix_connectivity(
- X, connectivity, affinity=affinity
- )
- connectivity = connectivity.tocoo()
- # Put the diagonal to zero
- diag_mask = connectivity.row != connectivity.col
- connectivity.row = connectivity.row[diag_mask]
- connectivity.col = connectivity.col[diag_mask]
- connectivity.data = connectivity.data[diag_mask]
- del diag_mask
- if affinity == "precomputed":
- distances = X[connectivity.row, connectivity.col].astype(np.float64, copy=False)
- else:
- # FIXME We compute all the distances, while we could have only computed
- # the "interesting" distances
- distances = paired_distances(
- X[connectivity.row], X[connectivity.col], metric=affinity
- )
- connectivity.data = distances
- if n_clusters is None:
- n_nodes = 2 * n_samples - 1
- else:
- assert n_clusters <= n_samples
- n_nodes = 2 * n_samples - n_clusters
- if linkage == "single":
- return _single_linkage_tree(
- connectivity,
- n_samples,
- n_nodes,
- n_clusters,
- n_connected_components,
- return_distance,
- )
- if return_distance:
- distances = np.empty(n_nodes - n_samples)
- # create inertia heap and connection matrix
- A = np.empty(n_nodes, dtype=object)
- inertia = list()
- # LIL seems to the best format to access the rows quickly,
- # without the numpy overhead of slicing CSR indices and data.
- connectivity = connectivity.tolil()
- # We are storing the graph in a list of IntFloatDict
- for ind, (data, row) in enumerate(zip(connectivity.data, connectivity.rows)):
- A[ind] = IntFloatDict(
- np.asarray(row, dtype=np.intp), np.asarray(data, dtype=np.float64)
- )
- # We keep only the upper triangular for the heap
- # Generator expressions are faster than arrays on the following
- inertia.extend(
- _hierarchical.WeightedEdge(d, ind, r) for r, d in zip(row, data) if r < ind
- )
- del connectivity
- heapify(inertia)
- # prepare the main fields
- parent = np.arange(n_nodes, dtype=np.intp)
- used_node = np.ones(n_nodes, dtype=np.intp)
- children = []
- # recursive merge loop
- for k in range(n_samples, n_nodes):
- # identify the merge
- while True:
- edge = heappop(inertia)
- if used_node[edge.a] and used_node[edge.b]:
- break
- i = edge.a
- j = edge.b
- if return_distance:
- # store distances
- distances[k - n_samples] = edge.weight
- parent[i] = parent[j] = k
- children.append((i, j))
- # Keep track of the number of elements per cluster
- n_i = used_node[i]
- n_j = used_node[j]
- used_node[k] = n_i + n_j
- used_node[i] = used_node[j] = False
- # update the structure matrix A and the inertia matrix
- # a clever 'min', or 'max' operation between A[i] and A[j]
- coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
- for col, d in coord_col:
- A[col].append(k, d)
- # Here we use the information from coord_col (containing the
- # distances) to update the heap
- heappush(inertia, _hierarchical.WeightedEdge(d, k, col))
- A[k] = coord_col
- # Clear A[i] and A[j] to save memory
- A[i] = A[j] = 0
- # Separate leaves in children (empty lists up to now)
- n_leaves = n_samples
- # # return numpy array for efficient caching
- children = np.array(children)[:, ::-1]
- if return_distance:
- return children, n_connected_components, n_leaves, parent, distances
- return children, n_connected_components, n_leaves, parent
- # Matching names to tree-building strategies
- def _complete_linkage(*args, **kwargs):
- kwargs["linkage"] = "complete"
- return linkage_tree(*args, **kwargs)
- def _average_linkage(*args, **kwargs):
- kwargs["linkage"] = "average"
- return linkage_tree(*args, **kwargs)
- def _single_linkage(*args, **kwargs):
- kwargs["linkage"] = "single"
- return linkage_tree(*args, **kwargs)
- _TREE_BUILDERS = dict(
- ward=ward_tree,
- complete=_complete_linkage,
- average=_average_linkage,
- single=_single_linkage,
- )
- ###############################################################################
- # Functions for cutting hierarchical clustering tree
- def _hc_cut(n_clusters, children, n_leaves):
- """Function cutting the ward tree for a given number of clusters.
- Parameters
- ----------
- n_clusters : int or ndarray
- The number of clusters to form.
- children : ndarray of shape (n_nodes-1, 2)
- The children of each non-leaf node. Values less than `n_samples`
- correspond to leaves of the tree which are the original samples.
- A node `i` greater than or equal to `n_samples` is a non-leaf
- node and has children `children_[i - n_samples]`. Alternatively
- at the i-th iteration, children[i][0] and children[i][1]
- are merged to form node `n_samples + i`.
- n_leaves : int
- Number of leaves of the tree.
- Returns
- -------
- labels : array [n_samples]
- Cluster labels for each point.
- """
- if n_clusters > n_leaves:
- raise ValueError(
- "Cannot extract more clusters than samples: "
- "%s clusters where given for a tree with %s leaves."
- % (n_clusters, n_leaves)
- )
- # In this function, we store nodes as a heap to avoid recomputing
- # the max of the nodes: the first element is always the smallest
- # We use negated indices as heaps work on smallest elements, and we
- # are interested in largest elements
- # children[-1] is the root of the tree
- nodes = [-(max(children[-1]) + 1)]
- for _ in range(n_clusters - 1):
- # As we have a heap, nodes[0] is the smallest element
- these_children = children[-nodes[0] - n_leaves]
- # Insert the 2 children and remove the largest node
- heappush(nodes, -these_children[0])
- heappushpop(nodes, -these_children[1])
- label = np.zeros(n_leaves, dtype=np.intp)
- for i, node in enumerate(nodes):
- label[_hierarchical._hc_get_descendent(-node, children, n_leaves)] = i
- return label
- ###############################################################################
- class AgglomerativeClustering(ClusterMixin, BaseEstimator):
- """
- Agglomerative Clustering.
- Recursively merges pair of clusters of sample data; uses linkage distance.
- Read more in the :ref:`User Guide <hierarchical_clustering>`.
- Parameters
- ----------
- n_clusters : int or None, default=2
- The number of clusters to find. It must be ``None`` if
- ``distance_threshold`` is not ``None``.
- affinity : str or callable, default='euclidean'
- The metric to use when calculating distance between instances in a
- feature array. If metric is a string or callable, it must be one of
- the options allowed by :func:`sklearn.metrics.pairwise_distances` for
- its metric parameter.
- If linkage is "ward", only "euclidean" is accepted.
- If "precomputed", a distance matrix (instead of a similarity matrix)
- is needed as input for the fit method.
- .. deprecated:: 1.2
- `affinity` was deprecated in version 1.2 and will be renamed to
- `metric` in 1.4.
- metric : str or callable, default=None
- Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
- "manhattan", "cosine", or "precomputed". If set to `None` then
- "euclidean" is used. If linkage is "ward", only "euclidean" is
- accepted. If "precomputed", a distance matrix is needed as input for
- the fit method.
- .. versionadded:: 1.2
- memory : str or object with the joblib.Memory interface, default=None
- Used to cache the output of the computation of the tree.
- By default, no caching is done. If a string is given, it is the
- path to the caching directory.
- connectivity : array-like or callable, default=None
- Connectivity matrix. Defines for each sample the neighboring
- samples following a given structure of the data.
- This can be a connectivity matrix itself or a callable that transforms
- the data into a connectivity matrix, such as derived from
- `kneighbors_graph`. Default is ``None``, i.e, the
- hierarchical clustering algorithm is unstructured.
- compute_full_tree : 'auto' or bool, default='auto'
- Stop early the construction of the tree at ``n_clusters``. This is
- useful to decrease computation time if the number of clusters is not
- small compared to the number of samples. This option is useful only
- when specifying a connectivity matrix. Note also that when varying the
- number of clusters and using caching, it may be advantageous to compute
- the full tree. It must be ``True`` if ``distance_threshold`` is not
- ``None``. By default `compute_full_tree` is "auto", which is equivalent
- to `True` when `distance_threshold` is not `None` or that `n_clusters`
- is inferior to the maximum between 100 or `0.02 * n_samples`.
- Otherwise, "auto" is equivalent to `False`.
- linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
- Which linkage criterion to use. The linkage criterion determines which
- distance to use between sets of observation. The algorithm will merge
- the pairs of cluster that minimize this criterion.
- - 'ward' minimizes the variance of the clusters being merged.
- - 'average' uses the average of the distances of each observation of
- the two sets.
- - 'complete' or 'maximum' linkage uses the maximum distances between
- all observations of the two sets.
- - 'single' uses the minimum of the distances between all observations
- of the two sets.
- .. versionadded:: 0.20
- Added the 'single' option
- distance_threshold : float, default=None
- The linkage distance threshold at or above which clusters will not be
- merged. If not ``None``, ``n_clusters`` must be ``None`` and
- ``compute_full_tree`` must be ``True``.
- .. versionadded:: 0.21
- compute_distances : bool, default=False
- Computes distances between clusters even if `distance_threshold` is not
- used. This can be used to make dendrogram visualization, but introduces
- a computational and memory overhead.
- .. versionadded:: 0.24
- Attributes
- ----------
- n_clusters_ : int
- The number of clusters found by the algorithm. If
- ``distance_threshold=None``, it will be equal to the given
- ``n_clusters``.
- labels_ : ndarray of shape (n_samples)
- Cluster labels for each point.
- n_leaves_ : int
- Number of leaves in the hierarchical tree.
- n_connected_components_ : int
- The estimated number of connected components in the graph.
- .. versionadded:: 0.21
- ``n_connected_components_`` was added to replace ``n_components_``.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- children_ : array-like of shape (n_samples-1, 2)
- The children of each non-leaf node. Values less than `n_samples`
- correspond to leaves of the tree which are the original samples.
- A node `i` greater than or equal to `n_samples` is a non-leaf
- node and has children `children_[i - n_samples]`. Alternatively
- at the i-th iteration, children[i][0] and children[i][1]
- are merged to form node `n_samples + i`.
- distances_ : array-like of shape (n_nodes-1,)
- Distances between nodes in the corresponding place in `children_`.
- Only computed if `distance_threshold` is used or `compute_distances`
- is set to `True`.
- See Also
- --------
- FeatureAgglomeration : Agglomerative clustering but for features instead of
- samples.
- ward_tree : Hierarchical clustering with ward linkage.
- Examples
- --------
- >>> from sklearn.cluster import AgglomerativeClustering
- >>> import numpy as np
- >>> X = np.array([[1, 2], [1, 4], [1, 0],
- ... [4, 2], [4, 4], [4, 0]])
- >>> clustering = AgglomerativeClustering().fit(X)
- >>> clustering
- AgglomerativeClustering()
- >>> clustering.labels_
- array([1, 1, 1, 0, 0, 0])
- """
- _parameter_constraints: dict = {
- "n_clusters": [Interval(Integral, 1, None, closed="left"), None],
- "affinity": [
- Hidden(StrOptions({"deprecated"})),
- StrOptions(set(_VALID_METRICS) | {"precomputed"}),
- callable,
- ],
- "metric": [
- StrOptions(set(_VALID_METRICS) | {"precomputed"}),
- callable,
- None,
- ],
- "memory": [str, HasMethods("cache"), None],
- "connectivity": ["array-like", callable, None],
- "compute_full_tree": [StrOptions({"auto"}), "boolean"],
- "linkage": [StrOptions(set(_TREE_BUILDERS.keys()))],
- "distance_threshold": [Interval(Real, 0, None, closed="left"), None],
- "compute_distances": ["boolean"],
- }
- def __init__(
- self,
- n_clusters=2,
- *,
- affinity="deprecated", # TODO(1.4): Remove
- metric=None, # TODO(1.4): Set to "euclidean"
- memory=None,
- connectivity=None,
- compute_full_tree="auto",
- linkage="ward",
- distance_threshold=None,
- compute_distances=False,
- ):
- self.n_clusters = n_clusters
- self.distance_threshold = distance_threshold
- self.memory = memory
- self.connectivity = connectivity
- self.compute_full_tree = compute_full_tree
- self.linkage = linkage
- self.affinity = affinity
- self.metric = metric
- self.compute_distances = compute_distances
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y=None):
- """Fit the hierarchical clustering from features, or distance matrix.
- Parameters
- ----------
- X : array-like, shape (n_samples, n_features) or \
- (n_samples, n_samples)
- Training instances to cluster, or distances between instances if
- ``metric='precomputed'``.
- y : Ignored
- Not used, present here for API consistency by convention.
- Returns
- -------
- self : object
- Returns the fitted instance.
- """
- X = self._validate_data(X, ensure_min_samples=2)
- return self._fit(X)
- def _fit(self, X):
- """Fit without validation
- Parameters
- ----------
- X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
- Training instances to cluster, or distances between instances if
- ``affinity='precomputed'``.
- Returns
- -------
- self : object
- Returns the fitted instance.
- """
- memory = check_memory(self.memory)
- self._metric = self.metric
- # TODO(1.4): Remove
- if self.affinity != "deprecated":
- if self.metric is not None:
- raise ValueError(
- "Both `affinity` and `metric` attributes were set. Attribute"
- " `affinity` was deprecated in version 1.2 and will be removed in"
- " 1.4. To avoid this error, only set the `metric` attribute."
- )
- warnings.warn(
- (
- "Attribute `affinity` was deprecated in version 1.2 and will be"
- " removed in 1.4. Use `metric` instead"
- ),
- FutureWarning,
- )
- self._metric = self.affinity
- elif self.metric is None:
- self._metric = "euclidean"
- if not ((self.n_clusters is None) ^ (self.distance_threshold is None)):
- raise ValueError(
- "Exactly one of n_clusters and "
- "distance_threshold has to be set, and the other "
- "needs to be None."
- )
- if self.distance_threshold is not None and not self.compute_full_tree:
- raise ValueError(
- "compute_full_tree must be True if distance_threshold is set."
- )
- if self.linkage == "ward" and self._metric != "euclidean":
- raise ValueError(
- f"{self._metric} was provided as metric. Ward can only "
- "work with euclidean distances."
- )
- tree_builder = _TREE_BUILDERS[self.linkage]
- connectivity = self.connectivity
- if self.connectivity is not None:
- if callable(self.connectivity):
- connectivity = self.connectivity(X)
- connectivity = check_array(
- connectivity, accept_sparse=["csr", "coo", "lil"]
- )
- n_samples = len(X)
- compute_full_tree = self.compute_full_tree
- if self.connectivity is None:
- compute_full_tree = True
- if compute_full_tree == "auto":
- if self.distance_threshold is not None:
- compute_full_tree = True
- else:
- # Early stopping is likely to give a speed up only for
- # a large number of clusters. The actual threshold
- # implemented here is heuristic
- compute_full_tree = self.n_clusters < max(100, 0.02 * n_samples)
- n_clusters = self.n_clusters
- if compute_full_tree:
- n_clusters = None
- # Construct the tree
- kwargs = {}
- if self.linkage != "ward":
- kwargs["linkage"] = self.linkage
- kwargs["affinity"] = self._metric
- distance_threshold = self.distance_threshold
- return_distance = (distance_threshold is not None) or self.compute_distances
- out = memory.cache(tree_builder)(
- X,
- connectivity=connectivity,
- n_clusters=n_clusters,
- return_distance=return_distance,
- **kwargs,
- )
- (self.children_, self.n_connected_components_, self.n_leaves_, parents) = out[
- :4
- ]
- if return_distance:
- self.distances_ = out[-1]
- if self.distance_threshold is not None: # distance_threshold is used
- self.n_clusters_ = (
- np.count_nonzero(self.distances_ >= distance_threshold) + 1
- )
- else: # n_clusters is used
- self.n_clusters_ = self.n_clusters
- # Cut the tree
- if compute_full_tree:
- self.labels_ = _hc_cut(self.n_clusters_, self.children_, self.n_leaves_)
- else:
- labels = _hierarchical.hc_get_heads(parents, copy=False)
- # copy to avoid holding a reference on the original array
- labels = np.copy(labels[:n_samples])
- # Reassign cluster numbers
- self.labels_ = np.searchsorted(np.unique(labels), labels)
- return self
- def fit_predict(self, X, y=None):
- """Fit and return the result of each sample's clustering assignment.
- In addition to fitting, this method also return the result of the
- clustering assignment for each sample in the training set.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features) or \
- (n_samples, n_samples)
- Training instances to cluster, or distances between instances if
- ``affinity='precomputed'``.
- y : Ignored
- Not used, present here for API consistency by convention.
- Returns
- -------
- labels : ndarray of shape (n_samples,)
- Cluster labels.
- """
- return super().fit_predict(X, y)
- class FeatureAgglomeration(
- ClassNamePrefixFeaturesOutMixin, AgglomerativeClustering, AgglomerationTransform
- ):
- """Agglomerate features.
- Recursively merges pair of clusters of features.
- Read more in the :ref:`User Guide <hierarchical_clustering>`.
- Parameters
- ----------
- n_clusters : int or None, default=2
- The number of clusters to find. It must be ``None`` if
- ``distance_threshold`` is not ``None``.
- affinity : str or callable, default='euclidean'
- The metric to use when calculating distance between instances in a
- feature array. If metric is a string or callable, it must be one of
- the options allowed by :func:`sklearn.metrics.pairwise_distances` for
- its metric parameter.
- If linkage is "ward", only "euclidean" is accepted.
- If "precomputed", a distance matrix (instead of a similarity matrix)
- is needed as input for the fit method.
- .. deprecated:: 1.2
- `affinity` was deprecated in version 1.2 and will be renamed to
- `metric` in 1.4.
- metric : str or callable, default=None
- Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
- "manhattan", "cosine", or "precomputed". If set to `None` then
- "euclidean" is used. If linkage is "ward", only "euclidean" is
- accepted. If "precomputed", a distance matrix is needed as input for
- the fit method.
- .. versionadded:: 1.2
- memory : str or object with the joblib.Memory interface, default=None
- Used to cache the output of the computation of the tree.
- By default, no caching is done. If a string is given, it is the
- path to the caching directory.
- connectivity : array-like or callable, default=None
- Connectivity matrix. Defines for each feature the neighboring
- features following a given structure of the data.
- This can be a connectivity matrix itself or a callable that transforms
- the data into a connectivity matrix, such as derived from
- `kneighbors_graph`. Default is `None`, i.e, the
- hierarchical clustering algorithm is unstructured.
- compute_full_tree : 'auto' or bool, default='auto'
- Stop early the construction of the tree at `n_clusters`. This is useful
- to decrease computation time if the number of clusters is not small
- compared to the number of features. This option is useful only when
- specifying a connectivity matrix. Note also that when varying the
- number of clusters and using caching, it may be advantageous to compute
- the full tree. It must be ``True`` if ``distance_threshold`` is not
- ``None``. By default `compute_full_tree` is "auto", which is equivalent
- to `True` when `distance_threshold` is not `None` or that `n_clusters`
- is inferior to the maximum between 100 or `0.02 * n_samples`.
- Otherwise, "auto" is equivalent to `False`.
- linkage : {"ward", "complete", "average", "single"}, default="ward"
- Which linkage criterion to use. The linkage criterion determines which
- distance to use between sets of features. The algorithm will merge
- the pairs of cluster that minimize this criterion.
- - "ward" minimizes the variance of the clusters being merged.
- - "complete" or maximum linkage uses the maximum distances between
- all features of the two sets.
- - "average" uses the average of the distances of each feature of
- the two sets.
- - "single" uses the minimum of the distances between all features
- of the two sets.
- pooling_func : callable, default=np.mean
- This combines the values of agglomerated features into a single
- value, and should accept an array of shape [M, N] and the keyword
- argument `axis=1`, and reduce it to an array of size [M].
- distance_threshold : float, default=None
- The linkage distance threshold at or above which clusters will not be
- merged. If not ``None``, ``n_clusters`` must be ``None`` and
- ``compute_full_tree`` must be ``True``.
- .. versionadded:: 0.21
- compute_distances : bool, default=False
- Computes distances between clusters even if `distance_threshold` is not
- used. This can be used to make dendrogram visualization, but introduces
- a computational and memory overhead.
- .. versionadded:: 0.24
- Attributes
- ----------
- n_clusters_ : int
- The number of clusters found by the algorithm. If
- ``distance_threshold=None``, it will be equal to the given
- ``n_clusters``.
- labels_ : array-like of (n_features,)
- Cluster labels for each feature.
- n_leaves_ : int
- Number of leaves in the hierarchical tree.
- n_connected_components_ : int
- The estimated number of connected components in the graph.
- .. versionadded:: 0.21
- ``n_connected_components_`` was added to replace ``n_components_``.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- children_ : array-like of shape (n_nodes-1, 2)
- The children of each non-leaf node. Values less than `n_features`
- correspond to leaves of the tree which are the original samples.
- A node `i` greater than or equal to `n_features` is a non-leaf
- node and has children `children_[i - n_features]`. Alternatively
- at the i-th iteration, children[i][0] and children[i][1]
- are merged to form node `n_features + i`.
- distances_ : array-like of shape (n_nodes-1,)
- Distances between nodes in the corresponding place in `children_`.
- Only computed if `distance_threshold` is used or `compute_distances`
- is set to `True`.
- See Also
- --------
- AgglomerativeClustering : Agglomerative clustering samples instead of
- features.
- ward_tree : Hierarchical clustering with ward linkage.
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn import datasets, cluster
- >>> digits = datasets.load_digits()
- >>> images = digits.images
- >>> X = np.reshape(images, (len(images), -1))
- >>> agglo = cluster.FeatureAgglomeration(n_clusters=32)
- >>> agglo.fit(X)
- FeatureAgglomeration(n_clusters=32)
- >>> X_reduced = agglo.transform(X)
- >>> X_reduced.shape
- (1797, 32)
- """
- _parameter_constraints: dict = {
- "n_clusters": [Interval(Integral, 1, None, closed="left"), None],
- "affinity": [
- Hidden(StrOptions({"deprecated"})),
- StrOptions(set(_VALID_METRICS) | {"precomputed"}),
- callable,
- ],
- "metric": [
- StrOptions(set(_VALID_METRICS) | {"precomputed"}),
- callable,
- None,
- ],
- "memory": [str, HasMethods("cache"), None],
- "connectivity": ["array-like", callable, None],
- "compute_full_tree": [StrOptions({"auto"}), "boolean"],
- "linkage": [StrOptions(set(_TREE_BUILDERS.keys()))],
- "pooling_func": [callable],
- "distance_threshold": [Interval(Real, 0, None, closed="left"), None],
- "compute_distances": ["boolean"],
- }
- def __init__(
- self,
- n_clusters=2,
- *,
- affinity="deprecated", # TODO(1.4): Remove
- metric=None, # TODO(1.4): Set to "euclidean"
- memory=None,
- connectivity=None,
- compute_full_tree="auto",
- linkage="ward",
- pooling_func=np.mean,
- distance_threshold=None,
- compute_distances=False,
- ):
- super().__init__(
- n_clusters=n_clusters,
- memory=memory,
- connectivity=connectivity,
- compute_full_tree=compute_full_tree,
- linkage=linkage,
- affinity=affinity,
- metric=metric,
- distance_threshold=distance_threshold,
- compute_distances=compute_distances,
- )
- self.pooling_func = pooling_func
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y=None):
- """Fit the hierarchical clustering on the data.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- The data.
- y : Ignored
- Not used, present here for API consistency by convention.
- Returns
- -------
- self : object
- Returns the transformer.
- """
- X = self._validate_data(X, ensure_min_features=2)
- super()._fit(X.T)
- self._n_features_out = self.n_clusters_
- return self
- @property
- def fit_predict(self):
- """Fit and return the result of each sample's clustering assignment."""
- raise AttributeError
|