graph.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. """
  2. Graph utilities and algorithms
  3. Graphs are represented with their adjacency matrices, preferably using
  4. sparse matrices.
  5. """
  6. # Authors: Aric Hagberg <hagberg@lanl.gov>
  7. # Gael Varoquaux <gael.varoquaux@normalesup.org>
  8. # Jake Vanderplas <vanderplas@astro.washington.edu>
  9. # License: BSD 3 clause
  10. import numpy as np
  11. from scipy import sparse
  12. from ..metrics.pairwise import pairwise_distances
  13. ###############################################################################
  14. # Path and connected component analysis.
  15. # Code adapted from networkx
  16. def single_source_shortest_path_length(graph, source, *, cutoff=None):
  17. """Return the length of the shortest path from source to all reachable nodes.
  18. Parameters
  19. ----------
  20. graph : {sparse matrix, ndarray} of shape (n_nodes, n_nodes)
  21. Adjacency matrix of the graph. Sparse matrix of format LIL is
  22. preferred.
  23. source : int
  24. Start node for path.
  25. cutoff : int, default=None
  26. Depth to stop the search - only paths of length <= cutoff are returned.
  27. Returns
  28. -------
  29. paths : dict
  30. Reachable end nodes mapped to length of path from source,
  31. i.e. `{end: path_length}`.
  32. Examples
  33. --------
  34. >>> from sklearn.utils.graph import single_source_shortest_path_length
  35. >>> import numpy as np
  36. >>> graph = np.array([[ 0, 1, 0, 0],
  37. ... [ 1, 0, 1, 0],
  38. ... [ 0, 1, 0, 0],
  39. ... [ 0, 0, 0, 0]])
  40. >>> single_source_shortest_path_length(graph, 0)
  41. {0: 0, 1: 1, 2: 2}
  42. >>> graph = np.ones((6, 6))
  43. >>> sorted(single_source_shortest_path_length(graph, 2).items())
  44. [(0, 1), (1, 1), (2, 0), (3, 1), (4, 1), (5, 1)]
  45. """
  46. if sparse.issparse(graph):
  47. graph = graph.tolil()
  48. else:
  49. graph = sparse.lil_matrix(graph)
  50. seen = {} # level (number of hops) when seen in BFS
  51. level = 0 # the current level
  52. next_level = [source] # dict of nodes to check at next level
  53. while next_level:
  54. this_level = next_level # advance to next level
  55. next_level = set() # and start a new list (fringe)
  56. for v in this_level:
  57. if v not in seen:
  58. seen[v] = level # set the level of vertex v
  59. next_level.update(graph.rows[v])
  60. if cutoff is not None and cutoff <= level:
  61. break
  62. level += 1
  63. return seen # return all path lengths as dictionary
  64. def _fix_connected_components(
  65. X,
  66. graph,
  67. n_connected_components,
  68. component_labels,
  69. mode="distance",
  70. metric="euclidean",
  71. **kwargs,
  72. ):
  73. """Add connections to sparse graph to connect unconnected components.
  74. For each pair of unconnected components, compute all pairwise distances
  75. from one component to the other, and add a connection on the closest pair
  76. of samples. This is a hacky way to get a graph with a single connected
  77. component, which is necessary for example to compute a shortest path
  78. between all pairs of samples in the graph.
  79. Parameters
  80. ----------
  81. X : array of shape (n_samples, n_features) or (n_samples, n_samples)
  82. Features to compute the pairwise distances. If `metric =
  83. "precomputed"`, X is the matrix of pairwise distances.
  84. graph : sparse matrix of shape (n_samples, n_samples)
  85. Graph of connection between samples.
  86. n_connected_components : int
  87. Number of connected components, as computed by
  88. `scipy.sparse.csgraph.connected_components`.
  89. component_labels : array of shape (n_samples)
  90. Labels of connected components, as computed by
  91. `scipy.sparse.csgraph.connected_components`.
  92. mode : {'connectivity', 'distance'}, default='distance'
  93. Type of graph matrix: 'connectivity' corresponds to the connectivity
  94. matrix with ones and zeros, and 'distance' corresponds to the distances
  95. between neighbors according to the given metric.
  96. metric : str
  97. Metric used in `sklearn.metrics.pairwise.pairwise_distances`.
  98. kwargs : kwargs
  99. Keyword arguments passed to
  100. `sklearn.metrics.pairwise.pairwise_distances`.
  101. Returns
  102. -------
  103. graph : sparse matrix of shape (n_samples, n_samples)
  104. Graph of connection between samples, with a single connected component.
  105. """
  106. if metric == "precomputed" and sparse.issparse(X):
  107. raise RuntimeError(
  108. "_fix_connected_components with metric='precomputed' requires the "
  109. "full distance matrix in X, and does not work with a sparse "
  110. "neighbors graph."
  111. )
  112. for i in range(n_connected_components):
  113. idx_i = np.flatnonzero(component_labels == i)
  114. Xi = X[idx_i]
  115. for j in range(i):
  116. idx_j = np.flatnonzero(component_labels == j)
  117. Xj = X[idx_j]
  118. if metric == "precomputed":
  119. D = X[np.ix_(idx_i, idx_j)]
  120. else:
  121. D = pairwise_distances(Xi, Xj, metric=metric, **kwargs)
  122. ii, jj = np.unravel_index(D.argmin(axis=None), D.shape)
  123. if mode == "connectivity":
  124. graph[idx_i[ii], idx_j[jj]] = 1
  125. graph[idx_j[jj], idx_i[ii]] = 1
  126. elif mode == "distance":
  127. graph[idx_i[ii], idx_j[jj]] = D[ii, jj]
  128. graph[idx_j[jj], idx_i[ii]] = D[ii, jj]
  129. else:
  130. raise ValueError(
  131. "Unknown mode=%r, should be one of ['connectivity', 'distance']."
  132. % mode
  133. )
  134. return graph