plot_roget.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. """
  2. =====
  3. Roget
  4. =====
  5. Build a directed graph of 1022 categories and 5075 cross-references as defined
  6. in the 1879 version of Roget's Thesaurus. This example is described in Section
  7. 1.2 of
  8. Donald E. Knuth, "The Stanford GraphBase: A Platform for Combinatorial
  9. Computing", ACM Press, New York, 1993.
  10. http://www-cs-faculty.stanford.edu/~knuth/sgb.html
  11. Note that one of the 5075 cross-references is a self loop, yet it is included
  12. in the graph built here because the standard networkx `DiGraph` class allows
  13. self loops. (cf. 400pungency:400 401 403 405).
  14. The data file can be found at:
  15. - https://github.com/networkx/networkx/blob/main/examples/graph/roget_dat.txt.gz
  16. """
  17. import gzip
  18. import re
  19. import sys
  20. import matplotlib.pyplot as plt
  21. import networkx as nx
  22. def roget_graph():
  23. """Return the thesaurus graph from the roget.dat example in
  24. the Stanford Graph Base.
  25. """
  26. # open file roget_dat.txt.gz
  27. fh = gzip.open("roget_dat.txt.gz", "r")
  28. G = nx.DiGraph()
  29. for line in fh.readlines():
  30. line = line.decode()
  31. if line.startswith("*"): # skip comments
  32. continue
  33. if line.startswith(" "): # this is a continuation line, append
  34. line = oldline + line
  35. if line.endswith("\\\n"): # continuation line, buffer, goto next
  36. oldline = line.strip("\\\n")
  37. continue
  38. (headname, tails) = line.split(":")
  39. # head
  40. numfind = re.compile(r"^\d+") # re to find the number of this word
  41. head = numfind.findall(headname)[0] # get the number
  42. G.add_node(head)
  43. for tail in tails.split():
  44. if head == tail:
  45. print("skipping self loop", head, tail, file=sys.stderr)
  46. G.add_edge(head, tail)
  47. return G
  48. G = roget_graph()
  49. print("Loaded roget_dat.txt containing 1022 categories.")
  50. print(G)
  51. UG = G.to_undirected()
  52. print(nx.number_connected_components(UG), "connected components")
  53. options = {
  54. "node_color": "black",
  55. "node_size": 1,
  56. "edge_color": "gray",
  57. "linewidths": 0,
  58. "width": 0.1,
  59. }
  60. nx.draw_circular(UG, **options)
  61. plt.show()