github.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. import requests
  2. import fsspec
  3. from ..spec import AbstractFileSystem
  4. from ..utils import infer_storage_options
  5. from .memory import MemoryFile
  6. # TODO: add GIST backend, would be very similar
  class GithubFileSystem(AbstractFileSystem):
      """Interface to files in github

      An instance of this class provides the files residing within a remote github
      repository. You may specify a point in the repo's history, by SHA, branch
      or tag (default is the repository's default branch).

      Given that code files tend to be small, and that github does not support
      retrieving partial content, we always fetch whole files.

      When using fsspec.open, allows URIs of the form:

      - "github://path/file", in which case you must specify org, repo and
        may specify sha in the extra args
      - 'github://org:repo@/precip/catalog.yml', where the org and repo are
        part of the URI
      - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included

      ``sha`` can be the full or abbreviated hex of the commit you want to fetch
      from, or a branch or tag name (so long as it doesn't contain special characters
      like "/", "?", which would have to be HTTP-encoded).

      For authorised access, you must provide username and token, which can be made
      at https://github.com/settings/tokens
      """

      # Tree listing endpoint; ``sha`` may be a ref name or a tree/commit SHA
      url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
      # Raw file content endpoint
      rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
      protocol = "github"
      timeout = (60, 60)  # connect, read timeouts

      def __init__(
          self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
      ):
          """
          Parameters
          ----------
          org: str
              Name of the github organisation or user owning the repository
          repo: str
              Name of the repository
          sha: str (optional)
              Commit SHA, branch or tag name to read from; if not given, the
              repository's default branch is looked up via the API
          username, token: str (optional)
              Credentials for authorised access; both or neither must be given
          timeout: tuple (optional)
              (connect, read) timeouts passed to ``requests``; overrides the
              class-level default
          kwargs:
              Passed on to the superclass constructor

          Raises
          ------
          ValueError
              If exactly one of ``username`` and ``token`` is provided
          """
          super().__init__(**kwargs)
          self.org = org
          self.repo = repo
          # XOR: exactly one of the pair being supplied is an error
          if (username is None) ^ (token is None):
              raise ValueError("Auth required both username and token")
          self.username = username
          self.token = token
          if timeout is not None:
              self.timeout = timeout
          if sha is None:
              # look up default branch (not necessarily "master")
              u = "https://api.github.com/repos/{org}/{repo}"
              r = requests.get(
                  u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
              )
              r.raise_for_status()
              sha = r.json()["default_branch"]

          self.root = sha
          # List the root once: fills the directory cache and fails early if the
          # repository or reference cannot be read
          self.ls("")

      @property
      def kw(self):
          """Extra keyword arguments for ``requests`` calls (basic auth, if set)"""
          if self.username:
              return {"auth": (self.username, self.token)}
          return {}

      @classmethod
      def repos(cls, org_or_user, is_org=True):
          """List repo names for given org or user

          This may become the top level of the FS

          Parameters
          ----------
          org_or_user: str
              Name of the github org or user to query
          is_org: bool (default True)
              Whether the name is an organisation (True) or user (False)

          Returns
          -------
          List of string
          """
          kind = "orgs" if is_org else "users"
          # NOTE: a single unauthenticated request is made; if the API paginates
          # the listing, only the first page is returned
          r = requests.get(
              f"https://api.github.com/{kind}/{org_or_user}/repos",
              timeout=cls.timeout,
          )
          r.raise_for_status()
          return [repo["name"] for repo in r.json()]

      def _ref_names(self, kind):
          """Names of the entries listed at ``repos/{org}/{repo}/{kind}``

          A single request is made; if the API paginates the listing, only the
          first page is returned.
          """
          r = requests.get(
              f"https://api.github.com/repos/{self.org}/{self.repo}/{kind}",
              timeout=self.timeout,
              **self.kw,
          )
          r.raise_for_status()
          return [t["name"] for t in r.json()]

      @property
      def tags(self):
          """Names of tags in the repo"""
          return self._ref_names("tags")

      @property
      def branches(self):
          """Names of branches in the repo"""
          return self._ref_names("branches")

      @property
      def refs(self):
          """Named references, tags and branches"""
          return {"tags": self.tags, "branches": self.branches}

      def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
          """List files at given path

          Parameters
          ----------
          path: str
              Location to list, relative to repo root
          detail: bool
              If True, returns list of dicts, one per file; if False, returns
              list of full filenames only
          sha: str (optional)
              List at the given point in the repo history, branch or tag name or commit
              SHA
          _sha: str (optional)
              List this specific tree object (used internally to descend into trees)

          Returns
          -------
          List of dicts (``detail=True``) or sorted list of names. If ``path`` is
          a file, the list has exactly one entry describing that file.

          Raises
          ------
          FileNotFoundError
              If a component of ``path`` does not exist, or the API returns 404
          """
          path = self._strip_protocol(path)
          if path == "":
              _sha = sha or self.root
          if _sha is None:
              # The tree SHA of ``path`` is unknown: walk down from the root one
              # component at a time, picking up each subtree's SHA on the way
              parts = path.rstrip("/").split("/")
              so_far = ""
              _sha = sha or self.root
              for part in parts:
                  out = self.ls(so_far, True, sha=sha, _sha=_sha)
                  so_far += "/" + part if so_far else part
                  out = [o for o in out if o["name"] == so_far]
                  if not out:
                      raise FileNotFoundError(path)
                  out = out[0]
                  if out["type"] == "file":
                      # Listing a file yields a one-element list in both modes
                      if detail:
                          return [out]
                      else:
                          return [path]
                  _sha = out["sha"]
          # Only listings at the default reference are cached; an explicit,
          # different ``sha`` always goes to the API
          if path not in self.dircache or sha not in [self.root, None]:
              r = requests.get(
                  self.url.format(org=self.org, repo=self.repo, sha=_sha),
                  timeout=self.timeout,
                  **self.kw,
              )
              if r.status_code == 404:
                  raise FileNotFoundError(path)
              r.raise_for_status()
              types = {"blob": "file", "tree": "directory"}
              out = [
                  {
                      "name": path + "/" + f["path"] if path else f["path"],
                      "mode": f["mode"],
                      "type": types[f["type"]],
                      "size": f.get("size", 0),
                      "sha": f["sha"],
                  }
                  for f in r.json()["tree"]
                  # entries of any other type are skipped
                  if f["type"] in types
              ]
              if sha in [self.root, None]:
                  self.dircache[path] = out
          else:
              out = self.dircache[path]
          if detail:
              return out
          else:
              return sorted([f["name"] for f in out])

      def invalidate_cache(self, path=None):
          """Drop all cached listings (``path`` is ignored; the whole cache is cleared)"""
          self.dircache.clear()

      @classmethod
      def _strip_protocol(cls, path):
          """Reduce a URL to the path within the repo

          URLs carrying "org:repo@" are reduced to their path component with
          leading slashes removed; anything else is handled by the superclass.
          """
          opts = infer_storage_options(path)
          if "username" not in opts:
              return super()._strip_protocol(path)
          return opts["path"].lstrip("/")

      @staticmethod
      def _get_kwargs_from_urls(path):
          """Extract constructor arguments embedded in a URL

          For 'github://org:repo@sha/path', the URL's username, password and host
          positions hold org, repo and (optionally) sha respectively. Returns an
          empty dict when the URL carries no "org:repo@" part.
          """
          opts = infer_storage_options(path)
          if "username" not in opts:
              return {}
          out = {"org": opts["username"], "repo": opts["password"]}
          if opts["host"]:
              out["sha"] = opts["host"]
          return out

      def _open(
          self,
          path,
          mode="rb",
          block_size=None,
          autocommit=True,
          cache_options=None,
          sha=None,
          **kwargs,
      ):
          """Fetch a whole file and return it as an in-memory file object

          Parameters
          ----------
          path: str
              Location of the file, relative to repo root
          mode: str
              Only "rb" is supported
          block_size, autocommit, cache_options:
              Accepted for interface compatibility; not used here
          sha: str (optional)
              Reference to read from, instead of the instance's default

          Raises
          ------
          NotImplementedError
              If ``mode`` is not "rb"
          FileNotFoundError
              If the server responds with 404
          """
          if mode != "rb":
              raise NotImplementedError
          url = self.rurl.format(
              org=self.org, repo=self.repo, path=path, sha=sha or self.root
          )
          r = requests.get(url, timeout=self.timeout, **self.kw)
          if r.status_code == 404:
              raise FileNotFoundError(path)
          r.raise_for_status()
          return MemoryFile(None, None, r.content)

      def cat(self, path, recursive=False, on_error="raise", **kwargs):
          """Fetch the contents of one or more files at the default reference

          Parameters
          ----------
          path: str or list of str
              Path(s) to fetch; expanded with ``expand_path``
          recursive: bool
              Passed to ``expand_path``
          on_error: "raise", "omit" or "return"
              What to do when fetching a path fails: raise the exception, leave
              the path out of the result, or store the exception as its value

          Returns
          -------
          dict of {path: bytes}, keyed by path within the repo. A dict is
          returned even when a single path is requested.
          """
          paths = self.expand_path(path, recursive=recursive)
          urls = [
              self.rurl.format(org=self.org, repo=self.repo, path=p, sha=self.root)
              for p in paths
          ]
          fs = fsspec.filesystem("http")
          # Collect failures as values so that ``on_error`` can be applied per
          # path below, with the path (not the URL) available
          data = fs.cat(urls, on_error="return")
          out = {}
          for p, u in zip(paths, urls):
              # Look up by URL rather than relying on the ordering of ``data``
              value = data[u]
              if isinstance(value, Exception):
                  if on_error == "raise":
                      raise value
                  if on_error == "omit":
                      continue
              out[p] = value
          return out