aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/fsspec/implementations/github.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/fsspec/implementations/github.py')
-rw-r--r--.venv/lib/python3.12/site-packages/fsspec/implementations/github.py239
1 files changed, 239 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/fsspec/implementations/github.py b/.venv/lib/python3.12/site-packages/fsspec/implementations/github.py
new file mode 100644
index 00000000..3650b8eb
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/fsspec/implementations/github.py
@@ -0,0 +1,239 @@
+import requests
+
+import fsspec
+
+from ..spec import AbstractFileSystem
+from ..utils import infer_storage_options
+from .memory import MemoryFile
+
+# TODO: add GIST backend, would be very similar
+
+
+class GithubFileSystem(AbstractFileSystem):
+ """Interface to files in github
+
+ An instance of this class provides the files residing within a remote github
+ repository. You may specify a point in the repos history, by SHA, branch
+ or tag (default is current master).
+
+ Given that code files tend to be small, and that github does not support
+ retrieving partial content, we always fetch whole files.
+
+ When using fsspec.open, allows URIs of the form:
+
+ - "github://path/file", in which case you must specify org, repo and
+ may specify sha in the extra args
+ - 'github://org:repo@/precip/catalog.yml', where the org and repo are
+ part of the URI
+ - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
+
+ ``sha`` can be the full or abbreviated hex of the commit you want to fetch
+ from, or a branch or tag name (so long as it doesn't contain special characters
+ like "/", "?", which would have to be HTTP-encoded).
+
+ For authorised access, you must provide username and token, which can be made
+ at https://github.com/settings/tokens
+ """
+
+ url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
+ rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
+ protocol = "github"
+ timeout = (60, 60) # connect, read timeouts
+
+ def __init__(
+ self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.org = org
+ self.repo = repo
+ if (username is None) ^ (token is None):
+ raise ValueError("Auth required both username and token")
+ self.username = username
+ self.token = token
+ if timeout is not None:
+ self.timeout = timeout
+ if sha is None:
+ # look up default branch (not necessarily "master")
+ u = "https://api.github.com/repos/{org}/{repo}"
+ r = requests.get(
+ u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
+ )
+ r.raise_for_status()
+ sha = r.json()["default_branch"]
+
+ self.root = sha
+ self.ls("")
+
+ @property
+ def kw(self):
+ if self.username:
+ return {"auth": (self.username, self.token)}
+ return {}
+
+ @classmethod
+ def repos(cls, org_or_user, is_org=True):
+ """List repo names for given org or user
+
+ This may become the top level of the FS
+
+ Parameters
+ ----------
+ org_or_user: str
+ Name of the github org or user to query
+ is_org: bool (default True)
+ Whether the name is an organisation (True) or user (False)
+
+ Returns
+ -------
+ List of string
+ """
+ r = requests.get(
+ f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
+ timeout=cls.timeout,
+ )
+ r.raise_for_status()
+ return [repo["name"] for repo in r.json()]
+
+ @property
+ def tags(self):
+ """Names of tags in the repo"""
+ r = requests.get(
+ f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
+ timeout=self.timeout,
+ **self.kw,
+ )
+ r.raise_for_status()
+ return [t["name"] for t in r.json()]
+
+ @property
+ def branches(self):
+ """Names of branches in the repo"""
+ r = requests.get(
+ f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
+ timeout=self.timeout,
+ **self.kw,
+ )
+ r.raise_for_status()
+ return [t["name"] for t in r.json()]
+
+ @property
+ def refs(self):
+ """Named references, tags and branches"""
+ return {"tags": self.tags, "branches": self.branches}
+
+ def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
+ """List files at given path
+
+ Parameters
+ ----------
+ path: str
+ Location to list, relative to repo root
+ detail: bool
+ If True, returns list of dicts, one per file; if False, returns
+ list of full filenames only
+ sha: str (optional)
+ List at the given point in the repo history, branch or tag name or commit
+ SHA
+ _sha: str (optional)
+ List this specific tree object (used internally to descend into trees)
+ """
+ path = self._strip_protocol(path)
+ if path == "":
+ _sha = sha or self.root
+ if _sha is None:
+ parts = path.rstrip("/").split("/")
+ so_far = ""
+ _sha = sha or self.root
+ for part in parts:
+ out = self.ls(so_far, True, sha=sha, _sha=_sha)
+ so_far += "/" + part if so_far else part
+ out = [o for o in out if o["name"] == so_far]
+ if not out:
+ raise FileNotFoundError(path)
+ out = out[0]
+ if out["type"] == "file":
+ if detail:
+ return [out]
+ else:
+ return path
+ _sha = out["sha"]
+ if path not in self.dircache or sha not in [self.root, None]:
+ r = requests.get(
+ self.url.format(org=self.org, repo=self.repo, sha=_sha),
+ timeout=self.timeout,
+ **self.kw,
+ )
+ if r.status_code == 404:
+ raise FileNotFoundError(path)
+ r.raise_for_status()
+ types = {"blob": "file", "tree": "directory"}
+ out = [
+ {
+ "name": path + "/" + f["path"] if path else f["path"],
+ "mode": f["mode"],
+ "type": types[f["type"]],
+ "size": f.get("size", 0),
+ "sha": f["sha"],
+ }
+ for f in r.json()["tree"]
+ if f["type"] in types
+ ]
+ if sha in [self.root, None]:
+ self.dircache[path] = out
+ else:
+ out = self.dircache[path]
+ if detail:
+ return out
+ else:
+ return sorted([f["name"] for f in out])
+
+ def invalidate_cache(self, path=None):
+ self.dircache.clear()
+
+ @classmethod
+ def _strip_protocol(cls, path):
+ opts = infer_storage_options(path)
+ if "username" not in opts:
+ return super()._strip_protocol(path)
+ return opts["path"].lstrip("/")
+
+ @staticmethod
+ def _get_kwargs_from_urls(path):
+ opts = infer_storage_options(path)
+ if "username" not in opts:
+ return {}
+ out = {"org": opts["username"], "repo": opts["password"]}
+ if opts["host"]:
+ out["sha"] = opts["host"]
+ return out
+
+ def _open(
+ self,
+ path,
+ mode="rb",
+ block_size=None,
+ autocommit=True,
+ cache_options=None,
+ sha=None,
+ **kwargs,
+ ):
+ if mode != "rb":
+ raise NotImplementedError
+ url = self.rurl.format(
+ org=self.org, repo=self.repo, path=path, sha=sha or self.root
+ )
+ r = requests.get(url, timeout=self.timeout, **self.kw)
+ if r.status_code == 404:
+ raise FileNotFoundError(path)
+ r.raise_for_status()
+ return MemoryFile(None, None, r.content)
+
+ def cat(self, path, recursive=False, on_error="raise", **kwargs):
+ paths = self.expand_path(path, recursive=recursive)
+ urls = [
+ self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root)
+ for u, sh in paths
+ ]
+ fs = fsspec.filesystem("http")
+ data = fs.cat(urls, on_error="return")
+ return {u: v for ((k, v), u) in zip(data.items(), urls)}