about | summary | refs | log | tree | commit | diff
path: root/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py')
-rw-r--r-- .venv/lib/python3.12/site-packages/fsspec/implementations/tar.py 124
1 files changed, 124 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py b/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py
new file mode 100644
index 00000000..412e5ba4
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py
@@ -0,0 +1,124 @@
+import logging
+import tarfile
+
+import fsspec
+from fsspec.archive import AbstractArchiveFileSystem
+from fsspec.compression import compr
+from fsspec.utils import infer_compression
+
+typemap = {b"0": "file", b"5": "directory"}  # tar member type flag -> entry type name
+
+logger = logging.getLogger("tar")  # module-wide logger, registered under the name "tar"
+
+
+class TarFileSystem(AbstractArchiveFileSystem):
+ """Compressed Tar archives as a file-system (read-only)
+
+ Supports the following formats:
+ tar.gz, tar.bz2, tar.xz
+ """
+
+ root_marker = ""
+ protocol = "tar"
+ cachable = False
+
+ def __init__(
+ self,
+ fo="",
+ index_store=None,
+ target_options=None,
+ target_protocol=None,
+ compression=None,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ target_options = target_options or {}
+
+ if isinstance(fo, str):
+ self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
+ fo = self.of.open() # keep the reference
+
+ # Try to infer compression.
+ if compression is None:
+ name = None
+
+ # Try different ways to get hold of the filename. `fo` might either
+ # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
+ # `fsspec.AbstractFileSystem` instance.
+ try:
+ # Amended io.BufferedReader or similar.
+ # This uses a "protocol extension" where original filenames are
+ # propagated to archive-like filesystems in order to let them
+ # infer the right compression appropriately.
+ if hasattr(fo, "original"):
+ name = fo.original
+
+ # fsspec.LocalFileOpener
+ elif hasattr(fo, "path"):
+ name = fo.path
+
+ # io.BufferedReader
+ elif hasattr(fo, "name"):
+ name = fo.name
+
+ # fsspec.AbstractFileSystem
+ elif hasattr(fo, "info"):
+ name = fo.info()["name"]
+
+ except Exception as ex:
+ logger.warning(
+ f"Unable to determine file name, not inferring compression: {ex}"
+ )
+
+ if name is not None:
+ compression = infer_compression(name)
+ logger.info(f"Inferred compression {compression} from file name {name}")
+
+ if compression is not None:
+ # TODO: tarfile already implements compression with modes like "'r:gz'",
+ # but then would seek to offset in the file work?
+ fo = compr[compression](fo)
+
+ self._fo_ref = fo
+ self.fo = fo # the whole instance is a context
+ self.tar = tarfile.TarFile(fileobj=self.fo)
+ self.dir_cache = None
+
+ self.index_store = index_store
+ self.index = None
+ self._index()
+
+ def _index(self):
+ # TODO: load and set saved index, if exists
+ out = {}
+ for ti in self.tar:
+ info = ti.get_info()
+ info["type"] = typemap.get(info["type"], "file")
+ name = ti.get_info()["name"].rstrip("/")
+ out[name] = (info, ti.offset_data)
+
+ self.index = out
+ # TODO: save index to self.index_store here, if set
+
+ def _get_dirs(self):
+ if self.dir_cache is not None:
+ return
+
+ # This enables ls to get directories as children as well as files
+ self.dir_cache = {
+ dirname: {"name": dirname, "size": 0, "type": "directory"}
+ for dirname in self._all_dirnames(self.tar.getnames())
+ }
+ for member in self.tar.getmembers():
+ info = member.get_info()
+ info["name"] = info["name"].rstrip("/")
+ info["type"] = typemap.get(info["type"], "file")
+ self.dir_cache[info["name"]] = info
+
+ def _open(self, path, mode="rb", **kwargs):
+ if mode != "rb":
+ raise ValueError("Read-only filesystem implementation")
+ details, offset = self.index[path]
+ if details["type"] != "file":
+ raise ValueError("Can only handle regular files")
+ return self.tar.extractfile(path)