about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-4a52a71956a8d46fcb7294ac71734504bb09bcc2.tar.gz
two version of R2R are here HEAD master
Diffstat (limited to '.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py')
-rw-r--r--.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py124
1 files changed, 124 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py b/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py
new file mode 100644
index 00000000..412e5ba4
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py
@@ -0,0 +1,124 @@
+import logging
+import tarfile
+
+import fsspec
+from fsspec.archive import AbstractArchiveFileSystem
+from fsspec.compression import compr
+from fsspec.utils import infer_compression
+
+typemap = {b"0": "file", b"5": "directory"}
+
+logger = logging.getLogger("tar")
+
+
+class TarFileSystem(AbstractArchiveFileSystem):
+    """Compressed Tar archives as a file-system (read-only)
+
+    Supports the following formats:
+    tar.gz, tar.bz2, tar.xz
+    """
+
+    root_marker = ""
+    protocol = "tar"
+    cachable = False
+
+    def __init__(
+        self,
+        fo="",
+        index_store=None,
+        target_options=None,
+        target_protocol=None,
+        compression=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        target_options = target_options or {}
+
+        if isinstance(fo, str):
+            self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
+            fo = self.of.open()  # keep the reference
+
+        # Try to infer compression.
+        if compression is None:
+            name = None
+
+            # Try different ways to get hold of the filename. `fo` might either
+            # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
+            # `fsspec.AbstractFileSystem` instance.
+            try:
+                # Amended io.BufferedReader or similar.
+                # This uses a "protocol extension" where original filenames are
+                # propagated to archive-like filesystems in order to let them
+                # infer the right compression appropriately.
+                if hasattr(fo, "original"):
+                    name = fo.original
+
+                # fsspec.LocalFileOpener
+                elif hasattr(fo, "path"):
+                    name = fo.path
+
+                # io.BufferedReader
+                elif hasattr(fo, "name"):
+                    name = fo.name
+
+                # fsspec.AbstractFileSystem
+                elif hasattr(fo, "info"):
+                    name = fo.info()["name"]
+
+            except Exception as ex:
+                logger.warning(
+                    f"Unable to determine file name, not inferring compression: {ex}"
+                )
+
+            if name is not None:
+                compression = infer_compression(name)
+                logger.info(f"Inferred compression {compression} from file name {name}")
+
+        if compression is not None:
+            # TODO: tarfile already implements compression with modes like "'r:gz'",
+            #  but then would seek to offset in the file work?
+            fo = compr[compression](fo)
+
+        self._fo_ref = fo
+        self.fo = fo  # the whole instance is a context
+        self.tar = tarfile.TarFile(fileobj=self.fo)
+        self.dir_cache = None
+
+        self.index_store = index_store
+        self.index = None
+        self._index()
+
+    def _index(self):
+        # TODO: load and set saved index, if exists
+        out = {}
+        for ti in self.tar:
+            info = ti.get_info()
+            info["type"] = typemap.get(info["type"], "file")
+            name = ti.get_info()["name"].rstrip("/")
+            out[name] = (info, ti.offset_data)
+
+        self.index = out
+        # TODO: save index to self.index_store here, if set
+
+    def _get_dirs(self):
+        if self.dir_cache is not None:
+            return
+
+        # This enables ls to get directories as children as well as files
+        self.dir_cache = {
+            dirname: {"name": dirname, "size": 0, "type": "directory"}
+            for dirname in self._all_dirnames(self.tar.getnames())
+        }
+        for member in self.tar.getmembers():
+            info = member.get_info()
+            info["name"] = info["name"].rstrip("/")
+            info["type"] = typemap.get(info["type"], "file")
+            self.dir_cache[info["name"]] = info
+
+    def _open(self, path, mode="rb", **kwargs):
+        if mode != "rb":
+            raise ValueError("Read-only filesystem implementation")
+        details, offset = self.index[path]
+        if details["type"] != "file":
+            raise ValueError("Can only handle regular files")
+        return self.tar.extractfile(path)