diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py | 213 |
1 files changed, 213 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py b/.venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py new file mode 100644 index 00000000..eb6f1453 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py @@ -0,0 +1,213 @@ +from contextlib import contextmanager +from ctypes import ( + CFUNCTYPE, + POINTER, + c_int, + c_longlong, + c_void_p, + cast, + create_string_buffer, +) + +import libarchive +import libarchive.ffi as ffi + +from fsspec import open_files +from fsspec.archive import AbstractArchiveFileSystem +from fsspec.implementations.memory import MemoryFile +from fsspec.utils import DEFAULT_BLOCK_SIZE + +# Libarchive requires seekable files or memory only for certain archive +# types. However, since we read the directory first to cache the contents +# and also allow random access to any file, the file-like object needs +# to be seekable no matter what. + +# Seek call-backs (not provided in the libarchive python wrapper) +SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int) +read_set_seek_callback = ffi.ffi( + "read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int +) +new_api = hasattr(ffi, "NO_OPEN_CB") + + +@contextmanager +def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size): + """Read an archive from a seekable file-like object. + + The `file` object must support the standard `readinto` and 'seek' methods. + """ + buf = create_string_buffer(block_size) + buf_p = cast(buf, c_void_p) + + def read_func(archive_p, context, ptrptr): + # readinto the buffer, returns number of bytes read + length = file.readinto(buf) + # write the address of the buffer into the pointer + ptrptr = cast(ptrptr, POINTER(c_void_p)) + ptrptr[0] = buf_p + # tell libarchive how much data was written into the buffer + return length + + def seek_func(archive_p, context, offset, whence): + file.seek(offset, whence) + # tell libarchvie the current position + return file.tell() + + read_cb = ffi.READ_CALLBACK(read_func) + seek_cb = SEEK_CALLBACK(seek_func) + + if new_api: + open_cb = ffi.NO_OPEN_CB + close_cb = ffi.NO_CLOSE_CB + else: + open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB) + close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB) + + with libarchive.read.new_archive_read(format_name, filter_name) as archive_p: + read_set_seek_callback(archive_p, seek_cb) + ffi.read_open(archive_p, None, open_cb, read_cb, close_cb) + yield libarchive.read.ArchiveRead(archive_p) + + +class LibArchiveFileSystem(AbstractArchiveFileSystem): + """Compressed archives as a file-system (read-only) + + Supports the following formats: + tar, pax , cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar + Microsoft CAB, 7-Zip, WARC + + See the libarchive documentation for further restrictions. + https://www.libarchive.org/ + + Keeps file object open while instance lives. It only works in seekable + file-like objects. In case the filesystem does not support this kind of + file object, it is recommended to cache locally. + + This class is pickleable, but not necessarily thread-safe (depends on the + platform). See libarchive documentation for details. + """ + + root_marker = "" + protocol = "libarchive" + cachable = False + + def __init__( + self, + fo="", + mode="r", + target_protocol=None, + target_options=None, + block_size=DEFAULT_BLOCK_SIZE, + **kwargs, + ): + """ + Parameters + ---------- + fo: str or file-like + Contains ZIP, and must exist. If a str, will fetch file using + :meth:`~fsspec.open_files`, which must return one file exactly. + mode: str + Currently, only 'r' accepted + target_protocol: str (optional) + If ``fo`` is a string, this value can be used to override the + FS protocol inferred from a URL + target_options: dict (optional) + Kwargs passed when instantiating the target FS, if ``fo`` is + a string. + """ + super().__init__(self, **kwargs) + if mode != "r": + raise ValueError("Only read from archive files accepted") + if isinstance(fo, str): + files = open_files(fo, protocol=target_protocol, **(target_options or {})) + if len(files) != 1: + raise ValueError( + f'Path "{fo}" did not resolve to exactly one file: "{files}"' + ) + fo = files[0] + self.of = fo + self.fo = fo.__enter__() # the whole instance is a context + self.block_size = block_size + self.dir_cache = None + + @contextmanager + def _open_archive(self): + self.fo.seek(0) + with custom_reader(self.fo, block_size=self.block_size) as arc: + yield arc + + @classmethod + def _strip_protocol(cls, path): + # file paths are always relative to the archive root + return super()._strip_protocol(path).lstrip("/") + + def _get_dirs(self): + fields = { + "name": "pathname", + "size": "size", + "created": "ctime", + "mode": "mode", + "uid": "uid", + "gid": "gid", + "mtime": "mtime", + } + + if self.dir_cache is not None: + return + + self.dir_cache = {} + list_names = [] + with self._open_archive() as arc: + for entry in arc: + if not entry.isdir and not entry.isfile: + # Skip symbolic links, fifo entries, etc. + continue + self.dir_cache.update( + { + dirname: {"name": dirname, "size": 0, "type": "directory"} + for dirname in self._all_dirnames(set(entry.name)) + } + ) + f = {key: getattr(entry, fields[key]) for key in fields} + f["type"] = "directory" if entry.isdir else "file" + list_names.append(entry.name) + + self.dir_cache[f["name"]] = f + # libarchive does not seem to return an entry for the directories (at least + # not in all formats), so get the directories names from the files names + self.dir_cache.update( + { + dirname: {"name": dirname, "size": 0, "type": "directory"} + for dirname in self._all_dirnames(list_names) + } + ) + + def _open( + self, + path, + mode="rb", + block_size=None, + autocommit=True, + cache_options=None, + **kwargs, + ): + path = self._strip_protocol(path) + if mode != "rb": + raise NotImplementedError + + data = bytes() + with self._open_archive() as arc: + for entry in arc: + if entry.pathname != path: + continue + + if entry.size == 0: + # empty file, so there are no blocks + break + + for block in entry.get_blocks(entry.size): + data = block + break + else: + raise ValueError + return MemoryFile(fs=self, path=path, data=data) |