# coding=utf-8
# Copyright 2024-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# NOTE: this module was reconstructed from a flattened cgit patch page
# (commit 4a52a71956a8d46fcb7294ac71734504bb09bcc2, "two version of R2R are here",
# S. Solomon Darnell, 2025-03-28), which added it as
# `.venv/lib/python3.12/site-packages/huggingface_hub/_local_folder.py`.
"""Contains utilities to handle the `../.cache/huggingface` folder in local directories.

First discussed in https://github.com/huggingface/huggingface_hub/issues/1738 to store
download metadata when downloading files from the hub to a local directory (without
using the cache).

./.cache/huggingface folder structure:
[4.0K]  data
├── [4.0K]  .cache
│   └── [4.0K]  huggingface
│       └── [4.0K]  download
│           ├── [  16]  file.parquet.metadata
│           ├── [  16]  file.txt.metadata
│           └── [4.0K]  folder
│               └── [  16]  file.parquet.metadata
│
├── [6.5G]  file.parquet
├── [1.5K]  file.txt
└── [4.0K]  folder
    └── [  16]  file.parquet


Download metadata file structure:
```
# file.txt.metadata
11c5a3d5811f50298f278a704980280950aedb10
a16a55fda99d2f2e7b69cce5cf93ff4ad3049930
1712656091.123

# file.parquet.metadata
11c5a3d5811f50298f278a704980280950aedb10
7c5d3f4b8b76583b422fcb9189ad6c89d5d97a094541ce8932dce3ecabde1421
1712656091.123
```
"""

import base64
import hashlib
import logging
import os
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple

from .utils import WeakFileLock


logger = logging.getLogger(__name__)


@dataclass
class LocalDownloadFilePaths:
    """
    Paths to the files related to a download process in a local dir.

    Returned by [`get_local_download_paths`].

    Attributes:
        file_path (`Path`):
            Path where the file will be saved.
        lock_path (`Path`):
            Path to the lock file used to ensure atomicity when reading/writing metadata.
        metadata_path (`Path`):
            Path to the metadata file.
    """

    file_path: Path
    lock_path: Path
    metadata_path: Path

    def incomplete_path(self, etag: str) -> Path:
        """Return the path where a file will be temporarily downloaded before being moved to `file_path`.

        The temporary file lives next to the metadata file. Its name is built from a short hash of the
        metadata file name and the given `etag`.
        """
        return self.metadata_path.parent / f"{_short_hash(self.metadata_path.name)}.{etag}.incomplete"


@dataclass(frozen=True)
class LocalUploadFilePaths:
    """
    Paths to the files related to an upload process in a local dir.

    Returned by [`get_local_upload_paths`].

    Attributes:
        path_in_repo (`str`):
            Path of the file in the repo.
        file_path (`Path`):
            Path where the file will be saved.
        lock_path (`Path`):
            Path to the lock file used to ensure atomicity when reading/writing metadata.
        metadata_path (`Path`):
            Path to the metadata file.
    """

    path_in_repo: str
    file_path: Path
    lock_path: Path
    metadata_path: Path


@dataclass
class LocalDownloadFileMetadata:
    """
    Metadata about a file in the local directory related to a download process.

    Attributes:
        filename (`str`):
            Path of the file in the repo.
        commit_hash (`str`):
            Commit hash of the file in the repo.
        etag (`str`):
            ETag of the file in the repo. Used to check if the file has changed.
            For LFS files, this is the sha256 of the file. For regular files, it corresponds to the git hash.
        timestamp (`float`):
            Unix timestamp of when the metadata was saved i.e. when the metadata was accurate.
    """

    filename: str
    commit_hash: str
    etag: str
    timestamp: float


@dataclass
class LocalUploadFileMetadata:
    """
    Metadata about a file in the local directory related to an upload process.

    Attributes:
        size (`int`):
            Size of the file. Always set.
        timestamp (`float`, *optional*):
            Unix timestamp of when the metadata was last saved. Set by [`save`].
        should_ignore (`bool`, *optional*):
            `None` until known.
        sha256 (`str`, *optional*):
            `None` until known.
        upload_mode (`str`, *optional*):
            Either `"regular"` or `"lfs"` once known (the only values accepted when reading the
            metadata back from disk).
        is_uploaded (`bool`):
            Whether the file has been uploaded.
        is_committed (`bool`):
            Whether the file has been committed.
    """

    size: int

    # Default values correspond to "we don't know yet"
    timestamp: Optional[float] = None
    should_ignore: Optional[bool] = None
    sha256: Optional[str] = None
    upload_mode: Optional[str] = None
    is_uploaded: bool = False
    is_committed: bool = False

    def save(self, paths: LocalUploadFilePaths) -> None:
        """Save the metadata to disk and update `self.timestamp`.

        The file always contains exactly 7 lines (timestamp, size, should_ignore, sha256,
        upload_mode, is_uploaded, is_committed). Unknown optional values are written as empty
        lines so that `read_upload_metadata` can map them back to `None`.
        """
        with WeakFileLock(paths.lock_path):
            with paths.metadata_path.open("w") as f:
                new_timestamp = time.time()
                f.write(str(new_timestamp) + "\n")

                f.write(str(self.size))  # never None
                f.write("\n")

                # The newline is written even when the value is unknown: the reader consumes a
                # fixed number of lines and treats an empty one as `None`.
                if self.should_ignore is not None:
                    f.write(str(int(self.should_ignore)))
                f.write("\n")

                if self.sha256 is not None:
                    f.write(self.sha256)
                f.write("\n")

                if self.upload_mode is not None:
                    f.write(self.upload_mode)
                f.write("\n")

                f.write(str(int(self.is_uploaded)) + "\n")
                f.write(str(int(self.is_committed)) + "\n")

            self.timestamp = new_timestamp


def _resolve_local_paths(local_dir: Path, filename: str, subfolder: str) -> Tuple[Path, Path, Path]:
    """Compute `(file_path, lock_path, metadata_path)` for a repo file and create their parent folders.

    Shared implementation of `get_local_download_paths` and `get_local_upload_paths`; `subfolder`
    is the directory under `.cache/huggingface` that holds the metadata (`"download"` or `"upload"`).

    Raises:
        `ValueError`: on Windows, if `filename` contains a `..` path component.
    """
    # filename is the path in the Hub repository (separated by '/')
    # make sure to have a cross platform transcription
    sanitized_filename = os.path.join(*filename.split("/"))
    if os.name == "nt":
        if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
            raise ValueError(
                f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
                " owner to rename this file."
            )
    file_path = local_dir / sanitized_filename
    metadata_path = _huggingface_dir(local_dir) / subfolder / f"{sanitized_filename}.metadata"
    lock_path = metadata_path.with_suffix(".lock")

    # Some Windows versions do not allow for paths longer than 255 characters.
    # In this case, we must specify it as an extended path by using the "\\?\" prefix
    if os.name == "nt":
        if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
            file_path = Path("\\\\?\\" + os.path.abspath(file_path))
            lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
            metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))

    file_path.parent.mkdir(parents=True, exist_ok=True)
    metadata_path.parent.mkdir(parents=True, exist_ok=True)
    return file_path, lock_path, metadata_path


def _discard_invalid_metadata(metadata_path: Path, error: Exception) -> None:
    """Log that a metadata file is unreadable and try to delete it (best effort, never raises)."""
    logger.warning(f"Invalid metadata file {metadata_path}: {error}. Removing it from disk and continue.")
    try:
        metadata_path.unlink()
    except Exception as e:
        logger.warning(f"Could not remove corrupted metadata file {metadata_path}: {e}")


def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFilePaths:
    """Compute paths to the files related to a download process.

    Folders containing the paths are all guaranteed to exist.

    Args:
        local_dir (`Path`):
            Path to the local directory in which files are downloaded.
        filename (`str`):
            Path of the file in the repo.

    Return:
        [`LocalDownloadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path).

    Raises:
        `ValueError`: on Windows, if `filename` contains a `..` path component.
    """
    file_path, lock_path, metadata_path = _resolve_local_paths(local_dir, filename, "download")
    return LocalDownloadFilePaths(file_path=file_path, lock_path=lock_path, metadata_path=metadata_path)


def get_local_upload_paths(local_dir: Path, filename: str) -> LocalUploadFilePaths:
    """Compute paths to the files related to an upload process.

    Folders containing the paths are all guaranteed to exist.

    Args:
        local_dir (`Path`):
            Path to the local directory that is uploaded.
        filename (`str`):
            Path of the file in the repo.

    Return:
        [`LocalUploadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path).

    Raises:
        `ValueError`: on Windows, if `filename` contains a `..` path component.
    """
    file_path, lock_path, metadata_path = _resolve_local_paths(local_dir, filename, "upload")
    return LocalUploadFilePaths(
        path_in_repo=filename, file_path=file_path, lock_path=lock_path, metadata_path=metadata_path
    )


def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDownloadFileMetadata]:
    """Read metadata about a file in the local directory related to a download process.

    Args:
        local_dir (`Path`):
            Path to the local directory in which files are downloaded.
        filename (`str`):
            Path of the file in the repo.

    Return:
        `[LocalDownloadFileMetadata]` or `None`: the metadata if it exists and is still valid, `None`
        otherwise (no metadata file, corrupted metadata file, missing local file, or local file
        modified after the metadata was saved).
    """
    paths = get_local_download_paths(local_dir, filename)
    with WeakFileLock(paths.lock_path):
        if not paths.metadata_path.exists():
            return None

        try:
            with paths.metadata_path.open() as f:
                commit_hash = f.readline().strip()
                etag = f.readline().strip()
                timestamp = float(f.readline().strip())
            metadata = LocalDownloadFileMetadata(
                filename=filename,
                commit_hash=commit_hash,
                etag=etag,
                timestamp=timestamp,
            )
        except Exception as e:
            # remove the metadata file if it is corrupted / not the right format.
            # Return right away: there is no metadata object to validate below.
            _discard_invalid_metadata(paths.metadata_path, e)
            return None

        try:
            # check if the file exists and hasn't been modified since the metadata was saved
            stat = paths.file_path.stat()
        except FileNotFoundError:
            # file does not exist => metadata is outdated
            return None

        # allow 1s difference as stat.st_mtime might not be precise
        if stat.st_mtime - 1 <= metadata.timestamp:
            return metadata
        logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
    return None


def read_upload_metadata(local_dir: Path, filename: str) -> LocalUploadFileMetadata:
    """Read metadata about a file in the local directory related to an upload process.

    TODO: factorize logic with `read_download_metadata`.

    Args:
        local_dir (`Path`):
            Path to the local directory in which files are downloaded.
        filename (`str`):
            Path of the file in the repo.

    Return:
        `[LocalUploadFileMetadata]`: the saved metadata if it exists and is still valid. Otherwise, a
        fresh metadata object in which only the file size is known.

    Raises:
        `FileNotFoundError`: if no valid metadata is available and the local file does not exist
        (its size cannot be computed).
    """
    paths = get_local_upload_paths(local_dir, filename)
    with WeakFileLock(paths.lock_path):
        metadata: Optional[LocalUploadFileMetadata] = None
        if paths.metadata_path.exists():
            try:
                with paths.metadata_path.open() as f:
                    timestamp = float(f.readline().strip())

                    size = int(f.readline().strip())  # never None

                    _should_ignore = f.readline().strip()
                    should_ignore = None if _should_ignore == "" else bool(int(_should_ignore))

                    _sha256 = f.readline().strip()
                    sha256 = None if _sha256 == "" else _sha256

                    _upload_mode = f.readline().strip()
                    upload_mode = None if _upload_mode == "" else _upload_mode
                    if upload_mode not in (None, "regular", "lfs"):
                        raise ValueError(f"Invalid upload mode in metadata {paths.path_in_repo}: {upload_mode}")

                    is_uploaded = bool(int(f.readline().strip()))
                    is_committed = bool(int(f.readline().strip()))

                metadata = LocalUploadFileMetadata(
                    timestamp=timestamp,
                    size=size,
                    should_ignore=should_ignore,
                    sha256=sha256,
                    upload_mode=upload_mode,
                    is_uploaded=is_uploaded,
                    is_committed=is_committed,
                )
            except Exception as e:
                # remove the metadata file if it is corrupted / not the right format.
                # `metadata` stays `None` so that we fall back to the size-only metadata below.
                _discard_invalid_metadata(paths.metadata_path, e)

        if metadata is not None:
            # TODO: can we do better?
            if (
                metadata.timestamp is not None
                and metadata.is_uploaded  # file was uploaded
                and not metadata.is_committed  # but not committed
                and time.time() - metadata.timestamp > 20 * 3600  # and it's been more than 20 hours
            ):  # => we consider it as garbage-collected by S3
                metadata.is_uploaded = False

            # check if the file exists and hasn't been modified since the metadata was saved
            try:
                if metadata.timestamp is not None and paths.file_path.stat().st_mtime <= metadata.timestamp:
                    return metadata
                logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
            except FileNotFoundError:
                # file does not exist => metadata is outdated
                pass

    # empty metadata => we don't know anything except its size
    return LocalUploadFileMetadata(size=paths.file_path.stat().st_size)


def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, etag: str) -> None:
    """Write metadata about a file in the local directory related to a download process.

    The metadata file contains three lines: the commit hash, the etag and the current Unix timestamp.

    Args:
        local_dir (`Path`):
            Path to the local directory in which files are downloaded.
        filename (`str`):
            Path of the file in the repo.
        commit_hash (`str`):
            Commit hash to record for the file.
        etag (`str`):
            ETag to record for the file.
    """
    paths = get_local_download_paths(local_dir, filename)
    with WeakFileLock(paths.lock_path):
        with paths.metadata_path.open("w") as f:
            f.write(f"{commit_hash}\n{etag}\n{time.time()}\n")


def _huggingface_dir(local_dir: Path) -> Path:
    """Return the path to the `.cache/huggingface` directory in a local directory.

    The directory is created if needed, together with a `.gitignore` file ignoring everything in it.
    """
    path = local_dir / ".cache" / "huggingface"
    path.mkdir(exist_ok=True, parents=True)

    # Create a .gitignore file in the .cache/huggingface directory if it doesn't exist.
    # The file is only written when missing, so repeated calls do not overwrite it.
    # Should be thread-safe enough like this.
    gitignore = path / ".gitignore"
    gitignore_lock = path / ".gitignore.lock"
    if not gitignore.exists():
        try:
            with WeakFileLock(gitignore_lock, timeout=0.1):
                gitignore.write_text("*")
        except IndexError:
            # NOTE(review): presumably raised by the lock implementation in some edge case; the
            # write is best-effort either way - confirm before narrowing or removing.
            pass
        except OSError:  # TimeoutError, FileNotFoundError, PermissionError, etc.
            pass
        try:
            gitignore_lock.unlink()
        except OSError:
            pass
    return path


def _short_hash(filename: str) -> str:
    """Return the URL-safe base64 encoding of the SHA-1 digest of `filename`."""
    return base64.urlsafe_b64encode(hashlib.sha1(filename.encode()).digest()).decode()