aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/huggingface_hub/_local_folder.py
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/huggingface_hub/_local_folder.py
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-master.tar.gz
two version of R2R are hereHEADmaster
Diffstat (limited to '.venv/lib/python3.12/site-packages/huggingface_hub/_local_folder.py')
-rw-r--r--.venv/lib/python3.12/site-packages/huggingface_hub/_local_folder.py432
1 files changed, 432 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/_local_folder.py b/.venv/lib/python3.12/site-packages/huggingface_hub/_local_folder.py
new file mode 100644
index 00000000..264d51c5
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/_local_folder.py
@@ -0,0 +1,432 @@
+# coding=utf-8
+# Copyright 2024-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains utilities to handle the `../.cache/huggingface` folder in local directories.
+
+First discussed in https://github.com/huggingface/huggingface_hub/issues/1738 to store
+download metadata when downloading files from the hub to a local directory (without
+using the cache).
+
+./.cache/huggingface folder structure:
+[4.0K] data
+├── [4.0K] .cache
+│ └── [4.0K] huggingface
+│ └── [4.0K] download
+│ ├── [ 16] file.parquet.metadata
+│ ├── [ 16] file.txt.metadata
+│ └── [4.0K] folder
+│ └── [ 16] file.parquet.metadata
+│
+├── [6.5G] file.parquet
+├── [1.5K] file.txt
+└── [4.0K] folder
+ └── [ 16] file.parquet
+
+
+Download metadata file structure:
+```
+# file.txt.metadata
+11c5a3d5811f50298f278a704980280950aedb10
+a16a55fda99d2f2e7b69cce5cf93ff4ad3049930
+1712656091.123
+
+# file.parquet.metadata
+11c5a3d5811f50298f278a704980280950aedb10
+7c5d3f4b8b76583b422fcb9189ad6c89d5d97a094541ce8932dce3ecabde1421
+1712656091.123
+}
+```
+"""
+
+import base64
+import hashlib
+import logging
+import os
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+from .utils import WeakFileLock
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class LocalDownloadFilePaths:
+ """
+ Paths to the files related to a download process in a local dir.
+
+ Returned by [`get_local_download_paths`].
+
+ Attributes:
+ file_path (`Path`):
+ Path where the file will be saved.
+ lock_path (`Path`):
+ Path to the lock file used to ensure atomicity when reading/writing metadata.
+ metadata_path (`Path`):
+ Path to the metadata file.
+ """
+
+ file_path: Path
+ lock_path: Path
+ metadata_path: Path
+
+ def incomplete_path(self, etag: str) -> Path:
+ """Return the path where a file will be temporarily downloaded before being moved to `file_path`."""
+ return self.metadata_path.parent / f"{_short_hash(self.metadata_path.name)}.{etag}.incomplete"
+
+
+@dataclass(frozen=True)
+class LocalUploadFilePaths:
+ """
+ Paths to the files related to an upload process in a local dir.
+
+ Returned by [`get_local_upload_paths`].
+
+ Attributes:
+ path_in_repo (`str`):
+ Path of the file in the repo.
+ file_path (`Path`):
+ Path where the file will be saved.
+ lock_path (`Path`):
+ Path to the lock file used to ensure atomicity when reading/writing metadata.
+ metadata_path (`Path`):
+ Path to the metadata file.
+ """
+
+ path_in_repo: str
+ file_path: Path
+ lock_path: Path
+ metadata_path: Path
+
+
+@dataclass
+class LocalDownloadFileMetadata:
+ """
+ Metadata about a file in the local directory related to a download process.
+
+ Attributes:
+ filename (`str`):
+ Path of the file in the repo.
+ commit_hash (`str`):
+ Commit hash of the file in the repo.
+ etag (`str`):
+ ETag of the file in the repo. Used to check if the file has changed.
+ For LFS files, this is the sha256 of the file. For regular files, it corresponds to the git hash.
+ timestamp (`int`):
+ Unix timestamp of when the metadata was saved i.e. when the metadata was accurate.
+ """
+
+ filename: str
+ commit_hash: str
+ etag: str
+ timestamp: float
+
+
+@dataclass
+class LocalUploadFileMetadata:
+ """
+ Metadata about a file in the local directory related to an upload process.
+ """
+
+ size: int
+
+ # Default values correspond to "we don't know yet"
+ timestamp: Optional[float] = None
+ should_ignore: Optional[bool] = None
+ sha256: Optional[str] = None
+ upload_mode: Optional[str] = None
+ is_uploaded: bool = False
+ is_committed: bool = False
+
+ def save(self, paths: LocalUploadFilePaths) -> None:
+ """Save the metadata to disk."""
+ with WeakFileLock(paths.lock_path):
+ with paths.metadata_path.open("w") as f:
+ new_timestamp = time.time()
+ f.write(str(new_timestamp) + "\n")
+
+ f.write(str(self.size)) # never None
+ f.write("\n")
+
+ if self.should_ignore is not None:
+ f.write(str(int(self.should_ignore)))
+ f.write("\n")
+
+ if self.sha256 is not None:
+ f.write(self.sha256)
+ f.write("\n")
+
+ if self.upload_mode is not None:
+ f.write(self.upload_mode)
+ f.write("\n")
+
+ f.write(str(int(self.is_uploaded)) + "\n")
+ f.write(str(int(self.is_committed)) + "\n")
+
+ self.timestamp = new_timestamp
+
+
+def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFilePaths:
+ """Compute paths to the files related to a download process.
+
+ Folders containing the paths are all guaranteed to exist.
+
+ Args:
+ local_dir (`Path`):
+ Path to the local directory in which files are downloaded.
+ filename (`str`):
+ Path of the file in the repo.
+
+ Return:
+ [`LocalDownloadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path, incomplete_path).
+ """
+ # filename is the path in the Hub repository (separated by '/')
+ # make sure to have a cross platform transcription
+ sanitized_filename = os.path.join(*filename.split("/"))
+ if os.name == "nt":
+ if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
+ raise ValueError(
+ f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
+ " owner to rename this file."
+ )
+ file_path = local_dir / sanitized_filename
+ metadata_path = _huggingface_dir(local_dir) / "download" / f"{sanitized_filename}.metadata"
+ lock_path = metadata_path.with_suffix(".lock")
+
+ # Some Windows versions do not allow for paths longer than 255 characters.
+ # In this case, we must specify it as an extended path by using the "\\?\" prefix
+ if os.name == "nt":
+ if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
+ file_path = Path("\\\\?\\" + os.path.abspath(file_path))
+ lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
+ metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))
+
+ file_path.parent.mkdir(parents=True, exist_ok=True)
+ metadata_path.parent.mkdir(parents=True, exist_ok=True)
+ return LocalDownloadFilePaths(file_path=file_path, lock_path=lock_path, metadata_path=metadata_path)
+
+
+def get_local_upload_paths(local_dir: Path, filename: str) -> LocalUploadFilePaths:
+ """Compute paths to the files related to an upload process.
+
+ Folders containing the paths are all guaranteed to exist.
+
+ Args:
+ local_dir (`Path`):
+ Path to the local directory that is uploaded.
+ filename (`str`):
+ Path of the file in the repo.
+
+ Return:
+ [`LocalUploadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path).
+ """
+ # filename is the path in the Hub repository (separated by '/')
+ # make sure to have a cross platform transcription
+ sanitized_filename = os.path.join(*filename.split("/"))
+ if os.name == "nt":
+ if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
+ raise ValueError(
+ f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
+ " owner to rename this file."
+ )
+ file_path = local_dir / sanitized_filename
+ metadata_path = _huggingface_dir(local_dir) / "upload" / f"{sanitized_filename}.metadata"
+ lock_path = metadata_path.with_suffix(".lock")
+
+ # Some Windows versions do not allow for paths longer than 255 characters.
+ # In this case, we must specify it as an extended path by using the "\\?\" prefix
+ if os.name == "nt":
+ if not str(local_dir).startswith("\\\\?\\") and len(os.path.abspath(lock_path)) > 255:
+ file_path = Path("\\\\?\\" + os.path.abspath(file_path))
+ lock_path = Path("\\\\?\\" + os.path.abspath(lock_path))
+ metadata_path = Path("\\\\?\\" + os.path.abspath(metadata_path))
+
+ file_path.parent.mkdir(parents=True, exist_ok=True)
+ metadata_path.parent.mkdir(parents=True, exist_ok=True)
+ return LocalUploadFilePaths(
+ path_in_repo=filename, file_path=file_path, lock_path=lock_path, metadata_path=metadata_path
+ )
+
+
+def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDownloadFileMetadata]:
+ """Read metadata about a file in the local directory related to a download process.
+
+ Args:
+ local_dir (`Path`):
+ Path to the local directory in which files are downloaded.
+ filename (`str`):
+ Path of the file in the repo.
+
+ Return:
+ `[LocalDownloadFileMetadata]` or `None`: the metadata if it exists, `None` otherwise.
+ """
+ paths = get_local_download_paths(local_dir, filename)
+ with WeakFileLock(paths.lock_path):
+ if paths.metadata_path.exists():
+ try:
+ with paths.metadata_path.open() as f:
+ commit_hash = f.readline().strip()
+ etag = f.readline().strip()
+ timestamp = float(f.readline().strip())
+ metadata = LocalDownloadFileMetadata(
+ filename=filename,
+ commit_hash=commit_hash,
+ etag=etag,
+ timestamp=timestamp,
+ )
+ except Exception as e:
+ # remove the metadata file if it is corrupted / not the right format
+ logger.warning(
+ f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue."
+ )
+ try:
+ paths.metadata_path.unlink()
+ except Exception as e:
+ logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")
+
+ try:
+ # check if the file exists and hasn't been modified since the metadata was saved
+ stat = paths.file_path.stat()
+ if (
+ stat.st_mtime - 1 <= metadata.timestamp
+ ): # allow 1s difference as stat.st_mtime might not be precise
+ return metadata
+ logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
+ except FileNotFoundError:
+ # file does not exist => metadata is outdated
+ return None
+ return None
+
+
+def read_upload_metadata(local_dir: Path, filename: str) -> LocalUploadFileMetadata:
+ """Read metadata about a file in the local directory related to an upload process.
+
+ TODO: factorize logic with `read_download_metadata`.
+
+ Args:
+ local_dir (`Path`):
+ Path to the local directory in which files are downloaded.
+ filename (`str`):
+ Path of the file in the repo.
+
+ Return:
+ `[LocalUploadFileMetadata]` or `None`: the metadata if it exists, `None` otherwise.
+ """
+ paths = get_local_upload_paths(local_dir, filename)
+ with WeakFileLock(paths.lock_path):
+ if paths.metadata_path.exists():
+ try:
+ with paths.metadata_path.open() as f:
+ timestamp = float(f.readline().strip())
+
+ size = int(f.readline().strip()) # never None
+
+ _should_ignore = f.readline().strip()
+ should_ignore = None if _should_ignore == "" else bool(int(_should_ignore))
+
+ _sha256 = f.readline().strip()
+ sha256 = None if _sha256 == "" else _sha256
+
+ _upload_mode = f.readline().strip()
+ upload_mode = None if _upload_mode == "" else _upload_mode
+ if upload_mode not in (None, "regular", "lfs"):
+ raise ValueError(f"Invalid upload mode in metadata {paths.path_in_repo}: {upload_mode}")
+
+ is_uploaded = bool(int(f.readline().strip()))
+ is_committed = bool(int(f.readline().strip()))
+
+ metadata = LocalUploadFileMetadata(
+ timestamp=timestamp,
+ size=size,
+ should_ignore=should_ignore,
+ sha256=sha256,
+ upload_mode=upload_mode,
+ is_uploaded=is_uploaded,
+ is_committed=is_committed,
+ )
+ except Exception as e:
+ # remove the metadata file if it is corrupted / not the right format
+ logger.warning(
+ f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue."
+ )
+ try:
+ paths.metadata_path.unlink()
+ except Exception as e:
+ logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")
+
+ # TODO: can we do better?
+ if (
+ metadata.timestamp is not None
+ and metadata.is_uploaded # file was uploaded
+ and not metadata.is_committed # but not committed
+ and time.time() - metadata.timestamp > 20 * 3600 # and it's been more than 20 hours
+ ): # => we consider it as garbage-collected by S3
+ metadata.is_uploaded = False
+
+ # check if the file exists and hasn't been modified since the metadata was saved
+ try:
+ if metadata.timestamp is not None and paths.file_path.stat().st_mtime <= metadata.timestamp:
+ return metadata
+ logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
+ except FileNotFoundError:
+ # file does not exist => metadata is outdated
+ pass
+
+ # empty metadata => we don't know anything expect its size
+ return LocalUploadFileMetadata(size=paths.file_path.stat().st_size)
+
+
+def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, etag: str) -> None:
+ """Write metadata about a file in the local directory related to a download process.
+
+ Args:
+ local_dir (`Path`):
+ Path to the local directory in which files are downloaded.
+ """
+ paths = get_local_download_paths(local_dir, filename)
+ with WeakFileLock(paths.lock_path):
+ with paths.metadata_path.open("w") as f:
+ f.write(f"{commit_hash}\n{etag}\n{time.time()}\n")
+
+
+def _huggingface_dir(local_dir: Path) -> Path:
+ """Return the path to the `.cache/huggingface` directory in a local directory."""
+ # Wrap in lru_cache to avoid overwriting the .gitignore file if called multiple times
+ path = local_dir / ".cache" / "huggingface"
+ path.mkdir(exist_ok=True, parents=True)
+
+ # Create a .gitignore file in the .cache/huggingface directory if it doesn't exist
+ # Should be thread-safe enough like this.
+ gitignore = path / ".gitignore"
+ gitignore_lock = path / ".gitignore.lock"
+ if not gitignore.exists():
+ try:
+ with WeakFileLock(gitignore_lock, timeout=0.1):
+ gitignore.write_text("*")
+ except IndexError:
+ pass
+ except OSError: # TimeoutError, FileNotFoundError, PermissionError, etc.
+ pass
+ try:
+ gitignore_lock.unlink()
+ except OSError:
+ pass
+ return path
+
+
+def _short_hash(filename: str) -> str:
+ return base64.urlsafe_b64encode(hashlib.sha1(filename.encode()).digest()).decode()