path: root/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_manager.py
author    S. Solomon Darnell    2025-03-28 21:52:21 -0500
committer S. Solomon Darnell    2025-03-28 21:52:21 -0500
commit    4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree      ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_manager.py
parent    cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download  gn-ai-master.tar.gz
two version of R2R are here (HEAD, master)
Diffstat (limited to '.venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_manager.py')
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_manager.py  896
1 file changed, 896 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_manager.py b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_manager.py
new file mode 100644
index 00000000..21469c97
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_manager.py
@@ -0,0 +1,896 @@
+# coding=utf-8
+# Copyright 2022-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains utilities to manage the HF cache directory."""
+
+import os
+import shutil
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, FrozenSet, List, Literal, Optional, Set, Union
+
+from huggingface_hub.errors import CacheNotFound, CorruptedCacheException
+
+from ..commands._cli_utils import tabulate
+from ..constants import HF_HUB_CACHE
+from . import logging
+
+
+logger = logging.get_logger(__name__)
+
+REPO_TYPE_T = Literal["model", "dataset", "space"]
+
+# List of OS-created helper files that need to be ignored
+FILES_TO_IGNORE = [".DS_Store"]
+
+
+@dataclass(frozen=True)
+class CachedFileInfo:
+ """Frozen data structure holding information about a single cached file.
+
+ Args:
+ file_name (`str`):
+ Name of the file. Example: `config.json`.
+ file_path (`Path`):
+ Path of the file in the `snapshots` directory. The file path is a symlink
+ referring to a blob in the `blobs` folder.
+ blob_path (`Path`):
+ Path of the blob file. This is equivalent to `file_path.resolve()`.
+ size_on_disk (`int`):
+ Size of the blob file in bytes.
+ blob_last_accessed (`float`):
+ Timestamp of the last time the blob file has been accessed (from any
+ revision).
+ blob_last_modified (`float`):
+ Timestamp of the last time the blob file has been modified/created.
+
+ <Tip warning={true}>
+
+    The reliability of `blob_last_accessed` and `blob_last_modified` can depend on
+    the OS you are using. See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result)
+    for more details.
+
+ </Tip>
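+
+    Example (an illustrative sketch; the repo scanned and the values shown are
+    placeholders, not guaranteed outputs):
+
+    ```py
+    >>> from huggingface_hub import scan_cache_dir
+    >>> repo = next(iter(scan_cache_dir().repos))
+    >>> file = next(iter(next(iter(repo.revisions)).files))
+    >>> file.file_name
+    'config.json'
+    >>> file.size_on_disk_str
+    '1.2K'
+    >>> file.blob_last_modified_str
+    '2 weeks ago'
+    ```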
+ """
+
+ file_name: str
+ file_path: Path
+ blob_path: Path
+ size_on_disk: int
+
+ blob_last_accessed: float
+ blob_last_modified: float
+
+ @property
+ def blob_last_accessed_str(self) -> str:
+ """
+ (property) Timestamp of the last time the blob file has been accessed (from any
+ revision), returned as a human-readable string.
+
+ Example: "2 weeks ago".
+ """
+ return _format_timesince(self.blob_last_accessed)
+
+ @property
+ def blob_last_modified_str(self) -> str:
+ """
+ (property) Timestamp of the last time the blob file has been modified, returned
+ as a human-readable string.
+
+ Example: "2 weeks ago".
+ """
+ return _format_timesince(self.blob_last_modified)
+
+ @property
+ def size_on_disk_str(self) -> str:
+ """
+ (property) Size of the blob file as a human-readable string.
+
+ Example: "42.2K".
+ """
+ return _format_size(self.size_on_disk)
+
+
+@dataclass(frozen=True)
+class CachedRevisionInfo:
+ """Frozen data structure holding information about a revision.
+
+    A revision corresponds to a folder in the `snapshots` folder and has the same
+    tree structure as the repo on the Hub, but contains only symlinks. A revision
+    can either be referenced by 1 or more `refs` or be "detached" (no refs).
+
+ Args:
+ commit_hash (`str`):
+ Hash of the revision (unique).
+ Example: `"9338f7b671827df886678df2bdd7cc7b4f36dffd"`.
+ snapshot_path (`Path`):
+            Path to the revision directory in the `snapshots` folder. It contains
+            the same tree structure as the repo on the Hub.
+        files (`FrozenSet[CachedFileInfo]`):
+ Set of [`~CachedFileInfo`] describing all files contained in the snapshot.
+ refs (`FrozenSet[str]`):
+ Set of `refs` pointing to this revision. If the revision has no `refs`, it
+ is considered detached.
+ Example: `{"main", "2.4.0"}` or `{"refs/pr/1"}`.
+ size_on_disk (`int`):
+ Sum of the blob file sizes that are symlink-ed by the revision.
+ last_modified (`float`):
+ Timestamp of the last time the revision has been created/modified.
+
+ <Tip warning={true}>
+
+ `last_accessed` cannot be determined correctly on a single revision as blob files
+ are shared across revisions.
+
+ </Tip>
+
+ <Tip warning={true}>
+
+ `size_on_disk` is not necessarily the sum of all file sizes because of possible
+ duplicated files. Besides, only blobs are taken into account, not the (negligible)
+ size of folders and symlinks.
+
+ </Tip>
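+
+    Example (an illustrative sketch; the hash, refs and sizes shown are placeholder
+    values):
+
+    ```py
+    >>> from huggingface_hub import scan_cache_dir
+    >>> revision = next(iter(next(iter(scan_cache_dir().repos)).revisions))
+    >>> revision.commit_hash
+    '9338f7b671827df886678df2bdd7cc7b4f36dffd'
+    >>> revision.refs
+    frozenset({'main'})
+    >>> revision.nb_files, revision.size_on_disk_str
+    (5, '2.7M')
+    ```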
+ """
+
+ commit_hash: str
+ snapshot_path: Path
+ size_on_disk: int
+ files: FrozenSet[CachedFileInfo]
+ refs: FrozenSet[str]
+
+ last_modified: float
+
+ @property
+ def last_modified_str(self) -> str:
+ """
+ (property) Timestamp of the last time the revision has been modified, returned
+ as a human-readable string.
+
+ Example: "2 weeks ago".
+ """
+ return _format_timesince(self.last_modified)
+
+ @property
+ def size_on_disk_str(self) -> str:
+ """
+ (property) Sum of the blob file sizes as a human-readable string.
+
+ Example: "42.2K".
+ """
+ return _format_size(self.size_on_disk)
+
+ @property
+ def nb_files(self) -> int:
+ """
+ (property) Total number of files in the revision.
+ """
+ return len(self.files)
+
+
+@dataclass(frozen=True)
+class CachedRepoInfo:
+ """Frozen data structure holding information about a cached repository.
+
+ Args:
+ repo_id (`str`):
+ Repo id of the repo on the Hub. Example: `"google/fleurs"`.
+ repo_type (`Literal["dataset", "model", "space"]`):
+ Type of the cached repo.
+ repo_path (`Path`):
+ Local path to the cached repo.
+ size_on_disk (`int`):
+ Sum of the blob file sizes in the cached repo.
+ nb_files (`int`):
+ Total number of blob files in the cached repo.
+ revisions (`FrozenSet[CachedRevisionInfo]`):
+ Set of [`~CachedRevisionInfo`] describing all revisions cached in the repo.
+ last_accessed (`float`):
+ Timestamp of the last time a blob file of the repo has been accessed.
+ last_modified (`float`):
+ Timestamp of the last time a blob file of the repo has been modified/created.
+
+ <Tip warning={true}>
+
+ `size_on_disk` is not necessarily the sum of all revisions sizes because of
+ duplicated files. Besides, only blobs are taken into account, not the (negligible)
+ size of folders and symlinks.
+
+ </Tip>
+
+ <Tip warning={true}>
+
+    The reliability of `last_accessed` and `last_modified` can depend on the OS you
+    are using. See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result)
+    for more details.
+
+ </Tip>
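+
+    Example (an illustrative sketch; the repo id and values shown are placeholders):
+
+    ```py
+    >>> from huggingface_hub import scan_cache_dir
+    >>> repo = next(iter(scan_cache_dir().repos))
+    >>> repo.repo_id, repo.repo_type
+    ('google/fleurs', 'dataset')
+    >>> repo.size_on_disk_str, repo.nb_files
+    ('64.9M', 6)
+    >>> repo.refs["main"].commit_hash  # map each ref name to its cached revision
+    '9338f7b671827df886678df2bdd7cc7b4f36dffd'
+    ```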
+ """
+
+ repo_id: str
+ repo_type: REPO_TYPE_T
+ repo_path: Path
+ size_on_disk: int
+ nb_files: int
+ revisions: FrozenSet[CachedRevisionInfo]
+
+ last_accessed: float
+ last_modified: float
+
+ @property
+ def last_accessed_str(self) -> str:
+ """
+ (property) Last time a blob file of the repo has been accessed, returned as a
+ human-readable string.
+
+ Example: "2 weeks ago".
+ """
+ return _format_timesince(self.last_accessed)
+
+ @property
+ def last_modified_str(self) -> str:
+ """
+ (property) Last time a blob file of the repo has been modified, returned as a
+ human-readable string.
+
+ Example: "2 weeks ago".
+ """
+ return _format_timesince(self.last_modified)
+
+ @property
+ def size_on_disk_str(self) -> str:
+ """
+ (property) Sum of the blob file sizes as a human-readable string.
+
+ Example: "42.2K".
+ """
+ return _format_size(self.size_on_disk)
+
+ @property
+ def refs(self) -> Dict[str, CachedRevisionInfo]:
+ """
+ (property) Mapping between `refs` and revision data structures.
+ """
+ return {ref: revision for revision in self.revisions for ref in revision.refs}
+
+
+@dataclass(frozen=True)
+class DeleteCacheStrategy:
+ """Frozen data structure holding the strategy to delete cached revisions.
+
+ This object is not meant to be instantiated programmatically but to be returned by
+ [`~utils.HFCacheInfo.delete_revisions`]. See documentation for usage example.
+
+ Args:
+        expected_freed_size (`int`):
+ Expected freed size once strategy is executed.
+ blobs (`FrozenSet[Path]`):
+ Set of blob file paths to be deleted.
+ refs (`FrozenSet[Path]`):
+ Set of reference file paths to be deleted.
+ repos (`FrozenSet[Path]`):
+ Set of entire repo paths to be deleted.
+ snapshots (`FrozenSet[Path]`):
+ Set of snapshots to be deleted (directory of symlinks).
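+
+    Example (an illustrative dry run; the revision hash and the counts shown are
+    placeholders):
+
+    ```py
+    >>> from huggingface_hub import scan_cache_dir
+    >>> strategy = scan_cache_dir().delete_revisions("81fd1d6e7847c99f5862c9fb81387956d99ec7aa")
+    >>> strategy.expected_freed_size_str  # inspect before deleting anything
+    '7.9K'
+    >>> len(strategy.blobs), len(strategy.snapshots), len(strategy.refs), len(strategy.repos)
+    (2, 1, 1, 0)
+    >>> strategy.execute()  # irreversible
+    ```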
+ """
+
+ expected_freed_size: int
+ blobs: FrozenSet[Path]
+ refs: FrozenSet[Path]
+ repos: FrozenSet[Path]
+ snapshots: FrozenSet[Path]
+
+ @property
+ def expected_freed_size_str(self) -> str:
+ """
+ (property) Expected size that will be freed as a human-readable string.
+
+ Example: "42.2K".
+ """
+ return _format_size(self.expected_freed_size)
+
+ def execute(self) -> None:
+ """Execute the defined strategy.
+
+ <Tip warning={true}>
+
+ If this method is interrupted, the cache might get corrupted. Deletion order is
+ implemented so that references and symlinks are deleted before the actual blob
+ files.
+
+ </Tip>
+
+ <Tip warning={true}>
+
+ This method is irreversible. If executed, cached files are erased and must be
+ downloaded again.
+
+ </Tip>
+ """
+        # Deletion order matters. Blobs are deleted last so that the user can't end
+        # up in a state where a `ref` refers to a missing snapshot or a snapshot
+        # symlink refers to a deleted blob.
+
+ # Delete entire repos
+ for path in self.repos:
+ _try_delete_path(path, path_type="repo")
+
+ # Delete snapshot directories
+ for path in self.snapshots:
+ _try_delete_path(path, path_type="snapshot")
+
+ # Delete refs files
+ for path in self.refs:
+ _try_delete_path(path, path_type="ref")
+
+ # Delete blob files
+ for path in self.blobs:
+ _try_delete_path(path, path_type="blob")
+
+ logger.info(f"Cache deletion done. Saved {self.expected_freed_size_str}.")
+
+
+@dataclass(frozen=True)
+class HFCacheInfo:
+ """Frozen data structure holding information about the entire cache-system.
+
+ This data structure is returned by [`scan_cache_dir`] and is immutable.
+
+ Args:
+ size_on_disk (`int`):
+ Sum of all valid repo sizes in the cache-system.
+ repos (`FrozenSet[CachedRepoInfo]`):
+ Set of [`~CachedRepoInfo`] describing all valid cached repos found on the
+ cache-system while scanning.
+ warnings (`List[CorruptedCacheException]`):
+ List of [`~CorruptedCacheException`] that occurred while scanning the cache.
+            Those exceptions are captured so that the scan can continue. Corrupted
+            repos are skipped during the scan.
+
+ <Tip warning={true}>
+
+    Here `size_on_disk` is equal to the sum of all repo sizes (only blobs). However,
+    if some cached repos are corrupted, their sizes are not taken into account.
+
+ </Tip>
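+
+    Example (an illustrative sketch; the sizes, counts and warning shown are
+    placeholder values):
+
+    ```py
+    >>> from huggingface_hub import scan_cache_dir
+    >>> hf_cache_info = scan_cache_dir()
+    >>> hf_cache_info.size_on_disk_str
+    '3.4G'
+    >>> len(hf_cache_info.repos)
+    6
+    >>> for warning in hf_cache_info.warnings:  # corrupted repos skipped while scanning
+    ...     print(warning)
+    Snapshots dir doesn't exist in cached repo: ...
+    ```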
+ """
+
+ size_on_disk: int
+ repos: FrozenSet[CachedRepoInfo]
+ warnings: List[CorruptedCacheException]
+
+ @property
+ def size_on_disk_str(self) -> str:
+ """
+ (property) Sum of all valid repo sizes in the cache-system as a human-readable
+ string.
+
+ Example: "42.2K".
+ """
+ return _format_size(self.size_on_disk)
+
+ def delete_revisions(self, *revisions: str) -> DeleteCacheStrategy:
+ """Prepare the strategy to delete one or more revisions cached locally.
+
+        Input revisions can be any revision hash. If a revision hash is not found in the
+        local cache, a warning is logged but no error is raised. Revisions can be from
+        different cached repos since hashes are unique across repos.
+
+ Examples:
+ ```py
+ >>> from huggingface_hub import scan_cache_dir
+ >>> cache_info = scan_cache_dir()
+ >>> delete_strategy = cache_info.delete_revisions(
+ ... "81fd1d6e7847c99f5862c9fb81387956d99ec7aa"
+ ... )
+ >>> print(f"Will free {delete_strategy.expected_freed_size_str}.")
+ Will free 7.9K.
+ >>> delete_strategy.execute()
+ Cache deletion done. Saved 7.9K.
+ ```
+
+ ```py
+ >>> from huggingface_hub import scan_cache_dir
+ >>> scan_cache_dir().delete_revisions(
+ ... "81fd1d6e7847c99f5862c9fb81387956d99ec7aa",
+ ... "e2983b237dccf3ab4937c97fa717319a9ca1a96d",
+ ... "6c0e6080953db56375760c0471a8c5f2929baf11",
+ ... ).execute()
+ Cache deletion done. Saved 8.6G.
+ ```
+
+ <Tip warning={true}>
+
+ `delete_revisions` returns a [`~utils.DeleteCacheStrategy`] object that needs to
+ be executed. The [`~utils.DeleteCacheStrategy`] is not meant to be modified but
+        allows a dry run before actually executing the deletion.
+
+ </Tip>
+ """
+ hashes_to_delete: Set[str] = set(revisions)
+
+ repos_with_revisions: Dict[CachedRepoInfo, Set[CachedRevisionInfo]] = defaultdict(set)
+
+ for repo in self.repos:
+ for revision in repo.revisions:
+ if revision.commit_hash in hashes_to_delete:
+ repos_with_revisions[repo].add(revision)
+ hashes_to_delete.remove(revision.commit_hash)
+
+ if len(hashes_to_delete) > 0:
+ logger.warning(f"Revision(s) not found - cannot delete them: {', '.join(hashes_to_delete)}")
+
+ delete_strategy_blobs: Set[Path] = set()
+ delete_strategy_refs: Set[Path] = set()
+ delete_strategy_repos: Set[Path] = set()
+ delete_strategy_snapshots: Set[Path] = set()
+ delete_strategy_expected_freed_size = 0
+
+ for affected_repo, revisions_to_delete in repos_with_revisions.items():
+ other_revisions = affected_repo.revisions - revisions_to_delete
+
+ # If no other revisions, it means all revisions are deleted
+ # -> delete the entire cached repo
+ if len(other_revisions) == 0:
+ delete_strategy_repos.add(affected_repo.repo_path)
+ delete_strategy_expected_freed_size += affected_repo.size_on_disk
+ continue
+
+ # Some revisions of the repo will be deleted but not all. We need to filter
+ # which blob files will not be linked anymore.
+ for revision_to_delete in revisions_to_delete:
+ # Snapshot dir
+ delete_strategy_snapshots.add(revision_to_delete.snapshot_path)
+
+ # Refs dir
+ for ref in revision_to_delete.refs:
+ delete_strategy_refs.add(affected_repo.repo_path / "refs" / ref)
+
+ # Blobs dir
+ for file in revision_to_delete.files:
+ if file.blob_path not in delete_strategy_blobs:
+ is_file_alone = True
+ for revision in other_revisions:
+ for rev_file in revision.files:
+ if file.blob_path == rev_file.blob_path:
+ is_file_alone = False
+ break
+ if not is_file_alone:
+ break
+
+ # Blob file not referenced by remaining revisions -> delete
+ if is_file_alone:
+ delete_strategy_blobs.add(file.blob_path)
+ delete_strategy_expected_freed_size += file.size_on_disk
+
+ # Return the strategy instead of executing it.
+ return DeleteCacheStrategy(
+ blobs=frozenset(delete_strategy_blobs),
+ refs=frozenset(delete_strategy_refs),
+ repos=frozenset(delete_strategy_repos),
+ snapshots=frozenset(delete_strategy_snapshots),
+ expected_freed_size=delete_strategy_expected_freed_size,
+ )
+
+ def export_as_table(self, *, verbosity: int = 0) -> str:
+ """Generate a table from the [`HFCacheInfo`] object.
+
+ Pass `verbosity=0` to get a table with a single row per repo, with columns
+ "repo_id", "repo_type", "size_on_disk", "nb_files", "last_accessed", "last_modified", "refs", "local_path".
+
+ Pass `verbosity=1` to get a table with a row per repo and revision (thus multiple rows can appear for a single repo), with columns
+ "repo_id", "repo_type", "revision", "size_on_disk", "nb_files", "last_modified", "refs", "local_path".
+
+ Example:
+ ```py
+ >>> from huggingface_hub.utils import scan_cache_dir
+
+ >>> hf_cache_info = scan_cache_dir()
+ HFCacheInfo(...)
+
+ >>> print(hf_cache_info.export_as_table())
+ REPO ID REPO TYPE SIZE ON DISK NB FILES LAST_ACCESSED LAST_MODIFIED REFS LOCAL PATH
+ --------------------------------------------------- --------- ------------ -------- ------------- ------------- ---- --------------------------------------------------------------------------------------------------
+ roberta-base model 2.7M 5 1 day ago 1 week ago main ~/.cache/huggingface/hub/models--roberta-base
+ suno/bark model 8.8K 1 1 week ago 1 week ago main ~/.cache/huggingface/hub/models--suno--bark
+ t5-base model 893.8M 4 4 days ago 7 months ago main ~/.cache/huggingface/hub/models--t5-base
+ t5-large model 3.0G 4 5 weeks ago 5 months ago main ~/.cache/huggingface/hub/models--t5-large
+
+ >>> print(hf_cache_info.export_as_table(verbosity=1))
+ REPO ID REPO TYPE REVISION SIZE ON DISK NB FILES LAST_MODIFIED REFS LOCAL PATH
+ --------------------------------------------------- --------- ---------------------------------------- ------------ -------- ------------- ---- -----------------------------------------------------------------------------------------------------------------------------------------------------
+ roberta-base model e2da8e2f811d1448a5b465c236feacd80ffbac7b 2.7M 5 1 week ago main ~/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b
+ suno/bark model 70a8a7d34168586dc5d028fa9666aceade177992 8.8K 1 1 week ago main ~/.cache/huggingface/hub/models--suno--bark/snapshots/70a8a7d34168586dc5d028fa9666aceade177992
+ t5-base model a9723ea7f1b39c1eae772870f3b547bf6ef7e6c1 893.8M 4 7 months ago main ~/.cache/huggingface/hub/models--t5-base/snapshots/a9723ea7f1b39c1eae772870f3b547bf6ef7e6c1
+ t5-large model 150ebc2c4b72291e770f58e6057481c8d2ed331a 3.0G 4 5 months ago main ~/.cache/huggingface/hub/models--t5-large/snapshots/150ebc2c4b72291e770f58e6057481c8d2ed331a
+ ```
+
+ Args:
+ verbosity (`int`, *optional*):
+ The verbosity level. Defaults to 0.
+
+ Returns:
+ `str`: The table as a string.
+ """
+ if verbosity == 0:
+ return tabulate(
+ rows=[
+ [
+ repo.repo_id,
+ repo.repo_type,
+ "{:>12}".format(repo.size_on_disk_str),
+ repo.nb_files,
+ repo.last_accessed_str,
+ repo.last_modified_str,
+ ", ".join(sorted(repo.refs)),
+ str(repo.repo_path),
+ ]
+ for repo in sorted(self.repos, key=lambda repo: repo.repo_path)
+ ],
+ headers=[
+ "REPO ID",
+ "REPO TYPE",
+ "SIZE ON DISK",
+ "NB FILES",
+ "LAST_ACCESSED",
+ "LAST_MODIFIED",
+ "REFS",
+ "LOCAL PATH",
+ ],
+ )
+ else:
+ return tabulate(
+ rows=[
+ [
+ repo.repo_id,
+ repo.repo_type,
+ revision.commit_hash,
+ "{:>12}".format(revision.size_on_disk_str),
+ revision.nb_files,
+ revision.last_modified_str,
+ ", ".join(sorted(revision.refs)),
+ str(revision.snapshot_path),
+ ]
+ for repo in sorted(self.repos, key=lambda repo: repo.repo_path)
+ for revision in sorted(repo.revisions, key=lambda revision: revision.commit_hash)
+ ],
+ headers=[
+ "REPO ID",
+ "REPO TYPE",
+ "REVISION",
+ "SIZE ON DISK",
+ "NB FILES",
+ "LAST_MODIFIED",
+ "REFS",
+ "LOCAL PATH",
+ ],
+ )
+
+
+def scan_cache_dir(cache_dir: Optional[Union[str, Path]] = None) -> HFCacheInfo:
+ """Scan the entire HF cache-system and return a [`~HFCacheInfo`] structure.
+
+ Use `scan_cache_dir` in order to programmatically scan your cache-system. The cache
+ will be scanned repo by repo. If a repo is corrupted, a [`~CorruptedCacheException`]
+ will be thrown internally but captured and returned in the [`~HFCacheInfo`]
+ structure. Only valid repos get a proper report.
+
+ ```py
+ >>> from huggingface_hub import scan_cache_dir
+
+ >>> hf_cache_info = scan_cache_dir()
+ HFCacheInfo(
+ size_on_disk=3398085269,
+ repos=frozenset({
+ CachedRepoInfo(
+ repo_id='t5-small',
+ repo_type='model',
+ repo_path=PosixPath(...),
+ size_on_disk=970726914,
+ nb_files=11,
+ revisions=frozenset({
+ CachedRevisionInfo(
+ commit_hash='d78aea13fa7ecd06c29e3e46195d6341255065d5',
+ size_on_disk=970726339,
+ snapshot_path=PosixPath(...),
+ files=frozenset({
+ CachedFileInfo(
+ file_name='config.json',
+                                size_on_disk=1197,
+ file_path=PosixPath(...),
+ blob_path=PosixPath(...),
+ ),
+ CachedFileInfo(...),
+ ...
+ }),
+ ),
+ CachedRevisionInfo(...),
+ ...
+ }),
+ ),
+ CachedRepoInfo(...),
+ ...
+ }),
+ warnings=[
+ CorruptedCacheException("Snapshots dir doesn't exist in cached repo: ..."),
+ CorruptedCacheException(...),
+ ...
+ ],
+ )
+ ```
+
+ You can also print a detailed report directly from the `huggingface-cli` using:
+ ```text
+ > huggingface-cli scan-cache
+ REPO ID REPO TYPE SIZE ON DISK NB FILES REFS LOCAL PATH
+ --------------------------- --------- ------------ -------- ------------------- -------------------------------------------------------------------------
+ glue dataset 116.3K 15 1.17.0, main, 2.4.0 /Users/lucain/.cache/huggingface/hub/datasets--glue
+ google/fleurs dataset 64.9M 6 main, refs/pr/1 /Users/lucain/.cache/huggingface/hub/datasets--google--fleurs
+ Jean-Baptiste/camembert-ner model 441.0M 7 main /Users/lucain/.cache/huggingface/hub/models--Jean-Baptiste--camembert-ner
+ bert-base-cased model 1.9G 13 main /Users/lucain/.cache/huggingface/hub/models--bert-base-cased
+ t5-base model 10.1K 3 main /Users/lucain/.cache/huggingface/hub/models--t5-base
+ t5-small model 970.7M 11 refs/pr/1, main /Users/lucain/.cache/huggingface/hub/models--t5-small
+
+ Done in 0.0s. Scanned 6 repo(s) for a total of 3.4G.
+ Got 1 warning(s) while scanning. Use -vvv to print details.
+ ```
+
+ Args:
+        cache_dir (`str` or `Path`, *optional*):
+            Cache directory to scan. Defaults to the default HF cache directory.
+
+ <Tip warning={true}>
+
+ Raises:
+
+ `CacheNotFound`
+ If the cache directory does not exist.
+
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
+ If the cache directory is a file, instead of a directory.
+
+ </Tip>
+
+ Returns: a [`~HFCacheInfo`] object.
+ """
+ if cache_dir is None:
+ cache_dir = HF_HUB_CACHE
+
+ cache_dir = Path(cache_dir).expanduser().resolve()
+ if not cache_dir.exists():
+ raise CacheNotFound(
+ f"Cache directory not found: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable.",
+ cache_dir=cache_dir,
+ )
+
+ if cache_dir.is_file():
+ raise ValueError(
+ f"Scan cache expects a directory but found a file: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable."
+ )
+
+ repos: Set[CachedRepoInfo] = set()
+ warnings: List[CorruptedCacheException] = []
+ for repo_path in cache_dir.iterdir():
+ if repo_path.name == ".locks": # skip './.locks/' folder
+ continue
+ try:
+ repos.add(_scan_cached_repo(repo_path))
+ except CorruptedCacheException as e:
+ warnings.append(e)
+
+ return HFCacheInfo(
+ repos=frozenset(repos),
+ size_on_disk=sum(repo.size_on_disk for repo in repos),
+ warnings=warnings,
+ )
+
+
+def _scan_cached_repo(repo_path: Path) -> CachedRepoInfo:
+ """Scan a single cache repo and return information about it.
+
+ Any unexpected behavior will raise a [`~CorruptedCacheException`].
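+
+    A cached repo is expected to follow the standard cache layout, sketched below
+    (names are illustrative):
+
+        datasets--google--fleurs
+        ├── blobs       # actual file contents, stored once per unique file
+        ├── refs        # one file per ref (e.g. `main`), containing a commit hash
+        └── snapshots   # one folder per revision, containing symlinks to blobs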
+ """
+ if not repo_path.is_dir():
+ raise CorruptedCacheException(f"Repo path is not a directory: {repo_path}")
+
+ if "--" not in repo_path.name:
+ raise CorruptedCacheException(f"Repo path is not a valid HuggingFace cache directory: {repo_path}")
+
+ repo_type, repo_id = repo_path.name.split("--", maxsplit=1)
+ repo_type = repo_type[:-1] # "models" -> "model"
+    repo_id = repo_id.replace("--", "/")  # "google--fleurs" -> "google/fleurs"
+
+ if repo_type not in {"dataset", "model", "space"}:
+ raise CorruptedCacheException(
+ f"Repo type must be `dataset`, `model` or `space`, found `{repo_type}` ({repo_path})."
+ )
+
+ blob_stats: Dict[Path, os.stat_result] = {} # Key is blob_path, value is blob stats
+
+ snapshots_path = repo_path / "snapshots"
+ refs_path = repo_path / "refs"
+
+ if not snapshots_path.exists() or not snapshots_path.is_dir():
+ raise CorruptedCacheException(f"Snapshots dir doesn't exist in cached repo: {snapshots_path}")
+
+ # Scan over `refs` directory
+
+ # key is revision hash, value is set of refs
+ refs_by_hash: Dict[str, Set[str]] = defaultdict(set)
+ if refs_path.exists():
+ # Example of `refs` directory
+ # ── refs
+ # ├── main
+ # └── refs
+ # └── pr
+ # └── 1
+ if refs_path.is_file():
+ raise CorruptedCacheException(f"Refs directory cannot be a file: {refs_path}")
+
+ for ref_path in refs_path.glob("**/*"):
+ # glob("**/*") iterates over all files and directories -> skip directories
+ if ref_path.is_dir() or ref_path.name in FILES_TO_IGNORE:
+ continue
+
+ ref_name = str(ref_path.relative_to(refs_path))
+ with ref_path.open() as f:
+ commit_hash = f.read()
+
+ refs_by_hash[commit_hash].add(ref_name)
+
+ # Scan snapshots directory
+ cached_revisions: Set[CachedRevisionInfo] = set()
+ for revision_path in snapshots_path.iterdir():
+ # Ignore OS-created helper files
+ if revision_path.name in FILES_TO_IGNORE:
+ continue
+ if revision_path.is_file():
+ raise CorruptedCacheException(f"Snapshots folder corrupted. Found a file: {revision_path}")
+
+ cached_files = set()
+ for file_path in revision_path.glob("**/*"):
+ # glob("**/*") iterates over all files and directories -> skip directories
+ if file_path.is_dir():
+ continue
+
+ blob_path = Path(file_path).resolve()
+ if not blob_path.exists():
+ raise CorruptedCacheException(f"Blob missing (broken symlink): {blob_path}")
+
+ if blob_path not in blob_stats:
+ blob_stats[blob_path] = blob_path.stat()
+
+ cached_files.add(
+ CachedFileInfo(
+ file_name=file_path.name,
+ file_path=file_path,
+ size_on_disk=blob_stats[blob_path].st_size,
+ blob_path=blob_path,
+ blob_last_accessed=blob_stats[blob_path].st_atime,
+ blob_last_modified=blob_stats[blob_path].st_mtime,
+ )
+ )
+
+ # Last modified is either the last modified blob file or the revision folder
+ # itself if it is empty
+ if len(cached_files) > 0:
+ revision_last_modified = max(blob_stats[file.blob_path].st_mtime for file in cached_files)
+ else:
+ revision_last_modified = revision_path.stat().st_mtime
+
+ cached_revisions.add(
+ CachedRevisionInfo(
+ commit_hash=revision_path.name,
+ files=frozenset(cached_files),
+ refs=frozenset(refs_by_hash.pop(revision_path.name, set())),
+ size_on_disk=sum(
+ blob_stats[blob_path].st_size for blob_path in set(file.blob_path for file in cached_files)
+ ),
+ snapshot_path=revision_path,
+ last_modified=revision_last_modified,
+ )
+ )
+
+ # Check that all refs referred to an existing revision
+ if len(refs_by_hash) > 0:
+ raise CorruptedCacheException(
+ f"Reference(s) refer to missing commit hashes: {dict(refs_by_hash)} ({repo_path})."
+ )
+
+    # Last modified is either the last modified blob file or the repo folder itself if
+    # no blob files have been found. Same for last accessed.
+ if len(blob_stats) > 0:
+ repo_last_accessed = max(stat.st_atime for stat in blob_stats.values())
+ repo_last_modified = max(stat.st_mtime for stat in blob_stats.values())
+ else:
+ repo_stats = repo_path.stat()
+ repo_last_accessed = repo_stats.st_atime
+ repo_last_modified = repo_stats.st_mtime
+
+ # Build and return frozen structure
+ return CachedRepoInfo(
+ nb_files=len(blob_stats),
+ repo_id=repo_id,
+ repo_path=repo_path,
+ repo_type=repo_type, # type: ignore
+ revisions=frozenset(cached_revisions),
+ size_on_disk=sum(stat.st_size for stat in blob_stats.values()),
+ last_accessed=repo_last_accessed,
+ last_modified=repo_last_modified,
+ )
+
+
+def _format_size(num: int) -> str:
+ """Format size in bytes into a human-readable string.
+
+ Taken from https://stackoverflow.com/a/1094933
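+
+    Example (doctest-style sketch):
+
+    ```py
+    >>> _format_size(1587)
+    '1.6K'
+    >>> _format_size(3_398_085_269)
+    '3.4G'
+    ```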
+ """
+ num_f = float(num)
+ for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
+ if abs(num_f) < 1000.0:
+ return f"{num_f:3.1f}{unit}"
+ num_f /= 1000.0
+ return f"{num_f:.1f}Y"
+
+
+_TIMESINCE_CHUNKS = (
+ # Label, divider, max value
+ ("second", 1, 60),
+ ("minute", 60, 60),
+ ("hour", 60 * 60, 24),
+ ("day", 60 * 60 * 24, 6),
+ ("week", 60 * 60 * 24 * 7, 6),
+ ("month", 60 * 60 * 24 * 30, 11),
+ ("year", 60 * 60 * 24 * 365, None),
+)
+
+
+def _format_timesince(ts: float) -> str:
+ """Format timestamp in seconds into a human-readable string, relative to now.
+
+ Vaguely inspired by Django's `timesince` formatter.
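+
+    Example (doctest-style sketch; exact output depends on the current time):
+
+    ```py
+    >>> _format_timesince(time.time() - 5)
+    'a few seconds ago'
+    >>> _format_timesince(time.time() - 3 * 24 * 3600)
+    '3 days ago'
+    ```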
+ """
+ delta = time.time() - ts
+ if delta < 20:
+ return "a few seconds ago"
+ for label, divider, max_value in _TIMESINCE_CHUNKS: # noqa: B007
+ value = round(delta / divider)
+ if max_value is not None and value <= max_value:
+ break
+ return f"{value} {label}{'s' if value > 1 else ''} ago"
+
+
+def _try_delete_path(path: Path, path_type: str) -> None:
+ """Try to delete a local file or folder.
+
+    If the path does not exist, the error is logged as a warning and then ignored.
+
+ Args:
+        path (`Path`):
+            Path to delete. Can be a file or a folder.
+        path_type (`str`):
+            What path are we deleting? Only used for logging purposes. Example: "snapshot".
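+
+    Example (illustrative; the path below is a placeholder):
+
+    ```py
+    >>> _try_delete_path(Path("/path/to/cache/blobs/abc123"), path_type="blob")
+    ```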
+ """
+ logger.info(f"Delete {path_type}: {path}")
+ try:
+ if path.is_file():
+ os.remove(path)
+ else:
+ shutil.rmtree(path)
+ except FileNotFoundError:
+ logger.warning(f"Couldn't delete {path_type}: file not found ({path})", exc_info=True)
+ except PermissionError:
+ logger.warning(f"Couldn't delete {path_type}: permission denied ({path})", exc_info=True)