Diffstat (limited to '.venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_manager.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_manager.py | 896 |
1 file changed, 896 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_manager.py b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_manager.py new file mode 100644 index 00000000..21469c97 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_manager.py @@ -0,0 +1,896 @@ +# coding=utf-8 +# Copyright 2022-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains utilities to manage the HF cache directory.""" + +import os +import shutil +import time +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, FrozenSet, List, Literal, Optional, Set, Union + +from huggingface_hub.errors import CacheNotFound, CorruptedCacheException + +from ..commands._cli_utils import tabulate +from ..constants import HF_HUB_CACHE +from . import logging + + +logger = logging.get_logger(__name__) + +REPO_TYPE_T = Literal["model", "dataset", "space"] + +# List of OS-created helper files that need to be ignored +FILES_TO_IGNORE = [".DS_Store"] + + +@dataclass(frozen=True) +class CachedFileInfo: + """Frozen data structure holding information about a single cached file. + + Args: + file_name (`str`): + Name of the file. Example: `config.json`. + file_path (`Path`): + Path of the file in the `snapshots` directory. The file path is a symlink + referring to a blob in the `blobs` folder. + blob_path (`Path`): + Path of the blob file. This is equivalent to `file_path.resolve()`. + size_on_disk (`int`): + Size of the blob file in bytes. + blob_last_accessed (`float`): + Timestamp of the last time the blob file has been accessed (from any + revision). + blob_last_modified (`float`): + Timestamp of the last time the blob file has been modified/created. + + <Tip warning={true}> + + `blob_last_accessed` and `blob_last_modified` reliability can depend on the OS you + are using. See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result) + for more details. + + </Tip> + """ + + file_name: str + file_path: Path + blob_path: Path + size_on_disk: int + + blob_last_accessed: float + blob_last_modified: float + + @property + def blob_last_accessed_str(self) -> str: + """ + (property) Timestamp of the last time the blob file has been accessed (from any + revision), returned as a human-readable string. + + Example: "2 weeks ago". + """ + return _format_timesince(self.blob_last_accessed) + + @property + def blob_last_modified_str(self) -> str: + """ + (property) Timestamp of the last time the blob file has been modified, returned + as a human-readable string. + + Example: "2 weeks ago". + """ + return _format_timesince(self.blob_last_modified) + + @property + def size_on_disk_str(self) -> str: + """ + (property) Size of the blob file as a human-readable string. + + Example: "42.2K". + """ + return _format_size(self.size_on_disk) + + +@dataclass(frozen=True) +class CachedRevisionInfo: + """Frozen data structure holding information about a revision. 
+ + A revision correspond to a folder in the `snapshots` folder and is populated with + the exact tree structure as the repo on the Hub but contains only symlinks. A + revision can be either referenced by 1 or more `refs` or be "detached" (no refs). + + Args: + commit_hash (`str`): + Hash of the revision (unique). + Example: `"9338f7b671827df886678df2bdd7cc7b4f36dffd"`. + snapshot_path (`Path`): + Path to the revision directory in the `snapshots` folder. It contains the + exact tree structure as the repo on the Hub. + files: (`FrozenSet[CachedFileInfo]`): + Set of [`~CachedFileInfo`] describing all files contained in the snapshot. + refs (`FrozenSet[str]`): + Set of `refs` pointing to this revision. If the revision has no `refs`, it + is considered detached. + Example: `{"main", "2.4.0"}` or `{"refs/pr/1"}`. + size_on_disk (`int`): + Sum of the blob file sizes that are symlink-ed by the revision. + last_modified (`float`): + Timestamp of the last time the revision has been created/modified. + + <Tip warning={true}> + + `last_accessed` cannot be determined correctly on a single revision as blob files + are shared across revisions. + + </Tip> + + <Tip warning={true}> + + `size_on_disk` is not necessarily the sum of all file sizes because of possible + duplicated files. Besides, only blobs are taken into account, not the (negligible) + size of folders and symlinks. + + </Tip> + """ + + commit_hash: str + snapshot_path: Path + size_on_disk: int + files: FrozenSet[CachedFileInfo] + refs: FrozenSet[str] + + last_modified: float + + @property + def last_modified_str(self) -> str: + """ + (property) Timestamp of the last time the revision has been modified, returned + as a human-readable string. + + Example: "2 weeks ago". + """ + return _format_timesince(self.last_modified) + + @property + def size_on_disk_str(self) -> str: + """ + (property) Sum of the blob file sizes as a human-readable string. + + Example: "42.2K". + """ + return _format_size(self.size_on_disk) + + @property + def nb_files(self) -> int: + """ + (property) Total number of files in the revision. + """ + return len(self.files) + + +@dataclass(frozen=True) +class CachedRepoInfo: + """Frozen data structure holding information about a cached repository. + + Args: + repo_id (`str`): + Repo id of the repo on the Hub. Example: `"google/fleurs"`. + repo_type (`Literal["dataset", "model", "space"]`): + Type of the cached repo. + repo_path (`Path`): + Local path to the cached repo. + size_on_disk (`int`): + Sum of the blob file sizes in the cached repo. + nb_files (`int`): + Total number of blob files in the cached repo. + revisions (`FrozenSet[CachedRevisionInfo]`): + Set of [`~CachedRevisionInfo`] describing all revisions cached in the repo. + last_accessed (`float`): + Timestamp of the last time a blob file of the repo has been accessed. + last_modified (`float`): + Timestamp of the last time a blob file of the repo has been modified/created. + + <Tip warning={true}> + + `size_on_disk` is not necessarily the sum of all revisions sizes because of + duplicated files. Besides, only blobs are taken into account, not the (negligible) + size of folders and symlinks. + + </Tip> + + <Tip warning={true}> + + `last_accessed` and `last_modified` reliability can depend on the OS you are using. + See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result) + for more details. 
+ + </Tip> + """ + + repo_id: str + repo_type: REPO_TYPE_T + repo_path: Path + size_on_disk: int + nb_files: int + revisions: FrozenSet[CachedRevisionInfo] + + last_accessed: float + last_modified: float + + @property + def last_accessed_str(self) -> str: + """ + (property) Last time a blob file of the repo has been accessed, returned as a + human-readable string. + + Example: "2 weeks ago". + """ + return _format_timesince(self.last_accessed) + + @property + def last_modified_str(self) -> str: + """ + (property) Last time a blob file of the repo has been modified, returned as a + human-readable string. + + Example: "2 weeks ago". + """ + return _format_timesince(self.last_modified) + + @property + def size_on_disk_str(self) -> str: + """ + (property) Sum of the blob file sizes as a human-readable string. + + Example: "42.2K". + """ + return _format_size(self.size_on_disk) + + @property + def refs(self) -> Dict[str, CachedRevisionInfo]: + """ + (property) Mapping between `refs` and revision data structures. + """ + return {ref: revision for revision in self.revisions for ref in revision.refs} + + +@dataclass(frozen=True) +class DeleteCacheStrategy: + """Frozen data structure holding the strategy to delete cached revisions. + + This object is not meant to be instantiated programmatically but to be returned by + [`~utils.HFCacheInfo.delete_revisions`]. See documentation for usage example. + + Args: + expected_freed_size (`float`): + Expected freed size once strategy is executed. + blobs (`FrozenSet[Path]`): + Set of blob file paths to be deleted. + refs (`FrozenSet[Path]`): + Set of reference file paths to be deleted. + repos (`FrozenSet[Path]`): + Set of entire repo paths to be deleted. + snapshots (`FrozenSet[Path]`): + Set of snapshots to be deleted (directory of symlinks). + """ + + expected_freed_size: int + blobs: FrozenSet[Path] + refs: FrozenSet[Path] + repos: FrozenSet[Path] + snapshots: FrozenSet[Path] + + @property + def expected_freed_size_str(self) -> str: + """ + (property) Expected size that will be freed as a human-readable string. + + Example: "42.2K". + """ + return _format_size(self.expected_freed_size) + + def execute(self) -> None: + """Execute the defined strategy. + + <Tip warning={true}> + + If this method is interrupted, the cache might get corrupted. Deletion order is + implemented so that references and symlinks are deleted before the actual blob + files. + + </Tip> + + <Tip warning={true}> + + This method is irreversible. If executed, cached files are erased and must be + downloaded again. + + </Tip> + """ + # Deletion order matters. Blobs are deleted in last so that the user can't end + # up in a state where a `ref`` refers to a missing snapshot or a snapshot + # symlink refers to a deleted blob. + + # Delete entire repos + for path in self.repos: + _try_delete_path(path, path_type="repo") + + # Delete snapshot directories + for path in self.snapshots: + _try_delete_path(path, path_type="snapshot") + + # Delete refs files + for path in self.refs: + _try_delete_path(path, path_type="ref") + + # Delete blob files + for path in self.blobs: + _try_delete_path(path, path_type="blob") + + logger.info(f"Cache deletion done. Saved {self.expected_freed_size_str}.") + + +@dataclass(frozen=True) +class HFCacheInfo: + """Frozen data structure holding information about the entire cache-system. + + This data structure is returned by [`scan_cache_dir`] and is immutable. + + Args: + size_on_disk (`int`): + Sum of all valid repo sizes in the cache-system. 
+ repos (`FrozenSet[CachedRepoInfo]`): + Set of [`~CachedRepoInfo`] describing all valid cached repos found on the + cache-system while scanning. + warnings (`List[CorruptedCacheException]`): + List of [`~CorruptedCacheException`] that occurred while scanning the cache. + Those exceptions are captured so that the scan can continue. Corrupted repos + are skipped from the scan. + + <Tip warning={true}> + + Here `size_on_disk` is equal to the sum of all repo sizes (only blobs). However if + some cached repos are corrupted, their sizes are not taken into account. + + </Tip> + """ + + size_on_disk: int + repos: FrozenSet[CachedRepoInfo] + warnings: List[CorruptedCacheException] + + @property + def size_on_disk_str(self) -> str: + """ + (property) Sum of all valid repo sizes in the cache-system as a human-readable + string. + + Example: "42.2K". + """ + return _format_size(self.size_on_disk) + + def delete_revisions(self, *revisions: str) -> DeleteCacheStrategy: + """Prepare the strategy to delete one or more revisions cached locally. + + Input revisions can be any revision hash. If a revision hash is not found in the + local cache, a warning is thrown but no error is raised. Revisions can be from + different cached repos since hashes are unique across repos, + + Examples: + ```py + >>> from huggingface_hub import scan_cache_dir + >>> cache_info = scan_cache_dir() + >>> delete_strategy = cache_info.delete_revisions( + ... "81fd1d6e7847c99f5862c9fb81387956d99ec7aa" + ... ) + >>> print(f"Will free {delete_strategy.expected_freed_size_str}.") + Will free 7.9K. + >>> delete_strategy.execute() + Cache deletion done. Saved 7.9K. + ``` + + ```py + >>> from huggingface_hub import scan_cache_dir + >>> scan_cache_dir().delete_revisions( + ... "81fd1d6e7847c99f5862c9fb81387956d99ec7aa", + ... "e2983b237dccf3ab4937c97fa717319a9ca1a96d", + ... "6c0e6080953db56375760c0471a8c5f2929baf11", + ... ).execute() + Cache deletion done. Saved 8.6G. + ``` + + <Tip warning={true}> + + `delete_revisions` returns a [`~utils.DeleteCacheStrategy`] object that needs to + be executed. The [`~utils.DeleteCacheStrategy`] is not meant to be modified but + allows having a dry run before actually executing the deletion. + + </Tip> + """ + hashes_to_delete: Set[str] = set(revisions) + + repos_with_revisions: Dict[CachedRepoInfo, Set[CachedRevisionInfo]] = defaultdict(set) + + for repo in self.repos: + for revision in repo.revisions: + if revision.commit_hash in hashes_to_delete: + repos_with_revisions[repo].add(revision) + hashes_to_delete.remove(revision.commit_hash) + + if len(hashes_to_delete) > 0: + logger.warning(f"Revision(s) not found - cannot delete them: {', '.join(hashes_to_delete)}") + + delete_strategy_blobs: Set[Path] = set() + delete_strategy_refs: Set[Path] = set() + delete_strategy_repos: Set[Path] = set() + delete_strategy_snapshots: Set[Path] = set() + delete_strategy_expected_freed_size = 0 + + for affected_repo, revisions_to_delete in repos_with_revisions.items(): + other_revisions = affected_repo.revisions - revisions_to_delete + + # If no other revisions, it means all revisions are deleted + # -> delete the entire cached repo + if len(other_revisions) == 0: + delete_strategy_repos.add(affected_repo.repo_path) + delete_strategy_expected_freed_size += affected_repo.size_on_disk + continue + + # Some revisions of the repo will be deleted but not all. We need to filter + # which blob files will not be linked anymore. 
+ for revision_to_delete in revisions_to_delete: + # Snapshot dir + delete_strategy_snapshots.add(revision_to_delete.snapshot_path) + + # Refs dir + for ref in revision_to_delete.refs: + delete_strategy_refs.add(affected_repo.repo_path / "refs" / ref) + + # Blobs dir + for file in revision_to_delete.files: + if file.blob_path not in delete_strategy_blobs: + is_file_alone = True + for revision in other_revisions: + for rev_file in revision.files: + if file.blob_path == rev_file.blob_path: + is_file_alone = False + break + if not is_file_alone: + break + + # Blob file not referenced by remaining revisions -> delete + if is_file_alone: + delete_strategy_blobs.add(file.blob_path) + delete_strategy_expected_freed_size += file.size_on_disk + + # Return the strategy instead of executing it. + return DeleteCacheStrategy( + blobs=frozenset(delete_strategy_blobs), + refs=frozenset(delete_strategy_refs), + repos=frozenset(delete_strategy_repos), + snapshots=frozenset(delete_strategy_snapshots), + expected_freed_size=delete_strategy_expected_freed_size, + ) + + def export_as_table(self, *, verbosity: int = 0) -> str: + """Generate a table from the [`HFCacheInfo`] object. + + Pass `verbosity=0` to get a table with a single row per repo, with columns + "repo_id", "repo_type", "size_on_disk", "nb_files", "last_accessed", "last_modified", "refs", "local_path". + + Pass `verbosity=1` to get a table with a row per repo and revision (thus multiple rows can appear for a single repo), with columns + "repo_id", "repo_type", "revision", "size_on_disk", "nb_files", "last_modified", "refs", "local_path". + + Example: + ```py + >>> from huggingface_hub.utils import scan_cache_dir + + >>> hf_cache_info = scan_cache_dir() + HFCacheInfo(...) + + >>> print(hf_cache_info.export_as_table()) + REPO ID REPO TYPE SIZE ON DISK NB FILES LAST_ACCESSED LAST_MODIFIED REFS LOCAL PATH + --------------------------------------------------- --------- ------------ -------- ------------- ------------- ---- -------------------------------------------------------------------------------------------------- + roberta-base model 2.7M 5 1 day ago 1 week ago main ~/.cache/huggingface/hub/models--roberta-base + suno/bark model 8.8K 1 1 week ago 1 week ago main ~/.cache/huggingface/hub/models--suno--bark + t5-base model 893.8M 4 4 days ago 7 months ago main ~/.cache/huggingface/hub/models--t5-base + t5-large model 3.0G 4 5 weeks ago 5 months ago main ~/.cache/huggingface/hub/models--t5-large + + >>> print(hf_cache_info.export_as_table(verbosity=1)) + REPO ID REPO TYPE REVISION SIZE ON DISK NB FILES LAST_MODIFIED REFS LOCAL PATH + --------------------------------------------------- --------- ---------------------------------------- ------------ -------- ------------- ---- ----------------------------------------------------------------------------------------------------------------------------------------------------- + roberta-base model e2da8e2f811d1448a5b465c236feacd80ffbac7b 2.7M 5 1 week ago main ~/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b + suno/bark model 70a8a7d34168586dc5d028fa9666aceade177992 8.8K 1 1 week ago main ~/.cache/huggingface/hub/models--suno--bark/snapshots/70a8a7d34168586dc5d028fa9666aceade177992 + t5-base model a9723ea7f1b39c1eae772870f3b547bf6ef7e6c1 893.8M 4 7 months ago main ~/.cache/huggingface/hub/models--t5-base/snapshots/a9723ea7f1b39c1eae772870f3b547bf6ef7e6c1 + t5-large model 150ebc2c4b72291e770f58e6057481c8d2ed331a 3.0G 4 5 months ago main 
~/.cache/huggingface/hub/models--t5-large/snapshots/150ebc2c4b72291e770f58e6057481c8d2ed331a + ``` + + Args: + verbosity (`int`, *optional*): + The verbosity level. Defaults to 0. + + Returns: + `str`: The table as a string. + """ + if verbosity == 0: + return tabulate( + rows=[ + [ + repo.repo_id, + repo.repo_type, + "{:>12}".format(repo.size_on_disk_str), + repo.nb_files, + repo.last_accessed_str, + repo.last_modified_str, + ", ".join(sorted(repo.refs)), + str(repo.repo_path), + ] + for repo in sorted(self.repos, key=lambda repo: repo.repo_path) + ], + headers=[ + "REPO ID", + "REPO TYPE", + "SIZE ON DISK", + "NB FILES", + "LAST_ACCESSED", + "LAST_MODIFIED", + "REFS", + "LOCAL PATH", + ], + ) + else: + return tabulate( + rows=[ + [ + repo.repo_id, + repo.repo_type, + revision.commit_hash, + "{:>12}".format(revision.size_on_disk_str), + revision.nb_files, + revision.last_modified_str, + ", ".join(sorted(revision.refs)), + str(revision.snapshot_path), + ] + for repo in sorted(self.repos, key=lambda repo: repo.repo_path) + for revision in sorted(repo.revisions, key=lambda revision: revision.commit_hash) + ], + headers=[ + "REPO ID", + "REPO TYPE", + "REVISION", + "SIZE ON DISK", + "NB FILES", + "LAST_MODIFIED", + "REFS", + "LOCAL PATH", + ], + ) + + +def scan_cache_dir(cache_dir: Optional[Union[str, Path]] = None) -> HFCacheInfo: + """Scan the entire HF cache-system and return a [`~HFCacheInfo`] structure. + + Use `scan_cache_dir` in order to programmatically scan your cache-system. The cache + will be scanned repo by repo. If a repo is corrupted, a [`~CorruptedCacheException`] + will be thrown internally but captured and returned in the [`~HFCacheInfo`] + structure. Only valid repos get a proper report. + + ```py + >>> from huggingface_hub import scan_cache_dir + + >>> hf_cache_info = scan_cache_dir() + HFCacheInfo( + size_on_disk=3398085269, + repos=frozenset({ + CachedRepoInfo( + repo_id='t5-small', + repo_type='model', + repo_path=PosixPath(...), + size_on_disk=970726914, + nb_files=11, + revisions=frozenset({ + CachedRevisionInfo( + commit_hash='d78aea13fa7ecd06c29e3e46195d6341255065d5', + size_on_disk=970726339, + snapshot_path=PosixPath(...), + files=frozenset({ + CachedFileInfo( + file_name='config.json', + size_on_disk=1197 + file_path=PosixPath(...), + blob_path=PosixPath(...), + ), + CachedFileInfo(...), + ... + }), + ), + CachedRevisionInfo(...), + ... + }), + ), + CachedRepoInfo(...), + ... + }), + warnings=[ + CorruptedCacheException("Snapshots dir doesn't exist in cached repo: ..."), + CorruptedCacheException(...), + ... 
+ ], + ) + ``` + + You can also print a detailed report directly from the `huggingface-cli` using: + ```text + > huggingface-cli scan-cache + REPO ID REPO TYPE SIZE ON DISK NB FILES REFS LOCAL PATH + --------------------------- --------- ------------ -------- ------------------- ------------------------------------------------------------------------- + glue dataset 116.3K 15 1.17.0, main, 2.4.0 /Users/lucain/.cache/huggingface/hub/datasets--glue + google/fleurs dataset 64.9M 6 main, refs/pr/1 /Users/lucain/.cache/huggingface/hub/datasets--google--fleurs + Jean-Baptiste/camembert-ner model 441.0M 7 main /Users/lucain/.cache/huggingface/hub/models--Jean-Baptiste--camembert-ner + bert-base-cased model 1.9G 13 main /Users/lucain/.cache/huggingface/hub/models--bert-base-cased + t5-base model 10.1K 3 main /Users/lucain/.cache/huggingface/hub/models--t5-base + t5-small model 970.7M 11 refs/pr/1, main /Users/lucain/.cache/huggingface/hub/models--t5-small + + Done in 0.0s. Scanned 6 repo(s) for a total of 3.4G. + Got 1 warning(s) while scanning. Use -vvv to print details. + ``` + + Args: + cache_dir (`str` or `Path`, `optional`): + Cache directory to cache. Defaults to the default HF cache directory. + + <Tip warning={true}> + + Raises: + + `CacheNotFound` + If the cache directory does not exist. + + [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + If the cache directory is a file, instead of a directory. + + </Tip> + + Returns: a [`~HFCacheInfo`] object. + """ + if cache_dir is None: + cache_dir = HF_HUB_CACHE + + cache_dir = Path(cache_dir).expanduser().resolve() + if not cache_dir.exists(): + raise CacheNotFound( + f"Cache directory not found: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable.", + cache_dir=cache_dir, + ) + + if cache_dir.is_file(): + raise ValueError( + f"Scan cache expects a directory but found a file: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable." + ) + + repos: Set[CachedRepoInfo] = set() + warnings: List[CorruptedCacheException] = [] + for repo_path in cache_dir.iterdir(): + if repo_path.name == ".locks": # skip './.locks/' folder + continue + try: + repos.add(_scan_cached_repo(repo_path)) + except CorruptedCacheException as e: + warnings.append(e) + + return HFCacheInfo( + repos=frozenset(repos), + size_on_disk=sum(repo.size_on_disk for repo in repos), + warnings=warnings, + ) + + +def _scan_cached_repo(repo_path: Path) -> CachedRepoInfo: + """Scan a single cache repo and return information about it. + + Any unexpected behavior will raise a [`~CorruptedCacheException`]. + """ + if not repo_path.is_dir(): + raise CorruptedCacheException(f"Repo path is not a directory: {repo_path}") + + if "--" not in repo_path.name: + raise CorruptedCacheException(f"Repo path is not a valid HuggingFace cache directory: {repo_path}") + + repo_type, repo_id = repo_path.name.split("--", maxsplit=1) + repo_type = repo_type[:-1] # "models" -> "model" + repo_id = repo_id.replace("--", "/") # google/fleurs -> "google/fleurs" + + if repo_type not in {"dataset", "model", "space"}: + raise CorruptedCacheException( + f"Repo type must be `dataset`, `model` or `space`, found `{repo_type}` ({repo_path})." 
+ ) + + blob_stats: Dict[Path, os.stat_result] = {} # Key is blob_path, value is blob stats + + snapshots_path = repo_path / "snapshots" + refs_path = repo_path / "refs" + + if not snapshots_path.exists() or not snapshots_path.is_dir(): + raise CorruptedCacheException(f"Snapshots dir doesn't exist in cached repo: {snapshots_path}") + + # Scan over `refs` directory + + # key is revision hash, value is set of refs + refs_by_hash: Dict[str, Set[str]] = defaultdict(set) + if refs_path.exists(): + # Example of `refs` directory + # ── refs + # ├── main + # └── refs + # └── pr + # └── 1 + if refs_path.is_file(): + raise CorruptedCacheException(f"Refs directory cannot be a file: {refs_path}") + + for ref_path in refs_path.glob("**/*"): + # glob("**/*") iterates over all files and directories -> skip directories + if ref_path.is_dir() or ref_path.name in FILES_TO_IGNORE: + continue + + ref_name = str(ref_path.relative_to(refs_path)) + with ref_path.open() as f: + commit_hash = f.read() + + refs_by_hash[commit_hash].add(ref_name) + + # Scan snapshots directory + cached_revisions: Set[CachedRevisionInfo] = set() + for revision_path in snapshots_path.iterdir(): + # Ignore OS-created helper files + if revision_path.name in FILES_TO_IGNORE: + continue + if revision_path.is_file(): + raise CorruptedCacheException(f"Snapshots folder corrupted. Found a file: {revision_path}") + + cached_files = set() + for file_path in revision_path.glob("**/*"): + # glob("**/*") iterates over all files and directories -> skip directories + if file_path.is_dir(): + continue + + blob_path = Path(file_path).resolve() + if not blob_path.exists(): + raise CorruptedCacheException(f"Blob missing (broken symlink): {blob_path}") + + if blob_path not in blob_stats: + blob_stats[blob_path] = blob_path.stat() + + cached_files.add( + CachedFileInfo( + file_name=file_path.name, + file_path=file_path, + size_on_disk=blob_stats[blob_path].st_size, + blob_path=blob_path, + blob_last_accessed=blob_stats[blob_path].st_atime, + blob_last_modified=blob_stats[blob_path].st_mtime, + ) + ) + + # Last modified is either the last modified blob file or the revision folder + # itself if it is empty + if len(cached_files) > 0: + revision_last_modified = max(blob_stats[file.blob_path].st_mtime for file in cached_files) + else: + revision_last_modified = revision_path.stat().st_mtime + + cached_revisions.add( + CachedRevisionInfo( + commit_hash=revision_path.name, + files=frozenset(cached_files), + refs=frozenset(refs_by_hash.pop(revision_path.name, set())), + size_on_disk=sum( + blob_stats[blob_path].st_size for blob_path in set(file.blob_path for file in cached_files) + ), + snapshot_path=revision_path, + last_modified=revision_last_modified, + ) + ) + + # Check that all refs referred to an existing revision + if len(refs_by_hash) > 0: + raise CorruptedCacheException( + f"Reference(s) refer to missing commit hashes: {dict(refs_by_hash)} ({repo_path})." + ) + + # Last modified is either the last modified blob file or the repo folder itself if + # no blob files has been found. Same for last accessed. 
+ if len(blob_stats) > 0: + repo_last_accessed = max(stat.st_atime for stat in blob_stats.values()) + repo_last_modified = max(stat.st_mtime for stat in blob_stats.values()) + else: + repo_stats = repo_path.stat() + repo_last_accessed = repo_stats.st_atime + repo_last_modified = repo_stats.st_mtime + + # Build and return frozen structure + return CachedRepoInfo( + nb_files=len(blob_stats), + repo_id=repo_id, + repo_path=repo_path, + repo_type=repo_type, # type: ignore + revisions=frozenset(cached_revisions), + size_on_disk=sum(stat.st_size for stat in blob_stats.values()), + last_accessed=repo_last_accessed, + last_modified=repo_last_modified, + ) + + +def _format_size(num: int) -> str: + """Format size in bytes into a human-readable string. + + Taken from https://stackoverflow.com/a/1094933 + """ + num_f = float(num) + for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]: + if abs(num_f) < 1000.0: + return f"{num_f:3.1f}{unit}" + num_f /= 1000.0 + return f"{num_f:.1f}Y" + + +_TIMESINCE_CHUNKS = ( + # Label, divider, max value + ("second", 1, 60), + ("minute", 60, 60), + ("hour", 60 * 60, 24), + ("day", 60 * 60 * 24, 6), + ("week", 60 * 60 * 24 * 7, 6), + ("month", 60 * 60 * 24 * 30, 11), + ("year", 60 * 60 * 24 * 365, None), +) + + +def _format_timesince(ts: float) -> str: + """Format timestamp in seconds into a human-readable string, relative to now. + + Vaguely inspired by Django's `timesince` formatter. + """ + delta = time.time() - ts + if delta < 20: + return "a few seconds ago" + for label, divider, max_value in _TIMESINCE_CHUNKS: # noqa: B007 + value = round(delta / divider) + if max_value is not None and value <= max_value: + break + return f"{value} {label}{'s' if value > 1 else ''} ago" + + +def _try_delete_path(path: Path, path_type: str) -> None: + """Try to delete a local file or folder. + + If the path does not exists, error is logged as a warning and then ignored. + + Args: + path (`Path`) + Path to delete. Can be a file or a folder. + path_type (`str`) + What path are we deleting ? Only for logging purposes. Example: "snapshot". + """ + logger.info(f"Delete {path_type}: {path}") + try: + if path.is_file(): + os.remove(path) + else: + shutil.rmtree(path) + except FileNotFoundError: + logger.warning(f"Couldn't delete {path_type}: file not found ({path})", exc_info=True) + except PermissionError: + logger.warning(f"Couldn't delete {path_type}: permission denied ({path})", exc_info=True) |
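As a usage note on the module added above: a minimal sketch of driving its public API end to end, scanning the cache, collecting detached revisions (no `refs`) older than a cutoff, previewing the freed size, then executing the deletion strategy. The 30-day threshold and the "prune detached revisions" policy are illustrative assumptions; the imports and attributes (`scan_cache_dir`, `repos`, `revisions`, `refs`, `commit_hash`, `last_modified`, `delete_revisions`, `expected_freed_size_str`, `execute`) all come from the file in this diff.

```py
# Sketch: prune cached revisions that are not referenced by any ref ("detached"
# snapshots) and that have not been modified for more than 30 days.
# The threshold and the pruning policy are arbitrary choices for illustration.
import time

from huggingface_hub import scan_cache_dir

THIRTY_DAYS = 30 * 24 * 60 * 60

cache_info = scan_cache_dir()  # scans HF_HUB_CACHE by default

stale_hashes = [
    revision.commit_hash
    for repo in cache_info.repos
    for revision in repo.revisions
    # no refs -> detached revision; last_modified is a POSIX timestamp
    if not revision.refs and (time.time() - revision.last_modified) > THIRTY_DAYS
]

if stale_hashes:
    # delete_revisions() is a dry run: nothing is removed until execute() is called
    strategy = cache_info.delete_revisions(*stale_hashes)
    print(f"Would free {strategy.expected_freed_size_str}")
    strategy.execute()  # irreversible: removes snapshots, refs and unreferenced blobs
```

Because `delete_revisions` only returns a `DeleteCacheStrategy`, the `print` line can serve as a dry run; dropping the final `execute()` call leaves the cache untouched.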