diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_assets.py | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_assets.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/huggingface_hub/utils/_cache_assets.py | 135 |
1 files changed, 135 insertions, 0 deletions
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Union

from ..constants import HF_ASSETS_CACHE


def cached_assets_path(
    library_name: str,
    namespace: str = "default",
    subfolder: str = "default",
    *,
    assets_dir: Union[str, Path, None] = None,
) -> Path:
    """Return a folder path to cache arbitrary files.

    `huggingface_hub` provides a canonical folder path to store assets. This is the
    recommended way to integrate cache in a downstream library as it will benefit from
    the built-in tools to scan and delete the cache properly.

    The distinction is made between files cached from the Hub and assets. Files from the
    Hub are cached in a git-aware manner and entirely managed by `huggingface_hub`. See
    [related documentation](https://huggingface.co/docs/huggingface_hub/how-to-cache).
    All other files that a downstream library caches are considered to be "assets"
    (files downloaded from external sources, extracted from a .tar archive, preprocessed
    for training,...).

    Once the folder path is generated, it is guaranteed to exist and to be a directory.
    The path is based on 3 levels of depth: the library name, a namespace and a
    subfolder. Those 3 levels grant flexibility while allowing `huggingface_hub` to
    expect folders when scanning/deleting parts of the assets cache. Within a library,
    it is expected that all namespaces share the same subset of subfolder names but this
    is not a mandatory rule. The downstream library has then full control on which file
    structure to adopt within its cache. Namespace and subfolder are optional (would
    default to a `"default/"` subfolder) but library name is mandatory as we want every
    downstream library to manage its own cache.

    Spaces, forward slashes and backslashes found in `library_name`, `namespace` or
    `subfolder` are each replaced by `"--"`, so that every argument maps to exactly one
    folder level (e.g. `"Helsinki-NLP/tatoeba_mt"` becomes `"Helsinki-NLP--tatoeba_mt"`).

    Expected tree:
    ```text
        assets/
        └── datasets/
        │   ├── SQuAD/
        │   │   ├── downloaded/
        │   │   ├── extracted/
        │   │   └── processed/
        │   └── Helsinki-NLP--tatoeba_mt/
        │       ├── downloaded/
        │       ├── extracted/
        │       └── processed/
        └── transformers/
            ├── default/
            │   └── something/
            └── bert-base-cased/
                ├── default/
                └── training/
        hub/
        └── models--julien-c--EsperBERTo-small/
            ├── blobs/
            │   ├── (...)
            │   └── (...)
            ├── refs/
            │   └── (...)
            └── snapshots/
                ├── 2439f60ef33a0d46d85da5001d52aeda5b00ce9f/
                │   └── (...)
                └── bbc77c8132af1cc5cf678da3f1ddf2de43606d48/
                    └── (...)
    ```


    Args:
        library_name (`str`):
            Name of the library that will manage the cache folder. Example: `"datasets"`.
        namespace (`str`, *optional*, defaults to "default"):
            Namespace to which the data belongs. Example: `"SQuAD"`.
        subfolder (`str`, *optional*, defaults to "default"):
            Subfolder in which the data will be stored. Example: `extracted`.
        assets_dir (`str`, `Path`, *optional*):
            Path to the folder where assets are cached. This must not be the same folder
            where Hub files are cached. Defaults to `HF_HOME / "assets"` if not provided.
            Can also be set with `HF_ASSETS_CACHE` environment variable.

    Returns:
        Path to the cache folder (`Path`).

    Raises:
        `ValueError`:
            If the folder cannot be created because a regular file already exists at
            the target path or at one of its parent components.

    Example:
    ```py
    >>> from huggingface_hub import cached_assets_path

    >>> cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="download")
    PosixPath('/home/wauplin/.cache/huggingface/assets/datasets/SQuAD/download')

    >>> cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="extracted")
    PosixPath('/home/wauplin/.cache/huggingface/assets/datasets/SQuAD/extracted')

    >>> cached_assets_path(library_name="datasets", namespace="Helsinki-NLP/tatoeba_mt")
    PosixPath('/home/wauplin/.cache/huggingface/assets/datasets/Helsinki-NLP--tatoeba_mt/default')

    >>> cached_assets_path(library_name="datasets", assets_dir="/tmp/tmp123456")
    PosixPath('/tmp/tmp123456/datasets/default/default')
    ```
    """
    # Resolve assets_dir: fall back to the configured default, then expand "~" and
    # make the path absolute so the returned value does not depend on the cwd.
    if assets_dir is None:
        assets_dir = HF_ASSETS_CACHE
    assets_dir = Path(assets_dir).expanduser().resolve()

    # Avoid names that could create path issues: a separator inside a name would
    # otherwise add extra directory levels to the 3-level layout.
    for part in (" ", "/", "\\"):
        library_name = library_name.replace(part, "--")
        namespace = namespace.replace(part, "--")
        subfolder = subfolder.replace(part, "--")

    # Path to subfolder is created
    path = assets_dir / library_name / namespace / subfolder
    try:
        # `exist_ok=True` only tolerates an existing *directory*: an existing file at
        # `path` raises FileExistsError, and a file in place of a parent component
        # raises NotADirectoryError.
        path.mkdir(exist_ok=True, parents=True)
    except (FileExistsError, NotADirectoryError) as err:
        # Keep the public exception type (ValueError) but chain the OS-level error so
        # the root cause stays visible in the traceback.
        raise ValueError(
            f"Corrupted assets folder: cannot create directory because of an existing file ({path})."
        ) from err

    # Return
    return path