diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py | 141 |
1 files changed, 141 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py new file mode 100644 index 00000000..4f2c0ebc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py @@ -0,0 +1,141 @@ +# coding=utf-8 +# Copyright 2022-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains utilities to handle paths in Huggingface Hub.""" + +from fnmatch import fnmatch +from pathlib import Path +from typing import Callable, Generator, Iterable, List, Optional, TypeVar, Union + + +T = TypeVar("T") + +# Always ignore `.git` and `.cache/huggingface` folders in commits +DEFAULT_IGNORE_PATTERNS = [ + ".git", + ".git/*", + "*/.git", + "**/.git/**", + ".cache/huggingface", + ".cache/huggingface/*", + "*/.cache/huggingface", + "**/.cache/huggingface/**", +] +# Forbidden to commit these folders +FORBIDDEN_FOLDERS = [".git", ".cache"] + + +def filter_repo_objects( + items: Iterable[T], + *, + allow_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, + key: Optional[Callable[[T], str]] = None, +) -> Generator[T, None, None]: + """Filter repo objects based on an allowlist and a denylist. + + Input must be a list of paths (`str` or `Path`) or a list of arbitrary objects. + In the later case, `key` must be provided and specifies a function of one argument + that is used to extract a path from each element in iterable. + + Patterns are Unix shell-style wildcards which are NOT regular expressions. See + https://docs.python.org/3/library/fnmatch.html for more details. + + Args: + items (`Iterable`): + List of items to filter. + allow_patterns (`str` or `List[str]`, *optional*): + Patterns constituting the allowlist. If provided, item paths must match at + least one pattern from the allowlist. + ignore_patterns (`str` or `List[str]`, *optional*): + Patterns constituting the denylist. If provided, item paths must not match + any patterns from the denylist. + key (`Callable[[T], str]`, *optional*): + Single-argument function to extract a path from each item. If not provided, + the `items` must already be `str` or `Path`. + + Returns: + Filtered list of objects, as a generator. + + Raises: + :class:`ValueError`: + If `key` is not provided and items are not `str` or `Path`. + + Example usage with paths: + ```python + >>> # Filter only PDFs that are not hidden. + >>> list(filter_repo_objects( + ... ["aaa.PDF", "bbb.jpg", ".ccc.pdf", ".ddd.png"], + ... allow_patterns=["*.pdf"], + ... ignore_patterns=[".*"], + ... )) + ["aaa.pdf"] + ``` + + Example usage with objects: + ```python + >>> list(filter_repo_objects( + ... [ + ... CommitOperationAdd(path_or_fileobj="/tmp/aaa.pdf", path_in_repo="aaa.pdf") + ... CommitOperationAdd(path_or_fileobj="/tmp/bbb.jpg", path_in_repo="bbb.jpg") + ... CommitOperationAdd(path_or_fileobj="/tmp/.ccc.pdf", path_in_repo=".ccc.pdf") + ... CommitOperationAdd(path_or_fileobj="/tmp/.ddd.png", path_in_repo=".ddd.png") + ... ], + ... allow_patterns=["*.pdf"], + ... ignore_patterns=[".*"], + ... key=lambda x: x.repo_in_path + ... )) + [CommitOperationAdd(path_or_fileobj="/tmp/aaa.pdf", path_in_repo="aaa.pdf")] + ``` + """ + if isinstance(allow_patterns, str): + allow_patterns = [allow_patterns] + + if isinstance(ignore_patterns, str): + ignore_patterns = [ignore_patterns] + + if allow_patterns is not None: + allow_patterns = [_add_wildcard_to_directories(p) for p in allow_patterns] + if ignore_patterns is not None: + ignore_patterns = [_add_wildcard_to_directories(p) for p in ignore_patterns] + + if key is None: + + def _identity(item: T) -> str: + if isinstance(item, str): + return item + if isinstance(item, Path): + return str(item) + raise ValueError(f"Please provide `key` argument in `filter_repo_objects`: `{item}` is not a string.") + + key = _identity # Items must be `str` or `Path`, otherwise raise ValueError + + for item in items: + path = key(item) + + # Skip if there's an allowlist and path doesn't match any + if allow_patterns is not None and not any(fnmatch(path, r) for r in allow_patterns): + continue + + # Skip if there's a denylist and path matches any + if ignore_patterns is not None and any(fnmatch(path, r) for r in ignore_patterns): + continue + + yield item + + +def _add_wildcard_to_directories(pattern: str) -> str: + if pattern[-1] == "/": + return pattern + "*" + return pattern |