about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-4a52a71956a8d46fcb7294ac71734504bb09bcc2.tar.gz
two version of R2R are here HEAD master
Diffstat (limited to '.venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py')
-rw-r--r--.venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py141
1 files changed, 141 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py
new file mode 100644
index 00000000..4f2c0ebc
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_paths.py
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2022-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains utilities to handle paths in Huggingface Hub."""
+
+from fnmatch import fnmatch
+from pathlib import Path
+from typing import Callable, Generator, Iterable, List, Optional, TypeVar, Union
+
+
+T = TypeVar("T")
+
+# Always ignore `.git` and `.cache/huggingface` folders in commits
+DEFAULT_IGNORE_PATTERNS = [
+    ".git",
+    ".git/*",
+    "*/.git",
+    "**/.git/**",
+    ".cache/huggingface",
+    ".cache/huggingface/*",
+    "*/.cache/huggingface",
+    "**/.cache/huggingface/**",
+]
+# Forbidden to commit these folders
+FORBIDDEN_FOLDERS = [".git", ".cache"]
+
+
+def filter_repo_objects(
+    items: Iterable[T],
+    *,
+    allow_patterns: Optional[Union[List[str], str]] = None,
+    ignore_patterns: Optional[Union[List[str], str]] = None,
+    key: Optional[Callable[[T], str]] = None,
+) -> Generator[T, None, None]:
+    """Filter repo objects based on an allowlist and a denylist.
+
+    Input must be a list of paths (`str` or `Path`) or a list of arbitrary objects.
+    In the later case, `key` must be provided and specifies a function of one argument
+    that is used to extract a path from each element in iterable.
+
+    Patterns are Unix shell-style wildcards which are NOT regular expressions. See
+    https://docs.python.org/3/library/fnmatch.html for more details.
+
+    Args:
+        items (`Iterable`):
+            List of items to filter.
+        allow_patterns (`str` or `List[str]`, *optional*):
+            Patterns constituting the allowlist. If provided, item paths must match at
+            least one pattern from the allowlist.
+        ignore_patterns (`str` or `List[str]`, *optional*):
+            Patterns constituting the denylist. If provided, item paths must not match
+            any patterns from the denylist.
+        key (`Callable[[T], str]`, *optional*):
+            Single-argument function to extract a path from each item. If not provided,
+            the `items` must already be `str` or `Path`.
+
+    Returns:
+        Filtered list of objects, as a generator.
+
+    Raises:
+        :class:`ValueError`:
+            If `key` is not provided and items are not `str` or `Path`.
+
+    Example usage with paths:
+    ```python
+    >>> # Filter only PDFs that are not hidden.
+    >>> list(filter_repo_objects(
+    ...     ["aaa.PDF", "bbb.jpg", ".ccc.pdf", ".ddd.png"],
+    ...     allow_patterns=["*.pdf"],
+    ...     ignore_patterns=[".*"],
+    ... ))
+    ["aaa.pdf"]
+    ```
+
+    Example usage with objects:
+    ```python
+    >>> list(filter_repo_objects(
+    ... [
+    ...     CommitOperationAdd(path_or_fileobj="/tmp/aaa.pdf", path_in_repo="aaa.pdf")
+    ...     CommitOperationAdd(path_or_fileobj="/tmp/bbb.jpg", path_in_repo="bbb.jpg")
+    ...     CommitOperationAdd(path_or_fileobj="/tmp/.ccc.pdf", path_in_repo=".ccc.pdf")
+    ...     CommitOperationAdd(path_or_fileobj="/tmp/.ddd.png", path_in_repo=".ddd.png")
+    ... ],
+    ... allow_patterns=["*.pdf"],
+    ... ignore_patterns=[".*"],
+    ... key=lambda x: x.repo_in_path
+    ... ))
+    [CommitOperationAdd(path_or_fileobj="/tmp/aaa.pdf", path_in_repo="aaa.pdf")]
+    ```
+    """
+    if isinstance(allow_patterns, str):
+        allow_patterns = [allow_patterns]
+
+    if isinstance(ignore_patterns, str):
+        ignore_patterns = [ignore_patterns]
+
+    if allow_patterns is not None:
+        allow_patterns = [_add_wildcard_to_directories(p) for p in allow_patterns]
+    if ignore_patterns is not None:
+        ignore_patterns = [_add_wildcard_to_directories(p) for p in ignore_patterns]
+
+    if key is None:
+
+        def _identity(item: T) -> str:
+            if isinstance(item, str):
+                return item
+            if isinstance(item, Path):
+                return str(item)
+            raise ValueError(f"Please provide `key` argument in `filter_repo_objects`: `{item}` is not a string.")
+
+        key = _identity  # Items must be `str` or `Path`, otherwise raise ValueError
+
+    for item in items:
+        path = key(item)
+
+        # Skip if there's an allowlist and path doesn't match any
+        if allow_patterns is not None and not any(fnmatch(path, r) for r in allow_patterns):
+            continue
+
+        # Skip if there's a denylist and path matches any
+        if ignore_patterns is not None and any(fnmatch(path, r) for r in ignore_patterns):
+            continue
+
+        yield item
+
+
+def _add_wildcard_to_directories(pattern: str) -> str:
+    if pattern[-1] == "/":
+        return pattern + "*"
+    return pattern