diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py | 226 |
1 files changed, 226 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py new file mode 100644 index 00000000..27833f28 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py @@ -0,0 +1,226 @@ +# coding=utf-8 +# Copyright 2022-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains utilities to validate argument values in `huggingface_hub`.""" + +import inspect +import re +import warnings +from functools import wraps +from itertools import chain +from typing import Any, Dict + +from huggingface_hub.errors import HFValidationError + +from ._typing import CallableT + + +REPO_ID_REGEX = re.compile( + r""" + ^ + (\b[\w\-.]+\b/)? # optional namespace (username or organization) + \b # starts with a word boundary + [\w\-.]{1,96} # repo_name: alphanumeric + . _ - + \b # ends with a word boundary + $ + """, + flags=re.VERBOSE, +) + + +def validate_hf_hub_args(fn: CallableT) -> CallableT: + """Validate values received as argument for any public method of `huggingface_hub`. + + The goal of this decorator is to harmonize validation of arguments reused + everywhere. By default, all defined validators are tested. + + Validators: + - [`~utils.validate_repo_id`]: `repo_id` must be `"repo_name"` + or `"namespace/repo_name"`. Namespace is a username or an organization. + - [`~utils.smoothly_deprecate_use_auth_token`]: Use `token` instead of + `use_auth_token` (only if `use_auth_token` is not expected by the decorated + function - in practice, always the case in `huggingface_hub`). + + Example: + ```py + >>> from huggingface_hub.utils import validate_hf_hub_args + + >>> @validate_hf_hub_args + ... def my_cool_method(repo_id: str): + ... print(repo_id) + + >>> my_cool_method(repo_id="valid_repo_id") + valid_repo_id + + >>> my_cool_method("other..repo..id") + huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'. + + >>> my_cool_method(repo_id="other..repo..id") + huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'. + + >>> @validate_hf_hub_args + ... def my_cool_auth_method(token: str): + ... print(token) + + >>> my_cool_auth_method(token="a token") + "a token" + + >>> my_cool_auth_method(use_auth_token="a use_auth_token") + "a use_auth_token" + + >>> my_cool_auth_method(token="a token", use_auth_token="a use_auth_token") + UserWarning: Both `token` and `use_auth_token` are passed (...) + "a token" + ``` + + Raises: + [`~utils.HFValidationError`]: + If an input is not valid. + """ + # TODO: add an argument to opt-out validation for specific argument? + signature = inspect.signature(fn) + + # Should the validator switch `use_auth_token` values to `token`? In practice, always + # True in `huggingface_hub`. Might not be the case in a downstream library. + check_use_auth_token = "use_auth_token" not in signature.parameters and "token" in signature.parameters + + @wraps(fn) + def _inner_fn(*args, **kwargs): + has_token = False + for arg_name, arg_value in chain( + zip(signature.parameters, args), # Args values + kwargs.items(), # Kwargs values + ): + if arg_name in ["repo_id", "from_id", "to_id"]: + validate_repo_id(arg_value) + + elif arg_name == "token" and arg_value is not None: + has_token = True + + if check_use_auth_token: + kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs) + + return fn(*args, **kwargs) + + return _inner_fn # type: ignore + + +def validate_repo_id(repo_id: str) -> None: + """Validate `repo_id` is valid. + + This is not meant to replace the proper validation made on the Hub but rather to + avoid local inconsistencies whenever possible (example: passing `repo_type` in the + `repo_id` is forbidden). + + Rules: + - Between 1 and 96 characters. + - Either "repo_name" or "namespace/repo_name" + - [a-zA-Z0-9] or "-", "_", "." + - "--" and ".." are forbidden + + Valid: `"foo"`, `"foo/bar"`, `"123"`, `"Foo-BAR_foo.bar123"` + + Not valid: `"datasets/foo/bar"`, `".repo_id"`, `"foo--bar"`, `"foo.git"` + + Example: + ```py + >>> from huggingface_hub.utils import validate_repo_id + >>> validate_repo_id(repo_id="valid_repo_id") + >>> validate_repo_id(repo_id="other..repo..id") + huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'. + ``` + + Discussed in https://github.com/huggingface/huggingface_hub/issues/1008. + In moon-landing (internal repository): + - https://github.com/huggingface/moon-landing/blob/main/server/lib/Names.ts#L27 + - https://github.com/huggingface/moon-landing/blob/main/server/views/components/NewRepoForm/NewRepoForm.svelte#L138 + """ + if not isinstance(repo_id, str): + # Typically, a Path is not a repo_id + raise HFValidationError(f"Repo id must be a string, not {type(repo_id)}: '{repo_id}'.") + + if repo_id.count("/") > 1: + raise HFValidationError( + "Repo id must be in the form 'repo_name' or 'namespace/repo_name':" + f" '{repo_id}'. Use `repo_type` argument if needed." + ) + + if not REPO_ID_REGEX.match(repo_id): + raise HFValidationError( + "Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are" + " forbidden, '-' and '.' cannot start or end the name, max length is 96:" + f" '{repo_id}'." + ) + + if "--" in repo_id or ".." in repo_id: + raise HFValidationError(f"Cannot have -- or .. in repo_id: '{repo_id}'.") + + if repo_id.endswith(".git"): + raise HFValidationError(f"Repo_id cannot end by '.git': '{repo_id}'.") + + +def smoothly_deprecate_use_auth_token(fn_name: str, has_token: bool, kwargs: Dict[str, Any]) -> Dict[str, Any]: + """Smoothly deprecate `use_auth_token` in the `huggingface_hub` codebase. + + The long-term goal is to remove any mention of `use_auth_token` in the codebase in + favor of a unique and less verbose `token` argument. This will be done a few steps: + + 0. Step 0: methods that require a read-access to the Hub use the `use_auth_token` + argument (`str`, `bool` or `None`). Methods requiring write-access have a `token` + argument (`str`, `None`). This implicit rule exists to be able to not send the + token when not necessary (`use_auth_token=False`) even if logged in. + + 1. Step 1: we want to harmonize everything and use `token` everywhere (supporting + `token=False` for read-only methods). In order not to break existing code, if + `use_auth_token` is passed to a function, the `use_auth_token` value is passed + as `token` instead, without any warning. + a. Corner case: if both `use_auth_token` and `token` values are passed, a warning + is thrown and the `use_auth_token` value is ignored. + + 2. Step 2: Once it is release, we should push downstream libraries to switch from + `use_auth_token` to `token` as much as possible, but without throwing a warning + (e.g. manually create issues on the corresponding repos). + + 3. Step 3: After a transitional period (6 months e.g. until April 2023?), we update + `huggingface_hub` to throw a warning on `use_auth_token`. Hopefully, very few + users will be impacted as it would have already been fixed. + In addition, unit tests in `huggingface_hub` must be adapted to expect warnings + to be thrown (but still use `use_auth_token` as before). + + 4. Step 4: After a normal deprecation cycle (3 releases ?), remove this validator. + `use_auth_token` will definitely not be supported. + In addition, we update unit tests in `huggingface_hub` to use `token` everywhere. + + This has been discussed in: + - https://github.com/huggingface/huggingface_hub/issues/1094. + - https://github.com/huggingface/huggingface_hub/pull/928 + - (related) https://github.com/huggingface/huggingface_hub/pull/1064 + """ + new_kwargs = kwargs.copy() # do not mutate input ! + + use_auth_token = new_kwargs.pop("use_auth_token", None) # remove from kwargs + if use_auth_token is not None: + if has_token: + warnings.warn( + "Both `token` and `use_auth_token` are passed to" + f" `{fn_name}` with non-None values. `token` is now the" + " preferred argument to pass a User Access Token." + " `use_auth_token` value will be ignored." + ) + else: + # `token` argument is not passed and a non-None value is passed in + # `use_auth_token` => use `use_auth_token` value as `token` kwarg. + new_kwargs["token"] = use_auth_token + + return new_kwargs |