author     S. Solomon Darnell  2025-03-28 21:52:21 -0500
committer  S. Solomon Darnell  2025-03-28 21:52:21 -0500
commit     4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree       ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py
parent     cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
two version of R2R are here (HEAD, master)
Diffstat (limited to '.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py')
-rw-r--r--  .venv/lib/python3.12/site-packages/huggingface_hub/file_download.py  1625
1 file changed, 1625 insertions(+), 0 deletions(-)
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py b/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py
new file mode 100644
index 00000000..e9f3d9fb
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py
@@ -0,0 +1,1625 @@
+import contextlib
+import copy
+import errno
+import inspect
+import os
+import re
+import shutil
+import stat
+import time
+import uuid
+import warnings
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, BinaryIO, Dict, Literal, NoReturn, Optional, Tuple, Union
+from urllib.parse import quote, urlparse
+
+import requests
+
+from . import (
+ __version__, # noqa: F401 # for backward compatibility
+ constants,
+)
+from ._local_folder import get_local_download_paths, read_download_metadata, write_download_metadata
+from .constants import (
+ HUGGINGFACE_CO_URL_TEMPLATE, # noqa: F401 # for backward compatibility
+ HUGGINGFACE_HUB_CACHE, # noqa: F401 # for backward compatibility
+)
+from .errors import (
+ EntryNotFoundError,
+ FileMetadataError,
+ GatedRepoError,
+ HfHubHTTPError,
+ LocalEntryNotFoundError,
+ RepositoryNotFoundError,
+ RevisionNotFoundError,
+)
+from .utils import (
+ OfflineModeIsEnabled,
+ SoftTemporaryDirectory,
+ WeakFileLock,
+ build_hf_headers,
+ get_fastai_version, # noqa: F401 # for backward compatibility
+ get_fastcore_version, # noqa: F401 # for backward compatibility
+ get_graphviz_version, # noqa: F401 # for backward compatibility
+ get_jinja_version, # noqa: F401 # for backward compatibility
+ get_pydot_version, # noqa: F401 # for backward compatibility
+ get_session,
+ get_tf_version, # noqa: F401 # for backward compatibility
+ get_torch_version, # noqa: F401 # for backward compatibility
+ hf_raise_for_status,
+ is_fastai_available, # noqa: F401 # for backward compatibility
+ is_fastcore_available, # noqa: F401 # for backward compatibility
+ is_graphviz_available, # noqa: F401 # for backward compatibility
+ is_jinja_available, # noqa: F401 # for backward compatibility
+ is_pydot_available, # noqa: F401 # for backward compatibility
+ is_tf_available, # noqa: F401 # for backward compatibility
+ is_torch_available, # noqa: F401 # for backward compatibility
+ logging,
+ reset_sessions,
+ tqdm,
+ validate_hf_hub_args,
+)
+from .utils._http import _adjust_range_header
+from .utils._runtime import _PY_VERSION # noqa: F401 # for backward compatibility
+from .utils._typing import HTTP_METHOD_T
+from .utils.sha import sha_fileobj
+from .utils.tqdm import is_tqdm_disabled
+
+
+logger = logging.get_logger(__name__)
+
+# Return value when trying to load a file from cache but the file does not exist in the distant repo.
+_CACHED_NO_EXIST = object()
+_CACHED_NO_EXIST_T = Any
+
+# Regex to get filename from a "Content-Disposition" header for CDN-served files
+HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P<filename>.*?)";')
+
+# Regex to check if the revision IS directly a commit_hash
+REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
+
+# Regex to check if the file etag IS a valid sha256
+REGEX_SHA256 = re.compile(r"^[0-9a-f]{64}$")
+
+_are_symlinks_supported_in_dir: Dict[str, bool] = {}
+
+
+def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
+ """Return whether the symlinks are supported on the machine.
+
+ Since symlinks support can change depending on the mounted disk, we need to check
+ on the precise cache folder. By default, the default HF cache directory is checked.
+
+ Args:
+ cache_dir (`str`, `Path`, *optional*):
+ Path to the folder where cached files are stored.
+
+ Returns: [bool] Whether symlinks are supported in the directory.
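+
+    Example (illustrative sketch; the result depends on the filesystem backing the cache):
+
+    ```python
+    >>> from huggingface_hub.file_download import are_symlinks_supported
+    >>> are_symlinks_supported()  # probes the default HF cache directory
+    True
+    ```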
+ """
+ # Defaults to HF cache
+ if cache_dir is None:
+ cache_dir = constants.HF_HUB_CACHE
+ cache_dir = str(Path(cache_dir).expanduser().resolve()) # make it unique
+
+ # Check symlink compatibility only once (per cache directory) at first time use
+ if cache_dir not in _are_symlinks_supported_in_dir:
+ _are_symlinks_supported_in_dir[cache_dir] = True
+
+ os.makedirs(cache_dir, exist_ok=True)
+ with SoftTemporaryDirectory(dir=cache_dir) as tmpdir:
+ src_path = Path(tmpdir) / "dummy_file_src"
+ src_path.touch()
+ dst_path = Path(tmpdir) / "dummy_file_dst"
+
+        # Relative source path, as in `_create_symlink`
+ relative_src = os.path.relpath(src_path, start=os.path.dirname(dst_path))
+ try:
+ os.symlink(relative_src, dst_path)
+ except OSError:
+ # Likely running on Windows
+ _are_symlinks_supported_in_dir[cache_dir] = False
+
+ if not constants.HF_HUB_DISABLE_SYMLINKS_WARNING:
+ message = (
+ "`huggingface_hub` cache-system uses symlinks by default to"
+ " efficiently store duplicated files but your machine does not"
+ f" support them in {cache_dir}. Caching files will still work"
+ " but in a degraded version that might require more space on"
+ " your disk. This warning can be disabled by setting the"
+ " `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For"
+ " more details, see"
+ " https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations."
+ )
+ if os.name == "nt":
+ message += (
+ "\nTo support symlinks on Windows, you either need to"
+ " activate Developer Mode or to run Python as an"
+ " administrator. In order to activate developer mode,"
+ " see this article:"
+ " https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development"
+ )
+ warnings.warn(message)
+
+ return _are_symlinks_supported_in_dir[cache_dir]
+
+
+@dataclass(frozen=True)
+class HfFileMetadata:
+ """Data structure containing information about a file versioned on the Hub.
+
+ Returned by [`get_hf_file_metadata`] based on a URL.
+
+ Args:
+ commit_hash (`str`, *optional*):
+ The commit_hash related to the file.
+ etag (`str`, *optional*):
+ Etag of the file on the server.
+ location (`str`):
+ Location where to download the file. Can be a Hub url or not (CDN).
+        size (`int`, *optional*):
+            Size of the file. In case of an LFS file, contains the size of the actual
+            LFS file, not the pointer.
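+
+    Example (illustrative values only; the hashes reuse those from the cache layout shown
+    in `hf_hub_download`, and the size is made up):
+
+    ```python
+    >>> HfFileMetadata(
+    ...     commit_hash="2439f60ef33a0d46d85da5001d52aeda5b00ce9f",
+    ...     etag="403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd",
+    ...     location="https://huggingface.co/julien-c/EsperBERTo-small/resolve/main/pytorch_model.bin",
+    ...     size=336473133,  # illustrative byte count
+    ... )
+    ```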
+ """
+
+ commit_hash: Optional[str]
+ etag: Optional[str]
+ location: str
+ size: Optional[int]
+
+
+@validate_hf_hub_args
+def hf_hub_url(
+ repo_id: str,
+ filename: str,
+ *,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ endpoint: Optional[str] = None,
+) -> str:
+ """Construct the URL of a file from the given information.
+
+    The resolved address can be either a huggingface.co-hosted url or a link to
+    Cloudfront (a Content Delivery Network, or CDN) for large files that are
+    more than a few MBs.
+
+ Args:
+ repo_id (`str`):
+ A namespace (user or an organization) name and a repo name separated
+ by a `/`.
+ filename (`str`):
+ The name of the file in the repo.
+ subfolder (`str`, *optional*):
+ An optional value corresponding to a folder inside the repo.
+ repo_type (`str`, *optional*):
+ Set to `"dataset"` or `"space"` if downloading from a dataset or space,
+ `None` or `"model"` if downloading from a model. Default is `None`.
+ revision (`str`, *optional*):
+ An optional Git revision id which can be a branch name, a tag, or a
+ commit hash.
+
+ Example:
+
+ ```python
+ >>> from huggingface_hub import hf_hub_url
+
+ >>> hf_hub_url(
+ ... repo_id="julien-c/EsperBERTo-small", filename="pytorch_model.bin"
+ ... )
+ 'https://huggingface.co/julien-c/EsperBERTo-small/resolve/main/pytorch_model.bin'
+ ```
+
+ <Tip>
+
+ Notes:
+
+ Cloudfront is replicated over the globe so downloads are way faster for
+ the end user (and it also lowers our bandwidth costs).
+
+ Cloudfront aggressively caches files by default (default TTL is 24
+ hours), however this is not an issue here because we implement a
+ git-based versioning system on huggingface.co, which means that we store
+ the files on S3/Cloudfront in a content-addressable way (i.e., the file
+ name is its hash). Using content-addressable filenames means cache can't
+ ever be stale.
+
+ In terms of client-side caching from this library, we base our caching
+ on the objects' entity tag (`ETag`), which is an identifier of a
+ specific version of a resource [1]_. An object's ETag is: its git-sha1
+ if stored in git, or its sha256 if stored in git-lfs.
+
+ </Tip>
+
+ References:
+
+ - [1] https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag
+ """
+ if subfolder == "":
+ subfolder = None
+ if subfolder is not None:
+ filename = f"{subfolder}/{filename}"
+
+ if repo_type not in constants.REPO_TYPES:
+ raise ValueError("Invalid repo type")
+
+ if repo_type in constants.REPO_TYPES_URL_PREFIXES:
+ repo_id = constants.REPO_TYPES_URL_PREFIXES[repo_type] + repo_id
+
+ if revision is None:
+ revision = constants.DEFAULT_REVISION
+ url = HUGGINGFACE_CO_URL_TEMPLATE.format(
+ repo_id=repo_id, revision=quote(revision, safe=""), filename=quote(filename)
+ )
+ # Update endpoint if provided
+ if endpoint is not None and url.startswith(constants.ENDPOINT):
+ url = endpoint + url[len(constants.ENDPOINT) :]
+ return url
+
+
+def _request_wrapper(
+ method: HTTP_METHOD_T, url: str, *, follow_relative_redirects: bool = False, **params
+) -> requests.Response:
+ """Wrapper around requests methods to follow relative redirects if `follow_relative_redirects=True` even when
+ `allow_redirection=False`.
+
+ Args:
+ method (`str`):
+ HTTP method, such as 'GET' or 'HEAD'.
+ url (`str`):
+ The URL of the resource to fetch.
+        follow_relative_redirects (`bool`, *optional*, defaults to `False`):
+            If True, relative redirection (redirection to the same site) will be resolved even when the
+            `allow_redirects` kwarg is set to False. Useful when we want to follow a redirection to a renamed
+            repository without following redirection to a CDN.
+ **params (`dict`, *optional*):
+ Params to pass to `requests.request`.
+ """
+ # Recursively follow relative redirects
+ if follow_relative_redirects:
+ response = _request_wrapper(
+ method=method,
+ url=url,
+ follow_relative_redirects=False,
+ **params,
+ )
+
+ # If redirection, we redirect only relative paths.
+ # This is useful in case of a renamed repository.
+ if 300 <= response.status_code <= 399:
+ parsed_target = urlparse(response.headers["Location"])
+ if parsed_target.netloc == "":
+                # This means it is a relative 'Location' header, as allowed by RFC 7231.
+                # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
+                # We want to follow this relative redirect!
+ #
+ # Highly inspired by `resolve_redirects` from requests library.
+ # See https://github.com/psf/requests/blob/main/requests/sessions.py#L159
+ next_url = urlparse(url)._replace(path=parsed_target.path).geturl()
+ return _request_wrapper(method=method, url=next_url, follow_relative_redirects=True, **params)
+ return response
+
+ # Perform request and return if status_code is not in the retry list.
+ response = get_session().request(method=method, url=url, **params)
+ hf_raise_for_status(response)
+ return response
+
+
+def http_get(
+ url: str,
+ temp_file: BinaryIO,
+ *,
+ proxies: Optional[Dict] = None,
+ resume_size: int = 0,
+ headers: Optional[Dict[str, Any]] = None,
+ expected_size: Optional[int] = None,
+ displayed_filename: Optional[str] = None,
+ _nb_retries: int = 5,
+ _tqdm_bar: Optional[tqdm] = None,
+) -> None:
+ """
+    Download a remote file. Does not gobble up errors; raises errors tailored to the Hugging Face Hub.
+
+ If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely a
+ transient error (network outage?). We log a warning message and try to resume the download a few times before
+    giving up. The method gives up after 5 attempts if no new data has been received from the server.
+
+ Args:
+ url (`str`):
+ The URL of the file to download.
+ temp_file (`BinaryIO`):
+ The file-like object where to save the file.
+ proxies (`dict`, *optional*):
+ Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
+ resume_size (`int`, *optional*):
+            The number of bytes already downloaded. If set to 0 (default), the whole file is downloaded. If set to a
+ positive number, the download will resume at the given position.
+ headers (`dict`, *optional*):
+ Dictionary of HTTP Headers to send with the request.
+ expected_size (`int`, *optional*):
+ The expected size of the file to download. If set, the download will raise an error if the size of the
+ received content is different from the expected one.
+ displayed_filename (`str`, *optional*):
+ The filename of the file that is being downloaded. Value is used only to display a nice progress bar. If
+ not set, the filename is guessed from the URL or the `Content-Disposition` header.
+ """
+ if expected_size is not None and resume_size == expected_size:
+ # If the file is already fully downloaded, we don't need to download it again.
+ return
+
+ hf_transfer = None
+ if constants.HF_HUB_ENABLE_HF_TRANSFER:
+ if resume_size != 0:
+ warnings.warn("'hf_transfer' does not support `resume_size`: falling back to regular download method")
+ elif proxies is not None:
+ warnings.warn("'hf_transfer' does not support `proxies`: falling back to regular download method")
+ else:
+ try:
+ import hf_transfer # type: ignore[no-redef]
+ except ImportError:
+ raise ValueError(
+ "Fast download using 'hf_transfer' is enabled"
+ " (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not"
+ " available in your environment. Try `pip install hf_transfer`."
+ )
+
+ initial_headers = headers
+ headers = copy.deepcopy(headers) or {}
+ if resume_size > 0:
+ headers["Range"] = _adjust_range_header(headers.get("Range"), resume_size)
+
+ r = _request_wrapper(
+ method="GET", url=url, stream=True, proxies=proxies, headers=headers, timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT
+ )
+ hf_raise_for_status(r)
+ content_length = r.headers.get("Content-Length")
+
+ # NOTE: 'total' is the total number of bytes to download, not the number of bytes in the file.
+ # If the file is compressed, the number of bytes in the saved file will be higher than 'total'.
+ total = resume_size + int(content_length) if content_length is not None else None
+
+ if displayed_filename is None:
+ displayed_filename = url
+ content_disposition = r.headers.get("Content-Disposition")
+ if content_disposition is not None:
+ match = HEADER_FILENAME_PATTERN.search(content_disposition)
+ if match is not None:
+ # Means file is on CDN
+ displayed_filename = match.groupdict()["filename"]
+
+ # Truncate filename if too long to display
+ if len(displayed_filename) > 40:
+ displayed_filename = f"(…){displayed_filename[-40:]}"
+
+ consistency_error_message = (
+ f"Consistency check failed: file should be of size {expected_size} but has size"
+ f" {{actual_size}} ({displayed_filename}).\nThis is usually due to network issues while downloading the file."
+ " Please retry with `force_download=True`."
+ )
+
+ # Stream file to buffer
+ progress_cm: tqdm = (
+ tqdm( # type: ignore[assignment]
+ unit="B",
+ unit_scale=True,
+ total=total,
+ initial=resume_size,
+ desc=displayed_filename,
+ disable=is_tqdm_disabled(logger.getEffectiveLevel()),
+ name="huggingface_hub.http_get",
+ )
+ if _tqdm_bar is None
+ else contextlib.nullcontext(_tqdm_bar)
+        # ^ `contextlib.nullcontext` mimics a context manager that does nothing.
+        # Makes it easier to use the same code path for both cases but, in the latter
+        # case, the progress bar is not closed when exiting the context manager.
+ )
+
+ with progress_cm as progress:
+ if hf_transfer and total is not None and total > 5 * constants.DOWNLOAD_CHUNK_SIZE:
+ supports_callback = "callback" in inspect.signature(hf_transfer.download).parameters
+ if not supports_callback:
+ warnings.warn(
+ "You are using an outdated version of `hf_transfer`. "
+ "Consider upgrading to latest version to enable progress bars "
+ "using `pip install -U hf_transfer`."
+ )
+ try:
+ hf_transfer.download(
+ url=url,
+ filename=temp_file.name,
+ max_files=constants.HF_TRANSFER_CONCURRENCY,
+ chunk_size=constants.DOWNLOAD_CHUNK_SIZE,
+ headers=headers,
+ parallel_failures=3,
+ max_retries=5,
+ **({"callback": progress.update} if supports_callback else {}),
+ )
+ except Exception as e:
+ raise RuntimeError(
+ "An error occurred while downloading using `hf_transfer`. Consider"
+ " disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling."
+ ) from e
+ if not supports_callback:
+ progress.update(total)
+ if expected_size is not None and expected_size != os.path.getsize(temp_file.name):
+ raise EnvironmentError(
+ consistency_error_message.format(
+ actual_size=os.path.getsize(temp_file.name),
+ )
+ )
+ return
+ new_resume_size = resume_size
+ try:
+ for chunk in r.iter_content(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
+ if chunk: # filter out keep-alive new chunks
+ progress.update(len(chunk))
+ temp_file.write(chunk)
+ new_resume_size += len(chunk)
+ # Some data has been downloaded from the server so we reset the number of retries.
+ _nb_retries = 5
+ except (requests.ConnectionError, requests.ReadTimeout) as e:
+ # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
+ # a transient error (network outage?). We log a warning message and try to resume the download a few times
+            # before giving up. The retry mechanism is basic but should be enough in most cases.
+ if _nb_retries <= 0:
+ logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
+ raise
+ logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
+ time.sleep(1)
+ reset_sessions() # In case of SSLError it's best to reset the shared requests.Session objects
+ return http_get(
+ url=url,
+ temp_file=temp_file,
+ proxies=proxies,
+ resume_size=new_resume_size,
+ headers=initial_headers,
+ expected_size=expected_size,
+ _nb_retries=_nb_retries - 1,
+ _tqdm_bar=_tqdm_bar,
+ )
+
+ if expected_size is not None and expected_size != temp_file.tell():
+ raise EnvironmentError(
+ consistency_error_message.format(
+ actual_size=temp_file.tell(),
+ )
+ )
+
+
+def _normalize_etag(etag: Optional[str]) -> Optional[str]:
+ """Normalize ETag HTTP header, so it can be used to create nice filepaths.
+
+ The HTTP spec allows two forms of ETag:
+ ETag: W/"<etag_value>"
+ ETag: "<etag_value>"
+
+ For now, we only expect the second form from the server, but we want to be future-proof so we support both. For
+ more context, see `TestNormalizeEtag` tests and https://github.com/huggingface/huggingface_hub/pull/1428.
+
+ Args:
+ etag (`str`, *optional*): HTTP header
+
+ Returns:
+ `str` or `None`: string that can be used as a nice directory name.
+ Returns `None` if input is None.
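+
+    Example (doctest-style sketch with illustrative values):
+
+    ```python
+    >>> _normalize_etag('"a16a55fda99d2f2e7b69cce5cf93ff4ad3049930"')
+    'a16a55fda99d2f2e7b69cce5cf93ff4ad3049930'
+    >>> _normalize_etag('W/"a16a55fda99d2f2e7b69cce5cf93ff4ad3049930"')
+    'a16a55fda99d2f2e7b69cce5cf93ff4ad3049930'
+    >>> _normalize_etag(None) is None
+    True
+    ```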
+ """
+ if etag is None:
+ return None
+ return etag.lstrip("W/").strip('"')
+
+
+def _create_relative_symlink(src: str, dst: str, new_blob: bool = False) -> None:
+ """Alias method used in `transformers` conversion script."""
+ return _create_symlink(src=src, dst=dst, new_blob=new_blob)
+
+
+def _create_symlink(src: str, dst: str, new_blob: bool = False) -> None:
+ """Create a symbolic link named dst pointing to src.
+
+ By default, it will try to create a symlink using a relative path. Relative paths have 2 advantages:
+ - If the cache_folder is moved (example: back-up on a shared drive), relative paths within the cache folder will
+ not break.
+    - Relative paths seem to be better handled on Windows. The issue was reported 3 times in less than a week when
+      changing from relative to absolute paths. See https://github.com/huggingface/huggingface_hub/issues/1398,
+      https://github.com/huggingface/diffusers/issues/2729 and https://github.com/huggingface/transformers/pull/22228.
+      NOTE: The issue with absolute paths doesn't happen in admin mode.
+ When creating a symlink from the cache to a local folder, it is possible that a relative path cannot be created.
+ This happens when paths are not on the same volume. In that case, we use absolute paths.
+
+
+ The result layout looks something like
+ └── [ 128] snapshots
+ ├── [ 128] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f
+ │ ├── [ 52] README.md -> ../../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
+ │ └── [ 76] pytorch_model.bin -> ../../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
+
+ If symlinks cannot be created on this platform (most likely to be Windows), the workaround is to avoid symlinks by
+ having the actual file in `dst`. If it is a new file (`new_blob=True`), we move it to `dst`. If it is not a new file
+ (`new_blob=False`), we don't know if the blob file is already referenced elsewhere. To avoid breaking existing
+ cache, the file is duplicated on the disk.
+
+ In case symlinks are not supported, a warning message is displayed to the user once when loading `huggingface_hub`.
+    The warning message can be disabled with the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable.
+ """
+ try:
+ os.remove(dst)
+ except OSError:
+ pass
+
+ abs_src = os.path.abspath(os.path.expanduser(src))
+ abs_dst = os.path.abspath(os.path.expanduser(dst))
+ abs_dst_folder = os.path.dirname(abs_dst)
+
+    # Prefer a relative source path for the symlink
+ try:
+ relative_src = os.path.relpath(abs_src, abs_dst_folder)
+ except ValueError:
+ # Raised on Windows if src and dst are not on the same volume. This is the case when creating a symlink to a
+ # local_dir instead of within the cache directory.
+ # See https://docs.python.org/3/library/os.path.html#os.path.relpath
+ relative_src = None
+
+ try:
+ commonpath = os.path.commonpath([abs_src, abs_dst])
+ _support_symlinks = are_symlinks_supported(commonpath)
+ except ValueError:
+ # Raised if src and dst are not on the same volume. Symlinks will still work on Linux/Macos.
+ # See https://docs.python.org/3/library/os.path.html#os.path.commonpath
+ _support_symlinks = os.name != "nt"
+ except PermissionError:
+ # Permission error means src and dst are not in the same volume (e.g. destination path has been provided
+ # by the user via `local_dir`. Let's test symlink support there)
+ _support_symlinks = are_symlinks_supported(abs_dst_folder)
+ except OSError as e:
+ # OS error (errno=30) means that the commonpath is readonly on Linux/MacOS.
+ if e.errno == errno.EROFS:
+ _support_symlinks = are_symlinks_supported(abs_dst_folder)
+ else:
+ raise
+
+ # Symlinks are supported => let's create a symlink.
+ if _support_symlinks:
+ src_rel_or_abs = relative_src or abs_src
+ logger.debug(f"Creating pointer from {src_rel_or_abs} to {abs_dst}")
+ try:
+ os.symlink(src_rel_or_abs, abs_dst)
+ return
+ except FileExistsError:
+ if os.path.islink(abs_dst) and os.path.realpath(abs_dst) == os.path.realpath(abs_src):
+ # `abs_dst` already exists and is a symlink to the `abs_src` blob. It is most likely that the file has
+ # been cached twice concurrently (exactly between `os.remove` and `os.symlink`). Do nothing.
+ return
+ else:
+ # Very unlikely to happen. Means a file `dst` has been created exactly between `os.remove` and
+ # `os.symlink` and is not a symlink to the `abs_src` blob file. Raise exception.
+ raise
+ except PermissionError:
+ # Permission error means src and dst are not in the same volume (e.g. download to local dir) and symlink
+ # is supported on both volumes but not between them. Let's just make a hard copy in that case.
+ pass
+
+ # Symlinks are not supported => let's move or copy the file.
+ if new_blob:
+ logger.info(f"Symlink not supported. Moving file from {abs_src} to {abs_dst}")
+ shutil.move(abs_src, abs_dst, copy_function=_copy_no_matter_what)
+ else:
+ logger.info(f"Symlink not supported. Copying file from {abs_src} to {abs_dst}")
+ shutil.copyfile(abs_src, abs_dst)
+
+
+def _cache_commit_hash_for_specific_revision(storage_folder: str, revision: str, commit_hash: str) -> None:
+ """Cache reference between a revision (tag, branch or truncated commit hash) and the corresponding commit hash.
+
+ Does nothing if `revision` is already a proper `commit_hash` or reference is already cached.
+ """
+ if revision != commit_hash:
+ ref_path = Path(storage_folder) / "refs" / revision
+ ref_path.parent.mkdir(parents=True, exist_ok=True)
+ if not ref_path.exists() or commit_hash != ref_path.read_text():
+            # Update the ref only if it has changed. Writing unconditionally could cause
+            # a useless error when the repo is already cached and the user doesn't have
+            # write access to the cache folder.
+ # See https://github.com/huggingface/huggingface_hub/issues/1216.
+ ref_path.write_text(commit_hash)
+
+
+@validate_hf_hub_args
+def repo_folder_name(*, repo_id: str, repo_type: str) -> str:
+ """Return a serialized version of a hf.co repo name and type, safe for disk storage
+ as a single non-nested folder.
+
+ Example: models--julien-c--EsperBERTo-small
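+
+    A doctest-style sketch (assuming the default `--` separator from `constants.REPO_ID_SEPARATOR`):
+
+    ```python
+    >>> repo_folder_name(repo_id="julien-c/EsperBERTo-small", repo_type="model")
+    'models--julien-c--EsperBERTo-small'
+    ```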
+ """
+ # remove all `/` occurrences to correctly convert repo to directory name
+ parts = [f"{repo_type}s", *repo_id.split("/")]
+ return constants.REPO_ID_SEPARATOR.join(parts)
+
+
+def _check_disk_space(expected_size: int, target_dir: Union[str, Path]) -> None:
+ """Check disk usage and log a warning if there is not enough disk space to download the file.
+
+ Args:
+ expected_size (`int`):
+ The expected size of the file in bytes.
+ target_dir (`str`):
+ The directory where the file will be stored after downloading.
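+
+    Example (illustrative; the function only warns, it never raises):
+
+    ```python
+    >>> _check_disk_space(10 * 1024**3, "/tmp")  # warns if less than ~10 GiB is free
+    ```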
+ """
+
+ target_dir = Path(target_dir) # format as `Path`
+    for path in [target_dir] + list(target_dir.parents):  # first check target_dir, then each parent one by one
+ try:
+ target_dir_free = shutil.disk_usage(path).free
+ if target_dir_free < expected_size:
+ warnings.warn(
+ "Not enough free disk space to download the file. "
+ f"The expected file size is: {expected_size / 1e6:.2f} MB. "
+ f"The target location {target_dir} only has {target_dir_free / 1e6:.2f} MB free disk space."
+ )
+ return
+        except OSError:  # may fail on any path: file does not exist or disk space cannot be checked
+ pass
+
+
+@validate_hf_hub_args
+def hf_hub_download(
+ repo_id: str,
+ filename: str,
+ *,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ library_name: Optional[str] = None,
+ library_version: Optional[str] = None,
+ cache_dir: Union[str, Path, None] = None,
+ local_dir: Union[str, Path, None] = None,
+ user_agent: Union[Dict, str, None] = None,
+ force_download: bool = False,
+ proxies: Optional[Dict] = None,
+ etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
+ token: Union[bool, str, None] = None,
+ local_files_only: bool = False,
+ headers: Optional[Dict[str, str]] = None,
+ endpoint: Optional[str] = None,
+ resume_download: Optional[bool] = None,
+ force_filename: Optional[str] = None,
+ local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
+) -> str:
+ """Download a given file if it's not already present in the local cache.
+
+ The new cache file layout looks like this:
+ - The cache directory contains one subfolder per repo_id (namespaced by repo type)
+ - inside each repo folder:
+ - refs is a list of the latest known revision => commit_hash pairs
+ - blobs contains the actual file blobs (identified by their git-sha or sha256, depending on
+ whether they're LFS files or not)
+ - snapshots contains one subfolder per commit, each "commit" contains the subset of the files
+ that have been resolved at that particular commit. Each filename is a symlink to the blob
+ at that particular commit.
+
+ ```
+ [ 96] .
+ └── [ 160] models--julien-c--EsperBERTo-small
+ ├── [ 160] blobs
+ │ ├── [321M] 403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
+ │ ├── [ 398] 7cb18dc9bafbfcf74629a4b760af1b160957a83e
+ │ └── [1.4K] d7edf6bd2a681fb0175f7735299831ee1b22b812
+ ├── [ 96] refs
+ │ └── [ 40] main
+ └── [ 128] snapshots
+ ├── [ 128] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f
+ │ ├── [ 52] README.md -> ../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
+ │ └── [ 76] pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
+ └── [ 128] bbc77c8132af1cc5cf678da3f1ddf2de43606d48
+ ├── [ 52] README.md -> ../../blobs/7cb18dc9bafbfcf74629a4b760af1b160957a83e
+ └── [ 76] pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
+ ```
+
+ If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this
+ option, the `cache_dir` will not be used and a `.cache/huggingface/` folder will be created at the root of `local_dir`
+ to store some metadata related to the downloaded files. While this mechanism is not as robust as the main
+ cache-system, it's optimized for regularly pulling the latest version of a repository.
+
+ Args:
+ repo_id (`str`):
+ A user or an organization name and a repo name separated by a `/`.
+ filename (`str`):
+ The name of the file in the repo.
+ subfolder (`str`, *optional*):
+ An optional value corresponding to a folder inside the model repo.
+ repo_type (`str`, *optional*):
+ Set to `"dataset"` or `"space"` if downloading from a dataset or space,
+ `None` or `"model"` if downloading from a model. Default is `None`.
+ revision (`str`, *optional*):
+ An optional Git revision id which can be a branch name, a tag, or a
+ commit hash.
+ library_name (`str`, *optional*):
+ The name of the library to which the object corresponds.
+ library_version (`str`, *optional*):
+ The version of the library.
+ cache_dir (`str`, `Path`, *optional*):
+ Path to the folder where cached files are stored.
+ local_dir (`str` or `Path`, *optional*):
+ If provided, the downloaded file will be placed under this directory.
+ user_agent (`dict`, `str`, *optional*):
+ The user-agent info in the form of a dictionary or a string.
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether the file should be downloaded even if it already exists in
+ the local cache.
+ proxies (`dict`, *optional*):
+ Dictionary mapping protocol to the URL of the proxy passed to
+ `requests.request`.
+        etag_timeout (`float`, *optional*, defaults to `10`):
+            When fetching ETag, how many seconds to wait for the server to send
+            data before giving up; passed to `requests.request`.
+ token (`str`, `bool`, *optional*):
+ A token to be used for the download.
+ - If `True`, the token is read from the HuggingFace config
+ folder.
+ - If a string, it's used as the authentication token.
+ local_files_only (`bool`, *optional*, defaults to `False`):
+ If `True`, avoid downloading the file and return the path to the
+ local cached file if it exists.
+ headers (`dict`, *optional*):
+ Additional headers to be sent with the request.
+
+ Returns:
+        `str`: Local path of the file or, if networking is off, the last version of the file cached on disk.
+
+ Raises:
+ [`~utils.RepositoryNotFoundError`]
+ If the repository to download from cannot be found. This may be because it doesn't exist,
+ or because it is set to `private` and you do not have access.
+ [`~utils.RevisionNotFoundError`]
+ If the revision to download from cannot be found.
+ [`~utils.EntryNotFoundError`]
+ If the file to download cannot be found.
+ [`~utils.LocalEntryNotFoundError`]
+ If network is disabled or unavailable and file is not found in cache.
+ [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
+ If `token=True` but the token cannot be found.
+ [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError)
+ If ETag cannot be determined.
+ [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
+ If some parameter value is invalid.
+
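+    Example (a minimal usage sketch; the returned path follows the cache layout shown above):
+
+    ```python
+    >>> from huggingface_hub import hf_hub_download
+    >>> path = hf_hub_download(repo_id="julien-c/EsperBERTo-small", filename="README.md")
+    >>> path  # e.g. '<cache_dir>/models--julien-c--EsperBERTo-small/snapshots/<commit_hash>/README.md'
+    ```
+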
+ """
+ if constants.HF_HUB_ETAG_TIMEOUT != constants.DEFAULT_ETAG_TIMEOUT:
+        # Respect environment variable over the user-provided value
+ etag_timeout = constants.HF_HUB_ETAG_TIMEOUT
+
+ if force_filename is not None:
+ warnings.warn(
+ "The `force_filename` parameter is deprecated as a new caching system, "
+ "which keeps the filenames as they are on the Hub, is now in place.",
+ FutureWarning,
+ )
+ if resume_download is not None:
+ warnings.warn(
+ "`resume_download` is deprecated and will be removed in version 1.0.0. "
+ "Downloads always resume when possible. "
+ "If you want to force a new download, use `force_download=True`.",
+ FutureWarning,
+ )
+
+ if cache_dir is None:
+ cache_dir = constants.HF_HUB_CACHE
+ if revision is None:
+ revision = constants.DEFAULT_REVISION
+ if isinstance(cache_dir, Path):
+ cache_dir = str(cache_dir)
+ if isinstance(local_dir, Path):
+ local_dir = str(local_dir)
+
+ if subfolder == "":
+ subfolder = None
+ if subfolder is not None:
+ # This is used to create a URL, and not a local path, hence the forward slash.
+ filename = f"{subfolder}/{filename}"
+
+ if repo_type is None:
+ repo_type = "model"
+ if repo_type not in constants.REPO_TYPES:
+ raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES)}")
+
+ hf_headers = build_hf_headers(
+ token=token,
+ library_name=library_name,
+ library_version=library_version,
+ user_agent=user_agent,
+ headers=headers,
+ )
+
+ if local_dir is not None:
+ if local_dir_use_symlinks != "auto":
+ warnings.warn(
+ "`local_dir_use_symlinks` parameter is deprecated and will be ignored. "
+ "The process to download files to a local folder has been updated and do "
+ "not rely on symlinks anymore. You only need to pass a destination folder "
+ "as`local_dir`.\n"
+ "For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder."
+ )
+
+ return _hf_hub_download_to_local_dir(
+ # Destination
+ local_dir=local_dir,
+ # File info
+ repo_id=repo_id,
+ repo_type=repo_type,
+ filename=filename,
+ revision=revision,
+ # HTTP info
+ endpoint=endpoint,
+ etag_timeout=etag_timeout,
+ headers=hf_headers,
+ proxies=proxies,
+ token=token,
+ # Additional options
+ cache_dir=cache_dir,
+ force_download=force_download,
+ local_files_only=local_files_only,
+ )
+ else:
+ return _hf_hub_download_to_cache_dir(
+ # Destination
+ cache_dir=cache_dir,
+ # File info
+ repo_id=repo_id,
+ filename=filename,
+ repo_type=repo_type,
+ revision=revision,
+ # HTTP info
+ endpoint=endpoint,
+ etag_timeout=etag_timeout,
+ headers=hf_headers,
+ proxies=proxies,
+ token=token,
+ # Additional options
+ local_files_only=local_files_only,
+ force_download=force_download,
+ )
+
+
+def _hf_hub_download_to_cache_dir(
+ *,
+ # Destination
+ cache_dir: str,
+ # File info
+ repo_id: str,
+ filename: str,
+ repo_type: str,
+ revision: str,
+ # HTTP info
+ endpoint: Optional[str],
+ etag_timeout: float,
+ headers: Dict[str, str],
+ proxies: Optional[Dict],
+ token: Optional[Union[bool, str]],
+ # Additional options
+ local_files_only: bool,
+ force_download: bool,
+) -> str:
+ """Download a given file to a cache folder, if not already present.
+
+ Method should not be called directly. Please use `hf_hub_download` instead.
+ """
+ locks_dir = os.path.join(cache_dir, ".locks")
+ storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
+
+ # cross platform transcription of filename, to be used as a local file path.
+ relative_filename = os.path.join(*filename.split("/"))
+ if os.name == "nt":
+ if relative_filename.startswith("..\\") or "\\..\\" in relative_filename:
+ raise ValueError(
+ f"Invalid filename: cannot handle filename '{relative_filename}' on Windows. Please ask the repository"
+ " owner to rename this file."
+ )
+
+ # if user provides a commit_hash and they already have the file on disk, shortcut everything.
+ if REGEX_COMMIT_HASH.match(revision):
+ pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
+ if os.path.exists(pointer_path) and not force_download:
+ return pointer_path
+
+ # Try to get metadata (etag, commit_hash, url, size) from the server.
+ # If we can't, a HEAD request error is returned.
+ (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
+ repo_id=repo_id,
+ filename=filename,
+ repo_type=repo_type,
+ revision=revision,
+ endpoint=endpoint,
+ proxies=proxies,
+ etag_timeout=etag_timeout,
+ headers=headers,
+ token=token,
+ local_files_only=local_files_only,
+ storage_folder=storage_folder,
+ relative_filename=relative_filename,
+ )
+
+ # etag can be None for several reasons:
+ # 1. we passed local_files_only.
+ # 2. we don't have a connection
+ # 3. Hub is down (HTTP 500, 503, 504)
+    # 4. repo is not found (for example private or gated) and invalid/missing token sent
+ # 5. Hub is blocked by a firewall or proxy is not set correctly.
+ # => Try to get the last downloaded one from the specified revision.
+ #
+ # If the specified revision is a commit hash, look inside "snapshots".
+ # If the specified revision is a branch or tag, look inside "refs".
+ if head_call_error is not None:
+ # Couldn't make a HEAD call => let's try to find a local file
+ if not force_download:
+ commit_hash = None
+ if REGEX_COMMIT_HASH.match(revision):
+ commit_hash = revision
+ else:
+ ref_path = os.path.join(storage_folder, "refs", revision)
+ if os.path.isfile(ref_path):
+ with open(ref_path) as f:
+ commit_hash = f.read()
+
+ # Return pointer file if exists
+ if commit_hash is not None:
+ pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
+ if os.path.exists(pointer_path) and not force_download:
+ return pointer_path
+
+ # Otherwise, raise appropriate error
+ _raise_on_head_call_error(head_call_error, force_download, local_files_only)
+
+ # From now on, etag, commit_hash, url and size are not None.
+ assert etag is not None, "etag must have been retrieved from server"
+ assert commit_hash is not None, "commit_hash must have been retrieved from server"
+ assert url_to_download is not None, "file location must have been retrieved from server"
+ assert expected_size is not None, "expected_size must have been retrieved from server"
+ blob_path = os.path.join(storage_folder, "blobs", etag)
+ pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
+
+ os.makedirs(os.path.dirname(blob_path), exist_ok=True)
+ os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
+
+ # if passed revision is not identical to commit_hash
+ # then revision has to be a branch name or tag name.
+ # In that case store a ref.
+ _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
+
+ # If file already exists, return it (except if force_download=True)
+ if not force_download:
+ if os.path.exists(pointer_path):
+ return pointer_path
+
+ if os.path.exists(blob_path):
+ # we have the blob already, but not the pointer
+ _create_symlink(blob_path, pointer_path, new_blob=False)
+ return pointer_path
+
+    # Prevent parallel downloads of the same file with a lock.
+    # etag could be duplicated across repos, so the lock is scoped to the repo folder.
+ lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock")
+
+ # Some Windows versions do not allow for paths longer than 255 characters.
+ # In this case, we must specify it as an extended path by using the "\\?\" prefix.
+ if os.name == "nt" and len(os.path.abspath(lock_path)) > 255:
+ lock_path = "\\\\?\\" + os.path.abspath(lock_path)
+
+ if os.name == "nt" and len(os.path.abspath(blob_path)) > 255:
+ blob_path = "\\\\?\\" + os.path.abspath(blob_path)
+
+ Path(lock_path).parent.mkdir(parents=True, exist_ok=True)
+ with WeakFileLock(lock_path):
+ _download_to_tmp_and_move(
+ incomplete_path=Path(blob_path + ".incomplete"),
+ destination_path=Path(blob_path),
+ url_to_download=url_to_download,
+ proxies=proxies,
+ headers=headers,
+ expected_size=expected_size,
+ filename=filename,
+ force_download=force_download,
+ )
+ if not os.path.exists(pointer_path):
+ _create_symlink(blob_path, pointer_path, new_blob=True)
+
+ return pointer_path
+
+
+def _hf_hub_download_to_local_dir(
+ *,
+ # Destination
+ local_dir: Union[str, Path],
+ # File info
+ repo_id: str,
+ repo_type: str,
+ filename: str,
+ revision: str,
+ # HTTP info
+ endpoint: Optional[str],
+ etag_timeout: float,
+ headers: Dict[str, str],
+ proxies: Optional[Dict],
+ token: Union[bool, str, None],
+ # Additional options
+ cache_dir: str,
+ force_download: bool,
+ local_files_only: bool,
+) -> str:
+ """Download a given file to a local folder, if not already present.
+
+ Method should not be called directly. Please use `hf_hub_download` instead.
+ """
+ # Some Windows versions do not allow for paths longer than 255 characters.
+ # In this case, we must specify it as an extended path by using the "\\?\" prefix.
+ if os.name == "nt" and len(os.path.abspath(local_dir)) > 255:
+ local_dir = "\\\\?\\" + os.path.abspath(local_dir)
+ local_dir = Path(local_dir)
+ paths = get_local_download_paths(local_dir=local_dir, filename=filename)
+ local_metadata = read_download_metadata(local_dir=local_dir, filename=filename)
+
+ # Local file exists + metadata exists + commit_hash matches => return file
+ if (
+ not force_download
+ and REGEX_COMMIT_HASH.match(revision)
+ and paths.file_path.is_file()
+ and local_metadata is not None
+ and local_metadata.commit_hash == revision
+ ):
+ return str(paths.file_path)
+
+ # Local file doesn't exist or commit_hash doesn't match => we need the etag
+ (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
+ repo_id=repo_id,
+ filename=filename,
+ repo_type=repo_type,
+ revision=revision,
+ endpoint=endpoint,
+ proxies=proxies,
+ etag_timeout=etag_timeout,
+ headers=headers,
+ token=token,
+ local_files_only=local_files_only,
+ )
+
+ if head_call_error is not None:
+ # No HEAD call but local file exists => default to local file
+ if not force_download and paths.file_path.is_file():
+ logger.warning(
+ f"Couldn't access the Hub to check for update but local file already exists. Defaulting to existing file. (error: {head_call_error})"
+ )
+ return str(paths.file_path)
+ # Otherwise => raise
+ _raise_on_head_call_error(head_call_error, force_download, local_files_only)
+
+ # From now on, etag, commit_hash, url and size are not None.
+ assert etag is not None, "etag must have been retrieved from server"
+ assert commit_hash is not None, "commit_hash must have been retrieved from server"
+ assert url_to_download is not None, "file location must have been retrieved from server"
+ assert expected_size is not None, "expected_size must have been retrieved from server"
+
+ # Local file exists => check if it's up-to-date
+ if not force_download and paths.file_path.is_file():
+ # etag matches => update metadata and return file
+ if local_metadata is not None and local_metadata.etag == etag:
+ write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+ return str(paths.file_path)
+
+ # metadata is outdated + etag is a sha256
+ # => means it's an LFS file (large)
+ # => let's compute local hash and compare
+ # => if match, update metadata and return file
+ if local_metadata is None and REGEX_SHA256.match(etag) is not None:
+ with open(paths.file_path, "rb") as f:
+ file_hash = sha_fileobj(f).hex()
+ if file_hash == etag:
+ write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+ return str(paths.file_path)
+
+ # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
+
+ # If we are lucky enough, the file is already in the cache => copy it
+ if not force_download:
+ cached_path = try_to_load_from_cache(
+ repo_id=repo_id,
+ filename=filename,
+ cache_dir=cache_dir,
+ revision=commit_hash,
+ repo_type=repo_type,
+ )
+ if isinstance(cached_path, str):
+ with WeakFileLock(paths.lock_path):
+ paths.file_path.parent.mkdir(parents=True, exist_ok=True)
+ shutil.copyfile(cached_path, paths.file_path)
+ write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+ return str(paths.file_path)
+
+ # Otherwise, let's download the file!
+ with WeakFileLock(paths.lock_path):
+ paths.file_path.unlink(missing_ok=True) # delete outdated file first
+ _download_to_tmp_and_move(
+ incomplete_path=paths.incomplete_path(etag),
+ destination_path=paths.file_path,
+ url_to_download=url_to_download,
+ proxies=proxies,
+ headers=headers,
+ expected_size=expected_size,
+ filename=filename,
+ force_download=force_download,
+ )
+
+ write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+ return str(paths.file_path)
+
+
+@validate_hf_hub_args
+def try_to_load_from_cache(
+ repo_id: str,
+ filename: str,
+ cache_dir: Union[str, Path, None] = None,
+ revision: Optional[str] = None,
+ repo_type: Optional[str] = None,
+) -> Union[str, _CACHED_NO_EXIST_T, None]:
+ """
+ Explores the cache to return the latest cached file for a given revision if found.
+
+    This function will not raise any exception if the file is not cached.
+
+ Args:
+ cache_dir (`str` or `os.PathLike`):
+ The folder where the cached files lie.
+ repo_id (`str`):
+ The ID of the repo on huggingface.co.
+ filename (`str`):
+ The filename to look for inside `repo_id`.
+ revision (`str`, *optional*):
+ The specific model version to use. Will default to `"main"` if it's not provided and no `commit_hash` is
+ provided either.
+ repo_type (`str`, *optional*):
+ The type of the repository. Will default to `"model"`.
+
+ Returns:
+ `Optional[str]` or `_CACHED_NO_EXIST`:
+ Will return `None` if the file was not cached. Otherwise:
+ - The exact path to the cached file if it's found in the cache
+ - A special value `_CACHED_NO_EXIST` if the file does not exist at the given commit hash and this fact was
+ cached.
+
+ Example:
+
+ ```python
+ from huggingface_hub import try_to_load_from_cache, _CACHED_NO_EXIST
+
+    filepath = try_to_load_from_cache(repo_id="julien-c/EsperBERTo-small", filename="README.md")
+ if isinstance(filepath, str):
+ # file exists and is cached
+ ...
+ elif filepath is _CACHED_NO_EXIST:
+ # non-existence of file is cached
+ ...
+ else:
+ # file is not cached
+ ...
+ ```
+ """
+ if revision is None:
+ revision = "main"
+ if repo_type is None:
+ repo_type = "model"
+ if repo_type not in constants.REPO_TYPES:
+ raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(constants.REPO_TYPES)}")
+ if cache_dir is None:
+ cache_dir = constants.HF_HUB_CACHE
+
+ object_id = repo_id.replace("/", "--")
+ repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}")
+ if not os.path.isdir(repo_cache):
+ # No cache for this model
+ return None
+
+ refs_dir = os.path.join(repo_cache, "refs")
+ snapshots_dir = os.path.join(repo_cache, "snapshots")
+ no_exist_dir = os.path.join(repo_cache, ".no_exist")
+
+ # Resolve refs (for instance to convert main to the associated commit sha)
+ if os.path.isdir(refs_dir):
+ revision_file = os.path.join(refs_dir, revision)
+ if os.path.isfile(revision_file):
+ with open(revision_file) as f:
+ revision = f.read()
+
+ # Check if file is cached as "no_exist"
+ if os.path.isfile(os.path.join(no_exist_dir, revision, filename)):
+ return _CACHED_NO_EXIST
+
+ # Check if revision folder exists
+ if not os.path.exists(snapshots_dir):
+ return None
+ cached_shas = os.listdir(snapshots_dir)
+ if revision not in cached_shas:
+ # No cache for this revision and we won't try to return a random revision
+ return None
+
+ # Check if file exists in cache
+ cached_file = os.path.join(snapshots_dir, revision, filename)
+ return cached_file if os.path.isfile(cached_file) else None
+
+
+@validate_hf_hub_args
+def get_hf_file_metadata(
+ url: str,
+ token: Union[bool, str, None] = None,
+ proxies: Optional[Dict] = None,
+ timeout: Optional[float] = constants.DEFAULT_REQUEST_TIMEOUT,
+ library_name: Optional[str] = None,
+ library_version: Optional[str] = None,
+ user_agent: Union[Dict, str, None] = None,
+ headers: Optional[Dict[str, str]] = None,
+) -> HfFileMetadata:
+ """Fetch metadata of a file versioned on the Hub for a given url.
+
+ Args:
+ url (`str`):
+ File url, for example returned by [`hf_hub_url`].
+ token (`str` or `bool`, *optional*):
+ A token to be used for the download.
+ - If `True`, the token is read from the HuggingFace config
+ folder.
+ - If `False` or `None`, no token is provided.
+ - If a string, it's used as the authentication token.
+ proxies (`dict`, *optional*):
+ Dictionary mapping protocol to the URL of the proxy passed to
+ `requests.request`.
+ timeout (`float`, *optional*, defaults to 10):
+ How many seconds to wait for the server to send metadata before giving up.
+ library_name (`str`, *optional*):
+ The name of the library to which the object corresponds.
+ library_version (`str`, *optional*):
+ The version of the library.
+ user_agent (`dict`, `str`, *optional*):
+ The user-agent info in the form of a dictionary or a string.
+ headers (`dict`, *optional*):
+ Additional headers to be sent with the request.
+
+ Returns:
+ A [`HfFileMetadata`] object containing metadata such as location, etag, size and
+ commit_hash.
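+
+    Example (a minimal sketch using `hf_hub_url` from this module):
+
+    ```python
+    >>> from huggingface_hub import get_hf_file_metadata, hf_hub_url
+    >>> url = hf_hub_url(repo_id="julien-c/EsperBERTo-small", filename="pytorch_model.bin")
+    >>> metadata = get_hf_file_metadata(url)
+    >>> metadata.etag  # sha256 for an LFS file, git-sha1 otherwise
+    ```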
+ """
+ hf_headers = build_hf_headers(
+ token=token,
+ library_name=library_name,
+ library_version=library_version,
+ user_agent=user_agent,
+ headers=headers,
+ )
+ hf_headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file
+
+ # Retrieve metadata
+ r = _request_wrapper(
+ method="HEAD",
+ url=url,
+ headers=hf_headers,
+ allow_redirects=False,
+ follow_relative_redirects=True,
+ proxies=proxies,
+ timeout=timeout,
+ )
+ hf_raise_for_status(r)
+
+ # Return
+ return HfFileMetadata(
+ commit_hash=r.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT),
+ # We favor a custom header indicating the etag of the linked resource, and
+        # we fall back to the regular etag header.
+ etag=_normalize_etag(r.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_ETAG) or r.headers.get("ETag")),
+ # Either from response headers (if redirected) or defaults to request url
+ # Do not use directly `url`, as `_request_wrapper` might have followed relative
+ # redirects.
+ location=r.headers.get("Location") or r.request.url, # type: ignore
+ size=_int_or_none(
+ r.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_SIZE) or r.headers.get("Content-Length")
+ ),
+ )
+
+
+def _get_metadata_or_catch_error(
+ *,
+ repo_id: str,
+ filename: str,
+ repo_type: str,
+ revision: str,
+ endpoint: Optional[str],
+ proxies: Optional[Dict],
+ etag_timeout: Optional[float],
+ headers: Dict[str, str], # mutated inplace!
+ token: Union[bool, str, None],
+ local_files_only: bool,
+ relative_filename: Optional[str] = None, # only used to store `.no_exists` in cache
+ storage_folder: Optional[str] = None, # only used to store `.no_exists` in cache
+) -> Union[
+ # Either an exception is caught and returned
+ Tuple[None, None, None, None, Exception],
+ # Or the metadata is returned as
+ # `(url_to_download, etag, commit_hash, expected_size, None)`
+ Tuple[str, str, str, int, None],
+]:
+ """Get metadata for a file on the Hub, safely handling network issues.
+
+ Returns either the etag, commit_hash and expected size of the file, or the error
+ raised while fetching the metadata.
+
+ NOTE: This function mutates `headers` inplace! It removes the `authorization` header
+    if the file is an LFS blob and the domain of the url is different from the
+ domain of the location (typically an S3 bucket).
+ """
+ if local_files_only:
+ return (
+ None,
+ None,
+ None,
+ None,
+ OfflineModeIsEnabled(
+ f"Cannot access file since 'local_files_only=True' as been set. (repo_id: {repo_id}, repo_type: {repo_type}, revision: {revision}, filename: {filename})"
+ ),
+ )
+
+ url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint)
+ url_to_download: str = url
+ etag: Optional[str] = None
+ commit_hash: Optional[str] = None
+ expected_size: Optional[int] = None
+ head_error_call: Optional[Exception] = None
+
+ # Try to get metadata from the server.
+ # Do not raise yet if the file is not found or not accessible.
+ if not local_files_only:
+ try:
+ try:
+ metadata = get_hf_file_metadata(
+ url=url, proxies=proxies, timeout=etag_timeout, headers=headers, token=token
+ )
+ except EntryNotFoundError as http_error:
+ if storage_folder is not None and relative_filename is not None:
+ # Cache the non-existence of the file
+ commit_hash = http_error.response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT)
+ if commit_hash is not None:
+ no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename
+ try:
+ no_exist_file_path.parent.mkdir(parents=True, exist_ok=True)
+ no_exist_file_path.touch()
+ except OSError as e:
+ logger.error(
+ f"Could not cache non-existence of file. Will ignore error and continue. Error: {e}"
+ )
+ _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
+ raise
+
+ # Commit hash must exist
+ commit_hash = metadata.commit_hash
+ if commit_hash is None:
+ raise FileMetadataError(
+ "Distant resource does not seem to be on huggingface.co. It is possible that a configuration issue"
+ " prevents you from downloading resources from https://huggingface.co. Please check your firewall"
+ " and proxy settings and make sure your SSL certificates are updated."
+ )
+
+ # Etag must exist
+ # If we don't have any of those, raise an error.
+ etag = metadata.etag
+ if etag is None:
+ raise FileMetadataError(
+ "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
+ )
+
+ # Size must exist
+ expected_size = metadata.size
+ if expected_size is None:
+ raise FileMetadataError("Distant resource does not have a Content-Length.")
+
+ # In case of a redirect, save an extra redirect on the request.get call,
+ # and ensure we download the exact atomic version even if it changed
+ # between the HEAD and the GET (unlikely, but hey).
+ #
+ # If url domain is different => we are downloading from a CDN => url is signed => don't send auth
+ # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth
+ if url != metadata.location:
+ url_to_download = metadata.location
+ if urlparse(url).netloc != urlparse(metadata.location).netloc:
+ # Remove authorization header when downloading a LFS blob
+ headers.pop("authorization", None)
+ except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
+ # Actually raise for those subclasses of ConnectionError
+ raise
+ except (
+ requests.exceptions.ConnectionError,
+ requests.exceptions.Timeout,
+ OfflineModeIsEnabled,
+ ) as error:
+ # Otherwise, our Internet connection is down.
+ # etag is None
+ head_error_call = error
+ except (RevisionNotFoundError, EntryNotFoundError):
+ # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
+ raise
+ except requests.HTTPError as error:
+ # Multiple reasons for an http error:
+ # - Repository is private and invalid/missing token sent
+ # - Repository is gated and invalid/missing token sent
+ # - Hub is down (error 500 or 504)
+ # => let's switch to 'local_files_only=True' to check if the files are already cached.
+ # (if it's not the case, the error will be re-raised)
+ head_error_call = error
+ except FileMetadataError as error:
+ # Multiple reasons for a FileMetadataError:
+ # - Wrong network configuration (proxy, firewall, SSL certificates)
+ # - Inconsistency on the Hub
+ # => let's switch to 'local_files_only=True' to check if the files are already cached.
+ # (if it's not the case, the error will be re-raised)
+ head_error_call = error
+
+ if not (local_files_only or etag is not None or head_error_call is not None):
+ raise RuntimeError("etag is empty due to uncovered problems")
+
+ return (url_to_download, etag, commit_hash, expected_size, head_error_call) # type: ignore [return-value]
+
+
+def _raise_on_head_call_error(head_call_error: Exception, force_download: bool, local_files_only: bool) -> NoReturn:
+ """Raise an appropriate error when the HEAD call failed and we cannot locate a local file."""
+ # No head call => we cannot force download.
+ if force_download:
+ if local_files_only:
+ raise ValueError("Cannot pass 'force_download=True' and 'local_files_only=True' at the same time.")
+ elif isinstance(head_call_error, OfflineModeIsEnabled):
+ raise ValueError("Cannot pass 'force_download=True' when offline mode is enabled.") from head_call_error
+ else:
+ raise ValueError("Force download failed due to the above error.") from head_call_error
+
+ # No head call + couldn't find an appropriate file on disk => raise an error.
+ if local_files_only:
+ raise LocalEntryNotFoundError(
+ "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable"
+ " hf.co look-ups and downloads online, set 'local_files_only' to False."
+ )
+ elif isinstance(head_call_error, (RepositoryNotFoundError, GatedRepoError)) or (
+ isinstance(head_call_error, HfHubHTTPError) and head_call_error.response.status_code == 401
+ ):
+ # Repo not found or gated => let's raise the actual error
+ # Unauthorized => likely a token issue => let's raise the actual error
+ raise head_call_error
+ else:
+ # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
+ raise LocalEntryNotFoundError(
+ "An error happened while trying to locate the file on the Hub and we cannot find the requested files"
+ " in the local cache. Please check your connection and try again or make sure your Internet connection"
+ " is on."
+ ) from head_call_error
+
+
+def _download_to_tmp_and_move(
+ incomplete_path: Path,
+ destination_path: Path,
+ url_to_download: str,
+ proxies: Optional[Dict],
+ headers: Dict[str, str],
+ expected_size: Optional[int],
+ filename: str,
+ force_download: bool,
+) -> None:
+ """Download content from a URL to a destination path.
+
+ Internal logic:
+ - return early if file is already downloaded
+ - resume download if possible (from incomplete file)
+ - do not resume download if `force_download=True` or `HF_HUB_ENABLE_HF_TRANSFER=True`
+ - check disk space before downloading
+ - download content to a temporary file
+ - set correct permissions on temporary file
+ - move the temporary file to the destination path
+
+ Both `incomplete_path` and `destination_path` must be on the same volume to avoid a local copy.
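+
+    A hedged call sketch (internal helper; the paths and URL below are illustrative):
+
+    ```python
+    >>> from pathlib import Path
+    >>> _download_to_tmp_and_move(
+    ...     incomplete_path=Path("blobs/403450e2.incomplete"),
+    ...     destination_path=Path("blobs/403450e2"),
+    ...     url_to_download="https://huggingface.co/julien-c/EsperBERTo-small/resolve/main/pytorch_model.bin",
+    ...     proxies=None,
+    ...     headers={},
+    ...     expected_size=None,
+    ...     filename="pytorch_model.bin",
+    ...     force_download=False,
+    ... )
+    ```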
+ """
+ if destination_path.exists() and not force_download:
+ # Do nothing if already exists (except if force_download=True)
+ return
+
+ if incomplete_path.exists() and (force_download or (constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies)):
+ # By default, we will try to resume the download if possible.
+ # However, if the user has set `force_download=True` or if `hf_transfer` is enabled, then we should
+ # not resume the download => delete the incomplete file.
+ message = f"Removing incomplete file '{incomplete_path}'"
+ if force_download:
+ message += " (force_download=True)"
+ elif constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies:
+ message += " (hf_transfer=True)"
+ logger.info(message)
+ incomplete_path.unlink(missing_ok=True)
+
+ with incomplete_path.open("ab") as f:
+ resume_size = f.tell()
+ message = f"Downloading '{filename}' to '{incomplete_path}'"
+ if resume_size > 0 and expected_size is not None:
+ message += f" (resume from {resume_size}/{expected_size})"
+ logger.info(message)
+
+ if expected_size is not None: # might be None if HTTP header not set correctly
+ # Check disk space in both tmp and destination path
+ _check_disk_space(expected_size, incomplete_path.parent)
+ _check_disk_space(expected_size, destination_path.parent)
+
+ http_get(
+ url_to_download,
+ f,
+ proxies=proxies,
+ resume_size=resume_size,
+ headers=headers,
+ expected_size=expected_size,
+ )
+
+ logger.info(f"Download complete. Moving file to {destination_path}")
+ _chmod_and_move(incomplete_path, destination_path)
+
+
+def _int_or_none(value: Optional[str]) -> Optional[int]:
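+    # e.g. _int_or_none("1024") -> 1024, _int_or_none(None) -> None (illustrative)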
+ try:
+ return int(value) # type: ignore
+ except (TypeError, ValueError):
+ return None
+
+
+def _chmod_and_move(src: Path, dst: Path) -> None:
+ """Set correct permission before moving a blob from tmp directory to cache dir.
+
+ Do not take into account the `umask` from the process as there is no convenient way
+ to get it that is thread-safe.
+
+ See:
+ - About umask: https://docs.python.org/3/library/os.html#os.umask
+ - Thread-safety: https://stackoverflow.com/a/70343066
+ - About solution: https://github.com/huggingface/huggingface_hub/pull/1220#issuecomment-1326211591
+ - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1141
+ - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1215
+ """
+    # Infer the umask-adjusted permission mode by creating a temporary file in the cached repo folder.
+ tmp_file = dst.parent.parent / f"tmp_{uuid.uuid4()}"
+ try:
+ tmp_file.touch()
+ cache_dir_mode = Path(tmp_file).stat().st_mode
+ os.chmod(str(src), stat.S_IMODE(cache_dir_mode))
+ except OSError as e:
+ logger.warning(
+ f"Could not set the permissions on the file '{src}'. Error: {e}.\nContinuing without setting permissions."
+ )
+ finally:
+ try:
+ tmp_file.unlink()
+ except OSError:
+ # fails if `tmp_file.touch()` failed => do nothing
+ # See https://github.com/huggingface/huggingface_hub/issues/2359
+ pass
+
+ shutil.move(str(src), str(dst), copy_function=_copy_no_matter_what)
+
+
+def _copy_no_matter_what(src: str, dst: str) -> None:
+ """Copy file from src to dst.
+
+ If `shutil.copy2` fails, fallback to `shutil.copyfile`.
+ """
+ try:
+ # Copy file with metadata and permission
+ # Can fail e.g. if dst is an S3 mount
+ shutil.copy2(src, dst)
+ except OSError:
+ # Copy only file content
+ shutil.copyfile(src, dst)
+
+
+def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str) -> str:
+ # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks
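+    # Illustrative (POSIX paths): storage_folder="/cache/models--julien-c--EsperBERTo-small",
+    # revision="2439f60ef33a0d46d85da5001d52aeda5b00ce9f", relative_filename="README.md"
+    # => "/cache/models--julien-c--EsperBERTo-small/snapshots/2439f60ef33a0d46d85da5001d52aeda5b00ce9f/README.md"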
+ snapshot_path = os.path.join(storage_folder, "snapshots")
+ pointer_path = os.path.join(snapshot_path, revision, relative_filename)
+ if Path(os.path.abspath(snapshot_path)) not in Path(os.path.abspath(pointer_path)).parents:
+ raise ValueError(
+ "Invalid pointer path: cannot create pointer path in snapshot folder if"
+ f" `storage_folder='{storage_folder}'`, `revision='{revision}'` and"
+ f" `relative_filename='{relative_filename}'`."
+ )
+ return pointer_path