diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/huggingface_hub/utils/sha.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/huggingface_hub/utils/sha.py | 64 |
1 files changed, 64 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/utils/sha.py b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/sha.py new file mode 100644 index 00000000..001c3fe8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/sha.py @@ -0,0 +1,64 @@ +"""Utilities to efficiently compute the SHA 256 hash of a bunch of bytes.""" + +from typing import BinaryIO, Optional + +from .insecure_hashlib import sha1, sha256 + + +def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes: + """ + Computes the sha256 hash of the given file object, by chunks of size `chunk_size`. + + Args: + fileobj (file-like object): + The File object to compute sha256 for, typically obtained with `open(path, "rb")` + chunk_size (`int`, *optional*): + The number of bytes to read from `fileobj` at once, defaults to 1MB. + + Returns: + `bytes`: `fileobj`'s sha256 hash as bytes + """ + chunk_size = chunk_size if chunk_size is not None else 1024 * 1024 + + sha = sha256() + while True: + chunk = fileobj.read(chunk_size) + sha.update(chunk) + if not chunk: + break + return sha.digest() + + +def git_hash(data: bytes) -> str: + """ + Computes the git-sha1 hash of the given bytes, using the same algorithm as git. + + This is equivalent to running `git hash-object`. See https://git-scm.com/docs/git-hash-object + for more details. + + Note: this method is valid for regular files. For LFS files, the proper git hash is supposed to be computed on the + pointer file content, not the actual file content. However, for simplicity, we directly compare the sha256 of + the LFS file content when we want to compare LFS files. + + Args: + data (`bytes`): + The data to compute the git-hash for. + + Returns: + `str`: the git-hash of `data` as an hexadecimal string. + + Example: + ```python + >>> from huggingface_hub.utils.sha import git_hash + >>> git_hash(b"Hello, World!") + 'b45ef6fec89518d314f546fd6c3025367b721684' + ``` + """ + # Taken from https://gist.github.com/msabramo/763200 + # Note: no need to optimize by reading the file in chunks as we're not supposed to hash huge files (5MB maximum). + sha = sha1() + sha.update(b"blob ") + sha.update(str(len(data)).encode()) + sha.update(b"\0") + sha.update(data) + return sha.hexdigest() |