aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/huggingface_hub/utils/sha.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/huggingface_hub/utils/sha.py')
-rw-r--r--.venv/lib/python3.12/site-packages/huggingface_hub/utils/sha.py64
1 files changed, 64 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/huggingface_hub/utils/sha.py b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/sha.py
new file mode 100644
index 00000000..001c3fe8
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/huggingface_hub/utils/sha.py
@@ -0,0 +1,64 @@
+"""Utilities to efficiently compute the SHA 256 hash of a bunch of bytes."""
+
+from typing import BinaryIO, Optional
+
+from .insecure_hashlib import sha1, sha256
+
+
def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
    """
    Computes the sha256 hash of the given file object, by chunks of size `chunk_size`.

    The file object is consumed from its current position until `read()` returns no more data. It is neither
    rewound nor closed by this function.

    Args:
        fileobj (file-like object):
            The File object to compute sha256 for, typically obtained with `open(path, "rb")`
        chunk_size (`int`, *optional*):
            The number of bytes to read from `fileobj` at once, defaults to 1MB. Non-positive values also fall
            back to the default, since they cannot be used as a chunk size.

    Returns:
        `bytes`: `fileobj`'s sha256 hash as bytes
    """
    # `read(0)` returns an empty chunk immediately, which would end the loop before any data is hashed and
    # silently yield the digest of empty input. Treat any non-positive size like "not provided".
    if chunk_size is None or chunk_size <= 0:
        chunk_size = 1024 * 1024

    hasher = sha256()
    while True:
        chunk = fileobj.read(chunk_size)
        if not chunk:
            # Empty read means end of stream: nothing left to feed to the hasher.
            break
        hasher.update(chunk)
    return hasher.digest()
+
+
def git_hash(data: bytes) -> str:
    """
    Returns the SHA-1 that git would assign to a blob holding `data`.

    The result matches what `git hash-object` prints for the same content. See
    https://git-scm.com/docs/git-hash-object for more details.

    Note: this only holds for regular files. For an LFS file, git hashes the pointer file rather than the
    real content. For simplicity, LFS files are compared through the sha256 of their content instead, so this
    helper is not used for them.

    Args:
        data (`bytes`):
            The blob content to hash.

    Returns:
        `str`: the git-hash of `data` as an hexadecimal string.

    Example:
    ```python
    >>> from huggingface_hub.utils.sha import git_hash
    >>> git_hash(b"Hello, World!")
    'b45ef6fec89518d314f546fd6c3025367b721684'
    ```
    """
    # Taken from https://gist.github.com/msabramo/763200
    # Git hashes a blob as: b"blob " + <decimal byte length> + NUL + <content>.
    blob_header = b"blob %d\0" % len(data)

    # Note: the payload is hashed in one go; inputs are not expected to be huge files (5MB maximum),
    # so there is no need to read in chunks.
    hasher = sha1()
    hasher.update(blob_header)
    hasher.update(data)
    return hasher.hexdigest()