about | summary | refs | log | tree | commit | diff
path: root/.venv/lib/python3.12/site-packages/pypdf/generic/_utils.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/generic/_utils.py')
-rw-r--r-- .venv/lib/python3.12/site-packages/pypdf/generic/_utils.py 180
1 files changed, 180 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/generic/_utils.py b/.venv/lib/python3.12/site-packages/pypdf/generic/_utils.py
new file mode 100644
index 00000000..fdcdc333
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/generic/_utils.py
@@ -0,0 +1,180 @@
+import codecs
+from typing import Dict, List, Tuple, Union
+
+from .._codecs import _pdfdoc_encoding
+from .._utils import StreamType, b_, logger_warning, read_non_whitespace
+from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
+from ._base import ByteStringObject, TextStringObject
+
+
def hex_to_rgb(value: str) -> Tuple[float, float, float]:
    """
    Convert a hexadecimal colour string into normalised RGB components.

    Args:
        value: Hex digits for red, green and blue (two each), optionally
            preceded by ``#`` characters, e.g. ``"#ff8000"``.

    Returns:
        The red, green and blue channels, each divided by 255.0.

    Raises:
        ValueError: If any of the three two-character slices is not a
            valid hexadecimal number (including when it is empty).
    """
    digits = value.lstrip("#")
    red, green, blue = (
        int(digits[offset : offset + 2], 16) / 255.0 for offset in (0, 2, 4)
    )
    return red, green, blue
+
+
def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a hexadecimal string from *stream* and turn it into a string object.

    The first byte of the stream is discarded (presumably the opening ``<``;
    the caller is expected to have positioned the stream there). Hex digits
    are then read, skipping whitespace, until ``>`` is found. Every two
    digits form one byte; a trailing single digit is padded with ``0``.

    Args:
        stream: Stream to read from.
        forced_encoding: Passed through unchanged to ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` builds from the decoded bytes.

    Raises:
        PdfStreamError: If the stream ends before ``>`` is found.
        ValueError: If a digit pair is not valid hexadecimal.
    """
    stream.read(1)
    byte_values: List[int] = []
    pair = b""
    while True:
        tok = read_non_whitespace(stream)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b">":
            break
        pair += tok
        if len(pair) == 2:
            byte_values.append(int(pair, base=16))
            pair = b""
    if len(pair) == 1:
        # Odd number of digits: the missing final digit is taken as 0.
        byte_values.append(int(pair + b"0", base=16))
    # Each value comes from two hex digits, so it always fits in one byte;
    # build the bytes directly rather than via an intermediate str.
    return create_string_object(bytes(byte_values), forced_encoding)
+
+
# Single-character escapes (the byte following a backslash) and the bytes
# they stand for. Built once instead of on every escape sequence.
_STRING_ESCAPES: Dict[bytes, bytes] = {
    b"n": b"\n",
    b"r": b"\r",
    b"t": b"\t",
    b"b": b"\b",
    b"f": b"\f",
    b"c": rb"\c",
    b"(": b"(",
    b")": b")",
    b"/": b"/",
    b"\\": b"\\",
    b" ": b" ",
    b"%": b"%",
    b"<": b"<",
    b">": b">",
    b"[": b"[",
    b"]": b"]",
    b"#": b"#",
    b"_": b"_",
    b"&": b"&",
    b"$": b"$",
}


def _read_escape_sequence(stream: StreamType) -> bytes:
    """Decode the escape sequence following a backslash already read from *stream*."""
    tok = stream.read(1)
    if not tok:
        # Backslash was the last byte of the stream.
        raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
    if tok in _STRING_ESCAPES:
        return _STRING_ESCAPES[tok]
    if b"0" <= tok <= b"7":
        # "The number ddd may consist of one, two, or three
        # octal digits; high-order overflow shall be ignored.
        # Three octal digits shall be used, with leading zeros
        # as needed, if the next character of the string is also
        # a digit." (PDF reference 7.3.4.2, p 16)
        for _ in range(2):
            ntok = stream.read(1)
            if b"0" <= ntok <= b"7":
                tok += ntok
            else:
                # ntok has to be analyzed by the caller; only step back if
                # a byte was actually consumed (nothing is read at EOF).
                if ntok:
                    stream.seek(-1, 1)
                break
        # Mask to one byte so that values above 0o377 overflow as specified.
        return bytes((int(tok, base=8) & 0xFF,))
    if tok in b"\n\r":
        # This case is hit when a backslash followed by a line break occurs.
        # If it's a multi-char EOL, consume the second character:
        following = stream.read(1)
        # An empty read (EOF) counts as "in" any bytes object, so this only
        # seeks back when a real, non-EOL byte was consumed.
        if following not in b"\n\r":
            stream.seek(-1, 1)
        # Don't add anything to the actual string, since this line break
        # was escaped:
        return b""
    msg = f"Unexpected escaped string: {tok.decode('utf-8','ignore')}"
    logger_warning(msg, __name__)
    return tok


def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a parenthesised literal string from *stream*.

    The first byte of the stream is discarded (presumably the opening ``(``).
    Bytes are then collected until the matching closing parenthesis; nested,
    balanced parentheses are kept as part of the string. Backslash escape
    sequences are decoded: single-character escapes, one- to three-digit
    octal codes, and escaped line breaks (which contribute nothing). An
    unrecognised escape logs a warning and keeps the escaped byte without
    the backslash.

    Args:
        stream: Stream to read from.
        forced_encoding: Passed through unchanged to ``create_string_object``.

    Returns:
        Whatever ``create_string_object`` builds from the collected bytes.

    Raises:
        PdfStreamError: If the stream ends before the string is closed.
    """
    stream.read(1)
    depth = 1
    pieces = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            depth += 1
        elif tok == b")":
            depth -= 1
            if depth == 0:
                break
        elif tok == b"\\":
            tok = _read_escape_sequence(stream)
        pieces.append(tok)
    return create_string_object(b"".join(pieces), forced_encoding)
+
+
def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Wrap *string* in a TextStringObject or a ByteStringObject.

    Args:
        string: The data being used.
        forced_encoding: How to interpret ``bytes`` input:

            * a list or dict: each byte is looked up in it, falling back to
              a ``charmap`` decode of that single byte when the lookup fails;
            * ``"bytes"``: the data is kept as raw bytes;
            * any other string: used as the codec name for decoding;
            * otherwise (typically None): UTF-16 is used when the data
              starts with a UTF-16 byte order mark, else PDFDocEncoding is
              tried, and raw bytes are kept if decoding fails.

    Returns:
        A TextStringObject when the data is, or can be decoded to, text;
        otherwise a ByteStringObject.

    Raises:
        TypeError: If string is not of type str or bytes.
    """
    if isinstance(string, str):
        return TextStringObject(string)
    if not isinstance(string, bytes):
        raise TypeError("create_string_object should have str or unicode arg")

    if isinstance(forced_encoding, (list, dict)):
        decoded = ""
        for code in string:
            try:
                decoded += forced_encoding[code]
            except Exception:
                # Best effort: any failed lookup falls back to the raw byte.
                decoded += bytes((code,)).decode("charmap")
        return TextStringObject(decoded)

    if isinstance(forced_encoding, str):
        if forced_encoding == "bytes":
            return ByteStringObject(string)
        return TextStringObject(string.decode(forced_encoding))

    utf16_boms = (codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)
    try:
        if string.startswith(utf16_boms):
            text = TextStringObject(string.decode("utf-16"))
            text.autodetect_utf16 = True
            text.utf16_bom = string[:2]
        else:
            # Some strings are text, some are just byte arrays; the only
            # way to find out whether a text conversion is possible is to
            # try it, even though that may be costly.
            text = TextStringObject(decode_pdfdocencoding(string))
            text.autodetect_pdfdocencoding = True
        return text
    except UnicodeDecodeError:
        return ByteStringObject(string)
+
+
def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode *byte_array* using the ``_pdfdoc_encoding`` translation table.

    Args:
        byte_array: The raw bytes to decode.

    Returns:
        The decoded text, one character per input byte.

    Raises:
        UnicodeDecodeError: If a byte maps to ``"\\u0000"`` in the table,
            which this function treats as "no such character". The error
            carries the input data and the position of the offending byte.
    """
    chars = []
    for index, b in enumerate(byte_array):
        c = _pdfdoc_encoding[b]
        if c == "\u0000":
            # Report the real data and position: bytearray(b) with an int
            # would build a buffer of b zero bytes, not the offending byte.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes(byte_array),
                index,
                index + 1,
                "does not exist in translation table",
            )
        chars.append(c)
    return "".join(chars)