diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/PyPDF2/generic')
8 files changed, 3050 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/PyPDF2/generic/__init__.py b/.venv/lib/python3.12/site-packages/PyPDF2/generic/__init__.py new file mode 100644 index 00000000..5f0b16dd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/PyPDF2/generic/__init__.py @@ -0,0 +1,144 @@ +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +"""Implementation of generic PDF objects (dictionary, number, string, ...).""" +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +from typing import Dict, List, Union + +from .._utils import StreamType, deprecate_with_replacement +from ..constants import OutlineFontFlag +from ._annotations import AnnotationBuilder +from ._base import ( + BooleanObject, + ByteStringObject, + FloatObject, + IndirectObject, + NameObject, + NullObject, + NumberObject, + PdfObject, + TextStringObject, + encode_pdfdocencoding, +) +from ._data_structures import ( + ArrayObject, + ContentStream, + DecodedStreamObject, + Destination, + DictionaryObject, + EncodedStreamObject, + Field, + StreamObject, + TreeObject, + read_object, +) +from ._fit import Fit +from ._outline import Bookmark, OutlineItem +from ._rectangle import RectangleObject +from ._utils import ( + create_string_object, + decode_pdfdocencoding, + hex_to_rgb, + read_hex_string_from_stream, + read_string_from_stream, +) + + +def readHexStringFromStream( + stream: StreamType, +) -> Union["TextStringObject", "ByteStringObject"]: # pragma: no cover + deprecate_with_replacement( + "readHexStringFromStream", "read_hex_string_from_stream", "4.0.0" + ) + return read_hex_string_from_stream(stream) + + +def readStringFromStream( + stream: StreamType, + forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, +) -> Union["TextStringObject", "ByteStringObject"]: # pragma: no cover + deprecate_with_replacement( + "readStringFromStream", "read_string_from_stream", "4.0.0" + ) + return read_string_from_stream(stream, forced_encoding) + + +def createStringObject( + string: Union[str, bytes], + forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, +) -> Union[TextStringObject, ByteStringObject]: # pragma: no cover + deprecate_with_replacement("createStringObject", "create_string_object", "4.0.0") + return create_string_object(string, forced_encoding) + + +PAGE_FIT = Fit.fit() + + 
+__all__ = [ + # Base types + "BooleanObject", + "FloatObject", + "NumberObject", + "NameObject", + "IndirectObject", + "NullObject", + "PdfObject", + "TextStringObject", + "ByteStringObject", + # Annotations + "AnnotationBuilder", + # Fit + "Fit", + "PAGE_FIT", + # Data structures + "ArrayObject", + "DictionaryObject", + "TreeObject", + "StreamObject", + "DecodedStreamObject", + "EncodedStreamObject", + "ContentStream", + "RectangleObject", + "Field", + "Destination", + # --- More specific stuff + # Outline + "OutlineItem", + "OutlineFontFlag", + "Bookmark", + # Data structures core functions + "read_object", + # Utility functions + "create_string_object", + "encode_pdfdocencoding", + "decode_pdfdocencoding", + "hex_to_rgb", + "read_hex_string_from_stream", + "read_string_from_stream", +] diff --git a/.venv/lib/python3.12/site-packages/PyPDF2/generic/_annotations.py b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_annotations.py new file mode 100644 index 00000000..bb46dd90 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_annotations.py @@ -0,0 +1,275 @@ +from typing import Optional, Tuple, Union + +from ._base import ( + BooleanObject, + FloatObject, + NameObject, + NumberObject, + TextStringObject, +) +from ._data_structures import ArrayObject, DictionaryObject +from ._fit import DEFAULT_FIT, Fit +from ._rectangle import RectangleObject +from ._utils import hex_to_rgb + + +class AnnotationBuilder: + """ + The AnnotationBuilder creates dictionaries representing PDF annotations. + + Those dictionaries can be modified before they are added to a PdfWriter + instance via `writer.add_annotation`. + + See `adding PDF annotations <../user/adding-pdf-annotations.html>`_ for + its usage combined with PdfWriter. 
+ """ + + from ..types import FitType, ZoomArgType + + @staticmethod + def text( + rect: Union[RectangleObject, Tuple[float, float, float, float]], + text: str, + open: bool = False, + flags: int = 0, + ) -> DictionaryObject: + """ + Add text annotation. + + :param Tuple[int, int, int, int] rect: + or array of four integers specifying the clickable rectangular area + ``[xLL, yLL, xUR, yUR]`` + :param bool open: + :param int flags: + """ + # TABLE 8.23 Additional entries specific to a text annotation + text_obj = DictionaryObject( + { + NameObject("/Type"): NameObject("/Annot"), + NameObject("/Subtype"): NameObject("/Text"), + NameObject("/Rect"): RectangleObject(rect), + NameObject("/Contents"): TextStringObject(text), + NameObject("/Open"): BooleanObject(open), + NameObject("/Flags"): NumberObject(flags), + } + ) + return text_obj + + @staticmethod + def free_text( + text: str, + rect: Union[RectangleObject, Tuple[float, float, float, float]], + font: str = "Helvetica", + bold: bool = False, + italic: bool = False, + font_size: str = "14pt", + font_color: str = "000000", + border_color: str = "000000", + background_color: str = "ffffff", + ) -> DictionaryObject: + """ + Add text in a rectangle to a page. + + :param str text: Text to be added + :param RectangleObject rect: or array of four integers + specifying the clickable rectangular area ``[xLL, yLL, xUR, yUR]`` + :param str font: Name of the Font, e.g. 'Helvetica' + :param bool bold: Print the text in bold + :param bool italic: Print the text in italic + :param str font_size: How big the text will be, e.g. 
'14pt' + :param str font_color: Hex-string for the color + :param str border_color: Hex-string for the border color + :param str background_color: Hex-string for the background of the annotation + """ + font_str = "font: " + if bold is True: + font_str = font_str + "bold " + if italic is True: + font_str = font_str + "italic " + font_str = font_str + font + " " + font_size + font_str = font_str + ";text-align:left;color:#" + font_color + + bg_color_str = "" + for st in hex_to_rgb(border_color): + bg_color_str = bg_color_str + str(st) + " " + bg_color_str = bg_color_str + "rg" + + free_text = DictionaryObject() + free_text.update( + { + NameObject("/Type"): NameObject("/Annot"), + NameObject("/Subtype"): NameObject("/FreeText"), + NameObject("/Rect"): RectangleObject(rect), + NameObject("/Contents"): TextStringObject(text), + # font size color + NameObject("/DS"): TextStringObject(font_str), + # border color + NameObject("/DA"): TextStringObject(bg_color_str), + # background color + NameObject("/C"): ArrayObject( + [FloatObject(n) for n in hex_to_rgb(background_color)] + ), + } + ) + return free_text + + @staticmethod + def line( + p1: Tuple[float, float], + p2: Tuple[float, float], + rect: Union[RectangleObject, Tuple[float, float, float, float]], + text: str = "", + title_bar: str = "", + ) -> DictionaryObject: + """ + Draw a line on the PDF. 
+ + :param Tuple[float, float] p1: First point + :param Tuple[float, float] p2: Second point + :param RectangleObject rect: or array of four + integers specifying the clickable rectangular area + ``[xLL, yLL, xUR, yUR]`` + :param str text: Text to be displayed as the line annotation + :param str title_bar: Text to be displayed in the title bar of the + annotation; by convention this is the name of the author + """ + line_obj = DictionaryObject( + { + NameObject("/Type"): NameObject("/Annot"), + NameObject("/Subtype"): NameObject("/Line"), + NameObject("/Rect"): RectangleObject(rect), + NameObject("/T"): TextStringObject(title_bar), + NameObject("/L"): ArrayObject( + [ + FloatObject(p1[0]), + FloatObject(p1[1]), + FloatObject(p2[0]), + FloatObject(p2[1]), + ] + ), + # /LE holds two line-ending style names; "/None" means no decoration. + # NameObject(None) would stringify to "None" without the leading slash. + NameObject("/LE"): ArrayObject( + [ + NameObject("/None"), + NameObject("/None"), + ] + ), + NameObject("/IC"): ArrayObject( + [ + FloatObject(0.5), + FloatObject(0.5), + FloatObject(0.5), + ] + ), + NameObject("/Contents"): TextStringObject(text), + } + ) + return line_obj + + @staticmethod + def rectangle( + rect: Union[RectangleObject, Tuple[float, float, float, float]], + interiour_color: Optional[str] = None, + ) -> DictionaryObject: + """ + Draw a rectangle on the PDF. 
+ + :param RectangleObject rect: or array of four + integers specifying the clickable rectangular area + ``[xLL, yLL, xUR, yUR]`` + """ + square_obj = DictionaryObject( + { + NameObject("/Type"): NameObject("/Annot"), + NameObject("/Subtype"): NameObject("/Square"), + NameObject("/Rect"): RectangleObject(rect), + } + ) + + if interiour_color: + square_obj[NameObject("/IC")] = ArrayObject( + [FloatObject(n) for n in hex_to_rgb(interiour_color)] + ) + + return square_obj + + @staticmethod + def link( + rect: Union[RectangleObject, Tuple[float, float, float, float]], + border: Optional[ArrayObject] = None, + url: Optional[str] = None, + target_page_index: Optional[int] = None, + fit: Fit = DEFAULT_FIT, + ) -> DictionaryObject: + """ + Add a link to the document. + + The link can either be an external link or an internal link. + + An external link requires the URL parameter. + An internal link requires the target_page_index, fit, and fit args. + + + :param RectangleObject rect: or array of four + integers specifying the clickable rectangular area + ``[xLL, yLL, xUR, yUR]`` + :param border: if provided, an array describing border-drawing + properties. See the PDF spec for details. No border will be + drawn if this argument is omitted. + - horizontal corner radius, + - vertical corner radius, and + - border width + - Optionally: Dash + :param str url: Link to a website (if you want to make an external link) + :param int target_page_index: index of the page to which the link should go + (if you want to make an internal link) + :param Fit fit: Page fit or 'zoom' option. + """ + from ..types import BorderArrayType + + is_external = url is not None + is_internal = target_page_index is not None + if not is_external and not is_internal: + raise ValueError( + "Either 'url' or 'target_page_index' have to be provided. Both were None." + ) + if is_external and is_internal: + raise ValueError( + f"Either 'url' or 'target_page_index' have to be provided. 
url={url}, target_page_index={target_page_index}" + ) + + border_arr: BorderArrayType + if border is not None: + border_arr = [NameObject(n) for n in border[:3]] + if len(border) == 4: + dash_pattern = ArrayObject([NameObject(n) for n in border[3]]) + border_arr.append(dash_pattern) + else: + border_arr = [NumberObject(0)] * 3 + + link_obj = DictionaryObject( + { + NameObject("/Type"): NameObject("/Annot"), + NameObject("/Subtype"): NameObject("/Link"), + NameObject("/Rect"): RectangleObject(rect), + NameObject("/Border"): ArrayObject(border_arr), + } + ) + if is_external: + link_obj[NameObject("/A")] = DictionaryObject( + { + NameObject("/S"): NameObject("/URI"), + NameObject("/Type"): NameObject("/Action"), + NameObject("/URI"): TextStringObject(url), + } + ) + if is_internal: + # This needs to be updated later! + dest_deferred = DictionaryObject( + { + "target_page_index": NumberObject(target_page_index), + "fit": NameObject(fit.fit_type), + "fit_args": fit.fit_args, + } + ) + link_obj[NameObject("/Dest")] = dest_deferred + return link_obj diff --git a/.venv/lib/python3.12/site-packages/PyPDF2/generic/_base.py b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_base.py new file mode 100644 index 00000000..00b9c17b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_base.py @@ -0,0 +1,648 @@ +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import codecs +import decimal +import hashlib +import re +from binascii import unhexlify +from typing import Any, Callable, List, Optional, Tuple, Union, cast + +from .._codecs import _pdfdoc_encoding_rev +from .._protocols import PdfObjectProtocol, PdfWriterProtocol +from .._utils import ( + StreamType, + b_, + deprecation_with_replacement, + hex_str, + hexencode, + logger_warning, + read_non_whitespace, + read_until_regex, + str_, +) +from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError + +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + + +class PdfObject(PdfObjectProtocol): + # function for calculating a hash value + hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1 + indirect_reference: Optional["IndirectObject"] + + def hash_value_data(self) -> bytes: + return ("%s" % self).encode() + + def hash_value(self) -> bytes: + return ( + "%s:%s" + % ( + self.__class__.__name__, + self.hash_func(self.hash_value_data()).hexdigest(), + ) + ).encode() + + def clone( + self, + 
pdf_dest: PdfWriterProtocol, + force_duplicate: bool = False, + ignore_fields: Union[Tuple[str, ...], List[str], None] = (), + ) -> "PdfObject": + """ + clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter) + force_duplicate: in standard if the object has been already cloned and reference, + the copy is returned; when force_duplicate == True, a new copy is always performed + ignore_fields : list/tuple of Fields names (for dictionaries that will be ignored during cloning (apply also to childs duplication) + in standard, clone function call _reference_clone (see _reference) + """ + raise Exception("clone PdfObject") + + def _reference_clone( + self, clone: Any, pdf_dest: PdfWriterProtocol + ) -> PdfObjectProtocol: + """ + reference the object within the _objects of pdf_dest only if + indirect_reference attribute exists (which means the objects + was already identified in xref/xobjstm) + if object has been already referenced do nothing + """ + try: + if clone.indirect_reference.pdf == pdf_dest: + return clone + except Exception: + pass + if hasattr(self, "indirect_reference"): + ind = self.indirect_reference + i = len(pdf_dest._objects) + 1 + if ind is not None: + if id(ind.pdf) not in pdf_dest._id_translated: + pdf_dest._id_translated[id(ind.pdf)] = {} + if ind.idnum in pdf_dest._id_translated[id(ind.pdf)]: + obj = pdf_dest.get_object( + pdf_dest._id_translated[id(ind.pdf)][ind.idnum] + ) + assert obj is not None + return obj + pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i + pdf_dest._objects.append(clone) + clone.indirect_reference = IndirectObject(i, 0, pdf_dest) + return clone + + def get_object(self) -> Optional["PdfObject"]: + """Resolve indirect references.""" + return self + + def getObject(self) -> Optional["PdfObject"]: # pragma: no cover + deprecation_with_replacement("getObject", "get_object", "3.0.0") + return self.get_object() + + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + 
) -> None: + raise NotImplementedError + + +class NullObject(PdfObject): + def clone( + self, + pdf_dest: PdfWriterProtocol, + force_duplicate: bool = False, + ignore_fields: Union[Tuple[str, ...], List[str], None] = (), + ) -> "NullObject": + """clone object into pdf_dest""" + return cast("NullObject", self._reference_clone(NullObject(), pdf_dest)) + + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: + stream.write(b"null") + + @staticmethod + def read_from_stream(stream: StreamType) -> "NullObject": + nulltxt = stream.read(4) + if nulltxt != b"null": + raise PdfReadError("Could not read Null object") + return NullObject() + + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: # pragma: no cover + deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0") + self.write_to_stream(stream, encryption_key) + + def __repr__(self) -> str: + return "NullObject" + + @staticmethod + def readFromStream(stream: StreamType) -> "NullObject": # pragma: no cover + deprecation_with_replacement("readFromStream", "read_from_stream", "3.0.0") + return NullObject.read_from_stream(stream) + + +class BooleanObject(PdfObject): + def __init__(self, value: Any) -> None: + self.value = value + + def clone( + self, + pdf_dest: PdfWriterProtocol, + force_duplicate: bool = False, + ignore_fields: Union[Tuple[str, ...], List[str], None] = (), + ) -> "BooleanObject": + """clone object into pdf_dest""" + return cast( + "BooleanObject", self._reference_clone(BooleanObject(self.value), pdf_dest) + ) + + def __eq__(self, __o: object) -> bool: + if isinstance(__o, BooleanObject): + return self.value == __o.value + elif isinstance(__o, bool): + return self.value == __o + else: + return False + + def __repr__(self) -> str: + return "True" if self.value else "False" + + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: + if self.value: + 
stream.write(b"true") + else: + stream.write(b"false") + + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: # pragma: no cover + deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0") + self.write_to_stream(stream, encryption_key) + + @staticmethod + def read_from_stream(stream: StreamType) -> "BooleanObject": + word = stream.read(4) + if word == b"true": + return BooleanObject(True) + elif word == b"fals": + stream.read(1) + return BooleanObject(False) + else: + raise PdfReadError("Could not read Boolean object") + + @staticmethod + def readFromStream(stream: StreamType) -> "BooleanObject": # pragma: no cover + deprecation_with_replacement("readFromStream", "read_from_stream", "3.0.0") + return BooleanObject.read_from_stream(stream) + + +class IndirectObject(PdfObject): + def __init__(self, idnum: int, generation: int, pdf: Any) -> None: # PdfReader + self.idnum = idnum + self.generation = generation + self.pdf = pdf + + def clone( + self, + pdf_dest: PdfWriterProtocol, + force_duplicate: bool = False, + ignore_fields: Union[Tuple[str, ...], List[str], None] = (), + ) -> "IndirectObject": + """clone object into pdf_dest""" + if self.pdf == pdf_dest and not force_duplicate: + # Already duplicated and no extra duplication required + return self + if id(self.pdf) not in pdf_dest._id_translated: + pdf_dest._id_translated[id(self.pdf)] = {} + + if not force_duplicate and self.idnum in pdf_dest._id_translated[id(self.pdf)]: + dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum]) + else: + obj = self.get_object() + assert obj is not None + dup = obj.clone(pdf_dest, force_duplicate, ignore_fields) + assert dup is not None + assert dup.indirect_reference is not None + return dup.indirect_reference + + @property + def indirect_reference(self) -> "IndirectObject": # type: ignore[override] + return self + + def get_object(self) -> Optional["PdfObject"]: + obj = 
self.pdf.get_object(self) + if obj is None: + return None + return obj.get_object() + + def __repr__(self) -> str: + return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})" + + def __eq__(self, other: Any) -> bool: + return ( + other is not None + and isinstance(other, IndirectObject) + and self.idnum == other.idnum + and self.generation == other.generation + and self.pdf is other.pdf + ) + + def __ne__(self, other: Any) -> bool: + return not self.__eq__(other) + + def __hash__(self) -> int: + # Defining __eq__ removes the inherited hash; provide one that agrees + # with __eq__ (the owning pdf is compared by identity, hence id()). + return hash((self.idnum, self.generation, id(self.pdf))) + + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: + stream.write(b_(f"{self.idnum} {self.generation} R")) + + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: # pragma: no cover + deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0") + self.write_to_stream(stream, encryption_key) + + @staticmethod + def read_from_stream(stream: StreamType, pdf: Any) -> "IndirectObject": # PdfReader + idnum = b"" + while True: + tok = stream.read(1) + if not tok: + raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) + if tok.isspace(): + break + idnum += tok + generation = b"" + while True: + tok = stream.read(1) + if not tok: + raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) + if tok.isspace(): + if not generation: + continue + break + generation += tok + r = read_non_whitespace(stream) + if r != b"R": + raise PdfReadError( + f"Error reading indirect object reference at byte {hex_str(stream.tell())}" + ) + return IndirectObject(int(idnum), int(generation), pdf) + + @staticmethod + def readFromStream( + stream: StreamType, pdf: Any # PdfReader + ) -> "IndirectObject": # pragma: no cover + deprecation_with_replacement("readFromStream", "read_from_stream", "3.0.0") + return IndirectObject.read_from_stream(stream, pdf) + + +class FloatObject(decimal.Decimal, PdfObject): + def __new__( + cls, value: Union[str, Any] = "0", context: Optional[Any] = None + ) -> "FloatObject": + try: + 
return decimal.Decimal.__new__(cls, str_(value), context) + except Exception: + # If this isn't a valid decimal (happens in malformed PDFs) + # fallback to 0 + logger_warning(f"FloatObject ({value}) invalid; use 0.0 instead", __name__) + return decimal.Decimal.__new__(cls, "0.0") + + def clone( + self, + pdf_dest: Any, + force_duplicate: bool = False, + ignore_fields: Union[Tuple[str, ...], List[str], None] = (), + ) -> "FloatObject": + """clone object into pdf_dest""" + return cast("FloatObject", self._reference_clone(FloatObject(self), pdf_dest)) + + def __repr__(self) -> str: + if self == self.to_integral(): + # If this is an integer, format it with no decimal place. + return str(self.quantize(decimal.Decimal(1))) + else: + # Otherwise, format it with a decimal place, taking care to + # remove any extraneous trailing zeros. + return f"{self:f}".rstrip("0") + + def as_numeric(self) -> float: + return float(repr(self).encode("utf8")) + + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: + stream.write(repr(self).encode("utf8")) + + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: # pragma: no cover + deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0") + self.write_to_stream(stream, encryption_key) + + +class NumberObject(int, PdfObject): + NumberPattern = re.compile(b"[^+-.0-9]") + + def __new__(cls, value: Any) -> "NumberObject": + try: + return int.__new__(cls, int(value)) + except ValueError: + logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__) + return int.__new__(cls, 0) + + def clone( + self, + pdf_dest: Any, + force_duplicate: bool = False, + ignore_fields: Union[Tuple[str, ...], List[str], None] = (), + ) -> "NumberObject": + """clone object into pdf_dest""" + return cast("NumberObject", self._reference_clone(NumberObject(self), pdf_dest)) + + def as_numeric(self) -> int: + return int(repr(self).encode("utf8")) 
+ + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: + stream.write(repr(self).encode("utf8")) + + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: # pragma: no cover + deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0") + self.write_to_stream(stream, encryption_key) + + @staticmethod + def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]: + num = read_until_regex(stream, NumberObject.NumberPattern) + if num.find(b".") != -1: + return FloatObject(num) + return NumberObject(num) + + @staticmethod + def readFromStream( + stream: StreamType, + ) -> Union["NumberObject", "FloatObject"]: # pragma: no cover + deprecation_with_replacement("readFromStream", "read_from_stream", "3.0.0") + return NumberObject.read_from_stream(stream) + + +class ByteStringObject(bytes, PdfObject): + """ + Represents a string object where the text encoding could not be determined. + This occurs quite often, as the PDF spec doesn't provide an alternate way to + represent strings -- for example, the encryption data stored in files (like + /O) is clearly not text, but is still stored in a "String" object. 
+ """ + + def clone( + self, + pdf_dest: Any, + force_duplicate: bool = False, + ignore_fields: Union[Tuple[str, ...], List[str], None] = (), + ) -> "ByteStringObject": + """clone object into pdf_dest""" + return cast( + "ByteStringObject", + self._reference_clone(ByteStringObject(bytes(self)), pdf_dest), + ) + + @property + def original_bytes(self) -> bytes: + """For compatibility with TextStringObject.original_bytes.""" + return self + + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: + bytearr = self + if encryption_key: + from .._security import RC4_encrypt + + bytearr = RC4_encrypt(encryption_key, bytearr) # type: ignore + stream.write(b"<") + stream.write(hexencode(bytearr)) + stream.write(b">") + + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: # pragma: no cover + deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0") + self.write_to_stream(stream, encryption_key) + + +class TextStringObject(str, PdfObject): + """ + Represents a string object that has been decoded into a real unicode string. + If read from a PDF document, this string appeared to match the + PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to + occur. 
+ """ + + def clone( + self, + pdf_dest: Any, + force_duplicate: bool = False, + ignore_fields: Union[Tuple[str, ...], List[str], None] = (), + ) -> "TextStringObject": + """clone object into pdf_dest""" + obj = TextStringObject(self) + obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding + obj.autodetect_utf16 = self.autodetect_utf16 + return cast("TextStringObject", self._reference_clone(obj, pdf_dest)) + + autodetect_pdfdocencoding = False + autodetect_utf16 = False + + @property + def original_bytes(self) -> bytes: + """ + It is occasionally possible that a text string object gets created where + a byte string object was expected due to the autodetection mechanism -- + if that occurs, this "original_bytes" property can be used to + back-calculate what the original encoded bytes were. + """ + return self.get_original_bytes() + + def get_original_bytes(self) -> bytes: + # We're a text string object, but the library is trying to get our raw + # bytes. This can happen if we auto-detected this string as text, but + # we were wrong. It's pretty common. Return the original bytes that + # would have been used to create this object, based upon the autodetect + # method. + if self.autodetect_utf16: + return codecs.BOM_UTF16_BE + self.encode("utf-16be") + elif self.autodetect_pdfdocencoding: + return encode_pdfdocencoding(self) + else: + raise Exception("no information about original bytes") + + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: + # Try to write the string out as a PDFDocEncoding encoded string. It's + # nicer to look at in the PDF file. Sadly, we take a performance hit + # here for trying... 
+ try: + bytearr = encode_pdfdocencoding(self) + except UnicodeEncodeError: + bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be") + if encryption_key: + from .._security import RC4_encrypt + + bytearr = RC4_encrypt(encryption_key, bytearr) + obj = ByteStringObject(bytearr) + obj.write_to_stream(stream, None) + else: + stream.write(b"(") + for c in bytearr: + # Iterating bytes yields ints, so compare against the code point + # of a space; comparing with b" " was always unequal and escaped + # every space as \040. + if not chr(c).isalnum() and c != ord(" "): + # This: + # stream.write(b_(rf"\{c:0>3o}")) + # gives + # https://github.com/davidhalter/parso/issues/207 + stream.write(b_("\\%03o" % c)) + else: + stream.write(b_(chr(c))) + stream.write(b")") + + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: # pragma: no cover + deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0") + self.write_to_stream(stream, encryption_key) + + +class NameObject(str, PdfObject): + delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]") + surfix = b"/" + renumber_table = { + "#": b"#23", + "(": b"#28", + ")": b"#29", + "/": b"#2F", + **{chr(i): f"#{i:02X}".encode() for i in range(33)}, + } + + def clone( + self, + pdf_dest: Any, + force_duplicate: bool = False, + ignore_fields: Union[Tuple[str, ...], List[str], None] = (), + ) -> "NameObject": + """clone object into pdf_dest""" + return cast("NameObject", self._reference_clone(NameObject(self), pdf_dest)) + + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: + stream.write(self.renumber()) # b_(renumber(self))) + + def writeToStream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: # pragma: no cover + deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0") + self.write_to_stream(stream, encryption_key) + + def renumber(self) -> bytes: + out = self[0].encode("utf-8") + if out != b"/": + logger_warning(f"Incorrect first char in NameObject:({self})", __name__) + for c in self[1:]: + if c > "~": + for x in 
c.encode("utf-8"): + out += f"#{x:02X}".encode() + else: + try: + out += self.renumber_table[c] + except KeyError: + out += c.encode("utf-8") + return out + + @staticmethod + def unnumber(sin: bytes) -> bytes: + i = sin.find(b"#", 0) + while i >= 0: + try: + sin = sin[:i] + unhexlify(sin[i + 1 : i + 3]) + sin[i + 3 :] + i = sin.find(b"#", i + 1) + except ValueError: + # if the 2 characters after # can not be converted to hexa + # we change nothing and carry on + i = i + 1 + return sin + + @staticmethod + def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader + name = stream.read(1) + if name != NameObject.surfix: + raise PdfReadError("name read error") + name += read_until_regex(stream, NameObject.delimiter_pattern, ignore_eof=True) + try: + # Name objects should represent irregular characters + # with a '#' followed by the symbol's hex number + name = NameObject.unnumber(name) + for enc in ("utf-8", "gbk"): + try: + ret = name.decode(enc) + return NameObject(ret) + except Exception: + pass + raise UnicodeDecodeError("", name, 0, 0, "Code Not Found") + except (UnicodeEncodeError, UnicodeDecodeError) as e: + if not pdf.strict: + logger_warning( + f"Illegal character in Name Object ({repr(name)})", __name__ + ) + return NameObject(name.decode("charmap")) + else: + raise PdfReadError( + f"Illegal character in Name Object ({repr(name)})" + ) from e + + @staticmethod + def readFromStream( + stream: StreamType, pdf: Any # PdfReader + ) -> "NameObject": # pragma: no cover + deprecation_with_replacement("readFromStream", "read_from_stream", "3.0.0") + return NameObject.read_from_stream(stream, pdf) + + +def encode_pdfdocencoding(unicode_string: str) -> bytes: + retval = b"" + for c in unicode_string: + try: + retval += b_(chr(_pdfdoc_encoding_rev[c])) + except KeyError: + raise UnicodeEncodeError( + "pdfdocencoding", c, -1, -1, "does not exist in translation table" + ) + return retval diff --git 
a/.venv/lib/python3.12/site-packages/PyPDF2/generic/_data_structures.py b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_data_structures.py new file mode 100644 index 00000000..19f5be9f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_data_structures.py @@ -0,0 +1,1382 @@ +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ + +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +import logging +import re +from io import BytesIO +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast + +from .._protocols import PdfWriterProtocol +from .._utils import ( + WHITESPACES, + StreamType, + b_, + deprecate_with_replacement, + deprecation_with_replacement, + hex_str, + logger_warning, + read_non_whitespace, + read_until_regex, + skip_over_comment, +) +from ..constants import ( + CheckboxRadioButtonAttributes, + FieldDictionaryAttributes, +) +from ..constants import FilterTypes as FT +from ..constants import OutlineFontFlag +from ..constants import StreamAttributes as SA +from ..constants import TypArguments as TA +from ..constants import TypFitArguments as TF +from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError +from ._base import ( + BooleanObject, + FloatObject, + IndirectObject, + NameObject, + NullObject, + NumberObject, + PdfObject, + TextStringObject, +) +from ._fit import Fit +from ._utils import read_hex_string_from_stream, read_string_from_stream + +logger = logging.getLogger(__name__) +NumberSigns = b"+-" +IndirectPattern = re.compile(rb"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]") + + +class ArrayObject(list, PdfObject): + def clone( + self, + pdf_dest: PdfWriterProtocol, + force_duplicate: bool = False, + ignore_fields: Union[Tuple[str, ...], List[str], None] = (), + ) -> "ArrayObject": + """clone object into pdf_dest""" + try: + if self.indirect_reference.pdf == pdf_dest and not force_duplicate: # type: ignore + return self + except Exception: + pass + arr = cast("ArrayObject", self._reference_clone(ArrayObject(), pdf_dest)) + for data in self: + if isinstance(data, StreamObject): + # if not hasattr(data, "indirect_reference"): + # data.indirect_reference = None + dup = data._reference_clone( + data.clone(pdf_dest, force_duplicate, ignore_fields), pdf_dest + ) + arr.append(dup.indirect_reference) + elif 
@staticmethod
def read_from_stream(
    stream: StreamType,
    pdf: Any,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> "ArrayObject":  # PdfReader
    """
    Parse a PDF array (``[ obj obj ... ]``) from the current position.

    Args:
        stream: Seekable binary stream positioned on the opening ``[``.
        pdf: Reader forwarded to ``read_object``; may be ``None``.
        forced_encoding: Forwarded unchanged to ``read_object``.

    Returns:
        The parsed array.  On normal completion the closing ``]`` has
        been consumed.  If the data ends before ``]`` and the reader is
        not strict, the elements read so far are returned.

    Raises:
        PdfReadError: If the first byte is not ``[``.
        PdfStreamError: If the data ends before ``]`` and ``pdf.strict``
            is set.
    """
    arr = ArrayObject()
    if stream.read(1) != b"[":
        raise PdfReadError("Could not read array")
    while True:
        # skip leading whitespace
        tok = stream.read(1)
        while tok.isspace():
            tok = stream.read(1)
        if not tok:
            # End of data before "]".  Seeking back from here would make
            # the loop parse the last byte again and again, forever.
            if pdf is not None and pdf.strict:
                raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
            logger_warning(STREAM_TRUNCATED_PREMATURELY, __name__)
            break
        # check for array ending
        if tok == b"]":
            break
        stream.seek(-1, 1)
        if tok == b"%":
            # A comment may sit directly before "]", so it has to be
            # skipped here rather than left to read_object, which would
            # then try to parse "]" as an object.
            skip_over_comment(stream)
            continue
        # read and append obj
        arr.append(read_object(stream, pdf, forced_encoding))
    return arr
return self + except Exception: + pass + + d__ = cast( + "DictionaryObject", self._reference_clone(self.__class__(), pdf_dest) + ) + if ignore_fields is None: + ignore_fields = [] + if len(d__.keys()) == 0: + d__._clone(self, pdf_dest, force_duplicate, ignore_fields) + return d__ + + def _clone( + self, + src: "DictionaryObject", + pdf_dest: PdfWriterProtocol, + force_duplicate: bool, + ignore_fields: Union[Tuple[str, ...], List[str]], + ) -> None: + """update the object from src""" + # First check if this is a chain list, we need to loop to prevent recur + if ( + ("/Next" not in ignore_fields and "/Next" in src) + or ("/Prev" not in ignore_fields and "/Prev" in src) + ) or ( + ("/N" not in ignore_fields and "/N" in src) + or ("/V" not in ignore_fields and "/V" in src) + ): + ignore_fields = list(ignore_fields) + for lst in (("/Next", "/Prev"), ("/N", "/V")): + for k in lst: + objs = [] + if ( + k in src + and k not in self + and isinstance(src.raw_get(k), IndirectObject) + ): + cur_obj: Optional["DictionaryObject"] = cast( + "DictionaryObject", src[k] + ) + prev_obj: Optional["DictionaryObject"] = self + while cur_obj is not None: + clon = cast( + "DictionaryObject", + cur_obj._reference_clone(cur_obj.__class__(), pdf_dest), + ) + objs.append((cur_obj, clon)) + assert prev_obj is not None + prev_obj[NameObject(k)] = clon.indirect_reference + prev_obj = clon + try: + if cur_obj == src: + cur_obj = None + else: + cur_obj = cast("DictionaryObject", cur_obj[k]) + except Exception: + cur_obj = None + for (s, c) in objs: + c._clone(s, pdf_dest, force_duplicate, ignore_fields + [k]) + + for k, v in src.items(): + if k not in ignore_fields: + if isinstance(v, StreamObject): + if not hasattr(v, "indirect_reference"): + v.indirect_reference = None + vv = v.clone(pdf_dest, force_duplicate, ignore_fields) + assert vv.indirect_reference is not None + self[k.clone(pdf_dest)] = vv.indirect_reference # type: ignore[attr-defined] + else: + if k not in self: + self[NameObject(k)] = 
( + v.clone(pdf_dest, force_duplicate, ignore_fields) + if hasattr(v, "clone") + else v + ) + + def raw_get(self, key: Any) -> Any: + return dict.__getitem__(self, key) + + def __setitem__(self, key: Any, value: Any) -> Any: + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return dict.__setitem__(self, key, value) + + def setdefault(self, key: Any, value: Optional[Any] = None) -> Any: + if not isinstance(key, PdfObject): + raise ValueError("key must be PdfObject") + if not isinstance(value, PdfObject): + raise ValueError("value must be PdfObject") + return dict.setdefault(self, key, value) # type: ignore + + def __getitem__(self, key: Any) -> PdfObject: + return dict.__getitem__(self, key).get_object() + + @property + def xmp_metadata(self) -> Optional[PdfObject]: + """ + Retrieve XMP (Extensible Metadata Platform) data relevant to the + this object, if available. + + Stability: Added in v1.12, will exist for all future v1.x releases. + @return Returns a {@link #xmp.XmpInformation XmlInformation} instance + that can be used to access XMP metadata from the document. Can also + return None if no metadata was found on the document root. + """ + from ..xmp import XmpInformation + + metadata = self.get("/Metadata", None) + if metadata is None: + return None + metadata = metadata.get_object() + + if not isinstance(metadata, XmpInformation): + metadata = XmpInformation(metadata) + self[NameObject("/Metadata")] = metadata + return metadata + + def getXmpMetadata( + self, + ) -> Optional[PdfObject]: # pragma: no cover + """ + .. deprecated:: 1.28.3 + + Use :meth:`xmp_metadata` instead. + """ + deprecation_with_replacement("getXmpMetadata", "xmp_metadata", "3.0.0") + return self.xmp_metadata + + @property + def xmpMetadata(self) -> Optional[PdfObject]: # pragma: no cover + """ + .. deprecated:: 1.28.3 + + Use :meth:`xmp_metadata` instead. 
@staticmethod
def read_from_stream(
    stream: StreamType,
    pdf: Any,  # PdfReader
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> "DictionaryObject":
    """
    Parse a PDF dictionary, and the stream body that may follow it.

    Args:
        stream: Seekable binary stream positioned on the opening ``<<``.
        pdf: Reader used for strictness checks, for resolving an indirect
            ``/Length`` and for locating the next object when the declared
            length is wrong.  May be ``None`` for plain dictionaries.
        forced_encoding: Forwarded to ``read_object`` when reading values.

    Returns:
        A ``DictionaryObject``, or the result of
        ``StreamObject.initialize_from_dictionary`` when the dictionary is
        followed by ``stream ... endstream``.  When a key/value pair cannot
        be read and the reader is not strict, the entries read so far are
        returned.

    Raises:
        PdfReadError: If the data does not start with ``<<``, on an
            unreadable entry or duplicate key in strict mode, or when the
            ``endstream`` marker cannot be located.
        PdfStreamError: On premature end of data, a missing EOL after the
            ``stream`` keyword, or a missing stream length.
    """

    def get_next_obj_pos(
        p: int, p1: int, rem_gens: List[int], pdf: Any
    ) -> int:  # PdfReader
        # Smallest offset recorded in pdf.xref that lies strictly between
        # p and p1, looking through every generation listed in rem_gens.
        gen_table = pdf.xref[rem_gens[0]]
        for o in gen_table:
            if p1 > gen_table[o] and p < gen_table[o]:
                p1 = gen_table[o]
        if len(rem_gens) == 1:
            return p1
        else:
            return get_next_obj_pos(p, p1, rem_gens[1:], pdf)

    def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes:  # PdfReader
        # we are just pointing at beginning of the stream
        eon = get_next_obj_pos(stream.tell(), 2**32, list(pdf.xref), pdf) - 1
        curr = stream.tell()
        rw = stream.read(eon - stream.tell())
        p = rw.find(b"endstream")
        if p < 0:
            raise PdfReadError(
                f"Unable to find 'endstream' marker for obj starting at {curr}."
            )
        stream.seek(curr + p + 9)
        # Drop the byte in front of "endstream".  Clamp at zero: with
        # p == 0 the slice bound would be -1 and return almost all of rw
        # instead of an empty payload.
        return rw[: max(p - 1, 0)]

    tmp = stream.read(2)
    if tmp != b"<<":
        raise PdfReadError(
            f"Dictionary read error at byte {hex_str(stream.tell())}: "
            "stream must begin with '<<'"
        )
    data: Dict[Any, Any] = {}
    while True:
        tok = read_non_whitespace(stream)
        if tok == b"\x00":
            continue
        elif tok == b"%":
            stream.seek(-1, 1)
            skip_over_comment(stream)
            continue
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)

        if tok == b">":
            # consume the second ">" of the closing ">>"
            stream.read(1)
            break
        stream.seek(-1, 1)
        try:
            key = read_object(stream, pdf)
            tok = read_non_whitespace(stream)
            stream.seek(-1, 1)
            value = read_object(stream, pdf, forced_encoding)
        except Exception as exc:
            if pdf is not None and pdf.strict:
                # keep the original exception attached as the cause
                raise PdfReadError(exc.__repr__()) from exc
            logger_warning(exc.__repr__(), __name__)
            retval = DictionaryObject()
            retval.update(data)
            return retval  # return partial data

        # NOTE(review): this test is also true when the key is present
        # with a falsy value, in which case the later definition silently
        # replaces the earlier one -- confirm that is intended.
        if not data.get(key):
            data[key] = value
        else:
            # multiple definitions of key not permitted
            msg = (
                f"Multiple definitions in dictionary at byte "
                f"{hex_str(stream.tell())} for key {key}"
            )
            if pdf is not None and pdf.strict:
                raise PdfReadError(msg)
            logger_warning(msg, __name__)

    pos = stream.tell()
    s = read_non_whitespace(stream)
    if s == b"s" and stream.read(5) == b"tream":
        eol = stream.read(1)
        # odd PDF file output has spaces after 'stream' keyword but before EOL.
        # patch provided by Danial Sandler
        while eol == b" ":
            eol = stream.read(1)
        if eol not in (b"\n", b"\r"):
            raise PdfStreamError("Stream data must be followed by a newline")
        if eol == b"\r":
            # read \n after
            if stream.read(1) != b"\n":
                stream.seek(-1, 1)
        # this is a stream object, not a dictionary
        if SA.LENGTH not in data:
            raise PdfStreamError("Stream length not defined")
        length = data[SA.LENGTH]
        if isinstance(length, IndirectObject):
            # resolving the reference may move the stream position
            t = stream.tell()
            length = pdf.get_object(length)
            stream.seek(t, 0)
        pstart = stream.tell()
        data["__streamdata__"] = stream.read(length)
        e = read_non_whitespace(stream)
        ndstream = stream.read(8)
        if (e + ndstream) != b"endstream":
            # (sigh) - the odd PDF file has a length that is too long, so
            # we need to read backwards to find the "endstream" ending.
            # ReportLab (unknown version) generates files with this bug,
            # and Python users into PDF files tend to be our audience.
            # we need to do this to correct the streamdata and chop off
            # an extra character.
            pos = stream.tell()
            stream.seek(-10, 1)
            end = stream.read(9)
            if end == b"endstream":
                # we found it by looking back one character further.
                data["__streamdata__"] = data["__streamdata__"][:-1]
            elif pdf is not None and not pdf.strict:
                # Recovery needs pdf.xref, so it is only attempted when a
                # reader is available; without one the error below is
                # raised instead of failing on "None.strict".
                stream.seek(pstart, 0)
                data["__streamdata__"] = read_unsized_from_steam(stream, pdf)
                pos = stream.tell()
            else:
                stream.seek(pos, 0)
                raise PdfReadError(
                    "Unable to find 'endstream' marker after stream at byte "
                    f"{hex_str(stream.tell())} (nd='{ndstream!r}', end='{end!r}')."
                )
    else:
        # not a stream: rewind to just after the dictionary
        stream.seek(pos, 0)
    if "__streamdata__" in data:
        return StreamObject.initialize_from_dictionary(data)
    else:
        retval = DictionaryObject()
        retval.update(data)
        return retval
get_reference(child_obj) + # assert isinstance(child, IndirectObject) + + prev: Optional[DictionaryObject] + if "/First" not in self: # no child yet + self[NameObject("/First")] = child + self[NameObject("/Count")] = NumberObject(0) + self[NameObject("/Last")] = child + child_obj[NameObject("/Parent")] = self.indirect_reference + inc_parent_counter(self, child_obj.get("/Count", 1)) + if "/Next" in child_obj: + del child_obj["/Next"] + if "/Prev" in child_obj: + del child_obj["/Prev"] + return + else: + prev = cast("DictionaryObject", self["/Last"]) + + while prev.indirect_reference != before: + if "/Next" in prev: + prev = cast("TreeObject", prev["/Next"]) + else: # append at the end + prev[NameObject("/Next")] = cast("TreeObject", child) + child_obj[NameObject("/Prev")] = prev.indirect_reference + child_obj[NameObject("/Parent")] = self.indirect_reference + if "/Next" in child_obj: + del child_obj["/Next"] + self[NameObject("/Last")] = child + inc_parent_counter(self, child_obj.get("/Count", 1)) + return + try: # insert as first or in the middle + assert isinstance(prev["/Prev"], DictionaryObject) + prev["/Prev"][NameObject("/Next")] = child + child_obj[NameObject("/Prev")] = prev["/Prev"] + except Exception: # it means we are inserting in first position + del child_obj["/Next"] + child_obj[NameObject("/Next")] = prev + prev[NameObject("/Prev")] = child + child_obj[NameObject("/Parent")] = self.indirect_reference + inc_parent_counter(self, child_obj.get("/Count", 1)) + + def removeChild(self, child: Any) -> None: # pragma: no cover + deprecation_with_replacement("removeChild", "remove_child", "3.0.0") + self.remove_child(child) + + def _remove_node_from_tree( + self, prev: Any, prev_ref: Any, cur: Any, last: Any + ) -> None: + """Adjust the pointers of the linked list and tree node count.""" + next_ref = cur.get(NameObject("/Next"), None) + if prev is None: + if next_ref: + # Removing first tree node + next_obj = next_ref.get_object() + del 
def remove_child(self, child: Any) -> None:
    """
    Unlink ``child`` from this tree.

    The children are walked from ``/First`` along their ``/Next`` links.
    When the child is found, ``_remove_node_from_tree`` repairs the
    neighbouring links and the count, and the child's own tree entries are
    then reset.

    Raises:
        ValueError: If the child has no ``/Parent``, if its ``/Parent`` is
            not this tree, or if it is not reached while walking the
            children.
    """
    child_obj = child.get_object()
    child = child_obj.indirect_reference

    parent_key = NameObject("/Parent")
    if parent_key not in child_obj:
        raise ValueError("Removed child does not appear to be a tree item")
    if child_obj[parent_key] != self:
        raise ValueError("Removed child is not a member of this tree")

    next_key = NameObject("/Next")
    prev_ref = None
    prev = None
    cur_ref: Optional[Any] = self[NameObject("/First")]
    cur: Optional[Dict[str, Any]] = cur_ref.get_object()  # type: ignore
    last = self[NameObject("/Last")].get_object()
    while cur is not None:
        if cur == child_obj:
            self._remove_node_from_tree(prev, prev_ref, cur, last)
            break

        # Go to the next node
        prev_ref, prev = cur_ref, cur
        if next_key in cur:
            cur_ref = cur[next_key]
            cur = cur_ref.get_object()
        else:
            cur_ref = cur = None
    else:
        # the walk ran off the end of the list without a match
        raise ValueError("Removal couldn't find item in tree")

    _reset_node_tree_relationship(child_obj)
def _reset_node_tree_relationship(child_obj: Any) -> None:
    """
    Strip the tree-membership entries from a node removed from a tree.

    ``/Parent`` is deleted unconditionally, so a node without it raises
    ``KeyError``.  ``/Next`` and ``/Prev`` are deleted only when present.
    """
    del child_obj[NameObject("/Parent")]
    for link in ("/Next", "/Prev"):
        key = NameObject(link)
        if key in child_obj:
            del child_obj[key]
def flate_encode(self) -> "EncodedStreamObject":
    """
    Return a new encoded stream with this stream's data Flate-encoded.

    The result's ``/Filter`` entry names ``/FlateDecode`` first, followed
    by whatever filter(s) this stream already declares.  Only ``/Filter``
    and the data are set on the result.  This stream is left unchanged: an
    existing filter array is copied, not modified in place.

    Returns:
        A new ``EncodedStreamObject`` whose data is
        ``FlateDecode.encode(self._data)``.
    """
    from ..filters import FlateDecode

    f: PdfObject
    if SA.FILTER in self:
        existing = self[SA.FILTER]
        newf = ArrayObject()
        if isinstance(existing, ArrayObject):
            # Build a fresh array.  Inserting into `existing` would also
            # change the /Filter entry of this (source) stream.
            newf.append(NameObject(FT.FLATE_DECODE))
            newf.extend(existing)
        else:
            newf.append(NameObject("/FlateDecode"))
            newf.append(existing)
        f = newf
    else:
        f = NameObject("/FlateDecode")
    retval = EncodedStreamObject()
    retval[NameObject(SA.FILTER)] = f
    retval._data = FlateDecode.encode(self._data)
    return retval
def __init__(
    self,
    stream: Any,
    pdf: Any,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> None:
    """
    Parse a page content stream into a list of operations.

    Args:
        stream: A stream object, an array of stream objects whose data is
            concatenated, or ``None`` to create an empty content stream
            (as ``clone`` does).
        pdf: Stored as ``self.pdf``.
        forced_encoding: Stored as ``self.forced_encoding`` and forwarded
            to ``read_object`` while parsing operands.
    """
    self.pdf = pdf
    # Set unconditionally: the parser reads this attribute, and the parser
    # is also reached through the `_data` setter on instances that were
    # created with stream=None.
    self.forced_encoding = forced_encoding

    # The inner list has two elements:
    #  [0] : List
    #  [1] : str
    self.operations: List[Tuple[Any, Any]] = []

    # stream may be a StreamObject or an ArrayObject containing
    # multiple StreamObjects to be cat'd together.
    if stream is not None:
        stream = stream.get_object()
        if isinstance(stream, ArrayObject):
            # bytearray keeps the concatenation linear in the total size
            buf = bytearray()
            for s in stream:
                buf += b_(s.get_object().get_data())
                # Separate consecutive streams with a newline.  Indexing
                # bytes gives an int, so the last byte has to be tested
                # with endswith(), not compared to b"\n".
                if not buf.endswith(b"\n"):
                    buf += b"\n"
            stream_bytes = BytesIO(bytes(buf))
        else:
            stream_data = stream.get_data()
            assert stream_data is not None
            stream_data_bytes = b_(stream_data)
            stream_bytes = BytesIO(stream_data_bytes)
        self.__parse_content_stream(stream_bytes)
def __parse_content_stream(self, stream: StreamType) -> None: + stream.seek(0, 0) + operands: List[Union[int, str, PdfObject]] = [] + while True: + peek = read_non_whitespace(stream) + if peek == b"" or peek == 0: + break + stream.seek(-1, 1) + if peek.isalpha() or peek in (b"'", b'"'): + operator = read_until_regex(stream, NameObject.delimiter_pattern, True) + if operator == b"BI": + # begin inline image - a completely different parsing + # mechanism is required, of course... thanks buddy... + assert operands == [] + ii = self._read_inline_image(stream) + self.operations.append((ii, b"INLINE IMAGE")) + else: + self.operations.append((operands, operator)) + operands = [] + elif peek == b"%": + # If we encounter a comment in the content stream, we have to + # handle it here. Typically, read_object will handle + # encountering a comment -- but read_object assumes that + # following the comment must be the object we're trying to + # read. In this case, it could be an operator instead. + while peek not in (b"\r", b"\n"): + peek = stream.read(1) + else: + operands.append(read_object(stream, None, self.forced_encoding)) + + def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: + # begin reading just after the "BI" - begin image + # first read the dictionary of settings. + settings = DictionaryObject() + while True: + tok = read_non_whitespace(stream) + stream.seek(-1, 1) + if tok == b"I": + # "ID" - begin of image data + break + key = read_object(stream, self.pdf) + tok = read_non_whitespace(stream) + stream.seek(-1, 1) + value = read_object(stream, self.pdf) + settings[key] = value + # left at beginning of ID + tmp = stream.read(3) + assert tmp[:2] == b"ID" + data = BytesIO() + # Read the inline image, while checking for EI (End Image) operator. + while True: + # Read 8 kB at a time and check if the chunk contains the E operator. + buf = stream.read(8192) + # We have reached the end of the stream, but haven't found the EI operator. 
+ if not buf: + raise PdfReadError("Unexpected end of stream") + loc = buf.find(b"E") + + if loc == -1: + data.write(buf) + else: + # Write out everything before the E. + data.write(buf[0:loc]) + + # Seek back in the stream to read the E next. + stream.seek(loc - len(buf), 1) + tok = stream.read(1) + # Check for End Image + tok2 = stream.read(1) + if tok2 == b"I" and buf[loc - 1 : loc] in WHITESPACES: + # Data can contain [\s]EI, so check for the separator \s; 4 chars suffisent Q operator not required. + tok3 = stream.read(1) + info = tok + tok2 + # We need to find at least one whitespace after. + has_q_whitespace = False + while tok3 in WHITESPACES: + has_q_whitespace = True + info += tok3 + tok3 = stream.read(1) + if has_q_whitespace: + stream.seek(-1, 1) + break + else: + stream.seek(-1, 1) + data.write(info) + else: + stream.seek(-1, 1) + data.write(tok) + return {"settings": settings, "data": data.getvalue()} + + @property + def _data(self) -> bytes: + newdata = BytesIO() + for operands, operator in self.operations: + if operator == b"INLINE IMAGE": + newdata.write(b"BI") + dicttext = BytesIO() + operands["settings"].write_to_stream(dicttext, None) + newdata.write(dicttext.getvalue()[2:-2]) + newdata.write(b"ID ") + newdata.write(operands["data"]) + newdata.write(b"EI") + else: + for op in operands: + op.write_to_stream(newdata, None) + newdata.write(b" ") + newdata.write(b_(operator)) + newdata.write(b"\n") + return newdata.getvalue() + + @_data.setter + def _data(self, value: Union[str, bytes]) -> None: + self.__parse_content_stream(BytesIO(b_(value))) + + +def read_object( + stream: StreamType, + pdf: Any, # PdfReader + forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, +) -> Union[PdfObject, int, str, ContentStream]: + tok = stream.read(1) + stream.seek(-1, 1) # reset to start + if tok == b"/": + return NameObject.read_from_stream(stream, pdf) + elif tok == b"<": + # hexadecimal string OR dictionary + peek = stream.read(2) + 
stream.seek(-2, 1) # reset to start + + if peek == b"<<": + return DictionaryObject.read_from_stream(stream, pdf, forced_encoding) + else: + return read_hex_string_from_stream(stream, forced_encoding) + elif tok == b"[": + return ArrayObject.read_from_stream(stream, pdf, forced_encoding) + elif tok == b"t" or tok == b"f": + return BooleanObject.read_from_stream(stream) + elif tok == b"(": + return read_string_from_stream(stream, forced_encoding) + elif tok == b"e" and stream.read(6) == b"endobj": + stream.seek(-6, 1) + return NullObject() + elif tok == b"n": + return NullObject.read_from_stream(stream) + elif tok == b"%": + # comment + while tok not in (b"\r", b"\n"): + tok = stream.read(1) + # Prevents an infinite loop by raising an error if the stream is at + # the EOF + if len(tok) <= 0: + raise PdfStreamError("File ended unexpectedly.") + tok = read_non_whitespace(stream) + stream.seek(-1, 1) + return read_object(stream, pdf, forced_encoding) + elif tok in b"0123456789+-.": + # number object OR indirect reference + peek = stream.read(20) + stream.seek(-len(peek), 1) # reset to start + if IndirectPattern.match(peek) is not None: + return IndirectObject.read_from_stream(stream, pdf) + else: + return NumberObject.read_from_stream(stream) + else: + stream.seek(-20, 1) + raise PdfReadError( + f"Invalid Elementary Object starting with {tok!r} @{stream.tell()}: {stream.read(80).__repr__()}" + ) + + +class Field(TreeObject): + """ + A class representing a field dictionary. 
+ + This class is accessed through + :meth:`get_fields()<PyPDF2.PdfReader.get_fields>` + """ + + def __init__(self, data: Dict[str, Any]) -> None: + DictionaryObject.__init__(self) + field_attributes = ( + FieldDictionaryAttributes.attributes() + + CheckboxRadioButtonAttributes.attributes() + ) + for attr in field_attributes: + try: + self[NameObject(attr)] = data[attr] + except KeyError: + pass + + # TABLE 8.69 Entries common to all field dictionaries + @property + def field_type(self) -> Optional[NameObject]: + """Read-only property accessing the type of this field.""" + return self.get(FieldDictionaryAttributes.FT) + + @property + def fieldType(self) -> Optional[NameObject]: # pragma: no cover + """ + .. deprecated:: 1.28.3 + + Use :py:attr:`field_type` instead. + """ + deprecation_with_replacement("fieldType", "field_type", "3.0.0") + return self.field_type + + @property + def parent(self) -> Optional[DictionaryObject]: + """Read-only property accessing the parent of this field.""" + return self.get(FieldDictionaryAttributes.Parent) + + @property + def kids(self) -> Optional["ArrayObject"]: + """Read-only property accessing the kids of this field.""" + return self.get(FieldDictionaryAttributes.Kids) + + @property + def name(self) -> Optional[str]: + """Read-only property accessing the name of this field.""" + return self.get(FieldDictionaryAttributes.T) + + @property + def alternate_name(self) -> Optional[str]: + """Read-only property accessing the alternate name of this field.""" + return self.get(FieldDictionaryAttributes.TU) + + @property + def altName(self) -> Optional[str]: # pragma: no cover + """ + .. deprecated:: 1.28.3 + + Use :py:attr:`alternate_name` instead. + """ + deprecation_with_replacement("altName", "alternate_name", "3.0.0") + return self.alternate_name + + @property + def mapping_name(self) -> Optional[str]: + """ + Read-only property accessing the mapping name of this field. 
This + name is used by PyPDF2 as a key in the dictionary returned by + :meth:`get_fields()<PyPDF2.PdfReader.get_fields>` + """ + return self.get(FieldDictionaryAttributes.TM) + + @property + def mappingName(self) -> Optional[str]: # pragma: no cover + """ + .. deprecated:: 1.28.3 + + Use :py:attr:`mapping_name` instead. + """ + deprecation_with_replacement("mappingName", "mapping_name", "3.0.0") + return self.mapping_name + + @property + def flags(self) -> Optional[int]: + """ + Read-only property accessing the field flags, specifying various + characteristics of the field (see Table 8.70 of the PDF 1.7 reference). + """ + return self.get(FieldDictionaryAttributes.Ff) + + @property + def value(self) -> Optional[Any]: + """ + Read-only property accessing the value of this field. Format + varies based on field type. + """ + return self.get(FieldDictionaryAttributes.V) + + @property + def default_value(self) -> Optional[Any]: + """Read-only property accessing the default value of this field.""" + return self.get(FieldDictionaryAttributes.DV) + + @property + def defaultValue(self) -> Optional[Any]: # pragma: no cover + """ + .. deprecated:: 1.28.3 + + Use :py:attr:`default_value` instead. + """ + deprecation_with_replacement("defaultValue", "default_value", "3.0.0") + return self.default_value + + @property + def additional_actions(self) -> Optional[DictionaryObject]: + """ + Read-only property accessing the additional actions dictionary. + This dictionary defines the field's behavior in response to trigger events. + See Section 8.5.2 of the PDF 1.7 reference. + """ + return self.get(FieldDictionaryAttributes.AA) + + @property + def additionalActions(self) -> Optional[DictionaryObject]: # pragma: no cover + """ + .. deprecated:: 1.28.3 + + Use :py:attr:`additional_actions` instead. 
+ """ + deprecation_with_replacement("additionalActions", "additional_actions", "3.0.0") + return self.additional_actions + + +class Destination(TreeObject): + """ + A class representing a destination within a PDF file. + See section 8.2.1 of the PDF 1.6 reference. + + :param str title: Title of this destination. + :param IndirectObject page: Reference to the page of this destination. Should + be an instance of :class:`IndirectObject<PyPDF2.generic.IndirectObject>`. + :param Fit fit: How the destination is displayed. + :raises PdfReadError: If destination type is invalid. + + + """ + + node: Optional[ + DictionaryObject + ] = None # node provide access to the original Object + childs: List[Any] = [] # used in PdfWriter + + def __init__( + self, + title: str, + page: Union[NumberObject, IndirectObject, NullObject, DictionaryObject], + fit: Fit, + ) -> None: + typ = fit.fit_type + args = fit.fit_args + + DictionaryObject.__init__(self) + self[NameObject("/Title")] = TextStringObject(title) + self[NameObject("/Page")] = page + self[NameObject("/Type")] = typ + + # from table 8.2 of the PDF 1.7 reference. 
+ if typ == "/XYZ": + ( + self[NameObject(TA.LEFT)], + self[NameObject(TA.TOP)], + self[NameObject("/Zoom")], + ) = args + elif typ == TF.FIT_R: + ( + self[NameObject(TA.LEFT)], + self[NameObject(TA.BOTTOM)], + self[NameObject(TA.RIGHT)], + self[NameObject(TA.TOP)], + ) = args + elif typ in [TF.FIT_H, TF.FIT_BH]: + try: # Prefered to be more robust not only to null parameters + (self[NameObject(TA.TOP)],) = args + except Exception: + (self[NameObject(TA.TOP)],) = (NullObject(),) + elif typ in [TF.FIT_V, TF.FIT_BV]: + try: # Prefered to be more robust not only to null parameters + (self[NameObject(TA.LEFT)],) = args + except Exception: + (self[NameObject(TA.LEFT)],) = (NullObject(),) + elif typ in [TF.FIT, TF.FIT_B]: + pass + else: + raise PdfReadError(f"Unknown Destination Type: {typ!r}") + + @property + def dest_array(self) -> "ArrayObject": + return ArrayObject( + [self.raw_get("/Page"), self["/Type"]] + + [ + self[x] + for x in ["/Left", "/Bottom", "/Right", "/Top", "/Zoom"] + if x in self + ] + ) + + def getDestArray(self) -> "ArrayObject": # pragma: no cover + """ + .. deprecated:: 1.28.3 + + Use :py:attr:`dest_array` instead. 
+ """ + deprecation_with_replacement("getDestArray", "dest_array", "3.0.0") + return self.dest_array + + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: + stream.write(b"<<\n") + key = NameObject("/D") + key.write_to_stream(stream, encryption_key) + stream.write(b" ") + value = self.dest_array + value.write_to_stream(stream, encryption_key) + + key = NameObject("/S") + key.write_to_stream(stream, encryption_key) + stream.write(b" ") + value_s = NameObject("/GoTo") + value_s.write_to_stream(stream, encryption_key) + + stream.write(b"\n") + stream.write(b">>") + + @property + def title(self) -> Optional[str]: + """Read-only property accessing the destination title.""" + return self.get("/Title") + + @property + def page(self) -> Optional[int]: + """Read-only property accessing the destination page number.""" + return self.get("/Page") + + @property + def typ(self) -> Optional[str]: + """Read-only property accessing the destination type.""" + return self.get("/Type") + + @property + def zoom(self) -> Optional[int]: + """Read-only property accessing the zoom factor.""" + return self.get("/Zoom", None) + + @property + def left(self) -> Optional[FloatObject]: + """Read-only property accessing the left horizontal coordinate.""" + return self.get("/Left", None) + + @property + def right(self) -> Optional[FloatObject]: + """Read-only property accessing the right horizontal coordinate.""" + return self.get("/Right", None) + + @property + def top(self) -> Optional[FloatObject]: + """Read-only property accessing the top vertical coordinate.""" + return self.get("/Top", None) + + @property + def bottom(self) -> Optional[FloatObject]: + """Read-only property accessing the bottom vertical coordinate.""" + return self.get("/Bottom", None) + + @property + def color(self) -> Optional["ArrayObject"]: + """Read-only property accessing the color in (R, G, B) with values 0.0-1.0""" + return self.get( + "/C", 
ArrayObject([FloatObject(0), FloatObject(0), FloatObject(0)]) + ) + + @property + def font_format(self) -> Optional[OutlineFontFlag]: + """Read-only property accessing the font type. 1=italic, 2=bold, 3=both""" + return self.get("/F", 0) + + @property + def outline_count(self) -> Optional[int]: + """ + Read-only property accessing the outline count. + positive = expanded + negative = collapsed + absolute value = number of visible descendents at all levels + """ + return self.get("/Count", None) diff --git a/.venv/lib/python3.12/site-packages/PyPDF2/generic/_fit.py b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_fit.py new file mode 100644 index 00000000..b0e7aaa9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_fit.py @@ -0,0 +1,129 @@ +from typing import Any, Optional, Tuple, Union + + +class Fit: + def __init__( + self, fit_type: str, fit_args: Tuple[Union[None, float, Any], ...] = tuple() + ): + from ._base import FloatObject, NameObject, NullObject + + self.fit_type = NameObject(fit_type) + self.fit_args = [ + NullObject() if a is None or isinstance(a, NullObject) else FloatObject(a) + for a in fit_args + ] + + @classmethod + def xyz( + cls, + left: Optional[float] = None, + top: Optional[float] = None, + zoom: Optional[float] = None, + ) -> "Fit": + """ + Display the page designated by page, with the coordinates ( left , top ) + positioned at the upper-left corner of the window and the contents + of the page magnified by the factor zoom. + + A null value for any of the parameters left, top, or zoom specifies + that the current value of that parameter is to be retained unchanged. + + A zoom value of 0 has the same meaning as a null value. + """ + return Fit(fit_type="/XYZ", fit_args=(left, top, zoom)) + + @classmethod + def fit(cls) -> "Fit": + """ + Display the page designated by page, with its contents magnified just + enough to fit the entire page within the window both horizontally and + vertically. 
If the required horizontal and vertical magnification + factors are different, use the smaller of the two, centering the page + within the window in the other dimension. + """ + return Fit(fit_type="/Fit") + + @classmethod + def fit_horizontally(cls, top: Optional[float] = None) -> "Fit": + """ + Display the page designated by page , with the vertical coordinate top + positioned at the top edge of the window and the contents of the page + magnified just enough to fit the entire width of the page within the + window. + + A null value for `top` specifies that the current value of that + parameter is to be retained unchanged. + """ + return Fit(fit_type="/FitH", fit_args=(top,)) + + @classmethod + def fit_vertically(cls, left: Optional[float] = None) -> "Fit": + return Fit(fit_type="/FitV", fit_args=(left,)) + + @classmethod + def fit_rectangle( + cls, + left: Optional[float] = None, + bottom: Optional[float] = None, + right: Optional[float] = None, + top: Optional[float] = None, + ) -> "Fit": + """ + Display the page designated by page , with its contents magnified + just enough to fit the rectangle specified by the coordinates + left , bottom , right , and top entirely within the window + both horizontally and vertically. + + If the required horizontal and vertical magnification factors are + different, use the smaller of the two, centering the rectangle within + the window in the other dimension. + + A null value for any of the parameters may result in unpredictable + behavior. + """ + return Fit(fit_type="/FitR", fit_args=(left, bottom, right, top)) + + @classmethod + def fit_box(cls) -> "Fit": + """ + Display the page designated by page , with its contents magnified + just enough to fit its bounding box entirely within the window both + horizontally and vertically. If the required horizontal and vertical + magnification factors are different, use the smaller of the two, + centering the bounding box within the window in the other dimension. 
+ """ + return Fit(fit_type="/FitB") + + @classmethod + def fit_box_horizontally(cls, top: Optional[float] = None) -> "Fit": + """ + Display the page designated by page , with the vertical coordinate + top positioned at the top edge of the window and the contents of the + page magnified just enough to fit the entire width of its bounding box + within the window. + + A null value for top specifies that the current value of that parameter + is to be retained unchanged. + """ + return Fit(fit_type="/FitBH", fit_args=(top,)) + + @classmethod + def fit_box_vertically(cls, left: Optional[float] = None) -> "Fit": + """ + Display the page designated by page , with the horizontal coordinate + left positioned at the left edge of the window and the contents of + the page magnified just enough to fit the entire height of its + bounding box within the window. + + A null value for left specifies that the current value of that + parameter is to be retained unchanged. + """ + return Fit(fit_type="/FitBV", fit_args=(left,)) + + def __str__(self) -> str: + if not self.fit_args: + return f"Fit({self.fit_type})" + return f"Fit({self.fit_type}, {self.fit_args})" + + +DEFAULT_FIT = Fit.fit() diff --git a/.venv/lib/python3.12/site-packages/PyPDF2/generic/_outline.py b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_outline.py new file mode 100644 index 00000000..c2e72c0a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_outline.py @@ -0,0 +1,35 @@ +from typing import Any, Union + +from .._utils import StreamType, deprecation_with_replacement +from ._base import NameObject +from ._data_structures import Destination + + +class OutlineItem(Destination): + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: + stream.write(b"<<\n") + for key in [ + NameObject(x) + for x in ["/Title", "/Parent", "/First", "/Last", "/Next", "/Prev"] + if x in self + ]: + key.write_to_stream(stream, encryption_key) + stream.write(b" ") + 
value = self.raw_get(key) + value.write_to_stream(stream, encryption_key) + stream.write(b"\n") + key = NameObject("/Dest") + key.write_to_stream(stream, encryption_key) + stream.write(b" ") + value = self.dest_array + value.write_to_stream(stream, encryption_key) + stream.write(b"\n") + stream.write(b">>") + + +class Bookmark(OutlineItem): # pragma: no cover + def __init__(self, *args: Any, **kwargs: Any) -> None: + deprecation_with_replacement("Bookmark", "OutlineItem", "3.0.0") + super().__init__(*args, **kwargs) diff --git a/.venv/lib/python3.12/site-packages/PyPDF2/generic/_rectangle.py b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_rectangle.py new file mode 100644 index 00000000..3f41bfd5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_rectangle.py @@ -0,0 +1,265 @@ +import decimal +from typing import Any, List, Tuple, Union + +from .._utils import deprecation_no_replacement, deprecation_with_replacement +from ._base import FloatObject, NumberObject +from ._data_structures import ArrayObject + + +class RectangleObject(ArrayObject): + """ + This class is used to represent *page boxes* in PyPDF2. 
These boxes include: + * :attr:`artbox <PyPDF2._page.PageObject.artbox>` + * :attr:`bleedbox <PyPDF2._page.PageObject.bleedbox>` + * :attr:`cropbox <PyPDF2._page.PageObject.cropbox>` + * :attr:`mediabox <PyPDF2._page.PageObject.mediabox>` + * :attr:`trimbox <PyPDF2._page.PageObject.trimbox>` + """ + + def __init__( + self, arr: Union["RectangleObject", Tuple[float, float, float, float]] + ) -> None: + # must have four points + assert len(arr) == 4 + # automatically convert arr[x] into NumberObject(arr[x]) if necessary + ArrayObject.__init__(self, [self._ensure_is_number(x) for x in arr]) # type: ignore + + def _ensure_is_number(self, value: Any) -> Union[FloatObject, NumberObject]: + if not isinstance(value, (NumberObject, FloatObject)): + value = FloatObject(value) + return value + + def scale(self, sx: float, sy: float) -> "RectangleObject": + return RectangleObject( + ( + float(self.left) * sx, + float(self.bottom) * sy, + float(self.right) * sx, + float(self.top) * sy, + ) + ) + + def ensureIsNumber( + self, value: Any + ) -> Union[FloatObject, NumberObject]: # pragma: no cover + deprecation_no_replacement("ensureIsNumber", "3.0.0") + return self._ensure_is_number(value) + + def __repr__(self) -> str: + return f"RectangleObject({repr(list(self))})" + + @property + def left(self) -> FloatObject: + return self[0] + + @left.setter + def left(self, f: float) -> None: + self[0] = FloatObject(f) + + @property + def bottom(self) -> FloatObject: + return self[1] + + @bottom.setter + def bottom(self, f: float) -> None: + self[1] = FloatObject(f) + + @property + def right(self) -> FloatObject: + return self[2] + + @right.setter + def right(self, f: float) -> None: + self[2] = FloatObject(f) + + @property + def top(self) -> FloatObject: + return self[3] + + @top.setter + def top(self, f: float) -> None: + self[3] = FloatObject(f) + + def getLowerLeft_x(self) -> FloatObject: # pragma: no cover + deprecation_with_replacement("getLowerLeft_x", "left", "3.0.0") + return 
self.left + + def getLowerLeft_y(self) -> FloatObject: # pragma: no cover + deprecation_with_replacement("getLowerLeft_y", "bottom", "3.0.0") + return self.bottom + + def getUpperRight_x(self) -> FloatObject: # pragma: no cover + deprecation_with_replacement("getUpperRight_x", "right", "3.0.0") + return self.right + + def getUpperRight_y(self) -> FloatObject: # pragma: no cover + deprecation_with_replacement("getUpperRight_y", "top", "3.0.0") + return self.top + + def getUpperLeft_x(self) -> FloatObject: # pragma: no cover + deprecation_with_replacement("getUpperLeft_x", "left", "3.0.0") + return self.left + + def getUpperLeft_y(self) -> FloatObject: # pragma: no cover + deprecation_with_replacement("getUpperLeft_y", "top", "3.0.0") + return self.top + + def getLowerRight_x(self) -> FloatObject: # pragma: no cover + deprecation_with_replacement("getLowerRight_x", "right", "3.0.0") + return self.right + + def getLowerRight_y(self) -> FloatObject: # pragma: no cover + deprecation_with_replacement("getLowerRight_y", "bottom", "3.0.0") + return self.bottom + + @property + def lower_left(self) -> Tuple[decimal.Decimal, decimal.Decimal]: + """ + Property to read and modify the lower left coordinate of this box + in (x,y) form. + """ + return self.left, self.bottom + + @lower_left.setter + def lower_left(self, value: List[Any]) -> None: + self[0], self[1] = (self._ensure_is_number(x) for x in value) + + @property + def lower_right(self) -> Tuple[decimal.Decimal, decimal.Decimal]: + """ + Property to read and modify the lower right coordinate of this box + in (x,y) form. + """ + return self.right, self.bottom + + @lower_right.setter + def lower_right(self, value: List[Any]) -> None: + self[2], self[1] = (self._ensure_is_number(x) for x in value) + + @property + def upper_left(self) -> Tuple[decimal.Decimal, decimal.Decimal]: + """ + Property to read and modify the upper left coordinate of this box + in (x,y) form. 
+ """ + return self.left, self.top + + @upper_left.setter + def upper_left(self, value: List[Any]) -> None: + self[0], self[3] = (self._ensure_is_number(x) for x in value) + + @property + def upper_right(self) -> Tuple[decimal.Decimal, decimal.Decimal]: + """ + Property to read and modify the upper right coordinate of this box + in (x,y) form. + """ + return self.right, self.top + + @upper_right.setter + def upper_right(self, value: List[Any]) -> None: + self[2], self[3] = (self._ensure_is_number(x) for x in value) + + def getLowerLeft( + self, + ) -> Tuple[decimal.Decimal, decimal.Decimal]: # pragma: no cover + deprecation_with_replacement("getLowerLeft", "lower_left", "3.0.0") + return self.lower_left + + def getLowerRight( + self, + ) -> Tuple[decimal.Decimal, decimal.Decimal]: # pragma: no cover + deprecation_with_replacement("getLowerRight", "lower_right", "3.0.0") + return self.lower_right + + def getUpperLeft( + self, + ) -> Tuple[decimal.Decimal, decimal.Decimal]: # pragma: no cover + deprecation_with_replacement("getUpperLeft", "upper_left", "3.0.0") + return self.upper_left + + def getUpperRight( + self, + ) -> Tuple[decimal.Decimal, decimal.Decimal]: # pragma: no cover + deprecation_with_replacement("getUpperRight", "upper_right", "3.0.0") + return self.upper_right + + def setLowerLeft(self, value: Tuple[float, float]) -> None: # pragma: no cover + deprecation_with_replacement("setLowerLeft", "lower_left", "3.0.0") + self.lower_left = value # type: ignore + + def setLowerRight(self, value: Tuple[float, float]) -> None: # pragma: no cover + deprecation_with_replacement("setLowerRight", "lower_right", "3.0.0") + self[2], self[1] = (self._ensure_is_number(x) for x in value) + + def setUpperLeft(self, value: Tuple[float, float]) -> None: # pragma: no cover + deprecation_with_replacement("setUpperLeft", "upper_left", "3.0.0") + self[0], self[3] = (self._ensure_is_number(x) for x in value) + + def setUpperRight(self, value: Tuple[float, float]) -> None: # 
pragma: no cover + deprecation_with_replacement("setUpperRight", "upper_right", "3.0.0") + self[2], self[3] = (self._ensure_is_number(x) for x in value) + + @property + def width(self) -> decimal.Decimal: + return self.right - self.left + + def getWidth(self) -> decimal.Decimal: # pragma: no cover + deprecation_with_replacement("getWidth", "width", "3.0.0") + return self.width + + @property + def height(self) -> decimal.Decimal: + return self.top - self.bottom + + def getHeight(self) -> decimal.Decimal: # pragma: no cover + deprecation_with_replacement("getHeight", "height", "3.0.0") + return self.height + + @property + def lowerLeft(self) -> Tuple[decimal.Decimal, decimal.Decimal]: # pragma: no cover + deprecation_with_replacement("lowerLeft", "lower_left", "3.0.0") + return self.lower_left + + @lowerLeft.setter + def lowerLeft( + self, value: Tuple[decimal.Decimal, decimal.Decimal] + ) -> None: # pragma: no cover + deprecation_with_replacement("lowerLeft", "lower_left", "3.0.0") + self.lower_left = value + + @property + def lowerRight(self) -> Tuple[decimal.Decimal, decimal.Decimal]: # pragma: no cover + deprecation_with_replacement("lowerRight", "lower_right", "3.0.0") + return self.lower_right + + @lowerRight.setter + def lowerRight( + self, value: Tuple[decimal.Decimal, decimal.Decimal] + ) -> None: # pragma: no cover + deprecation_with_replacement("lowerRight", "lower_right", "3.0.0") + self.lower_right = value + + @property + def upperLeft(self) -> Tuple[decimal.Decimal, decimal.Decimal]: # pragma: no cover + deprecation_with_replacement("upperLeft", "upper_left", "3.0.0") + return self.upper_left + + @upperLeft.setter + def upperLeft( + self, value: Tuple[decimal.Decimal, decimal.Decimal] + ) -> None: # pragma: no cover + deprecation_with_replacement("upperLeft", "upper_left", "3.0.0") + self.upper_left = value + + @property + def upperRight(self) -> Tuple[decimal.Decimal, decimal.Decimal]: # pragma: no cover + deprecation_with_replacement("upperRight", 
"upper_right", "3.0.0") + return self.upper_right + + @upperRight.setter + def upperRight( + self, value: Tuple[decimal.Decimal, decimal.Decimal] + ) -> None: # pragma: no cover + deprecation_with_replacement("upperRight", "upper_right", "3.0.0") + self.upper_right = value diff --git a/.venv/lib/python3.12/site-packages/PyPDF2/generic/_utils.py b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_utils.py new file mode 100644 index 00000000..2f8debdc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/PyPDF2/generic/_utils.py @@ -0,0 +1,172 @@ +import codecs +from typing import Dict, List, Tuple, Union + +from .._codecs import _pdfdoc_encoding +from .._utils import StreamType, b_, logger_warning, read_non_whitespace +from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError +from ._base import ByteStringObject, TextStringObject + + +def hex_to_rgb(value: str) -> Tuple[float, float, float]: + return tuple(int(value.lstrip("#")[i : i + 2], 16) / 255.0 for i in (0, 2, 4)) # type: ignore + + +def read_hex_string_from_stream( + stream: StreamType, + forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, +) -> Union["TextStringObject", "ByteStringObject"]: + stream.read(1) + txt = "" + x = b"" + while True: + tok = read_non_whitespace(stream) + if not tok: + raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) + if tok == b">": + break + x += tok + if len(x) == 2: + txt += chr(int(x, base=16)) + x = b"" + if len(x) == 1: + x += b"0" + if len(x) == 2: + txt += chr(int(x, base=16)) + return create_string_object(b_(txt), forced_encoding) + + +def read_string_from_stream( + stream: StreamType, + forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, +) -> Union["TextStringObject", "ByteStringObject"]: + tok = stream.read(1) + parens = 1 + txt = [] + while True: + tok = stream.read(1) + if not tok: + raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) + if tok == b"(": + parens += 1 + elif tok == b")": + parens -= 1 + if parens == 0: + 
break + elif tok == b"\\": + tok = stream.read(1) + escape_dict = { + b"n": b"\n", + b"r": b"\r", + b"t": b"\t", + b"b": b"\b", + b"f": b"\f", + b"c": rb"\c", + b"(": b"(", + b")": b")", + b"/": b"/", + b"\\": b"\\", + b" ": b" ", + b"%": b"%", + b"<": b"<", + b">": b">", + b"[": b"[", + b"]": b"]", + b"#": b"#", + b"_": b"_", + b"&": b"&", + b"$": b"$", + } + try: + tok = escape_dict[tok] + except KeyError: + if b"0" <= tok and tok <= b"7": + # "The number ddd may consist of one, two, or three + # octal digits; high-order overflow shall be ignored. + # Three octal digits shall be used, with leading zeros + # as needed, if the next character of the string is also + # a digit." (PDF reference 7.3.4.2, p 16) + for _ in range(2): + ntok = stream.read(1) + if b"0" <= ntok and ntok <= b"7": + tok += ntok + else: + stream.seek(-1, 1) # ntok has to be analysed + break + tok = b_(chr(int(tok, base=8))) + elif tok in b"\n\r": + # This case is hit when a backslash followed by a line + # break occurs. If it's a multi-char EOL, consume the + # second character: + tok = stream.read(1) + if tok not in b"\n\r": + stream.seek(-1, 1) + # Then don't add anything to the actual string, since this + # line break was escaped: + tok = b"" + else: + msg = rf"Unexpected escaped string: {tok.decode('utf8')}" + logger_warning(msg, __name__) + txt.append(tok) + return create_string_object(b"".join(txt), forced_encoding) + + +def create_string_object( + string: Union[str, bytes], + forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, +) -> Union[TextStringObject, ByteStringObject]: + """ + Create a ByteStringObject or a TextStringObject from a string to represent the string. + + :param Union[str, bytes] string: A string + + :raises TypeError: If string is not of type str or bytes. 
+ """ + if isinstance(string, str): + return TextStringObject(string) + elif isinstance(string, bytes): + if isinstance(forced_encoding, (list, dict)): + out = "" + for x in string: + try: + out += forced_encoding[x] + except Exception: + out += bytes((x,)).decode("charmap") + return TextStringObject(out) + elif isinstance(forced_encoding, str): + if forced_encoding == "bytes": + return ByteStringObject(string) + return TextStringObject(string.decode(forced_encoding)) + else: + try: + if string.startswith(codecs.BOM_UTF16_BE): + retval = TextStringObject(string.decode("utf-16")) + retval.autodetect_utf16 = True + return retval + else: + # This is probably a big performance hit here, but we need to + # convert string objects into the text/unicode-aware version if + # possible... and the only way to check if that's possible is + # to try. Some strings are strings, some are just byte arrays. + retval = TextStringObject(decode_pdfdocencoding(string)) + retval.autodetect_pdfdocencoding = True + return retval + except UnicodeDecodeError: + return ByteStringObject(string) + else: + raise TypeError("create_string_object should have str or unicode arg") + + +def decode_pdfdocencoding(byte_array: bytes) -> str: + retval = "" + for b in byte_array: + c = _pdfdoc_encoding[b] + if c == "\u0000": + raise UnicodeDecodeError( + "pdfdocencoding", + bytearray(b), + -1, + -1, + "does not exist in translation table", + ) + retval += c + return retval |