diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/generic')
9 files changed, 3713 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/generic/__init__.py b/.venv/lib/python3.12/site-packages/pypdf/generic/__init__.py new file mode 100644 index 00000000..48045e0a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/generic/__init__.py @@ -0,0 +1,464 @@ +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
def readHexStringFromStream(
    stream: StreamType,
) -> Union["TextStringObject", "ByteStringObject"]:  # deprecated
    """
    Deprecated, use read_hex_string_from_stream.

    Backward-compatible camelCase alias. It first reports the deprecation via
    ``deprecate_with_replacement`` (old name, replacement name and the version
    string "4.0.0" are passed along) and then forwards ``stream`` unchanged.

    Args:
        stream: Stream handed on to ``read_hex_string_from_stream``.

    Returns:
        Whatever ``read_hex_string_from_stream(stream)`` returns.
    """
    deprecate_with_replacement(
        "readHexStringFromStream", "read_hex_string_from_stream", "4.0.0"
    )
    return read_hex_string_from_stream(stream)
class AnnotationBuilder:
    """
    The AnnotationBuilder is deprecated.

    Instead, use the annotation classes in pypdf.annotations.

    Every static method below follows the same pattern: report the
    deprecation through ``deprecate_with_replacement``, import the matching
    class from ``..annotations`` inside the method, and return an instance
    built from the arguments it was given.

    See `adding PDF annotations <../user/adding-pdf-annotations.html>`_ for
    its usage combined with PdfWriter.
    """

    # Imported in the class body, so the name is bound in the class namespace
    # in which the method signatures below are evaluated.
    from ..generic._rectangle import RectangleObject

    @staticmethod
    def text(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        text: str,
        open: bool = False,
        flags: int = 0,
    ) -> DictionaryObject:
        """
        Add text annotation.

        Args:
            rect: array of four integers ``[xLL, yLL, xUR, yUR]``
                specifying the clickable rectangular area
            text: The text that is added to the document
            open: Forwarded unchanged to ``pypdf.annotations.Text``
                (presumably whether the annotation starts opened -- confirm
                against that class).
            flags: Forwarded unchanged to ``pypdf.annotations.Text``
                (presumably the annotation flags -- confirm against that
                class).

        Returns:
            A dictionary object representing the annotation.
        """
        deprecate_with_replacement(
            "AnnotationBuilder.text", "pypdf.annotations.Text", "4.0.0"
        )
        from ..annotations import Text

        return Text(rect=rect, text=text, open=open, flags=flags)

    @staticmethod
    def free_text(
        text: str,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        font: str = "Helvetica",
        bold: bool = False,
        italic: bool = False,
        font_size: str = "14pt",
        font_color: str = "000000",
        border_color: Optional[str] = "000000",
        background_color: Optional[str] = "ffffff",
    ) -> DictionaryObject:
        """
        Add text in a rectangle to a page.

        Args:
            text: Text to be added
            rect: array of four integers ``[xLL, yLL, xUR, yUR]``
                specifying the clickable rectangular area
            font: Name of the Font, e.g. 'Helvetica'
            bold: Print the text in bold
            italic: Print the text in italic
            font_size: How big the text will be, e.g. '14pt'
            font_color: Hex-string for the color, e.g. cdcdcd
            border_color: Hex-string for the border color, e.g. cdcdcd.
                Use ``None`` for no border.
            background_color: Hex-string for the background of the annotation,
                e.g. cdcdcd. Use ``None`` for transparent background.

        Returns:
            A dictionary object representing the annotation.
        """
        deprecate_with_replacement(
            "AnnotationBuilder.free_text", "pypdf.annotations.FreeText", "4.0.0"
        )
        from ..annotations import FreeText

        return FreeText(
            text=text,
            rect=rect,
            font=font,
            bold=bold,
            italic=italic,
            font_size=font_size,
            font_color=font_color,
            background_color=background_color,
            border_color=border_color,
        )

    @staticmethod
    def popup(
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        flags: int = 0,
        parent: Optional[DictionaryObject] = None,
        open: bool = False,
    ) -> DictionaryObject:
        """
        Add a popup to the document.

        All arguments are keyword-only.

        Args:
            rect:
                Specifies the clickable rectangular area as `[xLL, yLL, xUR, yUR]`
            flags:
                1 - invisible, 2 - hidden, 3 - print, 4 - no zoom,
                5 - no rotate, 6 - no view, 7 - read only, 8 - locked,
                9 - toggle no view, 10 - locked contents
                (NOTE(review): these read like bit positions of a flag word
                rather than literal values -- confirm against the PDF spec.)
            open:
                Whether the popup should be shown directly (default is False).
            parent:
                The contents of the popup. Create this via the AnnotationBuilder.

        Returns:
            A dictionary object representing the annotation.
        """
        deprecate_with_replacement(
            "AnnotationBuilder.popup", "pypdf.annotations.Popup", "4.0.0"
        )
        from ..annotations import Popup

        # ``flags`` is not passed to the constructor; it is assigned on the
        # created object afterwards.
        popup = Popup(rect=rect, open=open, parent=parent)
        popup.flags = flags  # type: ignore

        return popup

    @staticmethod
    def line(
        p1: Tuple[float, float],
        p2: Tuple[float, float],
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        text: str = "",
        title_bar: Optional[str] = None,
    ) -> DictionaryObject:
        """
        Draw a line on the PDF.

        Args:
            p1: First point
            p2: Second point
            rect: array of four integers ``[xLL, yLL, xUR, yUR]``
                specifying the clickable rectangular area
            text: Text to be displayed as the line annotation
            title_bar: Text to be displayed in the title bar of the
                annotation; by convention this is the name of the author

        Returns:
            A dictionary object representing the annotation.
        """
        deprecate_with_replacement(
            "AnnotationBuilder.line", "pypdf.annotations.Line", "4.0.0"
        )
        from ..annotations import Line

        return Line(p1=p1, p2=p2, rect=rect, text=text, title_bar=title_bar)

    @staticmethod
    def polyline(
        vertices: List[Tuple[float, float]],
    ) -> DictionaryObject:
        """
        Draw a polyline on the PDF.

        Args:
            vertices: Array specifying the vertices (x, y) coordinates of the poly-line.

        Returns:
            A dictionary object representing the annotation.
        """
        deprecate_with_replacement(
            "AnnotationBuilder.polyline", "pypdf.annotations.PolyLine", "4.0.0"
        )
        from ..annotations import PolyLine

        return PolyLine(vertices=vertices)

    @staticmethod
    def rectangle(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        interiour_color: Optional[str] = None,
    ) -> DictionaryObject:
        """
        Draw a rectangle on the PDF.

        This method uses the /Square annotation type of the PDF format.

        Args:
            rect: array of four integers ``[xLL, yLL, xUR, yUR]``
                specifying the clickable rectangular area
            interiour_color: None or hex-string for the color, e.g. cdcdcd
                If None is used, the interiour is transparent.

        Returns:
            A dictionary object representing the annotation.
        """
        deprecate_with_replacement(
            "AnnotationBuilder.rectangle", "pypdf.annotations.Rectangle", "4.0.0"
        )
        from ..annotations import Rectangle

        return Rectangle(rect=rect, interiour_color=interiour_color)

    @staticmethod
    def highlight(
        *,
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        quad_points: ArrayObject,
        highlight_color: str = "ff0000",
        printing: bool = False,
    ) -> DictionaryObject:
        """
        Add a highlight annotation to the document.

        All arguments are keyword-only.

        Args:
            rect: Array of four integers ``[xLL, yLL, xUR, yUR]``
                specifying the highlighted area
            quad_points: An ArrayObject of 8 FloatObjects. Must match a word or
                a group of words, otherwise no highlight will be shown.
            highlight_color: The color used for the highlight.
            printing: Whether to print out the highlight annotation when the page
                is printed.

        Returns:
            A dictionary object representing the annotation.
        """
        deprecate_with_replacement(
            "AnnotationBuilder.highlight", "pypdf.annotations.Highlight", "4.0.0"
        )
        from ..annotations import Highlight

        return Highlight(
            rect=rect, quad_points=quad_points, highlight_color=highlight_color, printing=printing
        )

    @staticmethod
    def ellipse(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        interiour_color: Optional[str] = None,
    ) -> DictionaryObject:
        """
        Draw an ellipse on the PDF.

        This method uses the /Circle annotation type of the PDF format.

        Args:
            rect: array of four integers ``[xLL, yLL, xUR, yUR]`` specifying
                the bounding box of the ellipse
            interiour_color: None or hex-string for the color, e.g. cdcdcd
                If None is used, the interiour is transparent.

        Returns:
            A dictionary object representing the annotation.
        """
        deprecate_with_replacement(
            "AnnotationBuilder.ellipse", "pypdf.annotations.Ellipse", "4.0.0"
        )
        from ..annotations import Ellipse

        return Ellipse(rect=rect, interiour_color=interiour_color)

    @staticmethod
    def polygon(vertices: List[Tuple[float, float]]) -> DictionaryObject:
        """
        Build a polygon annotation.

        Args:
            vertices: List of (x, y) pairs, forwarded unchanged to
                ``pypdf.annotations.Polygon``.

        Returns:
            The object created by ``Polygon(vertices=vertices)``.
        """
        deprecate_with_replacement(
            "AnnotationBuilder.polygon", "pypdf.annotations.Polygon", "4.0.0"
        )
        from ..annotations import Polygon

        return Polygon(vertices=vertices)

    # Bound in the class namespace so it can serve as the default of ``fit``
    # in ``link`` below.
    from ._fit import DEFAULT_FIT

    @staticmethod
    def link(
        rect: Union[RectangleObject, Tuple[float, float, float, float]],
        border: Optional[ArrayObject] = None,
        url: Optional[str] = None,
        target_page_index: Optional[int] = None,
        fit: Fit = DEFAULT_FIT,
    ) -> DictionaryObject:
        """
        Add a link to the document.

        The link can either be an external link or an internal link.

        An external link requires the URL parameter.
        An internal link requires the target_page_index and fit args.

        Args:
            rect: array of four integers ``[xLL, yLL, xUR, yUR]``
                specifying the clickable rectangular area
            border: if provided, an array describing border-drawing
                properties. See the PDF spec for details. No border will be
                drawn if this argument is omitted.
                - horizontal corner radius,
                - vertical corner radius, and
                - border width
                - Optionally: Dash
            url: Link to a website (if you want to make an external link)
            target_page_index: index of the page to which the link should go
                (if you want to make an internal link)
            fit: Page fit or 'zoom' option.

        Returns:
            A dictionary object representing the annotation.
        """
        deprecate_with_replacement(
            "AnnotationBuilder.link", "pypdf.annotations.Link", "4.0.0"
        )
        from ..annotations import Link

        return Link(
            rect=rect,
            border=border,
            url=url,
            target_page_index=target_page_index,
            fit=fit,
        )
class PdfObject(PdfObjectProtocol):
    """Common base class of the generic PDF object types defined in this module."""

    # function for calculating a hash value
    hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
    # Only annotated here, never assigned at class level: instances may lack
    # the attribute entirely, which ``_reference_clone`` explicitly handles.
    indirect_reference: Optional["IndirectObject"]

    def hash_value_data(self) -> bytes:
        """Return the bytes that ``hash_value`` digests: the ``%s`` text of the object, encoded."""
        return ("%s" % self).encode()

    def hash_value(self) -> bytes:
        """
        Return ``b"<ClassName>:<hexdigest>"``.

        The digest is ``hash_func`` (``hashlib.sha1`` unless overridden)
        applied to ``hash_value_data()``.
        """
        return (
            "%s:%s"
            % (
                self.__class__.__name__,
                self.hash_func(self.hash_value_data()).hexdigest(),
            )
        ).encode()

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "PdfObject":
        """
        Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

        By default, this method will call ``_reference_clone`` (see ``_reference``).

        This base implementation always raises; the subclasses in this module
        provide the actual cloning.

        Args:
            pdf_dest: Target to clone to.
            force_duplicate: By default, if the object has already been cloned and referenced,
                the copy will be returned; when ``True``, a new copy will be created.
                (Default value = ``False``)
            ignore_fields: List/tuple of field names (for dictionaries) that will be ignored
                during cloning (applies to children duplication as well). If fields are to be
                considered for a limited number of levels, you have to add it as integer, for
                example ``[1,"/B","/TOTO"]`` means that ``"/B"`` will be ignored at the first
                level only but ``"/TOTO"`` on all levels.

        Returns:
            The cloned PdfObject

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement .clone so far"
        )

    def _reference_clone(
        self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False
    ) -> PdfObjectProtocol:
        """
        Reference the object within the _objects of pdf_dest only if
        indirect_reference attribute exists (which means the objects was
        already identified in xref/xobjstm) if object has been already
        referenced do nothing.

        Args:
            clone: The freshly built copy of ``self``.
            pdf_dest: Writer whose ``_objects`` list and ``_id_translated``
                table are consulted and updated.
            force_duplicate: When true, neither shortcut below is taken and
                ``clone`` is always appended as a new object.

        Returns:
            The clone, or the object already registered in ``pdf_dest`` for
            the same source object.
        """
        # Shortcut 1: the clone already points into pdf_dest. Any error while
        # probing (e.g. no usable ``indirect_reference`` on the clone) is
        # deliberately ignored and the normal path below is taken.
        try:
            if not force_duplicate and clone.indirect_reference.pdf == pdf_dest:
                return clone
        except Exception:
            pass
        # if hasattr(clone, "indirect_reference"):
        # An object that never received an ``indirect_reference`` attribute is
        # returned as is, without being registered.
        try:
            ind = self.indirect_reference
        except AttributeError:
            return clone
        # Number the clone gets once appended below (1-based position).
        i = len(pdf_dest._objects) + 1
        if ind is not None:
            # ``_id_translated`` is keyed by id() of the source document and
            # maps source object numbers to numbers in pdf_dest.
            if id(ind.pdf) not in pdf_dest._id_translated:
                pdf_dest._id_translated[id(ind.pdf)] = {}
                # Stores the source document itself in the table -- presumably
                # to keep it alive so its id() stays unique (TODO confirm).
                pdf_dest._id_translated[id(ind.pdf)]["PreventGC"] = ind.pdf  # type: ignore
            # Shortcut 2: this source object was cloned before; hand back the
            # object already stored in pdf_dest.
            if (
                not force_duplicate
                and ind.idnum in pdf_dest._id_translated[id(ind.pdf)]
            ):
                obj = pdf_dest.get_object(
                    pdf_dest._id_translated[id(ind.pdf)][ind.idnum]
                )
                assert obj is not None
                return obj
            pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i
        pdf_dest._objects.append(clone)
        clone.indirect_reference = IndirectObject(i, 0, pdf_dest)
        return clone

    def get_object(self) -> Optional["PdfObject"]:
        """Resolve indirect references."""
        return self

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Serialize the object to ``stream``; not implemented in this base class."""
        raise NotImplementedError
class BooleanObject(PdfObject):
    """
    PDF boolean object wrapping a Python value.

    Compares equal to another ``BooleanObject`` with an equal ``value`` and to
    a plain ``bool`` equal to ``value``.
    """

    def __init__(self, value: Any) -> None:
        self.value = value

    def clone(
        self,
        pdf_dest: PdfWriterProtocol,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "BooleanObject":
        """Clone object into pdf_dest."""
        return cast(
            "BooleanObject",
            self._reference_clone(BooleanObject(self.value), pdf_dest, force_duplicate),
        )

    def __eq__(self, __o: object) -> bool:
        if isinstance(__o, BooleanObject):
            return self.value == __o.value
        elif isinstance(__o, bool):
            return self.value == __o
        else:
            return False

    def __hash__(self) -> int:
        """
        Hash consistent with ``__eq__``.

        Defining ``__eq__`` alone sets ``__hash__`` to ``None``, which made
        instances unusable in sets and as dict keys. Hashing ``value`` keeps
        equal objects (including an equal plain ``bool``) on the same hash.
        """
        return hash(self.value)

    def __repr__(self) -> str:
        return "True" if self.value else "False"

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write ``true`` or ``false`` depending on the truthiness of ``value``."""
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        if self.value:
            stream.write(b"true")
        else:
            stream.write(b"false")

    @staticmethod
    def read_from_stream(stream: StreamType) -> "BooleanObject":
        """
        Read ``true`` or ``false`` from ``stream``.

        Raises:
            PdfReadError: If the next four bytes are neither ``true`` nor ``fals``.
        """
        word = stream.read(4)
        if word == b"true":
            return BooleanObject(True)
        elif word == b"fals":
            # Consume the fifth byte of "false"; it is not checked.
            stream.read(1)
            return BooleanObject(False)
        else:
            raise PdfReadError("Could not read Boolean object")
def __eq__(self, other: object) -> bool:
    """
    Two references are equal when they carry the same object number and
    generation and point into the very same document object (identity).
    """
    # isinstance() is already False for None, no separate check needed.
    return (
        isinstance(other, IndirectObject)
        and self.idnum == other.idnum
        and self.generation == other.generation
        and self.pdf is other.pdf
    )

def __hash__(self) -> int:
    """
    Hash over exactly the fields ``__eq__`` compares.

    A class that defines ``__eq__`` without ``__hash__`` is unhashable;
    providing it lets references be used in sets and as dict keys. The
    document is compared by identity, hence ``id(self.pdf)``.
    """
    return hash((self.idnum, self.generation, id(self.pdf)))
class NumberObject(int, PdfObject):
    """PDF integer object; an ``int`` subclass."""

    # Matches the first byte that cannot be part of a number token.
    # The "-" must be escaped: written as "+-." it forms the range "+" to ".",
    # which also contains "," and let commas be swallowed into numbers.
    NumberPattern = re.compile(rb"[^+\-.0-9]")

    def __new__(cls, value: Any) -> "NumberObject":
        """Build from ``int(value)``; on ``ValueError`` warn and use 0."""
        try:
            return int.__new__(cls, int(value))
        except ValueError:
            logger_warning(f"NumberObject({value}) invalid; use 0 instead", __name__)
            return int.__new__(cls, 0)

    def clone(
        self,
        pdf_dest: Any,
        force_duplicate: bool = False,
        ignore_fields: Optional[Sequence[Union[str, int]]] = (),
    ) -> "NumberObject":
        """Clone object into pdf_dest."""
        return cast(
            "NumberObject",
            self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
        )

    def as_numeric(self) -> int:
        """Return the value as a plain ``int``."""
        # Direct conversion; the former repr() -> bytes -> int() round trip
        # produced the same value through two needless text conversions.
        return int(self)

    def write_to_stream(
        self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
    ) -> None:
        """Write the decimal representation of the number to ``stream``."""
        if encryption_key is not None:  # deprecated
            deprecate_no_replacement(
                "the encryption_key parameter of write_to_stream", "5.0.0"
            )
        stream.write(repr(self).encode("utf8"))

    @staticmethod
    def read_from_stream(stream: StreamType) -> Union["NumberObject", "FloatObject"]:
        """
        Read a number token from ``stream``.

        Returns:
            A ``FloatObject`` when the token contains ``.``, otherwise a
            ``NumberObject``.
        """
        num = read_until_regex(stream, NumberObject.NumberPattern)
        if num.find(b".") != -1:
            return FloatObject(num)
        return NumberObject(num)
def write_to_stream(
    self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
) -> None:
    """
    Write the bytes as a hexadecimal string: ``<``, the hex digits, ``>``.

    Args:
        stream: Destination; receives three ``write`` calls.
        encryption_key: Deprecated; any value other than ``None`` only
            triggers ``deprecate_no_replacement``.
    """
    key_supplied = encryption_key is not None
    if key_supplied:  # deprecated
        deprecate_no_replacement(
            "the encryption_key parameter of write_to_stream", "5.0.0"
        )
    # Opening delimiter, payload, closing delimiter -- written one by one.
    for piece in (b"<", binascii.hexlify(self), b">"):
        stream.write(piece)
def __new__(cls, value: Any) -> "TextStringObject":
    """
    Create the string and record how it is expected to be encoded.

    ``bytes`` input is decoded with the "charmap" codec first. Afterwards:

    * a string starting with a UTF-16 byte order mark (either order) is
      flagged ``autodetect_utf16`` and the mark is kept in ``utf16_bom``;
    * a string that ``encode_pdfdocencoding`` accepts is flagged
      ``autodetect_pdfdocencoding``;
    * anything else is flagged ``autodetect_utf16`` with a big-endian mark.
    """
    if isinstance(value, bytes):
        value = value.decode("charmap")
    o = str.__new__(cls, value)
    o.autodetect_utf16 = False
    o.autodetect_pdfdocencoding = False
    o.utf16_bom = b""
    # Inspect the constructed string, not ``value``: ``str.__new__`` accepts
    # any object, but only str offers startswith() / slicing + encode().
    if o.startswith(("\xfe\xff", "\xff\xfe")):
        o.autodetect_utf16 = True
        o.utf16_bom = o[:2].encode("charmap")
    else:
        try:
            encode_pdfdocencoding(o)
            o.autodetect_pdfdocencoding = True
        except UnicodeEncodeError:
            o.autodetect_utf16 = True
            # With an empty mark the encoders of this class emit bare
            # UTF-16BE, which a reader cannot tell from PDFDocEncoding.
            # Record the big-endian mark so it is written in front.
            o.utf16_bom = codecs.BOM_UTF16_BE
    return o
def write_to_stream(
    self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
) -> None:
    """
    Write the string as a PDF literal string ``( ... )``.

    Alphanumeric bytes and spaces are written as they are; every other byte
    is written as a three-digit octal escape (``\\ddd``).

    Args:
        stream: Destination stream.
        encryption_key: Deprecated; any value other than ``None`` only
            triggers ``deprecate_no_replacement``.
    """
    if encryption_key is not None:  # deprecated
        deprecate_no_replacement(
            "the encryption_key parameter of write_to_stream", "5.0.0"
        )
    bytearr = self.get_encoded_bytes()
    stream.write(b"(")
    for c in bytearr:
        # Iterating over bytes yields ints. The former ``c != b" "`` compared
        # an int with bytes, was always true, and escaped every space.
        if not chr(c).isalnum() and c != 0x20:
            # This:
            #   stream.write(rf"\{c:0>3o}".encode())
            # gives
            #   https://github.com/davidhalter/parso/issues/207
            stream.write(("\\%03o" % c).encode())
        else:
            # c is a single byte value (0..255): emit it unchanged.
            stream.write(bytes((c,)))
    stream.write(b")")
@staticmethod
def unnumber(sin: bytes) -> bytes:
    """
    Decode the ``#xx`` escapes of a raw PDF name.

    Every ``#`` that is followed by exactly two hexadecimal digits is
    replaced by the single byte those digits encode. A ``#`` that is not
    followed by two hexadecimal digits (including a ``#`` at the very end)
    is left untouched.

    Args:
        sin: Raw bytes of the name as read from the file.

    Returns:
        The bytes with all valid escapes decoded.
    """
    i = sin.find(b"#")
    while i >= 0:
        code = sin[i + 1 : i + 3]
        # unhexlify(b"") succeeds and would silently drop the "#", so the
        # length is checked before trying to decode.
        if len(code) == 2:
            try:
                sin = sin[:i] + unhexlify(code) + sin[i + 3 :]
            except ValueError:
                # the 2 characters after "#" are not hexadecimal:
                # change nothing and carry on
                pass
        # Always continue from the next "#". Searching from i + 1 also
        # skips a "#" byte that was just produced by decoding "#23".
        i = sin.find(b"#", i + 1)
    return sin
def encode_pdfdocencoding(unicode_string: str) -> bytes:
    """
    Encode ``unicode_string`` using the ``_pdfdoc_encoding_rev`` table.

    Each character is looked up in ``_pdfdoc_encoding_rev`` and the code
    found there is appended to the result as one encoded character.

    Args:
        unicode_string: Text to encode.

    Returns:
        The encoded bytes.

    Raises:
        UnicodeEncodeError: If a character has no entry in the table.
    """
    encoded = bytearray()
    for char in unicode_string:
        try:
            code = _pdfdoc_encoding_rev[char]
        except KeyError:
            raise UnicodeEncodeError(
                "pdfdocencoding", char, -1, -1, "does not exist in translation table"
            )
        encoded.extend(b_(chr(code)))
    return bytes(encoded)
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +import logging +import re +import sys +from io import BytesIO +from math import ceil +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Sequence, + Set, + Tuple, + Union, + cast, +) + +from .._protocols import PdfReaderProtocol, PdfWriterProtocol, XmpInformationProtocol +from .._utils import ( + WHITESPACES, + StreamType, + b_, + deprecate_no_replacement, + deprecate_with_replacement, + logger_warning, + read_non_whitespace, + read_until_regex, + skip_over_comment, +) +from ..constants import ( + CheckboxRadioButtonAttributes, + FieldDictionaryAttributes, + OutlineFontFlag, +) +from ..constants import FilterTypes as FT +from ..constants import StreamAttributes as SA +from ..constants import TypArguments as TA +from ..constants import TypFitArguments as TF +from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError +from ._base import ( + BooleanObject, + ByteStringObject, + FloatObject, + IndirectObject, + NameObject, + NullObject, + NumberObject, + PdfObject, + TextStringObject, +) +from ._fit import Fit +from ._image_inline import ( + extract_inline_A85, + extract_inline_AHx, + extract_inline_DCT, + extract_inline_default, + extract_inline_RL, +) +from ._utils import read_hex_string_from_stream, read_string_from_stream + +if sys.version_info >= (3, 11): + from typing import Self +else: + from 
def _to_lst(self, lst: Any) -> List[Any]:
    """
    Normalise an operand of ``+``, ``+=`` or ``-=`` into a collection.

    Args:
        lst: The operand.

            * list, tuple or set: returned unchanged;
            * PdfObject: wrapped in a one-element list;
            * str: converted to a NameObject when it starts with "/",
              otherwise to a TextStringObject (this includes the empty
              string);
            * bytes: converted to a ByteStringObject;
            * anything else (numbers, ...): wrapped in a one-element list.

    Returns:
        The collection of elements to add or remove.
    """
    # Convert to list, internal
    if isinstance(lst, (list, tuple, set)):
        return lst
    if isinstance(lst, PdfObject):
        return [lst]
    if isinstance(lst, str):
        # startswith() is safe for "", whereas lst[0] raises IndexError
        if lst.startswith("/"):
            return [NameObject(lst)]
        return [TextStringObject(lst)]
    if isinstance(lst, bytes):
        return [ByteStringObject(lst)]
    # for numbers,...
    return [lst]
@staticmethod
def read_from_stream(
    stream: "StreamType",
    pdf: Optional["PdfReaderProtocol"],
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> "ArrayObject":
    """
    Parse a PDF array (``[ ... ]``) starting at the current position.

    Args:
        stream: Readable, seekable binary stream positioned on ``[``.
        pdf: Reader passed through to read_object for every element.
        forced_encoding: Passed through to read_object for every element.

    Returns:
        An ArrayObject with the parsed elements. If the data ends before
        the closing ``]``, the elements read so far are returned.

    Raises:
        PdfReadError: If the first byte is not ``[``.
    """
    arr = ArrayObject()
    if stream.read(1) != b"[":
        raise PdfReadError("Could not read array")
    while True:
        # skip leading whitespace
        tok = stream.read(1)
        while tok.isspace():
            tok = stream.read(1)
        if not tok:
            # End of data: without this check the seek(-1) below would
            # step back onto the last element and parse it again forever.
            break
        # check for array ending
        if tok == b"]":
            break
        stream.seek(-1, 1)
        # read and append obj
        arr.append(read_object(stream, pdf, forced_encoding))
    return arr
PdfWriterProtocol, + force_duplicate: bool = False, + ignore_fields: Optional[Sequence[Union[str, int]]] = (), + ) -> "DictionaryObject": + """Clone object into pdf_dest.""" + try: + if self.indirect_reference.pdf == pdf_dest and not force_duplicate: # type: ignore + return self + except Exception: + pass + + visited: Set[Tuple[int, int]] = set() # (idnum, generation) + d__ = cast( + "DictionaryObject", + self._reference_clone(self.__class__(), pdf_dest, force_duplicate), + ) + if ignore_fields is None: + ignore_fields = [] + if len(d__.keys()) == 0: + d__._clone(self, pdf_dest, force_duplicate, ignore_fields, visited) + return d__ + + def _clone( + self, + src: "DictionaryObject", + pdf_dest: PdfWriterProtocol, + force_duplicate: bool, + ignore_fields: Optional[Sequence[Union[str, int]]], + visited: Set[Tuple[int, int]], # (idnum, generation) + ) -> None: + """ + Update the object from src. + + Args: + src: "DictionaryObject": + pdf_dest: + force_duplicate: + ignore_fields: + """ + # first we remove for the ignore_fields + # that are for a limited number of levels + x = 0 + assert ignore_fields is not None + ignore_fields = list(ignore_fields) + while x < len(ignore_fields): + if isinstance(ignore_fields[x], int): + if cast(int, ignore_fields[x]) <= 0: + del ignore_fields[x] + del ignore_fields[x] + continue + else: + ignore_fields[x] -= 1 # type:ignore + x += 1 + # First check if this is a chain list, we need to loop to prevent recur + if any( + field not in ignore_fields + and field in src + and isinstance(src.raw_get(field), IndirectObject) + and isinstance(src[field], DictionaryObject) + and ( + src.get("/Type", None) is None + or cast(DictionaryObject, src[field]).get("/Type", None) is None + or src.get("/Type", None) + == cast(DictionaryObject, src[field]).get("/Type", None) + ) + for field in ["/Next", "/Prev", "/N", "/V"] + ): + ignore_fields = list(ignore_fields) + for lst in (("/Next", "/Prev"), ("/N", "/V")): + for k in lst: + objs = [] + if ( + k in 
def get_inherited(self, key: str, default: Any = None) -> Any:
    """
    Return the value of ``key``, looking it up in the parents if needed.

    The lookup starts in this dictionary. If the key is missing and the
    dictionary has a "/Parent" entry, the lookup continues in the parent,
    and so on up the chain.

    Args:
        key: string identifying the field to return

        default: default value to return

    Returns:
        Current key or inherited one, otherwise default value.
    """
    if key in self:
        return self[key]
    if "/Parent" not in self:
        return default
    parent = cast("DictionaryObject", self["/Parent"].get_object())
    return parent.get_inherited(key, default)
def write_to_stream(
    self, stream: "StreamType", encryption_key: Union[None, str, bytes] = None
) -> None:
    """
    Write the dictionary to ``stream`` in PDF syntax (``<< ... >>``).

    Each entry is written on its own line as the key, a space and the
    value. Keys longer than two characters whose second and last characters
    are "%" (for example "/%is_open%") are skipped and never written.

    Args:
        stream: Writable binary stream receiving the output.
        encryption_key: Deprecated. It is not used for writing; a value
            other than None is only reported via deprecate_no_replacement.
    """
    if encryption_key is not None:  # deprecated
        deprecate_no_replacement(
            "the encryption_key parameter of write_to_stream", "5.0.0"
        )
    stream.write(b"<<\n")
    for key, value in list(self.items()):
        if len(key) > 2 and key[1] == "%" and key[-1] == "%":
            continue
        # The deprecated encryption_key is not forwarded: values never
        # received it, and forwarding it to keys only repeated the
        # deprecation notice once per entry.
        key.write_to_stream(stream)
        stream.write(b" ")
        value.write_to_stream(stream)
        stream.write(b"\n")
    stream.write(b">>")
+ ) + stream.seek(curr + p + 9) + return rw[: p - 1] + + tmp = stream.read(2) + if tmp != b"<<": + raise PdfReadError( + f"Dictionary read error at byte {hex(stream.tell())}: " + "stream must begin with '<<'" + ) + data: Dict[Any, Any] = {} + while True: + tok = read_non_whitespace(stream) + if tok == b"\x00": + continue + elif tok == b"%": + stream.seek(-1, 1) + skip_over_comment(stream) + continue + if not tok: + raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) + + if tok == b">": + stream.read(1) + break + stream.seek(-1, 1) + try: + key = read_object(stream, pdf) + tok = read_non_whitespace(stream) + stream.seek(-1, 1) + value = read_object(stream, pdf, forced_encoding) + except Exception as exc: + if pdf is not None and pdf.strict: + raise PdfReadError(exc.__repr__()) + logger_warning(exc.__repr__(), __name__) + retval = DictionaryObject() + retval.update(data) + return retval # return partial data + + if not data.get(key): + data[key] = value + else: + # multiple definitions of key not permitted + msg = ( + f"Multiple definitions in dictionary at byte " + f"{hex(stream.tell())} for key {key}" + ) + if pdf is not None and pdf.strict: + raise PdfReadError(msg) + logger_warning(msg, __name__) + + pos = stream.tell() + s = read_non_whitespace(stream) + if s == b"s" and stream.read(5) == b"tream": + eol = stream.read(1) + # odd PDF file output has spaces after 'stream' keyword but before EOL. 
+ # patch provided by Danial Sandler + while eol == b" ": + eol = stream.read(1) + if eol not in (b"\n", b"\r"): + raise PdfStreamError("Stream data must be followed by a newline") + if eol == b"\r" and stream.read(1) != b"\n": + stream.seek(-1, 1) + # this is a stream object, not a dictionary + if SA.LENGTH not in data: + if pdf is not None and pdf.strict: + raise PdfStreamError("Stream length not defined") + else: + logger_warning( + f"Stream length not defined @pos={stream.tell()}", __name__ + ) + data[NameObject(SA.LENGTH)] = NumberObject(-1) + length = data[SA.LENGTH] + if isinstance(length, IndirectObject): + t = stream.tell() + assert pdf is not None # hint for mypy + length = pdf.get_object(length) + stream.seek(t, 0) + if length is None: # if the PDF is damaged + length = -1 + pstart = stream.tell() + if length > 0: + data["__streamdata__"] = stream.read(length) + else: + data["__streamdata__"] = read_until_regex( + stream, re.compile(b"endstream") + ) + e = read_non_whitespace(stream) + ndstream = stream.read(8) + if (e + ndstream) != b"endstream": + # (sigh) - the odd PDF file has a length that is too long, so + # we need to read backwards to find the "endstream" ending. + # ReportLab (unknown version) generates files with this bug, + # and Python users into PDF files tend to be our audience. + # we need to do this to correct the streamdata and chop off + # an extra character. + pos = stream.tell() + stream.seek(-10, 1) + end = stream.read(9) + if end == b"endstream": + # we found it by looking back one character further. + data["__streamdata__"] = data["__streamdata__"][:-1] + elif pdf is not None and not pdf.strict: + stream.seek(pstart, 0) + data["__streamdata__"] = read_unsized_from_stream(stream, pdf) + pos = stream.tell() + else: + stream.seek(pos, 0) + raise PdfReadError( + "Unable to find 'endstream' marker after stream at byte " + f"{hex(stream.tell())} (nd='{ndstream!r}', end='{end!r}')." 
+ ) + else: + stream.seek(pos, 0) + if "__streamdata__" in data: + return StreamObject.initialize_from_dictionary(data) + else: + retval = DictionaryObject() + retval.update(data) + return retval + + +class TreeObject(DictionaryObject): + def __init__(self, dct: Optional[DictionaryObject] = None) -> None: + DictionaryObject.__init__(self) + if dct: + self.update(dct) + + def hasChildren(self) -> bool: # deprecated + deprecate_with_replacement("hasChildren", "has_children", "4.0.0") + return self.has_children() + + def has_children(self) -> bool: + return "/First" in self + + def __iter__(self) -> Any: + return self.children() + + def children(self) -> Iterable[Any]: + if not self.has_children(): + return + + child_ref = self[NameObject("/First")] + child = child_ref.get_object() + while True: + yield child + if child == self[NameObject("/Last")]: + return + child_ref = child.get(NameObject("/Next")) # type: ignore + if child_ref is None: + return + child = child_ref.get_object() + + def add_child(self, child: Any, pdf: PdfWriterProtocol) -> None: + self.insert_child(child, None, pdf) + + def inc_parent_counter_default( + self, parent: Union[None, IndirectObject, "TreeObject"], n: int + ) -> None: + if parent is None: + return + parent = cast("TreeObject", parent.get_object()) + if "/Count" in parent: + parent[NameObject("/Count")] = NumberObject( + max(0, cast(int, parent[NameObject("/Count")]) + n) + ) + self.inc_parent_counter_default(parent.get("/Parent", None), n) + + def inc_parent_counter_outline( + self, parent: Union[None, IndirectObject, "TreeObject"], n: int + ) -> None: + if parent is None: + return + parent = cast("TreeObject", parent.get_object()) + # BooleanObject requires comparison with == not is + opn = parent.get("/%is_open%", True) == True # noqa + c = cast(int, parent.get("/Count", 0)) + if c < 0: + c = abs(c) + parent[NameObject("/Count")] = NumberObject((c + n) * (1 if opn else -1)) + if not opn: + return + 
def insert_child(
    self,
    child: Any,
    before: Any,
    pdf: "PdfWriterProtocol",
    inc_parent_counter: Optional[Callable[..., Any]] = None,
) -> "IndirectObject":
    """
    Link ``child`` into this tree node's list of children.

    Args:
        child: Object to insert; its ``get_object()`` result receives the
            "/Parent", "/Prev" and "/Next" entries, and its
            ``indirect_reference`` is what gets stored in the tree.
        before: Indirect reference of the sibling in front of which the
            child is inserted. When no visited sibling has this
            reference, the child is appended at the end.
        pdf: Not used by this method.
        inc_parent_counter: Callable invoked as
            ``inc_parent_counter(self, n)`` to update the "/Count"
            entries; defaults to ``self.inc_parent_counter_default``.

    Returns:
        The indirect reference of the inserted child.
    """
    if inc_parent_counter is None:
        inc_parent_counter = self.inc_parent_counter_default
    child_obj = child.get_object()
    child = child.indirect_reference  # get_reference(child_obj)

    prev: Optional[DictionaryObject]
    if "/First" not in self:  # no child yet
        self[NameObject("/First")] = child
        self[NameObject("/Count")] = NumberObject(0)
        self[NameObject("/Last")] = child
        child_obj[NameObject("/Parent")] = self.indirect_reference
        inc_parent_counter(self, child_obj.get("/Count", 1))
        if "/Next" in child_obj:
            del child_obj["/Next"]
        if "/Prev" in child_obj:
            del child_obj["/Prev"]
        return child
    else:
        prev = cast("DictionaryObject", self["/Last"])

    while prev.indirect_reference != before:
        if "/Next" in prev:
            prev = cast("TreeObject", prev["/Next"])
        else:  # append at the end
            prev[NameObject("/Next")] = cast("TreeObject", child)
            child_obj[NameObject("/Prev")] = prev.indirect_reference
            child_obj[NameObject("/Parent")] = self.indirect_reference
            if "/Next" in child_obj:
                del child_obj["/Next"]
            self[NameObject("/Last")] = child
            inc_parent_counter(self, child_obj.get("/Count", 1))
            return child
    try:  # insert as first or in the middle
        assert isinstance(prev["/Prev"], DictionaryObject)
        prev["/Prev"][NameObject("/Next")] = child
        child_obj[NameObject("/Prev")] = prev["/Prev"]
    except Exception:  # it means we are inserting in first position
        # A child that was never linked has no "/Next"; deleting it
        # unconditionally would raise KeyError from inside this handler.
        if "/Next" in child_obj:
            del child_obj["/Next"]
    child_obj[NameObject("/Next")] = prev
    prev[NameObject("/Prev")] = child
    child_obj[NameObject("/Parent")] = self.indirect_reference
    inc_parent_counter(self, child_obj.get("/Count", 1))
    return child
linked list and tree node count. + + Args: + prev: + prev_ref: + cur: + last: + """ + next_ref = cur.get(NameObject("/Next"), None) + if prev is None: + if next_ref: + # Removing first tree node + next_obj = next_ref.get_object() + del next_obj[NameObject("/Prev")] + self[NameObject("/First")] = next_ref + self[NameObject("/Count")] = NumberObject( + self[NameObject("/Count")] - 1 # type: ignore + ) + + else: + # Removing only tree node + self[NameObject("/Count")] = NumberObject(0) + del self[NameObject("/First")] + if NameObject("/Last") in self: + del self[NameObject("/Last")] + else: + if next_ref: + # Removing middle tree node + next_obj = next_ref.get_object() + next_obj[NameObject("/Prev")] = prev_ref + prev[NameObject("/Next")] = next_ref + else: + # Removing last tree node + assert cur == last + del prev[NameObject("/Next")] + self[NameObject("/Last")] = prev_ref + self[NameObject("/Count")] = NumberObject(self[NameObject("/Count")] - 1) # type: ignore + + def remove_child(self, child: Any) -> None: + child_obj = child.get_object() + child = child_obj.indirect_reference + + if NameObject("/Parent") not in child_obj: + raise ValueError("Removed child does not appear to be a tree item") + elif child_obj[NameObject("/Parent")] != self: + raise ValueError("Removed child is not a member of this tree") + + found = False + prev_ref = None + prev = None + cur_ref: Optional[Any] = self[NameObject("/First")] + cur: Optional[Dict[str, Any]] = cur_ref.get_object() # type: ignore + last_ref = self[NameObject("/Last")] + last = last_ref.get_object() + while cur is not None: + if cur == child_obj: + self._remove_node_from_tree(prev, prev_ref, cur, last) + found = True + break + + # Go to the next node + prev_ref = cur_ref + prev = cur + if NameObject("/Next") in cur: + cur_ref = cur[NameObject("/Next")] + cur = cur_ref.get_object() + else: + cur_ref = None + cur = None + + if not found: + raise ValueError("Removal couldn't find item in tree") + + 
def _reset_node_tree_relationship(child_obj: Any) -> None:
    """
    Call this after a node has been removed from a tree.

    This resets the node's attributes in respect to that tree: the
    "/Parent", "/Next" and "/Prev" entries are deleted when present.
    Entries that are already absent are ignored, so the call is safe for
    nodes that were never (or are no longer) linked.

    Args:
        child_obj: The dictionary of the removed node; it is modified in
            place.
    """
    for key in ("/Parent", "/Next", "/Prev"):
        if key in child_obj:
            del child_obj[key]
@staticmethod
def initialize_from_dictionary(
    data: Dict[str, Any]
) -> Union["EncodedStreamObject", "DecodedStreamObject"]:
    """
    Build a stream object from a parsed dictionary.

    Args:
        data: Dictionary entries of the stream plus the raw stream bytes
            under the "__streamdata__" key. The dictionary is modified:
            "__streamdata__" and the length entry are removed from it.

    Returns:
        An EncodedStreamObject when the dictionary has a filter entry,
        otherwise a DecodedStreamObject. The object holds the stream bytes
        and the remaining dictionary entries.

    Raises:
        KeyError: If "__streamdata__" is missing.
    """
    retval: Union[EncodedStreamObject, DecodedStreamObject]
    if SA.FILTER in data:
        retval = EncodedStreamObject()
    else:
        retval = DecodedStreamObject()
    retval._data = data.pop("__streamdata__")
    # The length entry is only discarded, so its absence is not an error.
    data.pop(SA.LENGTH, None)
    retval.update(data)
    return retval
+ from ..filters import FlateDecode + + if SA.FILTER in self: + f = self[SA.FILTER] + if isinstance(f, ArrayObject): + f = ArrayObject([NameObject(FT.FLATE_DECODE), *f]) + try: + params = ArrayObject( + [NullObject(), *self.get(SA.DECODE_PARMS, ArrayObject())] + ) + except TypeError: + # case of error where the * operator is not working (not an array + params = ArrayObject( + [NullObject(), self.get(SA.DECODE_PARMS, ArrayObject())] + ) + else: + f = ArrayObject([NameObject(FT.FLATE_DECODE), f]) + params = ArrayObject( + [NullObject(), self.get(SA.DECODE_PARMS, NullObject())] + ) + else: + f = NameObject(FT.FLATE_DECODE) + params = None + retval = EncodedStreamObject() + retval.update(self) + retval[NameObject(SA.FILTER)] = f + if params is not None: + retval[NameObject(SA.DECODE_PARMS)] = params + retval._data = FlateDecode.encode(b_(self._data), level) + return retval + + def decode_as_image(self) -> Any: + """ + Try to decode the stream object as an image + + Returns: + a PIL image if proper decoding has been found + Raises: + Exception: (any)during decoding to to invalid object or + errors during decoding will be reported + It is recommended to catch exceptions to prevent + stops in your program. 
class EncodedStreamObject(StreamObject):
    """
    Stream whose stored bytes are still encoded by the stream's filter.

    The decoded form is built on first access and cached in
    ``decoded_self``.
    """

    def __init__(self) -> None:
        # StreamObject.__init__ creates both ``_data`` and ``decoded_self``,
        # so a freshly created object can be read or written right away.
        super().__init__()

    # This overrides the parent method:
    def get_data(self) -> Union[bytes, str]:
        """
        Return the decoded stream data.

        On the first call the data is decoded with decode_stream_data and
        stored in a DecodedStreamObject. That object also receives every
        entry of this dictionary except the length, filter and decode
        parameter entries. Later calls return the cached result.
        """
        from ..filters import decode_stream_data

        if self.decoded_self is not None:
            # cached version of decoded object
            return self.decoded_self.get_data()

        # create decoded object
        decoded = DecodedStreamObject()
        decoded.set_data(b_(decode_stream_data(self)))
        for key, value in list(self.items()):
            if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS):
                decoded[key] = value
        self.decoded_self = decoded
        return decoded.get_data()

    # This overrides the parent method:
    def set_data(self, data: bytes) -> None:  # deprecated
        """
        Replace the stream content with ``data`` (given in decoded form).

        Args:
            data: New decoded content.

        Raises:
            TypeError: If ``data`` is not bytes.
            PdfReadError: If the stream's filter is not FlateDecode.
        """
        from ..filters import FlateDecode

        if self.get(SA.FILTER, "") == FT.FLATE_DECODE:
            if not isinstance(data, bytes):
                raise TypeError("data must be bytes")
            if self.decoded_self is None:
                # Build the decoded cache first; it may not exist yet when
                # set_data is called before any get_data.
                self.get_data()
            assert self.decoded_self is not None  # hint for mypy
            self.decoded_self.set_data(data)
            super().set_data(FlateDecode.encode(data))
        else:
            raise PdfReadError(
                "Streams encoded with different filter from only FlateDecode is not supported"
            )
+ + These fields are "rebuilt" lazily, when accessed: + + * when .get_data() is called, if ._data is None, it is rebuilt from ._operations. + * when .operations is called, if ._operations is None, it is rebuilt from ._data. + + Conversely, these fields can be invalidated: + + * when .set_data() is called, ._operations is set to None. + * when .operations is set, ._data is set to None. + """ + + def __init__( + self, + stream: Any, + pdf: Any, + forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, + ) -> None: + self.pdf = pdf + + # The inner list has two elements: + # Element 0: List + # Element 1: str + self._operations: List[Tuple[Any, Any]] = [] + + # stream may be a StreamObject or an ArrayObject containing + # multiple StreamObjects to be cat'd together. + if stream is None: + super().set_data(b"") + else: + stream = stream.get_object() + if isinstance(stream, ArrayObject): + data = b"" + for s in stream: + data += b_(s.get_object().get_data()) + if len(data) == 0 or data[-1] != b"\n": + data += b"\n" + super().set_data(bytes(data)) + else: + stream_data = stream.get_data() + assert stream_data is not None + super().set_data(b_(stream_data)) + self.forced_encoding = forced_encoding + + def clone( + self, + pdf_dest: Any, + force_duplicate: bool = False, + ignore_fields: Optional[Sequence[Union[str, int]]] = (), + ) -> "ContentStream": + """ + Clone object into pdf_dest. 
+ + Args: + pdf_dest: + force_duplicate: + ignore_fields: + + Returns: + The cloned ContentStream + """ + try: + if self.indirect_reference.pdf == pdf_dest and not force_duplicate: # type: ignore + return self + except Exception: + pass + + visited: Set[Tuple[int, int]] = set() + d__ = cast( + "ContentStream", + self._reference_clone( + self.__class__(None, None), pdf_dest, force_duplicate + ), + ) + if ignore_fields is None: + ignore_fields = [] + d__._clone(self, pdf_dest, force_duplicate, ignore_fields, visited) + return d__ + + def _clone( + self, + src: DictionaryObject, + pdf_dest: PdfWriterProtocol, + force_duplicate: bool, + ignore_fields: Optional[Sequence[Union[str, int]]], + visited: Set[Tuple[int, int]], + ) -> None: + """ + Update the object from src. + + Args: + src: + pdf_dest: + force_duplicate: + ignore_fields: + """ + src_cs = cast("ContentStream", src) + super().set_data(b_(src_cs._data)) + self.pdf = pdf_dest + self._operations = list(src_cs._operations) + self.forced_encoding = src_cs.forced_encoding + # no need to call DictionaryObjection or anything + # like super(DictionaryObject,self)._clone(src, pdf_dest, force_duplicate, ignore_fields, visited) + + def _parse_content_stream(self, stream: StreamType) -> None: + # 7.8.2 Content Streams + stream.seek(0, 0) + operands: List[Union[int, str, PdfObject]] = [] + while True: + peek = read_non_whitespace(stream) + if peek == b"" or peek == 0: + break + stream.seek(-1, 1) + if peek.isalpha() or peek in (b"'", b'"'): + operator = read_until_regex(stream, NameObject.delimiter_pattern) + if operator == b"BI": + # begin inline image - a completely different parsing + # mechanism is required, of course... thanks buddy... + assert operands == [] + ii = self._read_inline_image(stream) + self._operations.append((ii, b"INLINE IMAGE")) + else: + self._operations.append((operands, operator)) + operands = [] + elif peek == b"%": + # If we encounter a comment in the content stream, we have to + # handle it here. 
Typically, read_object will handle + # encountering a comment -- but read_object assumes that + # following the comment must be the object we're trying to + # read. In this case, it could be an operator instead. + while peek not in (b"\r", b"\n", b""): + peek = stream.read(1) + else: + operands.append(read_object(stream, None, self.forced_encoding)) + + def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: + # begin reading just after the "BI" - begin image + # first read the dictionary of settings. + settings = DictionaryObject() + while True: + tok = read_non_whitespace(stream) + stream.seek(-1, 1) + if tok == b"I": + # "ID" - begin of image data + break + key = read_object(stream, self.pdf) + tok = read_non_whitespace(stream) + stream.seek(-1, 1) + value = read_object(stream, self.pdf) + settings[key] = value + # left at beginning of ID + tmp = stream.read(3) + assert tmp[:2] == b"ID" + filtr = settings.get("/F", settings.get("/Filter", "not set")) + savpos = stream.tell() + if isinstance(filtr, list): + filtr = filtr[0] # used forencoding + if "AHx" in filtr or "ASCIIHexDecode" in filtr: + data = extract_inline_AHx(stream) + elif "A85" in filtr or "ASCII85Decode" in filtr: + data = extract_inline_A85(stream) + elif "RL" in filtr or "RunLengthDecode" in filtr: + data = extract_inline_RL(stream) + elif "DCT" in filtr or "DCTDecode" in filtr: + data = extract_inline_DCT(stream) + elif filtr == "not set": + cs = settings.get("/CS", "") + if "RGB" in cs: + lcs = 3 + elif "CMYK" in cs: + lcs = 4 + else: + bits = settings.get( + "/BPC", + 8 if cs in {"/I", "/G", "/Indexed", "/DeviceGray"} else -1, + ) + if bits > 0: + lcs = bits / 8.0 + else: + data = extract_inline_default(stream) + lcs = -1 + if lcs > 0: + data = stream.read( + ceil(cast(int, settings["/W"]) * lcs) * cast(int, settings["/H"]) + ) + ei = read_non_whitespace(stream) + stream.seek(-1, 1) + else: + data = extract_inline_default(stream) + + ei = stream.read(3) + stream.seek(-1, 1) + if 
ei[0:2] != b"EI" or ei[2:3] not in WHITESPACES: + stream.seek(savpos, 0) + data = extract_inline_default(stream) + return {"settings": settings, "data": data} + + # This overrides the parent method: + def get_data(self) -> bytes: + if not self._data: + new_data = BytesIO() + for operands, operator in self._operations: + if operator == b"INLINE IMAGE": + new_data.write(b"BI") + dict_text = BytesIO() + operands["settings"].write_to_stream(dict_text) + new_data.write(dict_text.getvalue()[2:-2]) + new_data.write(b"ID ") + new_data.write(operands["data"]) + new_data.write(b"EI") + else: + for op in operands: + op.write_to_stream(new_data) + new_data.write(b" ") + new_data.write(b_(operator)) + new_data.write(b"\n") + self._data = new_data.getvalue() + return b_(self._data) + + # This overrides the parent method: + def set_data(self, data: bytes) -> None: + super().set_data(data) + self._operations = [] + + @property + def operations(self) -> List[Tuple[Any, Any]]: + if not self._operations and self._data: + self._parse_content_stream(BytesIO(b_(self._data))) + self._data = b"" + return self._operations + + @operations.setter + def operations(self, operations: List[Tuple[Any, Any]]) -> None: + self._operations = operations + self._data = b"" + + def isolate_graphics_state(self) -> None: + if self._operations: + self._operations.insert(0, ([], "q")) + self._operations.append(([], "Q")) + elif self._data: + self._data = b"q\n" + b_(self._data) + b"\nQ\n" + + # This overrides the parent method: + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None + ) -> None: + if not self._data and self._operations: + self.get_data() # this ensures ._data is rebuilt + super().write_to_stream(stream, encryption_key) + + +def read_object( + stream: StreamType, + pdf: Optional[PdfReaderProtocol], + forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, +) -> Union[PdfObject, int, str, ContentStream]: + tok = stream.read(1) + 
stream.seek(-1, 1) # reset to start + if tok == b"/": + return NameObject.read_from_stream(stream, pdf) + elif tok == b"<": + # hexadecimal string OR dictionary + peek = stream.read(2) + stream.seek(-2, 1) # reset to start + if peek == b"<<": + return DictionaryObject.read_from_stream(stream, pdf, forced_encoding) + else: + return read_hex_string_from_stream(stream, forced_encoding) + elif tok == b"[": + return ArrayObject.read_from_stream(stream, pdf, forced_encoding) + elif tok == b"t" or tok == b"f": + return BooleanObject.read_from_stream(stream) + elif tok == b"(": + return read_string_from_stream(stream, forced_encoding) + elif tok == b"e" and stream.read(6) == b"endobj": + stream.seek(-6, 1) + return NullObject() + elif tok == b"n": + return NullObject.read_from_stream(stream) + elif tok == b"%": + # comment + while tok not in (b"\r", b"\n"): + tok = stream.read(1) + # Prevents an infinite loop by raising an error if the stream is at + # the EOF + if len(tok) <= 0: + raise PdfStreamError("File ended unexpectedly.") + tok = read_non_whitespace(stream) + stream.seek(-1, 1) + return read_object(stream, pdf, forced_encoding) + elif tok in b"0123456789+-.": + # number object OR indirect reference + peek = stream.read(20) + stream.seek(-len(peek), 1) # reset to start + if IndirectPattern.match(peek) is not None: + assert pdf is not None # hint for mypy + return IndirectObject.read_from_stream(stream, pdf) + else: + return NumberObject.read_from_stream(stream) + else: + stream.seek(-20, 1) + raise PdfReadError( + f"Invalid Elementary Object starting with {tok!r} @{stream.tell()}: {stream.read(80).__repr__()}" + ) + + +class Field(TreeObject): + """ + A class representing a field dictionary. 
+ + This class is accessed through + :meth:`get_fields()<pypdf.PdfReader.get_fields>` + """ + + def __init__(self, data: DictionaryObject) -> None: + DictionaryObject.__init__(self) + field_attributes = ( + FieldDictionaryAttributes.attributes() + + CheckboxRadioButtonAttributes.attributes() + ) + self.indirect_reference = data.indirect_reference + for attr in field_attributes: + try: + self[NameObject(attr)] = data[attr] + except KeyError: + pass + if isinstance(self.get("/V"), EncodedStreamObject): + d = cast(EncodedStreamObject, self[NameObject("/V")]).get_data() + if isinstance(d, bytes): + d_str = d.decode() + elif d is None: + d_str = "" + else: + raise Exception("Should never happen") + self[NameObject("/V")] = TextStringObject(d_str) + + # TABLE 8.69 Entries common to all field dictionaries + @property + def field_type(self) -> Optional[NameObject]: + """Read-only property accessing the type of this field.""" + return self.get(FieldDictionaryAttributes.FT) + + @property + def parent(self) -> Optional[DictionaryObject]: + """Read-only property accessing the parent of this field.""" + return self.get(FieldDictionaryAttributes.Parent) + + @property + def kids(self) -> Optional["ArrayObject"]: + """Read-only property accessing the kids of this field.""" + return self.get(FieldDictionaryAttributes.Kids) + + @property + def name(self) -> Optional[str]: + """Read-only property accessing the name of this field.""" + return self.get(FieldDictionaryAttributes.T) + + @property + def alternate_name(self) -> Optional[str]: + """Read-only property accessing the alternate name of this field.""" + return self.get(FieldDictionaryAttributes.TU) + + @property + def mapping_name(self) -> Optional[str]: + """ + Read-only property accessing the mapping name of this field. 
+ + This name is used by pypdf as a key in the dictionary returned by + :meth:`get_fields()<pypdf.PdfReader.get_fields>` + """ + return self.get(FieldDictionaryAttributes.TM) + + @property + def flags(self) -> Optional[int]: + """ + Read-only property accessing the field flags, specifying various + characteristics of the field (see Table 8.70 of the PDF 1.7 reference). + """ + return self.get(FieldDictionaryAttributes.Ff) + + @property + def value(self) -> Optional[Any]: + """ + Read-only property accessing the value of this field. + + Format varies based on field type. + """ + return self.get(FieldDictionaryAttributes.V) + + @property + def default_value(self) -> Optional[Any]: + """Read-only property accessing the default value of this field.""" + return self.get(FieldDictionaryAttributes.DV) + + @property + def additional_actions(self) -> Optional[DictionaryObject]: + """ + Read-only property accessing the additional actions dictionary. + + This dictionary defines the field's behavior in response to trigger + events. See Section 8.5.2 of the PDF 1.7 reference. + """ + return self.get(FieldDictionaryAttributes.AA) + + +class Destination(TreeObject): + """ + A class representing a destination within a PDF file. + + See section 12.3.2 of the PDF 2.0 reference. + + Args: + title: Title of this destination. + page: Reference to the page of this destination. Should + be an instance of :class:`IndirectObject<pypdf.generic.IndirectObject>`. + fit: How the destination is displayed. + + Raises: + PdfReadError: If destination type is invalid. 
+ """ + + node: Optional[ + DictionaryObject + ] = None # node provide access to the original Object + + def __init__( + self, + title: str, + page: Union[NumberObject, IndirectObject, NullObject, DictionaryObject], + fit: Fit, + ) -> None: + self._filtered_children: List[Any] = [] # used in PdfWriter + + typ = fit.fit_type + args = fit.fit_args + + DictionaryObject.__init__(self) + self[NameObject("/Title")] = TextStringObject(title) + self[NameObject("/Page")] = page + self[NameObject("/Type")] = typ + + # from table 8.2 of the PDF 1.7 reference. + if typ == "/XYZ": + if len(args) < 1: # left is missing : should never occur + args.append(NumberObject(0.0)) + if len(args) < 2: # top is missing + args.append(NumberObject(0.0)) + if len(args) < 3: # zoom is missing + args.append(NumberObject(0.0)) + ( + self[NameObject(TA.LEFT)], + self[NameObject(TA.TOP)], + self[NameObject("/Zoom")], + ) = args + elif len(args) == 0: + pass + elif typ == TF.FIT_R: + ( + self[NameObject(TA.LEFT)], + self[NameObject(TA.BOTTOM)], + self[NameObject(TA.RIGHT)], + self[NameObject(TA.TOP)], + ) = args + elif typ in [TF.FIT_H, TF.FIT_BH]: + try: # Preferred to be more robust not only to null parameters + (self[NameObject(TA.TOP)],) = args + except Exception: + (self[NameObject(TA.TOP)],) = (NullObject(),) + elif typ in [TF.FIT_V, TF.FIT_BV]: + try: # Preferred to be more robust not only to null parameters + (self[NameObject(TA.LEFT)],) = args + except Exception: + (self[NameObject(TA.LEFT)],) = (NullObject(),) + elif typ in [TF.FIT, TF.FIT_B]: + pass + else: + raise PdfReadError(f"Unknown Destination Type: {typ!r}") + + @property + def dest_array(self) -> "ArrayObject": + return ArrayObject( + [self.raw_get("/Page"), self["/Type"]] + + [ + self[x] + for x in ["/Left", "/Bottom", "/Right", "/Top", "/Zoom"] + if x in self + ] + ) + + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None + ) -> None: + if encryption_key is not None: # deprecated + 
deprecate_no_replacement( + "the encryption_key parameter of write_to_stream", "5.0.0" + ) + stream.write(b"<<\n") + key = NameObject("/D") + key.write_to_stream(stream) + stream.write(b" ") + value = self.dest_array + value.write_to_stream(stream) + + key = NameObject("/S") + key.write_to_stream(stream) + stream.write(b" ") + value_s = NameObject("/GoTo") + value_s.write_to_stream(stream) + + stream.write(b"\n") + stream.write(b">>") + + @property + def title(self) -> Optional[str]: + """Read-only property accessing the destination title.""" + return self.get("/Title") + + @property + def page(self) -> Optional[int]: + """Read-only property accessing the destination page number.""" + return self.get("/Page") + + @property + def typ(self) -> Optional[str]: + """Read-only property accessing the destination type.""" + return self.get("/Type") + + @property + def zoom(self) -> Optional[int]: + """Read-only property accessing the zoom factor.""" + return self.get("/Zoom", None) + + @property + def left(self) -> Optional[FloatObject]: + """Read-only property accessing the left horizontal coordinate.""" + return self.get("/Left", None) + + @property + def right(self) -> Optional[FloatObject]: + """Read-only property accessing the right horizontal coordinate.""" + return self.get("/Right", None) + + @property + def top(self) -> Optional[FloatObject]: + """Read-only property accessing the top vertical coordinate.""" + return self.get("/Top", None) + + @property + def bottom(self) -> Optional[FloatObject]: + """Read-only property accessing the bottom vertical coordinate.""" + return self.get("/Bottom", None) + + @property + def color(self) -> Optional["ArrayObject"]: + """Read-only property accessing the color in (R, G, B) with values 0.0-1.0.""" + return self.get( + "/C", ArrayObject([FloatObject(0), FloatObject(0), FloatObject(0)]) + ) + + @property + def font_format(self) -> Optional[OutlineFontFlag]: + """ + Read-only property accessing the font type. 
+ + 1=italic, 2=bold, 3=both + """ + return self.get("/F", 0) + + @property + def outline_count(self) -> Optional[int]: + """ + Read-only property accessing the outline count. + + positive = expanded + negative = collapsed + absolute value = number of visible descendants at all levels + """ + return self.get("/Count", None) diff --git a/.venv/lib/python3.12/site-packages/pypdf/generic/_fit.py b/.venv/lib/python3.12/site-packages/pypdf/generic/_fit.py new file mode 100644 index 00000000..4132f4b7 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/generic/_fit.py @@ -0,0 +1,168 @@ +from typing import Any, Optional, Tuple, Union + + +class Fit: + def __init__( + self, fit_type: str, fit_args: Tuple[Union[None, float, Any], ...] = () + ): + from ._base import FloatObject, NameObject, NullObject + + self.fit_type = NameObject(fit_type) + self.fit_args = [ + NullObject() if a is None or isinstance(a, NullObject) else FloatObject(a) + for a in fit_args + ] + + @classmethod + def xyz( + cls, + left: Optional[float] = None, + top: Optional[float] = None, + zoom: Optional[float] = None, + ) -> "Fit": + """ + Display the page designated by page, with the coordinates (left, top) + positioned at the upper-left corner of the window and the contents + of the page magnified by the factor zoom. + + A null value for any of the parameters left, top, or zoom specifies + that the current value of that parameter is to be retained unchanged. + + A zoom value of 0 has the same meaning as a null value. + + Args: + left: + top: + zoom: + + Returns: + The created fit object. + """ + return Fit(fit_type="/XYZ", fit_args=(left, top, zoom)) + + @classmethod + def fit(cls) -> "Fit": + """ + Display the page designated by page, with its contents magnified just + enough to fit the entire page within the window both horizontally and + vertically. 
+ + If the required horizontal and vertical magnification factors are + different, use the smaller of the two, centering the page within the + window in the other dimension. + """ + return Fit(fit_type="/Fit") + + @classmethod + def fit_horizontally(cls, top: Optional[float] = None) -> "Fit": + """ + Display the page designated by page, with the vertical coordinate top + positioned at the top edge of the window and the contents of the page + magnified just enough to fit the entire width of the page within the + window. + + A null value for ``top`` specifies that the current value of that + parameter is to be retained unchanged. + + Args: + top: + + Returns: + The created fit object. + """ + return Fit(fit_type="/FitH", fit_args=(top,)) + + @classmethod + def fit_vertically(cls, left: Optional[float] = None) -> "Fit": + return Fit(fit_type="/FitV", fit_args=(left,)) + + @classmethod + def fit_rectangle( + cls, + left: Optional[float] = None, + bottom: Optional[float] = None, + right: Optional[float] = None, + top: Optional[float] = None, + ) -> "Fit": + """ + Display the page designated by page, with its contents magnified + just enough to fit the rectangle specified by the coordinates + left, bottom, right, and top entirely within the window + both horizontally and vertically. + + If the required horizontal and vertical magnification factors are + different, use the smaller of the two, centering the rectangle within + the window in the other dimension. + + A null value for any of the parameters may result in unpredictable + behavior. + + Args: + left: + bottom: + right: + top: + + Returns: + The created fit object. + """ + return Fit(fit_type="/FitR", fit_args=(left, bottom, right, top)) + + @classmethod + def fit_box(cls) -> "Fit": + """ + Display the page designated by page, with its contents magnified just + enough to fit its bounding box entirely within the window both + horizontally and vertically. 
+ + If the required horizontal and vertical magnification factors are + different, use the smaller of the two, centering the bounding box + within the window in the other dimension. + """ + return Fit(fit_type="/FitB") + + @classmethod + def fit_box_horizontally(cls, top: Optional[float] = None) -> "Fit": + """ + Display the page designated by page, with the vertical coordinate top + positioned at the top edge of the window and the contents of the page + magnified just enough to fit the entire width of its bounding box + within the window. + + A null value for top specifies that the current value of that parameter + is to be retained unchanged. + + Args: + top: + + Returns: + The created fit object. + """ + return Fit(fit_type="/FitBH", fit_args=(top,)) + + @classmethod + def fit_box_vertically(cls, left: Optional[float] = None) -> "Fit": + """ + Display the page designated by page, with the horizontal coordinate + left positioned at the left edge of the window and the contents of the + page magnified just enough to fit the entire height of its bounding box + within the window. + + A null value for left specifies that the current value of that + parameter is to be retained unchanged. + + Args: + left: + + Returns: + The created fit object. + """ + return Fit(fit_type="/FitBV", fit_args=(left,)) + + def __str__(self) -> str: + if not self.fit_args: + return f"Fit({self.fit_type})" + return f"Fit({self.fit_type}, {self.fit_args})" + + +DEFAULT_FIT = Fit.fit() diff --git a/.venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py b/.venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py new file mode 100644 index 00000000..41826ac3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/generic/_image_inline.py @@ -0,0 +1,235 @@ +# Copyright (c) 2024, pypdf contributors +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import logging +from io import BytesIO + +from .._utils import ( + WHITESPACES, + StreamType, + read_non_whitespace, +) +from ..errors import PdfReadError + +logger = logging.getLogger(__name__) + +BUFFER_SIZE = 8192 + + +def extract_inline_AHx(stream: StreamType) -> bytes: + """ + Extract HexEncoded Stream from Inline Image. + the stream will be moved onto the EI + """ + data_out: bytes = b"" + # Read data until delimiter > and EI as backup + # ignoring backup. 
+ while True: + data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE) + if not data_buffered: + raise PdfReadError("Unexpected end of stream") + pos_tok = data_buffered.find(b">") + if pos_tok >= 0: # found > + data_out += data_buffered[: (pos_tok + 1)] + stream.seek(-len(data_buffered) + pos_tok + 1, 1) + break + pos_ei = data_buffered.find(b"EI") + if pos_ei >= 0: # found EI + stream.seek(-len(data_buffered) + pos_ei - 1, 1) + c = stream.read(1) + while c in WHITESPACES: + stream.seek(-2, 1) + c = stream.read(1) + pos_ei -= 1 + data_out += data_buffered[:pos_ei] + break + elif len(data_buffered) == 2: + data_out += data_buffered + raise PdfReadError("Unexpected end of stream") + else: # > nor EI found + data_out += data_buffered[:-2] + stream.seek(-2, 1) + + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) + stream.seek(-3, 1) + if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): + raise PdfReadError("EI stream not found") + return data_out + + +def extract_inline_A85(stream: StreamType) -> bytes: + """ + Extract A85 Stream from Inline Image. + the stream will be moved onto the EI + """ + data_out: bytes = b"" + # Read data up to delimiter ~> + # see §3.3.2 from PDF ref 1.7 + while True: + data_buffered = read_non_whitespace(stream) + stream.read(BUFFER_SIZE) + if not data_buffered: + raise PdfReadError("Unexpected end of stream") + pos_tok = data_buffered.find(b"~>") + if pos_tok >= 0: # found! 
+ data_out += data_buffered[: pos_tok + 2] + stream.seek(-len(data_buffered) + pos_tok + 2, 1) + break + elif len(data_buffered) == 2: # end of buffer + data_out += data_buffered + raise PdfReadError("Unexpected end of stream") + data_out += data_buffered[ + :-2 + ] # back by one char in case of in the middle of ~> + stream.seek(-2, 1) + + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) + stream.seek(-3, 1) + if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): + raise PdfReadError("EI stream not found") + return data_out + + +def extract_inline_RL(stream: StreamType) -> bytes: + """ + Extract RL Stream from Inline Image. + the stream will be moved onto the EI + """ + data_out: bytes = b"" + # Read data up to delimiter ~> + # see §3.3.4 from PDF ref 1.7 + while True: + data_buffered = stream.read(BUFFER_SIZE) + if not data_buffered: + raise PdfReadError("Unexpected end of stream") + pos_tok = data_buffered.find(b"\x80") + if pos_tok >= 0: # found + data_out += data_buffered[: pos_tok + 1] + stream.seek(-len(data_buffered) + pos_tok + 1, 1) + break + data_out += data_buffered + + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) + stream.seek(-3, 1) + if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): + raise PdfReadError("EI stream not found") + return data_out + + +def extract_inline_DCT(stream: StreamType) -> bytes: + """ + Extract DCT (JPEG) Stream from Inline Image. 
+ the stream will be moved onto the EI + """ + data_out: bytes = b"" + # Read Blocks of data (ID/Size/data) up to ID=FF/D9 + # see https://www.digicamsoft.com/itu/itu-t81-36.html + notfirst = False + while True: + c = stream.read(1) + if notfirst or (c == b"\xff"): + data_out += c + if c != b"\xff": + continue + else: + notfirst = True + c = stream.read(1) + data_out += c + if c == b"\xff": + stream.seek(-1, 1) # pragma: no cover + elif c == b"\x00": # stuffing + pass + elif c == b"\xd9": # end + break + elif c in ( + b"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc9\xca\xcb\xcc\xcd\xce\xcf" + b"\xda\xdb\xdc\xdd\xde\xdf" + b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe" + ): + c = stream.read(2) + data_out += c + sz = c[0] * 256 + c[1] + data_out += stream.read(sz - 2) + # else: pass + + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) + stream.seek(-3, 1) + if ei_tok[0:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): + raise PdfReadError("EI stream not found") + return data_out + + +def extract_inline_default(stream: StreamType) -> bytes: + """ + Legacy method + used by default + """ + stream_out = BytesIO() + # Read the inline image, while checking for EI (End Image) operator. + while True: + data_buffered = stream.read(BUFFER_SIZE) + if not data_buffered: + raise PdfReadError("Unexpected end of stream") + pos_ei = data_buffered.find( + b"E" + ) # we can not look straight for "EI" because it may not have been loaded in the buffer + + if pos_ei == -1: + stream_out.write(data_buffered) + else: + # Write out everything including E (the one from EI to be removed). + stream_out.write(data_buffered[0 : pos_ei + 1]) + sav_pos_ei = stream_out.tell() - 1 + # Seek back in the stream to read the E next. 
+ stream.seek(pos_ei + 1 - len(data_buffered), 1) + saved_pos = stream.tell() + # Check for End Image + tok2 = stream.read(1) # I of "EI" + if tok2 != b"I": + stream.seek(saved_pos, 0) + continue + tok3 = stream.read(1) # possible space after "EI" + if tok3 not in WHITESPACES: + stream.seek(saved_pos, 0) + continue + while tok3 in WHITESPACES: + tok3 = stream.read(1) + if data_buffered[pos_ei - 1 : pos_ei] not in WHITESPACES and tok3 not in { + b"Q", + b"E", + }: # for Q ou EMC + stream.seek(saved_pos, 0) + continue + # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficients + # remove E(I) wrongly inserted earlier + stream_out.truncate(sav_pos_ei) + break + + return stream_out.getvalue() diff --git a/.venv/lib/python3.12/site-packages/pypdf/generic/_outline.py b/.venv/lib/python3.12/site-packages/pypdf/generic/_outline.py new file mode 100644 index 00000000..4d6a47da --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/generic/_outline.py @@ -0,0 +1,33 @@ +from typing import Union + +from .._utils import StreamType, deprecate_no_replacement +from ._base import NameObject +from ._data_structures import Destination + + +class OutlineItem(Destination): + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None + ) -> None: + if encryption_key is not None: # deprecated + deprecate_no_replacement( + "the encryption_key parameter of write_to_stream", "5.0.0" + ) + stream.write(b"<<\n") + for key in [ + NameObject(x) + for x in ["/Title", "/Parent", "/First", "/Last", "/Next", "/Prev"] + if x in self + ]: + key.write_to_stream(stream) + stream.write(b" ") + value = self.raw_get(key) + value.write_to_stream(stream) + stream.write(b"\n") + key = NameObject("/Dest") + key.write_to_stream(stream) + stream.write(b" ") + value = self.dest_array + value.write_to_stream(stream) + stream.write(b"\n") + stream.write(b">>") diff --git a/.venv/lib/python3.12/site-packages/pypdf/generic/_rectangle.py 
# --- pypdf/generic/_rectangle.py (new file in this diff) ---
from typing import Any, Tuple, Union

from ._base import FloatObject, NumberObject
from ._data_structures import ArrayObject


class RectangleObject(ArrayObject):
    """
    This class is used to represent *page boxes* in pypdf.

    These boxes include:

    * :attr:`artbox <pypdf._page.PageObject.artbox>`
    * :attr:`bleedbox <pypdf._page.PageObject.bleedbox>`
    * :attr:`cropbox <pypdf._page.PageObject.cropbox>`
    * :attr:`mediabox <pypdf._page.PageObject.mediabox>`
    * :attr:`trimbox <pypdf._page.PageObject.trimbox>`

    The four stored entries are, in order: ``left``, ``bottom``, ``right``
    and ``top``.
    """

    def __init__(
        self, arr: Union["RectangleObject", Tuple[float, float, float, float]]
    ) -> None:
        """
        Build the rectangle from four coordinates.

        Args:
            arr: Four values in the order left, bottom, right, top. Values
                that are not already ``NumberObject``/``FloatObject`` are
                wrapped in ``FloatObject``.
        """
        # must have four points
        # NOTE(review): kept as an assert so existing callers see the same
        # exception type; be aware it is skipped when Python runs with -O.
        assert len(arr) == 4
        # automatically convert arr[x] into NumberObject(arr[x]) if necessary
        ArrayObject.__init__(self, [self._ensure_is_number(x) for x in arr])  # type: ignore

    def _ensure_is_number(self, value: Any) -> Union[FloatObject, NumberObject]:
        """Return ``value`` unchanged if it is a PDF number, else as ``FloatObject``."""
        if not isinstance(value, (NumberObject, FloatObject)):
            value = FloatObject(value)
        return value

    def scale(self, sx: float, sy: float) -> "RectangleObject":
        """
        Return a new rectangle with x coordinates multiplied by ``sx`` and
        y coordinates multiplied by ``sy``; ``self`` is left untouched.
        """
        return RectangleObject(
            (
                float(self.left) * sx,
                float(self.bottom) * sy,
                float(self.right) * sx,
                float(self.top) * sy,
            )
        )

    def __repr__(self) -> str:
        return f"RectangleObject({list(self)!r})"

    @property
    def left(self) -> FloatObject:
        """First entry of the array."""
        return self[0]

    @left.setter
    def left(self, f: float) -> None:
        self[0] = FloatObject(f)

    @property
    def bottom(self) -> FloatObject:
        """Second entry of the array."""
        return self[1]

    @bottom.setter
    def bottom(self, f: float) -> None:
        self[1] = FloatObject(f)

    @property
    def right(self) -> FloatObject:
        """Third entry of the array."""
        return self[2]

    @right.setter
    def right(self, f: float) -> None:
        self[2] = FloatObject(f)

    @property
    def top(self) -> FloatObject:
        """Fourth entry of the array."""
        return self[3]

    @top.setter
    def top(self, f: float) -> None:
        self[3] = FloatObject(f)

    @property
    def lower_left(self) -> Tuple[float, float]:
        """
        Property to read and modify the lower left coordinate of this box
        in (x,y) form.
        """
        return self.left, self.bottom

    @lower_left.setter
    def lower_left(self, value: Tuple[float, float]) -> None:
        self[0], self[1] = (self._ensure_is_number(x) for x in value)

    @property
    def lower_right(self) -> Tuple[float, float]:
        """
        Property to read and modify the lower right coordinate of this box
        in (x,y) form.
        """
        return self.right, self.bottom

    @lower_right.setter
    def lower_right(self, value: Tuple[float, float]) -> None:
        self[2], self[1] = (self._ensure_is_number(x) for x in value)

    @property
    def upper_left(self) -> Tuple[float, float]:
        """
        Property to read and modify the upper left coordinate of this box
        in (x,y) form.
        """
        return self.left, self.top

    @upper_left.setter
    def upper_left(self, value: Tuple[float, float]) -> None:
        self[0], self[3] = (self._ensure_is_number(x) for x in value)

    @property
    def upper_right(self) -> Tuple[float, float]:
        """
        Property to read and modify the upper right coordinate of this box
        in (x,y) form.
        """
        return self.right, self.top

    @upper_right.setter
    def upper_right(self, value: Tuple[float, float]) -> None:
        self[2], self[3] = (self._ensure_is_number(x) for x in value)

    @property
    def width(self) -> float:
        """Difference ``right - left``."""
        return self.right - self.left

    @property
    def height(self) -> float:
        """Difference ``top - bottom``."""
        return self.top - self.bottom


# --- pypdf/generic/_utils.py (new file in this diff) ---
import codecs
from typing import Dict, List, Tuple, Union

from .._codecs import _pdfdoc_encoding
from .._utils import StreamType, b_, logger_warning, read_non_whitespace
from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError
from ._base import ByteStringObject, TextStringObject

# Replacement bytes for the single character that follows a backslash inside
# a literal string. Built once at import time instead of on every backslash.
_LITERAL_STRING_ESCAPES: Dict[bytes, bytes] = {
    b"n": b"\n",
    b"r": b"\r",
    b"t": b"\t",
    b"b": b"\b",
    b"f": b"\f",
    b"c": rb"\c",
    b"(": b"(",
    b")": b")",
    b"/": b"/",
    b"\\": b"\\",
    b" ": b" ",
    b"%": b"%",
    b"<": b"<",
    b">": b">",
    b"[": b"[",
    b"]": b"]",
    b"#": b"#",
    b"_": b"_",
    b"&": b"&",
    b"$": b"$",
}


def hex_to_rgb(value: str) -> Tuple[float, float, float]:
    """
    Convert a ``"#rrggbb"`` (or ``"rrggbb"``) colour into three floats.

    Args:
        value: Six hexadecimal digits, optionally preceded by ``#``.

    Returns:
        ``(red, green, blue)``, each component divided by 255.

    Raises:
        ValueError: If one of the three two-character slices is not valid
            hexadecimal (this includes input that is too short).
    """
    digits = value.lstrip("#")  # strip once rather than once per channel
    return tuple(int(digits[i : i + 2], 16) / 255.0 for i in (0, 2, 4))  # type: ignore


def read_hex_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a hexadecimal string, up to and including the closing ``>``.

    The first byte of ``stream`` is consumed without being inspected
    (presumably the opening ``<`` -- confirm against the caller). Whitespace
    between digits is skipped by ``read_non_whitespace``. A trailing unpaired
    digit is completed with ``0``.

    Args:
        stream: Stream to read from.
        forced_encoding: Passed through to :func:`create_string_object`.

    Returns:
        Whatever :func:`create_string_object` builds from the decoded bytes.

    Raises:
        PdfStreamError: If the stream ends before ``>`` is found.
    """
    stream.read(1)
    chars: List[str] = []
    pair = b""
    while True:
        tok = read_non_whitespace(stream)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b">":
            break
        pair += tok
        if len(pair) == 2:
            chars.append(chr(int(pair, base=16)))
            pair = b""
    if len(pair) == 1:
        # odd number of digits: the missing final digit counts as 0
        chars.append(chr(int(pair + b"0", base=16)))
    return create_string_object(b_("".join(chars)), forced_encoding)


def _read_escape_sequence(stream: StreamType) -> bytes:
    """Consume what follows a backslash in a literal string; return its bytes."""
    tok = stream.read(1)
    if tok in _LITERAL_STRING_ESCAPES:
        return _LITERAL_STRING_ESCAPES[tok]
    if b"0" <= tok <= b"7":
        # "The number ddd may consist of one, two, or three
        # octal digits; high-order overflow shall be ignored.
        # Three octal digits shall be used, with leading zeros
        # as needed, if the next character of the string is also
        # a digit." (PDF reference 7.3.4.2, p 16)
        for _ in range(2):
            ntok = stream.read(1)
            if b"0" <= ntok <= b"7":
                tok += ntok
            else:
                stream.seek(-1, 1)  # ntok has to be analyzed
                break
        return b_(chr(int(tok, base=8)))
    if tok in b"\n\r":
        # This case is hit when a backslash followed by a line break occurs.
        # (An empty read at end of stream also lands here, because b"" is
        # contained in every bytes object; the caller then reports the
        # truncation on its next read.)
        # If it's a multi-char EOL, consume the second character:
        tok = stream.read(1)
        if tok not in b"\n\r":
            stream.seek(-1, 1)
        # Then don't add anything to the actual string, since this
        # line break was escaped:
        return b""
    # Unknown escape: warn and keep the character without the backslash.
    msg = f"Unexpected escaped string: {tok.decode('utf-8','ignore')}"
    logger_warning(msg, __name__)
    return tok


def read_string_from_stream(
    stream: StreamType,
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union["TextStringObject", "ByteStringObject"]:
    """
    Read a literal string, up to the parenthesis that balances the first one.

    The first byte of ``stream`` is consumed without being inspected
    (presumably the opening ``(`` -- confirm against the caller). Nested,
    unescaped parentheses are kept in the result; backslash escapes are
    resolved.

    Args:
        stream: Seekable stream to read from.
        forced_encoding: Passed through to :func:`create_string_object`.

    Returns:
        Whatever :func:`create_string_object` builds from the collected bytes.

    Raises:
        PdfStreamError: If the stream ends before the string is closed.
    """
    stream.read(1)
    parens = 1
    txt = []
    while True:
        tok = stream.read(1)
        if not tok:
            raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
        if tok == b"(":
            parens += 1
        elif tok == b")":
            parens -= 1
            if parens == 0:
                break
        elif tok == b"\\":
            tok = _read_escape_sequence(stream)
        txt.append(tok)
    return create_string_object(b"".join(txt), forced_encoding)


def create_string_object(
    string: Union[str, bytes],
    forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
) -> Union[TextStringObject, ByteStringObject]:
    """
    Create a ByteStringObject or a TextStringObject from a string to represent the string.

    Args:
        string: The data being used
        forced_encoding: How to interpret ``bytes`` input:

            * ``None``: a UTF-16 byte order mark selects UTF-16; otherwise
              PDFDocEncoding is tried, falling back to a ``ByteStringObject``
              when the data cannot be decoded.
            * a ``list`` or ``dict``: per-byte lookup table; bytes missing
              from the table are decoded with ``charmap``.
            * ``"bytes"``: always return a ``ByteStringObject``.
            * any other ``str``: codec name handed to ``bytes.decode``.

    Returns:
        A TextStringObject when the input is (or can be decoded to) text,
        otherwise a ByteStringObject.

    Raises:
        TypeError: If string is not of type str or bytes.
    """
    if isinstance(string, str):
        return TextStringObject(string)
    elif isinstance(string, bytes):
        if isinstance(forced_encoding, (list, dict)):
            out = ""
            for x in string:
                try:
                    out += forced_encoding[x]
                except Exception:
                    # Deliberate best effort: any failure of the table lookup
                    # falls back to a one-to-one byte -> code point mapping.
                    out += bytes((x,)).decode("charmap")
            return TextStringObject(out)
        elif isinstance(forced_encoding, str):
            if forced_encoding == "bytes":
                return ByteStringObject(string)
            return TextStringObject(string.decode(forced_encoding))
        else:
            try:
                if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
                    retval = TextStringObject(string.decode("utf-16"))
                    retval.autodetect_utf16 = True
                    retval.utf16_bom = string[:2]
                    return retval
                else:
                    # This is probably a big performance hit here, but we need
                    # to convert string objects into the text/unicode-aware
                    # version if possible... and the only way to check if that's
                    # possible is to try.
                    # Some strings are strings, some are just byte arrays.
                    retval = TextStringObject(decode_pdfdocencoding(string))
                    retval.autodetect_pdfdocencoding = True
                    return retval
            except UnicodeDecodeError:
                return ByteStringObject(string)
    else:
        # NOTE(review): the wording predates Python 3 ("unicode"); the accepted
        # types are str and bytes. Text kept as-is in case callers match on it.
        raise TypeError("create_string_object should have str or unicode arg")


def decode_pdfdocencoding(byte_array: bytes) -> str:
    """
    Decode ``byte_array`` through the ``_pdfdoc_encoding`` table.

    Args:
        byte_array: Bytes to translate, one table lookup per byte.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If a byte maps to ``"\\u0000"`` in the table. The
            exception carries the input and the position of that byte.
    """
    chars: List[str] = []
    for index, code in enumerate(byte_array):
        char = _pdfdoc_encoding[code]
        if char == "\u0000":
            # The object argument must be the data being decoded; passing
            # bytearray(code) would allocate ``code`` zero bytes instead.
            raise UnicodeDecodeError(
                "pdfdocencoding",
                bytes(byte_array),
                index,
                index + 1,
                "does not exist in translation table",
            )
        chars.append(char)
    return "".join(chars)


# --- pypdf/generic/_viewerpref.py (new file in this diff) ---
# Copyright (c) 2023, Pubpub-ZZ
#
# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
from typing import (
    Any,
    List,
    Optional,
)

from ._base import BooleanObject, NameObject, NumberObject
from ._data_structures import ArrayObject, DictionaryObject

f_obj = BooleanObject(False)


class ViewerPreferences(DictionaryObject):
    """
    Dictionary of viewer preferences with one Python property per entry.

    The properties (``hide_toolbar``, ``direction``, ``num_copies``, ...) are
    attached to the class inside ``__new__``; each one reads and writes the
    dictionary entry named in its definition there.
    """

    def _get_bool(self, key: str, deft: Optional[BooleanObject]) -> BooleanObject:
        """Return the entry stored under ``key``, or ``deft`` when absent."""
        return self.get(key, deft)

    def _set_bool(self, key: str, v: bool) -> None:
        """Store a boolean under ``key``; only the literal ``True`` stores true."""
        self[NameObject(key)] = BooleanObject(v is True)

    def _get_name(self, key: str, deft: Optional[NameObject]) -> Optional[NameObject]:
        """Return the entry stored under ``key``, or ``deft`` when absent."""
        return self.get(key, deft)

    def _set_name(self, key: str, lst: List[str], v: NameObject) -> None:
        """
        Store the name ``v`` under ``key`` after validating it.

        Args:
            key: Dictionary key to write.
            lst: Acceptable values; an empty list accepts any name.
            v: Name to store; must start with ``/``.

        Raises:
            ValueError: If ``v`` does not start with ``/`` (an empty string
                included) or is not one of the acceptable values.
        """
        # startswith() rather than v[0] so an empty name is reported as a
        # ValueError instead of escaping as an IndexError.
        if not v.startswith("/"):
            raise ValueError(f"{v} is not starting with '/'")
        if lst and v not in lst:
            raise ValueError(f"{v} is not part of acceptable values")
        self[NameObject(key)] = NameObject(v)

    def _get_arr(self, key: str, deft: Optional[List[Any]]) -> Optional[ArrayObject]:
        """
        Return the entry stored under ``key``; when absent, return ``None``
        if ``deft`` is ``None``, else a new ``ArrayObject`` built from ``deft``.
        """
        return self.get(key, None if deft is None else ArrayObject(deft))

    def _set_arr(self, key: str, v: Optional[ArrayObject]) -> None:
        """
        Store ``v`` under ``key``; ``None`` removes the entry if it exists.

        Raises:
            ValueError: If ``v`` is neither ``None`` nor an ``ArrayObject``.
        """
        if v is None:
            try:
                del self[NameObject(key)]
            except KeyError:
                # nothing stored under this key: removal is a no-op
                pass
            return
        if not isinstance(v, ArrayObject):
            raise ValueError("ArrayObject is expected")
        self[NameObject(key)] = v

    def _get_int(self, key: str, deft: Optional[NumberObject]) -> NumberObject:
        """Return the entry stored under ``key``, or ``deft`` when absent."""
        return self.get(key, deft)

    def _set_int(self, key: str, v: int) -> None:
        """Store ``v`` under ``key`` as a ``NumberObject``."""
        self[NameObject(key)] = NumberObject(v)

    @property
    def PRINT_SCALING(self) -> NameObject:
        """The name ``/PrintScaling``."""
        return NameObject("/PrintScaling")

    def __new__(cls: Any, value: Any = None) -> "ViewerPreferences":
        """
        Attach the preference properties to ``cls`` and allocate the instance.

        ``value`` is accepted but not used here. The properties are
        (re)assigned on the class each time an instance is created.
        """

        def _add_prop_bool(key: str, deft: Optional[BooleanObject]) -> property:
            return property(
                lambda self: self._get_bool(key, deft),
                lambda self, v: self._set_bool(key, v),
                None,
                f"""
            Returns/Modify the status of {key}, Returns {deft} if not defined
            """,
            )

        def _add_prop_name(
            key: str, lst: List[str], deft: Optional[NameObject]
        ) -> property:
            return property(
                lambda self: self._get_name(key, deft),
                lambda self, v: self._set_name(key, lst, v),
                None,
                f"""
            Returns/Modify the status of {key}, Returns {deft} if not defined.
            Acceptable values: {lst}
            """,
            )

        def _add_prop_arr(key: str, deft: Optional[ArrayObject]) -> property:
            return property(
                lambda self: self._get_arr(key, deft),
                lambda self, v: self._set_arr(key, v),
                None,
                f"""
            Returns/Modify the status of {key}, Returns {deft} if not defined
            """,
            )

        def _add_prop_int(key: str, deft: Optional[int]) -> property:
            return property(
                lambda self: self._get_int(key, deft),
                lambda self, v: self._set_int(key, v),
                None,
                f"""
            Returns/Modify the status of {key}, Returns {deft} if not defined
            """,
            )

        cls.hide_toolbar = _add_prop_bool("/HideToolbar", f_obj)
        cls.hide_menubar = _add_prop_bool("/HideMenubar", f_obj)
        cls.hide_windowui = _add_prop_bool("/HideWindowUI", f_obj)
        cls.fit_window = _add_prop_bool("/FitWindow", f_obj)
        cls.center_window = _add_prop_bool("/CenterWindow", f_obj)
        cls.display_doctitle = _add_prop_bool("/DisplayDocTitle", f_obj)

        cls.non_fullscreen_pagemode = _add_prop_name(
            "/NonFullScreenPageMode",
            ["/UseNone", "/UseOutlines", "/UseThumbs", "/UseOC"],
            NameObject("/UseNone"),
        )
        cls.direction = _add_prop_name(
            "/Direction", ["/L2R", "/R2L"], NameObject("/L2R")
        )
        cls.view_area = _add_prop_name("/ViewArea", [], None)
        cls.view_clip = _add_prop_name("/ViewClip", [], None)
        cls.print_area = _add_prop_name("/PrintArea", [], None)
        cls.print_clip = _add_prop_name("/PrintClip", [], None)
        cls.print_scaling = _add_prop_name("/PrintScaling", [], None)
        cls.duplex = _add_prop_name(
            "/Duplex", ["/Simplex", "/DuplexFlipShortEdge", "/DuplexFlipLongEdge"], None
        )
        cls.pick_tray_by_pdfsize = _add_prop_bool("/PickTrayByPDFSize", None)
        cls.print_pagerange = _add_prop_arr("/PrintPageRange", None)
        cls.num_copies = _add_prop_int("/NumCopies", None)

        cls.enforce = _add_prop_arr("/Enforce", ArrayObject())

        return DictionaryObject.__new__(cls)

    def __init__(self, obj: Optional[DictionaryObject] = None) -> None:
        """
        Initialise the preferences, optionally copying an existing dictionary.

        Args:
            obj: When given, its items are copied into ``self`` and its
                ``indirect_reference`` attribute, if it has one, is carried
                over as well.
        """
        super().__init__(self)
        if obj is None:
            return
        self.update(obj.items())
        try:
            self.indirect_reference = obj.indirect_reference  # type: ignore
        except AttributeError:
            # obj carries no indirect reference: nothing to copy
            pass