two version of R2R are here (HEAD, master)

author: S. Solomon Darnell 2025-03-28 21:52:21 -0500
committer: S. Solomon Darnell 2025-03-28 21:52:21 -0500
commit: 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree: ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_page.py
parent: cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download: gn-ai-master.tar.gz
1 files changed, 2458 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_page.py b/.venv/lib/python3.12/site-packages/pypdf/_page.py
new file mode 100644
index 00000000..63038d9d
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_page.py
@@ -0,0 +1,2458 @@
+# Copyright (c) 2006, Mathieu Fenniak
+# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import sys
+from decimal import Decimal
+from pathlib import Path
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+    cast,
+    overload,
+)
+
+from ._cmap import build_char_map, unknown_char_map
+from ._protocols import PdfCommonDocProtocol
+from ._text_extraction import (
+    OrientationNotFoundError,
+    _layout_mode,
+    crlf_space_check,
+    handle_tj,
+    mult,
+)
+from ._utils import (
+    CompressedTransformationMatrix,
+    File,
+    ImageFile,
+    TransformationMatrixType,
+    logger_warning,
+    matrix_multiply,
+)
+from .constants import AnnotationDictionaryAttributes as ADA
+from .constants import ImageAttributes as IA
+from .constants import PageAttributes as PG
+from .constants import Resources as RES
+from .errors import PageSizeNotDefinedError, PdfReadError
+from .filters import _xobj_to_image
+from .generic import (
+    ArrayObject,
+    ContentStream,
+    DictionaryObject,
+    EncodedStreamObject,
+    FloatObject,
+    IndirectObject,
+    NameObject,
+    NullObject,
+    NumberObject,
+    PdfObject,
+    RectangleObject,
+    StreamObject,
+)
+
+if sys.version_info >= (3, 8):
+    from typing import Literal
+else:
+    from typing_extensions import Literal
+
+
+MERGE_CROP_BOX = "cropbox"  # pypdf<=3.4.0 used 'trimbox'
+
+
+def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
+    """
+    Fetch the box stored under ``name``, falling back to other entries.
+
+    A value that already is a ``RectangleObject`` is returned untouched.
+    Otherwise, when ``name`` has no value, the first non-``None`` entry
+    among ``defaults`` is used; an indirect reference is resolved through
+    ``self.pdf`` and the resulting rectangle is stored back under ``name``.
+
+    Args:
+        self: Object holding the boxes (needs ``get`` and item assignment).
+        name: Key of the requested box.
+        defaults: Keys tried in order when ``name`` has no value.
+
+    Returns:
+        The rectangle now stored under ``name``.
+    """
+    box: Union[None, RectangleObject, IndirectObject] = self.get(name)
+    if isinstance(box, RectangleObject):
+        return box
+    if box is None:
+        # First fallback that yields a value; stays None when none does.
+        box = next(
+            (found for found in map(self.get, defaults) if found is not None),
+            None,
+        )
+    if isinstance(box, IndirectObject):
+        box = self.pdf.get_object(box)
+    rectangle = RectangleObject(box)  # type: ignore
+    _set_rectangle(self, name, rectangle)
+    return rectangle
+
+
+def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
+    """Store ``value`` in ``self`` under ``name`` converted to a ``NameObject``."""
+    self[NameObject(name)] = value
+
+
+def _delete_rectangle(self: Any, name: str) -> None:
+    """Remove the entry stored under ``name`` from ``self``."""
+    self.__delitem__(name)
+
+
+def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property:
+    """
+    Build a property exposing the box stored under ``name``.
+
+    Args:
+        name: Key of the box the property reads, writes and deletes.
+        fallback: Keys handed to the getter for use when ``name`` has no value.
+
+    Returns:
+        A property delegating to the module's rectangle helpers.
+    """
+
+    # The inner functions deliberately carry no docstring so that the
+    # resulting property keeps an empty ``__doc__``.
+    def read_box(self: Any) -> Any:
+        return _get_rectangle(self, name, fallback)
+
+    def write_box(self: Any, value: Any) -> None:
+        _set_rectangle(self, name, value)
+
+    def drop_box(self: Any) -> None:
+        _delete_rectangle(self, name)
+
+    return property(fget=read_box, fset=write_box, fdel=drop_box)
+
+
+class Transformation:
+    """
+    A 2D affine transformation.
+
+    A change from one coordinate system to another is described by a
+    3-by-3 transformation matrix of the following form::
+
+        a b 0
+        c d 0
+        e f 1
+
+    Only six of its elements can vary, which is why PDF usually writes it
+    as the six-element array [ a b c d e f ].
+
+    Coordinates are transformed by matrix multiplication::
+
+                                 a b 0
+     [ x′ y′ 1 ] = [ x y 1 ] ×   c d 0
+                                 e f 1
+
+
+    Example:
+        >>> from pypdf import Transformation
+        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
+        >>> page.add_transformation(op)
+    """
+
+    # 9.5.4 Coordinate Systems for 3D
+    # 4.2.2 Common Transformations
+    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)):
+        self.ctm = ctm
+
+    @property
+    def matrix(self) -> TransformationMatrixType:
+        """
+        The transformation expanded to a tuple of tuples of the form:
+
+        ((a, b, 0), (c, d, 0), (e, f, 1))
+        """
+        a, b, c, d, e, f = (self.ctm[index] for index in range(6))
+        return ((a, b, 0), (c, d, 0), (e, f, 1))
+
+    @staticmethod
+    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
+        """
+        Reduce a 3-by-3 transformation matrix to its six variable elements.
+
+        Args:
+            matrix: The transformation matrix as a tuple of tuples.
+
+        Returns:
+            The tuple (a, b, c, d, e, f), i.e. the first two columns read
+            row by row.
+        """
+        return tuple(  # type: ignore
+            matrix[row][column] for row in range(3) for column in range(2)
+        )
+
+    def transform(self, m: "Transformation") -> "Transformation":
+        """
+        Combine this transformation with another one.
+
+        Args:
+            m: The Transformation applied after this one.
+
+        Returns:
+            A new ``Transformation`` instance
+
+        Example:
+            >>> from pypdf import Transformation
+            >>> op = Transformation((1, 0, 0, -1, 0, height)) # vertical mirror
+            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, iwidth, 0))) # horizontal mirror
+            >>> page.add_transformation(op)
+        """
+        product = matrix_multiply(self.matrix, m.matrix)
+        return Transformation(Transformation.compress(product))
+
+    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
+        """
+        Shift the contents of a page.
+
+        Args:
+            tx: The translation along the x-axis.
+            ty: The translation along the y-axis.
+
+        Returns:
+            A new ``Transformation`` instance
+        """
+        base = self.ctm
+        shifted = (base[0], base[1], base[2], base[3], base[4] + tx, base[5] + ty)
+        return Transformation(ctm=shifted)
+
+    def scale(
+        self, sx: Optional[float] = None, sy: Optional[float] = None
+    ) -> "Transformation":
+        """
+        Scale the contents of a page towards the origin of the coordinate system.
+
+        The origin typically is the lower-left corner of the page; translate
+        the contents / the page boxes to scale around another point.
+
+        Args:
+            sx: The scale factor along the x-axis; defaults to ``sy``.
+            sy: The scale factor along the y-axis; defaults to ``sx``.
+
+        Returns:
+            A new Transformation instance with the scaled matrix.
+
+        Raises:
+            ValueError: If neither ``sx`` nor ``sy`` is given.
+        """
+        if sx is None:
+            if sy is None:
+                raise ValueError("Either sx or sy must be specified")
+            sx = sy
+        elif sy is None:
+            sy = sx
+        # Both factors are set at this point; the assert only narrows types.
+        assert sx is not None and sy is not None
+        scaling: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
+        return Transformation(
+            Transformation.compress(matrix_multiply(self.matrix, scaling))
+        )
+
+    def rotate(self, rotation: float) -> "Transformation":
+        """
+        Rotate the contents of a page.
+
+        Args:
+            rotation: The angle of rotation in degrees.
+
+        Returns:
+            A new ``Transformation`` instance with the rotated matrix.
+        """
+        theta = math.radians(rotation)
+        cos_theta = math.cos(theta)
+        sin_theta = math.sin(theta)
+        turning: TransformationMatrixType = (
+            (cos_theta, sin_theta, 0),
+            (-sin_theta, cos_theta, 0),
+            (0, 0, 1),
+        )
+        return Transformation(
+            Transformation.compress(matrix_multiply(self.matrix, turning))
+        )
+
+    def __repr__(self) -> str:
+        return f"Transformation(ctm={self.ctm})"
+
+    @overload
+    def apply_on(self, pt: List[float], as_object: bool = False) -> List[float]:
+        ...
+
+    @overload
+    def apply_on(
+        self, pt: Tuple[float, float], as_object: bool = False
+    ) -> Tuple[float, float]:
+        ...
+
+    def apply_on(
+        self,
+        pt: Union[Tuple[float, float], List[float]],
+        as_object: bool = False,
+    ) -> Union[Tuple[float, float], List[float]]:
+        """
+        Apply the transformation matrix on the given point.
+
+        Args:
+            pt: A tuple or list representing the point in the form (x, y)
+            as_object: When true the coordinates are returned as
+                ``FloatObject`` instead of ``float``.
+
+        Returns:
+            The transformed point (x', y'), as a list when ``pt`` is a list
+            and as a tuple otherwise.
+        """
+        convert = FloatObject if as_object else float
+        ctm = self.ctm
+        x, y = float(pt[0]), float(pt[1])
+        moved = (
+            convert(x * ctm[0] + y * ctm[2] + ctm[4]),
+            convert(x * ctm[1] + y * ctm[3] + ctm[5]),
+        )
+        if isinstance(pt, list):
+            return list(moved)
+        return moved
+
+
+class PageObject(DictionaryObject):
+    """
+    PageObject represents a single page within a PDF file.
+
+    Typically these objects will be created by accessing the
+    :attr:`pages<pypdf.PdfReader.pages>` property of the
+    :class:`PdfReader<pypdf.PdfReader>` class, but it is
+    also possible to create an empty page with the
+    :meth:`create_blank_page()<pypdf._page.PageObject.create_blank_page>` static method.
+
+    Args:
+        pdf: PDF file the page belongs to.
+        indirect_reference: Stores the original indirect reference to
+            this object in its source PDF
+    """
+
+    original_page: "PageObject"  # very local use in writer when appending
+
+    def __init__(
+        self,
+        pdf: Optional[PdfCommonDocProtocol] = None,
+        indirect_reference: Optional[IndirectObject] = None,
+    ) -> None:
+        """
+        Create an empty page dictionary.
+
+        Args:
+            pdf: PDF file the page belongs to.
+            indirect_reference: Original indirect reference to this object
+                in its source PDF.
+        """
+        DictionaryObject.__init__(self)
+        # ``None`` means the inline images have not been collected yet.
+        self.inline_images: Optional[Dict[str, ImageFile]] = None
+        self.indirect_reference = indirect_reference
+        self.pdf = pdf
+
+    def hash_value_data(self) -> bytes:
+        """Return the parent's hash data with this object's ``id()`` appended."""
+        return super().hash_value_data() + b"%d" % id(self)
+
+    @property
+    def user_unit(self) -> float:
+        """
+        Size of a user space unit, as a read-only positive number.
+
+        The value is expressed in multiples of 1/72 inch: 1 means one user
+        space unit is 1/72 inch, 3 means it is 3/72 inch. 1 is returned
+        when the page does not define the entry.
+        """
+        unit = self.get(PG.USER_UNIT, 1)
+        return unit
+
+    @staticmethod
+    def create_blank_page(
+        pdf: Optional[PdfCommonDocProtocol] = None,
+        width: Union[float, Decimal, None] = None,
+        height: Union[float, Decimal, None] = None,
+    ) -> "PageObject":
+        """
+        Build a new, empty page.
+
+        When ``width`` or ``height`` is ``None``, both dimensions are taken
+        from the media box of the last page of *pdf*.
+
+        Args:
+            pdf: PDF file the page is within.
+            width: The width of the new page expressed in default user
+                space units.
+            height: The height of the new page expressed in default user
+                space units.
+
+        Returns:
+            The new blank page
+
+        Raises:
+            PageSizeNotDefinedError: if a dimension is missing and ``pdf``
+                is ``None`` or contains no page
+        """
+        blank = PageObject(pdf)
+
+        # Minimal page dictionary (cf PDF Reference  7.7.3.3)
+        blank[NameObject(PG.TYPE)] = NameObject("/Page")
+        blank[NameObject(PG.PARENT)] = NullObject()
+        blank[NameObject(PG.RESOURCES)] = DictionaryObject()
+
+        if width is None or height is None:
+            if pdf is None or len(pdf.pages) <= 0:
+                raise PageSizeNotDefinedError
+            last = pdf.pages[len(pdf.pages) - 1]
+            width, height = last.mediabox.width, last.mediabox.height
+
+        media_box = RectangleObject((0, 0, width, height))  # type: ignore
+        blank[NameObject(PG.MEDIABOX)] = media_box
+        return blank
+
+    @property
+    def _old_images(self) -> List[File]:  # deprecated
+        """
+        List the image XObjects found in the page resources.
+
+        This requires pillow. You can install it via 'pip install pypdf[image]'.
+
+        Inline images are NOT included for the moment. They will be added
+        in future.
+        """
+        found: List[File] = []
+        resources = self[PG.RESOURCES]
+        if RES.XOBJECT not in resources:  # type: ignore
+            return found
+
+        x_object = resources[RES.XOBJECT].get_object()  # type: ignore
+        for key in x_object:
+            candidate = x_object[key]
+            if candidate[IA.SUBTYPE] != "/Image":
+                continue
+            extension, byte_stream, img = _xobj_to_image(candidate)
+            if extension is None:
+                continue
+            # The leading "/" of the resource name is dropped for the file name.
+            entry = File(name=f"{key[1:]}{extension}", data=byte_stream)
+            entry.image = img
+            entry.indirect_reference = candidate.indirect_reference
+            found.append(entry)
+        return found
+
+    def _get_ids_image(
+        self,
+        obj: Optional[DictionaryObject] = None,
+        ancest: Optional[List[str]] = None,
+        call_stack: Optional[List[Any]] = None,
+    ) -> List[Union[str, List[str]]]:
+        """
+        Collect the identifiers of the images reachable from ``obj``.
+
+        Args:
+            obj: Object whose ``/Resources`` are scanned; the page itself
+                when ``None``.
+            ancest: Names of the enclosing XObjects leading to ``obj``.
+            call_stack: Indirect references already visited, used to stop
+                the recursion on objects seen before.
+
+        Returns:
+            One entry per image XObject: its name when ``ancest`` is empty,
+            otherwise the list ``ancest + [name]``. The keys of the inline
+            images are appended as well.
+        """
+        if call_stack is None:
+            call_stack = []
+        # Evaluated before ``obj`` is defaulted to the page, so the
+        # top-level call records ``None`` here.
+        _i = getattr(obj, "indirect_reference", None)
+        if _i in call_stack:
+            return []
+        else:
+            call_stack.append(_i)
+        if self.inline_images is None:
+            self.inline_images = self._get_inline_images()
+        if obj is None:
+            obj = self
+        if ancest is None:
+            ancest = []
+        lst: List[Union[str, List[str]]] = []
+        # Without XObject resources only the inline images can be listed.
+        if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
+            DictionaryObject, obj[PG.RESOURCES]
+        ):
+            return [] if self.inline_images is None else list(self.inline_images.keys())
+
+        x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object()  # type: ignore
+        for o in x_object:
+            if not isinstance(x_object[o], StreamObject):
+                continue
+            if x_object[o][IA.SUBTYPE] == "/Image":
+                lst.append(o if len(ancest) == 0 else ancest + [o])
+            else:  # is a form with possible images inside
+                lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack))
+        assert self.inline_images is not None
+        lst.extend(list(self.inline_images.keys()))
+        return lst
+
+    def _get_image(
+        self,
+        id: Union[str, List[str], Tuple[str]],
+        obj: Optional[DictionaryObject] = None,
+    ) -> ImageFile:
+        """
+        Return the image designated by ``id``.
+
+        Args:
+            id: Name of the image, or a list/tuple of names giving the path
+                through nested XObjects. Names of the form ``~...~``
+                designate inline images.
+            obj: Object whose ``/Resources`` are searched; the page itself
+                when ``None``.
+
+        Returns:
+            The matching ``ImageFile``.
+
+        Raises:
+            KeyError: If ``obj`` has no XObject resources and ``id`` does
+                not look like an inline image name, or if the name is not
+                found.
+        """
+        if obj is None:
+            obj = cast(DictionaryObject, self)
+        if isinstance(id, tuple):
+            id = list(id)
+        # A path made of a single name is handled like a plain name.
+        if isinstance(id, List) and len(id) == 1:
+            id = id[0]
+        try:
+            xobjs = cast(
+                DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
+            )
+        except KeyError:
+            # Missing XObject resources are tolerated only for inline image
+            # names; ``xobjs`` stays unbound in that case and is not used.
+            if not (id[0] == "~" and id[-1] == "~"):
+                raise
+        if isinstance(id, str):
+            if id[0] == "~" and id[-1] == "~":
+                if self.inline_images is None:
+                    self.inline_images = self._get_inline_images()
+                if self.inline_images is None:  # pragma: no cover
+                    raise KeyError("no inline image can be found")
+                return self.inline_images[id]
+
+            imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
+            extension, byte_stream = imgd[:2]
+            f = ImageFile(
+                name=f"{id[1:]}{extension}",
+                data=byte_stream,
+                image=imgd[2],
+                indirect_reference=xobjs[id].indirect_reference,
+            )
+            return f
+        else:  # in a sub object
+            # Descend into the first XObject of the path and look up the rest.
+            ids = id[1:]
+            return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))
+
+    @property
+    def images(self) -> List[ImageFile]:
+        """
+        Read-only property emulating a list of the images on the page.
+
+        An image can be selected with:
+        - A string (for the top object)
+        - A tuple (for images within XObject forms)
+        - An integer
+
+        Examples:
+            reader.pages[0].images[0]        # return first image
+            reader.pages[0].images['/I0']    # return image '/I0'
+            # return image '/Image1' within '/TP1' Xobject/Form:
+            reader.pages[0].images['/TP1','/Image1']
+            for img in reader.pages[0].images: # loop within all objects
+
+        images.keys() and images.items() can be used.
+
+        The ImageFile has the following properties:
+
+            `.name` : name of the object
+            `.data` : bytes of the object
+            `.image`  : PIL Image Object
+            `.indirect_reference` : object reference
+
+        and the following methods:
+            `.replace(new_image: PIL.Image.Image, **kwargs)` :
+                replace the image in the pdf with the new image
+                applying the saving parameters indicated (such as quality)
+
+        Example usage:
+
+            reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)
+
+        Inline images are extracted and named ~0~, ~1~, ..., with the
+        indirect_reference set to None.
+        """
+        list_ids, fetch_image = self._get_ids_image, self._get_image
+        return _VirtualListImages(list_ids, fetch_image)  # type: ignore
+
+    def _translate_value_inlineimage(self, k: str, v: PdfObject) -> PdfObject:
+        """
+        Translate a value used in an inline image to its full form.
+
+        Args:
+            k: Key the value belongs to; only used in the error message.
+            v: Value to translate.
+
+        Returns:
+            The full name for a known abbreviated or full name, the entry of
+            the page's ``/ColorSpace`` resources for any other name, and
+            ``v`` itself for everything else.
+
+        Raises:
+            PdfReadError: If a custom name cannot be found in the resources.
+        """
+        abbreviations = {
+            "/G": "/DeviceGray",
+            "/RGB": "/DeviceRGB",
+            "/CMYK": "/DeviceCMYK",
+            "/I": "/Indexed",
+            "/AHx": "/ASCIIHexDecode",
+            "/A85": "/ASCII85Decode",
+            "/LZW": "/LZWDecode",
+            "/Fl": "/FlateDecode",
+            "/RL": "/RunLengthDecode",
+            "/CCF": "/CCITTFaxDecode",
+            "/DCT": "/DCTDecode",
+        }
+        # Full names are accepted as well and map onto themselves.
+        known_names = {
+            **abbreviations,
+            **{full: full for full in abbreviations.values()},
+        }
+        try:
+            full_name = known_names[cast(str, v)]
+        except (TypeError, KeyError):
+            if not isinstance(v, NameObject):
+                return v
+            # It is a custom name, thus we have to look in resources.
+            # The only applicable case is for ColorSpace.
+            try:
+                color_spaces = cast(DictionaryObject, self["/Resources"])[
+                    "/ColorSpace"
+                ]
+                return cast(DictionaryObject, color_spaces)[v]
+            except KeyError:  # for the resources and for v
+                raise PdfReadError(f"Cannot find resource entry {v} for {k}")
+        return NameObject(full_name)
+
+    def _get_inline_images(self) -> Dict[str, ImageFile]:
+        """
+        Extract the inline images found in the page content.
+
+        Returns:
+            The images keyed ``~0~``, ``~1~``, ... in the order their
+            ``INLINE IMAGE`` operations appear in the content; an empty
+            dictionary when the page has no content.
+
+        Raises:
+            PdfReadError: If a bare ``BI``, ``ID`` or ``EI`` operator is met.
+            KeyError: If an image uses a settings key that is not known.
+        """
+        content = self.get_contents()
+        if content is None:
+            return {}
+        imgs_data = []
+        for param, ope in content.operations:
+            if ope == b"INLINE IMAGE":
+                imgs_data.append(
+                    {"settings": param["settings"], "__streamdata__": param["data"]}
+                )
+            elif ope in (b"BI", b"EI", b"ID"):  # pragma: no cover
+                raise PdfReadError(
+                    f"{ope!r} operator met whereas not expected, "
+                    "please share usecase with pypdf dev team"
+                )
+
+        # Abbreviated and full setting keys, all mapped to the full key.
+        full_keys = {
+            "/BPC": "/BitsPerComponent",
+            "/CS": "/ColorSpace",
+            "/D": "/Decode",
+            "/DP": "/DecodeParms",
+            "/F": "/Filter",
+            "/H": "/Height",
+            "/W": "/Width",
+            "/I": "/Interpolate",
+            "/Intent": "/Intent",
+            "/IM": "/ImageMask",
+            "/BitsPerComponent": "/BitsPerComponent",
+            "/ColorSpace": "/ColorSpace",
+            "/Decode": "/Decode",
+            "/DecodeParms": "/DecodeParms",
+            "/Filter": "/Filter",
+            "/Height": "/Height",
+            "/Width": "/Width",
+            "/Interpolate": "/Interpolate",
+            "/ImageMask": "/ImageMask",
+        }
+        files = {}
+        for num, ii in enumerate(imgs_data):
+            init = {
+                "__streamdata__": ii["__streamdata__"],
+                "/Length": len(ii["__streamdata__"]),
+            }
+            for k, v in ii["settings"].items():
+                if k in {"/Length", "/L"}:  # no length is expected
+                    continue
+                if isinstance(v, list):
+                    v = ArrayObject(
+                        [self._translate_value_inlineimage(k, x) for x in v]
+                    )
+                else:
+                    v = self._translate_value_inlineimage(k, v)
+                k = NameObject(full_keys[k])
+                # The first occurrence of a key wins.
+                if k not in init:
+                    init[k] = v
+            ii["object"] = EncodedStreamObject.initialize_from_dictionary(init)
+            extension, byte_stream, img = _xobj_to_image(ii["object"])
+            files[f"~{num}~"] = ImageFile(
+                name=f"~{num}~{extension}",
+                data=byte_stream,
+                image=img,
+                indirect_reference=None,
+            )
+        return files
+
+    @property
+    def rotation(self) -> int:
+        """
+        The visual rotation of the page.
+
+        Valid values are the multiples of 90 degrees: 0, 90, 180, or 270.
+        Setting this property does not affect ``/Contents``; the assigned
+        value is rounded to the nearest multiple of 90 and reduced modulo 360.
+        """
+        stored = self.get(PG.ROTATE, 0)
+        if isinstance(stored, int):
+            return stored
+        return stored.get_object()
+
+    @rotation.setter
+    def rotation(self, r: float) -> None:
+        quarter_turns = (int(r) + 45) // 90
+        self[NameObject(PG.ROTATE)] = NumberObject((quarter_turns * 90) % 360)
+
+    def transfer_rotation_to_content(self) -> None:
+        """
+        Move the rotation of the page into the content and the
+        media/crop/... boxes.
+
+        The page rotation is reset to 0 afterwards.
+
+        It is recommended to apply this function before page merging.
+        """
+        angle = -self.rotation  # the content has to turn the other way
+        self.rotation = 0
+        media = RectangleObject(self.mediabox)
+        center_x = float(media.left + media.width / 2)
+        center_y = float(media.bottom + media.height / 2)
+        trsf = Transformation().translate(-center_x, -center_y).rotate(angle)
+        corner_a = trsf.apply_on(media.lower_left)
+        corner_b = trsf.apply_on(media.upper_right)
+        # Shift back so that the smallest transformed corner coordinates are 0.
+        trsf = trsf.translate(
+            -min(corner_a[0], corner_b[0]), -min(corner_a[1], corner_b[1])
+        )
+        self.add_transformation(trsf, False)
+        for box_name in ("/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"):
+            if box_name not in self:
+                continue
+            box = RectangleObject(self[box_name])  # type: ignore
+            corner_a = trsf.apply_on(box.lower_left)
+            corner_b = trsf.apply_on(box.upper_right)
+            xs = (corner_a[0], corner_b[0])
+            ys = (corner_a[1], corner_b[1])
+            self[NameObject(box_name)] = RectangleObject(
+                (min(xs), min(ys), max(xs), max(ys))
+            )
+
+    def rotate(self, angle: int) -> "PageObject":
+        """
+        Rotate a page clockwise by increments of 90 degrees.
+
+        The angle is added to the current rotation of the page.
+
+        Args:
+            angle: Angle to rotate the page. Must be an increment of 90 deg.
+
+        Returns:
+            The rotated PageObject
+
+        Raises:
+            ValueError: If ``angle`` is not a multiple of 90.
+        """
+        remainder = angle % 90
+        if remainder != 0:
+            raise ValueError("Rotation angle must be a multiple of 90")
+        new_rotation = self.rotation + angle
+        self[NameObject(PG.ROTATE)] = NumberObject(new_rotation)
+        return self
+
+    def _merge_resources(
+        self,
+        res1: DictionaryObject,
+        res2: DictionaryObject,
+        resource: Any,
+        new_res1: bool = True,
+    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """
+        Merge the ``resource`` entries of ``res2`` into those of ``res1``.
+
+        Args:
+            res1: Resources receiving the entries.
+            res2: Resources providing the entries.
+            resource: Key of the resource category to merge.
+            new_res1: When true the merge is done on a new dictionary filled
+                from ``res1[resource]``; otherwise ``res1[resource]`` itself
+                is modified.
+
+        Returns:
+            A tuple (merged resources, renames) where renames maps each name
+            of ``res2`` that had to change to its new ``NameObject``.
+        """
+        try:
+            assert isinstance(self.indirect_reference, IndirectObject)
+            pdf = self.indirect_reference.pdf
+            is_pdf_writer = hasattr(
+                pdf, "_add_object"
+            )  # ---------- expect isinstance(pdf,PdfWriter)
+        except (AssertionError, AttributeError):
+            # The page is not attached through an indirect reference.
+            pdf = None
+            is_pdf_writer = False
+
+        # ``new_res`` and ``page2res`` used below are bound further down,
+        # before the first call of this helper.
+        def compute_unique_key(base_key: str) -> Tuple[str, bool]:
+            """
+            Find a key that either doesn't already exist or has the same value
+            (indicated by the bool)
+
+            Args:
+                base_key: An index is added to this to get the computed key
+
+            Returns:
+                A tuple (computed key, bool) where the boolean indicates
+                if there is a resource of the given computed_key with the same
+                value.
+            """
+            value = page2res.raw_get(base_key)
+            # TODO : possible improvement : in case of writer, the indirect_reference
+            # can not be found because translated : this may be improved
+
+            # try the current key first (e.g. "foo"), but otherwise iterate
+            # through "foo-0", "foo-1", etc. new_res can contain only finitely
+            # many keys, thus this'll eventually end, even if it's been crafted
+            # to be maximally annoying.
+            computed_key = base_key
+            idx = 0
+            while computed_key in new_res:
+                if new_res.raw_get(computed_key) == value:
+                    # there's already a resource of this name, with the exact
+                    # same value
+                    return computed_key, True
+                computed_key = f"{base_key}-{idx}"
+                idx += 1
+            return computed_key, False
+
+        if new_res1:
+            new_res = DictionaryObject()
+            new_res.update(res1.get(resource, DictionaryObject()).get_object())
+        else:
+            new_res = cast(DictionaryObject, res1[resource])
+        page2res = cast(
+            DictionaryObject, res2.get(resource, DictionaryObject()).get_object()
+        )
+        rename_res = {}
+        for key in page2res:
+            unique_key, same_value = compute_unique_key(key)
+            newname = NameObject(unique_key)
+            if key != unique_key:
+                # we have to use a different name for this
+                rename_res[key] = newname
+
+            if not same_value:
+                if is_pdf_writer:
+                    new_res[newname] = page2res.raw_get(key).clone(pdf)
+                    # Prefer the indirect reference of the clone when it has one.
+                    try:
+                        new_res[newname] = new_res[newname].indirect_reference
+                    except AttributeError:
+                        pass
+                else:
+                    new_res[newname] = page2res.raw_get(key)
+            # Rebuild the dictionary so that its entries are ordered by key.
+            lst = sorted(new_res.items())
+            new_res.clear()
+            for el in lst:
+                new_res[el[0]] = el[1]
+        return new_res, rename_res
+
+    @staticmethod
+    def _content_stream_rename(
+        stream: ContentStream,
+        rename: Dict[Any, Any],
+        pdf: Optional[PdfCommonDocProtocol],
+    ) -> ContentStream:
+        """
+        Replace the names used as operands according to ``rename``.
+
+        Args:
+            stream: Content stream to process.
+            rename: Mapping of the names to replace to their new names.
+            pdf: Document handed over to the new ``ContentStream``.
+
+        Returns:
+            ``stream`` itself when ``rename`` is empty, otherwise a new
+            ``ContentStream`` built from ``stream`` with the names replaced.
+
+        Raises:
+            KeyError: If the operands of an operation are neither a list
+                nor a dict.
+        """
+        if not rename:
+            return stream
+        renamed = ContentStream(stream, pdf)
+        for operands, _operator in renamed.operations:
+            entries: Iterable[Tuple[Any, Any]]
+            if isinstance(operands, list):
+                entries = enumerate(operands)
+            elif isinstance(operands, dict):
+                entries = operands.items()
+            else:
+                raise KeyError(f"type of operands is {type(operands)}")
+            for position, operand in entries:
+                if isinstance(operand, NameObject):
+                    operands[position] = rename.get(operand, operand)
+        return renamed
+
+    @staticmethod
+    def _add_transformation_matrix(
+        contents: Any,
+        pdf: Optional[PdfCommonDocProtocol],
+        ctm: CompressedTransformationMatrix,
+    ) -> ContentStream:
+        """
+        Put a ``cm`` operation with the given matrix in front of the contents.
+
+        Args:
+            contents: Contents the new ``ContentStream`` is built from.
+            pdf: Document handed over to the new ``ContentStream``.
+            ctm: The six elements (a, b, c, d, e, f) of the matrix.
+
+        Returns:
+            The new ``ContentStream`` starting with the matrix operation.
+        """
+        a, b, c, d, e, f = ctm
+        stream = ContentStream(contents, pdf)
+        operations = stream.operations
+        matrix_operands = [
+            FloatObject(component) for component in (a, b, c, d, e, f)
+        ]
+        operations.insert(0, [matrix_operands, " cm"])
+        return stream
+
+    def _get_contents_as_bytes(self) -> Optional[bytes]:
+        """
+        Return the page contents as bytes.
+
+        Returns:
+            The data of the ``/Contents`` object as bytes (the parts of an
+            array are concatenated), or ``None`` if the entry doesn't exist
+            or resolves to a null object.
+
+        """
+        if PG.CONTENTS not in self:
+            return None
+        obj = self[PG.CONTENTS].get_object()
+        if isinstance(obj, NullObject):
+            # a null /Contents is handled like a missing one, the same way
+            # get_contents() does; there is no stream to read data from
+            return None
+        if isinstance(obj, list):
+            # array of streams: resolve each element and join their data
+            return b"".join(x.get_object().get_data() for x in obj)
+        return cast(bytes, cast(EncodedStreamObject, obj).get_data())
+
+    def get_contents(self) -> Optional[ContentStream]:
+        """
+        Access the page contents.
+
+        Returns:
+            A ``ContentStream`` built from the ``/Contents`` object, or
+            ``None`` if the entry does not exist or is a null object.
+            ``/Contents`` is optional, as described in §7.7.3.3 of the PDF Reference.
+        """
+        if PG.CONTENTS not in self:
+            return None
+        try:
+            owner = cast(IndirectObject, self.indirect_reference).pdf
+        except AttributeError:
+            # no usable indirect reference: build the stream without a document
+            owner = None
+        raw = self[PG.CONTENTS].get_object()
+        return None if isinstance(raw, NullObject) else ContentStream(raw, owner)
+
+    def replace_contents(
+        self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
+    ) -> None:
+        """
+        Replace the page contents with the new content and nullify old objects.
+
+        Args:
+            content: new content; if None delete the content field.
+        """
+        if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
+            # the page is not attached : the content is directly attached.
+            if content is None:
+                # None means "delete the content field" (see Args): remove
+                # the entry instead of storing None as the value of /Contents
+                if PG.CONTENTS in self:
+                    del self[PG.CONTENTS]
+            else:
+                self[NameObject(PG.CONTENTS)] = content
+            # the contents changed: forces recalculation of inline_images
+            self.inline_images = None
+            return
+        if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
+            # NOTE(review): this addresses ``self._objects`` (the page) whereas
+            # the code below uses ``self.indirect_reference.pdf._objects``; if
+            # the page has no ``_objects`` attribute the AttributeError handler
+            # turns this loop into a no-op - confirm which one is intended.
+            for o in self[PG.CONTENTS]:  # type: ignore[attr-defined]
+                try:
+                    self._objects[o.indirect_reference.idnum - 1] = NullObject()  # type: ignore
+                except AttributeError:
+                    pass
+
+        if isinstance(content, ArrayObject):
+            # register every element in the document and keep the value
+            # returned by _add_object in its place
+            for i, item in enumerate(content):
+                content[i] = self.indirect_reference.pdf._add_object(item)
+
+        if content is None:
+            if PG.CONTENTS not in self:
+                return
+            else:
+                assert self.indirect_reference is not None
+                assert self[PG.CONTENTS].indirect_reference is not None
+                # nullify the stored object, then drop the entry
+                self.indirect_reference.pdf._objects[
+                    self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
+                ] = NullObject()
+                del self[PG.CONTENTS]
+        elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
+            # no existing entry whose object slot could be reused
+            try:
+                self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
+                    content
+                )
+            except AttributeError:
+                # applies at least for page not in writer
+                # as a backup solution, we put content as an object although not in accordance with pdf ref
+                # this will be fixed with the _add_object
+                self[NameObject(PG.CONTENTS)] = content
+        else:
+            # reuse the reference of the existing contents for the new content
+            content.indirect_reference = self[
+                PG.CONTENTS
+            ].indirect_reference  # TODO: in a future may required generation management
+            try:
+                self.indirect_reference.pdf._objects[
+                    content.indirect_reference.idnum - 1  # type: ignore
+                ] = content
+            except AttributeError:
+                # applies at least for page not in writer
+                # as a backup solution, we put content as an object although not in accordance with pdf ref
+                # this will be fixed with the _add_object
+                self[NameObject(PG.CONTENTS)] = content
+        # forces recalculation of inline_images
+        self.inline_images = None
+
+    def merge_page(
+        self, page2: "PageObject", expand: bool = False, over: bool = True
+    ) -> None:
+        """
+        Merge the content streams of two pages into one.
+
+        The resource references (i.e. fonts) of both pages are kept and the
+        mediabox/cropbox/etc of this page are left untouched unless
+        ``expand`` is set. By default the content stream of ``page2`` is
+        added after the content stream of this page, so it is drawn after,
+        or "on top" of, this page.
+
+        Args:
+            page2: The page to be merged into this one. Should be
+                an instance of :class:`PageObject<PageObject>`.
+            over: set the page2 content over page1 if True (default) else under
+            expand: If True, the current page dimensions will be
+                expanded to accommodate the dimensions of the page to be merged.
+        """
+        # plain merge: no content transformation and no matrix
+        self._merge_page(page2, None, None, over, expand)
+
+    def _merge_page(
+        self,
+        page2: "PageObject",
+        page2transformation: Optional[Callable[[Any], ContentStream]] = None,
+        ctm: Optional[CompressedTransformationMatrix] = None,
+        over: bool = True,
+        expand: bool = False,
+    ) -> None:
+        """
+        Merge ``page2`` into this page.
+
+        When this page is referenced by a document exposing ``_add_object``
+        the work is delegated to ``_merge_page_writer``; errors raised by
+        that method are propagated to the caller. Otherwise new resources,
+        annotations and contents are built and stored on this page.
+
+        Args:
+            page2: the page to be merged into this one.
+            page2transformation: optional callable applied to the content
+                stream of ``page2`` (after the clipping operations have been
+                prepended); it has to return the stream to merge.
+            ctm: matrix handed over to ``_expand_mediabox`` when ``expand``
+                is True.
+            over: if True the content of ``page2`` is added after the
+                content of this page, else before it.
+            expand: if True ``_expand_mediabox`` is called for ``page2``.
+        """
+        # Only the detection is guarded: a failure happening inside
+        # _merge_page_writer must not be hidden and followed by a second
+        # merge attempt on a page that may already be partly modified.
+        try:
+            own_ref = self.indirect_reference
+            owned_by_writer = isinstance(own_ref, IndirectObject) and hasattr(
+                own_ref.pdf, "_add_object"
+            )  # ---------- to detect PdfWriter
+        except AttributeError:
+            owned_by_writer = False
+        if owned_by_writer:
+            return self._merge_page_writer(
+                page2, page2transformation, ctm, over, expand
+            )
+
+        # First we work on merging the resource dictionaries. This allows us
+        # to find out what symbols in the content streams we might need to
+        # rename.
+        new_resources = DictionaryObject()
+        rename: Dict[Any, Any] = {}
+        try:
+            original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
+        except KeyError:
+            original_resources = DictionaryObject()
+        try:
+            page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
+        except KeyError:
+            page2resources = DictionaryObject()
+        new_annots = ArrayObject()
+
+        # annotations of both pages are collected, this page first
+        for page in (self, page2):
+            if PG.ANNOTS in page:
+                annots = page[PG.ANNOTS]
+                if isinstance(annots, ArrayObject):
+                    new_annots.extend(annots)
+
+        for res in (
+            RES.EXT_G_STATE,
+            RES.FONT,
+            RES.XOBJECT,
+            RES.COLOR_SPACE,
+            RES.PATTERN,
+            RES.SHADING,
+            RES.PROPERTIES,
+        ):
+            new, newrename = self._merge_resources(
+                original_resources, page2resources, res
+            )
+            if new:
+                new_resources[NameObject(res)] = new
+                rename.update(newrename)
+
+        # Combine /ProcSet sets, making sure there's a consistent order
+        new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
+            sorted(
+                set(
+                    original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
+                ).union(
+                    set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
+                )
+            )
+        )
+
+        new_content_array = ArrayObject()
+        original_content = self.get_contents()
+        if original_content is not None:
+            original_content.isolate_graphics_state()
+            new_content_array.append(original_content)
+
+        page2content = page2.get_contents()
+        if page2content is not None:
+            # prepend "re W n" built from the MERGE_CROP_BOX rectangle of page2
+            rect = getattr(page2, MERGE_CROP_BOX)
+            page2content.operations.insert(
+                0,
+                (
+                    # a list and not a map(): a map iterator is exhausted
+                    # after the first pass over the operands
+                    [
+                        FloatObject(value)
+                        for value in (
+                            rect.left,
+                            rect.bottom,
+                            rect.width,
+                            rect.height,
+                        )
+                    ],
+                    "re",
+                ),
+            )
+            page2content.operations.insert(1, ([], "W"))
+            page2content.operations.insert(2, ([], "n"))
+            if page2transformation is not None:
+                page2content = page2transformation(page2content)
+            page2content = PageObject._content_stream_rename(
+                page2content, rename, self.pdf
+            )
+            page2content.isolate_graphics_state()
+            if over:
+                new_content_array.append(page2content)
+            else:
+                new_content_array.insert(0, page2content)
+
+        # if expanding the page to fit a new page, calculate the new media box size
+        if expand:
+            self._expand_mediabox(page2, ctm)
+
+        self.replace_contents(ContentStream(new_content_array, self.pdf))
+        self[NameObject(PG.RESOURCES)] = new_resources
+        self[NameObject(PG.ANNOTS)] = new_annots
+
+    def _merge_page_writer(
+        self,
+        page2: "PageObject",
+        page2transformation: Optional[Callable[[Any], ContentStream]] = None,
+        ctm: Optional[CompressedTransformationMatrix] = None,
+        over: bool = True,
+        expand: bool = False,
+    ) -> None:
+        """
+        Merge ``page2`` into this page, updating this page in place.
+
+        The resources and ``/ProcSet`` of this page are extended, the
+        annotations of ``page2`` are cloned into the document of this page
+        and the combined contents are stored with ``replace_contents``.
+
+        Args:
+            page2: the page to be merged into this one.
+            page2transformation: optional callable applied to the content
+                stream of ``page2`` (after the clipping operations have been
+                prepended); it has to return the stream to merge.
+            ctm: matrix applied to the annotation coordinates of ``page2``
+                and handed over to ``_expand_mediabox``.
+            over: if True the content of ``page2`` is added after the
+                content of this page, else before it.
+            expand: if True ``_expand_mediabox`` is called for ``page2``.
+        """
+        # First we work on merging the resource dictionaries. This allows us
+        # to find which symbols in the content streams we might need to
+        # rename.
+        assert isinstance(self.indirect_reference, IndirectObject)
+        pdf = self.indirect_reference.pdf
+
+        rename: Dict[Any, Any] = {}
+        if PG.RESOURCES not in self:
+            self[NameObject(PG.RESOURCES)] = DictionaryObject()
+        original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
+        if PG.RESOURCES not in page2:
+            page2resources = DictionaryObject()
+        else:
+            page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())
+
+        for res in (
+            RES.EXT_G_STATE,
+            RES.FONT,
+            RES.XOBJECT,
+            RES.COLOR_SPACE,
+            RES.PATTERN,
+            RES.SHADING,
+            RES.PROPERTIES,
+        ):
+            if res in page2resources:
+                if res not in original_resources:
+                    original_resources[NameObject(res)] = DictionaryObject()
+                # only the rename mapping of the result is used here
+                _, newrename = self._merge_resources(
+                    original_resources, page2resources, res, False
+                )
+                rename.update(newrename)
+        # Combine /ProcSet sets.
+        if RES.PROC_SET in page2resources:
+            if RES.PROC_SET not in original_resources:
+                original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
+            arr = cast(ArrayObject, original_resources[RES.PROC_SET])
+            for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
+                if x not in arr:
+                    arr.append(x)
+            arr.sort()
+
+        if PG.ANNOTS in page2:
+            if PG.ANNOTS not in self:
+                self[NameObject(PG.ANNOTS)] = ArrayObject()
+            annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
+            trsf = Transformation() if ctm is None else Transformation(ctm)
+            for a in cast(ArrayObject, page2[PG.ANNOTS]):
+                a = a.get_object()
+                # duplicate the annotation into the document of this page
+                aa = a.clone(
+                    pdf,
+                    ignore_fields=("/P", "/StructParent", "/Parent"),
+                    force_duplicate=True,
+                )
+                # transform two opposite corners and re-normalise the rectangle
+                r = cast(ArrayObject, a["/Rect"])
+                pt1 = trsf.apply_on((r[0], r[1]), True)
+                pt2 = trsf.apply_on((r[2], r[3]), True)
+                aa[NameObject("/Rect")] = ArrayObject(
+                    (
+                        min(pt1[0], pt2[0]),
+                        min(pt1[1], pt2[1]),
+                        max(pt1[0], pt2[0]),
+                        max(pt1[1], pt2[1]),
+                    )
+                )
+                if "/QuadPoints" in a:
+                    # the first four coordinate pairs are transformed
+                    q = cast(ArrayObject, a["/QuadPoints"])
+                    aa[NameObject("/QuadPoints")] = ArrayObject(
+                        trsf.apply_on((q[0], q[1]), True)
+                        + trsf.apply_on((q[2], q[3]), True)
+                        + trsf.apply_on((q[4], q[5]), True)
+                        + trsf.apply_on((q[6], q[7]), True)
+                    )
+                try:
+                    # point the popup back at the cloned annotation
+                    aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
+                except KeyError:
+                    pass
+                try:
+                    aa[NameObject("/P")] = self.indirect_reference
+                    annots.append(aa.indirect_reference)
+                except AttributeError:
+                    pass
+
+        new_content_array = ArrayObject()
+        original_content = self.get_contents()
+        if original_content is not None:
+            original_content.isolate_graphics_state()
+            new_content_array.append(original_content)
+
+        page2content = page2.get_contents()
+        if page2content is not None:
+            # prepend "re W n" built from the MERGE_CROP_BOX rectangle of page2
+            rect = getattr(page2, MERGE_CROP_BOX)
+            page2content.operations.insert(
+                0,
+                (
+                    # a list and not a map(): a map iterator is exhausted
+                    # after the first pass over the operands
+                    [
+                        FloatObject(value)
+                        for value in (
+                            rect.left,
+                            rect.bottom,
+                            rect.width,
+                            rect.height,
+                        )
+                    ],
+                    "re",
+                ),
+            )
+            page2content.operations.insert(1, ([], "W"))
+            page2content.operations.insert(2, ([], "n"))
+            if page2transformation is not None:
+                page2content = page2transformation(page2content)
+            page2content = PageObject._content_stream_rename(
+                page2content, rename, self.pdf
+            )
+            page2content.isolate_graphics_state()
+            if over:
+                new_content_array.append(page2content)
+            else:
+                new_content_array.insert(0, page2content)
+
+        # if expanding the page to fit a new page, calculate the new media box size
+        if expand:
+            self._expand_mediabox(page2, ctm)
+
+        # resources and annotations were updated in place above
+        self.replace_contents(new_content_array)
+
+    def _expand_mediabox(
+        self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix]
+    ) -> None:
+        """
+        Grow the mediabox of this page so that it also covers the mediabox
+        of ``page2``.
+
+        Args:
+            page2: page whose mediabox has to fit into this page.
+            ctm: optional matrix applied to the four corners of the
+                mediabox of ``page2`` before the comparison.
+        """
+        own_left = self.mediabox.left.as_numeric()
+        own_bottom = self.mediabox.bottom.as_numeric()
+        own_right = self.mediabox.right.as_numeric()
+        own_top = self.mediabox.top.as_numeric()
+
+        left2 = page2.mediabox.left.as_numeric()
+        bottom2 = page2.mediabox.bottom.as_numeric()
+        right2 = page2.mediabox.right.as_numeric()
+        top2 = page2.mediabox.top.as_numeric()
+        # corners of page2: lower left, upper left, upper right, lower right
+        xs = (left2, left2, right2, right2)
+        ys = (bottom2, top2, top2, bottom2)
+
+        if ctm is None:
+            new_x, new_y = xs, ys
+        else:
+            m = tuple(float(x) for x in ctm)
+            new_x = tuple(m[0] * x + m[2] * y + m[4] for x, y in zip(xs, ys))
+            new_y = tuple(m[1] * x + m[3] * y + m[5] for x, y in zip(xs, ys))
+
+        # union of the own box and the bounding box of the page2 corners
+        self.mediabox.lower_left = (
+            min(own_left, min(new_x)),
+            min(own_bottom, min(new_y)),
+        )
+        self.mediabox.upper_right = (
+            max(own_right, max(new_x)),
+            max(own_top, max(new_y)),
+        )
+
+    def merge_transformed_page(
+        self,
+        page2: "PageObject",
+        ctm: Union[CompressedTransformationMatrix, Transformation],
+        over: bool = True,
+        expand: bool = False,
+    ) -> None:
+        """
+        Merge ``page2`` into this page like merge_page does, with a
+        transformation matrix applied to the merged stream.
+
+        Args:
+          page2: The page to be merged into this one.
+          ctm: a 6-element tuple containing the operands of the
+                 transformation matrix, or a ``Transformation`` object
+          over: set the page2 content over page1 if True (default) else under
+          expand: Whether the page should be expanded to fit the dimensions
+            of the page to be merged.
+        """
+        matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm
+
+        def apply_matrix(page2_content: Any) -> ContentStream:
+            # put the matrix in front of the content of page2
+            return PageObject._add_transformation_matrix(
+                page2_content, page2.pdf, cast(CompressedTransformationMatrix, matrix)
+            )
+
+        self._merge_page(page2, apply_matrix, matrix, over, expand)
+
+    def merge_scaled_page(
+        self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
+    ) -> None:
+        """
+        Merge ``page2`` into this page like merge_page does, with the merged
+        stream scaled through a transformation matrix.
+
+        Args:
+          page2: The page to be merged into this one.
+          scale: The scaling factor (used for both axes)
+          over: set the page2 content over page1 if True (default) else under
+          expand: Whether the page should be expanded to fit the
+            dimensions of the page to be merged.
+        """
+        self.merge_transformed_page(
+            page2, Transformation().scale(scale, scale), over, expand
+        )
+
+    def merge_rotated_page(
+        self,
+        page2: "PageObject",
+        rotation: float,
+        over: bool = True,
+        expand: bool = False,
+    ) -> None:
+        """
+        Merge ``page2`` into this page like merge_page does, with the merged
+        stream rotated through a transformation matrix.
+
+        Args:
+          page2: The page to be merged into this one.
+          rotation: The angle of the rotation, in degrees
+          over: set the page2 content over page1 if True (default) else under
+          expand: Whether the page should be expanded to fit the
+            dimensions of the page to be merged.
+        """
+        self.merge_transformed_page(
+            page2, Transformation().rotate(rotation), over, expand
+        )
+
+    def merge_translated_page(
+        self,
+        page2: "PageObject",
+        tx: float,
+        ty: float,
+        over: bool = True,
+        expand: bool = False,
+    ) -> None:
+        """
+        Merge ``page2`` into this page like merge_page does, with the merged
+        stream translated through a transformation matrix.
+
+        Args:
+          page2: the page to be merged into this one.
+          tx: The translation on X axis
+          ty: The translation on Y axis
+          over: set the page2 content over page1 if True (default) else under
+          expand: Whether the page should be expanded to fit the
+            dimensions of the page to be merged.
+        """
+        self.merge_transformed_page(
+            page2, Transformation().translate(tx, ty), over, expand
+        )
+
+    def add_transformation(
+        self,
+        ctm: Union[Transformation, CompressedTransformationMatrix],
+        expand: bool = False,
+    ) -> None:
+        """
+        Apply a transformation matrix to the page.
+
+        Args:
+            ctm: A 6-element tuple containing the operands of the
+                transformation matrix. Alternatively, a
+                :py:class:`Transformation<pypdf.Transformation>`
+                object can be passed.
+            expand: If True, the mediabox is replaced by the bounding box
+                of its four transformed corners.
+
+        See :doc:`/user/cropping-and-transforming`.
+        """
+        matrix = ctm.ctm if isinstance(ctm, Transformation) else ctm
+        stream = self.get_contents()
+        if stream is not None:
+            stream = PageObject._add_transformation_matrix(stream, self.pdf, matrix)
+            stream.isolate_graphics_state()
+            self.replace_contents(stream)
+        if not expand:
+            return
+
+        # if expanding the page to fit a new page, calculate the new media box size
+        left = self.mediabox.left.as_numeric()
+        bottom = self.mediabox.bottom.as_numeric()
+        right = self.mediabox.right.as_numeric()
+        top = self.mediabox.top.as_numeric()
+        # lower left, upper left, upper right, lower right
+        points = ((left, bottom), (left, top), (right, top), (right, bottom))
+
+        m = tuple(float(x) for x in matrix)
+        new_x = [m[0] * x + m[2] * y + m[4] for x, y in points]
+        new_y = [m[1] * x + m[3] * y + m[5] for x, y in points]
+
+        self.mediabox.lower_left = (min(new_x), min(new_y))
+        self.mediabox.upper_right = (max(new_x), max(new_y))
+
+    def scale(self, sx: float, sy: float) -> None:
+        """
+        Scale a page by the given factors by applying a transformation matrix
+        to its content and updating the page size.
+
+        This updates the page boundaries (mediabox, cropbox, bleedbox,
+        trimbox, artbox), the ``/Rect`` of the annotations, the ``/BBox`` of
+        the (first) viewport and the contents of the page.
+
+        Args:
+            sx: The scaling factor on horizontal axis.
+            sy: The scaling factor on vertical axis.
+        """
+        self.add_transformation((sx, 0, 0, sy, 0, 0))
+        # Boxes are scaled from the most dependent one up to the mediabox.
+        # NOTE(review): the box accessors are defined outside this method; per
+        # the pypdf documentation bleedbox/trimbox/artbox default to cropbox
+        # and cropbox defaults to mediabox. Scaling cropbox first would let
+        # an unset artbox/bleedbox/trimbox pick up the already scaled cropbox
+        # and be scaled a second time - confirm against the accessors.
+        self.bleedbox = self.bleedbox.scale(sx, sy)
+        self.trimbox = self.trimbox.scale(sx, sy)
+        self.artbox = self.artbox.scale(sx, sy)
+        self.cropbox = self.cropbox.scale(sx, sy)
+        self.mediabox = self.mediabox.scale(sx, sy)
+
+        if PG.ANNOTS in self:
+            annotations = self[PG.ANNOTS]
+            if isinstance(annotations, ArrayObject):
+                for annotation in annotations:
+                    annotation_obj = annotation.get_object()
+                    if ADA.Rect in annotation_obj:
+                        rectangle = annotation_obj[ADA.Rect]
+                        if isinstance(rectangle, ArrayObject):
+                            # x values are scaled by sx, y values by sy
+                            rectangle[0] = FloatObject(float(rectangle[0]) * sx)
+                            rectangle[1] = FloatObject(float(rectangle[1]) * sy)
+                            rectangle[2] = FloatObject(float(rectangle[2]) * sx)
+                            rectangle[3] = FloatObject(float(rectangle[3]) * sy)
+
+        if PG.VP in self:
+            viewport = self[PG.VP]
+            # for an array only its first element is read and updated
+            if isinstance(viewport, ArrayObject):
+                bbox = viewport[0]["/BBox"]
+            else:
+                bbox = viewport["/BBox"]  # type: ignore
+            scaled_bbox = RectangleObject(
+                (
+                    float(bbox[0]) * sx,
+                    float(bbox[1]) * sy,
+                    float(bbox[2]) * sx,
+                    float(bbox[3]) * sy,
+                )
+            )
+            if isinstance(viewport, ArrayObject):
+                self[NameObject(PG.VP)][NumberObject(0)][  # type: ignore
+                    NameObject("/BBox")
+                ] = scaled_bbox
+            else:
+                self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox  # type: ignore
+
+    def scale_by(self, factor: float) -> None:
+        """
+        Scale a page uniformly: the same factor is handed to ``scale`` for
+        the horizontal and the vertical axis.
+
+        Args:
+            factor: The scaling factor (for both X and Y axis).
+        """
+        self.scale(sx=factor, sy=factor)
+
+    def scale_to(self, width: float, height: float) -> None:
+        """
+        Scale a page to the specified dimensions: the factors handed to
+        ``scale`` are the requested size divided by the current mediabox
+        size.
+
+        Args:
+            width: The new width.
+            height: The new height.
+        """
+        horizontal = width / float(self.mediabox.width)
+        vertical = height / float(self.mediabox.height)
+        self.scale(sx=horizontal, sy=vertical)
+
+    def compress_content_streams(self, level: int = -1) -> None:
+        """
+        Compress the size of this page by joining all content streams and
+        applying a FlateDecode filter.
+
+        However, it is possible that this function will perform no action if
+        content stream compression becomes "automatic".
+
+        Args:
+            level: value handed over to ``flate_encode``.
+
+        Raises:
+            ValueError: if the compressed stream can neither be stored
+                through the reference of the contents nor through
+                ``replace_contents``.
+        """
+        stream = self.get_contents()
+        if stream is None:
+            return
+        compressed = stream.flate_encode(level)
+        try:
+            # store the result in the slot of the existing contents
+            stream_ref = stream.indirect_reference
+            stream_ref.pdf._objects[stream_ref.idnum - 1] = compressed  # type: ignore
+        except AttributeError:
+            page_ref = self.indirect_reference
+            if page_ref is None or not hasattr(page_ref.pdf, "_add_object"):
+                raise ValueError("Page must be part of a PdfWriter")
+            self.replace_contents(compressed)
+
+    @property
+    def page_number(self) -> Optional[int]:
+        """
+        Read-only property giving the index of this page in the ``pages``
+        of its PDF.
+
+        Returns:
+            int : page number; None if the page is not attached to a PDF
+            or is not found in its pages.
+        """
+        ref = self.indirect_reference
+        if ref is None:
+            return None
+        try:
+            return ref.pdf.pages.index(self)
+        except ValueError:
+            return None
+
+    def _debug_for_extract(self) -> str:  # pragma: no cover
+        """
+        Build a textual dump of the content-stream operations followed by
+        the fonts of the page resources (debugging helper).
+
+        Returns:
+            One line per operation, a separator, then for each font its
+            name, its representation and, when available, its
+            ``/Encoding`` and ``/ToUnicode`` data; ``"No Font"`` is added
+            when the font lookup raises ``KeyError``.
+        """
+        chunks = []
+        stream = ContentStream(self["/Contents"].get_object(), self.pdf, "bytes")
+        for operands, operator in stream.operations:
+            # for TJ the str items of the first operand are shown inline
+            if operator == b"TJ":
+                strings = [x for x in operands[0] if isinstance(x, str)]
+            else:
+                strings = []
+            chunks.append(
+                operator.decode("utf-8")
+                + " "
+                + "".join(strings)
+                + operands.__repr__()
+                + "\n"
+            )
+        chunks.append("\n=============================\n")
+        try:
+            fonts = self[PG.RESOURCES]["/Font"]  # type:ignore
+            for name in fonts:
+                chunks.append(name + "\n")
+                chunks.append(fonts[name].__repr__() + "\n")
+                try:
+                    chunks.append(fonts[name]["/Encoding"].__repr__() + "\n")
+                except Exception:
+                    pass
+                try:
+                    chunks.append(fonts[name]["/ToUnicode"].get_data().decode() + "\n")
+                except Exception:
+                    pass
+
+        except KeyError:
+            chunks.append("No Font\n")
+        return "".join(chunks)
+
+    def _extract_text(
+        self,
+        obj: Any,
+        pdf: Any,
+        orientations: Tuple[int, ...] = (0, 90, 180, 270),
+        space_width: float = 200.0,
+        content_key: Optional[str] = PG.CONTENTS,
+        visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
+        visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
+        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
+    ) -> str:
+        """
+        See extract_text for most arguments.
+
+        Args:
+            content_key: indicate the default key where to extract data
+                None = the object; this allow to reuse the function on XObject
+                default = "/Content"
+        """
+        text: str = ""
+        output: str = ""
+        rtl_dir: bool = False  # right-to-left
+        cmaps: Dict[
+            str,
+            Tuple[
+                str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject
+            ],
+        ] = {}
+        try:
+            objr = obj
+            while NameObject(PG.RESOURCES) not in objr:
+                # /Resources can be inherited sometimes so we look to parents
+                objr = objr["/Parent"].get_object()
+                # if no parents we will have no /Resources will be available
+                # => an exception will be raised
+            resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
+        except Exception:
+            # no resources means no text is possible (no font) we consider the
+            # file as not damaged, no need to check for TJ or Tj
+            return ""
+        if "/Font" in resources_dict:
+            for f in cast(DictionaryObject, resources_dict["/Font"]):
+                cmaps[f] = build_char_map(f, space_width, obj)
+        cmap: Tuple[
+            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
+        ] = (
+            "charmap",
+            {},
+            "NotInitialized",
+            None,
+        )  # (encoding,CMAP,font resource name,dictionary-object of font)
+        try:
+            content = (
+                obj[content_key].get_object() if isinstance(content_key, str) else obj
+            )
+            if not isinstance(content, ContentStream):
+                content = ContentStream(content, pdf, "bytes")
+        except KeyError:  # it means no content can be extracted(certainly empty page)
+            return ""
+        # Note: we check all strings are TextStringObjects. ByteStringObjects
+        # are strings where the byte->string encoding was unknown, so adding
+        # them to the text here would be gibberish.
+
+        cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        cm_stack = []
+        tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+
+        # cm/tm_prev stores the last modified matrices can be an intermediate position
+        cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+
+        # memo_cm/tm will be used to store the position at the beginning of building the text
+        memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        char_scale = 1.0
+        space_scale = 1.0
+        _space_width: float = 500.0  # will be set correctly at first Tf
+        TL = 0.0
+        font_size = 12.0  # init just in case of
+
+        def current_spacewidth() -> float:
+            return _space_width / 1000.0
+
+        def process_operation(operator: bytes, operands: List[Any]) -> None:
+            nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
+            nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
+            nonlocal orientations, rtl_dir, visitor_text, output, text
+            global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
+
+            check_crlf_space: bool = False
+            # Table 5.4 page 405
+            if operator == b"BT":
+                tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+                output += text
+                if visitor_text is not None:
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+                text = ""
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
+                return None
+            elif operator == b"ET":
+                output += text
+                if visitor_text is not None:
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+                text = ""
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
+            # table 4.7 "Graphics state operators", page 219
+            # cm_matrix calculation is a reserved for the moment
+            elif operator == b"q":
+                cm_stack.append(
+                    (
+                        cm_matrix,
+                        cmap,
+                        font_size,
+                        char_scale,
+                        space_scale,
+                        _space_width,
+                        TL,
+                    )
+                )
+            elif operator == b"Q":
+                try:
+                    (
+                        cm_matrix,
+                        cmap,
+                        font_size,
+                        char_scale,
+                        space_scale,
+                        _space_width,
+                        TL,
+                    ) = cm_stack.pop()
+                except Exception:
+                    cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+            elif operator == b"cm":
+                output += text
+                if visitor_text is not None:
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+                text = ""
+                cm_matrix = mult(
+                    [
+                        float(operands[0]),
+                        float(operands[1]),
+                        float(operands[2]),
+                        float(operands[3]),
+                        float(operands[4]),
+                        float(operands[5]),
+                    ],
+                    cm_matrix,
+                )
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
+            # Table 5.2 page 398
+            elif operator == b"Tz":
+                char_scale = float(operands[0]) / 100.0
+            elif operator == b"Tw":
+                space_scale = 1.0 + float(operands[0])
+            elif operator == b"TL":
+                TL = float(operands[0])
+            elif operator == b"Tf":
+                if text != "":
+                    output += text  # .translate(cmap)
+                    if visitor_text is not None:
+                        visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+                text = ""
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
+                try:
+                    # charMapTuple: font_type, float(sp_width / 2), encoding,
+                    #               map_dict, font-dictionary
+                    charMapTuple = cmaps[operands[0]]
+                    _space_width = charMapTuple[1]
+                    # current cmap: encoding, map_dict, font resource name
+                    #               (internal name, not the real font-name),
+                    # font-dictionary. The font-dictionary describes the font.
+                    cmap = (
+                        charMapTuple[2],
+                        charMapTuple[3],
+                        operands[0],
+                        charMapTuple[4],
+                    )
+                except KeyError:  # font not found
+                    _space_width = unknown_char_map[1]
+                    cmap = (
+                        unknown_char_map[2],
+                        unknown_char_map[3],
+                        "???" + operands[0],
+                        None,
+                    )
+                try:
+                    font_size = float(operands[1])
+                except Exception:
+                    pass  # keep previous size
+            # Table 5.5 page 406
+            elif operator == b"Td":
+                check_crlf_space = True
+                # A special case is a translating only tm:
+                # tm[0..5] = 1 0 0 1 e f,
+                # i.e. tm[4] += tx, tm[5] += ty.
+                tx = float(operands[0])
+                ty = float(operands[1])
+                tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
+                tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
+            elif operator == b"Tm":
+                check_crlf_space = True
+                tm_matrix = [
+                    float(operands[0]),
+                    float(operands[1]),
+                    float(operands[2]),
+                    float(operands[3]),
+                    float(operands[4]),
+                    float(operands[5]),
+                ]
+            elif operator == b"T*":
+                check_crlf_space = True
+                tm_matrix[5] -= TL
+
+            elif operator == b"Tj":
+                check_crlf_space = True
+                text, rtl_dir = handle_tj(
+                    text,
+                    operands,
+                    cm_matrix,
+                    tm_matrix,  # text matrix
+                    cmap,
+                    orientations,
+                    output,
+                    font_size,
+                    rtl_dir,
+                    visitor_text,
+                )
+            else:
+                return None
+            if check_crlf_space:
+                try:
+                    text, output, cm_prev, tm_prev = crlf_space_check(
+                        text,
+                        (cm_prev, tm_prev),
+                        (cm_matrix, tm_matrix),
+                        (memo_cm, memo_tm),
+                        cmap,
+                        orientations,
+                        output,
+                        font_size,
+                        visitor_text,
+                        current_spacewidth(),
+                    )
+                    if text == "":
+                        memo_cm = cm_matrix.copy()
+                        memo_tm = tm_matrix.copy()
+                except OrientationNotFoundError:
+                    return None
+
+        for operands, operator in content.operations:
+            if visitor_operand_before is not None:
+                visitor_operand_before(operator, operands, cm_matrix, tm_matrix)
+            # multiple operators are defined in here ####
+            if operator == b"'":
+                process_operation(b"T*", [])
+                process_operation(b"Tj", operands)
+            elif operator == b'"':
+                process_operation(b"Tw", [operands[0]])
+                process_operation(b"Tc", [operands[1]])
+                process_operation(b"T*", [])
+                process_operation(b"Tj", operands[2:])
+            elif operator == b"TD":
+                process_operation(b"TL", [-operands[1]])
+                process_operation(b"Td", operands)
+            elif operator == b"TJ":
+                for op in operands[0]:
+                    if isinstance(op, (str, bytes)):
+                        process_operation(b"Tj", [op])
+                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
+                        (abs(float(op)) >= _space_width)
+                        and (len(text) > 0)
+                        and (text[-1] != " ")
+                    ):
+                        process_operation(b"Tj", [" "])
+            elif operator == b"Do":
+                output += text
+                if visitor_text is not None:
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+                try:
+                    if output[-1] != "\n":
+                        output += "\n"
+                        if visitor_text is not None:
+                            visitor_text(
+                                "\n",
+                                memo_cm,
+                                memo_tm,
+                                cmap[3],
+                                font_size,
+                            )
+                except IndexError:
+                    pass
+                try:
+                    xobj = resources_dict["/XObject"]
+                    if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
+                        text = self.extract_xform_text(
+                            xobj[operands[0]],  # type: ignore
+                            orientations,
+                            space_width,
+                            visitor_operand_before,
+                            visitor_operand_after,
+                            visitor_text,
+                        )
+                        output += text
+                        if visitor_text is not None:
+                            visitor_text(
+                                text,
+                                memo_cm,
+                                memo_tm,
+                                cmap[3],
+                                font_size,
+                            )
+                except Exception:
+                    logger_warning(
+                        f" impossible to decode XFormObject {operands[0]}",
+                        __name__,
+                    )
+                finally:
+                    text = ""
+                    memo_cm = cm_matrix.copy()
+                    memo_tm = tm_matrix.copy()
+
+            else:
+                process_operation(operator, operands)
+            if visitor_operand_after is not None:
+                visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
+        output += text  # just in case of
+        if text != "" and visitor_text is not None:
+            visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+        return output
+
+    def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
+        """
+        Collect the fonts needed by "layout" mode text extraction.
+
+        Starting at this page and following the "/Parent" links upwards, each
+        name listed under "/Font" in a resources dictionary is converted into
+        a _layout_mode.Font. Nothing is collected while ``self.pdf`` is None.
+
+        Returns:
+            Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name
+        """
+
+        def _resolved(value: Any) -> Any:
+            # Indirect references are dereferenced; arrays are dereferenced
+            # element by element; every other value is used unchanged.
+            if isinstance(value, IndirectObject):
+                return value.get_object()
+            if isinstance(value, ArrayObject):
+                return [item.get_object() for item in value]
+            return value
+
+        # Font retrieval logic adapted from pypdf.PageObject._extract_text()
+        collected: Dict[str, _layout_mode.Font] = {}
+        node: Any = self
+        while node is not None:
+            try:
+                resources: Any = node[PG.RESOURCES]
+            except KeyError:
+                resources = {}
+            if "/Font" in resources and self.pdf is not None:
+                for name in resources["/Font"]:
+                    # NOTE(review): the char map is always built from the page
+                    # itself (self), not from the ancestor being visited -
+                    # confirm this is intended.
+                    *char_map, raw_font = build_char_map(name, 200.0, self)
+                    plain_font = {
+                        key: _resolved(entry) for key, entry in raw_font.items()
+                    }
+                    # mypy cannot follow the star-unpacking of the char map
+                    collected[name] = _layout_mode.Font(*char_map, plain_font)  # type: ignore[call-arg,arg-type]
+            try:
+                node = node["/Parent"].get_object()
+            except KeyError:
+                node = None
+
+        return collected
+
+    def _layout_mode_text(
+        self,
+        space_vertically: bool = True,
+        scale_weight: float = 1.25,
+        strip_rotated: bool = True,
+        debug_path: Optional[Path] = None,
+    ) -> str:
+        """
+        Get text preserving fidelity to source PDF text layout.
+
+        Args:
+            space_vertically: include blank lines inferred from y distance + font
+                height. Defaults to True.
+            scale_weight: multiplier for string length when calculating weighted
+                average character width. Defaults to 1.25.
+            strip_rotated: Removes text that is rotated w.r.t. to the page from
+                layout mode output. Defaults to True.
+            debug_path (Path | None): if supplied, must target a directory.
+                creates the following files with debug information for layout mode
+                functions if supplied:
+                  - fonts.json: output of self._layout_mode_fonts
+                  - tjs.json: individual text render ops with corresponding transform matrices
+                  - bts.json: text render ops left justified and grouped by BT/ET operators
+                  - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
+                Defaults to None.
+
+        Returns:
+            str: multiline string containing page text in a fixed width format that
+                closely adheres to the rendered layout in the source pdf. An empty
+                string is returned when the page has no "/Contents" entry or when
+                no BT groups are produced from the content stream.
+        """
+        fonts = self._layout_mode_fonts()
+        if debug_path:  # pragma: no cover
+            import json
+
+            debug_path.joinpath("fonts.json").write_text(
+                json.dumps(
+                    fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
+                ),
+                "utf-8",
+            )
+
+        # A page without a content stream carries no text. Plain mode returns
+        # "" in that situation, so do the same here instead of letting the
+        # KeyError escape to the caller.
+        try:
+            contents = self["/Contents"]
+        except KeyError:
+            return ""
+
+        ops = iter(ContentStream(contents.get_object(), self.pdf, "bytes").operations)
+        bt_groups = _layout_mode.text_show_operations(
+            ops, fonts, strip_rotated, debug_path
+        )
+
+        if not bt_groups:
+            return ""
+
+        ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path)
+
+        char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)
+
+        return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically)
+
+    def extract_text(
+        self,
+        *args: Any,
+        orientations: Union[int, Tuple[int, ...]] = (0, 90, 180, 270),
+        space_width: float = 200.0,
+        visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
+        visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
+        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
+        extraction_mode: Literal["plain", "layout"] = "plain",
+        **kwargs: Any,
+    ) -> str:
+        """
+        Extract the text of the page by walking its text drawing commands in
+        the order in which they appear in the content stream.
+
+        The quality of the result depends on the generator of the PDF file:
+        it works well for some files and poorly for others. Do not rely on
+        the order of the returned text, as it will change if this function is
+        made more sophisticated.
+
+        Arabic and Hebrew are extracted in the correct order.
+        If required a custom RTL range of characters can be defined;
+        see function set_custom_rtl.
+
+        Visitor functions can be supplied to be informed about every
+        operation and every text object, which can for example help to parse
+        tables in some PDF files.
+
+        Args:
+            *args: legacy positional form. Either ``(orientations[, space_width])``
+                or, when the first value is a string, two ignored values
+                followed by ``orientations`` and ``space_width``.
+            orientations: list of orientations extract_text will look for
+                default = (0, 90, 180, 270)
+                note: currently only 0 (up),90 (turned left), 180 (upside down),
+                270 (turned right)
+                A single int is accepted and wrapped into a one-element tuple.
+            space_width: force default space width
+                if not extracted from font (default: 200)
+            visitor_operand_before: function to be called before processing an operation.
+                It has four arguments: operator, operand-arguments,
+                current transformation matrix and text matrix.
+            visitor_operand_after: function to be called after processing an operation.
+                It has four arguments: operator, operand-arguments,
+                current transformation matrix and text matrix.
+            visitor_text: function to be called when extracting some text at some position.
+                It has five arguments: text, current transformation matrix,
+                text matrix, font-dictionary and font-size.
+                The font-dictionary may be None in case of unknown fonts.
+                If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
+            extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
+                "layout" for experimental layout mode functionality.
+                NOTE: orientations, space_width, and visitor_* parameters are NOT respected
+                in "layout" mode.
+
+        kwargs:
+            layout_mode_space_vertically (bool): include blank lines inferred from
+                y distance + font height. Defaults to True.
+            layout_mode_scale_weight (float): multiplier for string length when calculating
+                weighted average character width. Defaults to 1.25.
+            layout_mode_strip_rotated (bool): layout mode does not support rotated text.
+                Set to False to include rotated text anyway. If rotated text is discovered,
+                layout will be degraded and a warning will result. Defaults to True.
+            layout_mode_debug_path (Path | None): if supplied, must target a directory.
+                creates the following files with debug information for layout mode
+                functions if supplied:
+
+                  - fonts.json: output of self._layout_mode_fonts
+                  - tjs.json: individual text render ops with corresponding transform matrices
+                  - bts.json: text render ops left justified and grouped by BT/ET operators
+                  - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
+
+        Returns:
+            The extracted text
+
+        Raises:
+            ValueError: extraction_mode is neither "plain" nor "layout".
+            TypeError: a positional argument has an unexpected type.
+        """
+        if extraction_mode == "layout":
+            layout_options = {
+                "space_vertically": kwargs.get("layout_mode_space_vertically", True),
+                "scale_weight": kwargs.get("layout_mode_scale_weight", 1.25),
+                "strip_rotated": kwargs.get("layout_mode_strip_rotated", True),
+                "debug_path": kwargs.get("layout_mode_debug_path", None),
+            }
+            return self._layout_mode_text(**layout_options)
+        if extraction_mode != "plain":
+            raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
+
+        if args:
+            leading = args[0]
+            if isinstance(leading, str):
+                # legacy call style: the first two positional values are
+                # skipped, orientations and space_width follow them
+                offset = 2
+            elif isinstance(leading, (tuple, int)):
+                offset = 0
+            else:
+                raise TypeError(f"Invalid positional parameter {leading}")
+
+            if len(args) > offset:
+                candidate = args[offset]
+                if not isinstance(candidate, (tuple, int)):
+                    raise TypeError(f"Invalid positional parameter {candidate}")
+                orientations = candidate
+            if len(args) > offset + 1:
+                candidate = args[offset + 1]
+                if not isinstance(candidate, (float, int)):
+                    raise TypeError(f"Invalid positional parameter {candidate}")
+                space_width = candidate
+
+        wanted_orientations = (
+            (orientations,) if isinstance(orientations, int) else orientations
+        )
+
+        return self._extract_text(
+            self,
+            self.pdf,
+            wanted_orientations,
+            space_width,
+            PG.CONTENTS,
+            visitor_operand_before,
+            visitor_operand_after,
+            visitor_text,
+        )
+
+    def extract_xform_text(
+        self,
+        xform: EncodedStreamObject,
+        orientations: Tuple[int, ...] = (0, 90, 180, 270),
+        space_width: float = 200.0,
+        visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
+        visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
+        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
+    ) -> str:
+        """
+        Extract text from an XObject.
+
+        Args:
+            xform: the XObject whose own content is scanned for text.
+            orientations: orientations to look for. Defaults to
+                (0, 90, 180, 270), the same set as extract_text; the former
+                default listed 360 and left out 180.
+            space_width: force default space width
+                if not extracted from font (default: 200)
+            visitor_operand_before: function to be called before processing an
+                operation, with the arguments operator, operand-arguments,
+                current transformation matrix and text matrix.
+            visitor_operand_after: function to be called after processing an
+                operation, with the same four arguments.
+            visitor_text: function to be called when some text is extracted,
+                with the arguments text, current transformation matrix, text
+                matrix, font-dictionary and font-size.
+
+        Returns:
+            The extracted text
+        """
+        return self._extract_text(
+            xform,
+            self.pdf,
+            orientations,
+            space_width,
+            None,  # no content key: the XObject itself is the content
+            visitor_operand_before,
+            visitor_operand_after,
+            visitor_text,
+        )
+
+    def _get_fonts(self) -> Tuple[Set[str], Set[str]]:
+        """
+        Report which fonts of the page are embedded and which are not.
+
+        Returns:
+            A tuple (Set of embedded fonts, set of unembedded fonts); the
+            second set holds every font found that is not in the first.
+        """
+        page_dict = self.get_object()
+        assert isinstance(page_dict, DictionaryObject)
+        all_fonts, embedded_fonts = _get_fonts_walk(page_dict, set(), set())
+        return embedded_fonts, all_fonts - embedded_fonts
+
+    # Page boundary boxes, each exposed through an accessor built by
+    # _create_rectangle_accessor.
+    # NOTE(review): the second argument presumably lists the boxes consulted,
+    # in order, when the named box is absent - confirm against
+    # _create_rectangle_accessor.
+    mediabox = _create_rectangle_accessor(PG.MEDIABOX, ())
+    """A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
+    default user space units, defining the boundaries of the physical medium on
+    which the page is intended to be displayed or printed."""
+
+    cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,))
+    """
+    A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
+    default user space units, defining the visible region of default user
+    space.
+
+    When the page is displayed or printed, its contents are to be clipped
+    (cropped) to this rectangle and then imposed on the output medium in some
+    implementation-defined manner. Default value: same as
+    :attr:`mediabox<mediabox>`.
+    """
+
+    bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX))
+    """A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
+    default user space units, defining the region to which the contents of the
+    page should be clipped when output in a production environment."""
+
+    trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX))
+    """A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
+    default user space units, defining the intended dimensions of the finished
+    page after trimming."""
+
+    artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX))
+    """A :class:`RectangleObject<pypdf.generic.RectangleObject>`, expressed in
+    default user space units, defining the extent of the page's meaningful
+    content as intended by the page's creator."""
+
+    @property
+    def annotations(self) -> Optional[ArrayObject]:
+        """
+        The "/Annots" array of the page, or None when the page has no such
+        entry.
+        """
+        if "/Annots" not in self:
+            return None
+        return cast(ArrayObject, self["/Annots"])
+
+    @annotations.setter
+    def annotations(self, value: Optional[ArrayObject]) -> None:
+        """
+        Set the annotations array of the page.
+
+        Typically you do not want to set this value, but append to it.
+        If you append to it, remember to add the object first to the writer
+        and only add the indirect object.
+
+        Assigning None removes the "/Annots" entry; this is a no-op when the
+        page has no annotations.
+        """
+        if value is None:
+            # Guard the deletion: clearing the annotations of a page that
+            # has none must not raise KeyError.
+            if "/Annots" in self:
+                del self[NameObject("/Annots")]
+        else:
+            self[NameObject("/Annots")] = value
+
+
+class _VirtualList(Sequence[PageObject]):
+    """
+    Sequence of pages whose length and items are obtained on demand.
+
+    Nothing is stored in the list itself: the length and every page are
+    fetched through the two callables supplied at construction time.
+    """
+
+    def __init__(
+        self,
+        length_function: Callable[[], int],
+        get_function: Callable[[int], PageObject],
+    ) -> None:
+        """
+        Args:
+            length_function: returns the current number of pages.
+            get_function: returns the page at a given non-negative index.
+        """
+        self.length_function = length_function
+        self.get_function = get_function
+        self.current = -1
+
+    def __len__(self) -> int:
+        return self.length_function()
+
+    @overload
+    def __getitem__(self, index: int) -> PageObject:
+        ...
+
+    @overload
+    def __getitem__(self, index: slice) -> Sequence[PageObject]:
+        ...
+
+    def __getitem__(
+        self, index: Union[int, slice]
+    ) -> Union[PageObject, Sequence[PageObject]]:
+        """
+        Return one page, or for a slice a new virtual list over the selection.
+
+        Raises:
+            TypeError: index is neither an int nor a slice.
+            IndexError: index is outside the sequence.
+        """
+        if isinstance(index, slice):
+            indices = range(*index.indices(len(self)))
+            cls = type(self)
+            # the sub-list stays lazy: it maps its positions onto this list
+            return cls(indices.__len__, lambda idx: self[indices[idx]])
+        if not isinstance(index, int):
+            raise TypeError("sequence indices must be integers")
+        len_self = len(self)
+        if index < 0:
+            # support negative indexes
+            index = len_self + index
+        if index < 0 or index >= len_self:
+            raise IndexError("sequence index out of range")
+        return self.get_function(index)
+
+    def __delitem__(self, index: Union[int, slice]) -> None:
+        """
+        Remove one page, or a slice of pages, from the page tree.
+
+        The page reference is removed from the "/Kids" of its parent and the
+        parent's "/Count" is decremented. A parent left without kids is in
+        turn removed from its own parent.
+
+        Raises:
+            TypeError: index is neither an int nor a slice.
+            IndexError: index is outside the sequence.
+            PdfReadError: the page reference is not listed in "/Kids".
+        """
+        if isinstance(index, slice):
+            # pages have to be deleted from last to first so that the
+            # positions still to be processed are not shifted
+            for p in sorted(range(*index.indices(len(self))), reverse=True):
+                del self[p]  # recursive call
+            return
+        if not isinstance(index, int):
+            raise TypeError("index must be integers")
+        len_self = len(self)
+        if index < 0:
+            # support negative indexes
+            index = len_self + index
+        if index < 0 or index >= len_self:
+            raise IndexError("index out of range")
+        ind = self[index].indirect_reference
+        assert ind is not None
+        parent = cast(DictionaryObject, ind.get_object()).get("/Parent", None)
+        # The flattened page cache must lose exactly one entry per deleted
+        # page. Dropping it on every loop iteration removed a second,
+        # unrelated page whenever an emptied intermediate node was pruned.
+        cache_updated = False
+        while parent is not None:
+            parent = cast(DictionaryObject, parent.get_object())
+            try:
+                i = parent["/Kids"].index(ind)
+                del parent["/Kids"][i]
+                if not cache_updated:
+                    cache_updated = True
+                    try:
+                        assert ind is not None
+                        del ind.pdf.flattened_pages[index]  # case of page in a Reader
+                    except Exception:  # pragma: no cover
+                        # best effort: not every owner has a flattened page list
+                        pass
+                if "/Count" in parent:
+                    parent[NameObject("/Count")] = NumberObject(parent["/Count"] - 1)
+                if len(parent["/Kids"]) == 0:
+                    # No more objects in this part of this sub tree
+                    ind = parent.indirect_reference
+                    parent = cast(DictionaryObject, parent.get("/Parent", None))
+                else:
+                    parent = None
+            except ValueError:  # from index
+                raise PdfReadError(f"Page Not Found in Page Tree {ind}")
+
+    def __iter__(self) -> Iterator[PageObject]:
+        for i in range(len(self)):
+            yield self[i]
+
+    def __str__(self) -> str:
+        p = [f"PageObject({i})" for i in range(self.length_function())]
+        return f"[{', '.join(p)}]"
+
+
+def _get_fonts_walk(
+    obj: DictionaryObject,
+    fnt: Set[str],
+    emb: Set[str],
+) -> Tuple[Set[str], Set[str]]:
+    """
+    Get the set of all fonts and all embedded fonts.
+
+    Args:
+        obj: dictionary to inspect; its "/DR", "/Resources", "/Annots" and
+            "/AP" entries are examined, recursing into XObjects, annotations
+            and normal appearances.
+        fnt: fonts found so far; updated in place
+        emb: embedded fonts found so far; updated in place
+
+    Returns:
+        A tuple (fnt, emb)
+
+    If there is a key called 'BaseFont', that is a font that is used in the document.
+    If there is a key called 'FontName' and another key in the same dictionary object
+    that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
+    embedded.
+
+    We create and add to two sets, fnt = fonts used and emb = fonts embedded.
+    """
+    fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")
+
+    def has_font_file(font: DictionaryObject) -> bool:
+        # True when the font has a descriptor holding one of the FontFile keys
+        if "/FontDescriptor" not in font:
+            return False
+        descriptor = cast(DictionaryObject, font["/FontDescriptor"])
+        return any(x in descriptor for x in fontkeys)
+
+    def is_embedded(font: DictionaryObject) -> bool:
+        # Embedded: "/CharProcs" present, or a font file referenced by the
+        # font itself or by its first descendant font
+        if "/CharProcs" in font or has_font_file(font):
+            return True
+        if "/DescendantFonts" not in font:
+            return False
+        descendant = cast(
+            DictionaryObject,
+            cast(ArrayObject, font["/DescendantFonts"])[0].get_object(),
+        )
+        return has_font_file(descendant)
+
+    def process_font(f: DictionaryObject) -> None:
+        # Record one font dictionary in fnt and, when embedded, in emb
+        f = cast(DictionaryObject, f.get_object())  # to be sure
+        if not isinstance(f, DictionaryObject):
+            return  # malformed entry: nothing to inspect
+        if "/BaseFont" in f:
+            fnt.add(cast(str, f["/BaseFont"]))
+
+        if is_embedded(f):
+            try:
+                emb.add(cast(str, f["/BaseFont"]))
+            except KeyError:
+                emb.add("(" + cast(str, f["/Subtype"]) + ")")
+
+    if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
+        # Iterate the font dictionaries (values); iterating the dictionary
+        # itself yields only the resource names, which carry no font data.
+        for f in cast(
+            DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]
+        ).values():
+            process_font(f)
+    if "/Resources" in obj:
+        if "/Font" in cast(DictionaryObject, obj["/Resources"]):
+            for f in cast(
+                DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/Font"]
+            ).values():
+                process_font(f)
+        if "/XObject" in cast(DictionaryObject, obj["/Resources"]):
+            for x in cast(
+                DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/XObject"]
+            ).values():
+                _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
+    if "/Annots" in obj:
+        for a in cast(ArrayObject, obj["/Annots"]):
+            _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
+    if "/AP" in obj:
+        normal = cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"])
+        if normal.get("/Type") == "/XObject":
+            _get_fonts_walk(normal, fnt, emb)
+        else:
+            # "/N" is treated as a dictionary of appearances: walk its values,
+            # skipping anything that is not a dictionary.
+            for a in normal.values():
+                a = a.get_object()
+                if isinstance(a, DictionaryObject):
+                    _get_fonts_walk(a, fnt, emb)
+    return fnt, emb  # return the sets for each page
+
+
+class _VirtualListImages(Sequence[ImageFile]):
+    """
+    Sequence of images whose identifiers and items are obtained on demand.
+
+    Images can be addressed by position, by slice, or directly by identifier
+    (a string, or a list/tuple of strings).
+    """
+
+    def __init__(
+        self,
+        ids_function: Callable[[], List[Union[str, List[str]]]],
+        get_function: Callable[[Union[str, List[str], Tuple[str]]], ImageFile],
+    ) -> None:
+        """
+        Args:
+            ids_function: returns the identifiers of the available images.
+            get_function: returns the image for a given identifier.
+        """
+        self.current = -1
+        self.get_function = get_function
+        self.ids_function = ids_function
+
+    def __len__(self) -> int:
+        identifiers = self.ids_function()
+        return len(identifiers)
+
+    def keys(self) -> List[Union[str, List[str]]]:
+        """Return the identifiers of the images."""
+        return self.ids_function()
+
+    def items(self) -> List[Tuple[Union[str, List[str]], ImageFile]]:
+        """Return (identifier, image) pairs for all images."""
+        pairs = []
+        for image_id in self.ids_function():
+            pairs.append((image_id, self[image_id]))
+        return pairs
+
+    @overload
+    def __getitem__(self, index: Union[int, str, List[str]]) -> ImageFile:
+        ...
+
+    @overload
+    def __getitem__(self, index: slice) -> Sequence[ImageFile]:
+        ...
+
+    def __getitem__(
+        self, index: Union[int, slice, str, List[str], Tuple[str]]
+    ) -> Union[ImageFile, Sequence[ImageFile]]:
+        """
+        Return the image for a position or identifier, or a new virtual list
+        over the selected identifiers when given a slice.
+
+        Raises:
+            TypeError: index has an unsupported type.
+            IndexError: an integer index is outside the sequence.
+        """
+        known_ids = self.ids_function()
+        if isinstance(index, slice):
+            start, stop, step = index.indices(len(self))
+            subset = [known_ids[pos] for pos in range(start, stop, step)]
+            return type(self)((lambda: subset), self.get_function)
+        if isinstance(index, (str, list, tuple)):
+            # identifiers are handed to the getter unchanged
+            return self.get_function(index)
+        if not isinstance(index, int):
+            raise TypeError("invalid sequence indices type")
+        count = len(known_ids)
+        # support negative indexes
+        position = index + count if index < 0 else index
+        if not 0 <= position < count:
+            raise IndexError("sequence index out of range")
+        return self.get_function(known_ids[position])
+
+    def __iter__(self) -> Iterator[ImageFile]:
+        yield from (self[position] for position in range(len(self)))
+
+    def __str__(self) -> str:
+        labels = ", ".join(
+            f"Image_{i}={n}" for i, n in enumerate(self.ids_function())
+        )
+        return "[" + labels + "]"
author	S. Solomon Darnell	2025-03-28 21:52:21 -0500
committer	S. Solomon Darnell	2025-03-28 21:52:21 -0500
commit	4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree	ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_page.py
parent	cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download	gn-ai-master.tar.gz