diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_page.py | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/_page.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/pypdf/_page.py | 2458 |
1 files changed, 2458 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_page.py b/.venv/lib/python3.12/site-packages/pypdf/_page.py new file mode 100644 index 00000000..63038d9d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/_page.py @@ -0,0 +1,2458 @@ +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
    """
    Look up the page box stored under ``name``, falling back to ``defaults``.

    The resolved rectangle is written back under ``name`` so that the next
    lookup finds a ``RectangleObject`` directly.

    Args:
        self: The dictionary-like object holding the boxes.
        name: Key of the requested box.
        defaults: Keys tried in order when ``name`` is absent; the first one
            holding a value is used.

    Returns:
        The box as a ``RectangleObject``.
    """
    box: Union[None, RectangleObject, IndirectObject] = self.get(name)
    if isinstance(box, RectangleObject):
        return box
    if box is None:
        # Lazily probe the fallback keys and stop at the first hit.
        box = next(
            (found for found in map(self.get, defaults) if found is not None),
            None,
        )
    if isinstance(box, IndirectObject):
        box = self.pdf.get_object(box)
    rectangle = RectangleObject(box)  # type: ignore
    _set_rectangle(self, name, rectangle)
    return rectangle


def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None:
    """Store ``value`` in ``self`` under ``name`` converted to a ``NameObject``."""
    self[NameObject(name)] = value
class Transformation:
    """
    Represent a 2D transformation.

    The transformation between two coordinate systems is represented by a 3-by-3
    transformation matrix with the following form::

        a b 0
        c d 0
        e f 1

    Because a transformation matrix has only six elements that can be changed,
    it is usually specified in PDF as the six-element array [ a b c d e f ].

    Coordinate transformations are expressed as matrix multiplications::

                                 a b 0
        [ x′ y′ 1 ] = [ x y 1 ] × c d 0
                                 e f 1

    All the builder methods (``transform``, ``translate``, ``scale``,
    ``rotate``) return a new ``Transformation``; the instance they are
    called on is left untouched, which is what allows chaining.

    Example:
        >>> from pypdf import Transformation
        >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20)
        >>> page.add_transformation(op)
    """

    # 9.5.4 Coordinate Systems for 3D
    # 4.2.2 Common Transformations
    def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)):
        """
        Args:
            ctm: The six values (a, b, c, d, e, f) of the matrix.
                The default is the identity transformation.
        """
        self.ctm = ctm

    @property
    def matrix(self) -> TransformationMatrixType:
        """
        Return the transformation matrix as a tuple of tuples in the form:

        ((a, b, 0), (c, d, 0), (e, f, 1))
        """
        return (
            (self.ctm[0], self.ctm[1], 0),
            (self.ctm[2], self.ctm[3], 0),
            (self.ctm[4], self.ctm[5], 1),
        )

    @staticmethod
    def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix:
        """
        Compresses the transformation matrix into a tuple of (a, b, c, d, e, f).

        Args:
            matrix: The transformation matrix as a tuple of tuples.

        Returns:
            A tuple representing the transformation matrix as (a, b, c, d, e, f)
        """
        return (
            matrix[0][0],
            matrix[0][1],
            matrix[1][0],
            matrix[1][1],
            matrix[2][0],
            matrix[2][1],
        )

    def transform(self, m: "Transformation") -> "Transformation":
        """
        Apply one transformation to another.

        The result is the matrix product ``self.matrix × m.matrix``.

        Args:
            m: a Transformation to apply.

        Returns:
            A new ``Transformation`` instance

        Example:
            Two independent ways of building a mirror operation:

            >>> from pypdf import Transformation
            >>> # vertical mirror, built directly from its matrix
            >>> op = Transformation((1, 0, 0, -1, 0, height))
            >>> # horizontal mirror, built with transform()
            >>> op = Transformation().transform(Transformation((-1, 0, 0, 1, iwidth, 0)))
            >>> page.add_transformation(op)
        """
        ctm = Transformation.compress(matrix_multiply(self.matrix, m.matrix))
        return Transformation(ctm)

    def translate(self, tx: float = 0, ty: float = 0) -> "Transformation":
        """
        Translate the contents of a page.

        The offsets are added to the ``e`` and ``f`` components of the
        current matrix.

        Args:
            tx: The translation along the x-axis.
            ty: The translation along the y-axis.

        Returns:
            A new ``Transformation`` instance
        """
        m = self.ctm
        return Transformation(ctm=(m[0], m[1], m[2], m[3], m[4] + tx, m[5] + ty))

    def scale(
        self, sx: Optional[float] = None, sy: Optional[float] = None
    ) -> "Transformation":
        """
        Scale the contents of a page towards the origin of the coordinate system.

        Typically, that is the lower-left corner of the page. That can be
        changed by translating the contents / the page boxes.

        Args:
            sx: The scale factor along the x-axis. Defaults to ``sy`` when
                only ``sy`` is given.
            sy: The scale factor along the y-axis. Defaults to ``sx`` when
                only ``sx`` is given.

        Returns:
            A new Transformation instance with the scaled matrix.

        Raises:
            ValueError: if neither ``sx`` nor ``sy`` is given.
        """
        if sx is None and sy is None:
            raise ValueError("Either sx or sy must be specified")
        if sx is None:
            sx = sy
        if sy is None:
            sy = sx
        # Both are set at this point; the asserts only narrow the types.
        assert sx is not None
        assert sy is not None
        op: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1))
        ctm = Transformation.compress(matrix_multiply(self.matrix, op))
        return Transformation(ctm)

    def rotate(self, rotation: float) -> "Transformation":
        """
        Rotate the contents of a page.

        The current matrix is multiplied by
        ``((cos, sin, 0), (-sin, cos, 0), (0, 0, 1))``.

        Args:
            rotation: The angle of rotation in degrees.

        Returns:
            A new ``Transformation`` instance with the rotated matrix.
        """
        rotation = math.radians(rotation)
        op: TransformationMatrixType = (
            (math.cos(rotation), math.sin(rotation), 0),
            (-math.sin(rotation), math.cos(rotation), 0),
            (0, 0, 1),
        )
        ctm = Transformation.compress(matrix_multiply(self.matrix, op))
        return Transformation(ctm)

    def __repr__(self) -> str:
        return f"Transformation(ctm={self.ctm})"

    @overload
    def apply_on(self, pt: List[float], as_object: bool = False) -> List[float]:
        ...

    @overload
    def apply_on(
        self, pt: Tuple[float, float], as_object: bool = False
    ) -> Tuple[float, float]:
        ...

    def apply_on(
        self,
        pt: Union[Tuple[float, float], List[float]],
        as_object: bool = False,
    ) -> Union[Tuple[float, float], List[float]]:
        """
        Apply the transformation matrix on the given point.

        Args:
            pt: A tuple or list representing the point in the form (x, y)
            as_object: If True, the coordinates are returned as
                ``FloatObject`` instead of plain ``float``.

        Returns:
            A tuple or list representing the transformed point in the form (x', y').
            A list is returned when ``pt`` is a list, a tuple otherwise.
        """
        typ = FloatObject if as_object else float
        pt1 = (
            typ(float(pt[0]) * self.ctm[0] + float(pt[1]) * self.ctm[2] + self.ctm[4]),
            typ(float(pt[0]) * self.ctm[1] + float(pt[1]) * self.ctm[3] + self.ctm[5]),
        )
        return list(pt1) if isinstance(pt, list) else pt1
@staticmethod
def create_blank_page(
    pdf: Optional[PdfCommonDocProtocol] = None,
    width: Union[float, Decimal, None] = None,
    height: Union[float, Decimal, None] = None,
) -> "PageObject":
    """
    Build and return an empty page.

    When ``width`` or ``height`` is ``None``, both dimensions are taken
    from the mediabox of the last page of *pdf*.

    Args:
        pdf: PDF file the page is within.
        width: The width of the new page expressed in default user
            space units.
        height: The height of the new page expressed in default user
            space units.

    Returns:
        The new blank page

    Raises:
        PageSizeNotDefinedError: if a dimension is missing and ``pdf`` is
            ``None`` or contains no page
    """
    new_page = PageObject(pdf)

    # Minimal page dictionary (cf PDF Reference 7.7.3.3)
    new_page[NameObject(PG.TYPE)] = NameObject("/Page")
    new_page[NameObject(PG.PARENT)] = NullObject()
    new_page[NameObject(PG.RESOURCES)] = DictionaryObject()

    if width is None or height is None:
        if pdf is None or len(pdf.pages) == 0:
            raise PageSizeNotDefinedError
        reference_box = pdf.pages[len(pdf.pages) - 1].mediabox
        width = reference_box.width
        height = reference_box.height

    new_page[NameObject(PG.MEDIABOX)] = RectangleObject(
        (0, 0, width, height)  # type: ignore
    )
    return new_page
+ """ + images_extracted: List[File] = [] + if RES.XOBJECT not in self[PG.RESOURCES]: # type: ignore + return images_extracted + + x_object = self[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore + for obj in x_object: + if x_object[obj][IA.SUBTYPE] == "/Image": + extension, byte_stream, img = _xobj_to_image(x_object[obj]) + if extension is not None: + filename = f"{obj[1:]}{extension}" + images_extracted.append(File(name=filename, data=byte_stream)) + images_extracted[-1].image = img + images_extracted[-1].indirect_reference = x_object[ + obj + ].indirect_reference + return images_extracted + + def _get_ids_image( + self, + obj: Optional[DictionaryObject] = None, + ancest: Optional[List[str]] = None, + call_stack: Optional[List[Any]] = None, + ) -> List[Union[str, List[str]]]: + if call_stack is None: + call_stack = [] + _i = getattr(obj, "indirect_reference", None) + if _i in call_stack: + return [] + else: + call_stack.append(_i) + if self.inline_images is None: + self.inline_images = self._get_inline_images() + if obj is None: + obj = self + if ancest is None: + ancest = [] + lst: List[Union[str, List[str]]] = [] + if PG.RESOURCES not in obj or RES.XOBJECT not in cast( + DictionaryObject, obj[PG.RESOURCES] + ): + return [] if self.inline_images is None else list(self.inline_images.keys()) + + x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore + for o in x_object: + if not isinstance(x_object[o], StreamObject): + continue + if x_object[o][IA.SUBTYPE] == "/Image": + lst.append(o if len(ancest) == 0 else ancest + [o]) + else: # is a form with possible images inside + lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack)) + assert self.inline_images is not None + lst.extend(list(self.inline_images.keys())) + return lst + + def _get_image( + self, + id: Union[str, List[str], Tuple[str]], + obj: Optional[DictionaryObject] = None, + ) -> ImageFile: + if obj is None: + obj = cast(DictionaryObject, self) + if isinstance(id, 
@property
def images(self) -> List[ImageFile]:
    """
    Read-only property emulating a list of images on a page.

    Get a list of all images on the page. The key can be:
    - A string (for the top object)
    - A tuple (for images within XObject forms)
    - An integer

    Examples:
        reader.pages[0].images[0]        # return first image
        reader.pages[0].images['/I0']    # return image '/I0'
        # return image '/Image1' within '/TP1' Xobject/Form:
        reader.pages[0].images['/TP1','/Image1']
        for img in reader.pages[0].images: # loop within all objects

    images.keys() and images.items() can be used.

    The ImageFile has the following properties:

        `.name` : name of the object
        `.data` : bytes of the object
        `.image`  : PIL Image Object
        `.indirect_reference` : object reference

    and the following methods:
        `.replace(new_image: PIL.Image.Image, **kwargs)` :
            replace the image in the pdf with the new image
            applying the saving parameters indicated (such as quality)

    Example usage:

        reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20)

    Inline images are extracted and named ~0~, ~1~, ..., with the
    indirect_reference set to None.
    """
    # The virtual list resolves ids and images lazily through the two callbacks.
    return _VirtualListImages(self._get_ids_image, self._get_image)  # type: ignore
def _get_inline_images(self) -> Dict[str, ImageFile]:
    """
    Extract the inline images found in the content stream of the page.

    The images are keyed ``~0~``, ``~1~``, ... following their order of
    appearance in the content stream.

    Returns:
        A dictionary mapping each key to its ``ImageFile`` (with
        ``indirect_reference`` set to ``None``); empty when the page has
        no content.

    Raises:
        PdfReadError: if a raw ``BI``, ``ID`` or ``EI`` operator is found
            among the parsed operations.
        KeyError: if an image uses a settings key that is not listed in
            the key table below.
    """
    content = self.get_contents()
    if content is None:
        return {}

    # Abbreviated keys allowed in inline images -> full key names.
    # Full names map to themselves so that both spellings are accepted.
    full_key_names = {
        "/BPC": "/BitsPerComponent",
        "/CS": "/ColorSpace",
        "/D": "/Decode",
        "/DP": "/DecodeParms",
        "/F": "/Filter",
        "/H": "/Height",
        "/W": "/Width",
        "/I": "/Interpolate",
        "/Intent": "/Intent",
        "/IM": "/ImageMask",
        "/BitsPerComponent": "/BitsPerComponent",
        "/ColorSpace": "/ColorSpace",
        "/Decode": "/Decode",
        "/DecodeParms": "/DecodeParms",
        "/Filter": "/Filter",
        "/Height": "/Height",
        "/Width": "/Width",
        "/Interpolate": "/Interpolate",
        "/ImageMask": "/ImageMask",
    }

    imgs_data = []
    for param, ope in content.operations:
        if ope == b"INLINE IMAGE":
            imgs_data.append(
                {"settings": param["settings"], "__streamdata__": param["data"]}
            )
        elif ope in (b"BI", b"EI", b"ID"):  # pragma: no cover
            raise PdfReadError(
                f"{ope} operator met whereas not expected, "
                "please share usecase with pypdf dev team"
            )

    files = {}
    for num, ii in enumerate(imgs_data):
        init = {
            "__streamdata__": ii["__streamdata__"],
            "/Length": len(ii["__streamdata__"]),
        }
        for k, v in ii["settings"].items():
            if k in {"/Length", "/L"}:  # no length is expected
                continue
            # The value is translated before the key, as the translation
            # error message reports the key as found in the stream.
            if isinstance(v, list):
                v = ArrayObject(
                    [self._translate_value_inlineimage(k, x) for x in v]
                )
            else:
                v = self._translate_value_inlineimage(k, v)
            k = NameObject(full_key_names[k])
            if k not in init:  # the first occurrence of a key wins
                init[k] = v
        ii["object"] = EncodedStreamObject.initialize_from_dictionary(init)
        extension, byte_stream, img = _xobj_to_image(ii["object"])
        files[f"~{num}~"] = ImageFile(
            name=f"~{num}~{extension}",
            data=byte_stream,
            image=img,
            indirect_reference=None,
        )
    return files
+ """ + r = -self.rotation # rotation to apply is in the otherway + self.rotation = 0 + mb = RectangleObject(self.mediabox) + trsf = ( + Transformation() + .translate( + -float(mb.left + mb.width / 2), -float(mb.bottom + mb.height / 2) + ) + .rotate(r) + ) + pt1 = trsf.apply_on(mb.lower_left) + pt2 = trsf.apply_on(mb.upper_right) + trsf = trsf.translate(-min(pt1[0], pt2[0]), -min(pt1[1], pt2[1])) + self.add_transformation(trsf, False) + for b in ["/MediaBox", "/CropBox", "/BleedBox", "/TrimBox", "/ArtBox"]: + if b in self: + rr = RectangleObject(self[b]) # type: ignore + pt1 = trsf.apply_on(rr.lower_left) + pt2 = trsf.apply_on(rr.upper_right) + self[NameObject(b)] = RectangleObject( + ( + min(pt1[0], pt2[0]), + min(pt1[1], pt2[1]), + max(pt1[0], pt2[0]), + max(pt1[1], pt2[1]), + ) + ) + + def rotate(self, angle: int) -> "PageObject": + """ + Rotate a page clockwise by increments of 90 degrees. + + Args: + angle: Angle to rotate the page. Must be an increment of 90 deg. + + Returns: + The rotated PageObject + """ + if angle % 90 != 0: + raise ValueError("Rotation angle must be a multiple of 90") + self[NameObject(PG.ROTATE)] = NumberObject(self.rotation + angle) + return self + + def _merge_resources( + self, + res1: DictionaryObject, + res2: DictionaryObject, + resource: Any, + new_res1: bool = True, + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + try: + assert isinstance(self.indirect_reference, IndirectObject) + pdf = self.indirect_reference.pdf + is_pdf_writer = hasattr( + pdf, "_add_object" + ) # ---------- expect isinstance(pdf,PdfWriter) + except (AssertionError, AttributeError): + pdf = None + is_pdf_writer = False + + def compute_unique_key(base_key: str) -> Tuple[str, bool]: + """ + Find a key that either doesn't already exist or has the same value + (indicated by the bool) + + Args: + base_key: An index is added to this to get the computed key + + Returns: + A tuple (computed key, bool) where the boolean indicates + if there is a resource of the given 
computed_key with the same + value. + """ + value = page2res.raw_get(base_key) + # TODO : possible improvement : in case of writer, the indirect_reference + # can not be found because translated : this may be improved + + # try the current key first (e.g. "foo"), but otherwise iterate + # through "foo-0", "foo-1", etc. new_res can contain only finitely + # many keys, thus this'll eventually end, even if it's been crafted + # to be maximally annoying. + computed_key = base_key + idx = 0 + while computed_key in new_res: + if new_res.raw_get(computed_key) == value: + # there's already a resource of this name, with the exact + # same value + return computed_key, True + computed_key = f"{base_key}-{idx}" + idx += 1 + return computed_key, False + + if new_res1: + new_res = DictionaryObject() + new_res.update(res1.get(resource, DictionaryObject()).get_object()) + else: + new_res = cast(DictionaryObject, res1[resource]) + page2res = cast( + DictionaryObject, res2.get(resource, DictionaryObject()).get_object() + ) + rename_res = {} + for key in page2res: + unique_key, same_value = compute_unique_key(key) + newname = NameObject(unique_key) + if key != unique_key: + # we have to use a different name for this + rename_res[key] = newname + + if not same_value: + if is_pdf_writer: + new_res[newname] = page2res.raw_get(key).clone(pdf) + try: + new_res[newname] = new_res[newname].indirect_reference + except AttributeError: + pass + else: + new_res[newname] = page2res.raw_get(key) + lst = sorted(new_res.items()) + new_res.clear() + for el in lst: + new_res[el[0]] = el[1] + return new_res, rename_res + + @staticmethod + def _content_stream_rename( + stream: ContentStream, + rename: Dict[Any, Any], + pdf: Optional[PdfCommonDocProtocol], + ) -> ContentStream: + if not rename: + return stream + stream = ContentStream(stream, pdf) + for operands, _operator in stream.operations: + if isinstance(operands, list): + for i, op in enumerate(operands): + if isinstance(op, NameObject): + 
def replace_contents(
    self, content: Union[None, ContentStream, EncodedStreamObject, ArrayObject]
) -> None:
    """
    Replace the page contents with the new content and nullify old objects.

    Args:
        content: new content; if None delete the content field.
    """
    if not hasattr(self, "indirect_reference") or self.indirect_reference is None:
        # the page is not attached : the content is directly attached.
        self[NameObject(PG.CONTENTS)] = content
        return

    if isinstance(self.get(PG.CONTENTS, None), ArrayObject):
        # Release every stream of the old content array. The object list
        # belongs to the document owning the page, not to the page itself.
        for o in self[PG.CONTENTS]:  # type: ignore[attr-defined]
            try:
                self.indirect_reference.pdf._objects[
                    o.indirect_reference.idnum - 1  # type: ignore
                ] = NullObject()
            except AttributeError:
                # best effort: direct (non-indirect) entry, or a document
                # that does not expose ``_objects``
                pass

    if isinstance(content, ArrayObject):
        # Register each stream of the new array in the owning document.
        for idx, item in enumerate(content):
            content[idx] = self.indirect_reference.pdf._add_object(item)

    if content is None:
        if PG.CONTENTS not in self:
            return
        else:
            assert self.indirect_reference is not None
            assert self[PG.CONTENTS].indirect_reference is not None
            self.indirect_reference.pdf._objects[
                self[PG.CONTENTS].indirect_reference.idnum - 1  # type: ignore
            ] = NullObject()
            del self[PG.CONTENTS]
    elif not hasattr(self.get(PG.CONTENTS, None), "indirect_reference"):
        try:
            self[NameObject(PG.CONTENTS)] = self.indirect_reference.pdf._add_object(
                content
            )
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    else:
        # Reuse the slot of the previous content for the new one.
        content.indirect_reference = self[
            PG.CONTENTS
        ].indirect_reference  # TODO: in a future may required generation management
        try:
            self.indirect_reference.pdf._objects[
                content.indirect_reference.idnum - 1  # type: ignore
            ] = content
        except AttributeError:
            # applies at least for page not in writer
            # as a backup solution, we put content as an object although not in accordance with pdf ref
            # this will be fixed with the _add_object
            self[NameObject(PG.CONTENTS)] = content
    # forces recalculation of inline_images
    self.inline_images = None
Merge the content streams of two pages into one. + + Resource references + (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc + of this page are not altered. The parameter page's content stream will + be added to the end of this page's content stream, meaning that it will + be drawn after, or "on top" of this page. + + Args: + page2: The page to be merged into this one. Should be + an instance of :class:`PageObject<PageObject>`. + over: set the page2 content over page1 if True (default) else under + expand: If True, the current page dimensions will be + expanded to accommodate the dimensions of the page to be merged. + """ + self._merge_page(page2, over=over, expand=expand) + + def _merge_page( + self, + page2: "PageObject", + page2transformation: Optional[Callable[[Any], ContentStream]] = None, + ctm: Optional[CompressedTransformationMatrix] = None, + over: bool = True, + expand: bool = False, + ) -> None: + # First we work on merging the resource dictionaries. This allows us + # to find out what symbols in the content streams we might need to + # rename. 
+ try: + assert isinstance(self.indirect_reference, IndirectObject) + if hasattr( + self.indirect_reference.pdf, "_add_object" + ): # ---------- to detect PdfWriter + return self._merge_page_writer( + page2, page2transformation, ctm, over, expand + ) + except (AssertionError, AttributeError): + pass + + new_resources = DictionaryObject() + rename = {} + try: + original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object()) + except KeyError: + original_resources = DictionaryObject() + try: + page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object()) + except KeyError: + page2resources = DictionaryObject() + new_annots = ArrayObject() + + for page in (self, page2): + if PG.ANNOTS in page: + annots = page[PG.ANNOTS] + if isinstance(annots, ArrayObject): + new_annots.extend(annots) + + for res in ( + RES.EXT_G_STATE, + RES.FONT, + RES.XOBJECT, + RES.COLOR_SPACE, + RES.PATTERN, + RES.SHADING, + RES.PROPERTIES, + ): + new, newrename = self._merge_resources( + original_resources, page2resources, res + ) + if new: + new_resources[NameObject(res)] = new + rename.update(newrename) + + # Combine /ProcSet sets, making sure there's a consistent order + new_resources[NameObject(RES.PROC_SET)] = ArrayObject( + sorted( + set( + original_resources.get(RES.PROC_SET, ArrayObject()).get_object() + ).union( + set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object()) + ) + ) + ) + + new_content_array = ArrayObject() + original_content = self.get_contents() + if original_content is not None: + original_content.isolate_graphics_state() + new_content_array.append(original_content) + + page2content = page2.get_contents() + if page2content is not None: + rect = getattr(page2, MERGE_CROP_BOX) + page2content.operations.insert( + 0, + ( + map( + FloatObject, + [ + rect.left, + rect.bottom, + rect.width, + rect.height, + ], + ), + "re", + ), + ) + page2content.operations.insert(1, ([], "W")) + page2content.operations.insert(2, ([], "n")) + if 
def _merge_page_writer(
    self,
    page2: "PageObject",
    page2transformation: Optional[Callable[[Any], ContentStream]] = None,
    ctm: Optional[CompressedTransformationMatrix] = None,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page, updating this page's objects in place.

    Resources and /ProcSet entries of ``page2`` are merged into this page's
    own /Resources dictionary, the annotations of ``page2`` are cloned into
    the document owning this page, and the two content streams are combined.

    Args:
        page2: The page to be merged into this one.
        page2transformation: Optional callable applied to the content
            stream of ``page2`` before it is merged.
        ctm: Optional 6-element transformation matrix; applied to the
            annotation geometry and used when expanding the mediabox.
        over: Put the ``page2`` content over this page's content if True
            (default), else under it.
        expand: Whether the mediabox should be expanded to also fit
            ``page2``.
    """
    # First we work on merging the resource dictionaries. This allows us
    # to find which symbols in the content streams we might need to
    # rename.
    assert isinstance(self.indirect_reference, IndirectObject)
    pdf = self.indirect_reference.pdf

    rename = {}
    if PG.RESOURCES not in self:
        self[NameObject(PG.RESOURCES)] = DictionaryObject()
    original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object())
    if PG.RESOURCES not in page2:
        page2resources = DictionaryObject()
    else:
        page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object())

    for res in (
        RES.EXT_G_STATE,
        RES.FONT,
        RES.XOBJECT,
        RES.COLOR_SPACE,
        RES.PATTERN,
        RES.SHADING,
        RES.PROPERTIES,
    ):
        if res in page2resources:
            if res not in original_resources:
                original_resources[NameObject(res)] = DictionaryObject()
            _, newrename = self._merge_resources(
                original_resources, page2resources, res, False
            )
            rename.update(newrename)

    # Combine /ProcSet sets.
    if RES.PROC_SET in page2resources:
        if RES.PROC_SET not in original_resources:
            original_resources[NameObject(RES.PROC_SET)] = ArrayObject()
        arr = cast(ArrayObject, original_resources[RES.PROC_SET])
        for x in cast(ArrayObject, page2resources[RES.PROC_SET]):
            if x not in arr:
                arr.append(x)
        arr.sort()

    if PG.ANNOTS in page2:
        if PG.ANNOTS not in self:
            self[NameObject(PG.ANNOTS)] = ArrayObject()
        annots = cast(ArrayObject, self[PG.ANNOTS].get_object())
        trsf = Transformation() if ctm is None else Transformation(ctm)
        for a in cast(ArrayObject, page2[PG.ANNOTS]):
            a = a.get_object()
            aa = a.clone(
                pdf,
                ignore_fields=("/P", "/StructParent", "/Parent"),
                force_duplicate=True,
            )
            r = cast(ArrayObject, a["/Rect"])
            # Transform all four corners: with a rotating or skewing matrix
            # the two diagonal corners alone do not span the bounding box
            # (same approach as _expand_mediabox).
            corners = [
                trsf.apply_on((x, y), True)
                for x in (r[0], r[2])
                for y in (r[1], r[3])
            ]
            xs = [pt[0] for pt in corners]
            ys = [pt[1] for pt in corners]
            aa[NameObject("/Rect")] = ArrayObject(
                (min(xs), min(ys), max(xs), max(ys))
            )
            if "/QuadPoints" in a:
                # /QuadPoints holds 8 x n numbers (n quadrilaterals), so
                # every (x, y) pair is transformed, not just the first four.
                q = cast(ArrayObject, a["/QuadPoints"])
                quad_points = ArrayObject()
                for i in range(0, len(q) - 1, 2):
                    quad_points.extend(trsf.apply_on((q[i], q[i + 1]), True))
                aa[NameObject("/QuadPoints")] = quad_points
            try:
                aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference
            except KeyError:
                pass  # annotation has no popup
            try:
                aa[NameObject("/P")] = self.indirect_reference
                annots.append(aa.indirect_reference)
            except AttributeError:
                pass

    new_content_array = ArrayObject()
    original_content = self.get_contents()
    if original_content is not None:
        original_content.isolate_graphics_state()
        new_content_array.append(original_content)

    page2content = page2.get_contents()
    if page2content is not None:
        # Clip the merged content to the configured box of page2:
        # "re" defines the rectangle, "W" sets it as clipping path and
        # "n" ends the path without painting it.
        rect = getattr(page2, MERGE_CROP_BOX)
        # Materialized as a list: a ``map`` iterator could be consumed
        # only once.
        clip_operands = [
            FloatObject(value)
            for value in (rect.left, rect.bottom, rect.width, rect.height)
        ]
        page2content.operations.insert(0, (clip_operands, "re"))
        page2content.operations.insert(1, ([], "W"))
        page2content.operations.insert(2, ([], "n"))
        if page2transformation is not None:
            page2content = page2transformation(page2content)
        page2content = PageObject._content_stream_rename(
            page2content, rename, self.pdf
        )
        page2content.isolate_graphics_state()
        if over:
            new_content_array.append(page2content)
        else:
            new_content_array.insert(0, page2content)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        self._expand_mediabox(page2, ctm)

    self.replace_contents(new_content_array)
def merge_scaled_page(
    self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False
) -> None:
    """
    Merge ``page2`` into this page after scaling its content stream.

    Behaves like merge_page, except that the stream to be merged is scaled
    uniformly (same factor on both axes) through a transformation matrix.

    Args:
        page2: The page to be merged into this one.
        scale: The scaling factor
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.
    """
    uniform_scaling = Transformation().scale(scale, scale)
    self.merge_transformed_page(page2, uniform_scaling, over, expand)
def merge_translated_page(
    self,
    page2: "PageObject",
    tx: float,
    ty: float,
    over: bool = True,
    expand: bool = False,
) -> None:
    """
    Merge ``page2`` into this page after translating its content stream.

    merge_translated_page is similar to merge_page, but the stream to be
    merged is translated by applying a transformation matrix.

    Args:
        page2: the page to be merged into this one.
        tx: The translation on X axis
        ty: The translation on Y axis
        over: set the page2 content over page1 if True (default) else under
        expand: Whether the page should be expanded to fit the
            dimensions of the page to be merged.
    """
    shift = Transformation().translate(tx, ty)
    self.merge_transformed_page(page2, shift, over, expand)
def scale(self, sx: float, sy: float) -> None:
    """
    Scale a page by the given factors by applying a transformation matrix
    to its content and updating the page size.

    This updates the mediabox, the cropbox, the artbox, the bleedbox, the
    trimbox, the /Rect of the annotations, the /BBox of every viewport and
    the contents of the page.

    Args:
        sx: The scaling factor on horizontal axis.
        sy: The scaling factor on vertical axis.
    """
    self.add_transformation((sx, 0, 0, sy, 0, 0))
    # NOTE(review): the mediabox is deliberately scaled last, presumably
    # because the other boxes fall back to it when they are not set -- the
    # original statement order is kept.
    self.cropbox = self.cropbox.scale(sx, sy)
    self.artbox = self.artbox.scale(sx, sy)
    self.bleedbox = self.bleedbox.scale(sx, sy)
    self.trimbox = self.trimbox.scale(sx, sy)
    self.mediabox = self.mediabox.scale(sx, sy)

    if PG.ANNOTS in self:
        annotations = self[PG.ANNOTS]
        if isinstance(annotations, ArrayObject):
            for annotation in annotations:
                annotation_obj = annotation.get_object()
                if ADA.Rect in annotation_obj:
                    rectangle = annotation_obj[ADA.Rect]
                    if isinstance(rectangle, ArrayObject):
                        # x coordinates use sx, y coordinates use sy
                        rectangle[0] = FloatObject(float(rectangle[0]) * sx)
                        rectangle[1] = FloatObject(float(rectangle[1]) * sy)
                        rectangle[2] = FloatObject(float(rectangle[2]) * sx)
                        rectangle[3] = FloatObject(float(rectangle[3]) * sy)

    if PG.VP in self:
        viewport = self[PG.VP]
        # /VP is normally an array of viewport dictionaries; a bare
        # dictionary is tolerated as before. Every viewport is scaled, not
        # only the first one, and an empty array is simply a no-op.
        if isinstance(viewport, ArrayObject):
            viewports = list(viewport)
        else:
            viewports = [viewport]
        for entry in viewports:
            viewport_dict = entry.get_object()  # entries may be indirect
            bbox = viewport_dict["/BBox"]  # type: ignore
            viewport_dict[NameObject("/BBox")] = RectangleObject(  # type: ignore
                (
                    float(bbox[0]) * sx,
                    float(bbox[1]) * sy,
                    float(bbox[2]) * sx,
                    float(bbox[3]) * sy,
                )
            )
def compress_content_streams(self, level: int = -1) -> None:
    """
    Compress the size of this page by joining all content streams and
    applying a FlateDecode filter.

    However, it is possible that this function will perform no action if
    content stream compression becomes "automatic".

    Args:
        level: Compression level handed to ``flate_encode``.

    Raises:
        ValueError: If the compressed stream can neither be stored through
            the content's own indirect reference nor through a page whose
            owning document provides ``_add_object``.
    """
    stream = self.get_contents()
    if stream is None:
        # nothing to compress
        return
    compressed = stream.flate_encode(level)
    try:
        # Overwrite the stored object in place (object ids are 1-based).
        ref = stream.indirect_reference
        ref.pdf._objects[ref.idnum - 1] = compressed  # type: ignore
    except AttributeError:
        # The content has no usable indirect reference: fall back to
        # replacing the page contents, which needs a writer-like document.
        page_ref = self.indirect_reference
        if page_ref is None or not hasattr(page_ref.pdf, "_add_object"):
            raise ValueError("Page must be part of a PdfWriter")
        self.replace_contents(compressed)
def _extract_text(
    self,
    obj: Any,
    pdf: Any,
    orientations: Tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from ``obj`` by interpreting its content stream operators.

    See extract_text for most arguments.

    Args:
        obj: the object to extract from; /Resources is looked up on it or
            on its /Parent chain.
        pdf: handed to ContentStream when the content has to be parsed.
        content_key: indicate the default key where to extract data
            None = the object itself; this allows reusing the function
            on XObject
            default = PG.CONTENTS

    Returns:
        The extracted text; "" if no /Resources can be found or if
        ``content_key`` is missing from ``obj``.
    """
    text: str = ""  # text of the run currently being built
    output: str = ""  # text already flushed
    rtl_dir: bool = False  # right-to-left
    # font resource name -> tuple returned by build_char_map
    cmaps: Dict[
        str,
        Tuple[
            str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject
        ],
    ] = {}
    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited sometimes so we look to parents
            objr = objr["/Parent"].get_object()
            # if there is no parent, no /Resources will be available
            # => an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # no resources means no text is possible (no font) we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""
    if "/Font" in resources_dict:
        for f in cast(DictionaryObject, resources_dict["/Font"]):
            cmaps[f] = build_char_map(f, space_width, obj)
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ] = (
        "charmap",
        {},
        "NotInitialized",
        None,
    )  # (encoding,CMAP,font resource name,dictionary-object of font)
    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except KeyError:  # it means no content can be extracted(certainly empty page)
        return ""
    # Note: we check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    cm_stack = []  # states saved by "q" and restored by "Q"
    tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    # cm/tm_prev stores the last modified matrices can be an intermediate position
    cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    # memo_cm/tm will be used to store the position at the beginning of building the text
    memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    char_scale = 1.0
    space_scale = 1.0
    _space_width: float = 500.0  # will be set correctly at first Tf
    TL = 0.0
    font_size = 12.0  # init just in case of

    def current_spacewidth() -> float:
        # _space_width divided by 1000
        return _space_width / 1000.0

    def process_operation(operator: bytes, operands: List[Any]) -> None:
        # Apply one operator to the extraction state held in the enclosing
        # scope; operators not listed below are ignored.
        nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
        nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
        nonlocal orientations, rtl_dir, visitor_text, output, text
        global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

        # set by the operators after which crlf_space_check has to run
        check_crlf_space: bool = False
        # Table 5.4 page 405
        if operator == b"BT":
            # reset the text matrix and flush the pending text
            tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
            output += text
            if visitor_text is not None:
                visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
            text = ""
            memo_cm = cm_matrix.copy()
            memo_tm = tm_matrix.copy()
            return None
        elif operator == b"ET":
            output += text
            if visitor_text is not None:
                visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
            text = ""
            memo_cm = cm_matrix.copy()
            memo_tm = tm_matrix.copy()
        # table 4.7 "Graphics state operators", page 219
        # cm_matrix calculation is a reserved for the moment
        elif operator == b"q":
            # save the current state
            cm_stack.append(
                (
                    cm_matrix,
                    cmap,
                    font_size,
                    char_scale,
                    space_scale,
                    _space_width,
                    TL,
                )
            )
        elif operator == b"Q":
            try:
                (
                    cm_matrix,
                    cmap,
                    font_size,
                    char_scale,
                    space_scale,
                    _space_width,
                    TL,
                ) = cm_stack.pop()
            except Exception:
                # nothing could be restored: fall back to the identity matrix
                cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        elif operator == b"cm":
            # flush the pending text before the matrix changes
            output += text
            if visitor_text is not None:
                visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
            text = ""
            cm_matrix = mult(
                [
                    float(operands[0]),
                    float(operands[1]),
                    float(operands[2]),
                    float(operands[3]),
                    float(operands[4]),
                    float(operands[5]),
                ],
                cm_matrix,
            )
            memo_cm = cm_matrix.copy()
            memo_tm = tm_matrix.copy()
        # Table 5.2 page 398
        elif operator == b"Tz":
            char_scale = float(operands[0]) / 100.0
        elif operator == b"Tw":
            space_scale = 1.0 + float(operands[0])
        elif operator == b"TL":
            TL = float(operands[0])
        elif operator == b"Tf":
            # font change: flush the pending text, then switch the char map
            if text != "":
                output += text  # .translate(cmap)
                if visitor_text is not None:
                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
            text = ""
            memo_cm = cm_matrix.copy()
            memo_tm = tm_matrix.copy()
            try:
                # charMapTuple: font_type, float(sp_width / 2), encoding,
                # map_dict, font-dictionary
                charMapTuple = cmaps[operands[0]]
                _space_width = charMapTuple[1]
                # current cmap: encoding, map_dict, font resource name
                # (internal name, not the real font-name),
                # font-dictionary. The font-dictionary describes the font.
                cmap = (
                    charMapTuple[2],
                    charMapTuple[3],
                    operands[0],
                    charMapTuple[4],
                )
            except KeyError:  # font not found
                _space_width = unknown_char_map[1]
                cmap = (
                    unknown_char_map[2],
                    unknown_char_map[3],
                    "???" + operands[0],
                    None,
                )
            try:
                font_size = float(operands[1])
            except Exception:
                pass  # keep previous size
        # Table 5.5 page 406
        elif operator == b"Td":
            check_crlf_space = True
            # A special case is a translating only tm:
            # tm[0..5] = 1 0 0 1 e f,
            # i.e. tm[4] += tx, tm[5] += ty.
            tx = float(operands[0])
            ty = float(operands[1])
            tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
            tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
        elif operator == b"Tm":
            check_crlf_space = True
            tm_matrix = [
                float(operands[0]),
                float(operands[1]),
                float(operands[2]),
                float(operands[3]),
                float(operands[4]),
                float(operands[5]),
            ]
        elif operator == b"T*":
            check_crlf_space = True
            tm_matrix[5] -= TL

        elif operator == b"Tj":
            check_crlf_space = True
            text, rtl_dir = handle_tj(
                text,
                operands,
                cm_matrix,
                tm_matrix,  # text matrix
                cmap,
                orientations,
                output,
                font_size,
                rtl_dir,
                visitor_text,
            )
        else:
            # any other operator is ignored
            return None
        if check_crlf_space:
            try:
                text, output, cm_prev, tm_prev = crlf_space_check(
                    text,
                    (cm_prev, tm_prev),
                    (cm_matrix, tm_matrix),
                    (memo_cm, memo_tm),
                    cmap,
                    orientations,
                    output,
                    font_size,
                    visitor_text,
                    current_spacewidth(),
                )
                if text == "":
                    memo_cm = cm_matrix.copy()
                    memo_tm = tm_matrix.copy()
            except OrientationNotFoundError:
                return None

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, cm_matrix, tm_matrix)
        # multiple operators are defined in here ####
        # (compound operators are expanded into the simple ones handled
        # by process_operation)
        if operator == b"'":
            process_operation(b"T*", [])
            process_operation(b"Tj", operands)
        elif operator == b'"':
            process_operation(b"Tw", [operands[0]])
            process_operation(b"Tc", [operands[1]])
            process_operation(b"T*", [])
            process_operation(b"Tj", operands[2:])
        elif operator == b"TD":
            process_operation(b"TL", [-operands[1]])
            process_operation(b"Td", operands)
        elif operator == b"TJ":
            for op in operands[0]:
                if isinstance(op, (str, bytes)):
                    process_operation(b"Tj", [op])
                # a numeric entry at least as large as _space_width adds a
                # space, unless the text is empty or already ends with one
                if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                    (abs(float(op)) >= _space_width)
                    and (len(text) > 0)
                    and (text[-1] != " ")
                ):
                    process_operation(b"Tj", [" "])
        elif operator == b"Do":
            # flush the pending text and make sure the output ends with a
            # newline before the XObject is processed
            output += text
            if visitor_text is not None:
                visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
            try:
                if output[-1] != "\n":
                    output += "\n"
                    if visitor_text is not None:
                        visitor_text(
                            "\n",
                            memo_cm,
                            memo_tm,
                            cmap[3],
                            font_size,
                        )
            except IndexError:
                # output is still empty
                pass
            try:
                xobj = resources_dict["/XObject"]
                # every XObject that is not an image is searched for text
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    text = self.extract_xform_text(
                        xobj[operands[0]],  # type: ignore
                        orientations,
                        space_width,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            memo_cm,
                            memo_tm,
                            cmap[3],
                            font_size,
                        )
            except Exception:
                logger_warning(
                    f" impossible to decode XFormObject {operands[0]}",
                    __name__,
                )
            finally:
                text = ""
                memo_cm = cm_matrix.copy()
                memo_tm = tm_matrix.copy()

        else:
            process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
    output += text  # just in case of
    if text != "" and visitor_text is not None:
        visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
    return output
+ + Returns: + Dict[str, Font]: dictionary of _layout_mode.Font instances keyed by font name + """ + # Font retrieval logic adapted from pypdf.PageObject._extract_text() + objr: Any = self + fonts: Dict[str, _layout_mode.Font] = {} + while objr is not None: + try: + resources_dict: Any = objr[PG.RESOURCES] + except KeyError: + resources_dict = {} + if "/Font" in resources_dict and self.pdf is not None: + for font_name in resources_dict["/Font"]: + *cmap, font_dict_obj = build_char_map(font_name, 200.0, self) + font_dict = { + k: v.get_object() + if isinstance(v, IndirectObject) + else [_v.get_object() for _v in v] + if isinstance(v, ArrayObject) + else v + for k, v in font_dict_obj.items() + } + # mypy really sucks at unpacking + fonts[font_name] = _layout_mode.Font(*cmap, font_dict) # type: ignore[call-arg,arg-type] + try: + objr = objr["/Parent"].get_object() + except KeyError: + objr = None + + return fonts + + def _layout_mode_text( + self, + space_vertically: bool = True, + scale_weight: float = 1.25, + strip_rotated: bool = True, + debug_path: Optional[Path] = None, + ) -> str: + """ + Get text preserving fidelity to source PDF text layout. + + Args: + space_vertically: include blank lines inferred from y distance + font + height. Defaults to True. + scale_weight: multiplier for string length when calculating weighted + average character width. Defaults to 1.25. + strip_rotated: Removes text that is rotated w.r.t. to the page from + layout mode output. Defaults to True. + debug_path (Path | None): if supplied, must target a directory. + creates the following files with debug information for layout mode + functions if supplied: + - fonts.json: output of self._layout_mode_fonts + - tjs.json: individual text render ops with corresponding transform matrices + - bts.json: text render ops left justified and grouped by BT/ET operators + - bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines) + Defaults to None. 
+ + Returns: + str: multiline string containing page text in a fixed width format that + closely adheres to the rendered layout in the source pdf. + """ + fonts = self._layout_mode_fonts() + if debug_path: # pragma: no cover + import json + + debug_path.joinpath("fonts.json").write_text( + json.dumps( + fonts, indent=2, default=lambda x: getattr(x, "to_dict", str)(x) + ), + "utf-8", + ) + + ops = iter( + ContentStream(self["/Contents"].get_object(), self.pdf, "bytes").operations + ) + bt_groups = _layout_mode.text_show_operations( + ops, fonts, strip_rotated, debug_path + ) + + if not bt_groups: + return "" + + ty_groups = _layout_mode.y_coordinate_groups(bt_groups, debug_path) + + char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight) + + return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically) + + def extract_text( + self, + *args: Any, + orientations: Union[int, Tuple[int, ...]] = (0, 90, 180, 270), + space_width: float = 200.0, + visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, + visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, + visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, + extraction_mode: Literal["plain", "layout"] = "plain", + **kwargs: Any, + ) -> str: + """ + Locate all text drawing commands, in the order they are provided in the + content stream, and extract the text. + + This works well for some PDF files, but poorly for others, depending on + the generator used. This will be refined in the future. + + Do not rely on the order of text coming out of this function, as it + will change if this function is made more sophisticated. + + Arabic and Hebrew are extracted in the correct order. + If required a custom RTL range of characters can be defined; + see function set_custom_rtl. + + Additionally you can provide visitor methods to get informed on all + operations and all text objects. 
+ For example in some PDF files this can be useful to parse tables. + + Args: + orientations: list of orientations extract_text will look for + default = (0, 90, 180, 270) + note: currently only 0 (up),90 (turned left), 180 (upside down), + 270 (turned right) + space_width: force default space width + if not extracted from font (default: 200) + visitor_operand_before: function to be called before processing an operation. + It has four arguments: operator, operand-arguments, + current transformation matrix and text matrix. + visitor_operand_after: function to be called after processing an operation. + It has four arguments: operator, operand-arguments, + current transformation matrix and text matrix. + visitor_text: function to be called when extracting some text at some position. + It has five arguments: text, current transformation matrix, + text matrix, font-dictionary and font-size. + The font-dictionary may be None in case of unknown fonts. + If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". + extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality, + "layout" for experimental layout mode functionality. + NOTE: orientations, space_width, and visitor_* parameters are NOT respected + in "layout" mode. + + kwargs: + layout_mode_space_vertically (bool): include blank lines inferred from + y distance + font height. Defaults to True. + layout_mode_scale_weight (float): multiplier for string length when calculating + weighted average character width. Defaults to 1.25. + layout_mode_strip_rotated (bool): layout mode does not support rotated text. + Set to False to include rotated text anyway. If rotated text is discovered, + layout will be degraded and a warning will result. Defaults to True. + layout_mode_debug_path (Path | None): if supplied, must target a directory. 
def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: Tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract text from an XObject.

    The XObject itself is used as the content to parse (no content key is
    looked up on it).

    Args:
        xform: the XObject to extract the text from
        orientations: orientations of the text to look for; the default
            (0, 90, 180, 270) is the same as for extract_text
        space_width: force default space width (if not extracted from font (default 200)
        visitor_operand_before: see extract_text
        visitor_operand_after: see extract_text
        visitor_text: see extract_text

    Returns:
        The extracted text
    """
    # The default used to be (0, 90, 270, 360): it skipped upside-down
    # text (180) and contained 360, which extract_text does not list
    # among the supported orientations.
    return self._extract_text(
        xform,
        self.pdf,
        orientations,
        space_width,
        None,
        visitor_operand_before,
        visitor_operand_after,
        visitor_text,
    )
class _VirtualList(Sequence[PageObject]):
    """
    Sequence of pages whose length and items are provided by callables.

    Nothing is stored in the list itself: ``length_function`` is called for
    every ``len()`` and ``get_function`` for every item access.
    """

    def __init__(
        self,
        length_function: Callable[[], int],
        get_function: Callable[[int], PageObject],
    ) -> None:
        self.length_function = length_function
        self.get_function = get_function
        self.current = -1

    def __len__(self) -> int:
        return self.length_function()

    @overload
    def __getitem__(self, index: int) -> PageObject:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[PageObject]:
        ...

    def __getitem__(
        self, index: Union[int, slice]
    ) -> Union[PageObject, Sequence[PageObject]]:
        """
        Return the page at ``index``, or a new virtual list for a slice.

        Raises:
            TypeError: ``index`` is neither an int nor a slice.
            IndexError: ``index`` is out of range.
        """
        if isinstance(index, slice):
            indices = range(*index.indices(len(self)))
            cls = type(self)
            # the slice is itself lazy: it maps its indexes onto this list
            return cls(indices.__len__, lambda idx: self[indices[idx]])
        if not isinstance(index, int):
            raise TypeError("sequence indices must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index = len_self + index
        if index < 0 or index >= len_self:
            raise IndexError("sequence index out of range")
        return self.get_function(index)

    def __delitem__(self, index: Union[int, slice]) -> None:
        """
        Remove the page(s) at ``index`` from the page tree.

        The page is removed from the /Kids of its parent; nodes left without
        kids are removed from their own parent as well, and /Count is
        decremented on every ancestor.

        Raises:
            TypeError: ``index`` is neither an int nor a slice.
            IndexError: ``index`` is out of range.
            PdfReadError: the page is not listed in the /Kids of its parent.
        """
        if isinstance(index, slice):
            # pages have to be deleted from last to first
            for p in sorted(range(*index.indices(len(self))), reverse=True):
                del self[p]  # recursive call
            return
        if not isinstance(index, int):
            raise TypeError("index must be integers")
        len_self = len(self)
        if index < 0:
            # support negative indexes
            index = len_self + index
        if index < 0 or index >= len_self:
            raise IndexError("index out of range")
        ind = self[index].indirect_reference
        assert ind is not None
        page_ref = ind
        parent = cast(DictionaryObject, ind.get_object()).get("/Parent", None)
        # True while `ind` still has to be removed from its parent's /Kids
        prune = True
        is_page_level = True
        while parent is not None:
            parent = cast(DictionaryObject, parent.get_object())
            if prune:
                try:
                    i = parent["/Kids"].index(ind)
                except ValueError as exc:  # from index
                    raise PdfReadError(
                        f"Page Not Found in Page Tree {ind}"
                    ) from exc
                del parent["/Kids"][i]
                if is_page_level:
                    # Only once, for the page itself: doing it again while
                    # pruning emptied ancestors would drop unrelated pages.
                    try:
                        # case of page in a Reader
                        del page_ref.pdf.flattened_pages[index]
                    except Exception:  # pragma: no cover
                        pass
                    is_page_level = False
            # /Count is the number of leaf pages below a node, so each
            # ancestor loses exactly one page.
            if "/Count" in parent:
                parent[NameObject("/Count")] = NumberObject(parent["/Count"] - 1)
            if prune and len(parent["/Kids"]) == 0:
                # No more objects in this part of this sub tree
                ind = parent.indirect_reference
            else:
                prune = False
            parent = parent.get("/Parent", None)

    def __iter__(self) -> Iterator[PageObject]:
        for i in range(len(self)):
            yield self[i]

    def __str__(self) -> str:
        p = [f"PageObject({i})" for i in range(self.length_function())]
        return f"[{', '.join(p)}]"
).values(): + process_font(f) + if "/XObject" in cast(DictionaryObject, obj["/Resources"]): + for x in cast( + DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/XObject"] + ).values(): + _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb) + if "/Annots" in obj: + for a in cast(ArrayObject, obj["/Annots"]): + _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb) + if "/AP" in obj: + if ( + cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]).get( + "/Type" + ) + == "/XObject" + ): + _get_fonts_walk( + cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]), + fnt, + emb, + ) + else: + for a in cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]): + _get_fonts_walk(cast(DictionaryObject, a), fnt, emb) + return fnt, emb # return the sets for each page + + +class _VirtualListImages(Sequence[ImageFile]): + def __init__( + self, + ids_function: Callable[[], List[Union[str, List[str]]]], + get_function: Callable[[Union[str, List[str], Tuple[str]]], ImageFile], + ) -> None: + self.ids_function = ids_function + self.get_function = get_function + self.current = -1 + + def __len__(self) -> int: + return len(self.ids_function()) + + def keys(self) -> List[Union[str, List[str]]]: + return self.ids_function() + + def items(self) -> List[Tuple[Union[str, List[str]], ImageFile]]: + return [(x, self[x]) for x in self.ids_function()] + + @overload + def __getitem__(self, index: Union[int, str, List[str]]) -> ImageFile: + ... + + @overload + def __getitem__(self, index: slice) -> Sequence[ImageFile]: + ... 
+ + def __getitem__( + self, index: Union[int, slice, str, List[str], Tuple[str]] + ) -> Union[ImageFile, Sequence[ImageFile]]: + lst = self.ids_function() + if isinstance(index, slice): + indices = range(*index.indices(len(self))) + lst = [lst[x] for x in indices] + cls = type(self) + return cls((lambda: lst), self.get_function) + if isinstance(index, (str, list, tuple)): + return self.get_function(index) + if not isinstance(index, int): + raise TypeError("invalid sequence indices type") + len_self = len(lst) + if index < 0: + # support negative indexes + index = len_self + index + if index < 0 or index >= len_self: + raise IndexError("sequence index out of range") + return self.get_function(lst[index]) + + def __iter__(self) -> Iterator[ImageFile]: + for i in range(len(self)): + yield self[i] + + def __str__(self) -> str: + p = [f"Image_{i}={n}" for i, n in enumerate(self.ids_function())] + return f"[{', '.join(p)}]" |