diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/PyPDF2/_writer.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/PyPDF2/_writer.py | 2822 |
1 files changed, 2822 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/PyPDF2/_writer.py b/.venv/lib/python3.12/site-packages/PyPDF2/_writer.py new file mode 100644 index 00000000..b2e92cdb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/PyPDF2/_writer.py @@ -0,0 +1,2822 @@ +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +import codecs +import collections +import decimal +import logging +import random +import re +import struct +import time +import uuid +import warnings +from hashlib import md5 +from io import BytesIO, FileIO, IOBase +from pathlib import Path +from types import TracebackType +from typing import ( + IO, + Any, + Callable, + Deque, + Dict, + Iterable, + List, + Optional, + Pattern, + Tuple, + Type, + Union, + cast, +) + +from ._encryption import Encryption +from ._page import PageObject, _VirtualList +from ._reader import PdfReader +from ._security import _alg33, _alg34, _alg35 +from ._utils import ( + StrByteType, + StreamType, + _get_max_pdf_version_header, + b_, + deprecate_with_replacement, + deprecation_bookmark, + deprecation_with_replacement, + logger_warning, +) +from .constants import AnnotationDictionaryAttributes +from .constants import CatalogAttributes as CA +from .constants import CatalogDictionary +from .constants import Core as CO +from .constants import EncryptionDictAttributes as ED +from .constants import ( + FieldDictionaryAttributes, + FieldFlag, + FileSpecificationDictionaryEntries, + GoToActionArguments, + InteractiveFormDictEntries, +) +from .constants import PageAttributes as PG +from .constants import PagesAttributes as PA +from .constants import StreamAttributes as SA +from .constants import TrailerKeys as TK +from .constants import TypFitArguments, UserAccessPermissions +from .generic import ( + PAGE_FIT, + AnnotationBuilder, + ArrayObject, + BooleanObject, + ByteStringObject, + ContentStream, + DecodedStreamObject, + Destination, + DictionaryObject, + Fit, + FloatObject, + IndirectObject, + NameObject, + NullObject, + NumberObject, + PdfObject, + RectangleObject, + StreamObject, + TextStringObject, + TreeObject, + create_string_object, + hex_to_rgb, +) +from .pagerange import PageRange, PageRangeSpec +from .types import ( + BorderArrayType, + FitType, + LayoutType, + OutlineItemType, + OutlineType, + PagemodeType, + ZoomArgType, +) + 
def __init__(self, fileobj: StrByteType = "") -> None:
    """
    Create an empty document skeleton: page tree root, info dictionary
    and catalog.

    The three dictionaries are registered through ``_add_object`` in a
    fixed order (page tree, info, catalog). Object numbers are handed out
    in registration order, so the order of the statements below matters.

    :param fileobj: Stored unchanged on ``self.fileobj``; ``__exit__``
        passes it to :meth:`write` when it is truthy. The default empty
        string therefore disables writing on context-manager exit.
    """
    # Header written at the top of the output; may later be raised through
    # the ``pdf_header`` property.
    self._header = b"%PDF-1.3"
    self._objects: List[PdfObject] = []  # array of indirect objects
    self._idnum_hash: Dict[bytes, IndirectObject] = {}
    # Two-level int mapping; _add_page indexes it by id() of a source pdf
    # and then by an object number of that pdf.
    self._id_translated: Dict[int, Dict[int, int]] = {}

    # The root of our page tree node: starts with no kids and a count of 0.
    pages = DictionaryObject()
    pages.update(
        {
            NameObject(PA.TYPE): NameObject("/Pages"),
            NameObject(PA.COUNT): NumberObject(0),
            NameObject(PA.KIDS): ArrayObject(),
        }
    )
    self._pages = self._add_object(pages)

    # info object: only /Producer is preset, as BOM-prefixed UTF-16BE bytes
    info = DictionaryObject()
    info.update(
        {
            NameObject("/Producer"): create_string_object(
                codecs.BOM_UTF16_BE + "PyPDF2".encode("utf-16be")
            )
        }
    )
    self._info = self._add_object(info)

    # root object (the catalog), pointing at the page tree created above
    self._root_object = DictionaryObject()
    self._root_object.update(
        {
            NameObject(PA.TYPE): NameObject(CO.CATALOG),
            NameObject(CO.PAGES): self._pages,
        }
    )
    self._root = self._add_object(self._root_object)
    self.fileobj = fileobj
    # Flipped to True by __enter__; write() consults it to decide whether
    # to close the stream it wrote to.
    self.with_as_usage = False
def get_object(
    self,
    indirect_reference: Union[None, int, IndirectObject] = None,
    ido: Optional[IndirectObject] = None,
) -> PdfObject:
    """
    Return the object stored in this writer for a reference.

    :param indirect_reference: Either a 1-based object number or a
        reference object exposing ``pdf`` and ``idnum``.
    :param ido: Deprecated alias of *indirect_reference*; using it emits a
        ``DeprecationWarning``.
    :return: The entry of ``self._objects`` at position ``number - 1``.
    :raises ValueError: If both arguments are supplied, or if the
        reference belongs to a different pdf than this writer.
    """
    used_legacy_name = ido is not None
    if used_legacy_name and indirect_reference is not None:
        raise ValueError(
            "Please only set 'indirect_reference'. The 'ido' argument is deprecated."
        )
    if used_legacy_name:
        indirect_reference = ido
        warnings.warn(
            "The parameter 'ido' is depreciated and will be removed in PyPDF2 4.0.0.",
            DeprecationWarning,
        )
    # the None default exists only so the deprecated keyword can be used alone
    assert indirect_reference is not None

    if isinstance(indirect_reference, int):
        object_number = indirect_reference
    else:
        if indirect_reference.pdf != self:
            raise ValueError("pdf must be self")
        object_number = indirect_reference.idnum  # type: ignore
    # object numbers are 1-based, the backing list is 0-based
    return self._objects[object_number - 1]
def _add_page(
    self,
    page: PageObject,
    action: Callable[[Any, IndirectObject], None],
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Clone *page* into this writer and hook the clone into the page tree.

    :param page: Page to add; its type entry must equal ``CO.PAGE``.
    :param action: Callable receiving the page tree's kids array and the
        clone's indirect reference; it decides where the reference goes
        (``add_page`` passes ``list.append``, ``insert_page`` an inserting
        lambda).
    :param excluded_keys: Keys handed to ``page.clone`` in addition to the
        parent key and "/StructParents", which are always appended.
    :return: The cloned page, not the page that was passed in.
    """
    assert cast(str, page[PA.TYPE]) == CO.PAGE
    page_org = page
    # Copy into a fresh list so the caller's iterable is never mutated.
    excluded_keys = list(excluded_keys)
    excluded_keys += [PA.PARENT, "/StructParents"]
    # Acrobat does not accept two indirect references pointing at the same page;
    # so, to allow adding several copies of one page easily, a new dictionary is
    # created for the page, while the objects below it (including content) are
    # not duplicated.
    try:  # delete an already existing page
        del self._id_translated[id(page_org.indirect_reference.pdf)][  # type: ignore
            page_org.indirect_reference.idnum  # type: ignore
        ]
    except Exception:
        # Best effort: the page may have no indirect reference or may never
        # have been recorded in the translation table.
        pass
    page = cast("PageObject", page_org.clone(self, False, excluded_keys))
    # page_ind = self._add_object(page)
    if page_org.pdf is not None:
        # Combine this writer's header with the source document's header
        # (presumably keeping the higher version; confirm in
        # _get_max_pdf_version_header).
        other = page_org.pdf.pdf_header
        if isinstance(other, str):
            other = other.encode()  # type: ignore
        self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)  # type: ignore
    page[NameObject(PA.PARENT)] = self._pages
    pages = cast(DictionaryObject, self.get_object(self._pages))
    assert page.indirect_reference is not None
    # Let the caller-supplied action place the reference in the kids array.
    action(pages[PA.KIDS], page.indirect_reference)
    page_count = cast(int, pages[PA.COUNT])
    pages[NameObject(PA.COUNT)] = NumberObject(page_count + 1)
    return page
def insert_page(
    self,
    page: PageObject,
    index: int = 0,
    excluded_keys: Iterable[str] = (),
) -> PageObject:
    """
    Insert a page in this PDF file. The page is usually acquired from a
    :class:`PdfReader<PyPDF2.PdfReader>` instance.

    :param PageObject page: The page to add to the document.
    :param int index: Position at which the page will be inserted.
    :param excluded_keys: Keys forwarded to ``_add_page``, exactly as
        :meth:`add_page` forwards them. They were previously accepted but
        silently dropped.
    :return: The page object returned by ``_add_page``.
    """
    return self._add_page(
        page,
        lambda kids, new_page: kids.insert(index, new_page),
        excluded_keys,
    )
def _get_num_pages(self) -> int:
    """Return the "/Count" entry of the root page tree node as an ``int``."""
    page_tree = cast(Dict[str, Any], self.get_object(self._pages))
    count_entry = page_tree[NameObject("/Count")]
    return int(count_entry)
def insert_blank_page(
    self,
    width: Optional[decimal.Decimal] = None,
    height: Optional[decimal.Decimal] = None,
    index: int = 0,
) -> PageObject:
    """
    Insert a blank page to this PDF file and returns it.

    If *width* or *height* is missing and a page currently exists at
    *index*, both dimensions are taken from that page's mediabox.
    Otherwise the values are passed on unchanged and
    ``PageObject.create_blank_page`` decides how to handle a missing size.

    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default
        user space units.
    :param int index: Position to add the page.
    :return: the newly inserted page
    :raises PageSizeNotDefinedError: if width and height are not defined
        and previous page does not exist (raised by
        ``create_blank_page`` according to the original documentation;
        not verifiable from this method alone).
    """
    # The size check must be grouped before the range check. Without the
    # parentheses, ``a or b and c`` parses as ``a or (b and c)``, so a
    # missing width skipped the range check and indexed self.pages with an
    # out-of-range index.
    size_missing = width is None or height is None
    if size_missing and index <= self._get_num_pages() - 1:
        oldpage = self.pages[index]
        width = oldpage.mediabox.width
        height = oldpage.mediabox.height
    page = PageObject.create_blank_page(self, width, height)
    self.insert_page(page, index)
    return page
@property
def open_destination(
    self,
) -> Union[None, Destination, TextStringObject, ByteStringObject]:
    """
    Opening destination of the document ("/OpenAction" entry of the
    catalog).

    Reading returns ``None`` when the entry is absent or holds a value
    that is neither a string nor an array. A string value is returned as
    a string object (a named destination); an array value is returned as
    a ``Destination`` built from its page, fit type and fit arguments.

    The property can be set to a ``Destination``, a page, a string (named
    destination), or ``None`` to remove "/OpenAction".
    """
    catalog = self._root_object
    if "/OpenAction" not in catalog:
        return None
    oa = catalog["/OpenAction"]
    if isinstance(oa, (str, bytes)):
        return create_string_object(str(oa))
    if not isinstance(oa, ArrayObject):
        return None
    try:
        # layout: [page, fit type, *fit arguments]
        page, typ = oa[0:2]  # type: ignore
        fit = Fit(typ, tuple(oa[2:]))
        return Destination("OpenAction", page, fit)
    except Exception as exc:
        raise Exception(f"Invalid Destination {oa}: {exc}")


@open_destination.setter
def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
    if dest is None:
        # Removing an entry that is not there is not an error.
        try:
            del self._root_object["/OpenAction"]
        except KeyError:
            pass
        return

    if isinstance(dest, str):
        new_value = TextStringObject(dest)
    elif isinstance(dest, Destination):
        new_value = dest.dest_array
    elif isinstance(dest, PageObject):
        page_ref = dest.indirect_reference
        if page_ref is None:
            page_ref = NullObject()
        new_value = Destination("Opening", page_ref, PAGE_FIT).dest_array
    else:
        # Unsupported types are ignored, leaving the catalog untouched.
        return
    self._root_object[NameObject("/OpenAction")] = new_value
def add_attachment(self, filename: str, data: Union[str, bytes]) -> None:
    """
    Embed a file inside the PDF.

    The attachment is appended to the catalog's
    ``/Names -> /EmbeddedFiles -> /Names`` array. Existing entries of the
    catalog's ``/Names`` dictionary (for example the "/JavaScript" entry
    created by :meth:`add_js`, or earlier attachments) are kept; they were
    previously overwritten on every call.

    :param str filename: The filename to display.
    :param data: The data in the file (``str`` or ``bytes``).

    Reference:
    https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
    Section 7.11.3
    """
    # We need three entries:
    # * The file's data
    # * The /Filespec entry
    # * The file's name, which goes in the Catalog

    # The entry for the file
    # Sample:
    # 8 0 obj
    # <<
    #  /Length 12
    #  /Type /EmbeddedFile
    # >>
    # stream
    # Hello world!
    # endstream
    # endobj

    file_entry = DecodedStreamObject()
    file_entry.set_data(data)
    file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")})

    # The Filespec entry
    # Sample:
    # 7 0 obj
    # <<
    #  /Type /Filespec
    #  /F (hello.txt)
    #  /EF << /F 8 0 R >>
    # >>

    ef_entry = DictionaryObject()
    ef_entry.update({NameObject("/F"): file_entry})

    filespec = DictionaryObject()
    filespec.update(
        {
            NameObject(PA.TYPE): NameObject("/Filespec"),
            NameObject(FileSpecificationDictionaryEntries.F): create_string_object(
                filename
            ),  # Perhaps also try TextStringObject
            NameObject(FileSpecificationDictionaryEntries.EF): ef_entry,
        }
    )

    # Then register the Filespec in the root, which needs a reference to it
    # Sample:
    # 1 0 obj
    # <<
    #  /Type /Catalog
    #  /Outlines 2 0 R
    #  /Pages 3 0 R
    #  /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >>
    # >>
    # endobj

    # Reuse the catalog's /Names dictionary when present, as add_js does,
    # instead of replacing it.
    if CA.NAMES not in self._root_object:
        self._root_object[NameObject(CA.NAMES)] = DictionaryObject()
    names = cast(DictionaryObject, self._root_object[CA.NAMES])

    if "/EmbeddedFiles" not in names:
        names[NameObject("/EmbeddedFiles")] = DictionaryObject(
            {NameObject(CA.NAMES): ArrayObject()}
        )
    embedded_files = cast(DictionaryObject, names["/EmbeddedFiles"])

    # NOTE(review): an existing /EmbeddedFiles node organised with /Kids
    # only gets a flat /Names array added here; confirm whether such
    # inputs need real name-tree handling.
    if CA.NAMES not in embedded_files:
        embedded_files[NameObject(CA.NAMES)] = ArrayObject()
    name_array = cast(ArrayObject, embedded_files[CA.NAMES])

    # The array holds alternating (name, filespec) pairs.
    name_array.append(create_string_object(filename))
    name_array.append(filespec)
def update_page_form_field_values(
    self,
    page: PageObject,
    fields: Dict[str, Any],
    flags: FieldFlag = OPTIONAL_READ_WRITE_FIELD,
) -> None:
    """
    Write the values in *fields* into the matching form fields of *page*.

    Each annotation of the page is compared by its field name with every
    key in *fields*. On a match the annotation receives the value (and,
    for "/Btn" fields, the appearance state). If the annotation does not
    match but its parent does, the value is stored on the parent instead.

    :param PageObject page: Page of this writer whose annotations and
        field data will be updated.
    :param dict fields: a Python dictionary of field names (/T) and text
        values (/V)
    :param int flags: An integer (0 to 7). The first bit sets ReadOnly, the
        second bit sets Required, the third bit sets NoExport. See
        PDF Reference Table 8.70 for details. Only written when truthy.
    """
    self.set_need_appearances_writer()
    if PG.ANNOTS not in page:
        logger_warning("No fields to update on this page", __name__)
        return

    for annot_ref in page[PG.ANNOTS]:  # type: ignore
        annotation = annot_ref.get_object()
        # An empty mapping stands in for a missing parent so that the
        # lookup in the elif branch is always valid.
        parent = annotation[PG.PARENT] if PG.PARENT in annotation else {}
        for name, value in fields.items():
            if annotation.get(FieldDictionaryAttributes.T) == name:
                if annotation.get(FieldDictionaryAttributes.FT) == "/Btn":
                    annotation.update(
                        {NameObject(AnnotationDictionaryAttributes.AS): NameObject(value)}
                    )
                annotation.update(
                    {NameObject(FieldDictionaryAttributes.V): TextStringObject(value)}
                )
                if flags:
                    annotation.update(
                        {NameObject(FieldDictionaryAttributes.Ff): NumberObject(flags)}
                    )
            elif parent.get(FieldDictionaryAttributes.T) == name:
                parent.update(
                    {NameObject(FieldDictionaryAttributes.V): TextStringObject(value)}
                )
def encrypt(
    self,
    user_password: Optional[str] = None,
    owner_password: Optional[str] = None,
    use_128bit: bool = True,
    permissions_flag: UserAccessPermissions = ALL_DOCUMENT_PERMISSIONS,
    user_pwd: Optional[str] = None,  # deprecated
    owner_pwd: Optional[str] = None,  # deprecated
) -> None:
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    Stores the file identifier on ``self._ID``, registers the encryption
    dictionary as ``self._encrypt`` and keeps the derived key on
    ``self._encrypt_key``.

    :param str user_password: The "user password", which allows for opening
        and reading the PDF file with the restrictions provided.
    :param str owner_password: The "owner password", which allows for
        opening the PDF files without any restrictions. By default,
        the owner password is the same as the user password.
    :param bool use_128bit: flag as to whether to use 128bit
        encryption. When false, 40bit encryption will be used. By default,
        this flag is on.
    :param unsigned int permissions_flag: permissions as described in
        TABLE 3.20 of the PDF 1.7 specification. A bit value of 1 means the
        permission is granted. Hence an integer value of -1 will set all
        flags.
        Bit position 3 is for printing, 4 is for modifying content, 5 and 6
        control annotations, 9 for form fields, 10 for extraction of
        text and graphics.
    :param str user_pwd: Deprecated alias of *user_password*.
    :param str owner_pwd: Deprecated alias of *owner_password*.
    :raises ValueError: If a deprecated alias is combined with its
        replacement, or if no user password is given at all.
    """
    if user_pwd is not None:  # deprecated
        if user_password is not None:
            raise ValueError(
                "Please only set 'user_password'. "
                "The 'user_pwd' argument is deprecated."
            )
        # Same category as the owner_pwd branch below; this call used to
        # fall back to the default UserWarning.
        warnings.warn(
            "Please use 'user_password' instead of 'user_pwd'. "
            "The 'user_pwd' argument is deprecated and "
            "will be removed in PyPDF2 4.0.0.",
            category=DeprecationWarning,
            stacklevel=2,
        )
        user_password = user_pwd
    if user_password is None:
        # user_password is only Optional because of the deprecated user_pwd
        raise ValueError("user_password may not be None")

    if owner_pwd is not None:  # deprecated
        if owner_password is not None:
            raise ValueError(
                "The argument owner_pwd of encrypt is deprecated. Use owner_password only."
            )
        old_term = "owner_pwd"
        new_term = "owner_password"
        warnings.warn(
            message=(
                f"{old_term} is deprecated as an argument and will be "
                f"removed in PyPDF2 4.0.0. Use {new_term} instead"
            ),
            category=DeprecationWarning,
            stacklevel=2,
        )
        owner_password = owner_pwd

    if owner_password is None:
        owner_password = user_password

    # /V, revision and key length (in bytes) go together.
    if use_128bit:
        V = 2
        rev = 3
        keylen = 128 // 8
    else:
        V = 1
        rev = 2
        keylen = 40 // 8
    P = permissions_flag
    O = ByteStringObject(_alg33(owner_password, user_password, rev, keylen))  # type: ignore[arg-type]
    # Two-part file identifier derived from the current time and a random
    # number; its first part also feeds the key derivation below.
    ID_1 = ByteStringObject(md5((repr(time.time())).encode("utf8")).digest())
    ID_2 = ByteStringObject(md5((repr(random.random())).encode("utf8")).digest())
    self._ID = ArrayObject((ID_1, ID_2))
    if rev == 2:
        U, key = _alg34(user_password, O, P, ID_1)
    else:
        assert rev == 3
        U, key = _alg35(user_password, rev, keylen, O, P, ID_1, False)  # type: ignore[arg-type]

    encrypt = DictionaryObject()
    encrypt[NameObject(SA.FILTER)] = NameObject("/Standard")
    encrypt[NameObject("/V")] = NumberObject(V)
    if V == 2:
        # the length entry is expressed in bits
        encrypt[NameObject(SA.LENGTH)] = NumberObject(keylen * 8)
    encrypt[NameObject(ED.R)] = NumberObject(rev)
    encrypt[NameObject(ED.O)] = ByteStringObject(O)
    encrypt[NameObject(ED.U)] = ByteStringObject(U)
    encrypt[NameObject(ED.P)] = NumberObject(P)
    self._encrypt = self._add_object(encrypt)
    self._encrypt_key = key
def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]:
    """
    Write the collection of pages added to this object out as a PDF file.

    :param stream: An object to write the file to. The object can support
        the write method and the tell method, similar to a file object, or
        be a file path (``str`` or ``Path``), just like ``fileobj``.
    :return: ``(my_file, stream)``. ``my_file`` is True when *stream* was a
        path and the file was opened, and closed again, by this method.
        ``stream`` is the object that was written to.
    :raises ValueError: If *stream* is the empty string.
    """
    if stream == "":
        raise ValueError(f"Output(stream={stream}) is empty.")

    if isinstance(stream, (str, Path)):
        # We own this file, so it is always closed, even when writing
        # fails. ``self.with_as_usage`` is deliberately left untouched:
        # setting it here made every later write() close streams owned by
        # the caller.
        with FileIO(stream, "wb") as own_stream:
            self.write_stream(own_stream)
        return True, own_stream

    self.write_stream(stream)

    # A caller-supplied stream is closed only while the writer is used as
    # a context manager (flag set by __enter__), as before.
    if self.with_as_usage:
        stream.close()

    return False, stream
def add_metadata(self, infos: Dict[str, Any]) -> None:
    """
    Add custom metadata to the output.

    Every key of ``infos`` is wrapped in a ``NameObject`` and every value
    is converted with ``create_string_object``; the result is merged into
    the object referenced by ``self._info``.

    :param dict infos: a Python dictionary where each key is a field
        and each value is your new metadata.
    """
    pdf_entries = {}
    for field in infos:
        text = create_string_object(infos[field])
        pdf_entries[NameObject(field)] = text
    info_dictionary = self.get_object(self._info)
    info_dictionary.update(pdf_entries)  # type: ignore
+ """ + deprecation_with_replacement("addMetadata", "add_metadata", "3.0.0") + self.add_metadata(infos) + + def _sweep_indirect_references( + self, + root: Union[ + ArrayObject, + BooleanObject, + DictionaryObject, + FloatObject, + IndirectObject, + NameObject, + PdfObject, + NumberObject, + TextStringObject, + NullObject, + ], + ) -> None: + stack: Deque[ + Tuple[ + Any, + Optional[Any], + Any, + List[PdfObject], + ] + ] = collections.deque() + discovered = [] + parent = None + grant_parents: List[PdfObject] = [] + key_or_id = None + + # Start from root + stack.append((root, parent, key_or_id, grant_parents)) + + while len(stack): + data, parent, key_or_id, grant_parents = stack.pop() + + # Build stack for a processing depth-first + if isinstance(data, (ArrayObject, DictionaryObject)): + for key, value in data.items(): + stack.append( + ( + value, + data, + key, + grant_parents + [parent] if parent is not None else [], + ) + ) + elif isinstance(data, IndirectObject): + if data.pdf != self: + data = self._resolve_indirect_object(data) + + if str(data) not in discovered: + discovered.append(str(data)) + stack.append((data.get_object(), None, None, [])) + + # Check if data has a parent and if it is a dict or an array update the value + if isinstance(parent, (DictionaryObject, ArrayObject)): + if isinstance(data, StreamObject): + # a dictionary value is a stream. streams must be indirect + # objects, so we need to change this value. 
def _resolve_indirect_object(self, data: IndirectObject) -> IndirectObject:
    """
    Resolve an indirect object to an indirect object of this pdf.

    A reference that already belongs to this writer is returned unchanged.
    Otherwise the referenced object is fetched from its source pdf; if an
    object with the same hash value was registered before, the existing
    reference is reused, else the object is added through ``_add_object``
    and the new reference is remembered in ``self._idnum_hash``.

    :param data: the indirect reference to resolve.
    :return: an indirect reference that is valid in this writer.
    :raises ValueError: if the source pdf exposes a ``stream`` attribute
        and that stream is closed.
    """
    if hasattr(data.pdf, "stream") and data.pdf.stream.closed:
        # In-memory streams have no ``name``; fall back to their repr so the
        # intended ValueError is raised instead of an AttributeError.
        stream_name = getattr(data.pdf.stream, "name", repr(data.pdf.stream))
        raise ValueError(f"I/O operation on closed file: {stream_name}")

    if data.pdf == self:
        return data

    # Get real object indirect object
    real_obj = data.pdf.get_object(data)

    if real_obj is None:
        logger_warning(
            f"Unable to resolve [{data.__class__.__name__}: {data}], "
            "returning NullObject instead",
            __name__,
        )
        real_obj = NullObject()

    hash_value = real_obj.hash_value()

    # Only objects not handled before are added; ``data`` is known to come
    # from another pdf here because of the early return above.
    if hash_value not in self._idnum_hash:
        self._idnum_hash[hash_value] = self._add_object(real_obj)

    return self._idnum_hash[hash_value]
def get_outline_root(self) -> TreeObject:
    """
    Return the outline root of the document, creating it when missing.

    If the catalog already has an ``/Outlines`` entry, that object is
    returned after checking, via :meth:`get_reference`, that it is one of
    this writer's objects. Otherwise an empty ``TreeObject`` is added to
    the writer and referenced from the catalog.

    :return: the outline root object (not its indirect reference).
    """
    if CO.OUTLINES in self._root_object:
        # TABLE 3.25 Entries in the catalog dictionary
        outline = cast(TreeObject, self._root_object[CO.OUTLINES])
        # Consistency check only; the returned reference is not needed.
        self.get_reference(outline)
    else:
        outline = TreeObject()
        self._root_object[NameObject(CO.OUTLINES)] = self._add_object(outline)

    return outline
def get_named_dest_root(self) -> ArrayObject:
    """
    Return the names array of the named-destinations tree.

    Walks catalog -> ``CA.NAMES`` -> ``CA.DESTS`` -> ``CA.NAMES`` and
    creates every level that is missing (or, for the two dictionaries,
    is not a ``DictionaryObject``). New dictionaries are registered with
    ``_add_object`` and stored by reference; the array is stored directly.

    :return: the array holding the named destinations.
    """
    root = self._root_object

    if CA.NAMES in root and isinstance(root[CA.NAMES], DictionaryObject):
        names = cast(DictionaryObject, root[CA.NAMES])
    else:
        names = DictionaryObject()
        root[NameObject(CA.NAMES)] = self._add_object(names)

    if CA.DESTS in names and isinstance(names[CA.DESTS], DictionaryObject):
        # 3.6.3 Name Dictionary (PDF spec 1.7)
        dests = cast(DictionaryObject, names[CA.DESTS])
    else:
        dests = DictionaryObject()
        names[NameObject(CA.DESTS)] = self._add_object(dests)

    if CA.NAMES in dests:
        # TABLE 3.33 Entries in a name tree node dictionary
        nd = cast(ArrayObject, dests[CA.NAMES])
    else:
        nd = ArrayObject()
        dests[NameObject(CA.NAMES)] = nd

    return nd
Use page_destination only." + ) + if dest is not None: # deprecated + old_term = "dest" + new_term = "page_destination" + warnings.warn( + message=( + f"{old_term} is deprecated as an argument and will be " + f"removed in PyPDF2 4.0.0. Use {new_term} instead" + ), + category=DeprecationWarning, + ) + page_destination = dest + if page_destination is None: # deprecated + # argument is only Optional due to deprecated argument. + raise ValueError("page_destination may not be None") + + if parent is None: + parent = self.get_outline_root() + + parent = cast(TreeObject, parent.get_object()) + page_destination_ref = self._add_object(page_destination) + if before is not None: + before = before.indirect_reference + parent.insert_child(page_destination_ref, before, self) + + return page_destination_ref + + def add_bookmark_destination( + self, + dest: Union[PageObject, TreeObject], + parent: Union[None, TreeObject, IndirectObject] = None, + ) -> IndirectObject: # pragma: no cover + """ + .. deprecated:: 2.9.0 + + Use :meth:`add_outline_item_destination` instead. + """ + deprecation_with_replacement( + "add_bookmark_destination", "add_outline_item_destination", "3.0.0" + ) + return self.add_outline_item_destination(dest, parent) + + def addBookmarkDestination( + self, dest: PageObject, parent: Optional[TreeObject] = None + ) -> IndirectObject: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :meth:`add_outline_item_destination` instead. 
@deprecation_bookmark(bookmark="outline_item")
def add_outline_item_dict(
    self,
    outline_item: OutlineItemType,
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
) -> IndirectObject:
    """
    Add an outline item described by a dictionary.

    The entries of ``outline_item`` are copied into a new ``TreeObject``.
    An ``"/A"`` entry, when present, is copied into its own dictionary,
    added to the writer, and stored in the item by reference.

    :param outline_item: the dictionary describing the outline item.
    :param parent: passed through to :meth:`add_outline_item_destination`.
    :param before: passed through to :meth:`add_outline_item_destination`.
    :return: the value returned by :meth:`add_outline_item_destination`.
    """

    def copy_entries(source, target):
        # Copy every entry, forcing the keys to be NameObjects.
        for raw_key, entry in list(source.items()):
            target[NameObject(str(raw_key))] = entry
        return target

    node = copy_entries(outline_item, TreeObject())
    node.update(outline_item)

    if "/A" in outline_item:
        source_action = cast(DictionaryObject, outline_item["/A"])
        action = copy_entries(source_action, DictionaryObject())
        node[NameObject("/A")] = self._add_object(action)

    return self.add_outline_item_destination(node, parent, before)
def add_outline_item(
    self,
    title: str,
    page_number: Union[None, PageObject, IndirectObject, int],
    parent: Union[None, TreeObject, IndirectObject] = None,
    before: Union[None, TreeObject, IndirectObject] = None,
    color: Optional[Union[Tuple[float, float, float], str]] = None,
    bold: bool = False,
    italic: bool = False,
    fit: Fit = PAGE_FIT,
    pagenum: Optional[int] = None,  # deprecated
) -> IndirectObject:
    """
    Add an outline item (commonly referred to as a "Bookmark") to this PDF file.

    :param str title: Title to use for this outline item.
    :param page_number: Page this outline item will point to: a page index,
        a page object, or an indirect reference. With ``None`` the item
        is created without a GoTo action.
    :param parent: A reference to a parent outline item to create nested
        outline items. Defaults to the outline root.
    :param before: Passed through to :meth:`add_outline_item_destination`
        together with ``parent``.
    :param tuple color: Color of the outline item's font as a red, green, blue tuple
        from 0.0 to 1.0 or as a Hex String (#RRGGBB)
    :param bool bold: Outline item font is bold
    :param bool italic: Outline item font is italic
    :param Fit fit: The fit of the destination page.
    :param int pagenum: Deprecated alias of ``page_number``.
    :return: the value returned by :meth:`add_outline_item_destination`.
    :raises ValueError: if both ``page_number`` and ``pagenum`` are given.
    """
    page_ref: Union[None, NullObject, IndirectObject, NumberObject]
    if isinstance(italic, Fit):  # it means that we are on the old params
        # Old positional layout had no ``before``: shift the arguments one
        # slot to the right and call again with the current layout.
        if fit is not None and page_number is None:
            page_number = fit  # type: ignore
        return self.add_outline_item(
            title, page_number, parent, None, before, color, bold, italic  # type: ignore
        )
    if page_number is not None and pagenum is not None:
        raise ValueError(
            "The argument pagenum of add_outline_item is deprecated. Use page_number only."
        )
    if pagenum is not None:
        # Previously accepted but silently ignored; honour it the same way
        # the other methods of this class treat their deprecated aliases.
        old_term = "pagenum"
        new_term = "page_number"
        warnings.warn(
            message=(
                f"{old_term} is deprecated as an argument and will be "
                f"removed in PyPDF2 4.0.0. Use {new_term} instead"
            ),
            category=DeprecationWarning,
        )
        page_number = pagenum
    if page_number is None:
        action_ref = None
    else:
        # Stays None for unsupported types, so they fall through to the
        # warning below instead of failing on an unbound local.
        page_ref = None
        if isinstance(page_number, IndirectObject):
            page_ref = page_number
        elif isinstance(page_number, PageObject):
            page_ref = page_number.indirect_reference
        elif isinstance(page_number, int):
            try:
                page_ref = self.pages[page_number].indirect_reference
            except IndexError:
                page_ref = NumberObject(page_number)
        if page_ref is None:
            logger_warning(
                f"can not find reference of page {page_number}",
                __name__,
            )
            page_ref = NullObject()
        dest = Destination(
            NameObject("/" + title + " outline item"),
            page_ref,
            fit,
        )

        action_ref = self._add_object(
            DictionaryObject(
                {
                    NameObject(GoToActionArguments.D): dest.dest_array,
                    NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                }
            )
        )
    outline_item = _create_outline_item(action_ref, title, color, italic, bold)

    if parent is None:
        parent = self.get_outline_root()
    return self.add_outline_item_destination(outline_item, parent, before)
def add_named_destination_array(
    self, title: TextStringObject, destination: Union[IndirectObject, ArrayObject]
) -> None:
    """
    Insert a ``title``/``destination`` pair into the named destinations.

    The array returned by :meth:`get_named_dest_root` is a flat list of
    alternating titles and destinations. The new pair goes in front of the
    first title that compares greater than ``title``, or at the end when
    there is none.

    :param title: name of the destination.
    :param destination: the destination stored after the title.
    """
    names = self.get_named_dest_root()
    # Titles sit on the even positions of the array.
    for pos in range(0, len(names), 2):
        if title < names[pos]:
            names.insert(pos, TextStringObject(title))
            names.insert(pos + 1, destination)
            break
    else:
        names.extend([TextStringObject(title), destination])
def add_named_destination(
    self,
    title: str,
    page_number: Optional[int] = None,
    pagenum: Optional[int] = None,  # deprecated
) -> IndirectObject:
    """
    Add a named destination pointing at a page of this writer.

    A GoTo dictionary targeting the page is added to the writer and
    registered under ``title`` through :meth:`add_named_destination_array`,
    so the name array keeps the ordering that method maintains.

    :param str title: name of the destination.
    :param int page_number: index of the target page.
    :param int pagenum: deprecated alias of ``page_number``.
    :return: the indirect reference of the added dictionary.
    :raises ValueError: if both ``page_number`` and ``pagenum`` are given,
        or if neither is.
    """
    if page_number is not None and pagenum is not None:
        raise ValueError(
            "The argument pagenum of add_named_destination is deprecated. Use page_number only."
        )
    if pagenum is not None:
        old_term = "pagenum"
        new_term = "page_number"
        warnings.warn(
            message=(
                f"{old_term} is deprecated as an argument and will be "
                f"removed in PyPDF2 4.0.0. Use {new_term} instead"
            ),
            category=DeprecationWarning,
        )
        page_number = pagenum
    if page_number is None:
        raise ValueError("page_number may not be None")
    page_ref = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    dest = DictionaryObject()
    dest.update(
        {
            NameObject(GoToActionArguments.D): ArrayObject(
                [page_ref, NameObject(TypFitArguments.FIT_H), NumberObject(826)]
            ),
            NameObject(GoToActionArguments.S): NameObject("/GoTo"),
        }
    )

    dest_ref = self._add_object(dest)
    if not isinstance(title, TextStringObject):
        title = TextStringObject(str(title))
    # Sorted insert instead of a blind append, matching how
    # add_named_destination_object registers its entries.
    self.add_named_destination_array(title, dest_ref)
    return dest_ref
def remove_images(self, ignore_byte_string_object: bool = False) -> None:
    """
    Remove images from this output.

    For every page, the content stream is filtered: while inside a
    ``q`` ... ``Q`` section every operator listed in ``jump_operators`` is
    dropped, and ``re`` operators are dropped everywhere. Pages without a
    ``/Contents`` entry are left untouched.

    :param bool ignore_byte_string_object: when True, operands of the text
        operators (``Tj``, ``'``, ``"``, ``TJ``) that are not
        ``TextStringObject`` are replaced by an empty ``TextStringObject``.
    """
    pg_dict = cast(DictionaryObject, self.get_object(self._pages))
    pages = cast(ArrayObject, pg_dict[PA.KIDS])
    jump_operators = (
        b"cm",
        b"w",
        b"J",
        b"j",
        b"M",
        b"d",
        b"ri",
        b"i",
        b"gs",
        b"W",
        b"b",
        b"s",
        b"S",
        b"f",
        b"F",
        b"n",
        b"m",
        b"l",
        b"c",
        b"v",
        b"y",
        b"h",
        b"B",
        b"Do",
        b"sh",
    )
    # Position of the string operand for the single-string text operators.
    text_operand_index = {b"Tj": 0, b"'": 0, b'"': 2}

    for page in pages:
        page_ref = cast(DictionaryObject, self.get_object(page))
        if "/Contents" not in page_ref:
            # Nothing to filter on a page without content.
            continue
        content = page_ref["/Contents"].get_object()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page_ref)

        _operations = []
        seq_graphics = False
        for operands, operator in content.operations:
            if ignore_byte_string_object:
                if operator in text_operand_index:
                    index = text_operand_index[operator]
                    if not isinstance(operands[index], TextStringObject):
                        operands[index] = TextStringObject()
                elif operator == b"TJ":
                    for i, piece in enumerate(operands[0]):
                        if not isinstance(piece, TextStringObject):
                            operands[0][i] = TextStringObject()

            if operator == b"q":
                seq_graphics = True
            if operator == b"Q":
                seq_graphics = False
            if seq_graphics and operator in jump_operators:
                continue
            if operator == b"re":
                continue
            _operations.append((operands, operator))

        content.operations = _operations
        page_ref[NameObject("/Contents")] = content
def add_uri(
    self,
    page_number: int,
    uri: str,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
    pagenum: Optional[int] = None,
) -> None:
    """
    Add an URI from a rectangular area to the specified page.
    This uses the basic structure of :meth:`add_link`

    :param int page_number: index of the page on which to place the URI action.
    :param str uri: URI of resource to link to.
    :param Tuple[int, int, int, int] rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four
        integers specifying the clickable rectangular area
        ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``.
        A string is stored as-is wrapped in a ``NameObject``.
    :param ArrayObject border: if provided, an array describing border-drawing
        properties. See the PDF spec for details. When omitted, the border
        array ``[2, 2, 2]`` is written.
    :param int pagenum: deprecated alias of ``page_number``; when given it
        takes precedence and a ``DeprecationWarning`` is emitted.
    """
    if pagenum is not None:
        warnings.warn(
            "The 'pagenum' argument of add_uri is deprecated and will be "
            "removed in PyPDF2 4.0.0. Use 'page_number' instead.",
            category=DeprecationWarning,
        )
        page_number = pagenum
    page_link = self.get_object(self._pages)[PA.KIDS][page_number]  # type: ignore
    page_ref = cast(Dict[str, Any], self.get_object(page_link))

    border_arr: BorderArrayType
    if border is not None:
        border_arr = [NameObject(n) for n in border[:3]]
        if len(border) == 4:
            dash_pattern = ArrayObject([NameObject(n) for n in border[3]])
            border_arr.append(dash_pattern)
    else:
        border_arr = [NumberObject(2)] * 3

    if isinstance(rect, str):
        rect = NameObject(rect)
    elif isinstance(rect, RectangleObject):
        pass
    else:
        rect = RectangleObject(rect)

    lnk2 = DictionaryObject()
    lnk2.update(
        {
            NameObject("/S"): NameObject("/URI"),
            NameObject("/URI"): TextStringObject(uri),
        }
    )
    lnk = DictionaryObject()
    lnk.update(
        {
            # The type of an annotation dictionary is /Annot; PG.ANNOTS is
            # the key of the page's annotation array (used below), not a type.
            NameObject(AnnotationDictionaryAttributes.Type): NameObject("/Annot"),
            NameObject(AnnotationDictionaryAttributes.Subtype): NameObject("/Link"),
            NameObject(AnnotationDictionaryAttributes.P): page_link,
            NameObject(AnnotationDictionaryAttributes.Rect): rect,
            NameObject("/H"): NameObject("/I"),
            NameObject(AnnotationDictionaryAttributes.Border): ArrayObject(
                border_arr
            ),
            NameObject("/A"): lnk2,
        }
    )
    lnk_ref = self._add_object(lnk)

    if PG.ANNOTS in page_ref:
        page_ref[PG.ANNOTS].append(lnk_ref)
    else:
        page_ref[NameObject(PG.ANNOTS)] = ArrayObject([lnk_ref])
def add_link(
    self,
    pagenum: int,  # deprecated, but method is deprecated already
    page_destination: int,
    rect: RectangleObject,
    border: Optional[ArrayObject] = None,
    fit: FitType = "/Fit",
    *args: ZoomArgType,
) -> None:
    """
    Deprecated: add an internal link annotation to a page.

    Reports the deprecation through ``deprecation_with_replacement``, then
    builds the annotation with ``AnnotationBuilder.link`` and hands it to
    :meth:`add_annotation`.

    :param int pagenum: index of the page that receives the link.
    :param int page_destination: index of the page the link points to.
    :param rect: a ``RectangleObject``, a string of the form
        ``"[ xLL yLL xUR yUR ]"``, or any value accepted by
        ``RectangleObject(...)``.
    :param border: forwarded to ``AnnotationBuilder.link``.
    :param fit: fit type, combined with ``args`` into a ``Fit``.
    """
    deprecation_with_replacement(
        "add_link", "add_annotation(AnnotationBuilder.link(...))"
    )

    if isinstance(rect, str):
        # Drop the surrounding brackets, then parse the blank-separated numbers.
        inner = rect.strip()[1:-1]
        rect = RectangleObject([float(part) for part in inner.split(" ") if part])
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    target_fit = Fit(fit_type=fit, fit_args=args)
    link = AnnotationBuilder.link(
        rect=rect,
        border=border,
        target_page_index=page_destination,
        fit=target_fit,
    )
    return self.add_annotation(page_number=pagenum, annotation=link)
def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
    """
    Set the page layout.

    A value that is not already a ``NameObject`` is checked against
    ``self._valid_layouts``; an unknown value only triggers a warning and
    is still written to the ``/PageLayout`` entry of the catalog.

    :param str layout: The page layout to be used.

    .. list-table:: Valid ``layout`` arguments
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    if not isinstance(layout, NameObject):
        if layout not in self._valid_layouts:
            # Was ``{'', ''.join(...)}``, which formats a 2-tuple of an empty
            # string and the undelimited names instead of a readable list.
            logger_warning(
                f"Layout should be one of: {', '.join(self._valid_layouts)}",
                __name__,
            )
        layout = NameObject(layout)
    self._root_object.update({NameObject("/PageLayout"): layout})
+ """ + deprecation_with_replacement( + "writer.setPageLayout(val)", "writer.page_layout = val", "3.0.0" + ) + return self._set_page_layout(layout) + + @property + def page_layout(self) -> Optional[LayoutType]: + """ + Page layout property. + + .. list-table:: Valid ``layout`` values + :widths: 50 200 + + * - /NoLayout + - Layout explicitly not specified + * - /SinglePage + - Show one page at a time + * - /OneColumn + - Show one column at a time + * - /TwoColumnLeft + - Show pages in two columns, odd-numbered pages on the left + * - /TwoColumnRight + - Show pages in two columns, odd-numbered pages on the right + * - /TwoPageLeft + - Show two pages at a time, odd-numbered pages on the left + * - /TwoPageRight + - Show two pages at a time, odd-numbered pages on the right + """ + return self._get_page_layout() + + @page_layout.setter + def page_layout(self, layout: LayoutType) -> None: + self._set_page_layout(layout) + + @property + def pageLayout(self) -> Optional[LayoutType]: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :py:attr:`page_layout` instead. + """ + deprecation_with_replacement("pageLayout", "page_layout", "3.0.0") + return self.page_layout + + @pageLayout.setter + def pageLayout(self, layout: LayoutType) -> None: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :py:attr:`page_layout` instead. + """ + deprecation_with_replacement("pageLayout", "page_layout", "3.0.0") + self.page_layout = layout + + _valid_modes = ( + "/UseNone", + "/UseOutlines", + "/UseThumbs", + "/FullScreen", + "/UseOC", + "/UseAttachments", + ) + + def _get_page_mode(self) -> Optional[PagemodeType]: + try: + return cast(PagemodeType, self._root_object["/PageMode"]) + except KeyError: + return None + + def getPageMode(self) -> Optional[PagemodeType]: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :py:attr:`page_mode` instead. 
def set_page_mode(self, mode: PagemodeType) -> None:
    """
    Set the ``/PageMode`` entry of the catalog.

    A ``NameObject`` is stored unchanged. Any other value is checked
    against ``self._valid_modes``; an unknown value only triggers a
    warning and is still converted to a ``NameObject`` and stored.

    NOTE(review): this docstring used to read "deprecated:: 1.28.0, use
    page_mode instead", but the body reports no deprecation. Confirm
    whether the method is really meant to be deprecated.

    :param mode: the page mode to be used.
    """
    if isinstance(mode, NameObject):
        mode_name: NameObject = mode
    else:
        if mode not in self._valid_modes:
            logger_warning(
                f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
            )
        mode_name = NameObject(mode)
    self._root_object.update({NameObject("/PageMode"): mode_name})
def add_annotation(self, page_number: int, annotation: Dict[str, Any]) -> None:
    """
    Attach ``annotation`` to the page at index ``page_number``.

    The annotation is converted with ``_pdf_objectify``, its ``/P`` entry is
    set to the matching ``/Kids`` entry of the page tree, the object is
    registered through ``_add_object`` and the resulting reference is
    appended to the page's annotation array (created first if missing).

    :param page_number: index of the target page.
    :param annotation: description of the annotation to add.
    """
    to_add = cast(DictionaryObject, _pdf_objectify(annotation))
    to_add[NameObject("/P")] = self.get_object(self._pages)["/Kids"][page_number]  # type: ignore
    page = self.pages[page_number]
    if page.annotations is None:
        page[NameObject("/Annots")] = ArrayObject()
    assert page.annotations is not None

    # An internal link carries its target as a small mapping; replace it with
    # the destination array built by ``Destination``.
    dest_key = NameObject("/Dest")
    if to_add.get("/Subtype") == "/Link" and dest_key in to_add:
        raw_dest = cast(dict, to_add[dest_key])
        link_name = NameObject("/LinkName")
        target_index = raw_dest["target_page_index"]
        # "fit_args" is deliberately read through a plain ``dict`` copy,
        # exactly as the original code does (its author noted the plain
        # lookup did not work).
        fit = Fit(fit_type=raw_dest["fit"], fit_args=dict(raw_dest)["fit_args"])
        to_add[dest_key] = Destination(link_name, target_index, fit).dest_array

    new_reference = self._add_object(to_add)
    page.annotations.append(new_reference)


def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject:
    """
    Perform some clean up in the page.

    Currently: named destinations stored as ``NameObject`` - either in an
    annotation's ``/Dest`` or in the ``/D`` of its ``/A`` action - are
    rewritten as ``TextStringObject`` (required for names/dests list).

    :param page: the page, or a reference to it.
    :return: the resolved page object.
    """
    page = cast("PageObject", page.get_object())
    for annot_ref in page.get("/Annots", []):
        annot = annot_ref.get_object()
        dest = annot.get("/Dest", None)
        action = annot.get("/A", None)
        if isinstance(dest, NameObject):
            annot[NameObject("/Dest")] = TextStringObject(dest)
            continue
        if action is None:
            continue
        action = action.get_object()
        target = action.get("/D", None)
        if isinstance(target, NameObject):
            action[NameObject("/D")] = TextStringObject(target)
    return page


def _create_stream(
    self, fileobj: Union[Path, StrByteType, PdfReader]
) -> Tuple[IOBase, Optional[Encryption]]:
    """
    Copy the PDF data behind ``fileobj`` into an in-memory stream.

    * a ``str`` or ``Path`` is opened as a file and read completely;
    * a ``PdfReader`` has its stream read from offset 0, after which the
      stream position is restored;
    * any other object offering ``seek`` and ``read`` is rewound and read.

    :return: ``(stream, encryption)``; ``encryption`` is the reader's
        ``_encryption`` when a ``PdfReader`` with a truthy one is given,
        otherwise None.
    :raises NotImplementedError: if ``fileobj`` is none of the above.
    """
    if isinstance(fileobj, (str, Path)):
        with FileIO(fileobj, "rb") as handle:
            return BytesIO(handle.read()), None

    if isinstance(fileobj, PdfReader):
        encryption_obj = fileobj._encryption or None
        saved_position = fileobj.stream.tell()
        fileobj.stream.seek(0)
        copied = BytesIO(fileobj.stream.read())
        # put the reader's stream back where the caller left it
        fileobj.stream.seek(saved_position)
        return copied, encryption_obj

    if hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
        fileobj.seek(0)
        return BytesIO(fileobj.read()), None

    raise NotImplementedError(
        "PdfMerger.merge requires an object that PdfReader can parse. "
        "Typically, that is a Path or a string representing a Path, "
        "a file object, or an object implementing .seek and .read. "
        "Passing a PdfReader directly works as well."
    )
def append(
    self,
    fileobj: Union[StrByteType, PdfReader, Path],
    outline_item: Union[
        str, None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]
    ] = None,
    pages: Union[
        None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]
    ] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = None,
) -> None:
    """
    Identical to the :meth:`merge()<merge>` method, but assumes you want to
    concatenate all pages onto the end of the file instead of specifying a
    position.

    The second positional argument may also hold the page selection (a
    tuple, list or ``PageRange``). In that case no outline item is created
    and, if ``pages`` is a bool, the remaining arguments are shifted one
    place to the left.

    :param fileobj: A File Object or an object that supports the standard
        read and seek methods similar to a File Object. Could also be a
        string representing a path to a PDF file.

    :param outline_item: Optionally, you may specify a string to build an
        outline (aka 'bookmark') to identify the beginning of the included
        file.

    :param pages: can be a :class:`PageRange<PyPDF2.pagerange.PageRange>`
        or a ``(start, stop[, step])`` tuple
        or a list of pages to be processed
        to merge only the specified range of pages from the source
        document into the output document.

    :param bool import_outline: You may prevent the source document's
        outline (collection of outline items, previously referred to as
        'bookmarks') from being imported by specifying this as ``False``.

    :param List excluded_fields: provide the list of fields/keys to be ignored
        if "/Annots" is part of the list, the annotation will be ignored
        if "/B" is part of the list, the articles will be ignored
    """
    if excluded_fields is None:
        excluded_fields = ()

    # Regular call: ``outline_item`` is a title (or None).
    if not isinstance(outline_item, (tuple, list, PageRange)):
        self.merge(
            None, fileobj, outline_item, pages, import_outline, excluded_fields
        )
        return

    # Shifted call: ``outline_item`` actually carries the page selection.
    if isinstance(pages, bool):
        if not isinstance(import_outline, bool):
            excluded_fields = import_outline
        import_outline = pages
    pages = outline_item
    self.merge(None, fileobj, None, pages, import_outline, excluded_fields)
@deprecation_bookmark(bookmark="outline_item", import_bookmarks="import_outline")
def merge(
    self,
    position: Optional[int],
    fileobj: Union[Path, StrByteType, PdfReader],
    outline_item: Optional[str] = None,
    pages: Optional[PageRangeSpec] = None,
    import_outline: bool = True,
    excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = (),
) -> None:
    """
    Merge the pages from the given file into the output file at the
    specified page number.

    :param int position: The *page number* to insert this file. File will
        be inserted after the given number. ``None`` appends the pages
        with ``add_page`` instead of ``insert_page``.

    :param fileobj: A File Object or an object that supports the standard
        read and seek methods similar to a File Object. Could also be a
        string representing a path to a PDF file. A ``PdfReader`` is used
        directly, without copying its stream.

    :param str outline_item: Optionally, you may specify a string to build an outline
        (aka 'bookmark') to identify the
        beginning of the included file.

    :param pages: can be a :class:`PageRange<PyPDF2.pagerange.PageRange>`
        or a ``(start, stop[, step])`` tuple
        or a list of pages to be processed
        to merge only the specified range of pages from the source
        document into the output document.

    :param bool import_outline: You may prevent the source document's
        outline (collection of outline items, previously referred to as
        'bookmarks') from being imported by specifying this as ``False``.

    :param List excluded_fields: provide the list of fields/keys to be ignored
        if "/Annots" is part of the list, the annotation will be ignored
        if "/B" is part of the list, the articles will be ignored

    :raises TypeError: if ``pages`` is not None, a ``PageRange``, a list
        or a tuple.
    """
    if isinstance(fileobj, PdfReader):
        reader = fileobj
    else:
        # Only the stream is needed here; the encryption object that
        # _create_stream also returns was never used by this method.
        stream, _ = self._create_stream(fileobj)
        reader = PdfReader(stream, strict=False)  # type: ignore[arg-type]

    if excluded_fields is None:
        excluded_fields = ()

    # Find the range of pages to merge.
    if pages is None:
        pages = list(range(0, len(reader.pages)))
    elif isinstance(pages, PageRange):
        pages = list(range(*pages.indices(len(reader.pages))))
    elif isinstance(pages, list):
        pass  # keep unchanged
    elif isinstance(pages, tuple) and len(pages) <= 3:
        pages = list(range(*pages))
    elif not isinstance(pages, tuple):
        # NOTE(review): a tuple with more than three items passes through
        # unchanged and is iterated below as explicit page indices.
        raise TypeError(
            '"pages" must be a tuple of (start, stop[, step]) or a list'
        )

    # Maps the idnum of each source page to its copy in this writer.
    srcpages = {}
    for i in pages:
        pg = reader.pages[i]
        assert pg.indirect_reference is not None
        idnum = pg.indirect_reference.idnum
        # "/B" and "/Annots" are always excluded from the page copy; both
        # are rebuilt further down from the source page.
        if position is None:
            srcpages[idnum] = self.add_page(
                pg, list(excluded_fields) + ["/B", "/Annots"]  # type: ignore
            )
        else:
            srcpages[idnum] = self.insert_page(
                pg, position, list(excluded_fields) + ["/B", "/Annots"]  # type: ignore
            )
            position += 1
        srcpages[idnum].original_page = pg

    reader._namedDests = (
        reader.named_destinations
    )  # need for the outline processing below
    for dest in reader._namedDests.values():
        arr = dest.dest_array
        dest_page = dest["/Page"]
        if isinstance(dest_page, NullObject):
            continue  # destinations without a page are not imported
        if dest_page.indirect_reference.idnum in srcpages:
            # retarget the destination at the copied page
            arr[NumberObject(0)] = srcpages[
                dest_page.indirect_reference.idnum
            ].indirect_reference
            self.add_named_destination_array(dest["/Title"], arr)

    # Parent under which the imported outline is inserted: a new item named
    # ``outline_item`` pointing at the first merged page, or the outline root.
    outline_item_typ: TreeObject
    if outline_item is not None:
        # NOTE(review): raises IndexError when no page was merged.
        outline_item_typ = cast(
            "TreeObject",
            self.add_outline_item(
                TextStringObject(outline_item),
                list(srcpages.values())[0].indirect_reference,
                fit=PAGE_FIT,
            ).get_object(),
        )
    else:
        outline_item_typ = self.get_outline_root()

    _ro = cast("DictionaryObject", reader.trailer[TK.ROOT])
    if import_outline and CO.OUTLINES in _ro:
        outline = self._get_filtered_outline(
            _ro.get(CO.OUTLINES, None), srcpages, reader
        )
        self._insert_filtered_outline(
            outline, outline_item_typ, None
        )  # TODO : use before parameter

    if "/Annots" not in excluded_fields:
        for pag in srcpages.values():
            lst = self._insert_filtered_annotations(
                pag.original_page.get("/Annots", ()), pag, srcpages, reader
            )
            if len(lst) > 0:
                pag[NameObject("/Annots")] = lst
            self.clean_page(pag)

    if "/B" not in excluded_fields:
        self.add_filtered_articles("", srcpages, reader)
def _add_articles_thread(
    self,
    thread: DictionaryObject,  # thread entry from the reader's array of threads
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> IndirectObject:
    """
    Clone the thread with only the applicable articles.

    The thread dictionary is cloned without its ``/F`` entry and appended
    to ``self.threads``. The source chain of articles is then followed
    through ``/N``; for every article whose page has a copy in ``pages`` a
    new article is created, linked into a ring (``/N`` / ``/V``) and
    appended to the ``/B`` array of the copied page.

    :param thread: thread dictionary taken from the source document.
    :param pages: copied pages, keyed by the idnum of the source page.
    :param reader: the reader the thread belongs to.
    :return: indirect reference of the cloned thread.
    """
    nthread = thread.clone(
        self, force_duplicate=True, ignore_fields=("/F",)
    )  # use of clone to keep link between reader and writer
    self.threads.append(nthread.indirect_reference)
    first_article = cast("DictionaryObject", thread["/F"])
    current_article: Optional[DictionaryObject] = first_article
    new_article: Optional[DictionaryObject] = None
    new_first: Optional[DictionaryObject] = None
    while current_article is not None:
        pag = self._get_cloned_page(
            cast("PageObject", current_article["/P"]), pages, reader
        )
        if pag is not None:
            if new_article is None:
                # first kept article: it becomes the head of the new ring
                new_article = cast(
                    "DictionaryObject",
                    self._add_object(DictionaryObject()).get_object(),
                )
                new_first = new_article
                nthread[NameObject("/F")] = new_article.indirect_reference
            else:
                # later article: created with /V pointing at its predecessor,
                # which in turn gets /N pointing at the new one
                new_article2 = cast(
                    "DictionaryObject",
                    self._add_object(
                        DictionaryObject(
                            {NameObject("/V"): new_article.indirect_reference}
                        )
                    ).get_object(),
                )
                new_article[NameObject("/N")] = new_article2.indirect_reference
                new_article = new_article2
            new_article[NameObject("/P")] = pag
            new_article[NameObject("/T")] = nthread.indirect_reference
            new_article[NameObject("/R")] = current_article["/R"]
            pag_obj = cast("PageObject", pag.get_object())
            if "/B" not in pag_obj:
                pag_obj[NameObject("/B")] = ArrayObject()
            cast("ArrayObject", pag_obj["/B"]).append(
                new_article.indirect_reference
            )
        current_article = cast("DictionaryObject", current_article["/N"])
        if current_article == first_article:
            # Back at the start of the source ring: close the new ring.
            # When no article was kept there is nothing to close; without
            # this guard the code subscripted None / read an unbound name.
            if new_article is not None and new_first is not None:
                new_article[NameObject("/N")] = new_first.indirect_reference
                new_first[NameObject("/V")] = new_article.indirect_reference
            current_article = None
    assert nthread.indirect_reference is not None
    return nthread.indirect_reference
def add_filtered_articles(
    self,
    fltr: Union[Pattern, str],  # pattern (or pattern text) matched against thread titles
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> None:
    """
    Add articles matching the defined criteria.

    Every ``/B`` entry of the source pages is inspected; its thread (``/T``)
    is cloned with ``_add_articles_thread`` when it is not yet listed in the
    translation table of ``reader`` and its title matches ``fltr``.

    :param fltr: compiled pattern, or a string that is compiled here;
        anything else is replaced by the empty pattern.
    :param pages: copied pages, keyed by the idnum of the source page; each
        must carry an ``original_page`` attribute.
    :param reader: the reader the pages were taken from.
    """
    if isinstance(fltr, str):
        fltr = re.compile(fltr)
    elif not isinstance(fltr, Pattern):
        fltr = re.compile("")
    for p in pages.values():
        pp = p.original_page
        for a in pp.get("/B", ()):
            thr = a.get_object()["/T"]
            if thr.indirect_reference.idnum in self._id_translated[id(reader)]:
                continue  # this thread is already in the translation table
            # The thread information dictionary (/I) and its /Title are
            # optional in PDF; treat a missing title as empty instead of
            # failing with KeyError.
            try:
                title = thr["/I"]["/Title"]
            except KeyError:
                title = ""
            if fltr.search(title):
                self._add_articles_thread(thr, pages, reader)


def _get_cloned_page(
    self,
    page: Union[None, int, IndirectObject, PageObject, NullObject],
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> Optional[IndirectObject]:
    """
    Return the reference of the copy of ``page`` inside this writer.

    :param page: a page index into ``reader.pages``, a page dictionary
        (``/Type`` equal to ``/Page``) or an indirect reference.
    :param pages: copied pages, keyed by the idnum of the source page.
    :param reader: reader used to resolve a page index.
    :return: the copied page's indirect reference, or None when ``page`` is
        a ``NullObject``, None, of an unsupported type, or has no copy in
        ``pages``.
    """
    if isinstance(page, NullObject):
        return None
    if isinstance(page, int):
        source_ref = reader.pages[page].indirect_reference
    elif isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page":
        source_ref = page.indirect_reference
    elif isinstance(page, IndirectObject):
        source_ref = page
    else:
        # Unsupported input (including None). This used to surface as an
        # unbound local that a blanket ``except Exception`` turned into None.
        return None
    try:
        return pages[source_ref.idnum].indirect_reference  # type: ignore
    except (KeyError, AttributeError):
        # no copy of that page, or no usable reference on either side
        return None


def _insert_filtered_annotations(
    self,
    annots: Union[IndirectObject, List[DictionaryObject]],
    page: PageObject,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Clone the annotations of a source page that are still usable here.

    * An annotation without a destination is cloned as is.
    * A named destination (``/Dest``, or ``/D`` of a ``/GoTo`` action) is
      kept only when the name is found in ``get_named_dest_root()``.
    * An explicit destination is kept only when its target page has a copy
      in ``pages``; the clone is retargeted at that copy.

    :param annots: the source page's annotations, or a reference to them.
    :param page: the copied page (not used by the current implementation).
    :param pages: copied pages, keyed by the idnum of the source page.
    :param reader: the reader the annotations belong to.
    :return: an ``ArrayObject`` holding the indirect references of the clones.
    """
    outlist = ArrayObject()
    if isinstance(annots, IndirectObject):
        annots = cast("List", annots.get_object())
    for an in annots:
        ano = cast("DictionaryObject", an.get_object())
        if (
            ano["/Subtype"] != "/Link"
            or "/A" not in ano
            or cast("DictionaryObject", ano["/A"])["/S"] != "/GoTo"
            or "/Dest" in ano
        ):
            # Everything except a /GoTo link that has no /Dest of its own.
            if "/Dest" not in ano:
                outlist.append(ano.clone(self).indirect_reference)
            else:
                d = ano["/Dest"]
                if isinstance(d, str):
                    # it is a named dest
                    if str(d) in self.get_named_dest_root():
                        outlist.append(ano.clone(self).indirect_reference)
                else:
                    d = cast("ArrayObject", d)
                    p = self._get_cloned_page(d[0], pages, reader)
                    if p is not None:
                        anc = ano.clone(self, ignore_fields=("/Dest",))
                        anc[NameObject("/Dest")] = ArrayObject([p] + d[1:])
                        outlist.append(anc.indirect_reference)
        else:
            # /GoTo link: the destination is the /D entry of the action.
            d = cast("DictionaryObject", ano["/A"])["/D"]
            if isinstance(d, str):
                # it is a named dest
                if str(d) in self.get_named_dest_root():
                    outlist.append(ano.clone(self).indirect_reference)
            else:
                d = cast("ArrayObject", d)
                p = self._get_cloned_page(d[0], pages, reader)
                if p is not None:
                    anc = ano.clone(self, ignore_fields=("/D",))
                    anc = cast("DictionaryObject", anc)
                    cast("DictionaryObject", anc["/A"])[
                        NameObject("/D")
                    ] = ArrayObject([p] + d[1:])
                    outlist.append(anc.indirect_reference)
    return outlist
def _get_filtered_outline(
    self,
    node: Any,
    pages: Dict[int, PageObject],
    reader: PdfReader,
) -> List[Destination]:
    """
    Extract outline item entries that are part of the specified page set.

    A node that is an ``/Outlines`` root, or that has no ``/Title``, is only
    descended into through ``/First``. Otherwise the sibling chain starting
    at ``node`` is followed through ``/Next``; an item is kept when its page
    has a copy in ``pages`` or when at least one of its children is kept.

    :param node: outline node, or a reference to it.
    :param pages: copied pages, keyed by the idnum of the source page.
    :param reader: reader used to build the outline items.
    :return: the kept items; each has its children in ``childs``.
    """
    kept = []
    node = node.get_object()

    if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
        first_child = node.get("/First", None)
        if first_child is not None:
            kept += self._get_filtered_outline(
                first_child.get_object(), pages, reader
            )
        return kept

    while node is not None:
        node = node.get_object()
        item = cast("Destination", reader._build_outline_item(node))
        cloned = self._get_cloned_page(
            cast("PageObject", item["/Page"]), pages, reader
        )
        # an item whose page was not copied points at a null page
        item[NameObject("/Page")] = NullObject() if cloned is None else cloned
        item.childs = (
            self._get_filtered_outline(node["/First"], pages, reader)
            if "/First" in node
            else []
        )
        has_page = not isinstance(item["/Page"], NullObject)
        if has_page or len(item.childs) > 0:
            kept.append(item)
        node = node.get("/Next", None)
    return kept
def _clone_outline(self, dest: Destination) -> TreeObject:
    """
    Build a new outline node in this writer from ``dest``.

    The node is registered with ``_add_object`` and receives the title of
    ``dest``. Unless the page of ``dest`` is a ``NullObject`` it also gets
    either a clone of the source node's ``/A`` action or the destination
    array. ``/F`` and ``/C`` are taken from the source node when there is one.
    """
    clone = TreeObject()
    self._add_object(clone)
    clone[NameObject("/Title")] = TextStringObject(dest["/Title"])
    source_node = dest.node

    if not isinstance(dest["/Page"], NullObject):
        if source_node is not None and "/A" in source_node:
            clone[NameObject("/A")] = source_node["/A"].clone(self)
        else:
            clone[NameObject("/Dest")] = dest.dest_array
    # TODO: /SE
    if source_node is not None:
        clone[NameObject("/F")] = NumberObject(source_node.get("/F", 0))
        black = [FloatObject(0.0), FloatObject(0.0), FloatObject(0.0)]
        clone[NameObject("/C")] = ArrayObject(source_node.get("/C", black))
    return clone


def _insert_filtered_outline(
    self,
    outlines: List[Destination],
    parent: Union[TreeObject, IndirectObject],
    before: Union[None, TreeObject, IndirectObject] = None,
) -> None:
    """
    Insert ``outlines`` and, recursively, their ``childs`` below ``parent``.

    An entry that is an ``/Outlines`` root or has no ``/Title`` creates no
    node of its own: its children are inserted directly under ``parent``.

    :param outlines: items to insert.
    :param parent: node (or reference to it) that receives the new nodes.
    :param before: passed to ``insert_child`` for the items of this level;
        children are always inserted with None.
    """
    # TODO : can be improved to keep A and SE entries (ignored for the moment)
    for entry in outlines:
        is_container = entry.get("/Type", "") == "/Outlines" or "/Title" not in entry
        if is_container:
            child_parent = parent
        else:
            child_parent = self._clone_outline(entry)
            cast(TreeObject, parent.get_object()).insert_child(
                child_parent, before, self
            )
        self._insert_filtered_outline(entry.childs, child_parent, None)


def close(self) -> None:
    """To match the functions from Merger"""
    return None


# @deprecation_bookmark(bookmark="outline_item")
def find_outline_item(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[List[int]]:
    """
    Search the outline for ``outline_item`` and return the path to it.

    A node matches when its ``indirect_reference`` or its ``/Title`` equals
    ``outline_item``.

    :param outline_item: value to look for.
    :param root: node at which the search starts; defaults to the result of
        ``get_outline_root()``.
    :return: sibling indices from the start node down to the match, or None
        when nothing matches. A node without ``/Title`` adds no index of
        its own to the path.
    """
    node = self.get_outline_root() if root is None else cast("TreeObject", root)

    index = 0
    while node is not None:
        if (
            node.indirect_reference == outline_item
            or node.get("/Title", None) == outline_item
        ):
            return [index]
        if "/First" in node:
            found = self.find_outline_item(
                outline_item, cast(OutlineType, node["/First"])
            )
            if found:
                prefix = [index] if "/Title" in node else []
                return prefix + found
        if "/Next" not in node:
            return None
        index += 1
        node = cast(TreeObject, node["/Next"])
    return None
@deprecation_bookmark(bookmark="outline_item")
def find_bookmark(
    self,
    outline_item: Dict[str, Any],
    root: Optional[OutlineType] = None,
) -> Optional[List[int]]:  # pragma: no cover
    """
    .. deprecated:: 2.9.0
        Use :meth:`find_outline_item` instead.
    """
    return self.find_outline_item(outline_item, root)


def reset_translation(
    self, reader: Union[None, PdfReader, IndirectObject] = None
) -> None:
    """
    Reset the translation table between reader and the writer object.
    Late cloning will create new independent objects.

    :param reader: PdfReader or IndirectObject referring to a PdfReader
        object. If set to None or omitted, all tables will be reset.
    :raises Exception: if ``reader`` is of any other type.
    """
    if reader is None:
        self._id_translated = {}
    elif isinstance(reader, PdfReader):
        # pop() with a default: a reader without a table is simply ignored
        self._id_translated.pop(id(reader), None)
    elif isinstance(reader, IndirectObject):
        self._id_translated.pop(id(reader.pdf), None)
    else:
        # The f-prefix was missing, so the message used to contain the
        # literal text "{reader}" instead of the offending value.
        raise Exception(f"invalid parameter {reader}")
def _pdf_objectify(obj: Union[Dict[str, Any], str, int, List[Any]]) -> PdfObject:
    """
    Convert a plain Python value into the matching PDF object.

    * a ``PdfObject`` is returned unchanged;
    * a dict becomes a ``DictionaryObject`` (keys as ``NameObject``, values
      converted recursively);
    * a list or tuple becomes an ``ArrayObject`` of converted items;
    * a string starting with "/" becomes a ``NameObject``, any other string
      a ``TextStringObject``;
    * an int or float becomes a ``FloatObject``;
    * None becomes a ``NullObject``.

    :raises NotImplementedError: for any other type.
    """
    if isinstance(obj, PdfObject):
        return obj
    if isinstance(obj, dict):
        to_add = DictionaryObject()
        for key, value in obj.items():
            to_add[NameObject(key)] = _pdf_objectify(value)
        return to_add
    if isinstance(obj, (list, tuple)):
        # tuples used to be rejected although they convert like lists
        arr = ArrayObject()
        for el in obj:
            arr.append(_pdf_objectify(el))
        return arr
    if isinstance(obj, str):
        return NameObject(obj) if obj.startswith("/") else TextStringObject(obj)
    if isinstance(obj, (int, float)):
        # ints are stored as FloatObject too (unchanged behaviour)
        return FloatObject(obj)
    if obj is None:
        # None used to be rejected; it maps to the PDF null object
        return NullObject()
    raise NotImplementedError(
        f"type(obj)={type(obj)} could not be casted to PdfObject"
    )


def _create_outline_item(
    action_ref: Union[None, IndirectObject],
    title: str,
    color: Union[Tuple[float, float, float], str, None],
    italic: bool,
    bold: bool,
) -> TreeObject:
    """
    Build the ``TreeObject`` for one outline item.

    :param action_ref: stored under ``/A`` when not None.
    :param title: stored under ``/Title``.
    :param color: three components, or a string converted with
        ``hex_to_rgb``; stored under ``/C`` with each component quantized to
        five decimal places. A falsy value adds no ``/C`` entry.
    :param italic: adds 1 to the ``/F`` flag value.
    :param bold: adds 2 to the ``/F`` flag value.
    :return: the new item; ``/F`` is only present if italic or bold is set.
    """
    outline_item = TreeObject()
    if action_ref is not None:
        outline_item[NameObject("/A")] = action_ref
    outline_item.update(
        {
            NameObject("/Title"): create_string_object(title),
        }
    )
    if color:
        if isinstance(color, str):
            color = hex_to_rgb(color)
        prec = decimal.Decimal("1.00000")
        outline_item.update(
            {
                NameObject("/C"): ArrayObject(
                    [FloatObject(decimal.Decimal(c).quantize(prec)) for c in color]
                )
            }
        )
    if italic or bold:
        format_flag = (1 if italic else 0) + (2 if bold else 0)
        outline_item.update({NameObject("/F"): NumberObject(format_flag)})
    return outline_item


class PdfFileWriter(PdfWriter):  # pragma: no cover
    """Legacy name of ``PdfWriter``; reports the deprecation when created."""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        deprecation_with_replacement("PdfFileWriter", "PdfWriter", "3.0.0")
        super().__init__(*args, **kwargs)