about | summary | refs | log | tree | commit | diff
path: root/.venv/lib/python3.12/site-packages/pypdf/_merger.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/_merger.py')
-rw-r--r-- .venv/lib/python3.12/site-packages/pypdf/_merger.py 678
1 files changed, 678 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_merger.py b/.venv/lib/python3.12/site-packages/pypdf/_merger.py
new file mode 100644
index 00000000..7176a1ad
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_merger.py
@@ -0,0 +1,678 @@
+# Copyright (c) 2006, Mathieu Fenniak
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+from io import BytesIO, FileIO, IOBase
+from pathlib import Path
+from types import TracebackType
+from typing import (
+ Any,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Tuple,
+ Type,
+ Union,
+ cast,
+)
+
+from ._encryption import Encryption
+from ._page import PageObject
+from ._reader import PdfReader
+from ._utils import (
+ StrByteType,
+ deprecate_with_replacement,
+ str_,
+)
+from ._writer import PdfWriter
+from .constants import GoToActionArguments, TypArguments, TypFitArguments
+from .constants import PagesAttributes as PA
+from .generic import (
+ PAGE_FIT,
+ ArrayObject,
+ Destination,
+ DictionaryObject,
+ Fit,
+ FloatObject,
+ IndirectObject,
+ NameObject,
+ NullObject,
+ NumberObject,
+ OutlineItem,
+ TextStringObject,
+ TreeObject,
+)
+from .pagerange import PageRange, PageRangeSpec
+from .types import LayoutType, OutlineType, PagemodeType
+
+ERR_CLOSED_WRITER = "close() was called and thus the writer cannot be used anymore"
+
+
class _MergedPage:
    """Bookkeeping record for one source page that is queued for merging."""

    def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None:
        """
        Store the page together with its origin and merge-wide identifier.

        Args:
            pagedata: The page taken from the source document.
            src: The reader the page was taken from.
            id: Identifier of this page within the merge. The name shadows
                the builtin but is part of the constructor signature.
        """
        self.id = id
        self.pagedata = pagedata
        self.src = src
        # No output page exists yet; PdfMerger.write() assigns the reference
        # of the page inside the output writer here.
        self.out_pagedata = None
+
+
class PdfMerger:
    """
    Use :class:`PdfWriter` instead.

    Collects pages, outline items and named destinations from several source
    documents and writes them into a single :class:`PdfWriter` output.

    .. deprecated:: 5.0.0
    """

    def __init__(
        self, strict: bool = False, fileobj: Union[Path, StrByteType] = ""
    ) -> None:
        """
        Create an empty merger backed by a fresh :class:`PdfWriter`.

        Args:
            strict: Forwarded as ``strict`` to every :class:`PdfReader`
                created by :meth:`merge`.
            fileobj: Output target used when the merger is used as a context
                manager; if truthy, the result is written to it on exit.
        """
        deprecate_with_replacement("PdfMerger", "PdfWriter", "5.0.0")
        # (stream, reader) pairs; the streams are closed again in close().
        self.inputs: List[Tuple[Any, PdfReader]] = []
        # _MergedPage records in output order.
        self.pages: List[Any] = []
        # Set to None by close(); every public method checks for that.
        self.output: Optional[PdfWriter] = PdfWriter()
        self.outline: OutlineType = []
        self.named_dests: List[Any] = []
        # Next identifier handed out to a _MergedPage.
        self.id_count = 0
        self.fileobj = fileobj
        self.strict = strict

    def __enter__(self) -> "PdfMerger":
        """Return the merger itself; there is nothing to set up."""
        deprecate_with_replacement("PdfMerger", "PdfWriter", "5.0.0")
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        """Write to the fileobj and close the merger."""
        try:
            if self.fileobj:
                self.write(self.fileobj)
        finally:
            # Release the input streams even when writing fails.
            self.close()

    def merge(
        self,
        page_number: int,
        fileobj: Union[Path, StrByteType, PdfReader],
        outline_item: Optional[str] = None,
        pages: Optional[PageRangeSpec] = None,
        import_outline: bool = True,
    ) -> None:
        """
        Merge the pages from the given file into the output file at the
        specified page number.

        Args:
            page_number: The *page number* to insert this file. File will
                be inserted after the given number.
            fileobj: A File Object or an object that supports the standard
                read and seek methods similar to a File Object. Could also be a
                string representing a path to a PDF file.
            outline_item: Optionally, you may specify an outline item
                (previously referred to as a 'bookmark') to be applied at the
                beginning of the included file by supplying the text of the outline item.
            pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`
                or a ``(start, stop[, step])`` tuple
                to merge only the specified range of pages from the source
                document into the output document.
                Can also be a list of pages to merge.
            import_outline: You may prevent the source document's
                outline (collection of outline items, previously referred to as
                'bookmarks') from being imported by specifying this as ``False``.

        Raises:
            TypeError: If ``pages`` is not ``None``, a ``PageRange``, a list
                or a tuple.
        """
        stream, encryption_obj = self._create_stream(fileobj)

        # Create a new PdfReader instance using the stream created above.
        # The stream is not tracked in self.inputs yet, so close it here if
        # the reader cannot be constructed.
        try:
            reader = PdfReader(stream, strict=self.strict)  # type: ignore[arg-type]
        except Exception:
            stream.close()
            raise
        self.inputs.append((stream, reader))
        if encryption_obj is not None:
            reader._encryption = encryption_obj

        # Find the range of pages to merge.
        if pages is None:
            pages = (0, len(reader.pages))
        elif isinstance(pages, PageRange):
            pages = pages.indices(len(reader.pages))
        elif not isinstance(pages, (list, tuple)):
            raise TypeError('"pages" must be a tuple of (start, stop[, step])')

        # A list holds explicit page indices (this is how _trim_dests and
        # _trim_outline read it); a tuple holds range() arguments.
        page_indices = pages if isinstance(pages, list) else range(*pages)

        outline = []
        if import_outline:
            outline = self._trim_outline(reader, reader.outline, pages)

        if outline_item:
            # The new item points at the first page gathered below, which
            # receives the current id_count as its identifier.
            outline_item_typ = OutlineItem(
                TextStringObject(outline_item),
                NumberObject(self.id_count),
                Fit.fit(),
            )
            self.outline += [outline_item_typ, outline]  # type: ignore
        else:
            self.outline += outline

        dests = reader.named_destinations
        trimmed_dests = self._trim_dests(reader, dests, pages)
        self.named_dests += trimmed_dests

        # Gather all the pages that are going to be merged
        srcpages = []
        for i in page_indices:
            srcpages.append(_MergedPage(reader.pages[i], reader, self.id_count))
            self.id_count += 1

        self._associate_dests_to_pages(srcpages)
        self._associate_outline_items_to_pages(srcpages)

        # Slice to insert the pages at the specified page_number
        self.pages[page_number:page_number] = srcpages

    def _create_stream(
        self, fileobj: Union[Path, StrByteType, PdfReader]
    ) -> Tuple[IOBase, Optional[Encryption]]:
        """
        Turn ``fileobj`` into a stream a :class:`PdfReader` can be built on.

        Args:
            fileobj: A path (``str`` or ``Path``), a :class:`PdfReader`, or an
                object providing ``seek`` and ``read``.

        Returns:
            The stream and, when ``fileobj`` is a reader with a truthy
            ``_encryption`` attribute, that encryption object (else ``None``).

        Raises:
            NotImplementedError: If ``fileobj`` is none of the supported kinds.
        """
        # A string or Path is assumed to be a path and opened for reading.
        # A PdfReader has its stream copied into a BytesIO; a file-like
        # object has its content copied into a BytesIO.
        # Anything else is rejected.
        encryption_obj = None
        stream: IOBase
        if isinstance(fileobj, (str, Path)):
            stream = FileIO(fileobj, "rb")
        elif isinstance(fileobj, PdfReader):
            if fileobj._encryption:
                encryption_obj = fileobj._encryption
            orig_tell = fileobj.stream.tell()
            fileobj.stream.seek(0)
            stream = BytesIO(fileobj.stream.read())

            # reset the stream to its original location
            fileobj.stream.seek(orig_tell)
        elif hasattr(fileobj, "seek") and hasattr(fileobj, "read"):
            fileobj.seek(0)
            file_content = fileobj.read()
            stream = BytesIO(file_content)
        else:
            raise NotImplementedError(
                "PdfMerger.merge requires an object that PdfReader can parse. "
                "Typically, that is a Path or a string representing a Path, "
                "a file object, or an object implementing .seek and .read. "
                "Passing a PdfReader directly works as well."
            )
        return stream, encryption_obj

    def append(
        self,
        fileobj: Union[StrByteType, PdfReader, Path],
        outline_item: Optional[str] = None,
        pages: Union[
            None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]
        ] = None,
        import_outline: bool = True,
    ) -> None:
        """
        Identical to the :meth:`merge()<merge>` method, but assumes you want to
        concatenate all pages onto the end of the file instead of specifying a
        position.

        Args:
            fileobj: A File Object or an object that supports the standard
                read and seek methods similar to a File Object. Could also be a
                string representing a path to a PDF file.
            outline_item: Optionally, you may specify an outline item
                (previously referred to as a 'bookmark') to be applied at the
                beginning of the included file by supplying the text of the outline item.
            pages: can be a :class:`PageRange<pypdf.pagerange.PageRange>`
                or a ``(start, stop[, step])`` tuple
                to merge only the specified range of pages from the source
                document into the output document.
                Can also be a list of pages to append.
            import_outline: You may prevent the source document's
                outline (collection of outline items, previously referred to as
                'bookmarks') from being imported by specifying this as ``False``.
        """
        self.merge(len(self.pages), fileobj, outline_item, pages, import_outline)

    def write(self, fileobj: Union[Path, StrByteType]) -> None:
        """
        Write all data that has been merged to the given output file.

        Args:
            fileobj: Output file. Can be a filename or any kind of
                file-like object.

        Raises:
            RuntimeError: If :meth:`close` has already been called.
        """
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)

        # Add each page to the PdfWriter and remember a reference to the page
        # object that was just appended (the last kid of the page tree).
        for page in self.pages:
            self.output.add_page(page.pagedata)
            pages_obj = cast(Dict[str, Any], self.output._pages.get_object())
            page.out_pagedata = self.output.get_reference(
                pages_obj[PA.KIDS][-1].get_object()
            )

        # Once all pages are added, create outline items to point at those pages
        self._write_dests()
        self._write_outline()

        # Write the output to the file
        my_file, ret_fileobj = self.output.write(fileobj)

        # Close the returned file object only when the writer flags it.
        if my_file:
            ret_fileobj.close()

    def close(self) -> None:
        """Shut all file descriptors (input and output) and clear all memory usage."""
        self.pages = []
        for file_descriptor, _reader in self.inputs:
            file_descriptor.close()

        self.inputs = []
        self.output = None

    def add_metadata(self, infos: Dict[str, Any]) -> None:
        """
        Add custom metadata to the output.

        Args:
            infos: a Python dictionary where each key is a field
                and each value is your new metadata.
                An example is ``{'/Title': 'My title'}``

        Raises:
            RuntimeError: If :meth:`close` has already been called.
        """
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        self.output.add_metadata(infos)

    def set_page_layout(self, layout: LayoutType) -> None:
        """
        Set the page layout.

        Args:
            layout: The page layout to be used

        .. list-table:: Valid ``layout`` arguments
           :widths: 50 200

           * - /NoLayout
             - Layout explicitly not specified
           * - /SinglePage
             - Show one page at a time
           * - /OneColumn
             - Show one column at a time
           * - /TwoColumnLeft
             - Show pages in two columns, odd-numbered pages on the left
           * - /TwoColumnRight
             - Show pages in two columns, odd-numbered pages on the right
           * - /TwoPageLeft
             - Show two pages at a time, odd-numbered pages on the left
           * - /TwoPageRight
             - Show two pages at a time, odd-numbered pages on the right

        Raises:
            RuntimeError: If :meth:`close` has already been called.
        """
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        self.output._set_page_layout(layout)

    def set_page_mode(self, mode: PagemodeType) -> None:
        """
        Set the page mode.

        Args:
            mode: The page mode to use.

        .. list-table:: Valid ``mode`` arguments
           :widths: 50 200

           * - /UseNone
             - Do not show outline or thumbnails panels
           * - /UseOutlines
             - Show outline (aka bookmarks) panel
           * - /UseThumbs
             - Show page thumbnails panel
           * - /FullScreen
             - Fullscreen view
           * - /UseOC
             - Show Optional Content Group (OCG) panel
           * - /UseAttachments
             - Show attachments panel
        """
        self.page_mode = mode

    @property
    def page_mode(self) -> Optional[PagemodeType]:
        """
        Page mode of the output writer.

        Reading returns the writer's current page mode; assigning sets it.

        .. list-table:: Valid ``mode`` arguments
           :widths: 50 200

           * - /UseNone
             - Do not show outline or thumbnails panels
           * - /UseOutlines
             - Show outline (aka bookmarks) panel
           * - /UseThumbs
             - Show page thumbnails panel
           * - /FullScreen
             - Fullscreen view
           * - /UseOC
             - Show Optional Content Group (OCG) panel
           * - /UseAttachments
             - Show attachments panel

        Raises:
            RuntimeError: If :meth:`close` has already been called.
        """
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        return self.output.page_mode

    @page_mode.setter
    def page_mode(self, mode: PagemodeType) -> None:
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        self.output.page_mode = mode

    def _trim_dests(
        self,
        pdf: PdfReader,
        dests: Dict[str, Dict[str, Any]],
        pages: Union[Tuple[int, int], Tuple[int, int, int], List[int]],
    ) -> List[Dict[str, Any]]:
        """
        Remove named destinations that are not a part of the specified page set.

        Args:
            pdf: Reader the destinations belong to.
            dests: Named destinations of ``pdf``, keyed by name.
            pages: Either a list of page indices or ``range()`` arguments.

        Returns:
            The destinations whose ``/Page`` is one of the selected pages.
            Each kept destination has its ``/Page`` entry replaced by the
            resolved page object.
        """
        new_dests = []
        lst = pages if isinstance(pages, list) else list(range(*pages))
        for key, obj in dests.items():
            for j in lst:
                if pdf.pages[j].get_object() == obj["/Page"].get_object():
                    obj[NameObject("/Page")] = obj["/Page"].get_object()
                    assert str_(key) == str_(obj["/Title"])
                    new_dests.append(obj)
                    break
        return new_dests

    def _trim_outline(
        self,
        pdf: PdfReader,
        outline: OutlineType,
        pages: Union[Tuple[int, int], Tuple[int, int, int], List[int]],
    ) -> OutlineType:
        """
        Remove outline item entries that are not a part of the specified page set.

        Args:
            pdf: Reader the outline belongs to.
            outline: Outline to filter; nested lists are sub-outlines of the
                item preceding them.
            pages: Either a list of page indices or ``range()`` arguments.

        Returns:
            An outline type
        """
        new_outline = []
        # Tracks whether the item preceding a nested list was kept, so that a
        # dropped parent is re-added when any of its children survive.
        prev_header_added = True
        lst = pages if isinstance(pages, list) else list(range(*pages))
        for i, outline_item in enumerate(outline):
            if isinstance(outline_item, list):
                sub = self._trim_outline(pdf, outline_item, lst)  # type: ignore
                if sub:
                    if not prev_header_added:
                        new_outline.append(outline[i - 1])
                    new_outline.append(sub)  # type: ignore
            else:
                prev_header_added = False
                for j in lst:
                    if outline_item["/Page"] is None:
                        continue
                    if pdf.pages[j].get_object() == outline_item["/Page"].get_object():
                        outline_item[NameObject("/Page")] = outline_item[
                            "/Page"
                        ].get_object()
                        new_outline.append(outline_item)
                        prev_header_added = True
                        break
        return new_outline

    def _write_dests(self) -> None:
        """
        Add the collected named destinations to the output writer.

        Raises:
            RuntimeError: If :meth:`close` has already been called.
        """
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        for named_dest in self.named_dests:
            page_index = None
            if "/Page" in named_dest:  # deprecated
                for page_index, page in enumerate(self.pages):  # noqa: B007
                    if page.id == named_dest["/Page"]:
                        named_dest[NameObject("/Page")] = page.out_pagedata
                        break

            # NOTE(review): page_index is the enumerate() loop variable, so it
            # is non-None whenever self.pages is non-empty, whether or not a
            # matching page was found. Kept as-is to preserve behaviour.
            if page_index is not None:  # deprecated
                self.output.add_named_destination_object(named_dest)

    def _write_outline(
        self,
        outline: Optional[Iterable[OutlineItem]] = None,
        parent: Optional[TreeObject] = None,
    ) -> None:
        """
        Add outline items to the output writer, recursing into nested lists.

        Args:
            outline: Items to write; defaults to ``self.outline``.
            parent: Outline node the items are added under.

        Raises:
            RuntimeError: If :meth:`close` has already been called.
        """
        if self.output is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        if outline is None:
            outline = self.outline  # type: ignore
        assert outline is not None, "hint for mypy"  # TODO: is that true?

        last_added = None
        for outline_item in outline:
            if isinstance(outline_item, list):
                # A nested list holds the children of the item added last.
                self._write_outline(outline_item, last_added)
                continue

            page_no = None
            if "/Page" in outline_item:
                for page_no, page in enumerate(self.pages):  # noqa: B007
                    if page.id == outline_item["/Page"]:
                        self._write_outline_item_on_page(outline_item, page)
                        break
            # NOTE(review): page_no is the enumerate() loop variable, so it is
            # non-None whenever self.pages is non-empty, whether or not a
            # matching page was found. Kept as-is to preserve behaviour.
            if page_no is not None:
                del outline_item["/Page"], outline_item["/Type"]
                last_added = self.output.add_outline_item_dict(outline_item, parent)

    def _write_outline_item_on_page(
        self, outline_item: Union[OutlineItem, Destination], page: _MergedPage
    ) -> None:
        """
        Attach a ``/GoTo`` action targeting ``page`` to ``outline_item``.

        The fit arguments belonging to the item's ``/Type`` are moved out of
        the item into the action's destination array; an argument that is
        absent or a ``NullObject`` is written as ``0``.

        Args:
            outline_item: Item to update in place.
            page: Merged page the item points at.
        """
        oi_type = cast(str, outline_item["/Type"])
        args = [NumberObject(page.id), NameObject(oi_type)]
        fit2arg_keys: Dict[str, Tuple[str, ...]] = {
            TypFitArguments.FIT_H: (TypArguments.TOP,),
            TypFitArguments.FIT_BH: (TypArguments.TOP,),
            TypFitArguments.FIT_V: (TypArguments.LEFT,),
            TypFitArguments.FIT_BV: (TypArguments.LEFT,),
            TypFitArguments.XYZ: (TypArguments.LEFT, TypArguments.TOP, "/Zoom"),
            TypFitArguments.FIT_R: (
                TypArguments.LEFT,
                TypArguments.BOTTOM,
                TypArguments.RIGHT,
                TypArguments.TOP,
            ),
        }
        for arg_key in fit2arg_keys.get(oi_type, ()):
            if arg_key in outline_item and not isinstance(
                outline_item[arg_key], NullObject
            ):
                args.append(FloatObject(outline_item[arg_key]))
            else:
                args.append(FloatObject(0))
            # pop() instead of del: the key may be absent, in which case the
            # default of 0 was appended above and there is nothing to remove.
            outline_item.pop(arg_key, None)

        outline_item[NameObject("/A")] = DictionaryObject(
            {
                NameObject(GoToActionArguments.S): NameObject("/GoTo"),
                NameObject(GoToActionArguments.D): ArrayObject(args),
            }
        )

    def _associate_dests_to_pages(self, pages: List[_MergedPage]) -> None:
        """
        Replace each named destination's ``/Page`` object by a merged-page id.

        Destinations whose ``/Page`` is already a ``NumberObject`` are skipped.

        Args:
            pages: Merged pages to match against.

        Raises:
            ValueError: If a destination's page is not among ``pages``.
        """
        for named_dest in self.named_dests:
            page_index = None
            np = named_dest["/Page"]

            if isinstance(np, NumberObject):
                continue

            # No break: if several pages compare equal, the last one wins.
            for page in pages:
                if np.get_object() == page.pagedata.get_object():
                    page_index = page.id

            if page_index is None:  # deprecated
                raise ValueError(
                    f"Unresolved named destination '{named_dest['/Title']}'"
                )
            named_dest[NameObject("/Page")] = NumberObject(page_index)

    def _associate_outline_items_to_pages(
        self, pages: List[_MergedPage], outline: Optional[Iterable[OutlineItem]] = None
    ) -> None:
        """
        Replace each outline item's ``/Page`` object by a merged-page id.

        Items whose ``/Page`` is already a ``NumberObject`` are skipped, and
        items whose page is not among ``pages`` are left untouched.

        Args:
            pages: Merged pages to match against.
            outline: Items to process; defaults to ``self.outline``.
        """
        if outline is None:
            outline = self.outline  # type: ignore # TODO: self.bookmarks can be None!
        assert outline is not None, "hint for mypy"
        for outline_item in outline:
            if isinstance(outline_item, list):
                self._associate_outline_items_to_pages(pages, outline_item)
                continue

            page_index = None
            outline_item_page = outline_item["/Page"]

            if isinstance(outline_item_page, NumberObject):
                continue

            # No break: if several pages compare equal, the last one wins.
            for p in pages:
                if outline_item_page.get_object() == p.pagedata.get_object():
                    page_index = p.id

            if page_index is not None:
                outline_item[NameObject("/Page")] = NumberObject(page_index)

    def find_outline_item(
        self,
        outline_item: Dict[str, Any],
        root: Optional[OutlineType] = None,
    ) -> Optional[List[int]]:
        """
        Locate an outline item in the (nested) outline.

        Args:
            outline_item: The item itself, or a value compared against each
                item's ``/Title``.
            root: Outline to search; defaults to ``self.outline``.

        Returns:
            The list of indices leading to the first match, one per nesting
            level, or ``None`` if nothing matches.
        """
        if root is None:
            root = self.outline

        for i, oi_enum in enumerate(root):
            if isinstance(oi_enum, list):
                # oi_enum is still an inner node
                # (OutlineType, if recursive types were supported by mypy)
                res = self.find_outline_item(outline_item, oi_enum)  # type: ignore
                if res:  # deprecated
                    return [i] + res
            elif (
                oi_enum == outline_item
                or cast(Dict[Any, Any], oi_enum["/Title"]) == outline_item
            ):
                # we found a leaf node
                return [i]

        return None

    def add_outline_item(
        self,
        title: str,
        page_number: int,
        parent: Union[None, TreeObject, IndirectObject] = None,
        color: Optional[Tuple[float, float, float]] = None,
        bold: bool = False,
        italic: bool = False,
        fit: Fit = PAGE_FIT,
    ) -> IndirectObject:
        """
        Add an outline item (commonly referred to as a "Bookmark") to this PDF file.

        Args:
            title: Title to use for this outline item.
            page_number: Page number this outline item will point to.
            parent: A reference to a parent outline item to create nested
                outline items.
            color: Color of the outline item's font as a red, green, blue tuple
                from 0.0 to 1.0
            bold: Outline item font is bold
            italic: Outline item font is italic
            fit: The fit of the destination page.

        Returns:
            Whatever the writer's ``add_outline_item`` returns for the new item.

        Raises:
            RuntimeError: If :meth:`close` has already been called.
        """
        writer = self.output
        if writer is None:
            raise RuntimeError(ERR_CLOSED_WRITER)
        return writer.add_outline_item(
            title,
            page_number,
            parent,
            None,
            color,
            bold,
            italic,
            fit,
        )

    def add_named_destination(
        self,
        title: str,
        page_number: int,
    ) -> None:
        """
        Add a destination to the output.

        The destination is queued in ``self.named_dests`` with a
        fit-horizontally view at ``top=826``.

        Args:
            title: Title to use
            page_number: Page number this destination points at.
        """
        dest = Destination(
            TextStringObject(title),
            NumberObject(page_number),
            Fit.fit_horizontally(top=826),
        )
        self.named_dests.append(dest)