diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_doc_common.py | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/_doc_common.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/pypdf/_doc_common.py | 1365 |
1 files changed, 1365 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_doc_common.py b/.venv/lib/python3.12/site-packages/pypdf/_doc_common.py new file mode 100644 index 00000000..d4c5c43c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/_doc_common.py @@ -0,0 +1,1365 @@ +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com> +# Copyright (c) 2024, Pubpub-ZZ +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +import struct +import zlib +from abc import abstractmethod +from datetime import datetime +from typing import ( + Any, + Dict, + Iterable, + Iterator, + List, + Mapping, + Optional, + Tuple, + Union, + cast, +) + +from ._encryption import Encryption +from ._page import PageObject, _VirtualList +from ._page_labels import index2label as page_index2page_label +from ._utils import ( + b_, + deprecate_with_replacement, + logger_warning, + parse_iso8824_date, +) +from .constants import CatalogAttributes as CA +from .constants import CatalogDictionary as CD +from .constants import ( + CheckboxRadioButtonAttributes, + GoToActionArguments, + UserAccessPermissions, +) +from .constants import Core as CO +from .constants import DocumentInformationAttributes as DI +from .constants import FieldDictionaryAttributes as FA +from .constants import PageAttributes as PG +from .constants import PagesAttributes as PA +from .errors import ( + PdfReadError, +) +from .generic import ( + ArrayObject, + BooleanObject, + ByteStringObject, + Destination, + DictionaryObject, + EncodedStreamObject, + Field, + Fit, + FloatObject, + IndirectObject, + NameObject, + NullObject, + NumberObject, + PdfObject, + TextStringObject, + TreeObject, + ViewerPreferences, + create_string_object, +) +from .types import OutlineType, PagemodeType +from .xmp import XmpInformation + + +def convert_to_int(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]: + if size > 8: + raise PdfReadError("invalid size in convert_to_int") + d = b"\x00\x00\x00\x00\x00\x00\x00\x00" + d + d = d[-8:] + return struct.unpack(">q", d)[0] + + +class DocumentInformation(DictionaryObject): + """ + A class representing the basic document metadata provided in a PDF File. + This class is accessible through + :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`. + + All text properties of the document metadata have + *two* properties, e.g. author and author_raw. 
The non-raw property will + always return a ``TextStringObject``, making it ideal for a case where the + metadata is being displayed. The raw property can sometimes return a + ``ByteStringObject``, if pypdf was unable to decode the string's text + encoding; this requires additional safety in the caller and therefore is not + as commonly accessed. + """ + + def __init__(self) -> None: + DictionaryObject.__init__(self) + + def _get_text(self, key: str) -> Optional[str]: + retval = self.get(key, None) + if isinstance(retval, TextStringObject): + return retval + return None + + @property + def title(self) -> Optional[str]: + """ + Read-only property accessing the document's title. + + Returns a ``TextStringObject`` or ``None`` if the title is not + specified. + """ + return ( + self._get_text(DI.TITLE) or self.get(DI.TITLE).get_object() # type: ignore + if self.get(DI.TITLE) + else None + ) + + @property + def title_raw(self) -> Optional[str]: + """The "raw" version of title; can return a ``ByteStringObject``.""" + return self.get(DI.TITLE) + + @property + def author(self) -> Optional[str]: + """ + Read-only property accessing the document's author. + + Returns a ``TextStringObject`` or ``None`` if the author is not + specified. + """ + return self._get_text(DI.AUTHOR) + + @property + def author_raw(self) -> Optional[str]: + """The "raw" version of author; can return a ``ByteStringObject``.""" + return self.get(DI.AUTHOR) + + @property + def subject(self) -> Optional[str]: + """ + Read-only property accessing the document's subject. + + Returns a ``TextStringObject`` or ``None`` if the subject is not + specified. + """ + return self._get_text(DI.SUBJECT) + + @property + def subject_raw(self) -> Optional[str]: + """The "raw" version of subject; can return a ``ByteStringObject``.""" + return self.get(DI.SUBJECT) + + @property + def creator(self) -> Optional[str]: + """ + Read-only property accessing the document's creator. 
+ + If the document was converted to PDF from another format, this is the + name of the application (e.g. OpenOffice) that created the original + document from which it was converted. Returns a ``TextStringObject`` or + ``None`` if the creator is not specified. + """ + return self._get_text(DI.CREATOR) + + @property + def creator_raw(self) -> Optional[str]: + """The "raw" version of creator; can return a ``ByteStringObject``.""" + return self.get(DI.CREATOR) + + @property + def producer(self) -> Optional[str]: + """ + Read-only property accessing the document's producer. + + If the document was converted to PDF from another format, this is the + name of the application (for example, macOS Quartz) that converted it to + PDF. Returns a ``TextStringObject`` or ``None`` if the producer is not + specified. + """ + return self._get_text(DI.PRODUCER) + + @property + def producer_raw(self) -> Optional[str]: + """The "raw" version of producer; can return a ``ByteStringObject``.""" + return self.get(DI.PRODUCER) + + @property + def creation_date(self) -> Optional[datetime]: + """Read-only property accessing the document's creation date.""" + return parse_iso8824_date(self._get_text(DI.CREATION_DATE)) + + @property + def creation_date_raw(self) -> Optional[str]: + """ + The "raw" version of creation date; can return a ``ByteStringObject``. + + Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix + is the offset from UTC. + """ + return self.get(DI.CREATION_DATE) + + @property + def modification_date(self) -> Optional[datetime]: + """ + Read-only property accessing the document's modification date. + + The date and time the document was most recently modified. + """ + return parse_iso8824_date(self._get_text(DI.MOD_DATE)) + + @property + def modification_date_raw(self) -> Optional[str]: + """ + The "raw" version of modification date; can return a + ``ByteStringObject``. 
+ + Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix + is the offset from UTC. + """ + return self.get(DI.MOD_DATE) + + +class PdfDocCommon: + """ + Common functions from PdfWriter and PdfReader objects. + + This root class is strongly abstracted. + """ + + strict: bool = False # default + + _encryption: Optional[Encryption] = None + + @property + @abstractmethod + def root_object(self) -> DictionaryObject: + ... # pragma: no cover + + @property + @abstractmethod + def pdf_header(self) -> str: + ... # pragma: no cover + + @abstractmethod + def get_object( + self, indirect_reference: Union[int, IndirectObject] + ) -> Optional[PdfObject]: + ... # pragma: no cover + + @abstractmethod + def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject: + ... # pragma: no cover + + @property + @abstractmethod + def _info(self) -> Optional[DictionaryObject]: + ... # pragma: no cover + + @property + def metadata(self) -> Optional[DocumentInformation]: + """ + Retrieve the PDF file's document information dictionary, if it exists. + + Note that some PDF files use metadata streams instead of document + information dictionaries, and these metadata streams will not be + accessed by this function. + """ + retval = DocumentInformation() + if self._info is None: + return None + retval.update(self._info) + return retval + + @property + def xmp_metadata(self) -> Optional[XmpInformation]: + ... # pragma: no cover + + @abstractmethod + def _repr_mimebundle_( + self, + include: Union[None, Iterable[str]] = None, + exclude: Union[None, Iterable[str]] = None, + ) -> Dict[str, Any]: + """ + Integration into Jupyter Notebooks. + + This method returns a dictionary that maps a mime-type to its + representation. + + See https://ipython.readthedocs.io/en/stable/config/integrating.html + """ + ... 
# pragma: no cover + + @property + def viewer_preferences(self) -> Optional[ViewerPreferences]: + """Returns the existing ViewerPreferences as an overloaded dictionary.""" + o = self.root_object.get(CD.VIEWER_PREFERENCES, None) + if o is None: + return None + o = o.get_object() + if not isinstance(o, ViewerPreferences): + o = ViewerPreferences(o) + if hasattr(o, "indirect_reference"): + self._replace_object(o.indirect_reference, o) + else: + self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = o + return o + + flattened_pages: Optional[List[PageObject]] = None + + def get_num_pages(self) -> int: + """ + Calculate the number of pages in this PDF file. + + Returns: + The number of pages of the parsed PDF file. + + Raises: + PdfReadError: if file is encrypted and restrictions prevent + this action. + """ + # Flattened pages will not work on an encrypted PDF; + # the PDF file's page count is used in this case. Otherwise, + # the original method (flattened page count) is used. + if self.is_encrypted: + return self.root_object["/Pages"]["/Count"] # type: ignore + else: + if self.flattened_pages is None: + self._flatten() + assert self.flattened_pages is not None + return len(self.flattened_pages) + + def get_page(self, page_number: int) -> PageObject: + """ + Retrieve a page by number from this PDF file. + Most of the time ``.pages[page_number]`` is preferred. + + Args: + page_number: The page number to retrieve + (pages begin at zero) + + Returns: + A :class:`PageObject<pypdf._page.PageObject>` instance. 
+ """ + if self.flattened_pages is None: + self._flatten() + assert self.flattened_pages is not None, "hint for mypy" + return self.flattened_pages[page_number] + + @property + def named_destinations(self) -> Dict[str, Any]: + """ + A read-only dictionary which maps names to + :class:`Destinations<pypdf.generic.Destination>` + """ + return self._get_named_destinations() + + def get_named_dest_root(self) -> ArrayObject: + named_dest = ArrayObject() + if CA.NAMES in self.root_object and isinstance( + self.root_object[CA.NAMES], DictionaryObject + ): + names = cast(DictionaryObject, self.root_object[CA.NAMES]) + names_ref = names.indirect_reference + if CA.DESTS in names and isinstance(names[CA.DESTS], DictionaryObject): + # 3.6.3 Name Dictionary (PDF spec 1.7) + dests = cast(DictionaryObject, names[CA.DESTS]) + dests_ref = dests.indirect_reference + if CA.NAMES in dests: + # §7.9.6, entries in a name tree node dictionary + named_dest = cast(ArrayObject, dests[CA.NAMES]) + else: + named_dest = ArrayObject() + dests[NameObject(CA.NAMES)] = named_dest + elif hasattr(self, "_add_object"): + dests = DictionaryObject() + dests_ref = self._add_object(dests) + names[NameObject(CA.DESTS)] = dests_ref + dests[NameObject(CA.NAMES)] = named_dest + + elif hasattr(self, "_add_object"): + names = DictionaryObject() + names_ref = self._add_object(names) + self.root_object[NameObject(CA.NAMES)] = names_ref + dests = DictionaryObject() + dests_ref = self._add_object(dests) + names[NameObject(CA.DESTS)] = dests_ref + dests[NameObject(CA.NAMES)] = named_dest + + return named_dest + + ## common + def _get_named_destinations( + self, + tree: Union[TreeObject, None] = None, + retval: Optional[Any] = None, + ) -> Dict[str, Any]: + """ + Retrieve the named destinations present in the document. + + Args: + tree: + retval: + + Returns: + A dictionary which maps names to + :class:`Destinations<pypdf.generic.Destination>`. 
+ """ + if retval is None: + retval = {} + catalog = self.root_object + + # get the name tree + if CA.DESTS in catalog: + tree = cast(TreeObject, catalog[CA.DESTS]) + elif CA.NAMES in catalog: + names = cast(DictionaryObject, catalog[CA.NAMES]) + if CA.DESTS in names: + tree = cast(TreeObject, names[CA.DESTS]) + + if tree is None: + return retval + + if PA.KIDS in tree: + # recurse down the tree + for kid in cast(ArrayObject, tree[PA.KIDS]): + self._get_named_destinations(kid.get_object(), retval) + # §7.9.6, entries in a name tree node dictionary + elif CA.NAMES in tree: # /Kids and /Names are exclusives (§7.9.6) + names = cast(DictionaryObject, tree[CA.NAMES]) + i = 0 + while i < len(names): + key = cast(str, names[i].get_object()) + i += 1 + if not isinstance(key, str): + continue + try: + value = names[i].get_object() + except IndexError: + break + i += 1 + if isinstance(value, DictionaryObject): + if "/D" in value: + value = value["/D"] + else: + continue + dest = self._build_destination(key, value) # type: ignore + if dest is not None: + retval[key] = dest + else: # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1) + for k__, v__ in tree.items(): + val = v__.get_object() + if isinstance(val, DictionaryObject): + if "/D" in val: + val = val["/D"].get_object() + else: + continue + dest = self._build_destination(k__, val) + if dest is not None: + retval[k__] = dest + return retval + + # A select group of relevant field attributes. For the complete list. + # See §12.3.2 of the PDF 1.7 or PDF 2.0 specification. + + def get_fields( + self, + tree: Optional[TreeObject] = None, + retval: Optional[Dict[Any, Any]] = None, + fileobj: Optional[Any] = None, + stack: Optional[List[PdfObject]] = None, + ) -> Optional[Dict[str, Any]]: + """ + Extract field data if this PDF contains interactive form fields. + + The *tree*, *retval*, *stack* parameters are for recursive use. + + Args: + tree: Current object to parse. + retval: In-progress list of fields. 
+ fileobj: A file object (usually a text file) to write + a report to on all interactive form fields found. + stack: List of already parsed objects. + + Returns: + A dictionary where each key is a field name, and each + value is a :class:`Field<pypdf.generic.Field>` object. By + default, the mapping name is used for keys. + ``None`` if form data could not be located. + """ + field_attributes = FA.attributes_dict() + field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict()) + if retval is None: + retval = {} + catalog = self.root_object + stack = [] + # get the AcroForm tree + if CD.ACRO_FORM in catalog: + tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM]) + else: + return None + if tree is None: + return retval + assert stack is not None + if "/Fields" in tree: + fields = cast(ArrayObject, tree["/Fields"]) + for f in fields: + field = f.get_object() + self._build_field(field, retval, fileobj, field_attributes, stack) + elif any(attr in tree for attr in field_attributes): + # Tree is a field + self._build_field(tree, retval, fileobj, field_attributes, stack) + return retval + + def _get_qualified_field_name(self, parent: DictionaryObject) -> str: + if "/TM" in parent: + return cast(str, parent["/TM"]) + elif "/Parent" in parent: + return ( + self._get_qualified_field_name( + cast(DictionaryObject, parent["/Parent"]) + ) + + "." 
+ + cast(str, parent.get("/T", "")) + ) + else: + return cast(str, parent.get("/T", "")) + + def _build_field( + self, + field: Union[TreeObject, DictionaryObject], + retval: Dict[Any, Any], + fileobj: Any, + field_attributes: Any, + stack: List[PdfObject], + ) -> None: + if all(attr not in field for attr in ("/T", "/TM")): + return + key = self._get_qualified_field_name(field) + if fileobj: + self._write_field(fileobj, field, field_attributes) + fileobj.write("\n") + retval[key] = Field(field) + obj = retval[key].indirect_reference.get_object() # to get the full object + if obj.get(FA.FT, "") == "/Ch": + retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)] + if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj: + # Checkbox + retval[key][NameObject("/_States_")] = ArrayObject( + list(obj["/AP"]["/N"].keys()) + ) + if "/Off" not in retval[key]["/_States_"]: + retval[key][NameObject("/_States_")].append(NameObject("/Off")) + elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0: + states: List[str] = [] + retval[key][NameObject("/_States_")] = ArrayObject(states) + for k in obj.get(FA.Kids, {}): + k = k.get_object() + for s in list(k["/AP"]["/N"].keys()): + if s not in states: + states.append(s) + retval[key][NameObject("/_States_")] = ArrayObject(states) + if ( + obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0 + and "/Off" in retval[key]["/_States_"] + ): + del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")] + # at last for order + self._check_kids(field, retval, fileobj, stack) + + def _check_kids( + self, + tree: Union[TreeObject, DictionaryObject], + retval: Any, + fileobj: Any, + stack: List[PdfObject], + ) -> None: + if tree in stack: + logger_warning( + f"{self._get_qualified_field_name(tree)} already parsed", __name__ + ) + return + stack.append(tree) + if PA.KIDS in tree: + # recurse down the tree + for kid in tree[PA.KIDS]: # type: ignore + kid = kid.get_object() + self.get_fields(kid, retval, fileobj, 
stack) + + def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None: + field_attributes_tuple = FA.attributes() + field_attributes_tuple = ( + field_attributes_tuple + CheckboxRadioButtonAttributes.attributes() + ) + + for attr in field_attributes_tuple: + if attr in ( + FA.Kids, + FA.AA, + ): + continue + attr_name = field_attributes[attr] + try: + if attr == FA.FT: + # Make the field type value more clear + types = { + "/Btn": "Button", + "/Tx": "Text", + "/Ch": "Choice", + "/Sig": "Signature", + } + if field[attr] in types: + fileobj.write(f"{attr_name}: {types[field[attr]]}\n") + elif attr == FA.Parent: + # Let's just write the name of the parent + try: + name = field[attr][FA.TM] + except KeyError: + name = field[attr][FA.T] + fileobj.write(f"{attr_name}: {name}\n") + else: + fileobj.write(f"{attr_name}: {field[attr]}\n") + except KeyError: + # Field attribute is N/A or unknown, so don't write anything + pass + + def get_form_text_fields(self, full_qualified_name: bool = False) -> Dict[str, Any]: + """ + Retrieve form fields from the document with textual data. + + Args: + full_qualified_name: to get full name + + Returns: + A dictionary. The key is the name of the form field, + the value is the content of the field. + + If the document contains multiple form fields with the same name, the + second and following will get the suffix .2, .3, ... + """ + + def indexed_key(k: str, fields: Dict[Any, Any]) -> str: + if k not in fields: + return k + else: + return ( + k + + "." 
+ + str(sum([1 for kk in fields if kk.startswith(k + ".")]) + 2) + ) + + # Retrieve document form fields + formfields = self.get_fields() + if formfields is None: + return {} + ff = {} + for field, value in formfields.items(): + if value.get("/FT") == "/Tx": + if full_qualified_name: + ff[field] = value.get("/V") + else: + ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V") + return ff + + def get_pages_showing_field( + self, field: Union[Field, PdfObject, IndirectObject] + ) -> List[PageObject]: + """ + Provides list of pages where the field is called. + + Args: + field: Field Object, PdfObject or IndirectObject referencing a Field + + Returns: + List of pages: + - Empty list: + The field has no widgets attached + (either hidden field or ancestor field). + - Single page list: + Page where the widget is present + (most common). + - Multi-page list: + Field with multiple kids widgets + (example: radio buttons, field repeated on multiple pages). + """ + + def _get_inherited(obj: DictionaryObject, key: str) -> Any: + if key in obj: + return obj[key] + elif "/Parent" in obj: + return _get_inherited( + cast(DictionaryObject, obj["/Parent"].get_object()), key + ) + else: + return None + + try: + # to cope with all types + field = cast(DictionaryObject, field.indirect_reference.get_object()) # type: ignore + except Exception as exc: + raise ValueError("field type is invalid") from exc + if _get_inherited(field, "/FT") is None: + raise ValueError("field is not valid") + ret = [] + if field.get("/Subtype", "") == "/Widget": + if "/P" in field: + ret = [field["/P"].get_object()] + else: + ret = [ + p + for p in self.pages + if field.indirect_reference in p.get("/Annots", "") + ] + else: + kids = field.get("/Kids", ()) + for k in kids: + k = k.get_object() + if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k): + # Kid that is just a widget, not a field: + if "/P" in k: + ret += [k["/P"].get_object()] + else: + ret += [ + p + for p in self.pages + if 
k.indirect_reference in p.get("/Annots", "") + ] + return [ + x + if isinstance(x, PageObject) + else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)]) # type: ignore + for x in ret + ] + + @property + def open_destination( + self, + ) -> Union[None, Destination, TextStringObject, ByteStringObject]: + """ + Property to access the opening destination (``/OpenAction`` entry in + the PDF catalog). It returns ``None`` if the entry does not exist + or is not set. + + Raises: + Exception: If a destination is invalid. + """ + if "/OpenAction" not in self.root_object: + return None + oa: Any = self.root_object["/OpenAction"] + if isinstance(oa, bytes): # pragma: no cover + oa = oa.decode() + if isinstance(oa, str): + return create_string_object(oa) + elif isinstance(oa, ArrayObject): + try: + page, typ = oa[0:2] + array = oa[2:] + fit = Fit(typ, tuple(array)) + return Destination("OpenAction", page, fit) + except Exception as exc: + raise Exception(f"Invalid Destination {oa}: {exc}") + else: + return None + + @open_destination.setter + def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None: + raise NotImplementedError("no setter for open_destination") + + @property + def outline(self) -> OutlineType: + """ + Read-only property for the outline present in the document + (i.e., a collection of 'outline items' which are also known as + 'bookmarks'). 
+ """ + return self._get_outline() + + def _get_outline( + self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None + ) -> OutlineType: + if outline is None: + outline = [] + catalog = self.root_object + + # get the outline dictionary and named destinations + if CO.OUTLINES in catalog: + lines = cast(DictionaryObject, catalog[CO.OUTLINES]) + + if isinstance(lines, NullObject): + return outline + + # §12.3.3 Document outline, entries in the outline dictionary + if lines is not None and "/First" in lines: + node = cast(DictionaryObject, lines["/First"]) + self._namedDests = self._get_named_destinations() + + if node is None: + return outline + + # see if there are any more outline items + while True: + outline_obj = self._build_outline_item(node) + if outline_obj: + outline.append(outline_obj) + + # check for sub-outline + if "/First" in node: + sub_outline: List[Any] = [] + self._get_outline(cast(DictionaryObject, node["/First"]), sub_outline) + if sub_outline: + outline.append(sub_outline) + + if "/Next" not in node: + break + node = cast(DictionaryObject, node["/Next"]) + + return outline + + @property + def threads(self) -> Optional[ArrayObject]: + """ + Read-only property for the list of threads. + + See §12.4.3 from the PDF 1.7 or 2.0 specification. + + It is an array of dictionaries with "/F" (the first bead in the thread) + and "/I" (a thread information dictionary containing information about + the thread, such as its title, author, and creation date) properties or + None if there are no articles. + + Since PDF 2.0 it can also contain an indirect reference to a metadata + stream containing information about the thread, such as its title, + author, and creation date. 
+ """ + catalog = self.root_object + if CO.THREADS in catalog: + return cast("ArrayObject", catalog[CO.THREADS]) + else: + return None + + @abstractmethod + def _get_page_number_by_indirect( + self, indirect_reference: Union[None, int, NullObject, IndirectObject] + ) -> Optional[int]: + ... # pragma: no cover + + def get_page_number(self, page: PageObject) -> Optional[int]: + """ + Retrieve page number of a given PageObject. + + Args: + page: The page to get page number. Should be + an instance of :class:`PageObject<pypdf._page.PageObject>` + + Returns: + The page number or None if page is not found + """ + return self._get_page_number_by_indirect(page.indirect_reference) + + def get_destination_page_number(self, destination: Destination) -> Optional[int]: + """ + Retrieve page number of a given Destination object. + + Args: + destination: The destination to get page number. + + Returns: + The page number or None if page is not found + """ + return self._get_page_number_by_indirect(destination.page) + + def _build_destination( + self, + title: str, + array: Optional[ + List[ + Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject] + ] + ], + ) -> Destination: + page, typ = None, None + # handle outline items with missing or invalid destination + if ( + isinstance(array, (NullObject, str)) + or (isinstance(array, ArrayObject) and len(array) == 0) + or array is None + ): + page = NullObject() + return Destination(title, page, Fit.fit()) + else: + page, typ = array[0:2] # type: ignore + array = array[2:] + try: + return Destination(title, page, Fit(fit_type=typ, fit_args=array)) # type: ignore + except PdfReadError: + logger_warning(f"Unknown destination: {title} {array}", __name__) + if self.strict: + raise + # create a link to first Page + tmp = self.pages[0].indirect_reference + indirect_reference = NullObject() if tmp is None else tmp + return Destination(title, indirect_reference, Fit.fit()) # type: ignore + + def _build_outline_item(self, node: 
DictionaryObject) -> Optional[Destination]: + dest, title, outline_item = None, None, None + + # title required for valid outline + # § 12.3.3, entries in an outline item dictionary + try: + title = cast("str", node["/Title"]) + except KeyError: + if self.strict: + raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}") + title = "" + + if "/A" in node: + # Action, PDFv1.7 Section 12.6 (only type GoTo supported) + action = cast(DictionaryObject, node["/A"]) + action_type = cast(NameObject, action[GoToActionArguments.S]) + if action_type == "/GoTo": + dest = action[GoToActionArguments.D] + elif "/Dest" in node: + # Destination, PDFv1.7 Section 12.3.2 + dest = node["/Dest"] + # if array was referenced in another object, will be a dict w/ key "/D" + if isinstance(dest, DictionaryObject) and "/D" in dest: + dest = dest["/D"] + + if isinstance(dest, ArrayObject): + outline_item = self._build_destination(title, dest) + elif isinstance(dest, str): + # named destination, addresses NameObject Issue #193 + # TODO : keep named destination instead of replacing it ? 
+ try: + outline_item = self._build_destination( + title, self._namedDests[dest].dest_array + ) + except KeyError: + # named destination not found in Name Dict + outline_item = self._build_destination(title, None) + elif dest is None: + # outline item not required to have destination or action + # PDFv1.7 Table 153 + outline_item = self._build_destination(title, dest) + else: + if self.strict: + raise PdfReadError(f"Unexpected destination {dest!r}") + else: + logger_warning( + f"Removed unexpected destination {dest!r} from destination", + __name__, + ) + outline_item = self._build_destination(title, None) + + # if outline item created, add color, format, and child count if present + if outline_item: + if "/C" in node: + # Color of outline item font in (R, G, B) with values ranging 0.0-1.0 + outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"]) # type: ignore + if "/F" in node: + # specifies style characteristics bold and/or italic + # with 1=italic, 2=bold, 3=both + outline_item[NameObject("/F")] = node["/F"] + if "/Count" in node: + # absolute value = num. visible children + # with positive = open/unfolded, negative = closed/folded + outline_item[NameObject("/Count")] = node["/Count"] + # if count is 0 we will consider it as open ( in order to have always an is_open to simplify + outline_item[NameObject("/%is_open%")] = BooleanObject( + node.get("/Count", 0) >= 0 + ) + outline_item.node = node + try: + outline_item.indirect_reference = node.indirect_reference + except AttributeError: + pass + return outline_item + + @property + def pages(self) -> List[PageObject]: + """ + Property that emulates a list of :class:`PageObject<pypdf._page.PageObject>`. + This property allows to get a page or a range of pages. + + Note: + For PdfWriter only: Provides the capability to remove a page/range of + page from the list (using the del operator). Remember: Only the page + entry is removed, as the objects beneath can be used elsewhere. 
A + solution to completely remove them - if they are not used anywhere - is + to write to a buffer/temporary file and then load it into a new + PdfWriter. + + """ + return _VirtualList(self.get_num_pages, self.get_page) # type: ignore + + @property + def page_labels(self) -> List[str]: + """ + A list of labels for the pages in this document. + + This property is read-only. The labels are in the order that the pages + appear in the document. + """ + return [page_index2page_label(self, i) for i in range(len(self.pages))] + + @property + def page_layout(self) -> Optional[str]: + """ + Get the page layout currently being used. + + .. list-table:: Valid ``layout`` values + :widths: 50 200 + + * - /NoLayout + - Layout explicitly not specified + * - /SinglePage + - Show one page at a time + * - /OneColumn + - Show one column at a time + * - /TwoColumnLeft + - Show pages in two columns, odd-numbered pages on the left + * - /TwoColumnRight + - Show pages in two columns, odd-numbered pages on the right + * - /TwoPageLeft + - Show two pages at a time, odd-numbered pages on the left + * - /TwoPageRight + - Show two pages at a time, odd-numbered pages on the right + """ + try: + return cast(NameObject, self.root_object[CD.PAGE_LAYOUT]) + except KeyError: + return None + + @property + def page_mode(self) -> Optional[PagemodeType]: + """ + Get the page mode currently being used. + + .. 
@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Get the page mode currently being used.

    Returns None when the document catalog has no /PageMode entry.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    try:
        return self.root_object["/PageMode"]  # type: ignore
    except KeyError:
        return None


def _flatten(
    self,
    pages: Union[None, DictionaryObject, PageObject] = None,
    inherit: Optional[Dict[str, Any]] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Collect every leaf page of the page tree into ``self.flattened_pages``.

    Args:
        pages: Page tree node to process. When None, the walk starts at the
            catalog's /Pages entry and ``self.flattened_pages`` is reset to
            an empty list.
        inherit: Inheritable attributes (/Resources, /MediaBox, /CropBox,
            /Rotate) collected from the ancestors of ``pages``.
        indirect_reference: Reference handed to the PageObject created for
            a leaf page.

    Raises:
        PdfReadError: If the catalog's /Pages entry is not a dictionary.
    """
    inheritable_page_attributes = (
        NameObject(PG.RESOURCES),
        NameObject(PG.MEDIABOX),
        NameObject(PG.CROPBOX),
        NameObject(PG.ROTATE),
    )
    if inherit is None:
        inherit = {}
    if pages is None:
        # Fix issue 327: set flattened_pages attribute only for
        # decrypted file
        catalog = self.root_object
        pages = catalog["/Pages"].get_object()  # type: ignore
        # Explicit check instead of ``assert``: asserts vanish under -O.
        if not isinstance(pages, DictionaryObject):
            raise PdfReadError("Invalid object in /Pages")
        self.flattened_pages = []

    if PA.TYPE in pages:
        node_type = cast(str, pages[PA.TYPE])
    # if pdf has no type, considered as a page if /Kids is missing
    elif PA.KIDS not in pages:
        node_type = "/Page"
    else:
        node_type = "/Pages"

    if node_type == "/Pages":
        # Work on a private copy: attributes set on this node apply to its
        # own descendants only. Updating the caller's dict in place would
        # leak them into sibling subtrees that are visited afterwards.
        inherit = dict(inherit)
        for attr in inheritable_page_attributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in cast(ArrayObject, pages[PA.KIDS]):
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirect_reference"] = page
            obj = page.get_object()
            if obj:
                # damaged file may have invalid child in /Pages
                self._flatten(obj, inherit, **addt)
    elif node_type == "/Page":
        for attr_in, value in inherit.items():
            # if the page has its own value, it does not inherit the
            # parent's value:
            if attr_in not in pages:
                pages[attr_in] = value
        page_obj = PageObject(self, indirect_reference)
        page_obj.update(pages)

        # TODO: Could flattened_pages be None at this point?
        self.flattened_pages.append(page_obj)  # type: ignore
def remove_page(
    self,
    page: Union[int, PageObject, IndirectObject],
    clean: bool = False,
) -> None:
    """
    Drop one page from the list of pages.

    Args:
        page:
            * :class:`int`: Number of the page to remove.
            * :class:`~pypdf._page.PageObject`: The page to remove. When it
              occurs several times, only the first occurrence is removed.
            * :class:`~pypdf.generic.IndirectObject`: Reference to the page
              to remove.

        clean: Replace the PageObject with a NullObject so that annotations
            or destinations do not keep referencing a detached page.
    """
    if self.flattened_pages is None:
        self._flatten()
    assert self.flattened_pages is not None

    target = page
    if isinstance(target, IndirectObject):
        resolved = target.get_object()
        if not isinstance(resolved, PageObject):
            logger_warning("IndirectObject is not referencing a page", __name__)
            return
        target = resolved

    # Turn a page object into its position in the flattened page list.
    if isinstance(target, int):
        index = target
    else:
        try:
            index = self.flattened_pages.index(target)
        except ValueError:
            logger_warning("Cannot find page in pages", __name__)
            return

    if index < 0 or index >= len(self.flattened_pages):
        logger_warning("Page number is out of range", __name__)
        return

    # Remember the reference before the entry disappears from the list.
    reference = self.pages[index].indirect_reference
    del self.pages[index]
    if clean and reference is not None:
        self._replace_object(reference, NullObject())


def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    """
    Development helper that resolves an indirect object of this document.

    Same as ``generic.IndirectObject(num, gen, self).get_object()``.

    Args:
        num: Object number of the indirect object.
        gen: Generation number of the indirect object.

    Returns:
        A PdfObject
    """
    reference = IndirectObject(num, gen, self)
    return reference.get_object()
def decode_permissions(
    self, permissions_code: int
) -> Dict[str, bool]:  # pragma: no cover
    """
    Translate an integer permission code into named access rights.

    Deprecated API: ``deprecate_with_replacement`` is called first and
    names ``user_access_permissions`` as the replacement.
    """
    deprecate_with_replacement(
        old_name="decode_permissions",
        new_name="user_access_permissions",
        removed_in="5.0.0",
    )

    named_flags = (
        ("print", UserAccessPermissions.PRINT),
        ("modify", UserAccessPermissions.MODIFY),
        ("copy", UserAccessPermissions.EXTRACT),
        ("annotations", UserAccessPermissions.ADD_OR_MODIFY),
        ("forms", UserAccessPermissions.FILL_FORM_FIELDS),
        # Do not fix typo, as part of official, but deprecated API.
        ("accessability", UserAccessPermissions.EXTRACT_TEXT_AND_GRAPHICS),
        ("assemble", UserAccessPermissions.ASSEMBLE_DOC),
        ("print_high_quality", UserAccessPermissions.PRINT_TO_REPRESENTATION),
    )

    allowed: Dict[str, bool] = {}
    for key, flag in named_flags:
        # A right is granted when its bit is set in the code.
        allowed[key] = (permissions_code & flag) != 0
    return allowed


@property
def user_access_permissions(self) -> Optional[UserAccessPermissions]:
    """User access permissions of an encrypted document; None if not encrypted."""
    return (
        None
        if self._encryption is None
        else UserAccessPermissions(self._encryption.P)
    )


@property
@abstractmethod
def is_encrypted(self) -> bool:
    """
    Read-only flag telling whether this PDF file is encrypted.

    Once true, the value stays true even after
    :meth:`decrypt()<pypdf.PdfReader.decrypt>` has been called.
    """
    ...  # pragma: no cover
@property
def xfa(self) -> Optional[Dict[str, Any]]:
    """
    Decompressed XFA packets of the document's interactive form.

    Returns:
        None when the catalog has no (or an empty) /AcroForm entry.
        Otherwise a dictionary mapping each tag of the /XFA array to the
        zlib-decompressed data of the stream that follows it; the
        dictionary is empty when the form has no /XFA entry.
    """
    catalog = self.root_object

    if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
        return None

    tree = cast(TreeObject, catalog["/AcroForm"])
    retval: Dict[str, Any] = {}
    if "/XFA" not in tree:
        return retval

    # The array alternates tag, stream, tag, stream, ... Zipping the
    # iterator with itself yields complete pairs only, so a dangling tag at
    # the end of a damaged array is ignored instead of leaking a bare
    # StopIteration out of this property.
    entries = iter(cast(ArrayObject, tree["/XFA"]))
    for tag, packet in zip(entries, entries):
        if not isinstance(packet, IndirectObject):
            continue
        field = cast(Optional[EncodedStreamObject], packet.get_object())
        if field:
            # NOTE(review): decompresses the raw stream bytes with zlib
            # directly, i.e. presumes a Flate-encoded stream - confirm.
            retval[tag] = zlib.decompress(b_(field._data))
    return retval


@property
def attachments(self) -> Mapping[str, List[bytes]]:
    """
    Mapping from attachment name to the list of contents stored under it.

    The mapping is a LazyDict: each lookup calls ``_get_attachment_list``
    for the requested name.
    """
    return LazyDict(
        {
            name: (self._get_attachment_list, name)
            for name in self._list_attachments()
        }
    )


def _list_attachments(self) -> List[str]:
    """
    Retrieves the list of filenames of file attachments.

    Returns:
        list of filenames; empty when the catalog has no
        /Names -> /EmbeddedFiles -> /Names entry.
    """
    catalog = self.root_object
    # From the catalog get the embedded file names
    try:
        filenames = cast(
            ArrayObject,
            cast(
                DictionaryObject,
                cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"],
            )["/Names"],
        )
    except KeyError:
        return []
    # Only the string entries of the array are names.
    return [f for f in filenames if isinstance(f, str)]


def _get_attachment_list(self, name: str) -> List[bytes]:
    """Return the contents stored under ``name``, always as a list."""
    out = self._get_attachments(name)[name]
    if isinstance(out, list):
        return out
    return [out]
def _get_attachments(
    self, filename: Optional[str] = None
) -> Dict[str, Union[bytes, List[bytes]]]:
    """
    Retrieves all or selected file attachments of the PDF as a dictionary of
    file names and the file data as a bytestring.

    Only the /Names array found directly under the catalog's
    /Names -> /EmbeddedFiles entry is read.

    Args:
        filename: If filename is None, then a dictionary of all attachments
            will be returned, where the key is the filename and the value
            is the content. Otherwise, a dictionary with just a single key
            - the filename - and its content will be returned.

    Returns:
        dictionary of filename -> Union[bytestring or List[ByteString]]
        If the filename exists multiple times a list of the different
        versions will be provided. The dictionary is empty when the
        document has no embedded files entry.
    """
    catalog = self.root_object
    # From the catalog get the embedded file names
    try:
        filenames = cast(
            ArrayObject,
            cast(
                DictionaryObject,
                cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"],
            )["/Names"],
        )
    except KeyError:
        return {}

    attachments: Dict[str, Union[bytes, List[bytes]]] = {}
    # Each string entry is a name; the entry right after it is read as its
    # file specification.
    for i, name in enumerate(filenames):
        if not isinstance(name, str):
            continue
        if filename is not None and name != filename:
            continue
        if i + 1 >= len(filenames):
            # Damaged array ending with a name: nothing follows it, so
            # there is nothing to read. Warn instead of raising IndexError.
            logger_warning(
                f"Attachment {name!r} has no file specification", __name__
            )
            break
        f_dict = filenames[i + 1].get_object()
        f_data = f_dict["/EF"]["/F"].get_data()
        if name not in attachments:
            attachments[name] = f_data
            continue
        # Same name seen again: keep all versions in a list.
        previous = attachments[name]
        if isinstance(previous, list):
            previous.append(f_data)
        else:
            attachments[name] = [previous, f_data]
    return attachments


class LazyDict(Mapping[Any, Any]):
    """
    Read-only mapping whose values are computed on every lookup.

    The constructor takes the same arguments as ``dict``. Each stored value
    must be a ``(func, arg)`` pair; ``mapping[key]`` returns ``func(arg)``.
    Nothing is cached: every lookup calls ``func`` again.
    """

    def __init__(self, *args: Any, **kw: Any) -> None:
        # key -> (func, arg) pairs, evaluated on demand in __getitem__.
        self._raw_dict = dict(*args, **kw)

    def __getitem__(self, key: str) -> Any:
        func, arg = self._raw_dict[key]
        return func(arg)

    def __iter__(self) -> Iterator[Any]:
        return iter(self._raw_dict)

    def __len__(self) -> int:
        return len(self._raw_dict)

    def __str__(self) -> str:
        return f"LazyDict(keys={list(self.keys())})"