Two versions of R2R are here | HEAD master

author: S. Solomon Darnell 2025-03-28 21:52:21 -0500
committer: S. Solomon Darnell 2025-03-28 21:52:21 -0500
commit: 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree: ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_doc_common.py
parent: cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download: gn-ai-master.tar.gz
1 files changed, 1365 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_doc_common.py b/.venv/lib/python3.12/site-packages/pypdf/_doc_common.py
new file mode 100644
index 00000000..d4c5c43c
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_doc_common.py
@@ -0,0 +1,1365 @@
+# Copyright (c) 2006, Mathieu Fenniak
+# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
+# Copyright (c) 2024, Pubpub-ZZ
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import struct
+import zlib
+from abc import abstractmethod
+from datetime import datetime
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)
+
+from ._encryption import Encryption
+from ._page import PageObject, _VirtualList
+from ._page_labels import index2label as page_index2page_label
+from ._utils import (
+    b_,
+    deprecate_with_replacement,
+    logger_warning,
+    parse_iso8824_date,
+)
+from .constants import CatalogAttributes as CA
+from .constants import CatalogDictionary as CD
+from .constants import (
+    CheckboxRadioButtonAttributes,
+    GoToActionArguments,
+    UserAccessPermissions,
+)
+from .constants import Core as CO
+from .constants import DocumentInformationAttributes as DI
+from .constants import FieldDictionaryAttributes as FA
+from .constants import PageAttributes as PG
+from .constants import PagesAttributes as PA
+from .errors import (
+    PdfReadError,
+)
+from .generic import (
+    ArrayObject,
+    BooleanObject,
+    ByteStringObject,
+    Destination,
+    DictionaryObject,
+    EncodedStreamObject,
+    Field,
+    Fit,
+    FloatObject,
+    IndirectObject,
+    NameObject,
+    NullObject,
+    NumberObject,
+    PdfObject,
+    TextStringObject,
+    TreeObject,
+    ViewerPreferences,
+    create_string_object,
+)
+from .types import OutlineType, PagemodeType
+from .xmp import XmpInformation
+
+
+def convert_to_int(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]:
+    """
+    Decode a byte string as a big-endian signed 64-bit integer.
+
+    Args:
+        d: The raw bytes to decode. Only the last eight bytes are used;
+            shorter input is left-padded with zero bytes.
+        size: The declared width of the value in bytes.
+
+    Returns:
+        The decoded integer.
+
+    Raises:
+        PdfReadError: If ``size`` is greater than 8.
+    """
+    if size > 8:
+        raise PdfReadError("invalid size in convert_to_int")
+    # Left-pad to at least eight bytes, then keep exactly the trailing eight.
+    padded = (bytes(8) + d)[-8:]
+    (value,) = struct.unpack(">q", padded)
+    return value
+
+
+class DocumentInformation(DictionaryObject):
+    """
+    Dictionary holding the basic document metadata of a PDF file.
+
+    Instances are exposed through
+    :py:class:`PdfReader.metadata<pypdf.PdfReader.metadata>`.
+
+    Every textual entry is available through a pair of properties, for
+    example ``author`` and ``author_raw``. The plain property hands back the
+    stored value only when it is a ``TextStringObject`` (``title`` has an
+    extra fallback, see there), which makes it the convenient choice for
+    display purposes. The ``*_raw`` property returns whatever is stored under
+    the key; that may be a ``ByteStringObject`` when pypdf could not decode
+    the text, so callers have to be more careful with it.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def _get_text(self, key: str) -> Optional[str]:
+        """Return the entry for ``key`` if it is a ``TextStringObject``, else ``None``."""
+        value = self.get(key, None)
+        return value if isinstance(value, TextStringObject) else None
+
+    @property
+    def title(self) -> Optional[str]:
+        """
+        Read-only access to the document's title.
+
+        Returns ``None`` when the title entry is missing or falsy. Otherwise
+        the text value is returned; if the entry is not a
+        ``TextStringObject`` the resolved object behind it is returned.
+        """
+        raw_title = self.get(DI.TITLE)
+        if not raw_title:
+            return None
+        return self._get_text(DI.TITLE) or raw_title.get_object()  # type: ignore
+
+    @property
+    def title_raw(self) -> Optional[str]:
+        """Unprocessed title entry; may be a ``ByteStringObject``."""
+        return self.get(DI.TITLE, None)
+
+    @property
+    def author(self) -> Optional[str]:
+        """
+        Read-only access to the document's author.
+
+        Gives a ``TextStringObject``, or ``None`` when no author is stored as
+        text.
+        """
+        author_text = self._get_text(DI.AUTHOR)
+        return author_text
+
+    @property
+    def author_raw(self) -> Optional[str]:
+        """Unprocessed author entry; may be a ``ByteStringObject``."""
+        return self.get(DI.AUTHOR, None)
+
+    @property
+    def subject(self) -> Optional[str]:
+        """
+        Read-only access to the document's subject.
+
+        Gives a ``TextStringObject``, or ``None`` when no subject is stored
+        as text.
+        """
+        subject_text = self._get_text(DI.SUBJECT)
+        return subject_text
+
+    @property
+    def subject_raw(self) -> Optional[str]:
+        """Unprocessed subject entry; may be a ``ByteStringObject``."""
+        return self.get(DI.SUBJECT, None)
+
+    @property
+    def creator(self) -> Optional[str]:
+        """
+        Read-only access to the document's creator.
+
+        For a document converted to PDF from another format, this names the
+        application (e.g. OpenOffice) that produced the original document.
+        Gives a ``TextStringObject``, or ``None`` when no creator is stored
+        as text.
+        """
+        creator_text = self._get_text(DI.CREATOR)
+        return creator_text
+
+    @property
+    def creator_raw(self) -> Optional[str]:
+        """Unprocessed creator entry; may be a ``ByteStringObject``."""
+        return self.get(DI.CREATOR, None)
+
+    @property
+    def producer(self) -> Optional[str]:
+        """
+        Read-only access to the document's producer.
+
+        For a document converted to PDF from another format, this names the
+        application (for example, macOS Quartz) that performed the
+        conversion. Gives a ``TextStringObject``, or ``None`` when no
+        producer is stored as text.
+        """
+        producer_text = self._get_text(DI.PRODUCER)
+        return producer_text
+
+    @property
+    def producer_raw(self) -> Optional[str]:
+        """Unprocessed producer entry; may be a ``ByteStringObject``."""
+        return self.get(DI.PRODUCER, None)
+
+    @property
+    def creation_date(self) -> Optional[datetime]:
+        """Read-only access to the document's creation date, parsed from its text form."""
+        date_text = self._get_text(DI.CREATION_DATE)
+        return parse_iso8824_date(date_text)
+
+    @property
+    def creation_date_raw(self) -> Optional[str]:
+        """
+        Unprocessed creation date entry; may be a ``ByteStringObject``.
+
+        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
+        is the offset from UTC.
+        """
+        return self.get(DI.CREATION_DATE, None)
+
+    @property
+    def modification_date(self) -> Optional[datetime]:
+        """
+        Read-only access to the document's modification date.
+
+        This is the date and time the document was most recently modified,
+        parsed from its text form.
+        """
+        date_text = self._get_text(DI.MOD_DATE)
+        return parse_iso8824_date(date_text)
+
+    @property
+    def modification_date_raw(self) -> Optional[str]:
+        """
+        Unprocessed modification date entry; may be a ``ByteStringObject``.
+
+        Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix
+        is the offset from UTC.
+        """
+        return self.get(DI.MOD_DATE, None)
+
+
+class PdfDocCommon:
+    """
+    Common functions from PdfWriter and PdfReader objects.
+
+    This root class is strongly abstracted.
+    """
+
+    strict: bool = False  # default
+
+    _encryption: Optional[Encryption] = None
+
+    @property
+    @abstractmethod
+    def root_object(self) -> DictionaryObject:
+        """
+        Abstract property that subclasses must provide.
+
+        The other members of this class treat the returned dictionary as the
+        document catalog and read entries such as ``/Pages`` and
+        ``/OpenAction`` from it.
+        """
+        ...  # pragma: no cover
+
+    @property
+    @abstractmethod
+    def pdf_header(self) -> str:
+        """
+        Abstract property that subclasses must provide; declared to return a string.
+
+        NOTE(review): presumably the PDF header/version line of the document;
+        confirm against the concrete implementations.
+        """
+        ...  # pragma: no cover
+
+    @abstractmethod
+    def get_object(
+        self, indirect_reference: Union[int, IndirectObject]
+    ) -> Optional[PdfObject]:
+        """
+        Abstract method that subclasses must provide.
+
+        Args:
+            indirect_reference: Either an ``int`` or an ``IndirectObject``
+                (presumably identifying the object to fetch; confirm
+                against the concrete implementations).
+
+        Returns:
+            A ``PdfObject`` or ``None``, as declared by the signature.
+        """
+        ...  # pragma: no cover
+
+    @abstractmethod
+    def _replace_object(self, indirect: IndirectObject, obj: PdfObject) -> PdfObject:
+        """
+        Abstract method that subclasses must provide.
+
+        Within this class it is called by ``viewer_preferences`` to store a
+        ``ViewerPreferences`` object under that object's indirect reference.
+        """
+        ...  # pragma: no cover
+
+    @property
+    @abstractmethod
+    def _info(self) -> Optional[DictionaryObject]:
+        """
+        Abstract property that subclasses must provide.
+
+        ``metadata`` copies the entries of the returned dictionary and treats
+        ``None`` as "no document information available".
+        """
+        ...  # pragma: no cover
+
+    @property
+    def metadata(self) -> Optional[DocumentInformation]:
+        """
+        Return a copy of the PDF file's document information dictionary.
+
+        Only the document information dictionary is consulted. PDF files
+        that keep their metadata in a metadata stream instead are not covered
+        by this property.
+
+        Returns:
+            A new ``DocumentInformation`` filled from ``self._info``, or
+            ``None`` when ``self._info`` is ``None``.
+        """
+        if self._info is None:
+            return None
+        document_information = DocumentInformation()
+        document_information.update(self._info)
+        return document_information
+
+    @property
+    def xmp_metadata(self) -> Optional[XmpInformation]:
+        """
+        Placeholder property; this base version has no body and yields ``None``.
+
+        It is not marked abstract here. NOTE(review): presumably overridden
+        by the concrete reader/writer classes; confirm.
+        """
+        ...  # pragma: no cover
+
+    @abstractmethod
+    def _repr_mimebundle_(
+        self,
+        include: Union[None, Iterable[str]] = None,
+        exclude: Union[None, Iterable[str]] = None,
+    ) -> Dict[str, Any]:
+        """
+        Integration into Jupyter Notebooks.
+
+        This method returns a dictionary that maps a mime-type to its
+        representation.
+
+        See https://ipython.readthedocs.io/en/stable/config/integrating.html
+
+        Args:
+            include: Optional iterable of strings (presumably the mime-types
+                to include; confirm against the implementations).
+            exclude: Optional iterable of strings (presumably the mime-types
+                to leave out; confirm against the implementations).
+        """
+        ...  # pragma: no cover
+
+    @property
+    def viewer_preferences(self) -> Optional[ViewerPreferences]:
+        """
+        Return the document's viewer preferences as a ``ViewerPreferences`` object.
+
+        When the stored object is not yet a ``ViewerPreferences`` instance it
+        is wrapped in one, and the wrapper is written back to the document.
+
+        Returns:
+            The ``ViewerPreferences`` object, or ``None`` when the catalog
+            has no viewer preferences entry.
+        """
+        entry = self.root_object.get(CD.VIEWER_PREFERENCES, None)
+        if entry is None:
+            return None
+        prefs = entry.get_object()
+        if isinstance(prefs, ViewerPreferences):
+            return prefs
+        prefs = ViewerPreferences(prefs)
+        # Store the wrapper so that later accesses see the same object.
+        if hasattr(prefs, "indirect_reference"):
+            self._replace_object(prefs.indirect_reference, prefs)
+        else:
+            self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = prefs
+        return prefs
+
+    flattened_pages: Optional[List[PageObject]] = None
+
+    def get_num_pages(self) -> int:
+        """
+        Return the number of pages in this PDF file.
+
+        Returns:
+            The number of pages of the parsed PDF file.
+
+        Raises:
+            PdfReadError: if file is encrypted and restrictions prevent
+                this action.
+        """
+        # Flattening the page tree does not work on an encrypted PDF, so the
+        # page count recorded in the file itself is used in that case.
+        if self.is_encrypted:
+            return self.root_object["/Pages"]["/Count"]  # type: ignore
+        # Otherwise count the flattened pages, building the list on demand.
+        if self.flattened_pages is None:
+            self._flatten()
+        assert self.flattened_pages is not None
+        return len(self.flattened_pages)
+
+    def get_page(self, page_number: int) -> PageObject:
+        """
+        Return the page stored at the given index of this PDF file.
+
+        In most cases ``.pages[page_number]`` is the preferred spelling.
+
+        Args:
+            page_number: Index of the page to retrieve
+                (the first page has index zero).
+
+        Returns:
+            A :class:`PageObject<pypdf._page.PageObject>` instance.
+        """
+        pages = self.flattened_pages
+        if pages is None:
+            # Build the flattened page list on first use.
+            self._flatten()
+            pages = self.flattened_pages
+        assert pages is not None, "hint for mypy"
+        return pages[page_number]
+
+    @property
+    def named_destinations(self) -> Dict[str, Any]:
+        """
+        A read-only dictionary which maps names to
+        :class:`Destinations<pypdf.generic.Destination>`
+
+        The mapping is built by ``_get_named_destinations`` on every access.
+        """
+        return self._get_named_destinations()
+
+    def get_named_dest_root(self) -> ArrayObject:
+        """
+        Locate the array of named destinations, creating it where possible.
+
+        The catalog (``self.root_object``) is searched for a ``CA.NAMES``
+        dictionary containing a ``CA.DESTS`` dictionary; the ``CA.NAMES``
+        entry of the latter is the array that is returned. If that array is
+        the only missing part, it is created and stored. Missing dictionaries
+        are created and linked only when this object has an ``_add_object``
+        method.
+
+        Returns:
+            The array found in, or newly attached to, the document. If a
+            dictionary level is missing and cannot be created, a new empty
+            ``ArrayObject`` that is not attached to the document is returned.
+        """
+        named_dest = ArrayObject()
+        if CA.NAMES in self.root_object and isinstance(
+            self.root_object[CA.NAMES], DictionaryObject
+        ):
+            names = cast(DictionaryObject, self.root_object[CA.NAMES])
+            # NOTE(review): names_ref is assigned but not read in this method.
+            names_ref = names.indirect_reference
+            if CA.DESTS in names and isinstance(names[CA.DESTS], DictionaryObject):
+                # 3.6.3 Name Dictionary (PDF spec 1.7)
+                dests = cast(DictionaryObject, names[CA.DESTS])
+                # NOTE(review): dests_ref is not read in this branch.
+                dests_ref = dests.indirect_reference
+                if CA.NAMES in dests:
+                    # §7.9.6, entries in a name tree node dictionary
+                    named_dest = cast(ArrayObject, dests[CA.NAMES])
+                else:
+                    # The destinations dictionary exists but has no array yet:
+                    # attach a fresh one.
+                    named_dest = ArrayObject()
+                    dests[NameObject(CA.NAMES)] = named_dest
+            elif hasattr(self, "_add_object"):
+                # No usable destinations dictionary: register a new one and
+                # link it from the existing names dictionary.
+                dests = DictionaryObject()
+                dests_ref = self._add_object(dests)
+                names[NameObject(CA.DESTS)] = dests_ref
+                dests[NameObject(CA.NAMES)] = named_dest
+
+        elif hasattr(self, "_add_object"):
+            # No usable names dictionary in the catalog: create both levels.
+            names = DictionaryObject()
+            names_ref = self._add_object(names)
+            self.root_object[NameObject(CA.NAMES)] = names_ref
+            dests = DictionaryObject()
+            dests_ref = self._add_object(dests)
+            names[NameObject(CA.DESTS)] = dests_ref
+            dests[NameObject(CA.NAMES)] = named_dest
+
+        return named_dest
+
+    ## common
+    def _get_named_destinations(
+        self,
+        tree: Union[TreeObject, None] = None,
+        retval: Optional[Any] = None,
+    ) -> Dict[str, Any]:
+        """
+        Retrieve the named destinations present in the document.
+
+        Args:
+            tree: Node to read destinations from. It is used as given only
+                when ``retval`` is supplied; when ``retval`` is ``None`` it
+                is replaced by the node found in the catalog, if any.
+            retval: Dictionary filled during recursion. Leave it as ``None``
+                for the initial call; a new dictionary is then created.
+
+        Returns:
+            A dictionary which maps names to
+            :class:`Destinations<pypdf.generic.Destination>`.
+        """
+        if retval is None:
+            # Initial (non-recursive) call: start a fresh result and locate
+            # the destinations node from the catalog.
+            retval = {}
+            catalog = self.root_object
+
+            # get the name tree
+            if CA.DESTS in catalog:
+                tree = cast(TreeObject, catalog[CA.DESTS])
+            elif CA.NAMES in catalog:
+                names = cast(DictionaryObject, catalog[CA.NAMES])
+                if CA.DESTS in names:
+                    tree = cast(TreeObject, names[CA.DESTS])
+
+        if tree is None:
+            # Nothing to read from.
+            return retval
+
+        if PA.KIDS in tree:
+            # recurse down the tree
+            for kid in cast(ArrayObject, tree[PA.KIDS]):
+                self._get_named_destinations(kid.get_object(), retval)
+        # §7.9.6, entries in a name tree node dictionary
+        elif CA.NAMES in tree:  # /Kids and /Names are exclusives (§7.9.6)
+            # The entry is indexed by position below and read as alternating
+            # key/value items.
+            names = cast(DictionaryObject, tree[CA.NAMES])
+            i = 0
+            while i < len(names):
+                key = cast(str, names[i].get_object())
+                i += 1
+                if not isinstance(key, str):
+                    # Not a usable name: treat the next item as a key instead.
+                    continue
+                try:
+                    value = names[i].get_object()
+                except IndexError:
+                    # A trailing key without a value ends the scan.
+                    break
+                i += 1
+                if isinstance(value, DictionaryObject):
+                    # A dictionary value carries the destination under "/D";
+                    # dictionaries without it are skipped.
+                    if "/D" in value:
+                        value = value["/D"]
+                    else:
+                        continue
+                dest = self._build_destination(key, value)  # type: ignore
+                if dest is not None:
+                    retval[key] = dest
+        else:  # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF 1.1)
+            # Plain dictionary: every key is a name, every value a destination.
+            for k__, v__ in tree.items():
+                val = v__.get_object()
+                if isinstance(val, DictionaryObject):
+                    if "/D" in val:
+                        val = val["/D"].get_object()
+                    else:
+                        continue
+                dest = self._build_destination(k__, val)
+                if dest is not None:
+                    retval[k__] = dest
+        return retval
+
+    # A select group of relevant field attributes. For the complete list,
+    # see §12.3.2 of the PDF 1.7 or PDF 2.0 specification.
+
+    def get_fields(
+        self,
+        tree: Optional[TreeObject] = None,
+        retval: Optional[Dict[Any, Any]] = None,
+        fileobj: Optional[Any] = None,
+        stack: Optional[List[PdfObject]] = None,
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Collect the interactive form fields of this PDF, if it has any.
+
+        The *tree*, *retval* and *stack* parameters exist for the recursive
+        calls made while walking the field hierarchy.
+
+        Args:
+            tree: Object currently being parsed.
+            retval: Fields collected so far.
+            fileobj: A file object (usually a text file) that receives
+                a report on every interactive form field found.
+            stack: Objects that were already parsed.
+
+        Returns:
+            A dictionary whose keys are field names and whose values are
+            :class:`Field<pypdf.generic.Field>` objects. By default, the
+            mapping name is used for keys.
+            ``None`` if form data could not be located.
+        """
+        field_attributes = FA.attributes_dict()
+        field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict())
+
+        if retval is None:
+            # Top-level call: start from the AcroForm entry of the catalog.
+            retval = {}
+            stack = []
+            catalog = self.root_object
+            if CD.ACRO_FORM not in catalog:
+                return None
+            tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
+
+        if tree is None:
+            return retval
+        assert stack is not None
+
+        if "/Fields" in tree:
+            for entry in cast(ArrayObject, tree["/Fields"]):
+                self._build_field(
+                    entry.get_object(), retval, fileobj, field_attributes, stack
+                )
+        elif any(attr in tree for attr in field_attributes):
+            # The tree itself is a field.
+            self._build_field(tree, retval, fileobj, field_attributes, stack)
+        return retval
+
+    def _get_qualified_field_name(self, parent: DictionaryObject) -> str:
+        """
+        Build the dotted name of a field from its chain of ``/Parent`` entries.
+
+        A ``/TM`` entry, when present, is returned as-is. Otherwise the
+        ``/T`` value (empty string when absent) is appended, separated by a
+        dot, to the qualified name of ``/Parent`` if there is one.
+        """
+        if "/TM" in parent:
+            return cast(str, parent["/TM"])
+        own_name = cast(str, parent.get("/T", ""))
+        if "/Parent" not in parent:
+            return own_name
+        ancestor = cast(DictionaryObject, parent["/Parent"])
+        return self._get_qualified_field_name(ancestor) + "." + own_name
+
+    def _build_field(
+        self,
+        field: Union[TreeObject, DictionaryObject],
+        retval: Dict[Any, Any],
+        fileobj: Any,
+        field_attributes: Any,
+        stack: List[PdfObject],
+    ) -> None:
+        """
+        Record one form field in ``retval`` and continue with its kids.
+
+        Args:
+            field: The field dictionary. Nothing happens if it has neither
+                a ``/T`` nor a ``/TM`` entry.
+            retval: Mapping updated in place; the key is the qualified
+                field name and the value a ``Field`` built from ``field``.
+            fileobj: When truthy, a report on the field followed by a blank
+                line is written to it.
+            field_attributes: Passed on to ``_write_field``.
+            stack: Passed on to ``_check_kids``.
+        """
+        if all(attr not in field for attr in ("/T", "/TM")):
+            return
+        key = self._get_qualified_field_name(field)
+        if fileobj:
+            self._write_field(fileobj, field, field_attributes)
+            fileobj.write("\n")
+        retval[key] = Field(field)
+        obj = retval[key].indirect_reference.get_object()  # to get the full object
+        # The "/_States_" entry added below lists the states of the field.
+        if obj.get(FA.FT, "") == "/Ch":
+            retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)]
+        if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj:
+            #  Checkbox
+            retval[key][NameObject("/_States_")] = ArrayObject(
+                list(obj["/AP"]["/N"].keys())
+            )
+            # "/Off" is always listed as a possible state here.
+            if "/Off" not in retval[key]["/_States_"]:
+                retval[key][NameObject("/_States_")].append(NameObject("/Off"))
+        elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0:
+            # Radio button group: gather the distinct states of all kids.
+            states: List[str] = []
+            retval[key][NameObject("/_States_")] = ArrayObject(states)
+            for k in obj.get(FA.Kids, {}):
+                k = k.get_object()
+                for s in list(k["/AP"]["/N"].keys()):
+                    if s not in states:
+                        states.append(s)
+                retval[key][NameObject("/_States_")] = ArrayObject(states)
+            # With the NoToggleToOff flag set, "/Off" is dropped from the states.
+            if (
+                obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0
+                and "/Off" in retval[key]["/_States_"]
+            ):
+                del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")]
+        # at last for order
+        self._check_kids(field, retval, fileobj, stack)
+
+    def _check_kids(
+        self,
+        tree: Union[TreeObject, DictionaryObject],
+        retval: Any,
+        fileobj: Any,
+        stack: List[PdfObject],
+    ) -> None:
+        """
+        Run ``get_fields`` on every kid of ``tree``, visiting each node once.
+
+        A node that is already in ``stack`` is reported with a warning and
+        not visited again; otherwise it is appended to ``stack`` first.
+        """
+        if tree in stack:
+            logger_warning(
+                f"{self._get_qualified_field_name(tree)} already parsed", __name__
+            )
+            return
+        stack.append(tree)
+        if PA.KIDS not in tree:
+            return
+        # Descend into each child node.
+        for kid in tree[PA.KIDS]:  # type: ignore
+            self.get_fields(kid.get_object(), retval, fileobj, stack)
+
+    def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
+        """
+        Write a line-per-attribute report of ``field`` to ``fileobj``.
+
+        Args:
+            fileobj: Object with a ``write`` method receiving the report.
+            field: The field whose attributes are reported.
+            field_attributes: Mapping from attribute key to the label that
+                is written in front of the value.
+        """
+        all_attributes = FA.attributes() + CheckboxRadioButtonAttributes.attributes()
+        # Readable names for the field type values.
+        readable_types = {
+            "/Btn": "Button",
+            "/Tx": "Text",
+            "/Ch": "Choice",
+            "/Sig": "Signature",
+        }
+
+        for attr in all_attributes:
+            if attr in (FA.Kids, FA.AA):
+                continue
+            attr_name = field_attributes[attr]
+            try:
+                if attr == FA.FT:
+                    # Unknown field types are not reported at all.
+                    if field[attr] in readable_types:
+                        fileobj.write(f"{attr_name}: {readable_types[field[attr]]}\n")
+                elif attr == FA.Parent:
+                    # Only the name of the parent is written.
+                    try:
+                        name = field[attr][FA.TM]
+                    except KeyError:
+                        name = field[attr][FA.T]
+                    fileobj.write(f"{attr_name}: {name}\n")
+                else:
+                    fileobj.write(f"{attr_name}: {field[attr]}\n")
+            except KeyError:
+                # The attribute is absent from this field: write nothing.
+                pass
+
+    def get_form_text_fields(self, full_qualified_name: bool = False) -> Dict[str, Any]:
+        """
+        Return the text form fields of the document together with their values.
+
+        Args:
+            full_qualified_name: Use the keys delivered by ``get_fields``
+                instead of the ``/T`` name of each field.
+
+        Returns:
+            A dictionary. The key is the name of the form field,
+            the value is the content of the field.
+
+            If the document contains multiple form fields with the same name, the
+            second and following will get the suffix .2, .3, ...
+        """
+
+        def indexed_key(k: str, fields: Dict[Any, Any]) -> str:
+            # A name that is not taken yet is used unchanged.
+            if k not in fields:
+                return k
+            # Otherwise number it after the suffixed duplicates already present.
+            duplicates = sum(1 for existing in fields if existing.startswith(k + "."))
+            return k + "." + str(duplicates + 2)
+
+        formfields = self.get_fields()
+        if formfields is None:
+            return {}
+
+        text_values = {}
+        for qualified_name, field in formfields.items():
+            if field.get("/FT") == "/Tx":
+                if full_qualified_name:
+                    name = qualified_name
+                else:
+                    name = indexed_key(cast(str, field["/T"]), text_values)
+                text_values[name] = field.get("/V")
+        return text_values
+
+    def get_pages_showing_field(
+        self, field: Union[Field, PdfObject, IndirectObject]
+    ) -> List[PageObject]:
+        """
+        Provides list of pages where the field is called.
+
+        Args:
+            field: Field Object, PdfObject or IndirectObject referencing a Field
+
+        Returns:
+            List of pages:
+                - Empty list:
+                    The field has no widgets attached
+                    (either hidden field or ancestor field).
+                - Single page list:
+                    Page where the widget is present
+                    (most common).
+                - Multi-page list:
+                    Field with multiple kids widgets
+                    (example: radio buttons, field repeated on multiple pages).
+
+        Raises:
+            ValueError: If ``field`` cannot be resolved through its
+                ``indirect_reference``, or if neither it nor any of its
+                ``/Parent`` ancestors has a ``/FT`` entry.
+        """
+
+        def _get_inherited(obj: DictionaryObject, key: str) -> Any:
+            # Look the key up on the object, then up the "/Parent" chain.
+            if key in obj:
+                return obj[key]
+            elif "/Parent" in obj:
+                return _get_inherited(
+                    cast(DictionaryObject, obj["/Parent"].get_object()), key
+                )
+            else:
+                return None
+
+        try:
+            # to cope with all types
+            field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
+        except Exception as exc:
+            raise ValueError("field type is invalid") from exc
+        if _get_inherited(field, "/FT") is None:
+            raise ValueError("field is not valid")
+        ret = []
+        if field.get("/Subtype", "") == "/Widget":
+            # The field is its own widget.
+            if "/P" in field:
+                ret = [field["/P"].get_object()]
+            else:
+                # No "/P" entry: search the "/Annots" of every page instead.
+                ret = [
+                    p
+                    for p in self.pages
+                    if field.indirect_reference in p.get("/Annots", "")
+                ]
+        else:
+            kids = field.get("/Kids", ())
+            for k in kids:
+                k = k.get_object()
+                if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k):
+                    # Kid that is just a widget, not a field:
+                    if "/P" in k:
+                        ret += [k["/P"].get_object()]
+                    else:
+                        ret += [
+                            p
+                            for p in self.pages
+                            if k.indirect_reference in p.get("/Annots", "")
+                        ]
+        # Entries that are not PageObject instances are mapped to the page
+        # found through their indirect reference.
+        return [
+            x
+            if isinstance(x, PageObject)
+            else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
+            for x in ret
+        ]
+
+    @property
+    def open_destination(
+        self,
+    ) -> Union[None, Destination, TextStringObject, ByteStringObject]:
+        """
+        Property to access the opening destination (``/OpenAction`` entry in
+        the PDF catalog).
+
+        Returns:
+            - ``None`` if the entry does not exist or is neither a string
+              nor an array.
+            - A string object if the entry is a string (bytes are decoded
+              first).
+            - A ``Destination`` titled ``"OpenAction"`` if the entry is an
+              array; its first two items are the page and the fit type, the
+              remaining items the fit arguments.
+
+        Raises:
+            Exception: If the array cannot be turned into a destination.
+                The underlying error is chained as ``__cause__``.
+        """
+        if "/OpenAction" not in self.root_object:
+            return None
+        oa: Any = self.root_object["/OpenAction"]
+        if isinstance(oa, bytes):  # pragma: no cover
+            oa = oa.decode()
+        if isinstance(oa, str):
+            return create_string_object(oa)
+        if isinstance(oa, ArrayObject):
+            try:
+                page, typ = oa[0:2]
+                array = oa[2:]
+                fit = Fit(typ, tuple(array))
+                return Destination("OpenAction", page, fit)
+            except Exception as exc:
+                # Chain explicitly so the root cause stays attached to the
+                # re-raised error.
+                raise Exception(f"Invalid Destination {oa}: {exc}") from exc
+        return None
+
+    @open_destination.setter
+    def open_destination(self, dest: Union[None, str, Destination, PageObject]) -> None:
+        """Setting is not supported here: always raises ``NotImplementedError``."""
+        raise NotImplementedError("no setter for open_destination")
+
+    @property
+    def outline(self) -> OutlineType:
+        """
+        Read-only property for the outline present in the document
+        (i.e., a collection of 'outline items' which are also known as
+        'bookmarks').
+
+        The outline is built by ``_get_outline`` on every access.
+        """
+        return self._get_outline()
+
+    def _get_outline(
+        self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None
+    ) -> OutlineType:
+        """
+        Build the outline as a list of items with nested lists for sub-outlines.
+
+        Args:
+            node: First outline item of the level to read. On the initial
+                call it is taken from the ``/First`` entry of the catalog's
+                outline dictionary, when there is one.
+            outline: List being filled. Leave it as ``None`` for the initial
+                call; recursive calls pass the list of the current level.
+
+        Returns:
+            The list that was filled. The children of an item are appended
+            as one nested list after that item.
+        """
+        if outline is None:
+            # Initial (non-recursive) call.
+            outline = []
+            catalog = self.root_object
+
+            # get the outline dictionary and named destinations
+            if CO.OUTLINES in catalog:
+                lines = cast(DictionaryObject, catalog[CO.OUTLINES])
+
+                if isinstance(lines, NullObject):
+                    return outline
+
+                # §12.3.3 Document outline, entries in the outline dictionary
+                if lines is not None and "/First" in lines:
+                    node = cast(DictionaryObject, lines["/First"])
+            # Stored on the instance; not read again in this method.
+            self._namedDests = self._get_named_destinations()
+
+        if node is None:
+            return outline
+
+        # see if there are any more outline items
+        while True:
+            outline_obj = self._build_outline_item(node)
+            if outline_obj:
+                outline.append(outline_obj)
+
+            # check for sub-outline
+            if "/First" in node:
+                sub_outline: List[Any] = []
+                self._get_outline(cast(DictionaryObject, node["/First"]), sub_outline)
+                if sub_outline:
+                    outline.append(sub_outline)
+
+            # Siblings are chained through "/Next"; the last one has none.
+            if "/Next" not in node:
+                break
+            node = cast(DictionaryObject, node["/Next"])
+
+        return outline
+
+    @property
+    def threads(self) -> Optional[ArrayObject]:
+        """
+        Read-only property giving the list of threads.
+
+        See §12.4.3 from the PDF 1.7 or 2.0 specification.
+
+        The value is an array of dictionaries with "/F" (the first bead in
+        the thread) and "/I" (a thread information dictionary containing
+        information about the thread, such as its title, author, and creation
+        date) properties, or None if there are no articles.
+
+        Since PDF 2.0 it can also contain an indirect reference to a metadata
+        stream containing information about the thread, such as its title,
+        author, and creation date.
+        """
+        catalog = self.root_object
+        if CO.THREADS not in catalog:
+            return None
+        return cast("ArrayObject", catalog[CO.THREADS])
+
+    @abstractmethod
+    def _get_page_number_by_indirect(
+        self, indirect_reference: Union[None, int, NullObject, IndirectObject]
+    ) -> Optional[int]:
+        """
+        Resolve a page reference to its page number.
+
+        Abstract hook implemented by concrete subclasses; it is the backend
+        of :meth:`get_page_number` and :meth:`get_destination_page_number`.
+
+        Args:
+            indirect_reference: Reference to the page to locate. May also be
+                None, a NullObject, or an int (presumably an object number;
+                confirm against the implementations).
+
+        Returns:
+            The page number, or None (callers document None as
+            "page is not found").
+        """
+        ...  # pragma: no cover
+
+    def get_page_number(self, page: PageObject) -> Optional[int]:
+        """
+        Retrieve page number of a given PageObject.
+
+        Args:
+            page: The page to get page number. Should be
+                an instance of :class:`PageObject<pypdf._page.PageObject>`
+
+        Returns:
+            The page number or None if page is not found
+        """
+        return self._get_page_number_by_indirect(page.indirect_reference)
+
+    def get_destination_page_number(self, destination: Destination) -> Optional[int]:
+        """
+        Retrieve page number of a given Destination object.
+
+        Args:
+            destination: The destination to get page number.
+
+        Returns:
+            The page number or None if page is not found
+        """
+        return self._get_page_number_by_indirect(destination.page)
+
+    def _build_destination(
+        self,
+        title: str,
+        array: Optional[
+            List[
+                Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]
+            ]
+        ],
+    ) -> Destination:
+        page, typ = None, None
+        # handle outline items with missing or invalid destination
+        if (
+            isinstance(array, (NullObject, str))
+            or (isinstance(array, ArrayObject) and len(array) == 0)
+            or array is None
+        ):
+            page = NullObject()
+            return Destination(title, page, Fit.fit())
+        else:
+            page, typ = array[0:2]  # type: ignore
+            array = array[2:]
+            try:
+                return Destination(title, page, Fit(fit_type=typ, fit_args=array))  # type: ignore
+            except PdfReadError:
+                logger_warning(f"Unknown destination: {title} {array}", __name__)
+                if self.strict:
+                    raise
+                # create a link to first Page
+                tmp = self.pages[0].indirect_reference
+                indirect_reference = NullObject() if tmp is None else tmp
+                return Destination(title, indirect_reference, Fit.fit())  # type: ignore
+
+    def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]:
+        dest, title, outline_item = None, None, None
+
+        # title required for valid outline
+        # § 12.3.3, entries in an outline item dictionary
+        try:
+            title = cast("str", node["/Title"])
+        except KeyError:
+            if self.strict:
+                raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}")
+            title = ""
+
+        if "/A" in node:
+            # Action, PDFv1.7 Section 12.6 (only type GoTo supported)
+            action = cast(DictionaryObject, node["/A"])
+            action_type = cast(NameObject, action[GoToActionArguments.S])
+            if action_type == "/GoTo":
+                dest = action[GoToActionArguments.D]
+        elif "/Dest" in node:
+            # Destination, PDFv1.7 Section 12.3.2
+            dest = node["/Dest"]
+            # if array was referenced in another object, will be a dict w/ key "/D"
+            if isinstance(dest, DictionaryObject) and "/D" in dest:
+                dest = dest["/D"]
+
+        if isinstance(dest, ArrayObject):
+            outline_item = self._build_destination(title, dest)
+        elif isinstance(dest, str):
+            # named destination, addresses NameObject Issue #193
+            # TODO : keep named destination instead of replacing it ?
+            try:
+                outline_item = self._build_destination(
+                    title, self._namedDests[dest].dest_array
+                )
+            except KeyError:
+                # named destination not found in Name Dict
+                outline_item = self._build_destination(title, None)
+        elif dest is None:
+            # outline item not required to have destination or action
+            # PDFv1.7 Table 153
+            outline_item = self._build_destination(title, dest)
+        else:
+            if self.strict:
+                raise PdfReadError(f"Unexpected destination {dest!r}")
+            else:
+                logger_warning(
+                    f"Removed unexpected destination {dest!r} from destination",
+                    __name__,
+                )
+            outline_item = self._build_destination(title, None)
+
+        # if outline item created, add color, format, and child count if present
+        if outline_item:
+            if "/C" in node:
+                # Color of outline item font in (R, G, B) with values ranging 0.0-1.0
+                outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"])  # type: ignore
+            if "/F" in node:
+                # specifies style characteristics bold and/or italic
+                # with 1=italic, 2=bold, 3=both
+                outline_item[NameObject("/F")] = node["/F"]
+            if "/Count" in node:
+                # absolute value = num. visible children
+                # with positive = open/unfolded, negative = closed/folded
+                outline_item[NameObject("/Count")] = node["/Count"]
+            #  if count is 0 we will consider it as open ( in order to have always an is_open to simplify
+            outline_item[NameObject("/%is_open%")] = BooleanObject(
+                node.get("/Count", 0) >= 0
+            )
+        outline_item.node = node
+        try:
+            outline_item.indirect_reference = node.indirect_reference
+        except AttributeError:
+            pass
+        return outline_item
+
+    @property
+    def pages(self) -> List[PageObject]:
+        """
+        Property that emulates a list of :class:`PageObject<pypdf._page.PageObject>`.
+        This property allows to get a page or a range of pages.
+
+        Note:
+            For PdfWriter only: Provides the capability to remove a page/range of
+            page from the list (using the del operator). Remember: Only the page
+            entry is removed, as the objects beneath can be used elsewhere. A
+            solution to completely remove them - if they are not used anywhere - is
+            to write to a buffer/temporary file and then load it into a new
+            PdfWriter.
+
+        """
+        return _VirtualList(self.get_num_pages, self.get_page)  # type: ignore
+
+    @property
+    def page_labels(self) -> List[str]:
+        """
+        A list of labels for the pages in this document.
+
+        This property is read-only. The labels are in the order that the pages
+        appear in the document.
+        """
+        return [page_index2page_label(self, i) for i in range(len(self.pages))]
+
+    @property
+    def page_layout(self) -> Optional[str]:
+        """
+        Get the page layout currently being used.
+
+        .. list-table:: Valid ``layout`` values
+           :widths: 50 200
+
+           * - /NoLayout
+             - Layout explicitly not specified
+           * - /SinglePage
+             - Show one page at a time
+           * - /OneColumn
+             - Show one column at a time
+           * - /TwoColumnLeft
+             - Show pages in two columns, odd-numbered pages on the left
+           * - /TwoColumnRight
+             - Show pages in two columns, odd-numbered pages on the right
+           * - /TwoPageLeft
+             - Show two pages at a time, odd-numbered pages on the left
+           * - /TwoPageRight
+             - Show two pages at a time, odd-numbered pages on the right
+        """
+        try:
+            return cast(NameObject, self.root_object[CD.PAGE_LAYOUT])
+        except KeyError:
+            return None
+
+    @property
+    def page_mode(self) -> Optional[PagemodeType]:
+        """
+        Get the page mode currently being used.
+
+        .. list-table:: Valid ``mode`` values
+           :widths: 50 200
+
+           * - /UseNone
+             - Do not show outline or thumbnails panels
+           * - /UseOutlines
+             - Show outline (aka bookmarks) panel
+           * - /UseThumbs
+             - Show page thumbnails panel
+           * - /FullScreen
+             - Fullscreen view
+           * - /UseOC
+             - Show Optional Content Group (OCG) panel
+           * - /UseAttachments
+             - Show attachments panel
+        """
+        try:
+            return self.root_object["/PageMode"]  # type: ignore
+        except KeyError:
+            return None
+
+    def _flatten(
+        self,
+        pages: Union[None, DictionaryObject, PageObject] = None,
+        inherit: Optional[Dict[str, Any]] = None,
+        indirect_reference: Optional[IndirectObject] = None,
+    ) -> None:
+        inheritable_page_attributes = (
+            NameObject(PG.RESOURCES),
+            NameObject(PG.MEDIABOX),
+            NameObject(PG.CROPBOX),
+            NameObject(PG.ROTATE),
+        )
+        if inherit is None:
+            inherit = {}
+        if pages is None:
+            # Fix issue 327: set flattened_pages attribute only for
+            # decrypted file
+            catalog = self.root_object
+            pages = catalog["/Pages"].get_object()  # type: ignore
+            assert isinstance(pages, DictionaryObject)
+            self.flattened_pages = []
+
+        if PA.TYPE in pages:
+            t = cast(str, pages[PA.TYPE])
+        # if pdf has no type, considered as a page if /Kids is missing
+        elif PA.KIDS not in pages:
+            t = "/Page"
+        else:
+            t = "/Pages"
+
+        if t == "/Pages":
+            for attr in inheritable_page_attributes:
+                if attr in pages:
+                    inherit[attr] = pages[attr]
+            for page in cast(ArrayObject, pages[PA.KIDS]):
+                addt = {}
+                if isinstance(page, IndirectObject):
+                    addt["indirect_reference"] = page
+                obj = page.get_object()
+                if obj:
+                    # damaged file may have invalid child in /Pages
+                    self._flatten(obj, inherit, **addt)
+        elif t == "/Page":
+            for attr_in, value in list(inherit.items()):
+                # if the page has it's own value, it does not inherit the
+                # parent's value:
+                if attr_in not in pages:
+                    pages[attr_in] = value
+            page_obj = PageObject(self, indirect_reference)
+            page_obj.update(pages)
+
+            # TODO: Could flattened_pages be None at this point?
+            self.flattened_pages.append(page_obj)  # type: ignore
+
+    def remove_page(
+        self,
+        page: Union[int, PageObject, IndirectObject],
+        clean: bool = False,
+    ) -> None:
+        """
+        Remove page from pages list.
+
+        Args:
+            page:
+                * :class:`int`: Page number to be removed.
+                * :class:`~pypdf._page.PageObject`: page to be removed. If the page appears many times
+                  only the first one will be removed.
+                * :class:`~pypdf.generic.IndirectObject`: Reference to page to be removed.
+
+            clean: replace PageObject with NullObject to prevent annotations
+                or destinations to reference a detached page.
+        """
+        if self.flattened_pages is None:
+            self._flatten()
+        assert self.flattened_pages is not None
+        if isinstance(page, IndirectObject):
+            p = page.get_object()
+            if not isinstance(p, PageObject):
+                logger_warning("IndirectObject is not referencing a page", __name__)
+                return
+            page = p
+
+        if not isinstance(page, int):
+            try:
+                page = self.flattened_pages.index(page)
+            except ValueError:
+                logger_warning("Cannot find page in pages", __name__)
+                return
+        if not (0 <= page < len(self.flattened_pages)):
+            logger_warning("Page number is out of range", __name__)
+            return
+
+        ind = self.pages[page].indirect_reference
+        del self.pages[page]
+        if clean and ind is not None:
+            self._replace_object(ind, NullObject())
+
+    def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
+        """
+        Used to ease development.
+
+        This is equivalent to generic.IndirectObject(num,gen,self).get_object()
+
+        Args:
+            num: The object number of the indirect object.
+            gen: The generation number of the indirect object.
+
+        Returns:
+            A PdfObject
+        """
+        return IndirectObject(num, gen, self).get_object()
+
+    def decode_permissions(
+        self, permissions_code: int
+    ) -> Dict[str, bool]:  # pragma: no cover
+        """Take the permissions as an integer, return the allowed access."""
+        deprecate_with_replacement(
+            old_name="decode_permissions",
+            new_name="user_access_permissions",
+            removed_in="5.0.0",
+        )
+
+        permissions_mapping = {
+            "print": UserAccessPermissions.PRINT,
+            "modify": UserAccessPermissions.MODIFY,
+            "copy": UserAccessPermissions.EXTRACT,
+            "annotations": UserAccessPermissions.ADD_OR_MODIFY,
+            "forms": UserAccessPermissions.FILL_FORM_FIELDS,
+            # Do not fix typo, as part of official, but deprecated API.
+            "accessability": UserAccessPermissions.EXTRACT_TEXT_AND_GRAPHICS,
+            "assemble": UserAccessPermissions.ASSEMBLE_DOC,
+            "print_high_quality": UserAccessPermissions.PRINT_TO_REPRESENTATION,
+        }
+
+        return {
+            key: permissions_code & flag != 0
+            for key, flag in permissions_mapping.items()
+        }
+
+    @property
+    def user_access_permissions(self) -> Optional[UserAccessPermissions]:
+        """Get the user access permissions for encrypted documents. Returns None if not encrypted."""
+        if self._encryption is None:
+            return None
+        return UserAccessPermissions(self._encryption.P)
+
+    @property
+    @abstractmethod
+    def is_encrypted(self) -> bool:
+        """
+        Read-only boolean property showing whether this PDF file is encrypted.
+
+        Abstract: concrete subclasses provide the implementation.
+
+        Note that this property, if true, will remain true even after the
+        :meth:`decrypt()<pypdf.PdfReader.decrypt>` method is called.
+        """
+        ...  # pragma: no cover
+
+    @property
+    def xfa(self) -> Optional[Dict[str, Any]]:
+        tree: Optional[TreeObject] = None
+        retval: Dict[str, Any] = {}
+        catalog = self.root_object
+
+        if "/AcroForm" not in catalog or not catalog["/AcroForm"]:
+            return None
+
+        tree = cast(TreeObject, catalog["/AcroForm"])
+
+        if "/XFA" in tree:
+            fields = cast(ArrayObject, tree["/XFA"])
+            i = iter(fields)
+            for f in i:
+                tag = f
+                f = next(i)
+                if isinstance(f, IndirectObject):
+                    field = cast(Optional[EncodedStreamObject], f.get_object())
+                    if field:
+                        es = zlib.decompress(b_(field._data))
+                        retval[tag] = es
+        return retval
+
+    @property
+    def attachments(self) -> Mapping[str, List[bytes]]:
+        return LazyDict(
+            {
+                name: (self._get_attachment_list, name)
+                for name in self._list_attachments()
+            }
+        )
+
+    def _list_attachments(self) -> List[str]:
+        """
+        Retrieves the list of filenames of file attachments.
+
+        Returns:
+            list of filenames
+        """
+        catalog = self.root_object
+        # From the catalog get the embedded file names
+        try:
+            filenames = cast(
+                ArrayObject,
+                cast(
+                    DictionaryObject,
+                    cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"],
+                )["/Names"],
+            )
+        except KeyError:
+            return []
+        attachments_names = [f for f in filenames if isinstance(f, str)]
+        return attachments_names
+
+    def _get_attachment_list(self, name: str) -> List[bytes]:
+        out = self._get_attachments(name)[name]
+        if isinstance(out, list):
+            return out
+        return [out]
+
+    def _get_attachments(
+        self, filename: Optional[str] = None
+    ) -> Dict[str, Union[bytes, List[bytes]]]:
+        """
+        Retrieves all or selected file attachments of the PDF as a dictionary of file names
+        and the file data as a bytestring.
+
+        Args:
+            filename: If filename is None, then a dictionary of all attachments
+                will be returned, where the key is the filename and the value
+                is the content. Otherwise, a dictionary with just a single key
+                - the filename - and its content will be returned.
+
+        Returns:
+            dictionary of filename -> Union[bytestring or List[ByteString]]
+            If the filename exists multiple times a list of the different versions will be provided.
+        """
+        catalog = self.root_object
+        # From the catalog get the embedded file names
+        try:
+            filenames = cast(
+                ArrayObject,
+                cast(
+                    DictionaryObject,
+                    cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"],
+                )["/Names"],
+            )
+        except KeyError:
+            return {}
+        attachments: Dict[str, Union[bytes, List[bytes]]] = {}
+        # Loop through attachments
+        for i in range(len(filenames)):
+            f = filenames[i]
+            if isinstance(f, str):
+                if filename is not None and f != filename:
+                    continue
+                name = f
+                f_dict = filenames[i + 1].get_object()
+                f_data = f_dict["/EF"]["/F"].get_data()
+                if name in attachments:
+                    if not isinstance(attachments[name], list):
+                        attachments[name] = [attachments[name]]  # type:ignore
+                    attachments[name].append(f_data)  # type:ignore
+                else:
+                    attachments[name] = f_data
+        return attachments
+
+
+class LazyDict(Mapping[Any, Any]):
+    def __init__(self, *args: Any, **kw: Any) -> None:
+        self._raw_dict = dict(*args, **kw)
+
+    def __getitem__(self, key: str) -> Any:
+        func, arg = self._raw_dict.__getitem__(key)
+        return func(arg)
+
+    def __iter__(self) -> Iterator[Any]:
+        return iter(self._raw_dict)
+
+    def __len__(self) -> int:
+        return len(self._raw_dict)
+
+    def __str__(self) -> str:
+        return f"LazyDict(keys={list(self.keys())})"
author	S. Solomon Darnell	2025-03-28 21:52:21 -0500
committer	S. Solomon Darnell	2025-03-28 21:52:21 -0500
commit	4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree	ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_doc_common.py
parent	cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download	gn-ai-master.tar.gz