about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/pypdf/_page_labels.py
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_page_labels.py
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-master.tar.gz
two version of R2R are here HEAD master
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/_page_labels.py')
-rw-r--r--.venv/lib/python3.12/site-packages/pypdf/_page_labels.py280
1 files changed, 280 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_page_labels.py b/.venv/lib/python3.12/site-packages/pypdf/_page_labels.py
new file mode 100644
index 00000000..b0252795
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_page_labels.py
@@ -0,0 +1,280 @@
+"""
+Page labels are shown by PDF viewers as "the page number".
+
+A page has a numeric index, starting at 0. Additionally, the page
+has a label. In the most simple case:
+
+    label = index + 1
+
+However, the title page and the table of contents might have Roman numerals as
+page labels. This makes things more complicated.
+
+Example 1
+---------
+
+>>> reader.root_object["/PageLabels"]["/Nums"]
+[0, IndirectObject(18, 0, 139929798197504),
+ 8, IndirectObject(19, 0, 139929798197504)]
+>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
+{'/S': '/r'}
+>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
+{'/S': '/D'}
+
+Example 2
+---------
+The following is a document with pages labeled
+i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...
+
+1 0 obj
+    << /Type /Catalog
+       /PageLabels << /Nums [
+                        0 << /S /r >>
+                        4 << /S /D >>
+                        7 << /S /D
+                             /P ( A- )
+                             /St 8
+                        >>
+                        % A number tree containing
+                        % three page label dictionaries
+                        ]
+                   >>
+    ...
+    >>
+endobj
+
+
+§12.4.2 PDF Specification 1.7 and 2.0
+=====================================
+
+Entries in a page label dictionary
+----------------------------------
+The /S key:
+D       Decimal Arabic numerals
+R       Uppercase Roman numerals
+r       Lowercase Roman numerals
+A       Uppercase letters (A to Z for the first 26 pages,
+                           AA to ZZ for the next 26, and so on)
+a       Lowercase letters (a to z for the first 26 pages,
+                           aa to zz for the next 26, and so on)
+"""
+
+from typing import Iterator, List, Optional, Tuple, cast
+
+from ._protocols import PdfCommonDocProtocol
+from ._utils import logger_warning
+from .generic import ArrayObject, DictionaryObject, NullObject, NumberObject
+
+
+def number2uppercase_roman_numeral(num: int) -> str:
+    """
+    Convert a positive integer to an uppercase Roman numeral.
+
+    Args:
+        num: The number to convert.
+
+    Returns:
+        The Roman numeral, e.g. "XIV" for 14. Zero and negative numbers have
+        no Roman representation and yield an empty string.
+    """
+    # Guard first: with floor division a negative input would otherwise
+    # produce an unrelated numeral (e.g. -1 would be rendered as "CMXCIX").
+    if num <= 0:
+        return ""
+
+    # Ordered from the largest value to the smallest; the subtractive forms
+    # (CM, CD, XC, XL, IX, IV) are listed explicitly so a greedy pass works.
+    roman = (
+        (1000, "M"),
+        (900, "CM"),
+        (500, "D"),
+        (400, "CD"),
+        (100, "C"),
+        (90, "XC"),
+        (50, "L"),
+        (40, "XL"),
+        (10, "X"),
+        (9, "IX"),
+        (5, "V"),
+        (4, "IV"),
+        (1, "I"),
+    )
+
+    parts = []
+    for decimal, roman_repr in roman:
+        # Take as many copies of this symbol as fit, keep the rest for later.
+        count, num = divmod(num, decimal)
+        parts.append(roman_repr * count)
+        if num == 0:
+            break
+    return "".join(parts)
+
+
+def number2lowercase_roman_numeral(number: int) -> str:
+    """
+    Convert a number to a lowercase Roman numeral.
+
+    Args:
+        number: The number to convert.
+
+    Returns:
+        The result of ``number2uppercase_roman_numeral`` in lowercase,
+        e.g. "iv" for 4.
+    """
+    uppercase = number2uppercase_roman_numeral(number)
+    return uppercase.lower()
+
+
+def number2uppercase_letter(number: int) -> str:
+    """
+    Convert a positive integer to an uppercase letter sequence.
+
+    The letters are used like digits of a number system without a zero:
+    1 -> "A", 26 -> "Z", 27 -> "AA", 28 -> "AB", 52 -> "AZ", 53 -> "BA".
+
+    NOTE(review): the module docstring quotes the PDF specification as
+    "AA to ZZ for the next 26" pages, which reads like repeated letters
+    (28 -> "BB"). This function yields "AB" instead; confirm which of the
+    two is intended before relying on labels beyond "Z".
+
+    Args:
+        number: The number to convert. Must be greater than zero.
+
+    Returns:
+        The letter representation of the number.
+
+    Raises:
+        ValueError: If ``number`` is zero or negative.
+    """
+    if number <= 0:
+        raise ValueError("Expecting a positive number")
+
+    letters = []
+    remaining = number
+    while remaining > 0:
+        # Shift by one so that 1..26 maps onto the offsets 0..25 (A..Z).
+        remaining, offset = divmod(remaining - 1, 26)
+        letters.append(chr(ord("A") + offset))
+    # The least significant letter was produced first.
+    return "".join(reversed(letters))
+
+
+def number2lowercase_letter(number: int) -> str:
+    """
+    Convert a number to a lowercase letter sequence.
+
+    Args:
+        number: The number to convert.
+
+    Returns:
+        The result of ``number2uppercase_letter`` in lowercase,
+        e.g. "a" for 1.
+    """
+    uppercase = number2uppercase_letter(number)
+    return uppercase.lower()
+
+
+def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
+    """
+    Compute the label of a page from the ``/Nums`` array of a number tree node.
+
+    [Nums] shall be an array of the form
+      [ key 1 value 1 key 2 value 2 ... key n value n ]
+    where each key_i is an integer and the corresponding
+    value_i shall be the object associated with that key.
+    The keys shall be sorted in numerical order,
+    analogously to the arrangement of keys in a name tree
+    as described in 7.9.6, "Name Trees."
+
+    Args:
+        dictionary_object: The dictionary holding the ``/Nums`` array.
+        index: The index of the page.
+
+    Returns:
+        The prefix (``/P``, default "") followed by the number rendered in
+        the style ``/S`` of the selected entry. ``str(index + 1)`` is returned
+        as a fallback if ``/Nums`` has no complete key/value pair, if the
+        selected value is not a dictionary, or if ``/S`` is not one of the
+        known styles.
+    """
+    nums = cast(ArrayObject, dictionary_object["/Nums"])
+    value = None
+    start_index = 0
+    # Only visit complete (key, value) pairs. A trailing key without a value
+    # (odd-length array) is skipped instead of raising an IndexError.
+    for i in range(0, len(nums) - 1, 2):
+        start_index = nums[i]
+        value = nums[i + 1].get_object()
+        # Keys are sorted: once the next key lies beyond the page index,
+        # the current pair is the one covering the page.
+        if i + 2 < len(nums) and nums[i + 2] > index:
+            break
+
+    # if /Nums array is not following the specification or if /Nums is empty
+    if not isinstance(value, dict):
+        return str(index + 1)  # Fallback
+
+    m = {
+        None: lambda n: "",
+        "/D": lambda n: str(n),
+        "/R": number2uppercase_roman_numeral,
+        "/r": number2lowercase_roman_numeral,
+        "/A": number2uppercase_letter,
+        "/a": number2lowercase_letter,
+    }
+    try:
+        render = m[value.get("/S")]
+    except (KeyError, TypeError):
+        # Unknown (or unhashable) numbering style: treat the entry like any
+        # other malformed /Nums content instead of failing the lookup.
+        return str(index + 1)  # Fallback
+
+    start = value.get("/St", 1)
+    prefix = value.get("/P", "")
+    # The first page of the range gets the number `start`.
+    return prefix + render(index - start_index + start)
+
+
+def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
+    """
+    Return the label of the page with the given index.
+
+    See 7.9.7 "Number Trees".
+
+    Args:
+        reader: The PdfReader
+        index: The index of the page
+
+    Returns:
+        The label of the page, e.g. "iv" or "4". If no label can be
+        determined, ``str(index + 1)`` is returned.
+    """
+    catalog = cast(DictionaryObject, reader.root_object)
+    if "/PageLabels" not in catalog:
+        return str(index + 1)  # Fallback
+
+    number_tree = cast(DictionaryObject, catalog["/PageLabels"].get_object())
+    if "/Nums" in number_tree:
+        # The root node stores the entries directly.
+        return get_label_from_nums(number_tree, index)
+
+    has_kids = "/Kids" in number_tree and not isinstance(
+        number_tree["/Kids"], NullObject
+    )
+    if has_kids:
+        # number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
+        # Walk down the tree, one level per `while` iteration, with a
+        # limited maximum depth.
+        depth = 0
+        descend = True
+        while descend and depth < 100:
+            # Stays False if no kid covers the index; the walk then stops
+            # and the fallback below is used.
+            descend = False
+            for kid in cast(List[DictionaryObject], number_tree["/Kids"]):
+                # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
+                limits = cast(List[int], kid["/Limits"])
+                if not (limits[0] <= index <= limits[1]):
+                    continue
+                if kid.get("/Kids", None) is None:
+                    # Leaf node covering the index.
+                    return get_label_from_nums(kid, index)
+                # Intermediate node: continue one level deeper.
+                depth += 1
+                if depth == 100:  # pragma: no cover
+                    raise NotImplementedError("Too deep nesting is not supported.")
+                number_tree = kid
+                descend = True
+                break
+
+    logger_warning(
+        f"Could not reliably determine page label for {index}.",
+        __name__
+    )
+    return str(index + 1)  # Fallback if neither /Nums nor /Kids is in the number_tree
+
+
+def nums_insert(
+    key: NumberObject,
+    value: DictionaryObject,
+    nums: ArrayObject,
+) -> None:
+    """
+    Insert a key, value pair in a Nums array.
+
+    The pair is placed in front of the first existing key that is greater
+    than ``key``. If ``key`` is already present, only its value is replaced.
+
+    See 7.9.7 "Number Trees".
+
+    Args:
+        key: number key of the entry
+        value: value of the entry
+        nums: Nums array to modify
+
+    Raises:
+        ValueError: If ``nums`` has an odd number of elements.
+    """
+    if len(nums) % 2:
+        raise ValueError("a nums like array must have an even number of elements")
+
+    # Scan the keys from the end towards the front until a key smaller
+    # than the new one is found (or the start of the array is reached).
+    position = len(nums)
+    while position > 0 and key <= nums[position - 2]:
+        position -= 2
+
+    key_exists = position < len(nums) and key == nums[position]
+    if key_exists:
+        nums[position + 1] = value
+        return
+
+    for offset, item in enumerate((key, value)):
+        nums.insert(position + offset, item)
+
+
+def nums_clear_range(
+    key: NumberObject,
+    page_index_to: int,
+    nums: ArrayObject,
+) -> None:
+    """
+    Remove all entries in a number tree in a range after an entry.
+
+    Every key/value pair following the entry ``key`` whose key is less than
+    or equal to ``page_index_to`` is removed. The entry ``key`` itself is kept.
+
+    See 7.9.7 "Number Trees".
+
+    Args:
+        key: number key of the entry before the range
+        page_index_to: The page index of the upper limit of the range
+        nums: Nums array to modify
+
+    Raises:
+        ValueError: If ``nums`` has an odd number of elements, if
+            ``page_index_to`` is smaller than ``key``, or if ``key`` is not
+            found in ``nums``.
+    """
+    if len(nums) % 2:
+        raise ValueError("a nums like array must have an even number of elements")
+    if page_index_to < key:
+        raise ValueError("page_index_to must be greater or equal than key")
+
+    # First pair after the entry `key`.
+    first = nums.index(key) + 2
+    # Advance pair by pair to the first key beyond the range ...
+    last = first
+    while last < len(nums) and nums[last] <= page_index_to:
+        last += 2
+    # ... and drop everything in between at once.
+    del nums[first:last]
+
+
+def nums_next(
+    key: NumberObject,
+    nums: ArrayObject,
+) -> Tuple[Optional[NumberObject], Optional[DictionaryObject]]:
+    """
+    Return the (key, value) pair of the entry after the given one.
+
+    See 7.9.7 "Number Trees".
+
+    Args:
+        key: number key of the entry
+        nums: Nums array
+
+    Returns:
+        The key and the value following the entry ``key``, or
+        ``(None, None)`` if ``key`` belongs to the last entry.
+
+    Raises:
+        ValueError: If ``nums`` has an odd number of elements or if ``key``
+            is not found in ``nums``.
+    """
+    if len(nums) % 2:
+        raise ValueError("a nums like array must have an even number of elements")
+
+    following = nums.index(key) + 2
+    if following >= len(nums):
+        # `key` is the last entry, there is nothing after it.
+        return (None, None)
+    return (nums[following], nums[following + 1])