diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/_page_labels.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/pypdf/_page_labels.py | 280 |
1 files changed, 280 insertions, 0 deletions
"""
Page labels are shown by PDF viewers as "the page number".

A page has a numeric index, starting at 0. Additionally, the page
has a label. In the most simple case:

    label = index + 1

However, the title page and the table of contents might have Roman numerals as
page labels. This makes things more complicated.

Example 1
---------

>>> reader.root_object["/PageLabels"]["/Nums"]
[0, IndirectObject(18, 0, 139929798197504),
 8, IndirectObject(19, 0, 139929798197504)]
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
{'/S': '/r'}
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
{'/S': '/D'}

Example 2
---------
The following is a document with pages labeled
i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...

1 0 obj
    << /Type /Catalog
       /PageLabels << /Nums [
                        0 << /S /r >>
                        4 << /S /D >>
                        7 << /S /D
                             /P ( A- )
                             /St 8
                          >>
                        % A number tree containing
                        % three page label dictionaries
                        ]
                   >>
    ...
    >>
endobj


§12.4.2 PDF Specification 1.7 and 2.0
=====================================

Entries in a page label dictionary
----------------------------------
The /S key:
D       Decimal Arabic numerals
R       Uppercase Roman numerals
r       Lowercase Roman numerals
A       Uppercase letters (A to Z for the first 26 pages,
                           AA to ZZ for the next 26, and so on)
a       Lowercase letters (a to z for the first 26 pages,
                           aa to zz for the next 26, and so on)
"""

from typing import Iterator, List, Optional, Tuple, cast

from ._protocols import PdfCommonDocProtocol
from ._utils import logger_warning
from .generic import ArrayObject, DictionaryObject, NullObject, NumberObject

# (value, symbol) pairs in descending order, including the subtractive forms,
# so a single greedy pass over the table yields the numeral.
_ROMAN_NUMERALS = (
    (1000, "M"),
    (900, "CM"),
    (500, "D"),
    (400, "CD"),
    (100, "C"),
    (90, "XC"),
    (50, "L"),
    (40, "XL"),
    (10, "X"),
    (9, "IX"),
    (5, "V"),
    (4, "IV"),
    (1, "I"),
)


def number2uppercase_roman_numeral(num: int) -> str:
    """
    Convert a number to an uppercase Roman numeral.

    Args:
        num: The number to convert.

    Returns:
        The Roman numeral, e.g. "XIV" for 14. 0 yields an empty string.
    """

    def roman_parts(remaining: int) -> Iterator[str]:
        for decimal, symbol in _ROMAN_NUMERALS:
            count, remaining = divmod(remaining, decimal)
            yield symbol * count
            if remaining <= 0:
                break

    return "".join(roman_parts(num))


def number2lowercase_roman_numeral(number: int) -> str:
    """
    Convert a number to a lowercase Roman numeral.

    Args:
        number: The number to convert.

    Returns:
        The lowercased result of :func:`number2uppercase_roman_numeral`.
    """
    return number2uppercase_roman_numeral(number).lower()


def number2uppercase_letter(number: int) -> str:
    """
    Convert a 1-based number to an uppercase letter label.

    Follows the /S /A rule quoted in the module docstring: A to Z for the
    first 26 pages, AA to ZZ for the next 26, and so on. The letter cycles
    through the alphabet and is repeated once more for every completed cycle:
    1 -> "A", 26 -> "Z", 27 -> "AA", 28 -> "BB", 52 -> "ZZ", 53 -> "AAA".

    Args:
        number: The positive number to convert.

    Returns:
        The letter label.

    Raises:
        ValueError: If ``number`` is not positive.
    """
    if number <= 0:
        raise ValueError("Expecting a positive number")
    completed_cycles, letter_offset = divmod(number - 1, 26)
    return chr(ord("A") + letter_offset) * (completed_cycles + 1)


def number2lowercase_letter(number: int) -> str:
    """
    Convert a 1-based number to a lowercase letter label.

    Args:
        number: The positive number to convert.

    Returns:
        The lowercased result of :func:`number2uppercase_letter`.

    Raises:
        ValueError: If ``number`` is not positive.
    """
    return number2uppercase_letter(number).lower()


# Maps the /S entry of a page label dictionary to the function rendering the
# numeric portion. ``None`` stands for a missing /S entry, in which case the
# label has no numeric portion and consists of the prefix only.
_LABEL_STYLES = {
    None: lambda n: "",
    "/D": str,
    "/R": number2uppercase_roman_numeral,
    "/r": number2lowercase_roman_numeral,
    "/A": number2uppercase_letter,
    "/a": number2lowercase_letter,
}


def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
    """
    Determine the label of a page from a /Nums array.

    Args:
        dictionary_object: A number tree node containing a "/Nums" entry.
        index: The index of the page.

    Returns:
        The label of the page. Falls back to ``str(index + 1)`` when the
        /Nums array cannot be used for this page (empty array, value that is
        not a dictionary, page located before the first key, or unsupported
        numbering style).
    """
    # [Nums] shall be an array of the form
    #   [ key 1 value 1 key 2 value 2 ... key n value n ]
    # where each key_i is an integer and the corresponding
    # value_i shall be the object associated with that key.
    # The keys shall be sorted in numerical order,
    # analogously to the arrangement of keys in a name tree
    # as described in 7.9.6, "Name Trees."
    nums = cast(ArrayObject, dictionary_object["/Nums"])
    value = None
    start_index = 0
    i = 0
    # Only complete (key, value) pairs are considered; a dangling trailing key
    # in a malformed array is ignored instead of raising an IndexError.
    while i + 1 < len(nums):
        start_index = nums[i]
        value = nums[i + 1].get_object()
        next_key_position = i + 2
        # Stop at the last complete pair or as soon as the next labelling
        # range starts after the requested page.
        if next_key_position + 1 >= len(nums) or nums[next_key_position] > index:
            break
        i = next_key_position

    # if /Nums array is not following the specification or if /Nums is empty
    if not isinstance(value, dict):
        return str(index + 1)  # Fallback
    if index < start_index:
        # The page precedes the first labelling range, so no entry applies.
        return str(index + 1)  # Fallback
    style = value.get("/S")
    formatter = _LABEL_STYLES.get(style)
    if formatter is None:
        logger_warning(
            f"Unsupported page label style {style!r} for page {index}.",
            __name__
        )
        return str(index + 1)  # Fallback
    start = value.get("/St", 1)
    prefix = value.get("/P", "")
    return prefix + formatter(index - start_index + start)


def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
    """
    See 7.9.7 "Number Trees".

    Args:
        reader: The PdfReader
        index: The index of the page

    Returns:
        The label of the page, e.g. "iv" or "4".

    Raises:
        NotImplementedError: If the /Kids hierarchy is nested 100 levels deep.
    """
    root = cast(DictionaryObject, reader.root_object)
    if "/PageLabels" not in root:
        return str(index + 1)  # Fallback
    number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
    if "/Nums" in number_tree:
        return get_label_from_nums(number_tree, index)
    if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject):
        # number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
        # Limit maximum depth.
        level = 0
        while level < 100:
            kids = cast(List[DictionaryObject], number_tree["/Kids"])
            for kid in kids:
                # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
                limits = cast(List[int], kid["/Limits"])
                if limits[0] <= index <= limits[1]:
                    if kid.get("/Kids", None) is not None:
                        # Recursive definition.
                        level += 1
                        if level == 100:  # pragma: no cover
                            raise NotImplementedError("Too deep nesting is not supported.")
                        number_tree = kid
                        # Exit the inner `for` loop and continue at the next level with the
                        # next iteration of the `while` loop.
                        break
                    return get_label_from_nums(kid, index)
            else:
                # When there are no kids, make sure to exit the `while` loop directly
                # and continue with the fallback.
                break

    logger_warning(
        f"Could not reliably determine page label for {index}.",
        __name__
    )
    return str(index + 1)  # Fallback if neither /Nums nor /Kids is in the number_tree


def nums_insert(
    key: NumberObject,
    value: DictionaryObject,
    nums: ArrayObject,
) -> None:
    """
    Insert a key, value pair in a Nums array.

    An existing entry with the same key gets its value replaced; otherwise
    the pair is inserted in front of the first entry with a larger key.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry
        value: value of the entry
        nums: Nums array to modify

    Raises:
        ValueError: If ``nums`` has an odd number of elements.
    """
    if len(nums) % 2 != 0:
        raise ValueError("a nums like array must have an even number of elements")

    # Walk backwards over the keys until one smaller than `key` is found.
    i = len(nums)
    while i != 0 and key <= nums[i - 2]:
        i = i - 2

    if i < len(nums) and key == nums[i]:
        nums[i + 1] = value
    else:
        nums.insert(i, key)
        nums.insert(i + 1, value)


def nums_clear_range(
    key: NumberObject,
    page_index_to: int,
    nums: ArrayObject,
) -> None:
    """
    Remove all entries in a number tree in a range after an entry.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry before the range
        page_index_to: The page index of the upper limit of the range
        nums: Nums array to modify

    Raises:
        ValueError: If ``nums`` has an odd number of elements, if
            ``page_index_to`` is smaller than ``key``, or (raised by
            ``nums.index``) if ``key`` is not found in ``nums``.
    """
    if len(nums) % 2 != 0:
        raise ValueError("a nums like array must have an even number of elements")
    if page_index_to < key:
        raise ValueError("page_index_to must be greater or equal than key")

    i = nums.index(key) + 2
    while i < len(nums) and nums[i] <= page_index_to:
        # Popping twice at the same position removes the key and then its value.
        nums.pop(i)
        nums.pop(i)


def nums_next(
    key: NumberObject,
    nums: ArrayObject,
) -> Tuple[Optional[NumberObject], Optional[DictionaryObject]]:
    """
    Return the (key, value) pair of the entry after the given one.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry
        nums: Nums array

    Returns:
        The (key, value) pair following ``key``, or ``(None, None)`` if
        ``key`` belongs to the last entry.

    Raises:
        ValueError: If ``nums`` has an odd number of elements, or (raised by
            ``nums.index``) if ``key`` is not found in ``nums``.
    """
    if len(nums) % 2 != 0:
        raise ValueError("a nums like array must have an even number of elements")

    i = nums.index(key) + 2
    if i < len(nums):
        return (nums[i], nums[i + 1])
    return (None, None)