"""
Page labels are shown by PDF viewers as "the page number".

A page has a numeric index, starting at 0. Additionally, the page
has a label. In the most simple case:

    label = index + 1

However, the title page and the table of contents might have Roman numerals as
page labels. This makes things more complicated.

Example 1
---------

>>> reader.root_object["/PageLabels"]["/Nums"]
[0, IndirectObject(18, 0, 139929798197504),
 8, IndirectObject(19, 0, 139929798197504)]
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
{'/S': '/r'}
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
{'/S': '/D'}

Example 2
---------
The following is a document with pages labeled
i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...

1 0 obj
    << /Type /Catalog
       /PageLabels << /Nums [0 << /S /r >>
                             4 << /S /D >>
                             7 << /S /D
                                  /P ( A- )
                                  /St 8
                               >>
                               % A number tree containing
                               % three page label dictionaries
                            ]
                   >>
    ...
    >>
endobj


§12.4.2 PDF Specification 1.7 and 2.0
=====================================

Entries in a page label dictionary
----------------------------------
The /S key:
D       Decimal Arabic numerals
R       Uppercase Roman numerals
r       Lowercase Roman numerals
A       Uppercase letters (A to Z for the first 26 pages,
                           AA to ZZ for the next 26, and so on)
a       Lowercase letters (a to z for the first 26 pages,
                           aa to zz for the next 26, and so on)
"""

from typing import Iterator, List, Optional, Tuple, cast

from ._protocols import PdfCommonDocProtocol
from ._utils import logger_warning
from .generic import ArrayObject, DictionaryObject, NullObject, NumberObject

# Value/symbol pairs in descending order, including the subtractive forms.
_ROMAN_NUMERALS = (
    (1000, "M"),
    (900, "CM"),
    (500, "D"),
    (400, "CD"),
    (100, "C"),
    (90, "XC"),
    (50, "L"),
    (40, "XL"),
    (10, "X"),
    (9, "IX"),
    (5, "V"),
    (4, "IV"),
    (1, "I"),
)


def _roman_parts(num: int) -> Iterator[str]:
    """Yield the Roman numeral fragments of a positive *num*, largest first."""
    for decimal, roman_repr in _ROMAN_NUMERALS:
        count, num = divmod(num, decimal)
        yield roman_repr * count
        if num == 0:
            break


def number2uppercase_roman_numeral(num: int) -> str:
    """
    Convert a number to its uppercase Roman numeral representation.

    Args:
        num: The number to convert.

    Returns:
        The Roman numeral, e.g. "XIV" for 14. Numbers that are zero or
        negative have no Roman representation and give an empty string.
    """
    # Guard against negative input: divmod() with a negative dividend returns
    # a negative quotient and a positive remainder, which would otherwise
    # produce an unrelated numeral (e.g. -1 -> "CMXCIX").
    if num <= 0:
        return ""
    return "".join(_roman_parts(num))


def number2lowercase_roman_numeral(number: int) -> str:
    """
    Convert a number to its lowercase Roman numeral representation.

    Args:
        number: The number to convert.

    Returns:
        The lowercased result of number2uppercase_roman_numeral.
    """
    return number2uppercase_roman_numeral(number).lower()


def number2uppercase_letter(number: int) -> str:
    """
    Convert a positive number to an uppercase letter label.

    The conversion is a bijective base-26 numbering:
    1 -> "A", 26 -> "Z", 27 -> "AA", 28 -> "AB", 52 -> "AZ", 53 -> "BA".

    NOTE(review): the wording quoted in the module docstring ("AA to ZZ for
    the next 26") can be read as repeated letters (28 -> "BB"). The existing
    behavior is kept here - confirm which form is intended.

    Args:
        number: The number to convert; must be greater than zero.

    Returns:
        The letter label.

    Raises:
        ValueError: If number is zero or negative.
    """
    if number <= 0:
        raise ValueError("Expecting a positive number")
    letters = []
    while number > 0:
        # Shifting by one maps 1..26 onto the offsets 0..25 of "A".."Z".
        number, offset = divmod(number - 1, 26)
        letters.append(chr(ord("A") + offset))
    # The least significant letter was produced first.
    return "".join(reversed(letters))


def number2lowercase_letter(number: int) -> str:
    """
    Convert a positive number to a lowercase letter label.

    Args:
        number: The number to convert; must be greater than zero.

    Returns:
        The lowercased result of number2uppercase_letter.
    """
    return number2uppercase_letter(number).lower()


# Maps the /S entry of a page label dictionary to its number formatter.
# A missing /S entry (None) means the label has no numeric portion.
_LABEL_STYLE_FORMATTERS = {
    None: lambda n: "",
    "/D": str,
    "/R": number2uppercase_roman_numeral,
    "/r": number2lowercase_roman_numeral,
    "/A": number2uppercase_letter,
    "/a": number2lowercase_letter,
}


def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
    """
    Determine the page label for a page index from a /Nums array.

    Args:
        dictionary_object: A dictionary with a "/Nums" entry.
        index: The index of the page.

    Returns:
        The label of the page. When the /Nums array cannot be used (it is
        empty, the selected value is not a dictionary, the first key lies
        after the index or the numbering style is unknown), the fallback
        ``str(index + 1)`` is returned.
    """
    # [Nums] shall be an array of the form
    #   [ key 1 value 1 key 2 value 2 ... key n value n ]
    # where each key_i is an integer and the corresponding
    # value_i shall be the object associated with that key.
    # The keys shall be sorted in numerical order,
    # analogously to the arrangement of keys in a name tree
    # as described in 7.9.6, "Name Trees."
    nums = cast(ArrayObject, dictionary_object["/Nums"])

    # Only look at complete (key, value) pairs, so that a dangling key at the
    # end of a malformed array does not raise an IndexError.
    usable = len(nums) - len(nums) % 2

    i = 0
    value = None
    start_index = 0
    while i < usable:
        start_index = nums[i]
        value = nums[i + 1].get_object()
        # Stop at the last pair, or when the next range starts after `index`.
        if i + 2 == usable or nums[i + 2] > index:
            break
        i += 2

    fallback = str(index + 1)

    # if /Nums array is not following the specification or if /Nums is empty
    if not isinstance(value, dict):
        return fallback
    # No range covers the page: the first key is already behind the index,
    # which would lead to a zero or negative page number below.
    if start_index > index:
        return fallback

    style = value.get("/S")
    formatter = _LABEL_STYLE_FORMATTERS.get(style)
    if formatter is None:
        logger_warning(
            f"Unknown page label style {style!r}; "
            f"using the page number for index {index} instead.",
            __name__,
        )
        return fallback

    start = value.get("/St", 1)
    prefix = value.get("/P", "")
    return prefix + formatter(index - start_index + start)


def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
    """
    See 7.9.7 "Number Trees".

    Args:
        reader: The PdfReader
        index: The index of the page

    Returns:
        The label of the page, e.g. "iv" or "4".
    """
    root = cast(DictionaryObject, reader.root_object)
    if "/PageLabels" not in root:
        return str(index + 1)  # Fallback
    number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
    if "/Nums" in number_tree:
        return get_label_from_nums(number_tree, index)
    if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject):
        # number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
        # Limit maximum depth.
        level = 0
        while level < 100:
            kids = cast(List[DictionaryObject], number_tree["/Kids"])
            for kid in kids:
                # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
                limits = cast(List[int], kid["/Limits"])
                # NOTE(review): an index that lies between the /Limits of two
                # kids matches none of them and ends up in the fallback below.
                if limits[0] <= index <= limits[1]:
                    if kid.get("/Kids", None) is not None:
                        # Recursive definition.
                        level += 1
                        if level == 100:  # pragma: no cover
                            raise NotImplementedError(
                                "Too deep nesting is not supported."
                            )
                        number_tree = kid
                        # Exit the inner `for` loop and continue at the next
                        # level with the next iteration of the `while` loop.
                        break
                    return get_label_from_nums(kid, index)
            else:
                # When there are no kids, make sure to exit the `while` loop
                # directly and continue with the fallback.
                break

    logger_warning(
        f"Could not reliably determine page label for {index}.",
        __name__
    )
    return str(index + 1)  # Fallback if neither /Nums nor /Kids is in the number_tree


def nums_insert(
    key: NumberObject,
    value: DictionaryObject,
    nums: ArrayObject,
) -> None:
    """
    Insert a key, value pair in a Nums array.

    See 7.9.7 "Number Trees".

    An existing entry with the same key gets its value replaced; otherwise
    the pair is inserted so that the keys stay in ascending order.

    Args:
        key: number key of the entry
        value: value of the entry
        nums: Nums array to modify

    Raises:
        ValueError: If nums has an odd number of elements.
    """
    if len(nums) % 2 != 0:
        raise ValueError("a nums like array must have an even number of elements")

    # Walk backwards over the keys until one smaller than `key` is found.
    i = len(nums)
    while i != 0 and key <= nums[i - 2]:
        i = i - 2

    if i < len(nums) and key == nums[i]:
        nums[i + 1] = value
    else:
        nums.insert(i, key)
        nums.insert(i + 1, value)


def nums_clear_range(
    key: NumberObject,
    page_index_to: int,
    nums: ArrayObject,
) -> None:
    """
    Remove all entries in a number tree in a range after an entry.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry before the range
        page_index_to: The page index of the upper limit of the range
        nums: Nums array to modify

    Raises:
        ValueError: If nums has an odd number of elements or if
            page_index_to is smaller than key. Presumably also raised by
            ``nums.index`` when key is not part of nums.
    """
    if len(nums) % 2 != 0:
        raise ValueError("a nums like array must have an even number of elements")
    if page_index_to < key:
        raise ValueError("page_index_to must be greater or equal than key")

    # Start at the entry that follows `key` and drop pairs up to the limit.
    i = nums.index(key) + 2
    while i < len(nums) and nums[i] <= page_index_to:
        # Popping the key moves its value to position `i`.
        nums.pop(i)
        nums.pop(i)


def nums_next(
    key: NumberObject,
    nums: ArrayObject,
) -> Tuple[Optional[NumberObject], Optional[DictionaryObject]]:
    """
    Return the (key, value) pair of the entry after the given one.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry
        nums: Nums array

    Returns:
        The (key, value) pair following the entry of key, or (None, None)
        when that entry is the last one.

    Raises:
        ValueError: If nums has an odd number of elements. Presumably also
            raised by ``nums.index`` when key is not part of nums.
    """
    if len(nums) % 2 != 0:
        raise ValueError("a nums like array must have an even number of elements")

    i = nums.index(key) + 2
    if i < len(nums):
        return (nums[i], nums[i + 1])
    return (None, None)