1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
|
"""
Page labels are shown by PDF viewers as "the page number".
A page has a numeric index, starting at 0. Additionally, the page
has a label. In the simplest case:
label = index + 1
However, the title page and the table of contents might have Roman numerals as
page labels. This makes things more complicated.
Example 1
---------
>>> reader.root_object["/PageLabels"]["/Nums"]
[0, IndirectObject(18, 0, 139929798197504),
8, IndirectObject(19, 0, 139929798197504)]
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][1])
{'/S': '/r'}
>>> reader.get_object(reader.root_object["/PageLabels"]["/Nums"][3])
{'/S': '/D'}
Example 2
---------
The following is a document with pages labeled
i, ii, iii, iv, 1, 2, 3, A-8, A-9, ...
1 0 obj
<< /Type /Catalog
/PageLabels << /Nums [
0 << /S /r >>
4 << /S /D >>
7 << /S /D
/P ( A- )
/St 8
>>
% A number tree containing
% three page label dictionaries
]
>>
...
>>
endobj
§12.4.2 PDF Specification 1.7 and 2.0
=====================================
Entries in a page label dictionary
----------------------------------
The /S key:
D Decimal Arabic numerals
R Uppercase Roman numerals
r Lowercase Roman numerals
A Uppercase letters (A to Z for the first 26 pages,
AA to ZZ for the next 26, and so on)
a Lowercase letters (a to z for the first 26 pages,
aa to zz for the next 26, and so on)
"""
from typing import Iterator, List, Optional, Tuple, cast
from ._protocols import PdfCommonDocProtocol
from ._utils import logger_warning
from .generic import ArrayObject, DictionaryObject, NullObject, NumberObject
def number2uppercase_roman_numeral(num: int) -> str:
    """
    Convert an integer to its uppercase Roman numeral representation.

    Args:
        num: The number to convert.

    Returns:
        The Roman numeral, e.g. "XIV" for 14. Zero yields an empty string.
    """
    # Value/symbol pairs in descending order, subtractive forms included,
    # so a single greedy pass produces the canonical spelling.
    symbols = (
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    )
    pieces = []
    remaining = num
    for value, symbol in symbols:
        count = remaining // value
        pieces.append(symbol * count)
        remaining -= value * count
        # Nothing left to encode: skip the smaller symbols.
        if remaining <= 0:
            break
    return "".join(pieces)
def number2lowercase_roman_numeral(number: int) -> str:
    """
    Convert an integer to its lowercase Roman numeral representation.

    Args:
        number: The number to convert.

    Returns:
        The lowercased result of number2uppercase_roman_numeral.
    """
    uppercase = number2uppercase_roman_numeral(number)
    return uppercase.lower()
def number2uppercase_letter(number: int) -> str:
    """
    Convert a positive number to an uppercase letter page label.

    Implements the "/A" numbering style of the PDF specification (§12.4.2):
    A to Z for the first 26 pages, AA to ZZ for the next 26, AAA to ZZZ for
    the 26 after that, and so on. The same letter is repeated; this is not
    a spreadsheet-like base-26 sequence (27 is "AA", 28 is "BB").

    Args:
        number: The 1-based number to convert.

    Returns:
        The letter label, e.g. "C" for 3 or "BB" for 28.

    Raises:
        ValueError: If number is not positive.
    """
    if number <= 0:
        raise ValueError("Expecting a positive number")
    # Every block of 26 numbers cycles through the alphabet once; the block
    # number determines how often the letter is repeated.
    block, letter_offset = divmod(number - 1, 26)
    return chr(ord("A") + letter_offset) * (block + 1)
def number2lowercase_letter(number: int) -> str:
    """
    Convert a positive number to a lowercase letter page label.

    Args:
        number: The number to convert.

    Returns:
        The lowercased result of number2uppercase_letter.
    """
    uppercase = number2uppercase_letter(number)
    return uppercase.lower()
def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str:
    """
    Compute a page label from the /Nums array of a number tree node.

    [Nums] shall be an array of the form
      [ key 1 value 1 key 2 value 2 ... key n value n ]
    where each key_i is an integer and the corresponding
    value_i shall be the object associated with that key.
    The keys shall be sorted in numerical order,
    analogously to the arrangement of keys in a name tree
    as described in 7.9.6, "Name Trees."

    Args:
        dictionary_object: The node holding the "/Nums" entry.
        index: The index of the page.

    Returns:
        The label of the page. ``str(index + 1)`` is returned as a fallback
        when /Nums is empty, when the selected value is not a dictionary or
        when its /S numbering style is not one of the known styles.
    """
    nums = cast(ArrayObject, dictionary_object["/Nums"])
    value = None
    start_index = 0
    # Only look at complete (key, value) pairs: a dangling key at the end of
    # a malformed array is ignored instead of raising an IndexError.
    for position in range(0, len(nums) - 1, 2):
        # The first pair is always taken; later pairs only while their key
        # does not lie beyond the requested page.
        if position > 0 and nums[position] > index:
            break
        start_index = nums[position]
        value = nums[position + 1].get_object()

    # if /Nums array is not following the specification or if /Nums is empty
    if not isinstance(value, dict):
        return str(index + 1)  # Fallback

    formatters = {
        None: lambda n: "",  # no /S entry: the label has no numeric part
        "/D": lambda n: str(n),
        "/R": number2uppercase_roman_numeral,
        "/r": number2lowercase_roman_numeral,
        "/A": number2uppercase_letter,
        "/a": number2lowercase_letter,
    }
    formatter = formatters.get(value.get("/S"))
    if formatter is None:
        # Unknown numbering style: treat it like any other malformed entry.
        return str(index + 1)  # Fallback

    start = value.get("/St", 1)
    prefix = value.get("/P", "")
    # Position of the page inside its labelling range, offset by /St.
    return prefix + formatter(index - start_index + start)
def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
    """
    Return the label of the page with the given index.

    See 7.9.7 "Number Trees".

    Args:
        reader: The PdfReader
        index: The index of the page

    Returns:
        The label of the page, e.g. "iv" or "4". ``str(index + 1)`` is
        returned when the document has no /PageLabels or when no matching
        entry can be found in the number tree.

    Raises:
        NotImplementedError: If the number tree is nested 100 levels deep.
    """
    root = cast(DictionaryObject, reader.root_object)
    if "/PageLabels" not in root:
        return str(index + 1)  # Fallback
    number_tree = cast(DictionaryObject, root["/PageLabels"].get_object())
    if "/Nums" in number_tree:
        return get_label_from_nums(number_tree, index)
    if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject):
        # number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]}
        # Limit maximum depth.
        level = 0
        while level < 100:
            kids = cast(List[DictionaryObject], number_tree["/Kids"])
            kid = _find_number_tree_kid(kids, index)
            if kid is None:
                # No kid is responsible for this page: continue with the fallback.
                break
            if kid.get("/Kids", None) is None:
                # Leaf node: its /Nums array holds the label definition.
                return get_label_from_nums(kid, index)
            # Recursive definition: continue one level deeper.
            level += 1
            if level == 100:  # pragma: no cover
                raise NotImplementedError("Too deep nesting is not supported.")
            number_tree = kid

    logger_warning(
        f"Could not reliably determine page label for {index}.",
        __name__
    )
    return str(index + 1)  # Fallback if neither /Nums nor /Kids is in the number_tree


def _find_number_tree_kid(
    kids: List[DictionaryObject], index: int
) -> Optional[DictionaryObject]:
    """Return the kid of a number tree node that is responsible for index, or None."""
    candidate = None
    candidate_start = None
    for kid in kids:
        # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
        limits = cast(List[int], kid["/Limits"])
        if limits[0] <= index <= limits[1]:
            return kid
        # /Limits are the least and greatest keys of the kid, but a labelling
        # range extends beyond its key up to the next key. A page that lies
        # past the greatest key therefore still belongs to the kid with the
        # greatest lower limit that does not exceed the page index.
        if limits[0] <= index and (
            candidate_start is None or limits[0] > candidate_start
        ):
            candidate = kid
            candidate_start = limits[0]
    return candidate
def nums_insert(
    key: NumberObject,
    value: DictionaryObject,
    nums: ArrayObject,
) -> None:
    """
    Add a key, value pair to a Nums array, or replace the value of an existing key.

    The pair is placed in front of the trailing run of entries whose keys are
    greater than or equal to the new key, which keeps an already sorted array
    sorted.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry
        value: value of the entry
        nums: Nums array to modify

    Raises:
        ValueError: If nums has an odd number of elements.
    """
    size = len(nums)
    if size % 2 != 0:
        raise ValueError("a nums like array must have an even number of elements")

    # Walk backwards over the keys (every second element) until one is
    # smaller than the new key.
    position = size
    while position != 0 and key <= nums[position - 2]:
        position -= 2

    if position < size and key == nums[position]:
        # The key is already present: only its value changes.
        nums[position + 1] = value
        return
    nums.insert(position, key)
    nums.insert(position + 1, value)
def nums_clear_range(
    key: NumberObject,
    page_index_to: int,
    nums: ArrayObject,
) -> None:
    """
    Drop the entries that follow a given entry up to a page index.

    Starting right after the entry with the given key, consecutive entries
    are removed as long as their key is not greater than page_index_to.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry before the range
        page_index_to: The page index of the upper limit of the range
        nums: Nums array to modify

    Raises:
        ValueError: If nums has an odd number of elements or if
            page_index_to is smaller than key.
    """
    if len(nums) % 2 != 0:
        raise ValueError("a nums like array must have an even number of elements")
    if page_index_to < key:
        raise ValueError("page_index_to must be greater or equal than key")

    # First entry behind the one identified by `key`.
    cursor = nums.index(key) + 2
    while cursor < len(nums) and nums[cursor] <= page_index_to:
        # Remove key and value together; the next pair moves into `cursor`.
        del nums[cursor : cursor + 2]
def nums_next(
    key: NumberObject,
    nums: ArrayObject,
) -> Tuple[Optional[NumberObject], Optional[DictionaryObject]]:
    """
    Look up the entry that directly follows the entry with the given key.

    See 7.9.7 "Number Trees".

    Args:
        key: number key of the entry
        nums: Nums array

    Returns:
        The (key, value) pair of the following entry, or (None, None) if
        the given entry is the last one.

    Raises:
        ValueError: If nums has an odd number of elements.
    """
    if len(nums) % 2 != 0:
        raise ValueError("a nums like array must have an even number of elements")

    following = nums.index(key) + 2
    if following >= len(nums):
        # The given entry is the last one.
        return (None, None)
    return (nums[following], nums[following + 1])
|