1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
|
"""
Code related to text extraction.
Some parts are still in _page.py; when in doubt, they stay there.
"""
import math
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding
# Lower bound (code point, inclusive) of an extra, user-defined range of
# characters that handle_tj() treats as right-to-left. With both bounds at -1
# no additional range is active. Changed through set_custom_rtl().
CUSTOM_RTL_MIN: int = -1
# Upper bound (code point, inclusive) of that user-defined right-to-left range.
CUSTOM_RTL_MAX: int = -1
# Code points that handle_tj() inserts following the current writing direction
# instead of switching direction. Changed through set_custom_rtl().
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
# NOTE(review): not referenced in the visible part of this file; presumably a
# number of space widths used by layout-mode extraction - confirm with its users.
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
class OrientationNotFoundError(Exception):
    """Raised by crlf_space_check() when the computed text orientation is not one of the requested orientations."""
def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new minimum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer, it is used as the code point as is.
            If set to a single-character string, it is converted to its
            Unicode code point with ``ord``.
            The default value is -1, which sets no additional range to be converted.
        _max: The new maximum value for the range of custom characters that will
            be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer, it is used as the code point as is.
            If set to a single-character string, it is converted to its
            Unicode code point with ``ord``.
            The default value is -1, which sets no additional range to be converted.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to a list of Unicode
            code points, one per character.
            If set to a list, a copy of it is stored.
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    Raises:
        TypeError: If ``_min`` or ``_max`` is a string that is not exactly one
            character long (raised by ``ord``).
    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)

    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)

    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(char) for char in specials]
    elif isinstance(specials, list):
        # Store a copy: keeping a reference to the caller's list would let any
        # later mutation of that list silently change the module-wide setting.
        CUSTOM_RTL_SPECIAL_CHARS = list(specials)

    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
def mult(m: List[float], n: List[float]) -> List[float]:
    """
    Multiply two transformation matrices given as six-element lists.

    Each list ``[a, b, c, d, e, f]`` is read as two 2-component rows
    ``(a, b)`` and ``(c, d)`` followed by an offset ``(e, f)``; the offset of
    ``n`` is added to the transformed offset of ``m``.

    Args:
        m: Left operand; elements 0 to 5 are read.
        n: Right operand; elements 0 to 5 are read.

    Returns:
        A new six-element list holding the product ``m * n``.
    """
    columns = (0, 1)
    rows = ((m[0], m[1]), (m[2], m[3]))
    linear_part = [
        left * n[col] + right * n[col + 2] for left, right in rows for col in columns
    ]
    offset_part = [m[4] * n[col] + m[5] * n[col + 2] + n[col + 4] for col in columns]
    return linear_part + offset_part
def orient(m: List[float]) -> int:
    """
    Classify a transformation matrix into one of four orientations.

    Args:
        m: Matrix as a list; only ``m[3]`` and ``m[1]`` are inspected.

    Returns:
        ``0`` if ``m[3]`` is above ``1e-6``, ``180`` if it is below ``-1e-6``;
        otherwise ``90`` if ``m[1]`` is positive and ``270`` if it is not.
    """
    d = m[3]
    if abs(d) > 1e-6:
        return 0 if d > 0 else 180
    # m[3] is (almost) zero: the sign of m[1] decides.
    return 90 if m[1] > 0 else 270
def _line_offsets(
    orientation: int, delta_x: float, delta_y: float, f: float
) -> Optional[Tuple[float, float, bool]]:
    """Return ``(across, along, is_new_line)`` for an orientation, or ``None`` if it is not 0/90/180/270.

    ``across`` is the offset tested against the line-break and same-line
    thresholds, ``along`` the offset tested against the space threshold.
    """
    if orientation == 0:
        return delta_y, delta_x, delta_y < -0.8 * f
    if orientation == 180:
        return delta_y, delta_x, delta_y > 0.8 * f
    if orientation == 90:
        return delta_x, delta_y, delta_x > 0.8 * f
    if orientation == 270:
        return delta_x, delta_y, delta_x < -0.8 * f
    return None


def crlf_space_check(
    text: str,
    cmtm_prev: Tuple[List[float], List[float]],
    cmtm_matrix: Tuple[List[float], List[float]],
    memo_cmtm: Tuple[List[float], List[float]],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    spacewidth: float,
) -> Tuple[str, str, List[float], List[float]]:
    """
    Insert a line break or a space depending on how far the text position moved.

    The previous and the current position are each obtained as
    ``mult(tm, cm)``; the offset between them is compared with thresholds
    scaled by ``font_size`` and by the scale of the current matrix.

    * Offset of more than 0.8 scaled font sizes in the line-advance direction
      of the orientation: ``text`` plus ``"\\n"`` is moved to ``output`` (and
      passed to ``visitor_text``) unless the buffer already ends with a
      newline; ``text`` is then emptied.
    * Otherwise, offset below 0.3 scaled font sizes on that axis and above
      ``spacewidth * 15`` scaled font sizes on the other axis: a space is
      appended to ``text`` unless the buffer already ends with one.

    Nothing is inserted while ``output + text`` is still empty.

    Args:
        text: Pending text that has not been moved to ``output`` yet.
        cmtm_prev: Pair ``(cm, tm)`` of matrices for the previous position.
        cmtm_matrix: Pair ``(cm, tm)`` of matrices for the current position.
        memo_cmtm: Pair ``(cm, tm)`` of matrices handed to ``visitor_text``.
        cmap: Character-map tuple; only ``cmap[3]`` is used here and it is
            forwarded to ``visitor_text``.
        orientations: Orientations (as returned by ``orient``) to process.
        output: Text already emitted.
        font_size: Current font size.
        visitor_text: Optional callback called with
            ``(text, cm, tm, cmap[3], font_size)`` when a line is emitted.
        spacewidth: Factor applied to the space-detection threshold.

    Returns:
        ``(text, output, cm, tm)`` where ``cm`` and ``tm`` are copies of the
        current matrices, to be used as the previous ones by the next call.

    Raises:
        OrientationNotFoundError: If the orientation of the current matrix is
            not in ``orientations``.
    """
    cm_prev = cmtm_prev[0]
    tm_prev = cmtm_prev[1]
    cm_matrix = cmtm_matrix[0]
    tm_matrix = cmtm_matrix[1]
    memo_cm = memo_cmtm[0]
    memo_tm = memo_cmtm[1]

    m_prev = mult(tm_prev, cm_prev)
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    delta_x = m[4] - m_prev[4]
    delta_y = m[5] - m_prev[5]
    # Scale of the current matrix, so thresholds are expressed in rendered size.
    k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
    f = font_size * k
    if orientation not in orientations:
        raise OrientationNotFoundError

    try:
        offsets = _line_offsets(orientation, delta_x, delta_y, f)
        if offsets is not None:
            across, along, is_new_line = offsets
            if is_new_line:
                # [-1:] yields "" for an empty buffer: nothing to terminate yet.
                if (output + text)[-1:] not in ("", "\n"):
                    output += text + "\n"
                    if visitor_text is not None:
                        visitor_text(
                            text + "\n",
                            memo_cm,
                            memo_tm,
                            cmap[3],
                            font_size,
                        )
                    text = ""
            elif (
                abs(across) < f * 0.3
                and abs(along) > spacewidth * f * 15
                and (output + text)[-1:] not in ("", " ")
            ):
                text += " "
    except Exception:
        # Best effort, kept on purpose: a failure here (including one raised
        # by the visitor_text callback) must not abort text extraction.
        # NOTE(review): if visitor_text raises, ``output`` has already been
        # extended but ``text`` is not cleared - confirm this is acceptable.
        pass

    tm_prev = tm_matrix.copy()
    cm_prev = cm_matrix.copy()
    return text, output, cm_prev, tm_prev
def handle_tj(
    text: str,
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
) -> Tuple[str, bool]:
    """
    Append the string operand of a text-showing operation to ``text``.

    Nothing is done when the orientation of ``mult(tm_matrix, cm_matrix)`` is
    not in ``orientations`` or when ``operands`` is empty.

    A ``str`` operand is appended unchanged. Any other operand is treated as
    bytes: it is decoded with ``cmap[0]`` (a codec name, or a dict from byte
    value to string), each decoded character is mapped through ``cmap[1]``,
    and the results are inserted one by one according to writing direction:

    * neutral characters (see the ranges in the body) follow the current
      direction,
    * right-to-left characters are prepended,
    * all other characters are appended.

    When the direction switches, the text collected so far is passed to
    ``visitor_text`` and the buffer restarts empty.

    Args:
        text: Pending text buffer.
        operands: Operands of the operation; only ``operands[0]`` is read.
        cm_matrix: Matrix passed as second argument to ``mult``.
        tm_matrix: Matrix passed as first argument to ``mult``.
        cmap: Character-map tuple ``(encoding, char map, _, font)``;
            ``cmap[3]`` is only forwarded to ``visitor_text``.
        orientations: Orientations (as returned by ``orient``) to process.
        output: Text already emitted. NOTE(review): strings are immutable and
            ``output`` is not returned, so the additions made here never reach
            the caller; text flushed on a direction switch is only delivered
            through ``visitor_text``. Fixing this needs an interface change.
        font_size: Current font size, forwarded to ``visitor_text``.
        rtl_dir: ``True`` if the buffer is currently written right to left.
        visitor_text: Optional callback called with
            ``(text, cm_matrix, tm_matrix, cmap[3], font_size)``.

    Returns:
        The updated ``(text, rtl_dir)``.
    """
    orientation = orient(mult(tm_matrix, cm_matrix))
    if orientation not in orientations or len(operands) == 0:
        return text, rtl_dir

    operand = operands[0]
    if isinstance(operand, str):
        return text + operand, rtl_dir

    # From here on the operand is not a str; it is handled as raw bytes.
    raw = operand
    encoding = cmap[0]
    if isinstance(encoding, str):
        try:
            decoded = raw.decode(encoding, "surrogatepass")  # apply str encoding
        except Exception:
            # The data does not match the expected encoding: fall back to the
            # alternative one; text extraction may not be good.
            fallback = "utf-16-be" if encoding == "charmap" else "charmap"
            decoded = raw.decode(fallback, "surrogatepass")
    else:  # apply dict encoding
        # NOTE(review): the fallback for unmapped bytes uses the default
        # (UTF-8) codec, which raises UnicodeDecodeError for values >= 0x80.
        decoded = "".join(
            [
                encoding[byte] if byte in encoding else bytes((byte,)).decode()
                for byte in raw
            ]
        )

    char_map = cmap[1]
    for glyph in [char_map.get(char, char) for char in decoded]:
        # A mapped glyph can be a sequence of several characters (ex:
        # habibi.pdf); such sequences get code 1 and so count as neutral.
        code = ord(glyph) if len(glyph) == 1 else 1

        is_neutral = (
            code <= 0x2F  # controls and punctuation up to "/"
            or 0x3A <= code <= 0x40  # ":" to "@" - digits 0x30-0x39 are NOT included
            or 0x2000 <= code <= 0x206F  # general punctuation
            or 0x20A0 <= code <= 0x21FF  # currency symbols up to arrows
            or code in CUSTOM_RTL_SPECIAL_CHARS  # user-defined
        )
        if not is_neutral:
            is_rtl = (
                0x0590 <= code <= 0x08FF  # Hebrew, Arabic and neighbouring blocks
                or 0xFB1D <= code <= 0xFDFF  # Hebrew / Arabic presentation forms
                or 0xFE70 <= code <= 0xFEFF  # Arabic presentation forms-B
                or CUSTOM_RTL_MIN <= code <= CUSTOM_RTL_MAX  # user-defined
            )
            if is_rtl != bool(rtl_dir):
                # Direction switch: hand over what was collected and restart.
                rtl_dir = is_rtl
                output += text  # local only, see NOTE(review) in the docstring
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                text = ""

        # Right-to-left text grows at the front, left-to-right at the back.
        text = glyph + text if rtl_dir else text + glyph

    return text, rtl_dir
|