"""
Taken from https://gist.github.com/gilsondev/7c1d2d753ddb522e7bc22511cfb08676
and modified for better output of tables.
"""
import codecs
import re
# fmt: off
# control words which specify a "destination".
destinations = frozenset((
'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
'ffname','ffstattext','file','filetbl','fldinst','fldtype','fonttbl',
'fname','fontemb','fontfile','footer','footerf','footerl','footerr',
'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
'listoverridetable','listpicture','liststylename','listtable','listtext',
'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
'svb','tc','template','themedata','title','txe','ud','upr','userprops',
'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
'xmlopen',
))
# fmt: on
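# Map RTF \fcharsetN values to Python codec names so text in a given font run
# can be decoded with that font's character set.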
charset_map = {
0: "cp1252", # Default
42: "cp1252", # Symbol
77: "mac_roman", # Mac Roman
78: "mac_japanese", # Mac Japanese
79: "mac_chinesetrad", # Mac Traditional Chinese
80: "mac_korean", # Mac Korean
81: "mac_arabic", # Mac Arabic
82: "mac_hebrew", # Mac Hebrew
83: "mac_greek", # Mac Greek
84: "mac_cyrillic", # Mac Cyrillic
85: "mac_chinesesimp", # Mac Simplified Chinese
86: "mac_rumanian", # Mac Romanian
87: "mac_ukrainian", # Mac Ukrainian
88: "mac_thai", # Mac Thai
89: "mac_ce", # Mac Central European
128: "cp932", # Japanese
129: "cp949", # Korean
130: "cp1361", # Johab (Korean)
134: "cp936", # Simplified Chinese (GBK)
136: "cp950", # Traditional Chinese (Big5)
161: "cp1253", # Greek
162: "cp1254", # Turkish
163: "cp1258", # Vietnamese
177: "cp1255", # Hebrew
178: "cp1256", # Arabic
186: "cp1257", # Baltic
204: "cp1251", # Cyrillic
222: "cp874", # Thai
238: "cp1250", # Eastern European
254: "cp437", # OEM United States
255: "cp850", # OEM Multilingual Latin 1
}
# Translation of some special characters to plain text.
# Section characters (\par, \sect, \page) also reset the formatting state.
sectionchars = {"par": "\n", "sect": "\n\n", "page": "\n\n"}
specialchars = {
**{
"line": "\n",
"tab": "\t",
"emdash": "\u2014",
"endash": "\u2013",
"emspace": "\u2003",
"enspace": "\u2002",
"qmspace": "\u2005",
"bullet": "\u2022",
"lquote": "\u2018",
"rquote": "\u2019",
"ldblquote": "\u201C",
"rdblquote": "\u201D",
"row": "\n",
"cell": "|",
"nestcell": "|",
"~": "\xa0",
"\n": "\n",
"\r": "\r",
"{": "{",
"}": "}",
"\\": "\\",
"-": "\xad",
"_": "\u2011",
},
**sectionchars,
}
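# Tokenizer: each match is exactly one of a control word with an optional numeric
# argument (\foo-123), a hex escape (\'xx), an escaped non-letter (\x), a group
# brace ({ or }), a run of newlines, or a single literal character.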
PATTERN = re.compile(
r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)",
re.IGNORECASE,
)
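# Hyperlink fields: group 1 is the whole {\field ...} group, group 2 the quoted
# target URL, group 3 the visible link text.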
HYPERLINKS = re.compile(
r"(\{\\field\{\s*\\\*\\fldinst\{.*HYPERLINK\s(\".*\")\}{2}\s*\{.*?\s+(.*?)\}{2,3})",
re.IGNORECASE,
)
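# Font table entries: group 1 is the font number (\fN), group 2 the charset id
# (\fcharsetN), group 3 the font name up to the terminating semicolon.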
FONTTABLE = re.compile(r"\\f(\d+).*?\\fcharset(\d+).*?([^;]+);")
def rtf_to_text(text, encoding="cp1252", errors="strict"):
"""Converts the rtf text to plain text.
Parameters
----------
text : str
The rtf text
encoding : str
Input encoding which is ignored if the rtf file contains an explicit codepage directive,
as it is typically the case. Defaults to `cp1252` encoding as it the most commonly used.
errors : str
How to handle encoding errors. Default is "strict", which throws an error. Another
option is "ignore" which, as the name says, ignores encoding errors.
Returns
-------
str
the converted rtf text as a python unicode string
"""
    text = re.sub(
        HYPERLINKS, "\\1(\\2)", text
    )  # rewrite hyperlink fields so the output reads like link_text("http://link_dest")
stack = []
fonttbl = {}
default_font = None
current_font = None
    ignorable = False  # Whether this group (and everything inside it) is "ignorable".
    suppress_output = False  # Whether to suppress text output inside this group (set for \fonttbl and \colortbl).
ucskip = 1 # Number of ASCII characters to skip after a unicode character.
curskip = 0 # Number of ASCII characters left to skip
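    # RTF emits \ucN to say how many fallback ASCII characters follow each \uN
    # escape; those fallbacks must be skipped so characters are not emitted twice.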
hexes = None
out = ""
    # Build the font table with the simplified FONTTABLE regex so \'xx escapes can
    # be decoded with each font's own encoding (falling back to the document encoding).
fonttbl_matches = FONTTABLE.findall(text)
for font_id, fcharset, font_name in fonttbl_matches:
fonttbl[font_id] = {
"name": font_name.strip(),
"charset": fcharset,
"encoding": charset_map.get(int(fcharset), encoding),
}
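    # Walk the token stream produced by PATTERN and build up the plain-text output.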
for match in PATTERN.finditer(text):
word, arg, _hex, char, brace, tchar = match.groups()
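        # Consecutive \'xx escapes are accumulated and decoded together, since
        # multi-byte encodings (e.g. cp932) spread one character over several bytes.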
if hexes and not _hex:
# Decode accumulated hexes
out += bytes.fromhex(hexes).decode(
encoding=fonttbl.get(current_font, {"encoding": encoding}).get(
"encoding", encoding
),
errors=errors,
)
hexes = None
if brace:
curskip = 0
if brace == "{":
# Push state
stack.append((ucskip, ignorable, suppress_output))
elif brace == "}":
# Pop state
if stack:
ucskip, ignorable, suppress_output = stack.pop()
                # sample_3.rtf raised an IndexError here because the stack was
                # empty; it is unclear how that can happen, so for now this is
                # an ugly hack to prevent the crash.
else:
ucskip = 0
ignorable = True
elif char: # \x (not a letter)
curskip = 0
if char in specialchars:
if char in sectionchars:
current_font = default_font
if not ignorable:
out += specialchars[char]
elif char == "*":
ignorable = True
elif word: # \foo
curskip = 0
if word in destinations:
ignorable = True
# http://www.biblioscape.com/rtf15_spec.htm#Heading8
elif word == "ansicpg":
encoding = f"cp{arg}"
try:
codecs.lookup(encoding)
except LookupError:
encoding = "utf8"
if ignorable or suppress_output:
pass
elif word in specialchars:
out += specialchars[word]
elif word == "uc":
ucskip = int(arg)
elif word == "u":
# because of https://github.com/joshy/striprtf/issues/6
if arg is None:
curskip = ucskip
else:
c = int(arg)
if c < 0:
c += 0x10000
out += chr(c)
curskip = ucskip
elif word == "f":
current_font = arg
elif word == "deff":
default_font = arg
elif word == "fonttbl":
suppress_output = True
elif word == "colortbl":
suppress_output = True
elif _hex: # \'xx
if curskip > 0:
curskip -= 1
elif not ignorable:
# Accumulate hex characters to decode later
if not hexes:
hexes = _hex
else:
hexes += _hex
elif tchar:
if curskip > 0:
curskip -= 1
elif not ignorable and not suppress_output:
out += tchar
return out
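

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): the string below is a
    # hand-written, illustrative RTF fragment.
    sample = r"{\rtf1\ansi\ansicpg1252 Hello, \b world\b0!\par}"
    print(rtf_to_text(sample))                   # prints "Hello, world!" plus the \par newline
    print(rtf_to_text(sample, errors="ignore"))  # same, but undecodable bytes would be skipped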