import re
import codecs

"""
Taken from https://gist.github.com/gilsondev/7c1d2d753ddb522e7bc22511cfb08676
and modified for better output of tables.
"""

# fmt: off
# control words which specify a "destination".
destinations = frozenset((
    'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
    'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
    'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
    'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
    'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
    'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
    'ffname','ffstattext','file','filetbl','fldinst','fldtype','fonttbl',
    'fname','fontemb','fontfile','footer','footerf','footerl','footerr',
    'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
    'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
    'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
    'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
    'listoverridetable','listpicture','liststylename','listtable','listtext',
    'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
    'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
    'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
    'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
    'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
    'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
    'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
    'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
    'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
    'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
    'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
    'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
    'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
    'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
    'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
    'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
    'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
    'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
    'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
    'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
    'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
    'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
    'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
    'svb','tc','template','themedata','title','txe','ud','upr','userprops',
    'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
    'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
    'xmlopen',
))
# fmt: on
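# Maps RTF \fcharsetN values from the font table to Python codec names.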
charset_map = {
    0: "cp1252",  # Default
    42: "cp1252",  # Symbol
    77: "mac_roman",  # Mac Roman
    78: "mac_japanese",  # Mac Japanese
    79: "mac_chinesetrad",  # Mac Traditional Chinese
    80: "mac_korean",  # Mac Korean
    81: "mac_arabic",  # Mac Arabic
    82: "mac_hebrew",  # Mac Hebrew
    83: "mac_greek",  # Mac Greek
    84: "mac_cyrillic",  # Mac Cyrillic
    85: "mac_chinesesimp",  # Mac Simplified Chinese
    86: "mac_rumanian",  # Mac Romanian
    87: "mac_ukrainian",  # Mac Ukrainian
    88: "mac_thai",  # Mac Thai
    89: "mac_ce",  # Mac Central European
    128: "cp932",  # Japanese
    129: "cp949",  # Korean
    130: "cp1361",  # Johab (Korean)
    134: "cp936",  # Simplified Chinese (GBK)
    136: "cp950",  # Traditional Chinese (Big5)
    161: "cp1253",  # Greek
    162: "cp1254",  # Turkish
    163: "cp1258",  # Vietnamese
    177: "cp1255",  # Hebrew
    178: "cp1256",  # Arabic
    186: "cp1257",  # Baltic
    204: "cp1251",  # Cyrillic
    222: "cp874",  # Thai
    238: "cp1250",  # Eastern European
    254: "cp437",  # OEM United States
    255: "cp850",  # OEM Multilingual Latin 1
}

# Translation of some special control words and symbols.
# Section characters (\par, \sect, \page) additionally reset the current font to the default.
sectionchars = {"par": "\n", "sect": "\n\n", "page": "\n\n"}
specialchars = {
    **{
        "line": "\n",
        "tab": "\t",
        "emdash": "\u2014",
        "endash": "\u2013",
        "emspace": "\u2003",
        "enspace": "\u2002",
        "qmspace": "\u2005",
        "bullet": "\u2022",
        "lquote": "\u2018",
        "rquote": "\u2019",
        "ldblquote": "\u201C",
        "rdblquote": "\u201D",
        "row": "\n",
        "cell": "|",
        "nestcell": "|",
        "~": "\xa0",
        "\n": "\n",
        "\r": "\r",
        "{": "{",
        "}": "}",
        "\\": "\\",
        "-": "\xad",
        "_": "\u2011",
    },
    **sectionchars,
}

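# Tokenizer for the RTF stream. The alternatives capture, in order: a control word
# with an optional numeric argument, a \'xx hex escape, an escaped single character,
# a group brace, a run of bare CR/LF (matched but skipped), and any other character.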
PATTERN = re.compile(
    r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)",
    re.IGNORECASE,
)

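# Matches {\field ... HYPERLINK ...} constructs, capturing the quoted target URL
# and the visible link text.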
HYPERLINKS = re.compile(
    r"(\{\\field\{\s*\\\*\\fldinst\{.*HYPERLINK\s(\".*\")\}{2}\s*\{.*?\s+(.*?)\}{2,3})",
    re.IGNORECASE,
)

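# Extracts (font number, charset number, font name) triples from font table entries,
# e.g. "\f0\fswiss\fcharset0 Arial;".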
FONTTABLE = re.compile(r"\\f(\d+).*?\\fcharset(\d+).*?([^;]+);")

def rtf_to_text(text, encoding="cp1252", errors="strict"):
    """Converts the rtf text to plain text.

    Parameters
    ----------
    text : str
        The rtf text
    encoding : str
        Input encoding, which is ignored if the RTF file contains an explicit codepage
        directive (as is typically the case). Defaults to `cp1252`, the most commonly
        used encoding.
    errors : str
        How to handle encoding errors. The default is "strict", which raises an error on
        invalid byte sequences; "ignore" silently drops characters that cannot be decoded.

    Returns
    -------
    str
        The plain text extracted from the RTF, as a Python string.
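
    Examples
    --------
    A minimal illustrative call (this RTF snippet is made up for the example, not
    taken from the upstream documentation):

    >>> rtf_to_text(r"{\rtf1\ansi\ansicpg1252 Hello, \b world\b0!}")
    'Hello, world!'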
    """
    text = re.sub(
        HYPERLINKS, "\\1(\\2)", text
    )  # captures links like link_text(http://link_dest)
    stack = []
    fonttbl = {}
    default_font = None
    current_font = None
    ignorable = False  # Whether this group (and all inside it) are "ignorable".
    suppress_output = False  # Whether plain text output is suppressed in this group (e.g. font/color tables).
    ucskip = 1  # Number of ASCII characters to skip after a unicode character.
    curskip = 0  # Number of ASCII characters left to skip
    hexes = None
    out = ""

    # Pre-scan the font table so \'xx hex escapes can be decoded with each font's charset.

    fonttbl_matches = FONTTABLE.findall(text)
    for font_id, fcharset, font_name in fonttbl_matches:
        fonttbl[font_id] = {
            "name": font_name.strip(),
            "charset": fcharset,
            "encoding": charset_map.get(int(fcharset), encoding),
        }
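    # Walk the token stream; a stack preserves per-group state (\uc skip count,
    # "ignorable" flag, output suppression) across nested braces.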
    for match in PATTERN.finditer(text):
        word, arg, _hex, char, brace, tchar = match.groups()
        if hexes and not _hex:
            # Decode accumulated hexes
            out += bytes.fromhex(hexes).decode(
                encoding=fonttbl.get(current_font, {"encoding": encoding}).get(
                    "encoding", encoding
                ),
                errors=errors,
            )
            hexes = None
        if brace:
            curskip = 0
            if brace == "{":
                # Push state
                stack.append((ucskip, ignorable, suppress_output))
            elif brace == "}":
                # Pop state
                if stack:
                    ucskip, ignorable, suppress_output = stack.pop()
                # sample_3.rtf throws an IndexError because the stack is empty.
                # It is unclear how this can happen, so for now this is an ugly
                # hack to prevent it.
                else:
                    ucskip = 0
                    ignorable = True
        elif char:  # \x (not a letter)
            curskip = 0
            if char in specialchars:
                if char in sectionchars:
                    current_font = default_font
                if not ignorable:
                    out += specialchars[char]
            elif char == "*":
                ignorable = True
        elif word:  # \foo
            curskip = 0
            if word in destinations:
                ignorable = True
            # http://www.biblioscape.com/rtf15_spec.htm#Heading8
            elif word == "ansicpg":
                encoding = f"cp{arg}"
                try:
                    codecs.lookup(encoding)
                except LookupError:
                    encoding = "utf8"
            if ignorable or suppress_output:
                pass
            elif word in specialchars:
                out += specialchars[word]
            elif word == "uc":
                ucskip = int(arg)
            elif word == "u":
                # because of https://github.com/joshy/striprtf/issues/6
                if arg is None:
                    curskip = ucskip
                else:
                    c = int(arg)
                    if c < 0:
                        c += 0x10000
                    out += chr(c)
                    curskip = ucskip
            elif word == "f":
                current_font = arg
            elif word == "deff":
                default_font = arg
            elif word == "fonttbl":
                suppress_output = True
            elif word == "colortbl":
                suppress_output = True

        elif _hex:  # \'xx
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                # Accumulate hex characters to decode later
                if not hexes:
                    hexes = _hex
                else:
                    hexes += _hex
        elif tchar:
            if curskip > 0:
                curskip -= 1
            elif not ignorable and not suppress_output:
                out += tchar

    return out