"""Convert RTF markup to plain text.

Taken from https://gist.github.com/gilsondev/7c1d2d753ddb522e7bc22511cfb08676
and modified for better output of tables.
"""
import re
import codecs

# fmt: off
# control words which specify a "destination".
destinations = frozenset((
    'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
    'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
    'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
    'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
    'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
    'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
    'ffname','ffstattext','file','filetbl','fldinst','fldtype','fonttbl',
    'fname','fontemb','fontfile','footer','footerf','footerl','footerr',
    'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
    'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
    'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
    'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
    'listoverridetable','listpicture','liststylename','listtable','listtext',
    'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
    'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
    'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
    'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
    'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
    'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
    'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
    'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
    'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
    'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
    'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
    'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
    'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
    'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
    'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
    'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
    'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
    'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
    'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
    'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
    'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
    'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
    'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
    'svb','tc','template','themedata','title','txe','ud','upr','userprops',
    'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
    'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
    'xmlopen',
))
# fmt: on

# Maps the numeric argument of \fcharsetN to a codec name.
# NOTE: not every name listed here is a codec Python ships; rtf_to_text()
# validates each name with codecs.lookup() before using it.
charset_map = {
    0: "cp1252",  # Default
    42: "cp1252",  # Symbol
    77: "mac_roman",  # Mac Roman
    78: "mac_japanese",  # Mac Japanese
    79: "mac_chinesetrad",  # Mac Traditional Chinese
    80: "mac_korean",  # Mac Korean
    81: "mac_arabic",  # Mac Arabic
    82: "mac_hebrew",  # Mac Hebrew
    83: "mac_greek",  # Mac Greek
    84: "mac_cyrillic",  # Mac Cyrillic
    85: "mac_chinesesimp",  # Mac Simplified Chinese
    86: "mac_rumanian",  # Mac Romanian
    87: "mac_ukrainian",  # Mac Ukrainian
    88: "mac_thai",  # Mac Thai
    89: "mac_ce",  # Mac Central European
    128: "cp932",  # Japanese
    129: "cp949",  # Korean
    130: "cp1361",  # Johab (Korean)
    134: "cp936",  # Simplified Chinese (GBK)
    136: "cp950",  # Traditional Chinese (Big5)
    161: "cp1253",  # Greek
    162: "cp1254",  # Turkish
    163: "cp1258",  # Vietnamese
    177: "cp1255",  # Hebrew
    178: "cp1256",  # Arabic
    186: "cp1257",  # Baltic
    204: "cp1251",  # Cyrillic
    222: "cp874",  # Thai
    238: "cp1250",  # Eastern European
    254: "cp437",  # OEM United States
    255: "cp850",  # OEM Multilingual Latin 1
}

# Translation of some special characters.
# and section characters reset formatting
sectionchars = {"par": "\n", "sect": "\n\n", "page": "\n\n"}
specialchars = {
    **{
        "line": "\n",
        "tab": "\t",
        "emdash": "\u2014",
        "endash": "\u2013",
        "emspace": "\u2003",
        "enspace": "\u2002",
        "qmspace": "\u2005",
        "bullet": "\u2022",
        "lquote": "\u2018",
        "rquote": "\u2019",
        "ldblquote": "\u201C",
        "rdblquote": "\u201D",
        "row": "\n",
        "cell": "|",
        "nestcell": "|",
        "~": "\xa0",
        "\n": "\n",
        "\r": "\r",
        "{": "{",
        "}": "}",
        "\\": "\\",
        "-": "\xad",
        "_": "\u2011",
    },
    **sectionchars,
}

# Tokenizer. Groups, in order: control word, its numeric argument, \'xx hex
# byte, control symbol, brace, any other single character. Runs of CR/LF match
# without a group and are therefore dropped.
PATTERN = re.compile(
    r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)",
    re.IGNORECASE,
)

HYPERLINKS = re.compile(
    r"(\{\\field\{\s*\\\*\\fldinst\{.*HYPERLINK\s(\".*\")\}{2}\s*\{.*?\s+(.*?)\}{2,3})",
    re.IGNORECASE,
)

# Captures (font id, charset number, font name) from font table entries.
FONTTABLE = re.compile(r"\\f(\d+).*?\\fcharset(\d+).*?([^;]+);")


def _resolve_codec(name, fallback):
    """Return *name* if Python can look that codec up, otherwise *fallback*."""
    try:
        codecs.lookup(name)
    except LookupError:
        return fallback
    return name


def _append_code_unit(parts, code):
    """Append the character for the \\uN value *code* to the list *parts*.

    When *code* is a low surrogate and the previous output piece is a lone
    high surrogate, the two are merged into the single character they encode
    instead of being emitted as two unpaired surrogates.
    """
    if 0xDC00 <= code <= 0xDFFF and parts:
        prev = parts[-1]
        if len(prev) == 1 and 0xD800 <= ord(prev) <= 0xDBFF:
            parts[-1] = chr(
                0x10000 + ((ord(prev) - 0xD800) << 10) + (code - 0xDC00)
            )
            return
    parts.append(chr(code))


def _decode_hexes(hexes, fonttbl, current_font, encoding, errors):
    """Decode accumulated \\'xx hex digits using the current font's codec.

    Falls back to *encoding* when the current font has no font table entry.
    """
    font = fonttbl.get(current_font)
    codec = encoding if font is None else font.get("encoding", encoding)
    return bytes.fromhex(hexes).decode(encoding=codec, errors=errors)


def rtf_to_text(text: str, encoding: str = "cp1252", errors: str = "strict") -> str:
    """Converts the rtf text to plain text.

    Parameters
    ----------
    text : str
        The rtf text
    encoding : str
        Input encoding, used to decode ``\\'xx`` bytes until the rtf text
        supplies an ``\\ansicpg`` directive, which replaces it. Defaults to
        `cp1252`. A font's ``\\fcharset`` takes precedence for text written
        in that font.
    errors : str
        How to handle encoding errors. Default is "strict", which throws an
        error. Another option is "ignore" which, as the name says, ignores
        encoding errors.

    Returns
    -------
    str
        the converted rtf text as a python unicode string

    Raises
    ------
    UnicodeDecodeError
        If ``errors`` is "strict" and a ``\\'xx`` byte sequence cannot be
        decoded with the selected codec.
    """
    # Appends "(<url>)" after each hyperlink field so the target survives.
    text = HYPERLINKS.sub("\\1(\\2)", text)

    stack = []  # Saved (ucskip, ignorable, suppress_output) per open group.
    fonttbl = {}
    default_font = None
    current_font = None
    ignorable = False  # Whether this group (and all inside it) are "ignorable".
    suppress_output = False  # Second "do not emit" flag, saved with the group.
    ucskip = 1  # Number of ASCII characters to skip after a unicode character.
    curskip = 0  # Number of ASCII characters left to skip
    hexes = None  # Pending \'xx hex digits, decoded together as one byte run.
    parts = []  # Output pieces, joined once at the end.

    # Simplified font table regex
    for font_id, fcharset, font_name in FONTTABLE.findall(text):
        fonttbl[font_id] = {
            "name": font_name.strip(),
            "charset": fcharset,
            # charset_map lists some names Python has no codec for; decoding
            # with those would raise LookupError, so fall back to the default.
            "encoding": _resolve_codec(
                charset_map.get(int(fcharset), encoding), encoding
            ),
        }

    for match in PATTERN.finditer(text):
        word, arg, _hex, char, brace, tchar = match.groups()
        if hexes and not _hex:
            # A run of hex escapes just ended: decode it as a whole so that
            # multi-byte sequences are interpreted correctly.
            parts.append(
                _decode_hexes(hexes, fonttbl, current_font, encoding, errors)
            )
            hexes = None
        if brace:
            curskip = 0
            if brace == "{":
                # Push state
                stack.append((ucskip, ignorable, suppress_output))
            elif stack:
                # Pop state
                ucskip, ignorable, suppress_output = stack.pop()
            else:
                # HACK: unbalanced "}" (seen with sample_3.rtf). Rather than
                # raising IndexError, ignore whatever follows in this group.
                ucskip = 0
                ignorable = True
        elif char:  # \x (not a letter)
            curskip = 0
            if char in specialchars:
                if char in sectionchars:
                    current_font = default_font
                if not ignorable:
                    parts.append(specialchars[char])
            elif char == "*":
                ignorable = True
        elif word:  # \foo
            curskip = 0
            if word in destinations:
                ignorable = True
            # http://www.biblioscape.com/rtf15_spec.htm#Heading8
            elif word == "ansicpg":
                encoding = f"cp{arg}"
                try:
                    codecs.lookup(encoding)
                except LookupError:
                    encoding = "utf8"
            if ignorable or suppress_output:
                pass
            elif word in specialchars:
                parts.append(specialchars[word])
            elif word == "uc":
                # A bare \uc carries no count; keep the current value.
                if arg is not None:
                    ucskip = int(arg)
            elif word == "u":
                # because of https://github.com/joshy/striprtf/issues/6
                if arg is not None:
                    code = int(arg)
                    if code < 0:
                        # \uN is written as a signed 16-bit number.
                        code += 0x10000
                    _append_code_unit(parts, code)
                curskip = ucskip
            elif word == "f":
                current_font = arg
            elif word == "deff":
                default_font = arg
            elif word == "fonttbl":
                # NOTE: not reached at present, because "fonttbl" is listed
                # in `destinations` and has already set `ignorable` above.
                suppress_output = True
            elif word == "colortbl":
                # NOTE: not reached at present, same reason as "fonttbl".
                suppress_output = True
        elif _hex:  # \'xx
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                # Accumulate hex characters to decode later
                hexes = _hex if not hexes else hexes + _hex
        elif tchar:
            if curskip > 0:
                curskip -= 1
            elif not ignorable and not suppress_output:
                parts.append(tchar)

    if hexes:
        # Input ended right after a hex escape; without this flush the final
        # characters would be lost.
        parts.append(_decode_hexes(hexes, fonttbl, current_font, encoding, errors))

    return "".join(parts)