aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/striprtf/striprtf.py
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/striprtf/striprtf.py
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-master.tar.gz
two version of R2R are hereHEADmaster
Diffstat (limited to '.venv/lib/python3.12/site-packages/striprtf/striprtf.py')
-rw-r--r--.venv/lib/python3.12/site-packages/striprtf/striprtf.py260
1 files changed, 260 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/striprtf/striprtf.py b/.venv/lib/python3.12/site-packages/striprtf/striprtf.py
new file mode 100644
index 00000000..48c045a6
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/striprtf/striprtf.py
@@ -0,0 +1,260 @@
+import re
+import codecs
+
+"""
+Taken from https://gist.github.com/gilsondev/7c1d2d753ddb522e7bc22511cfb08676
+and modified for better output of tables.
+"""
+
+# fmt: off
# Control words that introduce an RTF "destination". The converter treats the
# group that contains one of these words as non-text and emits nothing from it.
# The table is a whitespace-separated word list; names are case-sensitive.
destinations = frozenset(
    """
    aftncn aftnsep aftnsepc annotation atnauthor atndate atnicn atnid
    atnparent atnref atntime atrfend atrfstart author background
    bkmkend bkmkstart blipuid buptim category colorschememapping
    colortbl comment company creatim datafield datastore defchp defpap
    do doccomm docvar dptxbxtext ebcend ebcstart factoidname falt
    fchars ffdeftext ffentrymcr ffexitmcr ffformat ffhelptext ffl
    ffname ffstattext file filetbl fldinst fldtype fonttbl
    fname fontemb fontfile footer footerf footerl footerr
    footnote formfield ftncn ftnsep ftnsepc g generator gridtbl
    header headerf headerl headerr hl hlfr hlinkbase hlloc hlsrc
    hsv htmltag info keycode keywords latentstyles lchars levelnumbers
    leveltext lfolevel linkval list listlevel listname listoverride
    listoverridetable listpicture liststylename listtable listtext
    lsdlockedexcept macc maccPr mailmerge maln malnScr manager margPr
    mbar mbarPr mbaseJc mbegChr mborderBox mborderBoxPr mbox mboxPr
    mchr mcount mctrlPr md mdeg mdegHide mden mdiff mdPr me
    mendChr meqArr meqArrPr mf mfName mfPr mfunc mfuncPr mgroupChr
    mgroupChrPr mgrow mhideBot mhideLeft mhideRight mhideTop mhtmltag
    mlim mlimloc mlimlow mlimlowPr mlimupp mlimuppPr mm mmaddfieldname
    mmath mmathPict mmathPr mmaxdist mmc mmcJc mmconnectstr
    mmconnectstrdata mmcPr mmcs mmdatasource mmheadersource mmmailsubject
    mmodso mmodsofilter mmodsofldmpdata mmodsomappedname mmodsoname
    mmodsorecipdata mmodsosort mmodsosrc mmodsotable mmodsoudl
    mmodsoudldata mmodsouniquetag mmPr mmquery mmr mnary mnaryPr
    mnoBreak mnum mobjDist moMath moMathPara moMathParaPr mopEmu
    mphant mphantPr mplcHide mpos mr mrad mradPr mrPr msepChr
    mshow mshp msPre msPrePr msSub msSubPr msSubSup msSubSupPr msSup
    msSupPr mstrikeBLTR mstrikeH mstrikeTLBR mstrikeV msub msubHide
    msup msupHide mtransp mtype mvertJc mvfmf mvfml mvtof mvtol
    mzeroAsc mzeroDesc mzeroWid nesttableprops nextfile nonesttables
    objalias objclass objdata object objname objsect objtime oldcprops
    oldpprops oldsprops oldtprops oleclsid operator panose password
    passwordhash pgp pgptbl picprop pict pn pnseclvl pntext pntxta
    pntxtb printim private propname protend protstart protusertbl pxe
    result revtbl revtim rsidtbl rxe shp shpgrp shpinst
    shppict shprslt shptxt sn sp staticval stylesheet subject sv
    svb tc template themedata title txe ud upr userprops
    wgrffmtfilter windowcaption writereservation writereservhash xe xform
    xmlattrname xmlattrvalue xmlclose xmlname xmlnstbl
    xmlopen
    """.split()
)
+# fmt: on
# Maps the numeric argument of the RTF ``\fcharsetN`` control word to the name
# of the Python codec used to decode ``\'xx`` hex escapes written in that font.
#
# NOTE(review): the entries marked "no stdlib codec" use names that CPython's
# ``encodings`` package does not appear to provide; decoding with one of them
# raises LookupError unless a matching codec is registered elsewhere. They are
# kept unchanged because no exact standard-library equivalent exists.
charset_map = {
    0: "cp1252",  # Default
    42: "cp1252",  # Symbol
    77: "mac_roman",  # Mac Roman
    78: "mac_japanese",  # Mac Japanese (no stdlib codec)
    79: "mac_chinesetrad",  # Mac Traditional Chinese (no stdlib codec)
    80: "mac_korean",  # Mac Korean (no stdlib codec)
    81: "mac_arabic",  # Mac Arabic
    82: "mac_hebrew",  # Mac Hebrew (no stdlib codec)
    83: "mac_greek",  # Mac Greek
    84: "mac_cyrillic",  # Mac Cyrillic
    85: "mac_chinesesimp",  # Mac Simplified Chinese (no stdlib codec)
    # Python spells this codec "mac_romanian"; "mac_rumanian" is not a codec
    # name and made every lookup for this charset fail.
    86: "mac_romanian",  # Mac Romanian
    87: "mac_ukrainian",  # Mac Ukrainian (no stdlib codec)
    88: "mac_thai",  # Mac Thai (no stdlib codec)
    # Python's Mac Central European codec is "mac_latin2"; "mac_ce" is not a
    # codec name.
    89: "mac_latin2",  # Mac Central European
    128: "cp932",  # Japanese
    129: "cp949",  # Korean
    130: "cp1361",  # Johab (Korean)
    134: "cp936",  # Simplified Chinese (GBK)
    136: "cp950",  # Traditional Chinese (Big5)
    161: "cp1253",  # Greek
    162: "cp1254",  # Turkish
    163: "cp1258",  # Vietnamese
    177: "cp1255",  # Hebrew
    178: "cp1256",  # Arabic
    186: "cp1257",  # Baltic
    204: "cp1251",  # Cyrillic
    222: "cp874",  # Thai
    238: "cp1250",  # Eastern European
    254: "cp437",  # OEM United States
    255: "cp850",  # OEM Multilingual Latin 1
}
+
# Control words that end a paragraph, section or page, with the text emitted
# for each.
sectionchars = {
    "par": "\n",
    "sect": "\n\n",
    "page": "\n\n",
}

# Control words and control symbols that translate directly to output text.
specialchars = {
    # Line structure and whitespace.
    "line": "\n",
    "tab": "\t",
    # Typographic characters.
    "emdash": "\u2014",
    "endash": "\u2013",
    "emspace": "\u2003",
    "enspace": "\u2002",
    "qmspace": "\u2005",
    "bullet": "\u2022",
    "lquote": "\u2018",
    "rquote": "\u2019",
    "ldblquote": "\u201C",
    "rdblquote": "\u201D",
    # Tables: a row ends the line and each cell is closed with a pipe.
    "row": "\n",
    "cell": "|",
    "nestcell": "|",
    # Control symbols (a backslash followed by one non-letter character).
    "~": "\xa0",
    "\n": "\n",
    "\r": "\r",
    "{": "{",
    "}": "}",
    "\\": "\\",
    "-": "\xad",
    "_": "\u2011",
}
# The section breaks are appended last, so they follow the entries above.
specialchars.update(sectionchars)
+
# Alternatives of the RTF tokenizer, tried left to right. The capture groups
# are numbered in this order.
_TOKEN_ALTERNATIVES = (
    r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?",  # groups 1-2: control word, optional numeric argument
    r"\\'([0-9a-f]{2})",  # group 3: hex escape \'xx
    r"\\([^a-z])",  # group 4: control symbol (backslash + non-letter)
    r"([{}])",  # group 5: group open / close
    r"[\r\n]+",  # bare line breaks in the source (no group)
    r"(.)",  # group 6: any other single character
)
PATTERN = re.compile("|".join(_TOKEN_ALTERNATIVES), flags=re.IGNORECASE)

# Matches a hyperlink field on one line. Group 1 is the whole field, group 2
# the quoted target that follows HYPERLINK, and group 3 the trailing part of
# the field's last inner group.
HYPERLINKS = re.compile(
    r"(\{\\field\{\s*\\\*\\fldinst\{.*HYPERLINK\s(\".*\")\}{2}\s*\{.*?\s+(.*?)\}{2,3})",
    flags=re.IGNORECASE,
)

# Matches one font-table entry: font number after \f, charset number after
# \fcharset, and the text running up to the terminating semicolon.
FONTTABLE = re.compile(
    r"\\f(\d+).*?\\fcharset(\d+).*?([^;]+);"
)
+
# A UTF-16 high surrogate immediately followed by a low surrogate.
_SURROGATE_PAIR = re.compile(r"[\ud800-\udbff][\udc00-\udfff]")


def _usable_codec(name):
    """Return *name* if Python can resolve it as a codec, otherwise ``None``."""
    if name is None:
        return None
    try:
        codecs.lookup(name)
    except LookupError:
        return None
    return name


def _join_surrogate_pair(match):
    """Return the single character encoded by a matched surrogate pair."""
    high, low = match.group()
    return chr(0x10000 + ((ord(high) - 0xD800) << 10) + (ord(low) - 0xDC00))


def _decode_hex_run(hexes, fonttbl, font, fallback, errors):
    """Decode a run of ``\\'xx`` hex digits using the codec of *font*.

    *fallback* is used when *font* is unknown or has no usable codec.
    """
    codec = fonttbl.get(font, {}).get("encoding") or fallback
    return bytes.fromhex(hexes).decode(codec, errors=errors)


def rtf_to_text(text, encoding="cp1252", errors="strict"):
    """Converts the rtf text to plain text.

    Parameters
    ----------
    text : str
        The rtf text
    encoding : str
        Codec used for ``\\'xx`` hex escapes when nothing more specific applies.
        It is replaced by the code page of an ``\\ansicpg`` directive when the
        document has one, and a font's ``\\fcharset`` takes precedence over both
        when it maps to a codec Python knows. Defaults to `cp1252`.
    errors : str
        How to handle encoding errors. Default is "strict", which throws an error.
        "ignore" drops what cannot be decoded; "replace" substitutes U+FFFD.

    Returns
    -------
    str
        the converted rtf text as a python unicode string

    Raises
    ------
    UnicodeDecodeError
        If `errors` is "strict" and a run of hex escapes cannot be decoded.
    ValueError
        If `errors` is "strict" and a ``\\u`` control word carries a value that is
        not a Unicode code point.
    """
    # Append "(target)" after every hyperlink field so the link destination
    # survives even though the \fldinst group itself is dropped below.
    text = HYPERLINKS.sub("\\1(\\2)", text)

    stack = []  # Saved (ucskip, ignorable, suppress_output) per open group.
    default_font = None
    current_font = None
    ignorable = False  # Whether this group (and all inside it) are "ignorable".
    suppress_output = False  # Whether this group is a font or color table.
    ucskip = 1  # Number of ASCII characters to skip after a unicode character.
    curskip = 0  # Number of ASCII characters left to skip
    hexes = None  # Hex digits of consecutive \'xx escapes awaiting decoding.
    saw_surrogate = False  # Whether a \u word produced a UTF-16 surrogate.
    out = []  # Output fragments, joined once at the end.

    # Collect the codec of every font up front. A charset that is missing from
    # charset_map, or whose codec Python does not know, is stored as None so
    # that decoding falls back to the document encoding instead of failing.
    fonttbl = {}
    for font_id, fcharset, font_name in FONTTABLE.findall(text):
        fonttbl[font_id] = {
            "name": font_name.strip(),
            "charset": fcharset,
            "encoding": _usable_codec(charset_map.get(int(fcharset))),
        }

    for match in PATTERN.finditer(text):
        word, arg, _hex, char, brace, tchar = match.groups()

        # A run of hex escapes ends at the first token of another kind. Decode
        # it before that token is handled, so a following \f cannot change the
        # codec of bytes that were written before it.
        if hexes and not _hex:
            out.append(_decode_hex_run(hexes, fonttbl, current_font, encoding, errors))
            hexes = None

        if brace:
            curskip = 0
            if brace == "{":
                # Push state
                stack.append((ucskip, ignorable, suppress_output))
            elif stack:
                # Pop state
                ucskip, ignorable, suppress_output = stack.pop()
            else:
                # Unbalanced closing brace: there is no state to restore, so
                # treat everything that follows as ignorable.
                ucskip = 0
                ignorable = True
        elif char:  # \x (not a letter)
            curskip = 0
            if char in specialchars:
                # sectionchars holds only multi-letter words at present, so
                # this reset does not trigger for a one-character symbol.
                if char in sectionchars:
                    current_font = default_font
                if not (ignorable or suppress_output):
                    out.append(specialchars[char])
            elif char == "*":
                ignorable = True
        elif word:  # \foo
            curskip = 0
            if word in destinations:
                ignorable = True
            # http://www.biblioscape.com/rtf15_spec.htm#Heading8
            elif word == "ansicpg":
                encoding = f"cp{arg}"
                try:
                    codecs.lookup(encoding)
                except LookupError:
                    encoding = "utf8"
            if ignorable or suppress_output:
                pass
            elif word in specialchars:
                out.append(specialchars[word])
            elif word == "uc":
                # \uc without a number carries no information; keep the
                # current skip count rather than failing on int(None).
                if arg is not None:
                    ucskip = max(int(arg), 0)
            elif word == "u":
                # because of https://github.com/joshy/striprtf/issues/6
                if arg is not None:
                    c = int(arg)
                    if c < 0:
                        # Values are written as signed 16-bit numbers.
                        c += 0x10000
                    if 0 <= c <= 0x10FFFF:
                        out.append(chr(c))
                        if 0xD800 <= c <= 0xDFFF:
                            saw_surrogate = True
                    elif errors == "strict":
                        raise ValueError(
                            f"\\u{arg} is not a valid Unicode code point"
                        )
                    elif errors == "replace":
                        out.append("\ufffd")
                curskip = ucskip
            elif word == "f":
                current_font = arg
            elif word == "deff":
                default_font = arg
            elif word == "fonttbl":
                suppress_output = True
            elif word == "colortbl":
                suppress_output = True
        elif _hex:  # \'xx
            if curskip > 0:
                curskip -= 1
            elif not (ignorable or suppress_output):
                # Accumulate hex characters to decode later
                hexes = (hexes or "") + _hex
        elif tchar:
            if curskip > 0:
                curskip -= 1
            elif not ignorable and not suppress_output:
                out.append(tchar)

    # Input may end directly after a hex escape; decode what is still pending.
    if hexes:
        out.append(_decode_hex_run(hexes, fonttbl, current_font, encoding, errors))

    result = "".join(out)
    if saw_surrogate:
        # Characters outside the BMP are written as two \u words, one per
        # UTF-16 surrogate. Merge adjacent pairs into the real character;
        # unpaired surrogates are left untouched.
        result = _SURROGATE_PAIR.sub(_join_surrogate_pair, result)
    return result