diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/docutils/utils/math/latex2mathml.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/docutils/utils/math/latex2mathml.py | 1252 |
1 files changed, 1252 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/docutils/utils/math/latex2mathml.py b/.venv/lib/python3.12/site-packages/docutils/utils/math/latex2mathml.py new file mode 100644 index 00000000..b6ca3934 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/docutils/utils/math/latex2mathml.py @@ -0,0 +1,1252 @@ +# :Id: $Id: latex2mathml.py 9536 2024-02-01 13:04:22Z milde $ +# :Copyright: © 2005 Jens Jørgen Mortensen [1]_ +# © 2010, 2021, 2024 Günter Milde. +# +# :License: Released under the terms of the `2-Clause BSD license`_, in short: +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. +# This file is offered as-is, without any warranty. +# +# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause +# +# .. [1] the original `rst2mathml.py` in `sandbox/jensj/latex_math` + +"""Convert LaTex maths code into presentational MathML. + +This module is provisional: +the API is not settled and may change with any minor Docutils version. +""" + +# Usage: +# +# >>> from latex2mathml import * + +import re +import unicodedata + +from docutils.utils.math import (MathError, mathalphabet2unichar, + tex2unichar, toplevel_code) +from docutils.utils.math.mathml_elements import ( + math, mtable, mrow, mtr, mtd, menclose, mphantom, msqrt, mi, mn, mo, + mtext, msub, msup, msubsup, munder, mover, munderover, mroot, mfrac, + mspace, MathRow) + + +# Character data +# -------------- + +# LaTeX math macro to Unicode mappings. +# Character categories. + +# identifiers -> <mi> + +letters = {'hbar': 'ℏ'} # Compatibility mapping: \hbar resembles italic ħ +# "unicode-math" unifies \hbar and \hslash to ℏ. 
+letters.update(tex2unichar.mathalpha) + +ordinary = tex2unichar.mathord # Miscellaneous symbols + +# special case: Capital Greek letters: (upright in TeX style) +greek_capitals = { + 'Phi': '\u03a6', 'Xi': '\u039e', 'Sigma': '\u03a3', + 'Psi': '\u03a8', 'Delta': '\u0394', 'Theta': '\u0398', + 'Upsilon': '\u03d2', 'Pi': '\u03a0', 'Omega': '\u03a9', + 'Gamma': '\u0393', 'Lambda': '\u039b'} + +# functions -> <mi> +functions = { + # functions with a space in the name + 'liminf': 'lim\u202finf', + 'limsup': 'lim\u202fsup', + 'injlim': 'inj\u202flim', + 'projlim': 'proj\u202flim', + # embellished function names (see handle_cmd() below) + 'varlimsup': 'lim', + 'varliminf': 'lim', + 'varprojlim': 'lim', + 'varinjlim': 'lim', + # custom function name + 'operatorname': None, +} +functions.update((name, name) for name in + ('arccos', 'arcsin', 'arctan', 'arg', 'cos', + 'cosh', 'cot', 'coth', 'csc', 'deg', + 'det', 'dim', 'exp', 'gcd', 'hom', + 'ker', 'lg', 'ln', 'log', 'Pr', + 'sec', 'sin', 'sinh', 'tan', 'tanh')) +# Function with limits: 'lim', 'sup', 'inf', 'max', 'min': +# use <mo> to allow "movablelimits" attribute (see below). 
+ +# modulo operator/arithmetic +modulo_functions = { + # cmdname: (binary, named, parentheses, padding) + 'bmod': (True, True, False, '0.278em'), # a mod n + 'pmod': (False, True, True, '0.444em'), # a (mod n) + 'mod': (False, True, False, '0.667em'), # a mod n + 'pod': (False, False, True, '0.444em'), # a (n) + } + + +# "mathematical alphabets": map identifiers to the corresponding +# characters from the "Mathematical Alphanumeric Symbols" block +math_alphabets = { + # 'cmdname': 'mathvariant value' # package + 'mathbb': 'double-struck', # amssymb + 'mathbf': 'bold', + 'mathbfit': 'bold-italic', # isomath + 'mathcal': 'script', + 'mathfrak': 'fraktur', # amssymb + 'mathit': 'italic', + 'mathrm': 'normal', + 'mathscr': 'script', # mathrsfs et al + 'mathsf': 'sans-serif', + 'mathbfsfit': 'sans-serif-bold-italic', # unicode-math + 'mathsfbfit': 'sans-serif-bold-italic', # isomath + 'mathsfit': 'sans-serif-italic', # isomath + 'mathtt': 'monospace', + # unsupported: bold-fraktur + # bold-script + # bold-sans-serif +} + +# operator, fence, or separator -> <mo> + +stretchables = { + # extensible delimiters allowed in left/right cmds + 'backslash': '\\', + 'uparrow': '\u2191', # ↑ UPWARDS ARROW + 'downarrow': '\u2193', # ↓ DOWNWARDS ARROW + 'updownarrow': '\u2195', # ↕ UP DOWN ARROW + 'Uparrow': '\u21d1', # ⇑ UPWARDS DOUBLE ARROW + 'Downarrow': '\u21d3', # ⇓ DOWNWARDS DOUBLE ARROW + 'Updownarrow': '\u21d5', # ⇕ UP DOWN DOUBLE ARROW + 'lmoustache': '\u23b0', # ⎰ … CURLY BRACKET SECTION + 'rmoustache': '\u23b1', # ⎱ … LEFT CURLY BRACKET SECTION + 'arrowvert': '\u23d0', # ⏐ VERTICAL LINE EXTENSION + 'bracevert': '\u23aa', # ⎪ CURLY BRACKET EXTENSION + 'lvert': '|', # left | + 'lVert': '\u2016', # left ‖ + 'rvert': '|', # right | + 'rVert': '\u2016', # right ‖ + 'Arrowvert': '\u2016', # ‖ +} +stretchables.update(tex2unichar.mathfence) +stretchables.update(tex2unichar.mathopen) # Braces +stretchables.update(tex2unichar.mathclose) # Braces + +# >>> print(' 
'.join(sorted(set(stretchables.values())))) +# [ \ ] { | } ‖ ↑ ↓ ↕ ⇑ ⇓ ⇕ ⌈ ⌉ ⌊ ⌋ ⌜ ⌝ ⌞ ⌟ ⎪ ⎰ ⎱ ⏐ ⟅ ⟆ ⟦ ⟧ ⟨ ⟩ ⟮ ⟯ ⦇ ⦈ + +operators = { + # negated symbols without pre-composed Unicode character + 'nleqq': '\u2266\u0338', # ≦̸ + 'ngeqq': '\u2267\u0338', # ≧̸ + 'nleqslant': '\u2a7d\u0338', # ⩽̸ + 'ngeqslant': '\u2a7e\u0338', # ⩾̸ + 'ngtrless': '\u2277\u0338', # txfonts + 'nlessgtr': '\u2276\u0338', # txfonts + 'nsubseteqq': '\u2AC5\u0338', # ⫅̸ + 'nsupseteqq': '\u2AC6\u0338', # ⫆̸ + # compatibility definitions: + 'centerdot': '\u2B1D', # BLACK VERY SMALL SQUARE | mathbin + 'varnothing': '\u2300', # ⌀ DIAMETER SIGN | empty set + 'varpropto': '\u221d', # ∝ PROPORTIONAL TO | sans serif + 'triangle': '\u25B3', # WHITE UP-POINTING TRIANGLE | mathord + 'triangledown': '\u25BD', # WHITE DOWN-POINTING TRIANGLE | mathord + # alias commands: + 'dotsb': '\u22ef', # ⋯ with binary operators/relations + 'dotsc': '\u2026', # … with commas + 'dotsi': '\u22ef', # ⋯ with integrals + 'dotsm': '\u22ef', # ⋯ multiplication dots + 'dotso': '\u2026', # … other dots + # functions with movable limits (requires <mo>) + 'lim': 'lim', + 'sup': 'sup', + 'inf': 'inf', + 'max': 'max', + 'min': 'min', +} +operators.update(tex2unichar.mathbin) # Binary symbols +operators.update(tex2unichar.mathrel) # Relation symbols, arrow symbols +operators.update(tex2unichar.mathpunct) # Punctuation +operators.update(tex2unichar.mathop) # Variable-sized symbols +operators.update(stretchables) + + +# special cases + +thick_operators = { + # style='font-weight: bold;' + 'thicksim': '\u223C', # ∼ + 'thickapprox': '\u2248', # ≈ +} + +small_operators = { + # mathsize='75%' + 'shortmid': '\u2223', # ∣ + 'shortparallel': '\u2225', # ∥ + 'nshortmid': '\u2224', # ∤ + 'nshortparallel': '\u2226', # ∦ + 'smallfrown': '\u2322', # ⌢ FROWN + 'smallsmile': '\u2323', # ⌣ SMILE + 'smallint': '\u222b', # ∫ INTEGRAL +} + +# Operators and functions with limits above/below in display formulas +# and in index position inline 
(movablelimits=True) +movablelimits = ('bigcap', 'bigcup', 'bigodot', 'bigoplus', 'bigotimes', + 'bigsqcup', 'biguplus', 'bigvee', 'bigwedge', + 'coprod', 'intop', 'ointop', 'prod', 'sum', + 'lim', 'max', 'min', 'sup', 'inf') +# Depending on settings, integrals may also be in this category. +# (e.g. if "amsmath" is loaded with option "intlimits", see +# http://mirror.ctan.org/macros/latex/required/amsmath/amsldoc.pdf) +# movablelimits.extend(('fint', 'iiiint', 'iiint', 'iint', 'int', 'oiint', +# 'oint', 'ointctrclockwise', 'sqint', +# 'varointclockwise',)) + +# horizontal space -> <mspace> + +spaces = {'qquad': '2em', # two \quad + 'quad': '1em', # 18 mu + 'thickspace': '0.2778em', # 5mu = 5/18em + ';': '0.2778em', # 5mu thickspace + ' ': '0.25em', # inter word space + '\n': '0.25em', # inter word space + 'medspace': '0.2222em', # 4mu = 2/9em + ':': '0.2222em', # 4mu medspace + 'thinspace': '0.1667em', # 3mu = 1/6em + ',': '0.1667em', # 3mu thinspace + 'negthinspace': '-0.1667em', # -3mu = -1/6em + '!': '-0.1667em', # negthinspace + 'negmedspace': '-0.2222em', # -4mu = -2/9em + 'negthickspace': '-0.2778em', # -5mu = -5/18em + } + +# accents: -> <mo stretchy="false"> in <mover> +accents = { + # TeX: spacing combining + 'acute': '´', # '\u0301' + 'bar': 'ˉ', # '\u0304' + 'breve': '˘', # '\u0306' + 'check': 'ˇ', # '\u030C' + 'dot': '˙', # '\u0307' + 'ddot': '¨', # '\u0308' + 'dddot': '˙˙˙', # '\u20DB' # or … ? + 'ddddot': '˙˙˙˙', # '\u20DC' # or ¨¨ ? + 'grave': '`', # '\u0300' + 'hat': 'ˆ', # '\u0302' + 'mathring': '˚', # '\u030A' + 'tilde': '~', # '\u0303' # tilde ~ or small tilde ˜? + 'vec': '→', # '\u20d7' # → too heavy, use scriptlevel="+1" +} + +# limits etc. 
-> <mo> in <mover> or <munder> +over = { + # TeX: (char, offset-correction/em) + 'overbrace': ('\u23DE', -0.2), # DejaVu Math -0.6 + 'overleftarrow': ('\u2190', -0.2), + 'overleftrightarrow': ('\u2194', -0.2), + 'overline': ('_', -0.2), # \u2012 does not stretch + 'overrightarrow': ('\u2192', -0.2), + 'widehat': ('^', -0.5), + 'widetilde': ('~', -0.3), +} +under = {'underbrace': ('\u23DF', 0.1), # DejaVu Math -0.7 + 'underleftarrow': ('\u2190', -0.2), + 'underleftrightarrow': ('\u2194', -0.2), + 'underline': ('_', -0.8), + 'underrightarrow': ('\u2192', -0.2), + } + +# Character translations +# ---------------------- +# characters with preferred alternative in mathematical use +# cf. https://www.w3.org/TR/MathML3/chapter7.html#chars.anomalous +anomalous_chars = {'-': '\u2212', # HYPHEN-MINUS -> MINUS SIGN + ':': '\u2236', # COLON -> RATIO + '~': '\u00a0', # NO-BREAK SPACE + } + +# blackboard bold (Greek characters not working with "mathvariant" (Firefox 78) +mathbb = {'Γ': '\u213E', # ℾ + 'Π': '\u213F', # ℿ + 'Σ': '\u2140', # ⅀ + 'γ': '\u213D', # ℽ + 'π': '\u213C', # ℼ + } + +# Matrix environments +matrices = { + # name: fences + 'matrix': ('', ''), + 'smallmatrix': ('', ''), # smaller, see begin_environment()! 
def tex_cmdname(string):
    """Split `string` into a leading TeX command name and the rest.

    A command name is a run of ASCII letters; blanks and newlines that
    follow it are dropped.  If `string` does not start with a letter,
    the "name" is its first character (or the empty string) and the
    remainder is returned unchanged.

    >>> tex_cmdname('mymacro2')  # up to first non-letter
    ('mymacro', '2')
    >>> tex_cmdname('name 2')  # strip trailing whitespace
    ('name', '2')
    >>> tex_cmdname('_2')  # single non-letter character
    ('_', '2')

    """
    # The second pattern matches every string (including ''), so the
    # combined result is never None.  Match objects are always truthy.
    match = (re.match(r'([a-zA-Z]+)[ \n]*(.*)', string, re.DOTALL)
             or re.match(r'(.?)(.*)', string, re.DOTALL))
    cmdname, remainder = match.group(1, 2)
    return cmdname, remainder
def tex_group(string):
    """Return the leading TeX group (or first character) and the rest.

    If `string` starts with ``{``, the text up to the matching ``}`` is
    returned without the enclosing braces.  Nested groups are kept
    verbatim and backslash-escaped braces do not count for nesting.
    Otherwise the first character (possibly '') is returned.

    Raise `MathError` if an opening brace is never closed.

    >>> tex_group('{first group} returned without brackets')
    ('first group', ' returned without brackets')

    """
    if string[:1] != '{':
        # no group: hand back the first character and everything after it
        return string[:1], string[1:]

    depth = 0        # current {nesting {level}}
    escaped = False  # previous character was an unescaped backslash
    for position, char in enumerate(string):
        if escaped:
            escaped = False
            continue
        if char == '\\':
            escaped = True
        elif char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                # `position` is the index of the matching closing brace
                return string[1:position], string[position+1:]
    raise MathError('Group without closing bracket!')
def tex_optarg(string):
    r"""Return optional argument and remainder.

    An optional argument is a ``[...]`` group at the start of `string`
    (leading whitespace is skipped).  It must not contain nested
    brackets; a closing bracket may be included if escaped as ``\]``.

    Return a tuple ``(optarg, remainder)``.  If `string` has no optional
    argument, ``optarg`` is the empty string and ``remainder`` is the
    unchanged `string`.

    Raise `MathError` if `string` starts with ``[`` but no well-formed
    optional argument can be extracted.

    >>> tex_optarg('[optional argument] returned without brackets')
    ('optional argument', ' returned without brackets')
    >>> tex_optarg('{empty string, if there is no optional arg}')
    ('', '{empty string, if there is no optional arg}')

    """
    m = re.match(r"""\s*                         # leading whitespace
                 \[(?P<optarg>(\\]|[^\[\]])*)\]  # [group] without nested groups
                 (?P<remainder>.*$)
                 """, string, re.VERBOSE | re.DOTALL)
    if m is not None:
        return m.group('optarg'), m.group('remainder')
    if string.startswith('['):
        # An opening bracket without a usable closing one is an error.
        raise MathError(f'Could not extract optional argument from "{string}"!')
    return '', string
def parse_latex_math(root, source):
    r"""Append MathML conversion of `source` to `root` and return `root`.

    `source` is consumed from left to right; each character or macro is
    converted to a MathML element and appended at the current insertion
    point, which starts at `root` and moves as elements are opened and
    closed.

    Raise `MathError` for unsupported characters, for input that
    continues after the insertion point is lost (unbalanced braces),
    and if the last open element still misses children at the end.

    >>> parse_latex_math(math(), r'\alpha')
    math(mi('α'))
    >>> parse_latex_math(mrow(), r'x_{n}')
    mrow(msub(mi('x'), mi('n')))

    """
    # Working state (no whitespace normalization is done here):
    string = source  # not-yet handled part of source
    node = root  # the current "insertion point"

    # Loop over `string` while changing it.
    while len(string) > 0:
        # Take off first character:
        c, string = string[0], string[1:]

        if c in ' \n':
            continue  # whitespace is ignored in LaTeX math mode
        if c == '\\':  # start of a LaTeX macro
            cmdname, string = tex_cmdname(string)
            node, string = handle_cmd(cmdname, node, string)
        elif c in "_^":
            node = handle_script_or_limit(node, c)
        elif c == '{':
            if isinstance(node, MathRow) and node.nchildren == 1:
                # LaTeX takes one arg, MathML node accepts a group
                node.nchildren = None  # allow appending until closed by '}'
            else:  # wrap group in an <mrow>
                new_node = mrow()
                node.append(new_node)
                node = new_node
        elif c == '}':
            node = node.close()
        elif c == '&':
            # column separator: close the current cell, open the next one
            new_node = mtd()
            node.close().append(new_node)
            node = new_node
        elif c.isalpha():
            node = node.append(mi(c))
        elif c.isdigit():
            # `c` is the first digit, tex_number() collects the rest
            number, string = tex_number(string)
            node = node.append(mn(c+number))
        elif c in anomalous_chars:
            # characters with a special meaning in LaTeX math mode
            # fix spacing before "unary" minus.
            attributes = {}
            if c == '-' and len(node):
                previous_node = node[-1]
                if (previous_node.text and previous_node.text in '([='
                        or previous_node.get('class') == 'mathopen'):
                    attributes['form'] = 'prefix'
            node = node.append(mo(anomalous_chars[c], **attributes))
        elif c in "/()[]|":
            node = node.append(mo(c, stretchy=False))
        elif c in "+*=<>,.!?`';@":
            node = node.append(mo(c))
        else:
            raise MathError(f'Unsupported character: "{c}"!')
            # TODO: append as <mi>?
        if node is None:
            # there is no insertion point left
            if not string:
                return root  # ignore unbalanced braces
            raise MathError(f'No insertion point for "{string}". '
                            f'Unbalanced braces in "{source[:-len(string)]}"?')
    # After the loop: the current node must not wait for more children.
    if node.nchildren and len(node) < node.nchildren:
        raise MathError('Last node missing children. Source incomplete?')
    return root
def handle_cmd(name, node, string):  # noqa: C901 TODO make this less complex
    r"""Process LaTeX command `name` followed by `string`.

    Append result to `node`.
    If needed, parse `string` for command argument.
    Return new current node and remainder of `string`:

    >>> handle_cmd('hbar', math(), r' \frac')
    (math(mi('ℏ')), ' \\frac')
    >>> handle_cmd('hspace', math(), r'{1ex} (x)')
    (math(mspace(width='1ex')), ' (x)')

    `name` is the command name without the leading backslash.
    The command categories are tested in a fixed order and the first
    match wins.

    Raise `MathError` for unknown commands, unsupported delimiters,
    symbols that ``\not`` cannot negate, and layout style declarations
    that are not the first command in a group.
    """

    # Token elements
    # ==============

    # identifier -> <mi>

    if name in letters:
        new_node = mi(letters[name])
        if name in greek_capitals:
            # upright in "TeX style" but MathML sets them italic ("ISO style").
            # CSS styling does not change the font style in Firefox 78.
            # Use 'mathvariant="normal"'?
            new_node.set('class', 'capital-greek')
        node = node.append(new_node)
        return node, string

    if name in ordinary:
        # <mi mathvariant="normal"> well supported by Chromium but
        # Firefox 115.5.0 puts additional space around the symbol, e.g.
        # <mi mathvariant="normal">∂</mi><mi>t</mi> looks like ∂ t, not ∂t
        # return node.append(mi(ordinary[name], mathvariant='normal')), string
        return node.append(mi(ordinary[name])), string

    if name in functions:
        # use <mi> followed by invisible function applicator character
        # (see https://www.w3.org/TR/MathML3/chapter3.html#presm.mi)
        if name == 'operatorname':
            # custom function name, e.g. ``\operatorname{abs}(x)``
            # TODO: \operatorname* -> with limits
            arg, string = tex_token_or_group(string)
            new_node = mi(arg, mathvariant='normal')
        else:
            new_node = mi(functions[name])
        # embellished function names:
        if name == 'varliminf':  # \underline\lim
            new_node = munder(new_node, mo('_'))
        elif name == 'varlimsup':  # \overline\lim
            new_node = mover(new_node, mo('¯'), accent=False)
        elif name == 'varprojlim':  # \underleftarrow\lim
            new_node = munder(new_node, mo('\u2190'))
        elif name == 'varinjlim':  # \underrightarrow\lim
            new_node = munder(new_node, mo('\u2192'))

        node = node.append(new_node)
        # add ApplyFunction when appropriate (not \sin^2(x), say)
        # cf. https://www.w3.org/TR/MathML3/chapter3.html#presm.mi
        if string and string[0] not in ('^', '_'):
            node = node.append(mo('\u2061'))  # U+2061 FUNCTION APPLICATION
        return node, string

    if name in modulo_functions:
        (binary, named, parentheses, padding) = modulo_functions[name]
        if binary:
            node = node.append(mo('mod', lspace=padding, rspace=padding))
            return node, string
        # left padding
        if node.in_block():
            padding = '1em'
        node = node.append(mspace(width=padding))
        if parentheses:
            node = node.append(mo('(', stretchy=False))
        if named:
            node = node.append(mi('mod'))
            node = node.append(mspace(width='0.333em'))
        arg, string = tex_token_or_group(string)
        node = parse_latex_math(node, arg)
        if parentheses:
            node = node.append(mo(')', stretchy=False))
        return node, string

    # font changes or mathematical alphanumeric characters

    if name in ('boldsymbol', 'pmb'):  # \pmb is "poor mans bold"
        new_node = mrow(CLASS='boldsymbol')
        node.append(new_node)
        return new_node, string

    if name in math_alphabets:
        return handle_math_alphabet(name, node, string)

    # operator, fence, or separator -> <mo>

    if name == 'colon':  # trailing punctuation, not binary relation
        node = node.append(mo(':', form='postfix', lspace='0', rspace='0.28em'))
        return node, string

    if name == 'idotsint':  # AMS shortcut for ∫···∫
        node = parse_latex_math(node, r'\int\dotsi\int')
        return node, string

    if name in thick_operators:
        node = node.append(mo(thick_operators[name], style='font-weight: bold'))
        return node, string

    if name in small_operators:
        node = node.append(mo(small_operators[name], mathsize='75%'))
        return node, string

    if name in operators:
        attributes = {}
        if name in movablelimits and string and string[0] in ' _^':
            attributes['movablelimits'] = True
        elif name in ('lvert', 'lVert'):
            attributes['class'] = 'mathopen'
        node = node.append(mo(operators[name], **attributes))
        return node, string

    if name in bigdelimiters:
        delimiter_attributes = {}
        size = delimiter_sizes[bigdelimiters[name]]
        delimiter, string = tex_token_or_group(string)
        if delimiter not in '()[]/|.':
            try:
                delimiter = stretchables[delimiter.lstrip('\\')]
            except KeyError:
                raise MathError(f'Unsupported "\\{name}" delimiter '
                                f'"{delimiter}"!')
        if size:
            delimiter_attributes['maxsize'] = size
            delimiter_attributes['minsize'] = size
            delimiter_attributes['symmetric'] = True
        if name == 'left' or name.endswith('l'):
            row = mrow()
            node.append(row)
            node = row
        if delimiter != '.':  # '.' stands for "empty delimiter"
            node.append(mo(delimiter, **delimiter_attributes))
        if name == 'right' or name.endswith('r'):
            node = node.close()
        return node, string

    if name == 'not':
        # negation: LaTeX just overlays next symbol with "/".
        arg, string = tex_token(string)
        if arg == '{':
            # move the \not into the group and let the caller re-parse it
            return node, '{\\not ' + string
        if arg.startswith('\\'):  # LaTeX macro
            try:
                arg = operators[arg[1:]]
            except KeyError:
                raise MathError(rf'"\not" cannot negate: "{arg}"!')
        # use the pre-composed negated character if Unicode has one
        arg = unicodedata.normalize('NFC', arg+'\u0338')
        node = node.append(mo(arg))
        return node, string

    # arbitrary text (usually comments) -> <mtext>
    if name in ('text', 'mbox', 'textrm'):
        arg, string = tex_token_or_group(string)
        parts = arg.split('$')  # extract inline math
        for i, part in enumerate(parts):
            if i % 2 == 0:  # i is even
                # LaTeX keeps whitespace in, e.g., ``\text{ foo }``,
                # <mtext> displays only internal whitespace.
                # → replace marginal whitespace with NBSP
                part = re.sub('(^[ \n]|[ \n]$)', '\u00a0', part)
                node = node.append(mtext(part))
            else:
                parse_latex_math(node, part)
        return node, string

    # horizontal space -> <mspace>
    if name in spaces:
        node = node.append(mspace(width='%s'%spaces[name]))
        return node, string

    if name in ('hspace', 'mspace'):
        arg, string = tex_group(string)
        if arg.endswith('mu'):
            # unit "mu" (1mu=1/18em) not supported by MathML
            arg = '%sem' % (float(arg[:-2])/18)
        node = node.append(mspace(width='%s'%arg))
        return node, string

    if name == 'phantom':
        new_node = mphantom()
        node.append(new_node)
        return new_node, string

    if name == 'boxed':
        # CSS padding is broken in Firefox 115.6.0esr
        # therefore we still need the deprecated <menclose> element
        new_node = menclose(notation='box', CLASS='boxed')
        node.append(new_node)
        return new_node, string

    # Complex elements (Layout schemata)
    # ==================================

    if name == 'sqrt':
        radix, string = tex_optarg(string)
        if radix:
            indexnode = mrow()
            new_node = mroot(indexnode, switch=True)
            parse_latex_math(indexnode, radix)
            indexnode.close()
        else:
            new_node = msqrt()
        node.append(new_node)
        return new_node, string

    if name in fractions:
        attributes = fractions[name]
        if name == 'cfrac':
            optarg, string = tex_optarg(string)
            optargs = {'l': 'left', 'r': 'right'}
            if optarg in optargs:
                # copy: don't modify the shared entry in `fractions`
                attributes = attributes.copy()
                attributes['numalign'] = optargs[optarg]  # "numalign" is deprecated
                attributes['class'] += ' numalign-' + optargs[optarg]
        new_node = frac = mfrac(**attributes)
        if name.endswith('binom'):
            new_node = mrow(mo('('), new_node, mo(')'), CLASS='binom')
            new_node.nchildren = 3
        node.append(new_node)
        return frac, string

    if name == '\\':  # end of a row
        entry = mtd()
        new_node = mtr(entry)
        node.close().close().append(new_node)
        return entry, string

    if name in accents:
        accent_node = mo(accents[name], stretchy=False)
        # mi() would be simpler, but semantically wrong
        # --- https://w3c.github.io/mathml-core/#operator-fence-separator-or-accent-mo
        if name == 'vec':
            accent_node.set('scriptlevel', '+1')  # scale down arrow
        new_node = mover(accent_node, accent=True, switch=True)
        node.append(new_node)
        return new_node, string

    if name in over:
        # set "accent" to False (otherwise dots on i and j are dropped)
        # but to True on accent node get "textstyle" (full size) symbols on top
        new_node = mover(mo(over[name][0], accent=True),
                         switch=True, accent=False)
        node.append(new_node)
        return new_node, string

    if name == 'overset':
        new_node = mover(switch=True)
        node.append(new_node)
        return new_node, string

    if name in under:
        new_node = munder(mo(under[name][0]), switch=True)
        node.append(new_node)
        return new_node, string

    if name == 'underset':
        new_node = munder(switch=True)
        node.append(new_node)
        return new_node, string

    if name in ('xleftarrow', 'xrightarrow'):
        subscript, string = tex_optarg(string)
        base = mo(operators['long'+name[1:]])
        if subscript:
            new_node = munderover(base)
            sub_node = parse_latex_math(mrow(), subscript)
            if len(sub_node) == 1:
                sub_node = sub_node[0]
            new_node.append(sub_node)
        else:
            new_node = mover(base)
        node.append(new_node)
        return new_node, string

    if name in layout_styles:  # 'displaystyle', 'textstyle', ...
        if len(node) > 0:
            raise MathError(rf'Declaration "\{name}" must be first command '
                            'in a group!')
        for k, v in layout_styles[name].items():
            node.set(k, v)
        return node, string

    if name.endswith('limits'):
        arg, remainder = tex_token(string)
        # Exact membership test: the substring test ``arg in '_^'`` is
        # also true for the empty token at the end of input, which made
        # handle_script_or_limit() pop the base and then fail.
        if arg in ('_', '^'):  # else ignore
            string = remainder
            node = handle_script_or_limit(node, arg, limits=name)
        return node, string

    # Environments

    if name == 'begin':
        return begin_environment(node, string)

    if name == 'end':
        return end_environment(node, string)

    raise MathError(rf'Unknown LaTeX command "\{name}".')
#     switch=True, accent='false'), '{981}')
# >>> handle_cmd('bar', math(), '{x}')
# (mover(mo('ˉ', stretchy='false'), switch=True, accent='true'), '{x}')
# >>> handle_cmd('xleftarrow', math(), r'[\alpha]{10}')
# (munderover(mo('⟵'), mi('α')), '{10}')
# >>> handle_cmd('xleftarrow', math(), r'[\alpha=5]{10}')
# (munderover(mo('⟵'), mrow(mi('α'), mo('='), mn('5'))), '{10}')
# >>> handle_cmd('left', math(), '< a)')
# Traceback (most recent call last):
# docutils.utils.math.MathError: Unsupported "\left" delimiter "<"!
# >>> handle_cmd('not', math(), '{< b} c')  # LaTeX ignores the braces, too.
# (math(), '{\\not < b} c')


def handle_math_alphabet(name, node, string):
    """Process the argument of the math alphabet command `name`.

    Read the next token or group from `string`, append its MathML
    representation to `node`, and replace the text of the resulting
    <mn> and <mi> elements with the characters registered for the
    alphabet in `mathalphabet2unichar` (unknown characters are kept).

    Return a tuple ``(node, remaining_string)``.
    """
    arg, string = tex_token_or_group(string)
    is_mathrm = name == 'mathrm'

    # Shortcut for a text argument like \mathrm{out} with more than one
    # letter: a single <mi> suffices (<mi> defaults to "normal" font).
    if is_mathrm and len(arg) > 1 and arg.isalpha():
        return node.append(mi(arg)), string

    # Everything else is parsed into an <mrow>.
    attributes = {'class': 'mathscr'} if name == 'mathscr' else {}
    container = mrow(**attributes)
    node.append(container)
    parse_latex_math(container, arg)

    # "mathscr" is looked up as "mathcal", "mathbfsfit" as "mathsfbfit".
    key = name.replace('mathscr', 'mathcal').replace('mathbfsfit', 'mathsfbfit')
    a2ch = getattr(mathalphabet2unichar, key, {})
    for element in container.iter():
        if isinstance(element, mn):
            # A number may consist of more than one digit:
            # convert it character by character.
            element.text = ''.join(a2ch.get(digit, digit)
                                   for digit in element.text)
        elif isinstance(element, mi):
            # Look up the complete text, so that multi-letter
            # identifiers (functions) are not converted.
            element.text = a2ch.get(element.text, element.text)
            if is_mathrm and element.text.isalpha():
                element.set('mathvariant', 'normal')
    return container.close(), string

# >>> handle_math_alphabet('mathrm', math(), '\\alpha')
# (math(mi('α', mathvariant='normal')), '')
# >>> handle_math_alphabet('mathbb', math(), '{R} = 3')
# (math(mi('ℝ')), ' = 3')
# >>> handle_math_alphabet('mathcal', math(), '{F = 3}')
# (math(mrow(mi('ℱ'), mo('='), mn('3'),
#     nchildren=3)), '')
# >>> handle_math_alphabet('mathrm', math(), '{out} = 3')  # drop <mrow>
# (math(mi('out')), ' = 3')
#
# Single letters in \mathrm require "mathvariant='normal'":
# >>> handle_math_alphabet('mathrm', math(), '{V = 3}')  # doctest: +ELLIPSIS
# (math(mrow(mi('V', mathvariant='normal'), mo('='), mn('3'), ...)), '')


def handle_script_or_limit(node, c, limits=''):
    """Append script or limit element to `node`.

    The last child of `node` is removed and becomes the base of a new
    sub-/superscript or under-/over-limit element that is appended to
    `node` in its place.

    `c` is the script character: "_" or "^".
    `limits` is '' or the name of a "...limits" command (without
    backslash) that preceded the script character.

    Return the new element.
    Raise `MathError` if `c` is not a script character.
    """
    if c not in ('_', '^'):
        # Fail early with the module's own exception instead of an
        # UnboundLocalError after `node` has already been modified.
        raise MathError(f'Unsupported script or limit character "{c}"!')

    child = node.pop()
    if limits == 'limits':
        child.set('movablelimits', 'false')
    elif (limits == 'movablelimits'
          or getattr(child, 'text', '') in movablelimits):
        child.set('movablelimits', 'true')

    if c == '_':
        if isinstance(child, mover):
            # base already has an "over" part -> combine
            new_node = munderover(*child, switch=True)
        elif isinstance(child, msup):
            # base already has a superscript -> combine
            new_node = msubsup(*child, switch=True)
        elif (limits in ('limits', 'movablelimits')
              or (limits == '' and child.get('movablelimits', None))):
            new_node = munder(child)
        else:
            new_node = msub(child)
    else:  # c == '^'
        if isinstance(child, munder):
            # base already has an "under" part -> combine
            new_node = munderover(*child)
        elif isinstance(child, msub):
            # base already has a subscript -> combine
            new_node = msubsup(*child)
        elif (limits in ('limits', 'movablelimits')
              or (limits == '' and child.get('movablelimits', None))):
            new_node = mover(child)
        else:
            new_node = msup(child)
    node.append(new_node)
    return new_node


def begin_environment(node, string):
    """Open the environment named by the group at the start of `string`.

    For environments listed in `matrices`, append an <mtable> (wrapped
    in an <mrow> starting with the left delimiter, if there is one)
    to `node`.

    Return a tuple ``(entry, remaining_string)`` where `entry` is the
    first <mtd> of the new table.
    Raise `MathError` for other environments.
    """
    name, string = tex_group(string)
    if name not in matrices:
        raise MathError(f'Environment "{name}" not supported!')

    left_delimiter = matrices[name][0]
    attributes = {}
    if left_delimiter:
        wrapper = mrow(mo(left_delimiter))
        if name == 'cases':
            wrapper = mrow(mo(left_delimiter, rspace='0.17em'))
            attributes['columnalign'] = 'left'
            attributes['class'] = 'cases'
        node.append(wrapper)
        node = wrapper
    elif name == 'smallmatrix':
        attributes['rowspacing'] = '0.02em'
        attributes['columnspacing'] = '0.333em'
        attributes['scriptlevel'] = '1'
    elif name == 'aligned':
        attributes['class'] = 'ams-align'
    # TODO: array, aligned & alignedat take an optional [t], [b], or [c].
    entry = mtd()
    node.append(mtable(mtr(entry), **attributes))
    return entry, string


def end_environment(node, string):
    """Close the environment named by the group at the start of `string`.

    Return a tuple ``(node, remaining_string)`` with the node current
    after closing the table (and the wrapper created for a delimiter).
    Raise `MathError` for environments not listed in `matrices`.
    """
    name, string = tex_group(string)
    if name not in matrices:
        raise MathError(f'Environment "{name}" not supported!')

    node = node.close().close().close()  # close: mtd, mtr, mtable
    right_delimiter = matrices[name][1]
    if right_delimiter:
        node = node.append(mo(right_delimiter))
        node = node.close()
    elif name == 'cases':
        # "cases" has a left delimiter only, close its wrapper here
        node = node.close()
    return node, string


def tex_equation_columns(rows):
    """Return the number of "equation columns" in `rows`.

    `rows` is an iterable of strings with LaTeX code.
    The result is 0 if there is no non-escaped "next column" markup
    character ("&") in any row (or if `rows` is empty), else it is
    derived from the maximal number of such characters in one row.

    cf. "alignat" in
    http://mirror.ctan.org/macros/latex/required/amsmath/amsldoc.pdf
    """
    # maximal number of non-escaped "&" (``default`` handles empty `rows`)
    tabs = max((row.count('&') - row.count(r'\&') for row in rows),
               default=0)
    if tabs == 0:
        return 0
    return tabs // 2 + 1

# >>> tex_equation_columns(['a = b'])
# 0
# >>> tex_equation_columns(['a &= b'])
# 1
# >>> tex_equation_columns(['a &= b & a \in S'])
# 2
# >>> tex_equation_columns(['a &= b & c &= d'])
# 2


# Return dictionary with attributes to style an <mtable> as align environment:
# Not used with HTML. Replaced by CSS rule for "mtable.ams-align" in
# "minimal.css" as "columnalign" is disregarded by Chromium and webkit.
def align_attributes(rows):
    """Return dict with attributes to style an <mtable> as align environment.

    `rows` is an iterable of strings with LaTeX code.
    "columnalign" and "columnspacing" are only set if at least one row
    contains a non-escaped "next column" markup character ("&").
    """
    atts = {'class': 'ams-align',
            'displaystyle': True}
    # get maximal number of non-escaped "next column" markup characters
    # (``default`` prevents a ValueError for empty `rows`):
    tabs = max((row.count('&') - row.count(r'\&') for row in rows),
               default=0)
    if tabs:
        # columns alternate between right and left alignment,
        # the gaps between them alternate between "0" and "2em"
        aligns = ['right', 'left'] * tabs
        spacing = ['0', '2em'] * tabs
        atts['columnalign'] = ' '.join(aligns[:tabs+1])
        atts['columnspacing'] = ' '.join(spacing[:tabs])
    return atts

# >>> align_attributes(['a = b'])
# {'class': 'ams-align', 'displaystyle': True}
# >>> align_attributes(['a &= b'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left', 'columnspacing': '0'}
# >>> align_attributes(['a &= b & a \in S'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left right', 'columnspacing': '0 2em'}
# >>> align_attributes(['a &= b & c &= d'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left right left', 'columnspacing': '0 2em 0'}
# >>> align_attributes([r'a &= b & c &= d \& e'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left right left', 'columnspacing': '0 2em 0'}
# >>> align_attributes([r'a &= b & c &= d & e'])
# {'class': 'ams-align', 'displaystyle': True, 'columnalign': 'right left right left right', 'columnspacing': '0 2em 0 2em'}


def tex2mathml(tex_math, as_block=False):
    """Return string with MathML code corresponding to `tex_math`.

    Set `as_block` to ``True`` for displayed formulas.

    In a displayed formula, the content is placed in an <mtable> with
    class "ams-align" if the result of `toplevel_code()` for `tex_math`
    contains a line break command (``\\\\``).
    """
    # Set up tree
    math_tree = math(xmlns='http://www.w3.org/1998/Math/MathML')
    node = math_tree
    if as_block:
        math_tree.set('display', 'block')
        rows = toplevel_code(tex_math).split(r'\\')
        if len(rows) > 1:
            # emulate "align*" environment with a math table
            node = mtd()
            math_tree.append(mtable(mtr(node), CLASS='ams-align',
                                    displaystyle=True))
    parse_latex_math(node, tex_math)
    math_tree.indent_xml()
    return math_tree.toxml()

# >>> print(tex2mathml('3'))
# <math xmlns="http://www.w3.org/1998/Math/MathML">
#   <mn>3</mn>
# </math>
# >>> print(tex2mathml('3', as_block=True))
# <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
#   <mn>3</mn>
# </math>
# >>> print(tex2mathml(r'a & b \\ c & d', as_block=True))
# <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
#   <mtable class="ams-align" displaystyle="true">
#     <mtr>
#       <mtd>
#         <mi>a</mi>
#       </mtd>
#       <mtd>
#         <mi>b</mi>
#       </mtd>
#     </mtr>
#     <mtr>
#       <mtd>
#         <mi>c</mi>
#       </mtd>
#       <mtd>
#         <mi>d</mi>
#       </mtd>
#     </mtr>
#   </mtable>
# </math>
# >>> print(tex2mathml(r'a \\ b', as_block=True))
# <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
#   <mtable class="ams-align" displaystyle="true">
#     <mtr>
#       <mtd>
#         <mi>a</mi>
#       </mtd>
#     </mtr>
#     <mtr>
#       <mtd>
#         <mi>b</mi>
#       </mtd>
#     </mtr>
#   </mtable>
# </math>


# TODO: look up more symbols from tr25, e.g.
#
#
# Table 2.8 Using Vertical Line or Solidus Overlay
# some of the negated forms of mathematical relations that can only be
# encoded by using either U+0338 COMBINING LONG SOLIDUS OVERLAY or U+20D2
# COMBINING LONG VERTICAL LINE OVERLAY . (For issues with using 0338 in
# MathML, see Section 3.2.7, Combining Marks.
#
# Table 2.9 Variants of Mathematical Symbols using VS1?
+# +# Sequence Description +# 0030 + VS1 DIGIT ZERO - short diagonal stroke form +# 2205 + VS1 EMPTY SET - zero with long diagonal stroke overlay form +# 2229 + VS1 INTERSECTION - with serifs +# 222A + VS1 UNION - with serifs +# 2268 + VS1 LESS-THAN BUT NOT EQUAL TO - with vertical stroke +# 2269 + VS1 GREATER-THAN BUT NOT EQUAL TO - with vertical stroke +# 2272 + VS1 LESS-THAN OR EQUIVALENT TO - following the slant of the lower leg +# 2273 + VS1 GREATER-THAN OR EQUIVALENT TO - following the slant of the lower leg +# 228A + VS1 SUBSET OF WITH NOT EQUAL TO - variant with stroke through bottom members +# 228B + VS1 SUPERSET OF WITH NOT EQUAL TO - variant with stroke through bottom members +# 2293 + VS1 SQUARE CAP - with serifs +# 2294 + VS1 SQUARE CUP - with serifs +# 2295 + VS1 CIRCLED PLUS - with white rim +# 2297 + VS1 CIRCLED TIMES - with white rim +# 229C + VS1 CIRCLED EQUALS - equal sign inside and touching the circle +# 22DA + VS1 LESS-THAN slanted EQUAL TO OR GREATER-THAN +# 22DB + VS1 GREATER-THAN slanted EQUAL TO OR LESS-THAN +# 2A3C + VS1 INTERIOR PRODUCT - tall variant with narrow foot +# 2A3D + VS1 RIGHTHAND INTERIOR PRODUCT - tall variant with narrow foot +# 2A9D + VS1 SIMILAR OR LESS-THAN - following the slant of the upper leg +# 2A9E + VS1 SIMILAR OR GREATER-THAN - following the slant of the upper leg +# 2AAC + VS1 SMALLER THAN OR slanted EQUAL +# 2AAD + VS1 LARGER THAN OR slanted EQUAL +# 2ACB + VS1 SUBSET OF ABOVE NOT EQUAL TO - variant with stroke through bottom members +# 2ACC + VS1 SUPERSET OF ABOVE NOT EQUAL TO - variant with stroke through bottom members |