Diffstat (limited to '.venv/lib/python3.12/site-packages/openpyxl/formula/tokenizer.py')
-rw-r--r--  .venv/lib/python3.12/site-packages/openpyxl/formula/tokenizer.py  446
1 file changed, 446 insertions(+), 0 deletions(-)
diff --git a/.venv/lib/python3.12/site-packages/openpyxl/formula/tokenizer.py b/.venv/lib/python3.12/site-packages/openpyxl/formula/tokenizer.py
new file mode 100644
index 00000000..9bf26240
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/openpyxl/formula/tokenizer.py
@@ -0,0 +1,446 @@
+"""
+This module contains a tokenizer for Excel formulae.
+
+The tokenizer is based on the JavaScript tokenizer found at
+http://ewbi.blogs.com/develops/2004/12/excel_formula_p.html, written by
+Eric Bachtal.
+"""
+
+import re
+
+
+class TokenizerError(Exception):
+ """Base class for all Tokenizer errors."""
+
+
+class Tokenizer:
+
+ """
+ A tokenizer for Excel worksheet formulae.
+
+ Converts a str string representing an Excel formula (in A1 notation)
+ into a sequence of `Token` objects.
+
+ `formula`: The str string to tokenize
+
+ Tokenizer defines a method `._parse()` to parse the formula into tokens,
+ which can then be accessed through the `.items` attribute.
+
+ """
+
+    SN_RE = re.compile(r"^[1-9](\.[0-9]+)?[Ee]$")  # Scientific notation
+ WSPACE_RE = re.compile(r"[ \n]+")
+ STRING_REGEXES = {
+ # Inside a string, all characters are treated as literals, except for
+ # the quote character used to start the string. That character, when
+        # doubled, is treated as a single character in the string. If an
+ # unmatched quote appears, the string is terminated.
+ '"': re.compile('"(?:[^"]*"")*[^"]*"(?!")'),
+ "'": re.compile("'(?:[^']*'')*[^']*'(?!')"),
+ }
+ ERROR_CODES = ("#NULL!", "#DIV/0!", "#VALUE!", "#REF!", "#NAME?",
+ "#NUM!", "#N/A", "#GETTING_DATA")
+    TOKEN_ENDERS = ',;}) +-*/^&=><%' # Each of these characters marks the
+                                     # end of an operand token.
+
+ def __init__(self, formula):
+ self.formula = formula
+ self.items = []
+ self.token_stack = [] # Used to keep track of arrays, functions, and
+ # parentheses
+ self.offset = 0 # How many chars have we read
+ self.token = [] # Used to build up token values char by char
+ self._parse()
+
+ def _parse(self):
+ """Populate self.items with the tokens from the formula."""
+ if self.offset:
+ return # Already parsed!
+ if not self.formula:
+ return
+ elif self.formula[0] == '=':
+ self.offset += 1
+ else:
+ self.items.append(Token(self.formula, Token.LITERAL))
+ return
+ consumers = (
+ ('"\'', self._parse_string),
+ ('[', self._parse_brackets),
+ ('#', self._parse_error),
+ (' ', self._parse_whitespace),
+ ('\n', self._parse_whitespace),
+ ('+-*/^&=><%', self._parse_operator),
+ ('{(', self._parse_opener),
+ (')}', self._parse_closer),
+ (';,', self._parse_separator),
+ )
+ dispatcher = {} # maps chars to the specific parsing function
+ for chars, consumer in consumers:
+ dispatcher.update(dict.fromkeys(chars, consumer))
+ while self.offset < len(self.formula):
+ if self.check_scientific_notation(): # May consume one character
+ continue
+ curr_char = self.formula[self.offset]
+ if curr_char in self.TOKEN_ENDERS:
+ self.save_token()
+ if curr_char in dispatcher:
+ self.offset += dispatcher[curr_char]()
+ else:
+ # TODO: this can probably be sped up using a regex to get to
+ # the next interesting character
+ self.token.append(curr_char)
+ self.offset += 1
+ self.save_token()
+
+ def _parse_string(self):
+ """
+ Parse a "-delimited string or '-delimited link.
+
+ The offset must be pointing to either a single quote ("'") or double
+        quote ('"') character. Strings are parsed according to Excel's
+        escaping rule: the delimiter is escaped by doubling it. E.g., the
+        Excel string "abc""def" corresponds to 'abc"def' in Python.
+
+ Returns the number of characters matched. (Does not update
+ self.offset)
+
+ """
+ self.assert_empty_token(can_follow=':')
+ delim = self.formula[self.offset]
+ assert delim in ('"', "'")
+ regex = self.STRING_REGEXES[delim]
+ match = regex.match(self.formula[self.offset:])
+ if match is None:
+ subtype = "string" if delim == '"' else 'link'
+ raise TokenizerError(f"Reached end of formula while parsing {subtype} in {self.formula}")
+ match = match.group(0)
+ if delim == '"':
+ self.items.append(Token.make_operand(match))
+ else:
+ self.token.append(match)
+ return len(match)
+
+ def _parse_brackets(self):
+ """
+ Consume all the text between square brackets [].
+
+ Returns the number of characters matched. (Does not update
+ self.offset)
+
+ """
+ assert self.formula[self.offset] == '['
+ lefts = [(t.start(), 1) for t in
+ re.finditer(r"\[", self.formula[self.offset:])]
+ rights = [(t.start(), -1) for t in
+ re.finditer(r"\]", self.formula[self.offset:])]
+
+ open_count = 0
+ for idx, open_close in sorted(lefts + rights):
+ open_count += open_close
+ if open_count == 0:
+ outer_right = idx + 1
+ self.token.append(
+ self.formula[self.offset:self.offset + outer_right])
+ return outer_right
+
+ raise TokenizerError(f"Encountered unmatched '[' in {self.formula}")
+
+ def _parse_error(self):
+ """
+ Consume the text following a '#' as an error.
+
+ Looks for a match in self.ERROR_CODES and returns the number of
+ characters matched. (Does not update self.offset)
+
+ """
+ self.assert_empty_token(can_follow='!')
+ assert self.formula[self.offset] == '#'
+ subformula = self.formula[self.offset:]
+ for err in self.ERROR_CODES:
+ if subformula.startswith(err):
+ self.items.append(Token.make_operand(''.join(self.token) + err))
+ del self.token[:]
+ return len(err)
+ raise TokenizerError(f"Invalid error code at position {self.offset} in '{self.formula}'")
+
+ def _parse_whitespace(self):
+ """
+        Consume a run of consecutive spaces and/or newlines.
+
+        Only one whitespace token is stored (its value is the first
+        character of the run), but the entire run is consumed. Returns
+        the number of whitespace characters found. (Does not update
+        self.offset)
+
+ """
+ assert self.formula[self.offset] in (' ', '\n')
+ self.items.append(Token(self.formula[self.offset], Token.WSPACE))
+ return self.WSPACE_RE.match(self.formula[self.offset:]).end()
+
+ def _parse_operator(self):
+ """
+ Consume the characters constituting an operator.
+
+ Returns the number of characters consumed. (Does not update
+ self.offset)
+
+ """
+ if self.formula[self.offset:self.offset + 2] in ('>=', '<=', '<>'):
+ self.items.append(Token(
+ self.formula[self.offset:self.offset + 2],
+ Token.OP_IN
+ ))
+ return 2
+ curr_char = self.formula[self.offset] # guaranteed to be 1 char
+ assert curr_char in '%*/^&=><+-'
+ if curr_char == '%':
+ token = Token('%', Token.OP_POST)
+ elif curr_char in "*/^&=><":
+ token = Token(curr_char, Token.OP_IN)
+ # From here on, curr_char is guaranteed to be in '+-'
+ elif not self.items:
+ token = Token(curr_char, Token.OP_PRE)
+ else:
+ prev = next((i for i in reversed(self.items)
+ if i.type != Token.WSPACE), None)
+ is_infix = prev and (
+ prev.subtype == Token.CLOSE
+ or prev.type == Token.OP_POST
+ or prev.type == Token.OPERAND
+ )
+ if is_infix:
+ token = Token(curr_char, Token.OP_IN)
+ else:
+ token = Token(curr_char, Token.OP_PRE)
+ self.items.append(token)
+ return 1
+
+ def _parse_opener(self):
+ """
+ Consumes a ( or { character.
+
+ Returns the number of characters consumed. (Does not update
+ self.offset)
+
+ """
+ assert self.formula[self.offset] in ('(', '{')
+ if self.formula[self.offset] == '{':
+ self.assert_empty_token()
+ token = Token.make_subexp("{")
+ elif self.token:
+ token_value = "".join(self.token) + '('
+ del self.token[:]
+ token = Token.make_subexp(token_value)
+ else:
+ token = Token.make_subexp("(")
+ self.items.append(token)
+ self.token_stack.append(token)
+ return 1
+
+ def _parse_closer(self):
+ """
+ Consumes a } or ) character.
+
+ Returns the number of characters consumed. (Does not update
+ self.offset)
+
+ """
+ assert self.formula[self.offset] in (')', '}')
+ token = self.token_stack.pop().get_closer()
+ if token.value != self.formula[self.offset]:
+            raise TokenizerError(
+                f"Mismatched ( and {{ pair in '{self.formula}'")
+ self.items.append(token)
+ return 1
+
+ def _parse_separator(self):
+ """
+ Consumes a ; or , character.
+
+ Returns the number of characters consumed. (Does not update
+ self.offset)
+
+ """
+ curr_char = self.formula[self.offset]
+ assert curr_char in (';', ',')
+ if curr_char == ';':
+ token = Token.make_separator(";")
+ else:
+ try:
+ top_type = self.token_stack[-1].type
+ except IndexError:
+ token = Token(",", Token.OP_IN) # Range Union operator
+ else:
+ if top_type == Token.PAREN:
+ token = Token(",", Token.OP_IN) # Range Union operator
+ else:
+ token = Token.make_separator(",")
+ self.items.append(token)
+ return 1
+
+ def check_scientific_notation(self):
+ """
+ Consumes a + or - character if part of a number in sci. notation.
+
+ Returns True if the character was consumed and self.offset was
+ updated, False otherwise.
+
+ """
+ curr_char = self.formula[self.offset]
+ if (curr_char in '+-'
+ and len(self.token) >= 1
+ and self.SN_RE.match("".join(self.token))):
+ self.token.append(curr_char)
+ self.offset += 1
+ return True
+ return False
+
+ def assert_empty_token(self, can_follow=()):
+ """
+        Ensure that there is no token currently being parsed, or, if
+        there is, that it ends with a character in `can_follow`.
+
+        Otherwise we have hit an unexpected token transition, and a
+        TokenizerError is raised.
+
+ """
+ if self.token and self.token[-1] not in can_follow:
+ raise TokenizerError(f"Unexpected character at position {self.offset} in '{self.formula}'")
+
+ def save_token(self):
+ """If there's a token being parsed, add it to the item list."""
+ if self.token:
+ self.items.append(Token.make_operand("".join(self.token)))
+ del self.token[:]
+
+ def render(self):
+ """Convert the parsed tokens back to a string."""
+ if not self.items:
+ return ""
+ elif self.items[0].type == Token.LITERAL:
+ return self.items[0].value
+ return "=" + "".join(token.value for token in self.items)
+
+
+class Token:
+
+ """
+ A token in an Excel formula.
+
+ Tokens have three attributes:
+
+    * `value`: The string that was parsed to produce this token
+    * `type`: A string identifying the type of the token
+    * `subtype`: A string identifying the subtype of the token (optional,
+      defaults to "")
+
+ """
+
+ __slots__ = ['value', 'type', 'subtype']
+
+ LITERAL = "LITERAL"
+ OPERAND = "OPERAND"
+ FUNC = "FUNC"
+ ARRAY = "ARRAY"
+ PAREN = "PAREN"
+ SEP = "SEP"
+ OP_PRE = "OPERATOR-PREFIX"
+ OP_IN = "OPERATOR-INFIX"
+ OP_POST = "OPERATOR-POSTFIX"
+ WSPACE = "WHITE-SPACE"
+
+ def __init__(self, value, type_, subtype=""):
+ self.value = value
+ self.type = type_
+ self.subtype = subtype
+
+ # Literal operands:
+ #
+ # Literal operands are always of type 'OPERAND' and can be of subtype
+ # 'TEXT' (for text strings), 'NUMBER' (for all numeric types), 'LOGICAL'
+ # (for TRUE and FALSE), 'ERROR' (for literal error values), or 'RANGE'
+ # (for all range references).
+
+ TEXT = 'TEXT'
+ NUMBER = 'NUMBER'
+ LOGICAL = 'LOGICAL'
+ ERROR = 'ERROR'
+ RANGE = 'RANGE'
+
+ def __repr__(self):
+ return u"{0} {1} {2}:".format(self.type, self.subtype, self.value)
+
+ @classmethod
+ def make_operand(cls, value):
+ """Create an operand token."""
+ if value.startswith('"'):
+ subtype = cls.TEXT
+ elif value.startswith('#'):
+ subtype = cls.ERROR
+ elif value in ('TRUE', 'FALSE'):
+ subtype = cls.LOGICAL
+ else:
+ try:
+ float(value)
+ subtype = cls.NUMBER
+ except ValueError:
+ subtype = cls.RANGE
+ return cls(value, cls.OPERAND, subtype)
+
+
+    # Subexpressions
+ #
+ # There are 3 types of `Subexpressions`: functions, array literals, and
+ # parentheticals. Subexpressions have 'OPEN' and 'CLOSE' tokens. 'OPEN'
+ # is used when parsing the initial expression token (i.e., '(' or '{')
+ # and 'CLOSE' is used when parsing the closing expression token ('}' or
+ # ')').
+
+ OPEN = "OPEN"
+ CLOSE = "CLOSE"
+
+ @classmethod
+ def make_subexp(cls, value, func=False):
+ """
+ Create a subexpression token.
+
+ `value`: The value of the token
+ `func`: If True, force the token to be of type FUNC
+
+ """
+ assert value[-1] in ('{', '}', '(', ')')
+ if func:
+            assert re.match(r'.+\(|\)', value)
+ type_ = Token.FUNC
+ elif value in '{}':
+ type_ = Token.ARRAY
+ elif value in '()':
+ type_ = Token.PAREN
+ else:
+ type_ = Token.FUNC
+ subtype = cls.CLOSE if value in ')}' else cls.OPEN
+ return cls(value, type_, subtype)
+
+ def get_closer(self):
+ """Return a closing token that matches this token's type."""
+ assert self.type in (self.FUNC, self.ARRAY, self.PAREN)
+ assert self.subtype == self.OPEN
+ value = "}" if self.type == self.ARRAY else ")"
+ return self.make_subexp(value, func=self.type == self.FUNC)
+
+ # Separator tokens
+ #
+ # Argument separators always have type 'SEP' and can have one of two
+ # subtypes: 'ARG', 'ROW'. 'ARG' is used for the ',' token, when used to
+ # delimit either function arguments or array elements. 'ROW' is used for
+ # the ';' token, which is always used to delimit rows in an array
+ # literal.
+
+ ARG = "ARG"
+ ROW = "ROW"
+
+ @classmethod
+ def make_separator(cls, value):
+ """Create a separator token"""
+ assert value in (',', ';')
+ subtype = cls.ARG if value == ',' else cls.ROW
+ return cls(value, cls.SEP, subtype)