Diffstat (limited to '.venv/lib/python3.12/site-packages/lark/parsers')
11 files changed, 2517 insertions, 0 deletions
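The modules added below are Lark's parser engines (CYK, Earley with SPPF output, grammar analysis, and LALR(1) table construction) as vendored into the virtualenv. For orientation only, here is a minimal sketch of how these engines are normally selected through Lark's public entry point; the grammar and options are illustrative examples and are not part of this changeset.

from lark import Lark

# Earley engine (earley.py / earley_forest.py below): accepts any context-free
# grammar and can keep ambiguities as '_ambig' nodes derived from the SPPF.
earley_parser = Lark(r"""
    start: WORD "," WORD "!"
    %import common.WORD
    %ignore " "
""", parser="earley", ambiguity="explicit")

print(earley_parser.parse("Hello, World!").pretty())

# parser="lalr" builds its tables with lalr_analysis.py; parser="cyk" uses cyk.py.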
diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/__init__.py b/.venv/lib/python3.12/site-packages/lark/parsers/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/__init__.py diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/cyk.py b/.venv/lib/python3.12/site-packages/lark/parsers/cyk.py new file mode 100644 index 00000000..ff0924f2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/cyk.py @@ -0,0 +1,345 @@ +"""This module implements a CYK parser.""" + +# Author: https://github.com/ehudt (2018) +# +# Adapted by Erez + + +from collections import defaultdict +import itertools + +from ..exceptions import ParseError +from ..lexer import Token +from ..tree import Tree +from ..grammar import Terminal as T, NonTerminal as NT, Symbol + +try: + xrange +except NameError: + xrange = range + +def match(t, s): + assert isinstance(t, T) + return t.name == s.type + + +class Rule(object): + """Context-free grammar rule.""" + + def __init__(self, lhs, rhs, weight, alias): + super(Rule, self).__init__() + assert isinstance(lhs, NT), lhs + assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs + self.lhs = lhs + self.rhs = rhs + self.weight = weight + self.alias = alias + + def __str__(self): + return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs)) + + def __repr__(self): + return str(self) + + def __hash__(self): + return hash((self.lhs, tuple(self.rhs))) + + def __eq__(self, other): + return self.lhs == other.lhs and self.rhs == other.rhs + + def __ne__(self, other): + return not (self == other) + + +class Grammar(object): + """Context-free grammar.""" + + def __init__(self, rules): + self.rules = frozenset(rules) + + def __eq__(self, other): + return self.rules == other.rules + + def __str__(self): + return '\n' + '\n'.join(sorted(repr(x) for x in self.rules)) + '\n' + + def __repr__(self): + return str(self) + + +# Parse tree data structures +class RuleNode(object): + """A node in the parse tree, which also contains the full rhs rule.""" + + def __init__(self, rule, children, weight=0): + self.rule = rule + self.children = children + self.weight = weight + + def __repr__(self): + return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', '.join(str(x) for x in self.children)) + + + +class Parser(object): + """Parser wrapper.""" + + def __init__(self, rules): + super(Parser, self).__init__() + self.orig_rules = {rule: rule for rule in rules} + rules = [self._to_rule(rule) for rule in rules] + self.grammar = to_cnf(Grammar(rules)) + + def _to_rule(self, lark_rule): + """Converts a lark rule, (lhs, rhs, callback, options), to a Rule.""" + assert isinstance(lark_rule.origin, NT) + assert all(isinstance(x, Symbol) for x in lark_rule.expansion) + return Rule( + lark_rule.origin, lark_rule.expansion, + weight=lark_rule.options.priority if lark_rule.options.priority else 0, + alias=lark_rule) + + def parse(self, tokenized, start): # pylint: disable=invalid-name + """Parses input, which is a list of tokens.""" + assert start + start = NT(start) + + table, trees = _parse(tokenized, self.grammar) + # Check if the parse succeeded. 
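+ # The input is accepted only if some rule with the start symbol on its left-hand side spans the whole input, i.e. appears in table cell (0, len(tokenized) - 1).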
+ if all(r.lhs != start for r in table[(0, len(tokenized) - 1)]): + raise ParseError('Parsing failed.') + parse = trees[(0, len(tokenized) - 1)][start] + return self._to_tree(revert_cnf(parse)) + + def _to_tree(self, rule_node): + """Converts a RuleNode parse tree to a lark Tree.""" + orig_rule = self.orig_rules[rule_node.rule.alias] + children = [] + for child in rule_node.children: + if isinstance(child, RuleNode): + children.append(self._to_tree(child)) + else: + assert isinstance(child.name, Token) + children.append(child.name) + t = Tree(orig_rule.origin, children) + t.rule=orig_rule + return t + + +def print_parse(node, indent=0): + if isinstance(node, RuleNode): + print(' ' * (indent * 2) + str(node.rule.lhs)) + for child in node.children: + print_parse(child, indent + 1) + else: + print(' ' * (indent * 2) + str(node.s)) + + +def _parse(s, g): + """Parses sentence 's' using CNF grammar 'g'.""" + # The CYK table. Indexed with a 2-tuple: (start pos, end pos) + table = defaultdict(set) + # Top-level structure is similar to the CYK table. Each cell is a dict from + # rule name to the best (lightest) tree for that rule. + trees = defaultdict(dict) + # Populate base case with existing terminal production rules + for i, w in enumerate(s): + for terminal, rules in g.terminal_rules.items(): + if match(terminal, w): + for rule in rules: + table[(i, i)].add(rule) + if (rule.lhs not in trees[(i, i)] or + rule.weight < trees[(i, i)][rule.lhs].weight): + trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight) + + # Iterate over lengths of sub-sentences + for l in xrange(2, len(s) + 1): + # Iterate over sub-sentences with the given length + for i in xrange(len(s) - l + 1): + # Choose partition of the sub-sentence in [1, l) + for p in xrange(i + 1, i + l): + span1 = (i, p - 1) + span2 = (p, i + l - 1) + for r1, r2 in itertools.product(table[span1], table[span2]): + for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []): + table[(i, i + l - 1)].add(rule) + r1_tree = trees[span1][r1.lhs] + r2_tree = trees[span2][r2.lhs] + rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight + if (rule.lhs not in trees[(i, i + l - 1)] + or rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight): + trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight) + return table, trees + + +# This section implements context-free grammar converter to Chomsky normal form. +# It also implements a conversion of parse trees from its CNF to the original +# grammar. +# Overview: +# Applies the following operations in this order: +# * TERM: Eliminates non-solitary terminals from all rules +# * BIN: Eliminates rules with more than 2 symbols on their right-hand-side. +# * UNIT: Eliminates non-terminal unit rules +# +# The following grammar characteristics aren't featured: +# * Start symbol appears on RHS +# * Empty rules (epsilon rules) + + +class CnfWrapper(object): + """CNF wrapper for grammar. + + Validates that the input grammar is CNF and provides helper data structures. + """ + + def __init__(self, grammar): + super(CnfWrapper, self).__init__() + self.grammar = grammar + self.rules = grammar.rules + self.terminal_rules = defaultdict(list) + self.nonterminal_rules = defaultdict(list) + for r in self.rules: + # Validate that the grammar is CNF and populate auxiliary data structures. 
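+ # In CNF every rule is either NT -> terminal or NT -> NT NT; anything else, including an empty right-hand side, is rejected below.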
+ assert isinstance(r.lhs, NT), r + if len(r.rhs) not in [1, 2]: + raise ParseError("CYK doesn't support empty rules") + if len(r.rhs) == 1 and isinstance(r.rhs[0], T): + self.terminal_rules[r.rhs[0]].append(r) + elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs): + self.nonterminal_rules[tuple(r.rhs)].append(r) + else: + assert False, r + + def __eq__(self, other): + return self.grammar == other.grammar + + def __repr__(self): + return repr(self.grammar) + + +class UnitSkipRule(Rule): + """A rule that records NTs that were skipped during transformation.""" + + def __init__(self, lhs, rhs, skipped_rules, weight, alias): + super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias) + self.skipped_rules = skipped_rules + + def __eq__(self, other): + return isinstance(other, type(self)) and self.skipped_rules == other.skipped_rules + + __hash__ = Rule.__hash__ + + +def build_unit_skiprule(unit_rule, target_rule): + skipped_rules = [] + if isinstance(unit_rule, UnitSkipRule): + skipped_rules += unit_rule.skipped_rules + skipped_rules.append(target_rule) + if isinstance(target_rule, UnitSkipRule): + skipped_rules += target_rule.skipped_rules + return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules, + weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias) + + +def get_any_nt_unit_rule(g): + """Returns a non-terminal unit rule from 'g', or None if there is none.""" + for rule in g.rules: + if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT): + return rule + return None + + +def _remove_unit_rule(g, rule): + """Removes 'rule' from 'g' without changing the langugage produced by 'g'.""" + new_rules = [x for x in g.rules if x != rule] + refs = [x for x in g.rules if x.lhs == rule.rhs[0]] + new_rules += [build_unit_skiprule(rule, ref) for ref in refs] + return Grammar(new_rules) + + +def _split(rule): + """Splits a rule whose len(rhs) > 2 into shorter rules.""" + rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs) + rule_name = '__SP_%s' % (rule_str) + '_%d' + yield Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias) + for i in xrange(1, len(rule.rhs) - 2): + yield Rule(NT(rule_name % i), [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split') + yield Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split') + + +def _term(g): + """Applies the TERM rule on 'g' (see top comment).""" + all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)} + t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t} + new_rules = [] + for rule in g.rules: + if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs): + new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs] + new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias)) + new_rules.extend(v for k, v in t_rules.items() if k in rule.rhs) + else: + new_rules.append(rule) + return Grammar(new_rules) + + +def _bin(g): + """Applies the BIN rule to 'g' (see top comment).""" + new_rules = [] + for rule in g.rules: + if len(rule.rhs) > 2: + new_rules += _split(rule) + else: + new_rules.append(rule) + return Grammar(new_rules) + + +def _unit(g): + """Applies the UNIT rule to 'g' (see top comment).""" + nt_unit_rule = get_any_nt_unit_rule(g) + while nt_unit_rule: + g = _remove_unit_rule(g, nt_unit_rule) + nt_unit_rule = get_any_nt_unit_rule(g) + return g + + +def to_cnf(g): + """Creates a CNF grammar from a general context-free grammar 'g'.""" + g = 
_unit(_bin(_term(g))) + return CnfWrapper(g) + + +def unroll_unit_skiprule(lhs, orig_rhs, skipped_rules, children, weight, alias): + if not skipped_rules: + return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight) + else: + weight = weight - skipped_rules[0].weight + return RuleNode( + Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [ + unroll_unit_skiprule(skipped_rules[0].lhs, orig_rhs, + skipped_rules[1:], children, + skipped_rules[0].weight, skipped_rules[0].alias) + ], weight=weight) + + +def revert_cnf(node): + """Reverts a parse tree (RuleNode) to its original non-CNF form (Node).""" + if isinstance(node, T): + return node + # Reverts TERM rule. + if node.rule.lhs.name.startswith('__T_'): + return node.children[0] + else: + children = [] + for child in map(revert_cnf, node.children): + # Reverts BIN rule. + if isinstance(child, RuleNode) and child.rule.lhs.name.startswith('__SP_'): + children += child.children + else: + children.append(child) + # Reverts UNIT rule. + if isinstance(node.rule, UnitSkipRule): + return unroll_unit_skiprule(node.rule.lhs, node.rule.rhs, + node.rule.skipped_rules, children, + node.rule.weight, node.rule.alias) + else: + return RuleNode(node.rule, children) diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/earley.py b/.venv/lib/python3.12/site-packages/lark/parsers/earley.py new file mode 100644 index 00000000..5961d5d5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/earley.py @@ -0,0 +1,326 @@ +"""This module implements an Earley parser. + +The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: + https://www.sciencedirect.com/science/article/pii/S1571066108001497 + +That is probably the best reference for understanding the algorithm here. + +The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format +is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html +""" + +from collections import deque + +from ..tree import Tree +from ..exceptions import UnexpectedEOF, UnexpectedToken +from ..utils import logger +from .grammar_analysis import GrammarAnalyzer +from ..grammar import NonTerminal +from .earley_common import Item, TransitiveItem +from .earley_forest import ForestSumVisitor, SymbolNode, ForestToParseTree + +class Parser: + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, debug=False, tree_class=Tree): + analysis = GrammarAnalyzer(parser_conf) + self.parser_conf = parser_conf + self.resolve_ambiguity = resolve_ambiguity + self.debug = debug + self.tree_class = tree_class + + self.FIRST = analysis.FIRST + self.NULLABLE = analysis.NULLABLE + self.callbacks = parser_conf.callbacks + self.predictions = {} + + ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than + # the slow 'isupper' in is_terminal. + self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } + self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } + + self.forest_sum_visitor = None + for rule in parser_conf.rules: + if rule.origin not in self.predictions: + self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] + + ## Detect if any rules have priorities set. If the user specified priority = "none" then + # the priorities will be stripped from all rules before they reach us, allowing us to + # skip the extra tree walk. 
We'll also skip this if the user just didn't specify priorities + # on any rules. + if self.forest_sum_visitor is None and rule.options.priority is not None: + self.forest_sum_visitor = ForestSumVisitor + + self.term_matcher = term_matcher + + + def predict_and_complete(self, i, to_scan, columns, transitives): + """The core Earley Predictor and Completer. + + At each stage of the input, we handling any completed items (things + that matched on the last cycle) and use those to predict what should + come next in the input stream. The completions and any predicted + non-terminals are recursively processed until we reach a set of, + which can be added to the scan list for the next scanner cycle.""" + # Held Completions (H in E.Scotts paper). + node_cache = {} + held_completions = {} + + column = columns[i] + # R (items) = Ei (column.items) + items = deque(column) + while items: + item = items.pop() # remove an element, A say, from R + + ### The Earley completer + if item.is_complete: ### (item.s == string) + if item.node is None: + label = (item.s, item.start, i) + item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + item.node.add_family(item.s, item.rule, item.start, None, None) + + # create_leo_transitives(item.rule.origin, item.start) + + ###R Joop Leo right recursion Completer + if item.rule.origin in transitives[item.start]: + transitive = transitives[item.start][item.s] + if transitive.previous in transitives[transitive.column]: + root_transitive = transitives[transitive.column][transitive.previous] + else: + root_transitive = transitive + + new_item = Item(transitive.rule, transitive.ptr, transitive.start) + label = (root_transitive.s, root_transitive.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_path(root_transitive, item.node) + if new_item.expect in self.TERMINALS: + # Add (B :: aC.B, h, y) to Q + to_scan.add(new_item) + elif new_item not in column: + # Add (B :: aC.B, h, y) to Ei and R + column.add(new_item) + items.append(new_item) + ###R Regular Earley completer + else: + # Empty has 0 length. If we complete an empty symbol in a particular + # parse step, we need to be able to use that same empty symbol to complete + # any predictions that result, that themselves require empty. Avoids + # infinite recursion on empty symbols. + # held_completions is 'H' in E.Scott's paper. + is_empty_item = item.start == i + if is_empty_item: + held_completions[item.rule.origin] = item.node + + originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] + for originator in originators: + new_item = originator.advance() + label = (new_item.s, originator.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node) + if new_item.expect in self.TERMINALS: + # Add (B :: aC.B, h, y) to Q + to_scan.add(new_item) + elif new_item not in column: + # Add (B :: aC.B, h, y) to Ei and R + column.add(new_item) + items.append(new_item) + + ### The Earley predictor + elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) + new_items = [] + for rule in self.predictions[item.expect]: + new_item = Item(rule, 0, i) + new_items.append(new_item) + + # Process any held completions (H). 
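+ # held_completions ('H') maps nonterminals that were completed over an empty span at the current position to their SPPF nodes, so a fresh prediction of such a nonterminal can be completed immediately rather than waiting for another pass.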
+ if item.expect in held_completions: + new_item = item.advance() + label = (new_item.s, item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) + new_items.append(new_item) + + for new_item in new_items: + if new_item.expect in self.TERMINALS: + to_scan.add(new_item) + elif new_item not in column: + column.add(new_item) + items.append(new_item) + + def _parse(self, lexer, columns, to_scan, start_symbol=None): + def is_quasi_complete(item): + if item.is_complete: + return True + + quasi = item.advance() + while not quasi.is_complete: + if quasi.expect not in self.NULLABLE: + return False + if quasi.rule.origin == start_symbol and quasi.expect == start_symbol: + return False + quasi = quasi.advance() + return True + + def create_leo_transitives(origin, start): + visited = set() + to_create = [] + trule = None + previous = None + + ### Recursively walk backwards through the Earley sets until we find the + # first transitive candidate. If this is done continuously, we shouldn't + # have to walk more than 1 hop. + while True: + if origin in transitives[start]: + previous = trule = transitives[start][origin] + break + + is_empty_rule = not self.FIRST[origin] + if is_empty_rule: + break + + candidates = [ candidate for candidate in columns[start] if candidate.expect is not None and origin == candidate.expect ] + if len(candidates) != 1: + break + originator = next(iter(candidates)) + + if originator is None or originator in visited: + break + + visited.add(originator) + if not is_quasi_complete(originator): + break + + trule = originator.advance() + if originator.start != start: + visited.clear() + + to_create.append((origin, start, originator)) + origin = originator.rule.origin + start = originator.start + + # If a suitable Transitive candidate is not found, bail. + if trule is None: + return + + #### Now walk forwards and create Transitive Items in each set we walked through; and link + # each transitive item to the next set forwards. + while to_create: + origin, start, originator = to_create.pop() + titem = None + if previous is not None: + titem = previous.next_titem = TransitiveItem(origin, trule, originator, previous.column) + else: + titem = TransitiveItem(origin, trule, originator, start) + previous = transitives[start][origin] = titem + + + + def scan(i, token, to_scan): + """The core Earley Scanner. + + This is a custom implementation of the scanner that uses the + Lark lexer to match tokens. The scan list is built by the + Earley predictor, based on the previously completed tokens. 
+ This ensures that at each phase of the parse we have a custom + lexer context, allowing for more complex ambiguities.""" + next_to_scan = set() + next_set = set() + columns.append(next_set) + transitives.append({}) + node_cache = {} + + for item in set(to_scan): + if match(item.expect, token): + new_item = item.advance() + label = (new_item.s, new_item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) + + if new_item.expect in self.TERMINALS: + # add (B ::= Aai+1.B, h, y) to Q' + next_to_scan.add(new_item) + else: + # add (B ::= Aa+1.B, h, y) to Ei+1 + next_set.add(new_item) + + if not next_set and not next_to_scan: + expect = {i.expect.name for i in to_scan} + raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan)) + + return next_to_scan + + + # Define parser functions + match = self.term_matcher + + # Cache for nodes & tokens created in a particular parse step. + transitives = [{}] + + ## The main Earley loop. + # Run the Prediction/Completion cycle for any Items in the current Earley set. + # Completions will be added to the SPPF tree, and predictions will be recursively + # processed down to terminals/empty nodes to be added to the scanner for the next + # step. + expects = {i.expect for i in to_scan} + i = 0 + for token in lexer.lex(expects): + self.predict_and_complete(i, to_scan, columns, transitives) + + to_scan = scan(i, token, to_scan) + i += 1 + + expects.clear() + expects |= {i.expect for i in to_scan} + + self.predict_and_complete(i, to_scan, columns, transitives) + + ## Column is now the final column in the parse. + assert i == len(columns)-1 + return to_scan + + def parse(self, lexer, start): + assert start, start + start_symbol = NonTerminal(start) + + columns = [set()] + to_scan = set() # The scan buffer. 'Q' in E.Scott's paper. + + ## Predict for the start_symbol. + # Add predicted items to the first Earley set (for the predictor) if they + # result in a non-terminal, or the scanner if they result in a terminal. + for rule in self.predictions[start_symbol]: + item = Item(rule, 0, 0) + if item.expect in self.TERMINALS: + to_scan.add(item) + else: + columns[0].add(item) + + to_scan = self._parse(lexer, columns, to_scan, start_symbol) + + # If the parse was successful, the start + # symbol should have been completed in the last step of the Earley cycle, and will be in + # this column. Find the item for the start_symbol, which is the root of the SPPF tree. + solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] + if not solutions: + expected_terminals = [t.expect.name for t in to_scan] + raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan)) + + if self.debug: + from .earley_forest import ForestToPyDotVisitor + try: + debug_walker = ForestToPyDotVisitor() + except ImportError: + logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image") + else: + debug_walker.visit(solutions[0], "sppf.png") + + + if len(solutions) > 1: + assert False, 'Earley should not generate multiple start symbol items!' 
+ + if self.tree_class is not None: + # Perform our SPPF -> AST conversion + transformer = ForestToParseTree(self.tree_class, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity) + return transformer.transform(solutions[0]) + + # return the root of the SPPF + return solutions[0] diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/earley_common.py b/.venv/lib/python3.12/site-packages/lark/parsers/earley_common.py new file mode 100644 index 00000000..6bd614ba --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/earley_common.py @@ -0,0 +1,75 @@ +"This module implements an Earley Parser" + +# The parser uses a parse-forest to keep track of derivations and ambiguations. +# When the parse ends successfully, a disambiguation stage resolves all ambiguity +# (right now ambiguity resolution is not developed beyond the needs of lark) +# Afterwards the parse tree is reduced (transformed) according to user callbacks. +# I use the no-recursion version of Transformer, because the tree might be +# deeper than Python's recursion limit (a bit absurd, but that's life) +# +# The algorithm keeps track of each state set, using a corresponding Column instance. +# Column keeps track of new items using NewsList instances. +# +# Author: Erez Shinan (2017) +# Email : erezshin@gmail.com + +from ..grammar import NonTerminal, Terminal + +class Item(object): + "An Earley Item, the atom of the algorithm." + + __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash') + def __init__(self, rule, ptr, start): + self.is_complete = len(rule.expansion) == ptr + self.rule = rule # rule + self.ptr = ptr # ptr + self.start = start # j + self.node = None # w + if self.is_complete: + self.s = rule.origin + self.expect = None + self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None + else: + self.s = (rule, ptr) + self.expect = rule.expansion[ptr] + self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None + self._hash = hash((self.s, self.start)) + + def advance(self): + return Item(self.rule, self.ptr + 1, self.start) + + def __eq__(self, other): + return self is other or (self.s == other.s and self.start == other.start) + + def __hash__(self): + return self._hash + + def __repr__(self): + before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] ) + after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] ) + symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after)) + return '%s (%d)' % (symbol, self.start) + + +class TransitiveItem(Item): + __slots__ = ('recognized', 'reduction', 'column', 'next_titem') + def __init__(self, recognized, trule, originator, start): + super(TransitiveItem, self).__init__(trule.rule, trule.ptr, trule.start) + self.recognized = recognized + self.reduction = originator + self.column = start + self.next_titem = None + self._hash = hash((self.s, self.start, self.recognized)) + + def __eq__(self, other): + if not isinstance(other, TransitiveItem): + return False + return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.recognized == other.recognized) + + def __hash__(self): + return self._hash + + def __repr__(self): + before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] ) + after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] ) + return '{} : {} -> {}* {} ({}, 
{})'.format(self.recognized.name, self.rule.origin.name, ' '.join(before), ' '.join(after), self.column, self.start) diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/earley_forest.py b/.venv/lib/python3.12/site-packages/lark/parsers/earley_forest.py new file mode 100644 index 00000000..f39f4eb1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/earley_forest.py @@ -0,0 +1,790 @@ +""""This module implements an SPPF implementation + +This is used as the primary output mechanism for the Earley parser +in order to store complex ambiguities. + +Full reference and more details is here: +http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ +""" + +from random import randint +from math import isinf +from collections import deque +from operator import attrgetter +from importlib import import_module +from functools import partial + +from ..parse_tree_builder import AmbiguousIntermediateExpander +from ..visitors import Discard +from ..lexer import Token +from ..utils import logger +from ..tree import Tree + +class ForestNode(object): + pass + +class SymbolNode(ForestNode): + """ + A Symbol Node represents a symbol (or Intermediate LR0). + + Symbol nodes are keyed by the symbol (s). For intermediate nodes + s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol + nodes, s will be a string representing the non-terminal origin (i.e. + the left hand side of the rule). + + The children of a Symbol or Intermediate Node will always be Packed Nodes; + with each Packed Node child representing a single derivation of a production. + + Hence a Symbol Node with a single child is unambiguous. + + :ivar s: A Symbol, or a tuple of (rule, ptr) for an intermediate node. + :ivar start: The index of the start of the substring matched by this + symbol (inclusive). + :ivar end: The index of the end of the substring matched by this + symbol (exclusive). + :ivar is_intermediate: True if this node is an intermediate node. + :ivar priority: The priority of the node's symbol. + """ + __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash') + def __init__(self, s, start, end): + self.s = s + self.start = start + self.end = end + self._children = set() + self.paths = set() + self.paths_loaded = False + + ### We use inf here as it can be safely negated without resorting to conditionals, + # unlike None or float('NaN'), and sorts appropriately. 
+ self.priority = float('-inf') + self.is_intermediate = isinstance(s, tuple) + self._hash = hash((self.s, self.start, self.end)) + + def add_family(self, lr0, rule, start, left, right): + self._children.add(PackedNode(self, lr0, rule, start, left, right)) + + def add_path(self, transitive, node): + self.paths.add((transitive, node)) + + def load_paths(self): + for transitive, node in self.paths: + if transitive.next_titem is not None: + vn = SymbolNode(transitive.next_titem.s, transitive.next_titem.start, self.end) + vn.add_path(transitive.next_titem, node) + self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn) + else: + self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node) + self.paths_loaded = True + + @property + def is_ambiguous(self): + """Returns True if this node is ambiguous.""" + return len(self.children) > 1 + + @property + def children(self): + """Returns a list of this node's children sorted from greatest to + least priority.""" + if not self.paths_loaded: self.load_paths() + return sorted(self._children, key=attrgetter('sort_key')) + + def __iter__(self): + return iter(self._children) + + def __eq__(self, other): + if not isinstance(other, SymbolNode): + return False + return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.end is other.end) + + def __hash__(self): + return self._hash + + def __repr__(self): + if self.is_intermediate: + rule = self.s[0] + ptr = self.s[1] + before = ( expansion.name for expansion in rule.expansion[:ptr] ) + after = ( expansion.name for expansion in rule.expansion[ptr:] ) + symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after)) + else: + symbol = self.s.name + return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority) + +class PackedNode(ForestNode): + """ + A Packed Node represents a single derivation in a symbol node. + + :ivar rule: The rule associated with this node. + :ivar parent: The parent of this node. + :ivar left: The left child of this node. ``None`` if one does not exist. + :ivar right: The right child of this node. ``None`` if one does not exist. + :ivar priority: The priority of this node. + """ + __slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash') + def __init__(self, parent, s, rule, start, left, right): + self.parent = parent + self.s = s + self.start = start + self.rule = rule + self.left = left + self.right = right + self.priority = float('-inf') + self._hash = hash((self.left, self.right)) + + @property + def is_empty(self): + return self.left is None and self.right is None + + @property + def sort_key(self): + """ + Used to sort PackedNode children of SymbolNodes. + A SymbolNode has multiple PackedNodes if it matched + ambiguously. Hence, we use the sort order to identify + the order in which ambiguous children should be considered. 
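+ Non-empty derivations sort first, then higher priority, then lower rule order.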
+ """ + return self.is_empty, -self.priority, self.rule.order + + @property + def children(self): + """Returns a list of this node's children.""" + return [x for x in [self.left, self.right] if x is not None] + + def __iter__(self): + yield self.left + yield self.right + + def __eq__(self, other): + if not isinstance(other, PackedNode): + return False + return self is other or (self.left == other.left and self.right == other.right) + + def __hash__(self): + return self._hash + + def __repr__(self): + if isinstance(self.s, tuple): + rule = self.s[0] + ptr = self.s[1] + before = ( expansion.name for expansion in rule.expansion[:ptr] ) + after = ( expansion.name for expansion in rule.expansion[ptr:] ) + symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after)) + else: + symbol = self.s.name + return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order) + +class ForestVisitor(object): + """ + An abstract base class for building forest visitors. + + This class performs a controllable depth-first walk of an SPPF. + The visitor will not enter cycles and will backtrack if one is encountered. + Subclasses are notified of cycles through the ``on_cycle`` method. + + Behavior for visit events is defined by overriding the + ``visit*node*`` functions. + + The walk is controlled by the return values of the ``visit*node_in`` + methods. Returning a node(s) will schedule them to be visited. The visitor + will begin to backtrack if no nodes are returned. + + :ivar single_visit: If ``True``, non-Token nodes will only be visited once. + """ + + def __init__(self, single_visit=False): + self.single_visit = single_visit + + def visit_token_node(self, node): + """Called when a ``Token`` is visited. ``Token`` nodes are always leaves.""" + pass + + def visit_symbol_node_in(self, node): + """Called when a symbol node is visited. Nodes that are returned + will be scheduled to be visited. If ``visit_intermediate_node_in`` + is not implemented, this function will be called for intermediate + nodes as well.""" + pass + + def visit_symbol_node_out(self, node): + """Called after all nodes returned from a corresponding ``visit_symbol_node_in`` + call have been visited. If ``visit_intermediate_node_out`` + is not implemented, this function will be called for intermediate + nodes as well.""" + pass + + def visit_packed_node_in(self, node): + """Called when a packed node is visited. Nodes that are returned + will be scheduled to be visited. """ + pass + + def visit_packed_node_out(self, node): + """Called after all nodes returned from a corresponding ``visit_packed_node_in`` + call have been visited.""" + pass + + def on_cycle(self, node, path): + """Called when a cycle is encountered. + + :param node: The node that causes a cycle. + :param path: The list of nodes being visited: nodes that have been + entered but not exited. The first element is the root in a forest + visit, and the last element is the node visited most recently. + ``path`` should be treated as read-only. + """ + pass + + def get_cycle_in_path(self, node, path): + """A utility function for use in ``on_cycle`` to obtain a slice of + ``path`` that only contains the nodes that make up the cycle.""" + index = len(path) - 1 + while id(path[index]) != id(node): + index -= 1 + return path[index:] + + def visit(self, root): + # Visiting is a list of IDs of all symbol/intermediate nodes currently in + # the stack. 
It serves two purposes: to detect when we 'recurse' in and out + # of a symbol/intermediate so that we can process both up and down. Also, + # since the SPPF can have cycles it allows us to detect if we're trying + # to recurse into a node that's already on the stack (infinite recursion). + visiting = set() + + # set of all nodes that have been visited + visited = set() + + # a list of nodes that are currently being visited + # used for the `on_cycle` callback + path = [] + + # We do not use recursion here to walk the Forest due to the limited + # stack size in python. Therefore input_stack is essentially our stack. + input_stack = deque([root]) + + # It is much faster to cache these as locals since they are called + # many times in large parses. + vpno = getattr(self, 'visit_packed_node_out') + vpni = getattr(self, 'visit_packed_node_in') + vsno = getattr(self, 'visit_symbol_node_out') + vsni = getattr(self, 'visit_symbol_node_in') + vino = getattr(self, 'visit_intermediate_node_out', vsno) + vini = getattr(self, 'visit_intermediate_node_in', vsni) + vtn = getattr(self, 'visit_token_node') + oc = getattr(self, 'on_cycle') + + while input_stack: + current = next(reversed(input_stack)) + try: + next_node = next(current) + except StopIteration: + input_stack.pop() + continue + except TypeError: + ### If the current object is not an iterator, pass through to Token/SymbolNode + pass + else: + if next_node is None: + continue + + if id(next_node) in visiting: + oc(next_node, path) + continue + + input_stack.append(next_node) + continue + + if not isinstance(current, ForestNode): + vtn(current) + input_stack.pop() + continue + + current_id = id(current) + if current_id in visiting: + if isinstance(current, PackedNode): + vpno(current) + elif current.is_intermediate: + vino(current) + else: + vsno(current) + input_stack.pop() + path.pop() + visiting.remove(current_id) + visited.add(current_id) + elif self.single_visit and current_id in visited: + input_stack.pop() + else: + visiting.add(current_id) + path.append(current) + if isinstance(current, PackedNode): + next_node = vpni(current) + elif current.is_intermediate: + next_node = vini(current) + else: + next_node = vsni(current) + if next_node is None: + continue + + if not isinstance(next_node, ForestNode) and \ + not isinstance(next_node, Token): + next_node = iter(next_node) + elif id(next_node) in visiting: + oc(next_node, path) + continue + + input_stack.append(next_node) + +class ForestTransformer(ForestVisitor): + """The base class for a bottom-up forest transformation. Most users will + want to use ``TreeForestTransformer`` instead as it has a friendlier + interface and covers most use cases. + + Transformations are applied via inheritance and overriding of the + ``transform*node`` methods. + + ``transform_token_node`` receives a ``Token`` as an argument. + All other methods receive the node that is being transformed and + a list of the results of the transformations of that node's children. + The return value of these methods are the resulting transformations. + + If ``Discard`` is raised in a node's transformation, no data from that node + will be passed to its parent's transformation. 
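+ Transformations do not mutate the SPPF itself; results are accumulated separately and the final value is returned from ``transform``.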
+ """ + + def __init__(self): + super(ForestTransformer, self).__init__() + # results of transformations + self.data = dict() + # used to track parent nodes + self.node_stack = deque() + + def transform(self, root): + """Perform a transformation on an SPPF.""" + self.node_stack.append('result') + self.data['result'] = [] + self.visit(root) + assert len(self.data['result']) <= 1 + if self.data['result']: + return self.data['result'][0] + + def transform_symbol_node(self, node, data): + """Transform a symbol node.""" + return node + + def transform_intermediate_node(self, node, data): + """Transform an intermediate node.""" + return node + + def transform_packed_node(self, node, data): + """Transform a packed node.""" + return node + + def transform_token_node(self, node): + """Transform a ``Token``.""" + return node + + def visit_symbol_node_in(self, node): + self.node_stack.append(id(node)) + self.data[id(node)] = [] + return node.children + + def visit_packed_node_in(self, node): + self.node_stack.append(id(node)) + self.data[id(node)] = [] + return node.children + + def visit_token_node(self, node): + try: + transformed = self.transform_token_node(node) + except Discard: + pass + else: + self.data[self.node_stack[-1]].append(transformed) + + def visit_symbol_node_out(self, node): + self.node_stack.pop() + try: + transformed = self.transform_symbol_node(node, self.data[id(node)]) + except Discard: + pass + else: + self.data[self.node_stack[-1]].append(transformed) + finally: + del self.data[id(node)] + + def visit_intermediate_node_out(self, node): + self.node_stack.pop() + try: + transformed = self.transform_intermediate_node(node, self.data[id(node)]) + except Discard: + pass + else: + self.data[self.node_stack[-1]].append(transformed) + finally: + del self.data[id(node)] + + def visit_packed_node_out(self, node): + self.node_stack.pop() + try: + transformed = self.transform_packed_node(node, self.data[id(node)]) + except Discard: + pass + else: + self.data[self.node_stack[-1]].append(transformed) + finally: + del self.data[id(node)] + +class ForestSumVisitor(ForestVisitor): + """ + A visitor for prioritizing ambiguous parts of the Forest. + + This visitor is used when support for explicit priorities on + rules is requested (whether normal, or invert). It walks the + forest (or subsets thereof) and cascades properties upwards + from the leaves. + + It would be ideal to do this during parsing, however this would + require processing each Earley item multiple times. That's + a big performance drawback; so running a forest walk is the + lesser of two evils: there can be significantly more Earley + items created during parsing than there are SPPF nodes in the + final tree. + """ + def __init__(self): + super(ForestSumVisitor, self).__init__(single_visit=True) + + def visit_packed_node_in(self, node): + yield node.left + yield node.right + + def visit_symbol_node_in(self, node): + return iter(node.children) + + def visit_packed_node_out(self, node): + priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options.priority else 0 + priority += getattr(node.right, 'priority', 0) + priority += getattr(node.left, 'priority', 0) + node.priority = priority + + def visit_symbol_node_out(self, node): + node.priority = max(child.priority for child in node.children) + +class PackedData(): + """Used in transformationss of packed nodes to distinguish the data + that comes from the left child and the right child. 
+ """ + + class _NoData(): + pass + + NO_DATA = _NoData() + + def __init__(self, node, data): + self.left = self.NO_DATA + self.right = self.NO_DATA + if data: + if node.left is not None: + self.left = data[0] + if len(data) > 1: + self.right = data[1] + else: + self.right = data[0] + +class ForestToParseTree(ForestTransformer): + """Used by the earley parser when ambiguity equals 'resolve' or + 'explicit'. Transforms an SPPF into an (ambiguous) parse tree. + + tree_class: The tree class to use for construction + callbacks: A dictionary of rules to functions that output a tree + prioritizer: A ``ForestVisitor`` that manipulates the priorities of + ForestNodes + resolve_ambiguity: If True, ambiguities will be resolved based on + priorities. Otherwise, `_ambig` nodes will be in the resulting + tree. + use_cache: If True, the results of packed node transformations will be + cached. + """ + + def __init__(self, tree_class=Tree, callbacks=dict(), prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=True): + super(ForestToParseTree, self).__init__() + self.tree_class = tree_class + self.callbacks = callbacks + self.prioritizer = prioritizer + self.resolve_ambiguity = resolve_ambiguity + self._use_cache = use_cache + self._cache = {} + self._on_cycle_retreat = False + self._cycle_node = None + self._successful_visits = set() + + def visit(self, root): + if self.prioritizer: + self.prioritizer.visit(root) + super(ForestToParseTree, self).visit(root) + self._cache = {} + + def on_cycle(self, node, path): + logger.debug("Cycle encountered in the SPPF at node: %s. " + "As infinite ambiguities cannot be represented in a tree, " + "this family of derivations will be discarded.", node) + self._cycle_node = node + self._on_cycle_retreat = True + + def _check_cycle(self, node): + if self._on_cycle_retreat: + if id(node) == id(self._cycle_node) or id(node) in self._successful_visits: + self._cycle_node = None + self._on_cycle_retreat = False + return + raise Discard() + + def _collapse_ambig(self, children): + new_children = [] + for child in children: + if hasattr(child, 'data') and child.data == '_ambig': + new_children += child.children + else: + new_children.append(child) + return new_children + + def _call_rule_func(self, node, data): + # called when transforming children of symbol nodes + # data is a list of trees or tokens that correspond to the + # symbol's rule expansion + return self.callbacks[node.rule](data) + + def _call_ambig_func(self, node, data): + # called when transforming a symbol node + # data is a list of trees where each tree's data is + # equal to the name of the symbol or one of its aliases. 
+ if len(data) > 1: + return self.tree_class('_ambig', data) + elif data: + return data[0] + raise Discard() + + def transform_symbol_node(self, node, data): + if id(node) not in self._successful_visits: + raise Discard() + self._check_cycle(node) + self._successful_visits.remove(id(node)) + data = self._collapse_ambig(data) + return self._call_ambig_func(node, data) + + def transform_intermediate_node(self, node, data): + if id(node) not in self._successful_visits: + raise Discard() + self._check_cycle(node) + self._successful_visits.remove(id(node)) + if len(data) > 1: + children = [self.tree_class('_inter', c) for c in data] + return self.tree_class('_iambig', children) + return data[0] + + def transform_packed_node(self, node, data): + self._check_cycle(node) + if self.resolve_ambiguity and id(node.parent) in self._successful_visits: + raise Discard() + if self._use_cache and id(node) in self._cache: + return self._cache[id(node)] + children = [] + assert len(data) <= 2 + data = PackedData(node, data) + if data.left is not PackedData.NO_DATA: + if node.left.is_intermediate and isinstance(data.left, list): + children += data.left + else: + children.append(data.left) + if data.right is not PackedData.NO_DATA: + children.append(data.right) + if node.parent.is_intermediate: + return self._cache.setdefault(id(node), children) + return self._cache.setdefault(id(node), self._call_rule_func(node, children)) + + def visit_symbol_node_in(self, node): + super(ForestToParseTree, self).visit_symbol_node_in(node) + if self._on_cycle_retreat: + return + return node.children + + def visit_packed_node_in(self, node): + self._on_cycle_retreat = False + to_visit = super(ForestToParseTree, self).visit_packed_node_in(node) + if not self.resolve_ambiguity or id(node.parent) not in self._successful_visits: + if not self._use_cache or id(node) not in self._cache: + return to_visit + + def visit_packed_node_out(self, node): + super(ForestToParseTree, self).visit_packed_node_out(node) + if not self._on_cycle_retreat: + self._successful_visits.add(id(node.parent)) + +def handles_ambiguity(func): + """Decorator for methods of subclasses of ``TreeForestTransformer``. + Denotes that the method should receive a list of transformed derivations.""" + func.handles_ambiguity = True + return func + +class TreeForestTransformer(ForestToParseTree): + """A ``ForestTransformer`` with a tree ``Transformer``-like interface. + By default, it will construct a tree. + + Methods provided via inheritance are called based on the rule/symbol + names of nodes in the forest. + + Methods that act on rules will receive a list of the results of the + transformations of the rule's children. By default, trees and tokens. + + Methods that act on tokens will receive a token. + + Alternatively, methods that act on rules may be annotated with + ``handles_ambiguity``. In this case, the function will receive a list + of all the transformations of all the derivations of the rule. + By default, a list of trees where each tree.data is equal to the + rule name or one of its aliases. + + Non-tree transformations are made possible by override of + ``__default__``, ``__default_token__``, and ``__default_ambig__``. + + .. note:: + + Tree shaping features such as inlined rules and token filtering are + not built into the transformation. Positions are also not + propagated. + + :param tree_class: The tree class to use for construction + :param prioritizer: A ``ForestVisitor`` that manipulates the priorities of + nodes in the SPPF. 
+ :param resolve_ambiguity: If True, ambiguities will be resolved based on + priorities. + :param use_cache: If True, caches the results of some transformations, + potentially improving performance when ``resolve_ambiguity==False``. + Only use if you know what you are doing: i.e. All transformation + functions are pure and referentially transparent. + """ + + def __init__(self, tree_class=Tree, prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=False): + super(TreeForestTransformer, self).__init__(tree_class, dict(), prioritizer, resolve_ambiguity, use_cache) + + def __default__(self, name, data): + """Default operation on tree (for override). + + Returns a tree with name with data as children. + """ + return self.tree_class(name, data) + + def __default_ambig__(self, name, data): + """Default operation on ambiguous rule (for override). + + Wraps data in an '_ambig_' node if it contains more than + one element. + """ + if len(data) > 1: + return self.tree_class('_ambig', data) + elif data: + return data[0] + raise Discard() + + def __default_token__(self, node): + """Default operation on ``Token`` (for override). + + Returns ``node``. + """ + return node + + def transform_token_node(self, node): + return getattr(self, node.type, self.__default_token__)(node) + + def _call_rule_func(self, node, data): + name = node.rule.alias or node.rule.options.template_source or node.rule.origin.name + user_func = getattr(self, name, self.__default__) + if user_func == self.__default__ or hasattr(user_func, 'handles_ambiguity'): + user_func = partial(self.__default__, name) + if not self.resolve_ambiguity: + wrapper = partial(AmbiguousIntermediateExpander, self.tree_class) + user_func = wrapper(user_func) + return user_func(data) + + def _call_ambig_func(self, node, data): + name = node.s.name + user_func = getattr(self, name, self.__default_ambig__) + if user_func == self.__default_ambig__ or not hasattr(user_func, 'handles_ambiguity'): + user_func = partial(self.__default_ambig__, name) + return user_func(data) + +class ForestToPyDotVisitor(ForestVisitor): + """ + A Forest visitor which writes the SPPF to a PNG. + + The SPPF can get really large, really quickly because + of the amount of meta-data it stores, so this is probably + only useful for trivial trees and learning how the SPPF + is structured. 
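+ Requires the optional ``pydot`` package, which is imported lazily in the constructor.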
+ """ + def __init__(self, rankdir="TB"): + super(ForestToPyDotVisitor, self).__init__(single_visit=True) + self.pydot = import_module('pydot') + self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir) + + def visit(self, root, filename): + super(ForestToPyDotVisitor, self).visit(root) + try: + self.graph.write_png(filename) + except FileNotFoundError as e: + logger.error("Could not write png: ", e) + + def visit_token_node(self, node): + graph_node_id = str(id(node)) + graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"')) + graph_node_color = 0x808080 + graph_node_style = "\"filled,rounded\"" + graph_node_shape = "diamond" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + + def visit_packed_node_in(self, node): + graph_node_id = str(id(node)) + graph_node_label = repr(node) + graph_node_color = 0x808080 + graph_node_style = "filled" + graph_node_shape = "diamond" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + yield node.left + yield node.right + + def visit_packed_node_out(self, node): + graph_node_id = str(id(node)) + graph_node = self.graph.get_node(graph_node_id)[0] + for child in [node.left, node.right]: + if child is not None: + child_graph_node_id = str(id(child)) + child_graph_node = self.graph.get_node(child_graph_node_id)[0] + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) + else: + #### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay. + child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890)) + child_graph_node_style = "invis" + child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None") + child_edge_style = "invis" + self.graph.add_node(child_graph_node) + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style)) + + def visit_symbol_node_in(self, node): + graph_node_id = str(id(node)) + graph_node_label = repr(node) + graph_node_color = 0x808080 + graph_node_style = "\"filled\"" + if node.is_intermediate: + graph_node_shape = "ellipse" + else: + graph_node_shape = "rectangle" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + return iter(node.children) + + def visit_symbol_node_out(self, node): + graph_node_id = str(id(node)) + graph_node = self.graph.get_node(graph_node_id)[0] + for child in node.children: + child_graph_node_id = str(id(child)) + child_graph_node = self.graph.get_node(child_graph_node_id)[0] + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/grammar_analysis.py b/.venv/lib/python3.12/site-packages/lark/parsers/grammar_analysis.py new file mode 100644 index 00000000..737cb02a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/grammar_analysis.py @@ -0,0 +1,185 @@ +from collections import Counter, defaultdict + +from ..utils import bfs, fzset, classify +from ..exceptions import GrammarError +from ..grammar import Rule, Terminal, NonTerminal + + +class RulePtr(object): + __slots__ = ('rule', 'index') + + def __init__(self, 
rule, index): + assert isinstance(rule, Rule) + assert index <= len(rule.expansion) + self.rule = rule + self.index = index + + def __repr__(self): + before = [x.name for x in self.rule.expansion[:self.index]] + after = [x.name for x in self.rule.expansion[self.index:]] + return '<%s : %s * %s>' % (self.rule.origin.name, ' '.join(before), ' '.join(after)) + + @property + def next(self): + return self.rule.expansion[self.index] + + def advance(self, sym): + assert self.next == sym + return RulePtr(self.rule, self.index+1) + + @property + def is_satisfied(self): + return self.index == len(self.rule.expansion) + + def __eq__(self, other): + return self.rule == other.rule and self.index == other.index + def __hash__(self): + return hash((self.rule, self.index)) + + +# state generation ensures no duplicate LR0ItemSets +class LR0ItemSet(object): + __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads') + + def __init__(self, kernel, closure): + self.kernel = fzset(kernel) + self.closure = fzset(closure) + self.transitions = {} + self.lookaheads = defaultdict(set) + + def __repr__(self): + return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure])) + + +def update_set(set1, set2): + if not set2 or set1 > set2: + return False + + copy = set(set1) + set1 |= set2 + return set1 != copy + +def calculate_sets(rules): + """Calculate FOLLOW sets. + + Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets""" + symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules} + + # foreach grammar rule X ::= Y(1) ... Y(k) + # if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then + # NULLABLE = NULLABLE union {X} + # for i = 1 to k + # if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then + # FIRST(X) = FIRST(X) union FIRST(Y(i)) + # for j = i+1 to k + # if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then + # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X) + # if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then + # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j)) + # until none of NULLABLE,FIRST,FOLLOW changed in last iteration + + NULLABLE = set() + FIRST = {} + FOLLOW = {} + for sym in symbols: + FIRST[sym]={sym} if sym.is_term else set() + FOLLOW[sym]=set() + + # Calculate NULLABLE and FIRST + changed = True + while changed: + changed = False + + for rule in rules: + if set(rule.expansion) <= NULLABLE: + if update_set(NULLABLE, {rule.origin}): + changed = True + + for i, sym in enumerate(rule.expansion): + if set(rule.expansion[:i]) <= NULLABLE: + if update_set(FIRST[rule.origin], FIRST[sym]): + changed = True + else: + break + + # Calculate FOLLOW + changed = True + while changed: + changed = False + + for rule in rules: + for i, sym in enumerate(rule.expansion): + if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE: + if update_set(FOLLOW[sym], FOLLOW[rule.origin]): + changed = True + + for j in range(i+1, len(rule.expansion)): + if set(rule.expansion[i+1:j]) <= NULLABLE: + if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]): + changed = True + + return FIRST, FOLLOW, NULLABLE + + +class GrammarAnalyzer(object): + def __init__(self, parser_conf, debug=False): + self.debug = debug + + root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')]) + for start in parser_conf.start} + + rules = parser_conf.rules + list(root_rules.values()) + self.rules_by_origin = classify(rules, lambda r: r.origin) + + if len(rules) != len(set(rules)): + duplicates = [item for 
item, count in Counter(rules).items() if count > 1] + raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates)) + + for r in rules: + for sym in r.expansion: + if not (sym.is_term or sym in self.rules_by_origin): + raise GrammarError("Using an undefined rule: %s" % sym) + + self.start_states = {start: self.expand_rule(root_rule.origin) + for start, root_rule in root_rules.items()} + + self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))}) + for start, root_rule in root_rules.items()} + + lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)]) + for start in parser_conf.start} + + lr0_rules = parser_conf.rules + list(lr0_root_rules.values()) + assert(len(lr0_rules) == len(set(lr0_rules))) + + self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin) + + # cache RulePtr(r, 0) in r (no duplicate RulePtr objects) + self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin)) + for start, root_rule in lr0_root_rules.items()} + + self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules) + + def expand_rule(self, source_rule, rules_by_origin=None): + "Returns all init_ptrs accessible by rule (recursive)" + + if rules_by_origin is None: + rules_by_origin = self.rules_by_origin + + init_ptrs = set() + def _expand_rule(rule): + assert not rule.is_term, rule + + for r in rules_by_origin[rule]: + init_ptr = RulePtr(r, 0) + init_ptrs.add(init_ptr) + + if r.expansion: # if not empty rule + new_r = init_ptr.next + if not new_r.is_term: + yield new_r + + for _ in bfs([source_rule], _expand_rule): + pass + + return fzset(init_ptrs) diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/lalr_analysis.py b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_analysis.py new file mode 100644 index 00000000..f6a993b9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_analysis.py @@ -0,0 +1,304 @@ +"""This module builds a LALR(1) transition-table for lalr_parser.py + +For now, shift/reduce conflicts are automatically resolved as shifts. 
+""" + +# Author: Erez Shinan (2017) +# Email : erezshin@gmail.com + +from collections import defaultdict + +from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger +from ..exceptions import GrammarError + +from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet +from ..grammar import Rule + +###{standalone + +class Action: + def __init__(self, name): + self.name = name + def __str__(self): + return self.name + def __repr__(self): + return str(self) + +Shift = Action('Shift') +Reduce = Action('Reduce') + + +class ParseTable: + def __init__(self, states, start_states, end_states): + self.states = states + self.start_states = start_states + self.end_states = end_states + + def serialize(self, memo): + tokens = Enumerator() + rules = Enumerator() + + states = { + state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg)) + for token, (action, arg) in actions.items()} + for state, actions in self.states.items() + } + + return { + 'tokens': tokens.reversed(), + 'states': states, + 'start_states': self.start_states, + 'end_states': self.end_states, + } + + @classmethod + def deserialize(cls, data, memo): + tokens = data['tokens'] + states = { + state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg)) + for token, (action, arg) in actions.items()} + for state, actions in data['states'].items() + } + return cls(states, data['start_states'], data['end_states']) + + +class IntParseTable(ParseTable): + + @classmethod + def from_ParseTable(cls, parse_table): + enum = list(parse_table.states) + state_to_idx = {s:i for i,s in enumerate(enum)} + int_states = {} + + for s, la in parse_table.states.items(): + la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v + for k,v in la.items()} + int_states[ state_to_idx[s] ] = la + + + start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()} + end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()} + return cls(int_states, start_states, end_states) + +###} + + +# digraph and traverse, see The Theory and Practice of Compiler Writing + +# computes F(x) = G(x) union (union { G(y) | x R y }) +# X: nodes +# R: relation (function mapping node -> list of nodes that satisfy the relation) +# G: set valued function +def digraph(X, R, G): + F = {} + S = [] + N = {} + for x in X: + N[x] = 0 + for x in X: + # this is always true for the first iteration, but N[x] may be updated in traverse below + if N[x] == 0: + traverse(x, S, N, X, R, G, F) + return F + +# x: single node +# S: stack +# N: weights +# X: nodes +# R: relation (see above) +# G: set valued function +# F: set valued function we are computing (map of input -> output) +def traverse(x, S, N, X, R, G, F): + S.append(x) + d = len(S) + N[x] = d + F[x] = G[x] + for y in R[x]: + if N[y] == 0: + traverse(y, S, N, X, R, G, F) + n_x = N[x] + assert(n_x > 0) + n_y = N[y] + assert(n_y != 0) + if (n_y > 0) and (n_y < n_x): + N[x] = n_y + F[x].update(F[y]) + if N[x] == d: + f_x = F[x] + while True: + z = S.pop() + N[z] = -1 + F[z] = f_x + if z == x: + break + + +class LALR_Analyzer(GrammarAnalyzer): + def __init__(self, parser_conf, debug=False): + GrammarAnalyzer.__init__(self, parser_conf, debug) + self.nonterminal_transitions = [] + self.directly_reads = defaultdict(set) + self.reads = defaultdict(set) + self.includes = defaultdict(set) + self.lookback = defaultdict(set) + + + def compute_lr0_states(self): + self.lr0_states = set() + # map of kernels to LR0ItemSets + 
cache = {} + + def step(state): + _, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied) + + d = classify(unsat, lambda rp: rp.next) + for sym, rps in d.items(): + kernel = fzset({rp.advance(sym) for rp in rps}) + new_state = cache.get(kernel, None) + if new_state is None: + closure = set(kernel) + for rp in kernel: + if not rp.is_satisfied and not rp.next.is_term: + closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin) + new_state = LR0ItemSet(kernel, closure) + cache[kernel] = new_state + + state.transitions[sym] = new_state + yield new_state + + self.lr0_states.add(state) + + for _ in bfs(self.lr0_start_states.values(), step): + pass + + def compute_reads_relations(self): + # handle start state + for root in self.lr0_start_states.values(): + assert(len(root.kernel) == 1) + for rp in root.kernel: + assert(rp.index == 0) + self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ]) + + for state in self.lr0_states: + seen = set() + for rp in state.closure: + if rp.is_satisfied: + continue + s = rp.next + # if s is a not a nonterminal + if s not in self.lr0_rules_by_origin: + continue + if s in seen: + continue + seen.add(s) + nt = (state, s) + self.nonterminal_transitions.append(nt) + dr = self.directly_reads[nt] + r = self.reads[nt] + next_state = state.transitions[s] + for rp2 in next_state.closure: + if rp2.is_satisfied: + continue + s2 = rp2.next + # if s2 is a terminal + if s2 not in self.lr0_rules_by_origin: + dr.add(s2) + if s2 in self.NULLABLE: + r.add((next_state, s2)) + + def compute_includes_lookback(self): + for nt in self.nonterminal_transitions: + state, nonterminal = nt + includes = [] + lookback = self.lookback[nt] + for rp in state.closure: + if rp.rule.origin != nonterminal: + continue + # traverse the states for rp(.rule) + state2 = state + for i in range(rp.index, len(rp.rule.expansion)): + s = rp.rule.expansion[i] + nt2 = (state2, s) + state2 = state2.transitions[s] + if nt2 not in self.reads: + continue + for j in range(i + 1, len(rp.rule.expansion)): + if not rp.rule.expansion[j] in self.NULLABLE: + break + else: + includes.append(nt2) + # state2 is at the final state for rp.rule + if rp.index == 0: + for rp2 in state2.closure: + if (rp2.rule == rp.rule) and rp2.is_satisfied: + lookback.add((state2, rp2.rule)) + for nt2 in includes: + self.includes[nt2].add(nt) + + def compute_lookaheads(self): + read_sets = digraph(self.nonterminal_transitions, self.reads, self.directly_reads) + follow_sets = digraph(self.nonterminal_transitions, self.includes, read_sets) + + for nt, lookbacks in self.lookback.items(): + for state, rule in lookbacks: + for s in follow_sets[nt]: + state.lookaheads[s].add(rule) + + def compute_lalr1_states(self): + m = {} + reduce_reduce = [] + for state in self.lr0_states: + actions = {} + for la, next_state in state.transitions.items(): + actions[la] = (Shift, next_state.closure) + for la, rules in state.lookaheads.items(): + if len(rules) > 1: + # Try to resolve conflict based on priority + p = [(r.options.priority or 0, r) for r in rules] + p.sort(key=lambda r: r[0], reverse=True) + best, second_best = p[:2] + if best[0] > second_best[0]: + rules = [best[1]] + else: + reduce_reduce.append((state, la, rules)) + if la in actions: + if self.debug: + logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) + logger.warning(' * %s', list(rules)[0]) + else: + actions[la] = (Reduce, list(rules)[0]) + m[state] = { k.name: v for k, v in actions.items() } + + if reduce_reduce: + msgs = [] + for 
state, la, rules in reduce_reduce: + msg = 'Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t- ' + str(r) for r in rules ])) + if self.debug: + msg += '\n collision occurred in state: {%s\n }' % ''.join(['\n\t' + str(x) for x in state.closure]) + msgs.append(msg) + raise GrammarError('\n\n'.join(msgs)) + + states = { k.closure: v for k, v in m.items() } + + # compute end states + end_states = {} + for state in states: + for rp in state: + for start in self.lr0_start_states: + if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied: + assert(not start in end_states) + end_states[start] = state + + _parse_table = ParseTable(states, { start: state.closure for start, state in self.lr0_start_states.items() }, end_states) + + if self.debug: + self.parse_table = _parse_table + else: + self.parse_table = IntParseTable.from_ParseTable(_parse_table) + + def compute_lalr(self): + self.compute_lr0_states() + self.compute_reads_relations() + self.compute_includes_lookback() + self.compute_lookaheads() + self.compute_lalr1_states() diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/lalr_interactive_parser.py b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_interactive_parser.py new file mode 100644 index 00000000..d6780cba --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_interactive_parser.py @@ -0,0 +1,132 @@ +# This module provides a LALR interactive parser, which is used for debugging and error handling + +from copy import copy + +from .. import Token +from ..exceptions import UnexpectedToken + + +class InteractiveParser(object): + """InteractiveParser gives you advanced control over parsing and error handling when parsing with LALR. + + For a simpler interface, see the ``on_error`` argument to ``Lark.parse()``. + """ + def __init__(self, parser, parser_state, lexer_state): + self.parser = parser + self.parser_state = parser_state + self.lexer_state = lexer_state + + def feed_token(self, token): + """Feed the parser with a token, and advance it to the next state, as if it received it from the lexer. + + Note that ``token`` has to be an instance of ``Token``. + """ + return self.parser_state.feed_token(token, token.type == '$END') + + def exhaust_lexer(self): + """Try to feed the rest of the lexer state into the interactive parser. + + Note that this modifies the instance in place and does not feed an '$END' Token""" + for token in self.lexer_state.lex(self.parser_state): + self.parser_state.feed_token(token) + + def feed_eof(self, last_token=None): + """Feed a '$END' Token. Borrows from 'last_token' if given.""" + eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else Token('$END', '', 0, 1, 1) + return self.feed_token(eof) + + + def __copy__(self): + """Create a new interactive parser with a separate state. + + Calls to feed_token() won't affect the old instance, and vice-versa. 
+ """ + return type(self)( + self.parser, + copy(self.parser_state), + copy(self.lexer_state), + ) + + def copy(self): + return copy(self) + + def __eq__(self, other): + if not isinstance(other, InteractiveParser): + return False + + return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state + + def as_immutable(self): + """Convert to an ``ImmutableInteractiveParser``.""" + p = copy(self) + return ImmutableInteractiveParser(p.parser, p.parser_state, p.lexer_state) + + def pretty(self): + """Print the output of ``choices()`` in a way that's easier to read.""" + out = ["Parser choices:"] + for k, v in self.choices().items(): + out.append('\t- %s -> %r' % (k, v)) + out.append('stack size: %s' % len(self.parser_state.state_stack)) + return '\n'.join(out) + + def choices(self): + """Returns a dictionary of token types, matched to their action in the parser. + + Only returns token types that are accepted by the current state. + + Updated by ``feed_token()``. + """ + return self.parser_state.parse_conf.parse_table.states[self.parser_state.position] + + def accepts(self): + """Returns the set of possible tokens that will advance the parser into a new valid state.""" + accepts = set() + for t in self.choices(): + if t.isupper(): # is terminal? + new_cursor = copy(self) + try: + new_cursor.feed_token(Token(t, '')) + except UnexpectedToken: + pass + else: + accepts.add(t) + return accepts + + def resume_parse(self): + """Resume automated parsing from the current state.""" + return self.parser.parse_from_state(self.parser_state) + + + +class ImmutableInteractiveParser(InteractiveParser): + """Same as ``InteractiveParser``, but operations create a new instance instead + of changing it in-place. + """ + + result = None + + def __hash__(self): + return hash((self.parser_state, self.lexer_state)) + + def feed_token(self, token): + c = copy(self) + c.result = InteractiveParser.feed_token(c, token) + return c + + def exhaust_lexer(self): + """Try to feed the rest of the lexer state into the parser. 
+ + Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token""" + cursor = self.as_mutable() + cursor.exhaust_lexer() + return cursor.as_immutable() + + def as_mutable(self): + """Convert to an ``InteractiveParser``.""" + p = copy(self) + return InteractiveParser(p.parser, p.parser_state, p.lexer_state) + + +# Deprecated class names for the interactive parser +ParserPuppet = InteractiveParser +ImmutableParserPuppet = ImmutableInteractiveParser diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py new file mode 100644 index 00000000..d916b463 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py @@ -0,0 +1,200 @@ +"""This module implements a LALR(1) Parser +""" +# Author: Erez Shinan (2017) +# Email : erezshin@gmail.com +from copy import deepcopy, copy +from ..exceptions import UnexpectedInput, UnexpectedToken +from ..lexer import Token +from ..utils import Serialize + +from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable +from .lalr_interactive_parser import InteractiveParser +from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken + +###{standalone + +class LALR_Parser(Serialize): + def __init__(self, parser_conf, debug=False): + analysis = LALR_Analyzer(parser_conf, debug=debug) + analysis.compute_lalr() + callbacks = parser_conf.callbacks + + self._parse_table = analysis.parse_table + self.parser_conf = parser_conf + self.parser = _Parser(analysis.parse_table, callbacks, debug) + + @classmethod + def deserialize(cls, data, memo, callbacks, debug=False): + inst = cls.__new__(cls) + inst._parse_table = IntParseTable.deserialize(data, memo) + inst.parser = _Parser(inst._parse_table, callbacks, debug) + return inst + + def serialize(self, memo): + return self._parse_table.serialize(memo) + + def parse_interactive(self, lexer, start): + return self.parser.parse(lexer, start, start_interactive=True) + + def parse(self, lexer, start, on_error=None): + try: + return self.parser.parse(lexer, start) + except UnexpectedInput as e: + if on_error is None: + raise + + while True: + if isinstance(e, UnexpectedCharacters): + s = e.interactive_parser.lexer_state.state + p = s.line_ctr.char_pos + + if not on_error(e): + raise e + + if isinstance(e, UnexpectedCharacters): + # If user didn't change the character position, then we should + if p == s.line_ctr.char_pos: + s.line_ctr.feed(s.text[p:p+1]) + + try: + return e.interactive_parser.resume_parse() + except UnexpectedToken as e2: + if (isinstance(e, UnexpectedToken) + and e.token.type == e2.token.type == '$END' + and e.interactive_parser == e2.interactive_parser): + # Prevent infinite loop + raise e2 + e = e2 + except UnexpectedCharacters as e2: + e = e2 + + +class ParseConf(object): + __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states' + + def __init__(self, parse_table, callbacks, start): + self.parse_table = parse_table + + self.start_state = self.parse_table.start_states[start] + self.end_state = self.parse_table.end_states[start] + self.states = self.parse_table.states + + self.callbacks = callbacks + self.start = start + + +class ParserState(object): + __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack' + + def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None): + self.parse_conf = parse_conf + self.lexer = lexer + self.state_stack = state_stack or [self.parse_conf.start_state] + 
self.value_stack = value_stack or [] + + @property + def position(self): + return self.state_stack[-1] + + # Necessary for match_examples() to work + def __eq__(self, other): + if not isinstance(other, ParserState): + return NotImplemented + return len(self.state_stack) == len(other.state_stack) and self.position == other.position + + def __copy__(self): + return type(self)( + self.parse_conf, + self.lexer, # XXX copy + copy(self.state_stack), + deepcopy(self.value_stack), + ) + + def copy(self): + return copy(self) + + def feed_token(self, token, is_end=False): + state_stack = self.state_stack + value_stack = self.value_stack + states = self.parse_conf.states + end_state = self.parse_conf.end_state + callbacks = self.parse_conf.callbacks + + while True: + state = state_stack[-1] + try: + action, arg = states[state][token.type] + except KeyError: + expected = {s for s in states[state].keys() if s.isupper()} + raise UnexpectedToken(token, expected, state=self, interactive_parser=None) + + assert arg != end_state + + if action is Shift: + # shift once and return + assert not is_end + state_stack.append(arg) + value_stack.append(token if token.type not in callbacks else callbacks[token.type](token)) + return + else: + # reduce+shift as many times as necessary + rule = arg + size = len(rule.expansion) + if size: + s = value_stack[-size:] + del state_stack[-size:] + del value_stack[-size:] + else: + s = [] + + value = callbacks[rule](s) + + _action, new_state = states[state_stack[-1]][rule.origin.name] + assert _action is Shift + state_stack.append(new_state) + value_stack.append(value) + + if is_end and state_stack[-1] == end_state: + return value_stack[-1] + +class _Parser(object): + def __init__(self, parse_table, callbacks, debug=False): + self.parse_table = parse_table + self.callbacks = callbacks + self.debug = debug + + def parse(self, lexer, start, value_stack=None, state_stack=None, start_interactive=False): + parse_conf = ParseConf(self.parse_table, self.callbacks, start) + parser_state = ParserState(parse_conf, lexer, state_stack, value_stack) + if start_interactive: + return InteractiveParser(self, parser_state, parser_state.lexer) + return self.parse_from_state(parser_state) + + + def parse_from_state(self, state): + # Main LALR-parser loop + try: + token = None + for token in state.lexer.lex(state): + state.feed_token(token) + + end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) + return state.feed_token(end_token, True) + except UnexpectedInput as e: + try: + e.interactive_parser = InteractiveParser(self, state, state.lexer) + except NameError: + pass + raise e + except Exception as e: + if self.debug: + print("") + print("STATE STACK DUMP") + print("----------------") + for i, s in enumerate(state.state_stack): + print('%d)' % i , s) + print("") + + raise +###} + diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/lalr_puppet.py b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_puppet.py new file mode 100644 index 00000000..6ea6d894 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_puppet.py @@ -0,0 +1,3 @@ +# Deprecated
+
+from .lalr_interactive_parser import ParserPuppet, ImmutableParserPuppet
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/xearley.py b/.venv/lib/python3.12/site-packages/lark/parsers/xearley.py new file mode 100644 index 00000000..335d9ad4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/xearley.py @@ -0,0 +1,157 @@ +"""This module implements an experimental Earley parser with a dynamic lexer + +The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: + https://www.sciencedirect.com/science/article/pii/S1571066108001497 + +That is probably the best reference for understanding the algorithm here. + +The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format +is better documented here: + http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ + +Instead of running a lexer beforehand, or using a costy char-by-char method, this parser +uses regular expressions by necessity, achieving high-performance while maintaining all of +Earley's power in parsing any CFG. +""" + +from collections import defaultdict + +from ..tree import Tree +from ..exceptions import UnexpectedCharacters +from ..lexer import Token +from ..grammar import Terminal +from .earley import Parser as BaseParser +from .earley_forest import SymbolNode + + +class Parser(BaseParser): + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False, debug=False, tree_class=Tree): + BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity, debug, tree_class) + self.ignore = [Terminal(t) for t in ignore] + self.complete_lex = complete_lex + + def _parse(self, stream, columns, to_scan, start_symbol=None): + + def scan(i, to_scan): + """The core Earley Scanner. + + This is a custom implementation of the scanner that uses the + Lark lexer to match tokens. The scan list is built by the + Earley predictor, based on the previously completed tokens. + This ensures that at each phase of the parse we have a custom + lexer context, allowing for more complex ambiguities.""" + + node_cache = {} + + # 1) Loop the expectations and ask the lexer to match. + # Since regexp is forward looking on the input stream, and we only + # want to process tokens when we hit the point in the stream at which + # they complete, we push all tokens into a buffer (delayed_matches), to + # be held possibly for a later parse step when we reach the point in the + # input stream at which they complete. + for item in set(to_scan): + m = match(item.expect, stream, i) + if m: + t = Token(item.expect.name, m.group(0), i, text_line, text_column) + delayed_matches[m.end()].append( (item, i, t) ) + + if self.complete_lex: + s = m.group(0) + for j in range(1, len(s)): + m = match(item.expect, s[:-j]) + if m: + t = Token(item.expect.name, m.group(0), i, text_line, text_column) + delayed_matches[i+m.end()].append( (item, i, t) ) + + # XXX The following 3 lines were commented out for causing a bug. See issue #768 + # # Remove any items that successfully matched in this pass from the to_scan buffer. + # # This ensures we don't carry over tokens that already matched, if we're ignoring below. + # to_scan.remove(item) + + # 3) Process any ignores. This is typically used for e.g. whitespace. + # We carry over any unmatched items from the to_scan buffer to be matched again after + # the ignore. This should allow us to use ignored symbols in non-terminals to implement + # e.g. mandatory spacing. 
+ for x in self.ignore: + m = match(x, stream, i) + if m: + # Carry over any items still in the scan buffer, to past the end of the ignored items. + delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ]) + + # If we're ignoring up to the end of the file, # carry over the start symbol if it already completed. + delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol]) + + next_to_scan = set() + next_set = set() + columns.append(next_set) + transitives.append({}) + + ## 4) Process Tokens from delayed_matches. + # This is the core of the Earley scanner. Create an SPPF node for each Token, + # and create the symbol node in the SPPF tree. Advance the item that completed, + # and add the resulting new item to either the Earley set (for processing by the + # completer/predictor) or the to_scan buffer for the next parse step. + for item, start, token in delayed_matches[i+1]: + if token is not None: + token.end_line = text_line + token.end_column = text_column + 1 + token.end_pos = i + 1 + + new_item = item.advance() + label = (new_item.s, new_item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) + else: + new_item = item + + if new_item.expect in self.TERMINALS: + # add (B ::= Aai+1.B, h, y) to Q' + next_to_scan.add(new_item) + else: + # add (B ::= Aa+1.B, h, y) to Ei+1 + next_set.add(new_item) + + del delayed_matches[i+1] # No longer needed, so unburden memory + + if not next_set and not delayed_matches and not next_to_scan: + considered_rules = list(sorted(to_scan, key=lambda key: key.rule.origin.name)) + raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, + set(to_scan), state=frozenset(i.s for i in to_scan), + considered_rules=considered_rules + ) + + return next_to_scan + + + delayed_matches = defaultdict(list) + match = self.term_matcher + + # Cache for nodes & tokens created in a particular parse step. + transitives = [{}] + + text_line = 1 + text_column = 1 + + ## The main Earley loop. + # Run the Prediction/Completion cycle for any Items in the current Earley set. + # Completions will be added to the SPPF tree, and predictions will be recursively + # processed down to terminals/empty nodes to be added to the scanner for the next + # step. + i = 0 + for token in stream: + self.predict_and_complete(i, to_scan, columns, transitives) + + to_scan = scan(i, to_scan) + + if token == '\n': + text_line += 1 + text_column = 1 + else: + text_column += 1 + i += 1 + + self.predict_and_complete(i, to_scan, columns, transitives) + + ## Column is now the final column in the parse. + assert i == len(columns)-1 + return to_scan |
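
Editor's note (not part of the diff above): the modules added in this directory are normally exercised through lark's public API rather than imported directly. Below is a minimal, illustrative sketch of how the LALR(1) machinery from lalr_analysis.py, lalr_parser.py and lalr_interactive_parser.py is typically driven. The grammar, the input strings and the example accepts() output are invented for illustration and are not taken from the diff; only the method names (parse, parse_interactive, exhaust_lexer, accepts, feed_token, feed_eof) come from the code shown above.

from lark import Lark, Token

grammar = r"""
    start: NAME "=" NUMBER
    NAME: /[a-z]+/
    NUMBER: /[0-9]+/
    %import common.WS
    %ignore WS
"""

# parser='lalr' builds the transition table via LALR_Analyzer.compute_lalr()
parser = Lark(grammar, parser='lalr')

# A full parse runs the main loop in _Parser.parse_from_state() (lalr_parser.py)
print(parser.parse("answer = 42").pretty())

# The interactive parser (lalr_interactive_parser.py) steps the same state machine by hand
ip = parser.parse_interactive("answer = ")
ip.exhaust_lexer()                      # feed every token the lexer can still produce
print(ip.accepts())                     # terminals valid in the current state, e.g. {'NUMBER'}
ip.feed_token(Token('NUMBER', '42'))    # advance the parser manually with a constructed token
print(ip.feed_eof())                    # feed '$END' and get back the finished parse tree

This is the pattern the on_error recovery path in LALR_Parser.parse relies on as well: on an UnexpectedToken/UnexpectedCharacters error, the exception carries an interactive_parser whose state can be inspected or advanced before calling resume_parse().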