From 4a52a71956a8d46fcb7294ac71734504bb09bcc2 Mon Sep 17 00:00:00 2001
From: S. Solomon Darnell
Date: Fri, 28 Mar 2025 21:52:21 -0500
Subject: two version of R2R are here

---
 .../site-packages/lark/parsers/xearley.py | 157 +++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 .venv/lib/python3.12/site-packages/lark/parsers/xearley.py

(limited to '.venv/lib/python3.12/site-packages/lark/parsers/xearley.py')

diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/xearley.py b/.venv/lib/python3.12/site-packages/lark/parsers/xearley.py
new file mode 100644
index 00000000..335d9ad4
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/lark/parsers/xearley.py
@@ -0,0 +1,157 @@
+"""This module implements an experimental Earley parser with a dynamic lexer
+
+The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
+    https://www.sciencedirect.com/science/article/pii/S1571066108001497
+
+That is probably the best reference for understanding the algorithm here.
+
+The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
+is better documented here:
+    http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
+
+Instead of running a lexer beforehand, or using a costly char-by-char method, this parser
+uses regular expressions by necessity, achieving high-performance while maintaining all of
+Earley's power in parsing any CFG.
+"""
+
+from collections import defaultdict
+
+from ..tree import Tree
+from ..exceptions import UnexpectedCharacters
+from ..lexer import Token
+from ..grammar import Terminal
+from .earley import Parser as BaseParser
+from .earley_forest import SymbolNode
+
+
+class Parser(BaseParser):
+    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False, debug=False, tree_class=Tree):
+        BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity, debug, tree_class)
+        self.ignore = [Terminal(t) for t in ignore]
+        self.complete_lex = complete_lex
+
+    def _parse(self, stream, columns, to_scan, start_symbol=None):
+
+        def scan(i, to_scan):
+            """The core Earley Scanner.
+
+            This is a custom implementation of the scanner that uses the
+            Lark lexer to match tokens. The scan list is built by the
+            Earley predictor, based on the previously completed tokens.
+            This ensures that at each phase of the parse we have a custom
+            lexer context, allowing for more complex ambiguities."""
+
+            node_cache = {}
+
+            # 1) Loop the expectations and ask the lexer to match.
+            # Since regexp is forward looking on the input stream, and we only
+            # want to process tokens when we hit the point in the stream at which
+            # they complete, we push all tokens into a buffer (delayed_matches), to
+            # be held possibly for a later parse step when we reach the point in the
+            # input stream at which they complete.
+            for item in set(to_scan):
+                m = match(item.expect, stream, i)
+                if m:
+                    t = Token(item.expect.name, m.group(0), i, text_line, text_column)
+                    delayed_matches[m.end()].append( (item, i, t) )
+
+                    if self.complete_lex:
+                        s = m.group(0)
+                        for j in range(1, len(s)):
+                            m = match(item.expect, s[:-j])
+                            if m:
+                                t = Token(item.expect.name, m.group(0), i, text_line, text_column)
+                                delayed_matches[i+m.end()].append( (item, i, t) )
+
+                    # XXX The following 3 lines were commented out for causing a bug. See issue #768
+                    # # Remove any items that successfully matched in this pass from the to_scan buffer.
+ # # This ensures we don't carry over tokens that already matched, if we're ignoring below. + # to_scan.remove(item) + + # 3) Process any ignores. This is typically used for e.g. whitespace. + # We carry over any unmatched items from the to_scan buffer to be matched again after + # the ignore. This should allow us to use ignored symbols in non-terminals to implement + # e.g. mandatory spacing. + for x in self.ignore: + m = match(x, stream, i) + if m: + # Carry over any items still in the scan buffer, to past the end of the ignored items. + delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ]) + + # If we're ignoring up to the end of the file, # carry over the start symbol if it already completed. + delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol]) + + next_to_scan = set() + next_set = set() + columns.append(next_set) + transitives.append({}) + + ## 4) Process Tokens from delayed_matches. + # This is the core of the Earley scanner. Create an SPPF node for each Token, + # and create the symbol node in the SPPF tree. Advance the item that completed, + # and add the resulting new item to either the Earley set (for processing by the + # completer/predictor) or the to_scan buffer for the next parse step. + for item, start, token in delayed_matches[i+1]: + if token is not None: + token.end_line = text_line + token.end_column = text_column + 1 + token.end_pos = i + 1 + + new_item = item.advance() + label = (new_item.s, new_item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) + else: + new_item = item + + if new_item.expect in self.TERMINALS: + # add (B ::= Aai+1.B, h, y) to Q' + next_to_scan.add(new_item) + else: + # add (B ::= Aa+1.B, h, y) to Ei+1 + next_set.add(new_item) + + del delayed_matches[i+1] # No longer needed, so unburden memory + + if not next_set and not delayed_matches and not next_to_scan: + considered_rules = list(sorted(to_scan, key=lambda key: key.rule.origin.name)) + raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, + set(to_scan), state=frozenset(i.s for i in to_scan), + considered_rules=considered_rules + ) + + return next_to_scan + + + delayed_matches = defaultdict(list) + match = self.term_matcher + + # Cache for nodes & tokens created in a particular parse step. + transitives = [{}] + + text_line = 1 + text_column = 1 + + ## The main Earley loop. + # Run the Prediction/Completion cycle for any Items in the current Earley set. + # Completions will be added to the SPPF tree, and predictions will be recursively + # processed down to terminals/empty nodes to be added to the scanner for the next + # step. + i = 0 + for token in stream: + self.predict_and_complete(i, to_scan, columns, transitives) + + to_scan = scan(i, to_scan) + + if token == '\n': + text_line += 1 + text_column = 1 + else: + text_column += 1 + i += 1 + + self.predict_and_complete(i, to_scan, columns, transitives) + + ## Column is now the final column in the parse. + assert i == len(columns)-1 + return to_scan -- cgit v1.2.3
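In Lark's public API, this module is normally reached by combining the Earley parser with the dynamic lexer: lexer='dynamic' uses the greedy regex scan shown in scan(), while lexer='dynamic_complete' additionally enables the complete_lex prefix matching. The sketch below is a minimal, hedged illustration of the difference, assuming only an installed lark package; the grammar and input are invented for the example.

from lark import Lark

# A matches greedily, so under plain dynamic lexing it swallows "aa" and
# leaves only "b" for AB; dynamic_complete also registers the shorter
# prefix A="a", which lets AB="ab" complete the parse.
grammar = r"""
    start: A AB
    A: /a+/
    AB: /ab/
"""

text = "aab"

greedy = Lark(grammar, parser='earley', lexer='dynamic')
try:
    greedy.parse(text)
except Exception as err:
    print("dynamic:", type(err).__name__)          # the greedy match dead-ends

complete = Lark(grammar, parser='earley', lexer='dynamic_complete')
print("dynamic_complete:", complete.parse(text))   # parses as A='a', AB='ab'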
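The comment block in scan() describes the key trick: a regex match made at position i is not consumed immediately, but parked in delayed_matches under the index where it ends, and only acted on when the main loop reaches that index. The standalone sketch below illustrates just that buffering idea outside of any Earley machinery; the terminal table, the tokenize() helper and the sample input are invented for illustration and are not part of Lark.

import re
from collections import defaultdict

# Hypothetical terminal set, for illustration only.
TERMINALS = {"NUMBER": re.compile(r"\d+"), "PLUS": re.compile(r"\+")}

def tokenize(stream):
    # end index -> list of (terminal name, lexeme), mirroring delayed_matches[m.end()]
    delayed = defaultdict(list)
    tokens = []
    scan_here = True              # position 0 may start a token
    for i in range(len(stream) + 1):
        # Only act on buffered matches once the loop reaches the index where they end.
        completed = delayed.pop(i, [])
        if completed:
            tokens.extend(completed)
            scan_here = True      # a token just ended, so a new one may start here
        if scan_here and i < len(stream):
            for name, pattern in TERMINALS.items():
                m = pattern.match(stream, i)
                if m:
                    delayed[m.end()].append((name, m.group(0)))
            scan_here = False
    return tokens

print(tokenize("12+345"))         # [('NUMBER', '12'), ('PLUS', '+'), ('NUMBER', '345')]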
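The SPPF mentioned in the module docstring is what lets every valid derivation be kept at once. Through the public API this surfaces when ambiguity='explicit' is requested: rather than resolving to a single tree, Lark returns _ambig nodes holding the alternatives. A small hedged example follows, with an invented grammar, under the assumption that the input is genuinely ambiguous once prefix lexing is enabled.

from lark import Lark

grammar = r"""
    start: word+
    word: LETTERS
    LETTERS: /[a-z]+/
"""

# With complete prefix lexing, "ab" can be read as one word or as two, and
# ambiguity='explicit' keeps both derivations (drawn from the SPPF) under an
# _ambig node instead of picking one.
parser = Lark(grammar, parser='earley', lexer='dynamic_complete', ambiguity='explicit')
print(parser.parse("ab").pretty())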