| author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
|---|---|---|
| committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
| commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
| tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed | /.venv/lib/python3.12/site-packages/lark |
| parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
Diffstat (limited to '.venv/lib/python3.12/site-packages/lark')
37 files changed, 8262 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/lark/__init__.py b/.venv/lib/python3.12/site-packages/lark/__init__.py new file mode 100644 index 00000000..909d4106 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/__init__.py @@ -0,0 +1,10 @@ +from .utils import logger +from .tree import Tree +from .visitors import Transformer, Visitor, v_args, Discard, Transformer_NonRecursive +from .visitors import InlineTransformer, inline_args # XXX Deprecated +from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, + UnexpectedInput, UnexpectedCharacters, UnexpectedEOF, LarkError) +from .lexer import Token +from .lark import Lark + +__version__ = "0.12.0" diff --git a/.venv/lib/python3.12/site-packages/lark/__pyinstaller/__init__.py b/.venv/lib/python3.12/site-packages/lark/__pyinstaller/__init__.py new file mode 100644 index 00000000..fa02fc92 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/__pyinstaller/__init__.py @@ -0,0 +1,6 @@ +# For usage of lark with PyInstaller. See https://pyinstaller-sample-hook.readthedocs.io/en/latest/index.html + +import os + +def get_hook_dirs(): + return [os.path.dirname(__file__)]
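The `__init__.py` above defines the public surface of the vendored lark 0.12.0 package (Lark, Tree, Token, Transformer, Visitor, v_args, and the exception hierarchy). As a quick orientation, here is a minimal sketch (not part of this commit) of how those exports are typically combined; the grammar and `SumTransformer` are illustrative only:

```python
# Minimal sketch of the lark 0.12 public API shown in __init__.py above.
# The grammar and SumTransformer are illustrative, not part of this diff.
from lark import Lark, Transformer, v_args

grammar = r"""
    start: NUMBER ("+" NUMBER)*
    %import common.NUMBER
    %import common.WS
    %ignore WS
"""

@v_args(inline=True)                # pass rule children as positional arguments
class SumTransformer(Transformer):
    def start(self, *numbers):      # called once per matched `start` rule
        return sum(float(n) for n in numbers)

# An embedded transformer is only allowed with the LALR parser (see lark.py below).
parser = Lark(grammar, parser="lalr", transformer=SumTransformer())
print(parser.parse("1 + 2 + 3.5"))  # -> 6.5
```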
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/lark/__pyinstaller/hook-lark.py b/.venv/lib/python3.12/site-packages/lark/__pyinstaller/hook-lark.py new file mode 100644 index 00000000..cf3d8e3d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/__pyinstaller/hook-lark.py @@ -0,0 +1,14 @@ +#----------------------------------------------------------------------------- +# Copyright (c) 2017-2020, PyInstaller Development Team. +# +# Distributed under the terms of the GNU General Public License (version 2 +# or later) with exception for distributing the bootloader. +# +# The full license is in the file COPYING.txt, distributed with this software. +# +# SPDX-License-Identifier: (GPL-2.0-or-later WITH Bootloader-exception) +#----------------------------------------------------------------------------- + +from PyInstaller.utils.hooks import collect_data_files + +datas = collect_data_files('lark') diff --git a/.venv/lib/python3.12/site-packages/lark/ast_utils.py b/.venv/lib/python3.12/site-packages/lark/ast_utils.py new file mode 100644 index 00000000..0c03d458 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/ast_utils.py @@ -0,0 +1,55 @@ +""" + Module of utilities for transforming a lark.Tree into a custom Abstract Syntax Tree +""" + +import inspect, re + +from lark import Transformer, v_args + +class Ast(object): + """Abstract class + + Subclasses will be collected by `create_transformer()` + """ + pass + +class AsList(object): + """Abstract class + + Subclasses will be instanciated with the parse results as a single list, instead of as arguments. + """ + +class WithMeta(object): + """Abstract class + + Subclasses will be instanciated with the Meta instance of the tree. (see ``v_args`` for more detail) + """ + pass + +def camel_to_snake(name): + return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower() + +def create_transformer(ast_module, transformer=None, decorator_factory=v_args): + """Collects `Ast` subclasses from the given module, and creates a Lark transformer that builds the AST. + + For each class, we create a corresponding rule in the transformer, with a matching name. + CamelCase names will be converted into snake_case. Example: "CodeBlock" -> "code_block". + + Classes starting with an underscore (`_`) will be skipped. + + Parameters: + ast_module: A Python module containing all the subclasses of ``ast_utils.Ast`` + transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten. + decorator_factory (Callable): An optional callable accepting two booleans, inline, and meta, + and returning a decorator for the methods of ``transformer``. (default: ``v_args``). + """ + t = transformer or Transformer() + + for name, obj in inspect.getmembers(ast_module): + if not name.startswith('_') and inspect.isclass(obj): + if issubclass(obj, Ast): + wrapper = decorator_factory(inline=not issubclass(obj, AsList), meta=issubclass(obj, WithMeta)) + obj = wrapper(obj).__get__(t) + setattr(t, camel_to_snake(name), obj) + + return t
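`ast_utils.create_transformer` (just above) scans a module for `Ast` subclasses and attaches one transformer callback per class, converting CamelCase class names to snake_case rule names. A hypothetical sketch of the intended usage, assuming a grammar with `name` and `code_block` rules (the classes below are illustrative, not taken from this commit):

```python
# Hypothetical usage of ast_utils.create_transformer; Name/CodeBlock mirror
# grammar rules `name` and `code_block` that are assumed, not shown in this diff.
import sys
from dataclasses import dataclass
from lark import ast_utils, Transformer

class _Ast(ast_utils.Ast):
    # Underscore-prefixed classes are skipped by create_transformer
    pass

@dataclass
class Name(_Ast):                         # becomes the `name` rule callback
    value: str

@dataclass
class CodeBlock(_Ast, ast_utils.AsList):  # children arrive as a single list
    statements: list

this_module = sys.modules[__name__]
# Builds a Transformer whose .transform(tree) produces Name/CodeBlock instances.
to_ast = ast_utils.create_transformer(this_module, Transformer())
```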
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/lark/common.py b/.venv/lib/python3.12/site-packages/lark/common.py new file mode 100644 index 00000000..cb408d91 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/common.py @@ -0,0 +1,59 @@ +from warnings import warn +from copy import deepcopy + +from .utils import Serialize +from .lexer import TerminalDef + +###{standalone + + +class LexerConf(Serialize): + __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type' + __serialize_namespace__ = TerminalDef, + + def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): + self.terminals = terminals + self.terminals_by_name = {t.name: t for t in self.terminals} + assert len(self.terminals) == len(self.terminals_by_name) + self.ignore = ignore + self.postlex = postlex + self.callbacks = callbacks or {} + self.g_regex_flags = g_regex_flags + self.re_module = re_module + self.skip_validation = skip_validation + self.use_bytes = use_bytes + self.lexer_type = None + + @property + def tokens(self): + warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning) + return self.terminals + + def _deserialize(self): + self.terminals_by_name = {t.name: t for t in self.terminals} + + def __deepcopy__(self, memo=None): + return type(self)( + deepcopy(self.terminals, memo), + self.re_module, + deepcopy(self.ignore, memo), + deepcopy(self.postlex, memo), + deepcopy(self.callbacks, memo), + deepcopy(self.g_regex_flags, memo), + deepcopy(self.skip_validation, memo), + deepcopy(self.use_bytes, memo), + ) + + +class ParserConf(Serialize): + __serialize_fields__ = 'rules', 'start', 'parser_type' + + def __init__(self, rules, callbacks, start): + assert isinstance(start, list) + self.rules = rules + self.callbacks = callbacks + self.start = start + + self.parser_type = None + +###} diff --git a/.venv/lib/python3.12/site-packages/lark/exceptions.py b/.venv/lib/python3.12/site-packages/lark/exceptions.py new file mode 100644 index 00000000..9f187531 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/exceptions.py @@ -0,0 +1,267 @@ +from warnings import warn + +from .utils import STRING_TYPE, logger, NO_VALUE + + +###{standalone + + +class LarkError(Exception): + pass + + +class ConfigurationError(LarkError, ValueError): + pass + + +def assert_config(value, options, msg='Got %r, expected one of %s'): + if value not in options: + raise ConfigurationError(msg % (value, options)) + + +class GrammarError(LarkError): + pass + + +class ParseError(LarkError): + pass + + +class LexError(LarkError): + pass + + +class UnexpectedInput(LarkError): + """UnexpectedInput Error. + + Used as a base class for the following exceptions: + + - ``UnexpectedCharacters``: The lexer encountered an unexpected string + - ``UnexpectedToken``: The parser received an unexpected token + - ``UnexpectedEOF``: The parser expected a token, but the input ended + + After catching one of these exceptions, you may call the following helper methods to create a nicer error message. + """ + pos_in_stream = None + _terminals_by_name = None + + def get_context(self, text, span=40): + """Returns a pretty string pinpointing the error in the text, + with span amount of context characters around it. 
+ + Note: + The parser doesn't hold a copy of the text it has to parse, + so you have to provide it again + """ + assert self.pos_in_stream is not None, self + pos = self.pos_in_stream + start = max(pos - span, 0) + end = pos + span + if not isinstance(text, bytes): + before = text[start:pos].rsplit('\n', 1)[-1] + after = text[pos:end].split('\n', 1)[0] + return before + after + '\n' + ' ' * len(before.expandtabs()) + '^\n' + else: + before = text[start:pos].rsplit(b'\n', 1)[-1] + after = text[pos:end].split(b'\n', 1)[0] + return (before + after + b'\n' + b' ' * len(before.expandtabs()) + b'^\n').decode("ascii", "backslashreplace") + + def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): + """Allows you to detect what's wrong in the input text by matching + against example errors. + + Given a parser instance and a dictionary mapping some label with + some malformed syntax examples, it'll return the label for the + example that bests matches the current error. The function will + iterate the dictionary until it finds a matching error, and + return the corresponding value. + + For an example usage, see `examples/error_reporting_lalr.py` + + Parameters: + parse_fn: parse function (usually ``lark_instance.parse``) + examples: dictionary of ``{'example_string': value}``. + use_accepts: Recommended to call this with ``use_accepts=True``. + The default is ``False`` for backwards compatibility. + """ + assert self.state is not None, "Not supported for this exception" + + if isinstance(examples, dict): + examples = examples.items() + + candidate = (None, False) + for i, (label, example) in enumerate(examples): + assert not isinstance(example, STRING_TYPE) + + for j, malformed in enumerate(example): + try: + parse_fn(malformed) + except UnexpectedInput as ut: + if ut.state == self.state: + if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts: + logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % + (self.state, self.accepts, ut.accepts, i, j)) + continue + try: + if ut.token == self.token: # Try exact match first + logger.debug("Exact Match at example [%s][%s]" % (i, j)) + return label + + if token_type_match_fallback: + # Fallback to token types match + if (ut.token.type == self.token.type) and not candidate[-1]: + logger.debug("Token Type Fallback at example [%s][%s]" % (i, j)) + candidate = label, True + + except AttributeError: + pass + if candidate[0] is None: + logger.debug("Same State match at example [%s][%s]" % (i, j)) + candidate = label, False + + return candidate[0] + + def _format_expected(self, expected): + if self._terminals_by_name: + d = self._terminals_by_name + expected = [d[t_name].user_repr() if t_name in d else t_name for t_name in expected] + return "Expected one of: \n\t* %s\n" % '\n\t* '.join(expected) + + +class UnexpectedEOF(ParseError, UnexpectedInput): + """An exception that is raised by the parser, when the input ends while it still expects a token. + """ + + def __init__(self, expected, state=None, terminals_by_name=None): + super(UnexpectedEOF, self).__init__() + + self.expected = expected + self.state = state + from .lexer import Token + self.token = Token("<EOF>", "") # , line=-1, column=-1, pos_in_stream=-1) + self.pos_in_stream = -1 + self.line = -1 + self.column = -1 + self._terminals_by_name = terminals_by_name + + + def __str__(self): + message = "Unexpected end-of-input. 
" + message += self._format_expected(self.expected) + return message + + +class UnexpectedCharacters(LexError, UnexpectedInput): + """An exception that is raised by the lexer, when it cannot match the next + string of characters to any of its terminals. + """ + + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, + terminals_by_name=None, considered_rules=None): + super(UnexpectedCharacters, self).__init__() + + # TODO considered_tokens and allowed can be figured out using state + self.line = line + self.column = column + self.pos_in_stream = lex_pos + self.state = state + self._terminals_by_name = terminals_by_name + + self.allowed = allowed + self.considered_tokens = considered_tokens + self.considered_rules = considered_rules + self.token_history = token_history + + if isinstance(seq, bytes): + self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace") + else: + self.char = seq[lex_pos] + self._context = self.get_context(seq) + + + def __str__(self): + message = "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column) + message += '\n\n' + self._context + if self.allowed: + message += self._format_expected(self.allowed) + if self.token_history: + message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history) + return message + + +class UnexpectedToken(ParseError, UnexpectedInput): + """An exception that is raised by the parser, when the token it received + doesn't match any valid step forward. + + Parameters: + token: The mismatched token + expected: The set of expected tokens + considered_rules: Which rules were considered, to deduce the expected tokens + state: A value representing the parser state. Do not rely on its value or type. + interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failture, + and can be used for debugging and error handling. + + Note: These parameters are available as attributes of the instance. + """ + + def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): + super(UnexpectedToken, self).__init__() + + # TODO considered_rules and expected can be figured out using state + self.line = getattr(token, 'line', '?') + self.column = getattr(token, 'column', '?') + self.pos_in_stream = getattr(token, 'start_pos', None) + self.state = state + + self.token = token + self.expected = expected # XXX deprecate? 
`accepts` is better + self._accepts = NO_VALUE + self.considered_rules = considered_rules + self.interactive_parser = interactive_parser + self._terminals_by_name = terminals_by_name + self.token_history = token_history + + + @property + def accepts(self): + if self._accepts is NO_VALUE: + self._accepts = self.interactive_parser and self.interactive_parser.accepts() + return self._accepts + + def __str__(self): + message = ("Unexpected token %r at line %s, column %s.\n%s" + % (self.token, self.line, self.column, self._format_expected(self.accepts or self.expected))) + if self.token_history: + message += "Previous tokens: %r\n" % self.token_history + + return message + + @property + def puppet(self): + warn("UnexpectedToken.puppet attribute has been renamed to interactive_parser", DeprecationWarning) + return self.interactive_parser + + + +class VisitError(LarkError): + """VisitError is raised when visitors are interrupted by an exception + + It provides the following attributes for inspection: + + Parameters: + rule: the name of the visit rule that failed + obj: the tree-node or token that was being processed + orig_exc: the exception that cause it to fail + + Note: These parameters are available as attributes + """ + + def __init__(self, rule, obj, orig_exc): + message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) + super(VisitError, self).__init__(message) + + self.rule = rule + self.obj = obj + self.orig_exc = orig_exc + +###} diff --git a/.venv/lib/python3.12/site-packages/lark/grammar.py b/.venv/lib/python3.12/site-packages/lark/grammar.py new file mode 100644 index 00000000..405086a2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/grammar.py @@ -0,0 +1,105 @@ +from .utils import Serialize + +###{standalone + +class Symbol(Serialize): + __slots__ = ('name',) + + is_term = NotImplemented + + def __init__(self, name): + self.name = name + + def __eq__(self, other): + assert isinstance(other, Symbol), other + return self.is_term == other.is_term and self.name == other.name + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return hash(self.name) + + def __repr__(self): + return '%s(%r)' % (type(self).__name__, self.name) + + fullrepr = property(__repr__) + + +class Terminal(Symbol): + __serialize_fields__ = 'name', 'filter_out' + + is_term = True + + def __init__(self, name, filter_out=False): + self.name = name + self.filter_out = filter_out + + @property + def fullrepr(self): + return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out) + + +class NonTerminal(Symbol): + __serialize_fields__ = 'name', + + is_term = False + + +class RuleOptions(Serialize): + __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices' + + def __init__(self, keep_all_tokens=False, expand1=False, priority=None, template_source=None, empty_indices=()): + self.keep_all_tokens = keep_all_tokens + self.expand1 = expand1 + self.priority = priority + self.template_source = template_source + self.empty_indices = empty_indices + + def __repr__(self): + return 'RuleOptions(%r, %r, %r, %r)' % ( + self.keep_all_tokens, + self.expand1, + self.priority, + self.template_source + ) + + +class Rule(Serialize): + """ + origin : a symbol + expansion : a list of symbols + order : index of this expansion amongst all rules of the same name + """ + __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash') + + __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options' + 
__serialize_namespace__ = Terminal, NonTerminal, RuleOptions + + def __init__(self, origin, expansion, order=0, alias=None, options=None): + self.origin = origin + self.expansion = expansion + self.alias = alias + self.order = order + self.options = options or RuleOptions() + self._hash = hash((self.origin, tuple(self.expansion))) + + def _deserialize(self): + self._hash = hash((self.origin, tuple(self.expansion))) + + def __str__(self): + return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion)) + + def __repr__(self): + return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + + def __hash__(self): + return self._hash + + def __eq__(self, other): + if not isinstance(other, Rule): + return False + return self.origin == other.origin and self.expansion == other.expansion + + +###} diff --git a/.venv/lib/python3.12/site-packages/lark/grammars/common.lark b/.venv/lib/python3.12/site-packages/lark/grammars/common.lark new file mode 100644 index 00000000..d2e86d17 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/grammars/common.lark @@ -0,0 +1,59 @@ +// Basic terminals for common use + + +// +// Numbers +// + +DIGIT: "0".."9" +HEXDIGIT: "a".."f"|"A".."F"|DIGIT + +INT: DIGIT+ +SIGNED_INT: ["+"|"-"] INT +DECIMAL: INT "." INT? | "." INT + +// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/ +_EXP: ("e"|"E") SIGNED_INT +FLOAT: INT _EXP | DECIMAL _EXP? +SIGNED_FLOAT: ["+"|"-"] FLOAT + +NUMBER: FLOAT | INT +SIGNED_NUMBER: ["+"|"-"] NUMBER + +// +// Strings +// +_STRING_INNER: /.*?/ +_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/ + +ESCAPED_STRING : "\"" _STRING_ESC_INNER "\"" + + +// +// Names (Variables) +// +LCASE_LETTER: "a".."z" +UCASE_LETTER: "A".."Z" + +LETTER: UCASE_LETTER | LCASE_LETTER +WORD: LETTER+ + +CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)* + + +// +// Whitespace +// +WS_INLINE: (" "|/\t/)+ +WS: /[ \t\f\r\n]/+ + +CR : /\r/ +LF : /\n/ +NEWLINE: (CR? LF)+ + + +// Comments +SH_COMMENT: /#[^\n]*/ +CPP_COMMENT: /\/\/[^\n]*/ +C_COMMENT: "/*" /(.|\n)*?/ "*/" +SQL_COMMENT: /--[^\n]*/ diff --git a/.venv/lib/python3.12/site-packages/lark/grammars/lark.lark b/.venv/lib/python3.12/site-packages/lark/grammars/lark.lark new file mode 100644 index 00000000..68588469 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/grammars/lark.lark @@ -0,0 +1,59 @@ +start: (_item? _NL)* _item? + +_item: rule + | token + | statement + +rule: RULE rule_params priority? ":" expansions +token: TOKEN token_params priority? ":" expansions + +rule_params: ["{" RULE ("," RULE)* "}"] +token_params: ["{" TOKEN ("," TOKEN)* "}"] + +priority: "." NUMBER + +statement: "%ignore" expansions -> ignore + | "%import" import_path ["->" name] -> import + | "%import" import_path name_list -> multi_import + | "%override" rule -> override_rule + | "%declare" name+ -> declare + +!import_path: "."? name ("." name)* +name_list: "(" name ("," name)* ")" + +?expansions: alias (_VBAR alias)* + +?alias: expansion ["->" RULE] + +?expansion: expr* + +?expr: atom [OP | "~" NUMBER [".." NUMBER]] + +?atom: "(" expansions ")" + | "[" expansions "]" -> maybe + | value + +?value: STRING ".." STRING -> literal_range + | name + | (REGEXP | STRING) -> literal + | name "{" value ("," value)* "}" -> template_usage + +name: RULE + | TOKEN + +_VBAR: _NL? "|" +OP: /[+*]|[?](?![a-z])/ +RULE: /!?[_?]?[a-z][_a-z0-9]*/ +TOKEN: /_?[A-Z][_A-Z0-9]*/ +STRING: _STRING "i"? 
+REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/ +_NL: /(\r?\n)+\s*/ + +%import common.ESCAPED_STRING -> _STRING +%import common.SIGNED_INT -> NUMBER +%import common.WS_INLINE + +COMMENT: /\s*/ "//" /[^\n]/* + +%ignore WS_INLINE +%ignore COMMENT diff --git a/.venv/lib/python3.12/site-packages/lark/grammars/python.lark b/.venv/lib/python3.12/site-packages/lark/grammars/python.lark new file mode 100644 index 00000000..785728c5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/grammars/python.lark @@ -0,0 +1,19 @@ +// Python terminals + +NAME: /[a-zA-Z_]\w*/ +COMMENT: /#[^\n]*/ + +STRING : /[ubf]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/i +LONG_STRING: /[ubf]?r?(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/is + +DEC_NUMBER: /0|[1-9][\d_]*/i +HEX_NUMBER.2: /0x[\da-f]*/i +OCT_NUMBER.2: /0o[0-7]*/i +BIN_NUMBER.2 : /0b[0-1]*/i +FLOAT_NUMBER.2: /((\d+\.[\d_]*|\.[\d_]+)([eE][-+]?\d+)?|\d+([eE][-+]?\d+))/ +IMAG_NUMBER.2: /\d+[jJ]/ | FLOAT_NUMBER /[jJ]/ + + +// Comma-separated list (with an optional trailing comma) +cs_list{item}: item ("," item)* ","? +_cs_list{item}: item ("," item)* ","?
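The bundled grammars above (`common.lark`, `lark.lark`, `python.lark`) are what `%import common.*` statements in user grammars resolve against. A small sketch of reusing the shared terminals follows; the grammar itself is an assumption for illustration, only the imported names come from `common.lark` above:

```python
# Sketch: reusing terminals from the bundled common.lark via %import.
from lark import Lark

grammar = r"""
    start: pair ("," pair)*
    pair: CNAME "=" ESCAPED_STRING

    %import common.CNAME
    %import common.ESCAPED_STRING
    %import common.WS
    %ignore WS
"""

parser = Lark(grammar, parser="lalr")
print(parser.parse('a = "1", b = "2"').pretty())
```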
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/lark/grammars/unicode.lark b/.venv/lib/python3.12/site-packages/lark/grammars/unicode.lark new file mode 100644 index 00000000..0ab849e3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/grammars/unicode.lark @@ -0,0 +1,7 @@ +// TODO: LETTER, WORD, etc. + +// +// Whitespace +// +WS_INLINE: /[ \t\xa0]/+ +WS: /[ \t\xa0\f\r\n]/+ diff --git a/.venv/lib/python3.12/site-packages/lark/indenter.py b/.venv/lib/python3.12/site-packages/lark/indenter.py new file mode 100644 index 00000000..7e1263dd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/indenter.py @@ -0,0 +1,67 @@ +"Provides Indentation services for languages with indentation similar to Python" + +from .exceptions import LarkError +from .lark import PostLex +from .lexer import Token + +###{standalone +class DedentError(LarkError): + pass + +class Indenter(PostLex): + def __init__(self): + self.paren_level = None + self.indent_level = None + assert self.tab_len > 0 + + def handle_NL(self, token): + if self.paren_level > 0: + return + + yield token + + indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces + indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len + + if indent > self.indent_level[-1]: + self.indent_level.append(indent) + yield Token.new_borrow_pos(self.INDENT_type, indent_str, token) + else: + while indent < self.indent_level[-1]: + self.indent_level.pop() + yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token) + + if indent != self.indent_level[-1]: + raise DedentError('Unexpected dedent to column %s. Expected dedent to %s' % (indent, self.indent_level[-1])) + + def _process(self, stream): + for token in stream: + if token.type == self.NL_type: + for t in self.handle_NL(token): + yield t + else: + yield token + + if token.type in self.OPEN_PAREN_types: + self.paren_level += 1 + elif token.type in self.CLOSE_PAREN_types: + self.paren_level -= 1 + assert self.paren_level >= 0 + + while len(self.indent_level) > 1: + self.indent_level.pop() + yield Token(self.DEDENT_type, '') + + assert self.indent_level == [0], self.indent_level + + def process(self, stream): + self.paren_level = 0 + self.indent_level = [0] + return self._process(stream) + + # XXX Hack for ContextualLexer. Maybe there's a more elegant solution? 
+ @property + def always_accept(self): + return (self.NL_type,) + +###} diff --git a/.venv/lib/python3.12/site-packages/lark/lark.py b/.venv/lib/python3.12/site-packages/lark/lark.py new file mode 100644 index 00000000..744cf4b4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/lark.py @@ -0,0 +1,602 @@ +from __future__ import absolute_import + + +from lark.exceptions import ConfigurationError, assert_config + +import sys, os, pickle, hashlib +from io import open +import tempfile +from warnings import warn + +from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger, ABC, abstractmethod +from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files +from .tree import Tree +from .common import LexerConf, ParserConf + +from .lexer import Lexer, TraditionalLexer, TerminalDef, LexerThread +from .parse_tree_builder import ParseTreeBuilder +from .parser_frontends import get_frontend, _get_lexer_callbacks +from .grammar import Rule + +import re +try: + import regex +except ImportError: + regex = None + + +###{standalone + + +class LarkOptions(Serialize): + """Specifies the options for Lark + + """ + OPTIONS_DOC = """ + **=== General Options ===** + + start + The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start") + debug + Display debug information and extra warnings. Use only when debugging (default: False) + When used with Earley, it generates a forest graph as "sppf.png", if 'dot' is installed. + transformer + Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster) + propagate_positions + Propagates (line, column, end_line, end_column) attributes into all tree branches. + Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating. + maybe_placeholders + When ``True``, the ``[]`` operator returns ``None`` when not matched. + + When ``False``, ``[]`` behaves like the ``?`` operator, and returns no value at all. + (default= ``False``. Recommended to set to ``True``) + cache + Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now. + + - When ``False``, does nothing (default) + - When ``True``, caches to a temporary file in the local directory + - When given a string, caches to the path pointed by the string + regex + When True, uses the ``regex`` module instead of the stdlib ``re``. + g_regex_flags + Flags that are applied to all terminals (both regex and strings) + keep_all_tokens + Prevent the tree builder from automagically removing "punctuation" tokens (default: False) + tree_class + Lark will produce trees comprised of instances of this class instead of the default ``lark.Tree``. + + **=== Algorithm Options ===** + + parser + Decides which parser engine to use. Accepts "earley" or "lalr". (Default: "earley"). + (there is also a "cyk" option for legacy) + lexer + Decides whether or not to use a lexer stage + + - "auto" (default): Choose for me based on the parser + - "standard": Use a standard lexer + - "contextual": Stronger lexer (only works with parser="lalr") + - "dynamic": Flexible and powerful (only with parser="earley") + - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible. + ambiguity + Decides how to handle ambiguity in the parse. 
Only relevant if parser="earley" + + - "resolve": The parser will automatically choose the simplest derivation + (it chooses consistently: greedy for tokens, non-greedy for rules) + - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). + - "forest": The parser will return the root of the shared packed parse forest. + + **=== Misc. / Domain Specific Options ===** + + postlex + Lexer post-processing (Default: None) Only works with the standard and contextual lexers. + priority + How priorities should be evaluated - auto, none, normal, invert (Default: auto) + lexer_callbacks + Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. + use_bytes + Accept an input of type ``bytes`` instead of ``str`` (Python 3 only). + edit_terminals + A callback for editing the terminals before parse. + import_paths + A List of either paths or loader functions to specify from where grammars are imported + source_path + Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading + **=== End of Options ===** + """ + if __doc__: + __doc__ += OPTIONS_DOC + + + # Adding a new option needs to be done in multiple places: + # - In the dictionary below. This is the primary truth of which options `Lark.__init__` accepts + # - In the docstring above. It is used both for the docstring of `LarkOptions` and `Lark`, and in readthedocs + # - In `lark-stubs/lark.pyi`: + # - As attribute to `LarkOptions` + # - As parameter to `Lark.__init__` + # - Potentially in `_LOAD_ALLOWED_OPTIONS` below this class, when the option doesn't change how the grammar is loaded + # - Potentially in `lark.tools.__init__`, if it makes sense, and it can easily be passed as a cmd argument + _defaults = { + 'debug': False, + 'keep_all_tokens': False, + 'tree_class': None, + 'cache': False, + 'postlex': None, + 'parser': 'earley', + 'lexer': 'auto', + 'transformer': None, + 'start': 'start', + 'priority': 'auto', + 'ambiguity': 'auto', + 'regex': False, + 'propagate_positions': False, + 'lexer_callbacks': {}, + 'maybe_placeholders': False, + 'edit_terminals': None, + 'g_regex_flags': 0, + 'use_bytes': False, + 'import_paths': [], + 'source_path': None, + } + + def __init__(self, options_dict): + o = dict(options_dict) + + options = {} + for name, default in self._defaults.items(): + if name in o: + value = o.pop(name) + if isinstance(default, bool) and name not in ('cache', 'use_bytes', 'propagate_positions'): + value = bool(value) + else: + value = default + + options[name] = value + + if isinstance(options['start'], STRING_TYPE): + options['start'] = [options['start']] + + self.__dict__['options'] = options + + + assert_config(self.parser, ('earley', 'lalr', 'cyk', None)) + + if self.parser == 'earley' and self.transformer: + raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. ' + 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)') + + if o: + raise ConfigurationError("Unknown options: %s" % o.keys()) + + def __getattr__(self, name): + try: + return self.__dict__['options'][name] + except KeyError as e: + raise AttributeError(e) + + def __setattr__(self, name, value): + assert_config(name, self.options.keys(), "%r isn't a valid option. 
Expected one of: %s") + self.options[name] = value + + def serialize(self, memo): + return self.options + + @classmethod + def deserialize(cls, data, memo): + return cls(data) + + +# Options that can be passed to the Lark parser, even when it was loaded from cache/standalone. +# These option are only used outside of `load_grammar`. +_LOAD_ALLOWED_OPTIONS = {'postlex', 'transformer', 'lexer_callbacks', 'use_bytes', 'debug', 'g_regex_flags', 'regex', 'propagate_positions', 'tree_class'} + +_VALID_PRIORITY_OPTIONS = ('auto', 'normal', 'invert', None) +_VALID_AMBIGUITY_OPTIONS = ('auto', 'resolve', 'explicit', 'forest') + + +class PostLex(ABC): + @abstractmethod + def process(self, stream): + return stream + + always_accept = () + + +class Lark(Serialize): + """Main interface for the library. + + It's mostly a thin wrapper for the many different parsers, and for the tree constructor. + + Parameters: + grammar: a string or file-object containing the grammar spec (using Lark's ebnf syntax) + options: a dictionary controlling various aspects of Lark. + + Example: + >>> Lark(r'''start: "foo" ''') + Lark(...) + """ + def __init__(self, grammar, **options): + self.options = LarkOptions(options) + + # Set regex or re module + use_regex = self.options.regex + if use_regex: + if regex: + re_module = regex + else: + raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.') + else: + re_module = re + + # Some, but not all file-like objects have a 'name' attribute + if self.options.source_path is None: + try: + self.source_path = grammar.name + except AttributeError: + self.source_path = '<string>' + else: + self.source_path = self.options.source_path + + # Drain file-like objects to get their contents + try: + read = grammar.read + except AttributeError: + pass + else: + grammar = read() + + cache_fn = None + cache_md5 = None + if isinstance(grammar, STRING_TYPE): + self.source_grammar = grammar + if self.options.use_bytes: + if not isascii(grammar): + raise ConfigurationError("Grammar must be ascii only, when use_bytes=True") + if sys.version_info[0] == 2 and self.options.use_bytes != 'force': + raise ConfigurationError("`use_bytes=True` may have issues on python2." + "Use `use_bytes='force'` to use it at your own risk.") + + if self.options.cache: + if self.options.parser != 'lalr': + raise ConfigurationError("cache only works with parser='lalr' for now") + + unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals') + options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) + from . 
import __version__ + s = grammar + options_str + __version__ + str(sys.version_info[:2]) + cache_md5 = hashlib.md5(s.encode('utf8')).hexdigest() + + if isinstance(self.options.cache, STRING_TYPE): + cache_fn = self.options.cache + else: + if self.options.cache is not True: + raise ConfigurationError("cache argument must be bool or str") + # Python2.7 doesn't support * syntax in tuples + cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % ((cache_md5,) + sys.version_info[:2]) + + if FS.exists(cache_fn): + logger.debug('Loading grammar from cache: %s', cache_fn) + # Remove options that aren't relevant for loading from cache + for name in (set(options) - _LOAD_ALLOWED_OPTIONS): + del options[name] + with FS.open(cache_fn, 'rb') as f: + old_options = self.options + try: + file_md5 = f.readline().rstrip(b'\n') + cached_used_files = pickle.load(f) + if file_md5 == cache_md5.encode('utf8') and verify_used_files(cached_used_files): + cached_parser_data = pickle.load(f) + self._load(cached_parser_data, **options) + return + except Exception: # We should probably narrow done which errors we catch here. + logger.exception("Failed to load Lark from cache: %r. We will try to carry on." % cache_fn) + + # In theory, the Lark instance might have been messed up by the call to `_load`. + # In practice the only relevant thing that might have been overriden should be `options` + self.options = old_options + + + # Parse the grammar file and compose the grammars + self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) + else: + assert isinstance(grammar, Grammar) + self.grammar = grammar + + + if self.options.lexer == 'auto': + if self.options.parser == 'lalr': + self.options.lexer = 'contextual' + elif self.options.parser == 'earley': + if self.options.postlex is not None: + logger.info("postlex can't be used with the dynamic lexer, so we use standard instead. " + "Consider using lalr with contextual instead of earley") + self.options.lexer = 'standard' + else: + self.options.lexer = 'dynamic' + elif self.options.parser == 'cyk': + self.options.lexer = 'standard' + else: + assert False, self.options.parser + lexer = self.options.lexer + if isinstance(lexer, type): + assert issubclass(lexer, Lexer) # XXX Is this really important? Maybe just ensure interface compliance + else: + assert_config(lexer, ('standard', 'contextual', 'dynamic', 'dynamic_complete')) + if self.options.postlex is not None and 'dynamic' in lexer: + raise ConfigurationError("Can't use postlex with a dynamic lexer. Use standard or contextual instead") + + if self.options.ambiguity == 'auto': + if self.options.parser == 'earley': + self.options.ambiguity = 'resolve' + else: + assert_config(self.options.parser, ('earley', 'cyk'), "%r doesn't support disambiguation. Use one of these parsers instead: %s") + + if self.options.priority == 'auto': + self.options.priority = 'normal' + + if self.options.priority not in _VALID_PRIORITY_OPTIONS: + raise ConfigurationError("invalid priority option: %r. Must be one of %r" % (self.options.priority, _VALID_PRIORITY_OPTIONS)) + assert self.options.ambiguity not in ('resolve__antiscore_sum', ), 'resolve__antiscore_sum has been replaced with the option priority="invert"' + if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS: + raise ConfigurationError("invalid ambiguity option: %r. 
Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS)) + + if self.options.parser is None: + terminals_to_keep = '*' + elif self.options.postlex is not None: + terminals_to_keep = set(self.options.postlex.always_accept) + else: + terminals_to_keep = set() + + # Compile the EBNF grammar into BNF + self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start, terminals_to_keep) + + if self.options.edit_terminals: + for t in self.terminals: + self.options.edit_terminals(t) + + self._terminals_dict = {t.name: t for t in self.terminals} + + # If the user asked to invert the priorities, negate them all here. + # This replaces the old 'resolve__antiscore_sum' option. + if self.options.priority == 'invert': + for rule in self.rules: + if rule.options.priority is not None: + rule.options.priority = -rule.options.priority + # Else, if the user asked to disable priorities, strip them from the + # rules. This allows the Earley parsers to skip an extra forest walk + # for improved performance, if you don't need them (or didn't specify any). + elif self.options.priority is None: + for rule in self.rules: + if rule.options.priority is not None: + rule.options.priority = None + + # TODO Deprecate lexer_callbacks? + self.lexer_conf = LexerConf( + self.terminals, re_module, self.ignore_tokens, self.options.postlex, + self.options.lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes + ) + + if self.options.parser: + self.parser = self._build_parser() + elif lexer: + self.lexer = self._build_lexer() + + if cache_fn: + logger.debug('Saving grammar to cache: %s', cache_fn) + with FS.open(cache_fn, 'wb') as f: + f.write(cache_md5.encode('utf8') + b'\n') + pickle.dump(used_files, f) + self.save(f) + + if __doc__: + __doc__ += "\n\n" + LarkOptions.OPTIONS_DOC + + __serialize_fields__ = 'parser', 'rules', 'options' + + def _build_lexer(self, dont_ignore=False): + lexer_conf = self.lexer_conf + if dont_ignore: + from copy import copy + lexer_conf = copy(lexer_conf) + lexer_conf.ignore = () + return TraditionalLexer(lexer_conf) + + def _prepare_callbacks(self): + self._callbacks = {} + # we don't need these callbacks if we aren't building a tree + if self.options.ambiguity != 'forest': + self._parse_tree_builder = ParseTreeBuilder( + self.rules, + self.options.tree_class or Tree, + self.options.propagate_positions, + self.options.parser != 'lalr' and self.options.ambiguity == 'explicit', + self.options.maybe_placeholders + ) + self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer) + self._callbacks.update(_get_lexer_callbacks(self.options.transformer, self.terminals)) + + def _build_parser(self): + self._prepare_callbacks() + parser_class = get_frontend(self.options.parser, self.options.lexer) + parser_conf = ParserConf(self.rules, self._callbacks, self.options.start) + return parser_class(self.lexer_conf, parser_conf, options=self.options) + + def save(self, f): + """Saves the instance into the given file object + + Useful for caching and multiprocessing. + """ + data, m = self.memo_serialize([TerminalDef, Rule]) + pickle.dump({'data': data, 'memo': m}, f, protocol=pickle.HIGHEST_PROTOCOL) + + @classmethod + def load(cls, f): + """Loads an instance from the given file object + + Useful for caching and multiprocessing. 
+ """ + inst = cls.__new__(cls) + return inst._load(f) + + def _deserialize_lexer_conf(self, data, memo, options): + lexer_conf = LexerConf.deserialize(data['lexer_conf'], memo) + lexer_conf.callbacks = options.lexer_callbacks or {} + lexer_conf.re_module = regex if options.regex else re + lexer_conf.use_bytes = options.use_bytes + lexer_conf.g_regex_flags = options.g_regex_flags + lexer_conf.skip_validation = True + lexer_conf.postlex = options.postlex + return lexer_conf + + def _load(self, f, **kwargs): + if isinstance(f, dict): + d = f + else: + d = pickle.load(f) + memo_json = d['memo'] + data = d['data'] + + assert memo_json + memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) + options = dict(data['options']) + if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults): + raise ConfigurationError("Some options are not allowed when loading a Parser: {}" + .format(set(kwargs) - _LOAD_ALLOWED_OPTIONS)) + options.update(kwargs) + self.options = LarkOptions.deserialize(options, memo) + self.rules = [Rule.deserialize(r, memo) for r in data['rules']] + self.source_path = '<deserialized>' + parser_class = get_frontend(self.options.parser, self.options.lexer) + self.lexer_conf = self._deserialize_lexer_conf(data['parser'], memo, self.options) + self.terminals = self.lexer_conf.terminals + self._prepare_callbacks() + self._terminals_dict = {t.name: t for t in self.terminals} + self.parser = parser_class.deserialize( + data['parser'], + memo, + self.lexer_conf, + self._callbacks, + self.options, # Not all, but multiple attributes are used + ) + return self + + @classmethod + def _load_from_dict(cls, data, memo, **kwargs): + inst = cls.__new__(cls) + return inst._load({'data': data, 'memo': memo}, **kwargs) + + @classmethod + def open(cls, grammar_filename, rel_to=None, **options): + """Create an instance of Lark with the grammar given by its filename + + If ``rel_to`` is provided, the function will find the grammar filename in relation to it. + + Example: + + >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr") + Lark(...) + + """ + if rel_to: + basepath = os.path.dirname(rel_to) + grammar_filename = os.path.join(basepath, grammar_filename) + with open(grammar_filename, encoding='utf8') as f: + return cls(f, **options) + + @classmethod + def open_from_package(cls, package, grammar_path, search_paths=("",), **options): + """Create an instance of Lark with the grammar loaded from within the package `package`. + This allows grammar loading from zipapps. + + Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader` + + Example: + + Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...) + """ + package_loader = FromPackageLoader(package, search_paths) + full_path, text = package_loader(None, grammar_path) + options.setdefault('source_path', full_path) + options.setdefault('import_paths', []) + options['import_paths'].append(package_loader) + return cls(text, **options) + + def __repr__(self): + return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer) + + + def lex(self, text, dont_ignore=False): + """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' + + When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. + + :raises UnexpectedCharacters: In case the lexer cannot find a suitable match. 
+ """ + if not hasattr(self, 'lexer') or dont_ignore: + lexer = self._build_lexer(dont_ignore) + else: + lexer = self.lexer + lexer_thread = LexerThread(lexer, text) + stream = lexer_thread.lex(None) + if self.options.postlex: + return self.options.postlex.process(stream) + return stream + + def get_terminal(self, name): + """Get information about a terminal""" + return self._terminals_dict[name] + + def parse_interactive(self, text=None, start=None): + """Start an interactive parsing session. + + Parameters: + text (str, optional): Text to be parsed. Required for ``resume_parse()``. + start (str, optional): Start symbol + + Returns: + A new InteractiveParser instance. + + See Also: ``Lark.parse()`` + """ + return self.parser.parse_interactive(text, start=start) + + def parse(self, text, start=None, on_error=None): + """Parse the given text, according to the options provided. + + Parameters: + text (str): Text to be parsed. + start (str, optional): Required if Lark was given multiple possible start symbols (using the start option). + on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing. + LALR only. See examples/advanced/error_handling.py for an example of how to use on_error. + + Returns: + If a transformer is supplied to ``__init__``, returns whatever is the + result of the transformation. Otherwise, returns a Tree instance. + + :raises UnexpectedInput: On a parse error, one of these sub-exceptions will rise: + ``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``. + For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``. + + """ + return self.parser.parse(text, start=start, on_error=on_error) + + @property + def source(self): + warn("Attribute Lark.source was renamed to Lark.source_path", DeprecationWarning) + return self.source_path + + @source.setter + def source(self, value): + self.source_path = value + + @property + def grammar_source(self): + warn("Attribute Lark.grammar_source was renamed to Lark.source_grammar", DeprecationWarning) + return self.source_grammar + + @grammar_source.setter + def grammar_source(self, value): + self.source_grammar = value + + +###} diff --git a/.venv/lib/python3.12/site-packages/lark/lexer.py b/.venv/lib/python3.12/site-packages/lark/lexer.py new file mode 100644 index 00000000..a82cc180 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/lexer.py @@ -0,0 +1,506 @@ +# Lexer Implementation + +import re + +from .utils import Str, classify, get_regexp_width, Py36, Serialize, suppress +from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken + +###{standalone +from warnings import warn +from copy import copy + + +class Pattern(Serialize): + raw = None + type = None + + def __init__(self, value, flags=(), raw=None): + self.value = value + self.flags = frozenset(flags) + self.raw = raw + + def __repr__(self): + return repr(self.to_regexp()) + + # Pattern Hashing assumes all subclasses have a different priority! 
+ def __hash__(self): + return hash((type(self), self.value, self.flags)) + + def __eq__(self, other): + return type(self) == type(other) and self.value == other.value and self.flags == other.flags + + def to_regexp(self): + raise NotImplementedError() + + def min_width(self): + raise NotImplementedError() + + def max_width(self): + raise NotImplementedError() + + if Py36: + # Python 3.6 changed syntax for flags in regular expression + def _get_flags(self, value): + for f in self.flags: + value = ('(?%s:%s)' % (f, value)) + return value + + else: + def _get_flags(self, value): + for f in self.flags: + value = ('(?%s)' % f) + value + return value + + + +class PatternStr(Pattern): + __serialize_fields__ = 'value', 'flags' + + type = "str" + + def to_regexp(self): + return self._get_flags(re.escape(self.value)) + + @property + def min_width(self): + return len(self.value) + max_width = min_width + + +class PatternRE(Pattern): + __serialize_fields__ = 'value', 'flags', '_width' + + type = "re" + + def to_regexp(self): + return self._get_flags(self.value) + + _width = None + def _get_width(self): + if self._width is None: + self._width = get_regexp_width(self.to_regexp()) + return self._width + + @property + def min_width(self): + return self._get_width()[0] + + @property + def max_width(self): + return self._get_width()[1] + + +class TerminalDef(Serialize): + __serialize_fields__ = 'name', 'pattern', 'priority' + __serialize_namespace__ = PatternStr, PatternRE + + def __init__(self, name, pattern, priority=1): + assert isinstance(pattern, Pattern), pattern + self.name = name + self.pattern = pattern + self.priority = priority + + def __repr__(self): + return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) + + def user_repr(self): + if self.name.startswith('__'): # We represent a generated terminal + return self.pattern.raw or self.name + else: + return self.name + + +class Token(Str): + """A string with meta-information, that is produced by the lexer. + + When parsing text, the resulting chunks of the input that haven't been discarded, + will end up in the tree as Token instances. The Token class inherits from Python's ``str``, + so normal string comparisons and operations will work as expected. + + Attributes: + type: Name of the token (as specified in grammar) + value: Value of the token (redundant, as ``token.value == token`` will always be true) + start_pos: The index of the token in the text + line: The line of the token in the text (starting with 1) + column: The column of the token in the text (starting with 1) + end_line: The line where the token ends + end_column: The next column after the end of the token. For example, + if the token is a single character with a column value of 4, + end_column will be 5. 
+ end_pos: the index where the token ends (basically ``start_pos + len(token)``) + """ + __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') + + def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None, pos_in_stream=None): + try: + inst = super(Token, cls).__new__(cls, value) + except UnicodeDecodeError: + value = value.decode('latin1') + inst = super(Token, cls).__new__(cls, value) + + inst.type = type_ + inst.start_pos = start_pos if start_pos is not None else pos_in_stream + inst.value = value + inst.line = line + inst.column = column + inst.end_line = end_line + inst.end_column = end_column + inst.end_pos = end_pos + return inst + + @property + def pos_in_stream(self): + warn("Attribute Token.pos_in_stream was renamed to Token.start_pos", DeprecationWarning, 2) + return self.start_pos + + def update(self, type_=None, value=None): + return Token.new_borrow_pos( + type_ if type_ is not None else self.type, + value if value is not None else self.value, + self + ) + + @classmethod + def new_borrow_pos(cls, type_, value, borrow_t): + return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos) + + def __reduce__(self): + return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column)) + + def __repr__(self): + return 'Token(%r, %r)' % (self.type, self.value) + + def __deepcopy__(self, memo): + return Token(self.type, self.value, self.start_pos, self.line, self.column) + + def __eq__(self, other): + if isinstance(other, Token) and self.type != other.type: + return False + + return Str.__eq__(self, other) + + __hash__ = Str.__hash__ + + +class LineCounter: + __slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char' + + def __init__(self, newline_char): + self.newline_char = newline_char + self.char_pos = 0 + self.line = 1 + self.column = 1 + self.line_start_pos = 0 + + def __eq__(self, other): + if not isinstance(other, LineCounter): + return NotImplemented + + return self.char_pos == other.char_pos and self.newline_char == other.newline_char + + def feed(self, token, test_newline=True): + """Consume a token and calculate the new line & column. + + As an optional optimization, set test_newline=False if token doesn't contain a newline. 
+ """ + if test_newline: + newlines = token.count(self.newline_char) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 + + self.char_pos += len(token) + self.column = self.char_pos - self.line_start_pos + 1 + + +class UnlessCallback: + def __init__(self, scanner): + self.scanner = scanner + + def __call__(self, t): + res = self.scanner.match(t.value, 0) + if res: + _value, t.type = res + return t + + +class CallChain: + def __init__(self, callback1, callback2, cond): + self.callback1 = callback1 + self.callback2 = callback2 + self.cond = cond + + def __call__(self, t): + t2 = self.callback1(t) + return self.callback2(t) if self.cond(t2) else t2 + + +def _get_match(re_, regexp, s, flags): + m = re_.match(regexp, s, flags) + if m: + return m.group(0) + +def _create_unless(terminals, g_regex_flags, re_, use_bytes): + tokens_by_type = classify(terminals, lambda t: type(t.pattern)) + assert len(tokens_by_type) <= 2, tokens_by_type.keys() + embedded_strs = set() + callback = {} + for retok in tokens_by_type.get(PatternRE, []): + unless = [] + for strtok in tokens_by_type.get(PatternStr, []): + if strtok.priority > retok.priority: + continue + s = strtok.pattern.value + if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags): + unless.append(strtok) + if strtok.pattern.flags <= retok.pattern.flags: + embedded_strs.add(strtok) + if unless: + callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) + + new_terminals = [t for t in terminals if t not in embedded_strs] + return new_terminals, callback + + + +class Scanner: + def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): + self.terminals = terminals + self.g_regex_flags = g_regex_flags + self.re_ = re_ + self.use_bytes = use_bytes + self.match_whole = match_whole + + self.allowed_types = {t.name for t in self.terminals} + + self._mres = self._build_mres(terminals, len(terminals)) + + def _build_mres(self, terminals, max_size): + # Python sets an unreasonable group limit (currently 100) in its re module + # Worse, the only way to know we reached it is by catching an AssertionError! + # This function recursively tries less and less groups until it's successful. + postfix = '$' if self.match_whole else '' + mres = [] + while terminals: + pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) + if self.use_bytes: + pattern = pattern.encode('latin-1') + try: + mre = self.re_.compile(pattern, self.g_regex_flags) + except AssertionError: # Yes, this is what Python provides us.. :/ + return self._build_mres(terminals, max_size//2) + + mres.append((mre, {i: n for n, i in mre.groupindex.items()})) + terminals = terminals[max_size:] + return mres + + def match(self, text, pos): + for mre, type_from_index in self._mres: + m = mre.match(text, pos) + if m: + return m.group(0), type_from_index[m.lastindex] + + +def _regexp_has_newline(r): + r"""Expressions that may indicate newlines in a regexp: + - newlines (\n) + - escaped newline (\\n) + - anything but ([^...]) + - any-char (.) when the flag (?s) exists + - spaces (\s) + """ + return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' 
in r) + + +class Lexer(object): + """Lexer interface + + Method Signatures: + lex(self, text) -> Iterator[Token] + """ + lex = NotImplemented + + def make_lexer_state(self, text): + line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n') + return LexerState(text, line_ctr) + + +class TraditionalLexer(Lexer): + + def __init__(self, conf): + terminals = list(conf.terminals) + assert all(isinstance(t, TerminalDef) for t in terminals), terminals + + self.re = conf.re_module + + if not conf.skip_validation: + # Sanitization + for t in terminals: + try: + self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags) + except self.re.error: + raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) + + if t.pattern.min_width == 0: + raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern)) + + if not (set(conf.ignore) <= {t.name for t in terminals}): + raise LexError("Ignore terminals are not defined: %s" % (set(conf.ignore) - {t.name for t in terminals})) + + # Init + self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())) + self.ignore_types = frozenset(conf.ignore) + + terminals.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) + self.terminals = terminals + self.user_callbacks = conf.callbacks + self.g_regex_flags = conf.g_regex_flags + self.use_bytes = conf.use_bytes + self.terminals_by_name = conf.terminals_by_name + + self._scanner = None + + def _build_scanner(self): + terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes) + assert all(self.callback.values()) + + for type_, f in self.user_callbacks.items(): + if type_ in self.callback: + # Already a callback there, probably UnlessCallback + self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_) + else: + self.callback[type_] = f + + self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes) + + @property + def scanner(self): + if self._scanner is None: + self._build_scanner() + return self._scanner + + def match(self, text, pos): + return self.scanner.match(text, pos) + + def lex(self, state, parser_state): + with suppress(EOFError): + while True: + yield self.next_token(state, parser_state) + + def next_token(self, lex_state, parser_state=None): + line_ctr = lex_state.line_ctr + while line_ctr.char_pos < len(lex_state.text): + res = self.match(lex_state.text, line_ctr.char_pos) + if not res: + allowed = self.scanner.allowed_types - self.ignore_types + if not allowed: + allowed = {"<END-OF-FILE>"} + raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, + allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], + state=parser_state, terminals_by_name=self.terminals_by_name) + + value, type_ = res + + if type_ not in self.ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + line_ctr.feed(value, type_ in self.newline_types) + t.end_line = line_ctr.line + t.end_column = line_ctr.column + t.end_pos = line_ctr.char_pos + if t.type in self.callback: + t = self.callback[t.type](t) + if not isinstance(t, Token): + raise LexError("Callbacks must return a token (returned %r)" % t) + lex_state.last_token = t + return t + else: + if type_ in self.callback: + t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + self.callback[type_](t2) + line_ctr.feed(value, type_ in self.newline_types) + + # 
EOF + raise EOFError(self) + + +class LexerState(object): + __slots__ = 'text', 'line_ctr', 'last_token' + + def __init__(self, text, line_ctr, last_token=None): + self.text = text + self.line_ctr = line_ctr + self.last_token = last_token + + def __eq__(self, other): + if not isinstance(other, LexerState): + return NotImplemented + + return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token + + def __copy__(self): + return type(self)(self.text, copy(self.line_ctr), self.last_token) + + +class ContextualLexer(Lexer): + + def __init__(self, conf, states, always_accept=()): + terminals = list(conf.terminals) + terminals_by_name = conf.terminals_by_name + + trad_conf = copy(conf) + trad_conf.terminals = terminals + + lexer_by_tokens = {} + self.lexers = {} + for state, accepts in states.items(): + key = frozenset(accepts) + try: + lexer = lexer_by_tokens[key] + except KeyError: + accepts = set(accepts) | set(conf.ignore) | set(always_accept) + lexer_conf = copy(trad_conf) + lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name] + lexer = TraditionalLexer(lexer_conf) + lexer_by_tokens[key] = lexer + + self.lexers[state] = lexer + + assert trad_conf.terminals is terminals + self.root_lexer = TraditionalLexer(trad_conf) + + def make_lexer_state(self, text): + return self.root_lexer.make_lexer_state(text) + + def lex(self, lexer_state, parser_state): + try: + while True: + lexer = self.lexers[parser_state.position] + yield lexer.next_token(lexer_state, parser_state) + except EOFError: + pass + except UnexpectedCharacters as e: + # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. + # This tests the input against the global context, to provide a nicer error. + try: + last_token = lexer_state.last_token # Save last_token. Calling root_lexer.next_token will change this to the wrong token + token = self.root_lexer.next_token(lexer_state, parser_state) + raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name) + except UnexpectedCharacters: + raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set. 
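+
+# Usage sketch (illustrative comment only; `Lark` is the public entry point, not
+# part of this module): the contextual lexer is chosen through the API, e.g.
+#
+#     from lark import Lark
+#     parser = Lark(r'''
+#         start: "if" NAME
+#         NAME: /[a-z]+/
+#         %ignore " "
+#     ''', parser='lalr', lexer='contextual')
+#     parser.parse("if foo")
+#
+# Each parser state then gets a TraditionalLexer restricted to the terminals that
+# state accepts (plus the ignored and always_accept terminals), as built in
+# ContextualLexer.__init__ above.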
+ +class LexerThread(object): + """A thread that ties a lexer instance and a lexer state, to be used by the parser""" + + def __init__(self, lexer, text): + self.lexer = lexer + self.state = lexer.make_lexer_state(text) + + def lex(self, parser_state): + return self.lexer.lex(self.state, parser_state) + + def __copy__(self): + copied = object.__new__(LexerThread) + copied.lexer = self.lexer + copied.state = copy(self.state) + return copied +###} diff --git a/.venv/lib/python3.12/site-packages/lark/load_grammar.py b/.venv/lib/python3.12/site-packages/lark/load_grammar.py new file mode 100644 index 00000000..1ae832f6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/load_grammar.py @@ -0,0 +1,1354 @@ +"""Parses and creates Grammar objects""" +import hashlib +import os.path +import sys +from collections import namedtuple +from copy import copy, deepcopy +from io import open +import pkgutil +from ast import literal_eval +from numbers import Integral + +from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors +from .lexer import Token, TerminalDef, PatternStr, PatternRE + +from .parse_tree_builder import ParseTreeBuilder +from .parser_frontends import ParsingFrontend +from .common import LexerConf, ParserConf +from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol +from .utils import classify, suppress, dedup_list, Str +from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError + +from .tree import Tree, SlottedTree as ST +from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive +inline_args = v_args(inline=True) + +__path__ = os.path.dirname(__file__) +IMPORT_PATHS = ['grammars'] + +EXT = '.lark' + +_RE_FLAGS = 'imslux' + +_EMPTY = Symbol('__empty__') + +_TERMINAL_NAMES = { + '.' : 'DOT', + ',' : 'COMMA', + ':' : 'COLON', + ';' : 'SEMICOLON', + '+' : 'PLUS', + '-' : 'MINUS', + '*' : 'STAR', + '/' : 'SLASH', + '\\' : 'BACKSLASH', + '|' : 'VBAR', + '?' : 'QMARK', + '!' 
: 'BANG', + '@' : 'AT', + '#' : 'HASH', + '$' : 'DOLLAR', + '%' : 'PERCENT', + '^' : 'CIRCUMFLEX', + '&' : 'AMPERSAND', + '_' : 'UNDERSCORE', + '<' : 'LESSTHAN', + '>' : 'MORETHAN', + '=' : 'EQUAL', + '"' : 'DBLQUOTE', + '\'' : 'QUOTE', + '`' : 'BACKQUOTE', + '~' : 'TILDE', + '(' : 'LPAR', + ')' : 'RPAR', + '{' : 'LBRACE', + '}' : 'RBRACE', + '[' : 'LSQB', + ']' : 'RSQB', + '\n' : 'NEWLINE', + '\r\n' : 'CRLF', + '\t' : 'TAB', + ' ' : 'SPACE', +} + +# Grammar Parser +TERMINALS = { + '_LPAR': r'\(', + '_RPAR': r'\)', + '_LBRA': r'\[', + '_RBRA': r'\]', + '_LBRACE': r'\{', + '_RBRACE': r'\}', + 'OP': '[+*]|[?](?![a-z])', + '_COLON': ':', + '_COMMA': ',', + '_OR': r'\|', + '_DOT': r'\.(?!\.)', + '_DOTDOT': r'\.\.', + 'TILDE': '~', + 'RULE': '!?[_?]?[a-z][_a-z0-9]*', + 'TERMINAL': '_?[A-Z][_A-Z0-9]*', + 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', + 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, + '_NL': r'(\r?\n)+\s*', + '_NL_OR': r'(\r?\n)+\s*\|', + 'WS': r'[ \t]+', + 'COMMENT': r'\s*//[^\n]*', + 'BACKSLASH': r'\\[ ]*\n', + '_TO': '->', + '_IGNORE': r'%ignore', + '_OVERRIDE': r'%override', + '_DECLARE': r'%declare', + '_EXTEND': r'%extend', + '_IMPORT': r'%import', + 'NUMBER': r'[+-]?\d+', +} + +RULES = { + 'start': ['_list'], + '_list': ['_item', '_list _item'], + '_item': ['rule', 'term', 'ignore', 'import', 'declare', 'override', 'extend', '_NL'], + + 'rule': ['RULE template_params _COLON expansions _NL', + 'RULE template_params _DOT NUMBER _COLON expansions _NL'], + 'template_params': ['_LBRACE _template_params _RBRACE', + ''], + '_template_params': ['RULE', + '_template_params _COMMA RULE'], + 'expansions': ['_expansions'], + '_expansions': ['alias', + '_expansions _OR alias', + '_expansions _NL_OR alias'], + + '?alias': ['expansion _TO RULE', 'expansion'], + 'expansion': ['_expansion'], + + '_expansion': ['', '_expansion expr'], + + '?expr': ['atom', + 'atom OP', + 'atom TILDE NUMBER', + 'atom TILDE NUMBER _DOTDOT NUMBER', + ], + + '?atom': ['_LPAR expansions _RPAR', + 'maybe', + 'value'], + + 'value': ['terminal', + 'nonterminal', + 'literal', + 'range', + 'template_usage'], + + 'terminal': ['TERMINAL'], + 'nonterminal': ['RULE'], + + '?name': ['RULE', 'TERMINAL'], + + 'maybe': ['_LBRA expansions _RBRA'], + 'range': ['STRING _DOTDOT STRING'], + + 'template_usage': ['RULE _LBRACE _template_args _RBRACE'], + '_template_args': ['value', + '_template_args _COMMA value'], + + 'term': ['TERMINAL _COLON expansions _NL', + 'TERMINAL _DOT NUMBER _COLON expansions _NL'], + 'override': ['_OVERRIDE rule', + '_OVERRIDE term'], + 'extend': ['_EXTEND rule', + '_EXTEND term'], + 'ignore': ['_IGNORE expansions _NL'], + 'declare': ['_DECLARE _declare_args _NL'], + 'import': ['_IMPORT _import_path _NL', + '_IMPORT _import_path _LPAR name_list _RPAR _NL', + '_IMPORT _import_path _TO name _NL'], + + '_import_path': ['import_lib', 'import_rel'], + 'import_lib': ['_import_args'], + 'import_rel': ['_DOT _import_args'], + '_import_args': ['name', '_import_args _DOT name'], + + 'name_list': ['_name_list'], + '_name_list': ['name', '_name_list _COMMA name'], + + '_declare_args': ['name', '_declare_args name'], + 'literal': ['REGEXP', 'STRING'], +} + + +# Value 5 keeps the number of states in the lalr parser somewhat minimal +# It isn't optimal, but close to it. 
See PR #949 +SMALL_FACTOR_THRESHOLD = 5 +# The Threshold whether repeat via ~ are split up into different rules +# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, +# while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. +# (See PR #949) +REPEAT_BREAK_THRESHOLD = 50 + + +@inline_args +class EBNF_to_BNF(Transformer_InPlace): + def __init__(self): + self.new_rules = [] + self.rules_cache = {} + self.prefix = 'anon' + self.i = 0 + self.rule_options = None + + def _name_rule(self, inner): + new_name = '__%s_%s_%d' % (self.prefix, inner, self.i) + self.i += 1 + return new_name + + def _add_rule(self, key, name, expansions): + t = NonTerminal(name) + self.new_rules.append((name, expansions, self.rule_options)) + self.rules_cache[key] = t + return t + + def _add_recurse_rule(self, type_, expr): + try: + return self.rules_cache[expr] + except KeyError: + new_name = self._name_rule(type_) + t = NonTerminal(new_name) + tree = ST('expansions', [ + ST('expansion', [expr]), + ST('expansion', [t, expr]) + ]) + return self._add_rule(expr, new_name, tree) + + def _add_repeat_rule(self, a, b, target, atom): + """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times. + + When called recursively (into target), it repeats atom for x(n) times, where: + x(0) = 1 + x(n) = a(n) * x(n-1) + b + + Example rule when a=3, b=4: + + new_rule: target target target atom atom atom atom + + """ + key = (a, b, target, atom) + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('repeat_a%d_b%d' % (a, b)) + tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) + return self._add_rule(key, new_name, tree) + + def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): + """Creates a rule that matches atom 0 to (a*n+b)-1 times. + + When target matches n times atom, and target_opt 0 to n-1 times target_opt, + + First we generate target * i followed by target_opt, for i from 0 to a-1 + These match 0 to n*a - 1 times atom + + Then we generate target * a followed by atom * i, for i from 0 to b-1 + These match n*a to n*a + b-1 times atom + + The created rule will not have any shift/reduce conflicts so that it can be used with lalr + + Example rule when a=3, b=4: + + new_rule: target_opt + | target target_opt + | target target target_opt + + | target target target + | target target target atom + | target target target atom atom + | target target target atom atom atom + + """ + key = (a, b, target, atom, "opt") + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) + tree = ST('expansions', [ + ST('expansion', [target]*i + [target_opt]) for i in range(a) + ] + [ + ST('expansion', [target]*a + [atom]*i) for i in range(b) + ]) + return self._add_rule(key, new_name, tree) + + def _generate_repeats(self, rule, mn, mx): + """Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times. + """ + # For a small number of repeats, we can take the naive approach + if mx < REPEAT_BREAK_THRESHOLD: + return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + + # For large repeat values, we break the repetition into sub-rules. + # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``. + # We then use small_factors to split up mn and diff up into values [(a, b), ...] 
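+        # Illustrative arithmetic (the exact pairs are whatever small_factors returns,
+        # assumed here): with SMALL_FACTOR_THRESHOLD=5, mn=26 could decompose as
+        # [(5, 0), (5, 1)]. Starting from a single `rule`, 1*5 + 0 = 5 repeats, then
+        # 5*5 + 1 = 26 repeats, following x(n) = a*x(n-1) + b from _add_repeat_rule.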
+ # This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt + # to generate a complete rule/expression that matches the corresponding number of repeats + mn_target = rule + for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD): + mn_target = self._add_repeat_rule(a, b, mn_target, rule) + if mx == mn: + return mn_target + + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less + diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD) + diff_target = rule # Match rule 1 times + diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times) + for a, b in diff_factors[:-1]: + diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) + diff_target = self._add_repeat_rule(a, b, diff_target, rule) + + a, b = diff_factors[-1] + diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) + + return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) + + def expr(self, rule, op, *args): + if op.value == '?': + empty = ST('expansion', []) + return ST('expansions', [rule, empty]) + elif op.value == '+': + # a : b c+ d + # --> + # a : b _c d + # _c : _c c | c; + return self._add_recurse_rule('plus', rule) + elif op.value == '*': + # a : b c* d + # --> + # a : b _c? d + # _c : _c c | c; + new_name = self._add_recurse_rule('star', rule) + return ST('expansions', [new_name, ST('expansion', [])]) + elif op.value == '~': + if len(args) == 1: + mn = mx = int(args[0]) + else: + mn, mx = map(int, args) + if mx < mn or mn < 0: + raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) + + return self._generate_repeats(rule, mn, mx) + + assert False, op + + def maybe(self, rule): + keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens + + def will_not_get_removed(sym): + if isinstance(sym, NonTerminal): + return not sym.name.startswith('_') + if isinstance(sym, Terminal): + return keep_all_tokens or not sym.filter_out + assert False + + if any(rule.scan_values(will_not_get_removed)): + empty = _EMPTY + else: + empty = ST('expansion', []) + + return ST('expansions', [rule, empty]) + + +class SimplifyRule_Visitor(Visitor): + + @staticmethod + def _flatten(tree): + while tree.expand_kids_by_data(tree.data): + pass + + def expansion(self, tree): + # rules_list unpacking + # a : b (c|d) e + # --> + # a : b c e | b d e + # + # In AST terms: + # expansion(b, expansions(c, d), e) + # --> + # expansions( expansion(b, c, e), expansion(b, d, e) ) + + self._flatten(tree) + + for i, child in enumerate(tree.children): + if isinstance(child, Tree) and child.data == 'expansions': + tree.data = 'expansions' + tree.children = [self.visit(ST('expansion', [option if i == j else other + for j, other in enumerate(tree.children)])) + for option in dedup_list(child.children)] + self._flatten(tree) + break + + def alias(self, tree): + rule, alias_name = tree.children + if rule.data == 'expansions': + aliases = [] + for child in tree.children[0].children: + aliases.append(ST('alias', [child, alias_name])) + tree.data = 'expansions' + tree.children = aliases + + def expansions(self, tree): + self._flatten(tree) + # Ensure all children are unique + if len(set(tree.children)) != len(tree.children): + tree.children = dedup_list(tree.children) # dedup is expensive, so try to minimize its use + + +class RuleTreeToText(Transformer): + def expansions(self, x): + return x + + def expansion(self, symbols): + return symbols, None + + def alias(self, 
x): + (expansion, _alias), alias = x + assert _alias is None, (alias, expansion, '-', _alias) # Double alias not allowed + return expansion, alias.value + + +class PrepareAnonTerminals(Transformer_InPlace): + """Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them""" + + def __init__(self, terminals): + self.terminals = terminals + self.term_set = {td.name for td in self.terminals} + self.term_reverse = {td.pattern: td for td in terminals} + self.i = 0 + self.rule_options = None + + @inline_args + def pattern(self, p): + value = p.value + if p in self.term_reverse and p.flags != self.term_reverse[p].pattern.flags: + raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) + + term_name = None + + if isinstance(p, PatternStr): + try: + # If already defined, use the user-defined terminal name + term_name = self.term_reverse[p].name + except KeyError: + # Try to assign an indicative anon-terminal name + try: + term_name = _TERMINAL_NAMES[value] + except KeyError: + if value and is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set: + term_name = value.upper() + + if term_name in self.term_set: + term_name = None + + elif isinstance(p, PatternRE): + if p in self.term_reverse: # Kind of a weird placement.name + term_name = self.term_reverse[p].name + else: + assert False, p + + if term_name is None: + term_name = '__ANON_%d' % self.i + self.i += 1 + + if term_name not in self.term_set: + assert p not in self.term_reverse + self.term_set.add(term_name) + termdef = TerminalDef(term_name, p) + self.term_reverse[p] = termdef + self.terminals.append(termdef) + + filter_out = False if self.rule_options and self.rule_options.keep_all_tokens else isinstance(p, PatternStr) + + return Terminal(term_name, filter_out=filter_out) + + +class _ReplaceSymbols(Transformer_InPlace): + """Helper for ApplyTemplates""" + + def __init__(self): + self.names = {} + + def value(self, c): + if len(c) == 1 and isinstance(c[0], Token) and c[0].value in self.names: + return self.names[c[0].value] + return self.__default__('value', c, None) + + def template_usage(self, c): + if c[0] in self.names: + return self.__default__('template_usage', [self.names[c[0]].name] + c[1:], None) + return self.__default__('template_usage', c, None) + + +class ApplyTemplates(Transformer_InPlace): + """Apply the templates, creating new rules that represent the used templates""" + + def __init__(self, rule_defs): + self.rule_defs = rule_defs + self.replacer = _ReplaceSymbols() + self.created_templates = set() + + def template_usage(self, c): + name = c[0] + args = c[1:] + result_name = "%s{%s}" % (name, ",".join(a.name for a in args)) + if result_name not in self.created_templates: + self.created_templates.add(result_name) + (_n, params, tree, options) ,= (t for t in self.rule_defs if t[0] == name) + assert len(params) == len(args), args + result_tree = deepcopy(tree) + self.replacer.names = dict(zip(params, args)) + self.replacer.transform(result_tree) + self.rule_defs.append((result_name, [], result_tree, deepcopy(options))) + return NonTerminal(result_name) + + +def _rfind(s, choices): + return max(s.rfind(c) for c in choices) + + +def eval_escaping(s): + w = '' + i = iter(s) + for n in i: + w += n + if n == '\\': + try: + n2 = next(i) + except StopIteration: + raise GrammarError("Literal ended unexpectedly (bad escaping): `%r`" % s) + if n2 == '\\': + w += '\\\\' + elif n2 not in 'Uuxnftr': + w += '\\' + w += n2 + w = w.replace('\\"', 
'"').replace("'", "\\'") + + to_eval = "u'''%s'''" % w + try: + s = literal_eval(to_eval) + except SyntaxError as e: + raise GrammarError(s, e) + + return s + + +def _literal_to_pattern(literal): + v = literal.value + flag_start = _rfind(v, '/"')+1 + assert flag_start > 0 + flags = v[flag_start:] + assert all(f in _RE_FLAGS for f in flags), flags + + if literal.type == 'STRING' and '\n' in v: + raise GrammarError('You cannot put newlines in string literals') + + if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags: + raise GrammarError('You can only use newlines in regular expressions ' + 'with the `x` (verbose) flag') + + v = v[:flag_start] + assert v[0] == v[-1] and v[0] in '"/' + x = v[1:-1] + + s = eval_escaping(x) + + if s == "": + raise GrammarError("Empty terminals are not allowed (%s)" % literal) + + if literal.type == 'STRING': + s = s.replace('\\\\', '\\') + return PatternStr(s, flags, raw=literal.value) + elif literal.type == 'REGEXP': + return PatternRE(s, flags, raw=literal.value) + else: + assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' + + +@inline_args +class PrepareLiterals(Transformer_InPlace): + def literal(self, literal): + return ST('pattern', [_literal_to_pattern(literal)]) + + def range(self, start, end): + assert start.type == end.type == 'STRING' + start = start.value[1:-1] + end = end.value[1:-1] + assert len(eval_escaping(start)) == len(eval_escaping(end)) == 1 + regexp = '[%s-%s]' % (start, end) + return ST('pattern', [PatternRE(regexp)]) + + +def _make_joined_pattern(regexp, flags_set): + # In Python 3.6, a new syntax for flags was introduced, that allows us to restrict the scope + # of flags to a specific regexp group. We are already using it in `lexer.Pattern._get_flags` + # However, for prior Python versions, we still need to use global flags, so we have to make sure + # that there are no flag collisions when we merge several terminals. 
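+    # Rough illustration (assuming the Py36 branch of lexer.Pattern._get_flags): a
+    # case-insensitive terminal joined with a plain one can come out roughly as
+    # '(?:(?i:abc)|def)', each branch carrying its own scoped inline flags; before
+    # Python 3.6 only global flags like (?i) exist, hence the conflict check below.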
+ flags = () + if not Py36: + if len(flags_set) > 1: + raise GrammarError("Lark doesn't support joining terminals with conflicting flags in python <3.6!") + elif len(flags_set) == 1: + flags ,= flags_set + + return PatternRE(regexp, flags) + +class TerminalTreeToPattern(Transformer_NonRecursive): + def pattern(self, ps): + p ,= ps + return p + + def expansion(self, items): + assert items + if len(items) == 1: + return items[0] + + pattern = ''.join(i.to_regexp() for i in items) + return _make_joined_pattern(pattern, {i.flags for i in items}) + + def expansions(self, exps): + if len(exps) == 1: + return exps[0] + + # Do a bit of sorting to make sure that the longest option is returned + # (Python's re module otherwise prefers just 'l' when given (l|ll) and both could match) + exps.sort(key=lambda x: (-x.max_width, -x.min_width, -len(x.value))) + + pattern = '(?:%s)' % ('|'.join(i.to_regexp() for i in exps)) + return _make_joined_pattern(pattern, {i.flags for i in exps}) + + def expr(self, args): + inner, op = args[:2] + if op == '~': + if len(args) == 3: + op = "{%d}" % int(args[2]) + else: + mn, mx = map(int, args[2:]) + if mx < mn: + raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (inner, mn, mx)) + op = "{%d,%d}" % (mn, mx) + else: + assert len(args) == 2 + return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags) + + def maybe(self, expr): + return self.expr(expr + ['?']) + + def alias(self, t): + raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") + + def value(self, v): + return v[0] + + +class PrepareSymbols(Transformer_InPlace): + def value(self, v): + v ,= v + if isinstance(v, Tree): + return v + elif v.type == 'RULE': + return NonTerminal(Str(v.value)) + elif v.type == 'TERMINAL': + return Terminal(Str(v.value), filter_out=v.startswith('_')) + assert False + + +def nr_deepcopy_tree(t): + """Deepcopy tree `t` without recursion""" + return Transformer_NonRecursive(False).transform(t) + + +class Grammar: + def __init__(self, rule_defs, term_defs, ignore): + self.term_defs = term_defs + self.rule_defs = rule_defs + self.ignore = ignore + + def compile(self, start, terminals_to_keep): + # We change the trees in-place (to support huge grammars) + # So deepcopy allows calling compile more than once. + term_defs = [(n, (nr_deepcopy_tree(t), p)) for n, (t, p) in self.term_defs] + rule_defs = [(n, p, nr_deepcopy_tree(t), o) for n, p, t, o in self.rule_defs] + + # =================== + # Compile Terminals + # =================== + + # Convert terminal-trees to strings/regexps + + for name, (term_tree, priority) in term_defs: + if term_tree is None: # Terminal added through %declare + continue + expansions = list(term_tree.find_data('expansion')) + if len(expansions) == 1 and not expansions[0].children: + raise GrammarError("Terminals cannot be empty (%s)" % name) + + transformer = PrepareLiterals() * TerminalTreeToPattern() + terminals = [TerminalDef(name, transformer.transform(term_tree), priority) + for name, (term_tree, priority) in term_defs if term_tree] + + # ================= + # Compile Rules + # ================= + + # 1. Pre-process terminals + anon_tokens_transf = PrepareAnonTerminals(terminals) + transformer = PrepareLiterals() * PrepareSymbols() * anon_tokens_transf # Adds to terminals + + # 2. Inline Templates + + transformer *= ApplyTemplates(rule_defs) + + # 3. 
Convert EBNF to BNF (and apply step 1 & 2) + ebnf_to_bnf = EBNF_to_BNF() + rules = [] + i = 0 + while i < len(rule_defs): # We have to do it like this because rule_defs might grow due to templates + name, params, rule_tree, options = rule_defs[i] + i += 1 + if len(params) != 0: # Dont transform templates + continue + rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None + ebnf_to_bnf.rule_options = rule_options + ebnf_to_bnf.prefix = name + anon_tokens_transf.rule_options = rule_options + tree = transformer.transform(rule_tree) + res = ebnf_to_bnf.transform(tree) + rules.append((name, res, options)) + rules += ebnf_to_bnf.new_rules + + assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision" + + # 4. Compile tree to Rule objects + rule_tree_to_text = RuleTreeToText() + + simplify_rule = SimplifyRule_Visitor() + compiled_rules = [] + for rule_content in rules: + name, tree, options = rule_content + simplify_rule.visit(tree) + expansions = rule_tree_to_text.transform(tree) + + for i, (expansion, alias) in enumerate(expansions): + if alias and name.startswith('_'): + raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)"% (name, alias)) + + empty_indices = [x==_EMPTY for x in expansion] + if any(empty_indices): + exp_options = copy(options) or RuleOptions() + exp_options.empty_indices = empty_indices + expansion = [x for x in expansion if x!=_EMPTY] + else: + exp_options = options + + for sym in expansion: + assert isinstance(sym, Symbol) + if sym.is_term and exp_options and exp_options.keep_all_tokens: + sym.filter_out = False + rule = Rule(NonTerminal(name), expansion, i, alias, exp_options) + compiled_rules.append(rule) + + # Remove duplicates of empty rules, throw error for non-empty duplicates + if len(set(compiled_rules)) != len(compiled_rules): + duplicates = classify(compiled_rules, lambda x: x) + for dups in duplicates.values(): + if len(dups) > 1: + if dups[0].expansion: + raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" + % ''.join('\n * %s' % i for i in dups)) + + # Empty rule; assert all other attributes are equal + assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups) + + # Remove duplicates + compiled_rules = list(set(compiled_rules)) + + # Filter out unused rules + while True: + c = len(compiled_rules) + used_rules = {s for r in compiled_rules + for s in r.expansion + if isinstance(s, NonTerminal) + and s != r.origin} + used_rules |= {NonTerminal(s) for s in start} + compiled_rules, unused = classify_bool(compiled_rules, lambda r: r.origin in used_rules) + for r in unused: + logger.debug("Unused rule: %s", r) + if len(compiled_rules) == c: + break + + # Filter out unused terminals + if terminals_to_keep != '*': + used_terms = {t.name for r in compiled_rules + for t in r.expansion + if isinstance(t, Terminal)} + terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep) + if unused: + logger.debug("Unused terminals: %s", [t.name for t in unused]) + + return terminals, compiled_rules, self.ignore + + +PackageResource = namedtuple('PackageResource', 'pkg_name path') + + +class FromPackageLoader(object): + """ + Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`. 
+ This allows them to be compatible even from within zip files. + + Relative imports are handled, so you can just freely use them. + + pkg_name: The name of the package. You can probably provide `__name__` most of the time + search_paths: All the path that will be search on absolute imports. + """ + def __init__(self, pkg_name, search_paths=("", )): + self.pkg_name = pkg_name + self.search_paths = search_paths + + def __repr__(self): + return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths) + + def __call__(self, base_path, grammar_path): + if base_path is None: + to_try = self.search_paths + else: + # Check whether or not the importing grammar was loaded by this module. + if not isinstance(base_path, PackageResource) or base_path.pkg_name != self.pkg_name: + # Technically false, but FileNotFound doesn't exist in python2.7, and this message should never reach the end user anyway + raise IOError() + to_try = [base_path.path] + for path in to_try: + full_path = os.path.join(path, grammar_path) + try: + text = pkgutil.get_data(self.pkg_name, full_path) + except IOError: + continue + else: + return PackageResource(self.pkg_name, full_path), text.decode() + raise IOError() + + +stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS) + + + +def resolve_term_references(term_dict): + # TODO Solve with transitive closure (maybe) + + while True: + changed = False + for name, token_tree in term_dict.items(): + if token_tree is None: # Terminal added through %declare + continue + for exp in token_tree.find_data('value'): + item ,= exp.children + if isinstance(item, Token): + if item.type == 'RULE': + raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name)) + if item.type == 'TERMINAL': + try: + term_value = term_dict[item] + except KeyError: + raise GrammarError("Terminal used but not defined: %s" % item) + assert term_value is not None + exp.children[0] = term_value + changed = True + if not changed: + break + + for name, term in term_dict.items(): + if term: # Not just declared + for child in term.children: + ids = [id(x) for x in child.iter_subtrees()] + if id(term) in ids: + raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name) + + +def options_from_rule(name, params, *x): + if len(x) > 1: + priority, expansions = x + priority = int(priority) + else: + expansions ,= x + priority = None + params = [t.value for t in params.children] if params is not None else [] # For the grammar parser + + keep_all_tokens = name.startswith('!') + name = name.lstrip('!') + expand1 = name.startswith('?') + name = name.lstrip('?') + + return name, params, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority, + template_source=(name if params else None)) + + +def symbols_from_strcase(expansion): + return [Terminal(x, filter_out=x.startswith('_')) if x.isupper() else NonTerminal(x) for x in expansion] + + +@inline_args +class PrepareGrammar(Transformer_InPlace): + def terminal(self, name): + return name + + def nonterminal(self, name): + return name + + +def _find_used_symbols(tree): + assert tree.data == 'expansions' + return {t for x in tree.find_data('expansion') + for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} + + +def _get_parser(): + try: + return _get_parser.cache + except AttributeError: + terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] + + rules = [options_from_rule(name, None, x) for name, x in RULES.items()] + rules = 
[Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) + for r, _p, xs, o in rules for i, x in enumerate(xs)] + callback = ParseTreeBuilder(rules, ST).create_callback() + import re + lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT', 'BACKSLASH']) + parser_conf = ParserConf(rules, callback, ['start']) + lexer_conf.lexer_type = 'standard' + parser_conf.parser_type = 'lalr' + _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None) + return _get_parser.cache + +GRAMMAR_ERRORS = [ + ('Incorrect type of value', ['a: 1\n']), + ('Unclosed parenthesis', ['a: (\n']), + ('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']), + ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']), + ('Illegal name for rules or terminals', ['Aa:\n']), + ('Alias expects lowercase name', ['a: -> "a"\n']), + ('Unexpected colon', ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n']), + ('Misplaced operator', ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n']), + ('Expecting option ("|") or a new rule or terminal definition', ['a:a\n()\n']), + ('Terminal names cannot contain dots', ['A.B\n']), + ('Expecting rule or terminal definition', ['"a"\n']), + ('%import expects a name', ['%import "a"\n']), + ('%ignore expects a value', ['%ignore %import\n']), + ] + +def _translate_parser_exception(parse, e): + error = e.match_examples(parse, GRAMMAR_ERRORS, use_accepts=True) + if error: + return error + elif 'STRING' in e.expected: + return "Expecting a value" + +def _parse_grammar(text, name, start='start'): + try: + tree = _get_parser().parse(text + '\n', start) + except UnexpectedCharacters as e: + context = e.get_context(text) + raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" % + (e.line, e.column, name, context)) + except UnexpectedToken as e: + context = e.get_context(text) + error = _translate_parser_exception(_get_parser().parse, e) + if error: + raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context)) + raise + + return PrepareGrammar().transform(tree) + + +def _error_repr(error): + if isinstance(error, UnexpectedToken): + error2 = _translate_parser_exception(_get_parser().parse, error) + if error2: + return error2 + expected = ', '.join(error.accepts or error.expected) + return "Unexpected token %r. 
Expected one of: {%s}" % (str(error.token), expected) + else: + return str(error) + +def _search_interactive_parser(interactive_parser, predicate): + def expand(node): + path, p = node + for choice in p.choices(): + t = Token(choice, '') + try: + new_p = p.feed_token(t) + except ParseError: # Illegal + pass + else: + yield path + (choice,), new_p + + for path, p in bfs_all_unique([((), interactive_parser)], expand): + if predicate(p): + return path, p + +def find_grammar_errors(text, start='start'): + errors = [] + def on_error(e): + errors.append((e, _error_repr(e))) + + # recover to a new line + token_path, _ = _search_interactive_parser(e.interactive_parser.as_immutable(), lambda p: '_NL' in p.choices()) + for token_type in token_path: + e.interactive_parser.feed_token(Token(token_type, '')) + e.interactive_parser.feed_token(Token('_NL', '\n')) + return True + + _tree = _get_parser().parse(text + '\n', start, on_error=on_error) + + errors_by_line = classify(errors, lambda e: e[0].line) + errors = [el[0] for el in errors_by_line.values()] # already sorted + + for e in errors: + e[0].interactive_parser = None + return errors + + +def _get_mangle(prefix, aliases, base_mangle=None): + def mangle(s): + if s in aliases: + s = aliases[s] + else: + if s[0] == '_': + s = '_%s__%s' % (prefix, s[1:]) + else: + s = '%s__%s' % (prefix, s) + if base_mangle is not None: + s = base_mangle(s) + return s + return mangle + +def _mangle_exp(exp, mangle): + if mangle is None: + return exp + exp = deepcopy(exp) # TODO: is this needed + for t in exp.iter_subtrees(): + for i, c in enumerate(t.children): + if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'): + t.children[i] = Token(c.type, mangle(c.value)) + return exp + + + +class GrammarBuilder: + def __init__(self, global_keep_all_tokens=False, import_paths=None, used_files=None): + self.global_keep_all_tokens = global_keep_all_tokens + self.import_paths = import_paths or [] + self.used_files = used_files or {} + + self._definitions = {} + self._ignore_names = [] + + def _is_term(self, name): + # Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME` + # Only the last part is the actual name, and the rest might contain mixed case + return name.rpartition('__')[-1].isupper() + + def _grammar_error(self, msg, *names): + args = {} + for i, name in enumerate(names, start=1): + postfix = '' if i == 1 else str(i) + args['name' + postfix] = name + args['type' + postfix] = lowercase_type = ("rule", "terminal")[self._is_term(name)] + args['Type' + postfix] = lowercase_type.title() + raise GrammarError(msg.format(**args)) + + def _check_options(self, name, options): + if self._is_term(name): + if options is None: + options = 1 + # if we don't use Integral here, we run into python2.7/python3 problems with long vs int + elif not isinstance(options, Integral): + raise GrammarError("Terminal require a single int as 'options' (e.g. 
priority), got %s" % (type(options),)) + else: + if options is None: + options = RuleOptions() + elif not isinstance(options, RuleOptions): + raise GrammarError("Rules require a RuleOptions instance as 'options'") + if self.global_keep_all_tokens: + options.keep_all_tokens = True + return options + + + def _define(self, name, exp, params=(), options=None, override=False): + if name in self._definitions: + if not override: + self._grammar_error("{Type} '{name}' defined more than once", name) + elif override: + self._grammar_error("Cannot override a nonexisting {type} {name}", name) + + if name.startswith('__'): + self._grammar_error('Names starting with double-underscore are reserved (Error at {name})', name) + + self._definitions[name] = (params, exp, self._check_options(name, options)) + + def _extend(self, name, exp, params=(), options=None): + if name not in self._definitions: + self._grammar_error("Can't extend {type} {name} as it wasn't defined before", name) + if tuple(params) != tuple(self._definitions[name][0]): + self._grammar_error("Cannot extend {type} with different parameters: {name}", name) + # TODO: think about what to do with 'options' + base = self._definitions[name][1] + + assert isinstance(base, Tree) and base.data == 'expansions' + base.children.insert(0, exp) + + def _ignore(self, exp_or_name): + if isinstance(exp_or_name, str): + self._ignore_names.append(exp_or_name) + else: + assert isinstance(exp_or_name, Tree) + t = exp_or_name + if t.data == 'expansions' and len(t.children) == 1: + t2 ,= t.children + if t2.data=='expansion' and len(t2.children) == 1: + item ,= t2.children + if item.data == 'value': + item ,= item.children + if isinstance(item, Token) and item.type == 'TERMINAL': + self._ignore_names.append(item.value) + return + + name = '__IGNORE_%d'% len(self._ignore_names) + self._ignore_names.append(name) + self._definitions[name] = ((), t, 1) + + def _declare(self, *names): + for name in names: + self._define(name, None) + + def _unpack_import(self, stmt, grammar_name): + if len(stmt.children) > 1: + path_node, arg1 = stmt.children + else: + path_node, = stmt.children + arg1 = None + + if isinstance(arg1, Tree): # Multi import + dotted_path = tuple(path_node.children) + names = arg1.children + aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names + else: # Single import + dotted_path = tuple(path_node.children[:-1]) + if not dotted_path: + name ,= path_node.children + raise GrammarError("Nothing was imported from grammar `%s`" % name) + name = path_node.children[-1] # Get name from dotted path + aliases = {name.value: (arg1 or name).value} # Aliases if exist + + if path_node.data == 'import_lib': # Import from library + base_path = None + else: # Relative import + if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script + try: + base_file = os.path.abspath(sys.modules['__main__'].__file__) + except AttributeError: + base_file = None + else: + base_file = grammar_name # Import relative to grammar file path if external grammar file + if base_file: + if isinstance(base_file, PackageResource): + base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0]) + else: + base_path = os.path.split(base_file)[0] + else: + base_path = os.path.abspath(os.path.curdir) + + return dotted_path, base_path, aliases + + def _unpack_definition(self, tree, mangle): + if tree.data == 'rule': + name, params, exp, opts = options_from_rule(*tree.children) + else: + 
name = tree.children[0].value + params = () # TODO terminal templates + opts = int(tree.children[1]) if len(tree.children) == 3 else 1 # priority + exp = tree.children[-1] + + if mangle is not None: + params = tuple(mangle(p) for p in params) + name = mangle(name) + + exp = _mangle_exp(exp, mangle) + return name, exp, params, opts + + + def load_grammar(self, grammar_text, grammar_name="<?>", mangle=None): + tree = _parse_grammar(grammar_text, grammar_name) + + imports = {} + for stmt in tree.children: + if stmt.data == 'import': + dotted_path, base_path, aliases = self._unpack_import(stmt, grammar_name) + try: + import_base_path, import_aliases = imports[dotted_path] + assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path) + import_aliases.update(aliases) + except KeyError: + imports[dotted_path] = base_path, aliases + + for dotted_path, (base_path, aliases) in imports.items(): + self.do_import(dotted_path, base_path, aliases, mangle) + + for stmt in tree.children: + if stmt.data in ('term', 'rule'): + self._define(*self._unpack_definition(stmt, mangle)) + elif stmt.data == 'override': + r ,= stmt.children + self._define(*self._unpack_definition(r, mangle), override=True) + elif stmt.data == 'extend': + r ,= stmt.children + self._extend(*self._unpack_definition(r, mangle)) + elif stmt.data == 'ignore': + # if mangle is not None, we shouldn't apply ignore, since we aren't in a toplevel grammar + if mangle is None: + self._ignore(*stmt.children) + elif stmt.data == 'declare': + names = [t.value for t in stmt.children] + if mangle is None: + self._declare(*names) + else: + self._declare(*map(mangle, names)) + elif stmt.data == 'import': + pass + else: + assert False, stmt + + + term_defs = { name: exp + for name, (_params, exp, _options) in self._definitions.items() + if self._is_term(name) + } + resolve_term_references(term_defs) + + + def _remove_unused(self, used): + def rule_dependencies(symbol): + if self._is_term(symbol): + return [] + try: + params, tree,_ = self._definitions[symbol] + except KeyError: + return [] + return _find_used_symbols(tree) - set(params) + + _used = set(bfs(used, rule_dependencies)) + self._definitions = {k: v for k, v in self._definitions.items() if k in _used} + + + def do_import(self, dotted_path, base_path, aliases, base_mangle=None): + assert dotted_path + mangle = _get_mangle('__'.join(dotted_path), aliases, base_mangle) + grammar_path = os.path.join(*dotted_path) + EXT + to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader] + for source in to_try: + try: + if callable(source): + joined_path, text = source(base_path, grammar_path) + else: + joined_path = os.path.join(source, grammar_path) + with open(joined_path, encoding='utf8') as f: + text = f.read() + except IOError: + continue + else: + h = hashlib.md5(text.encode('utf8')).hexdigest() + if self.used_files.get(joined_path, h) != h: + raise RuntimeError("Grammar file was changed during importing") + self.used_files[joined_path] = h + + gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths, self.used_files) + gb.load_grammar(text, joined_path, mangle) + gb._remove_unused(map(mangle, aliases)) + for name in gb._definitions: + if name in self._definitions: + raise GrammarError("Cannot import '%s' from '%s': Symbol already defined." % (name, grammar_path)) + + self._definitions.update(**gb._definitions) + break + else: + # Search failed. Make Python throw a nice error. 
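+            # Reaching this point means none of import_paths, the importing grammar's
+            # base_path, nor stdlib_loader produced the file. (For example, for
+            # `%import common.WS`, stdlib_loader looks up grammars/common.lark inside
+            # the lark package via pkgutil.get_data.)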
+ open(grammar_path, encoding='utf8') + assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,) + + + def validate(self): + for name, (params, exp, _options) in self._definitions.items(): + for i, p in enumerate(params): + if p in self._definitions: + raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name)) + if p in params[:i]: + raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name)) + + if exp is None: # Remaining checks don't apply to abstract rules/terminals + continue + + for temp in exp.find_data('template_usage'): + sym = temp.children[0] + args = temp.children[1:] + if sym not in params: + if sym not in self._definitions: + self._grammar_error("Template '%s' used but not defined (in {type} {name})" % sym, name) + if len(args) != len(self._definitions[sym][0]): + expected, actual = len(self._definitions[sym][0]), len(args) + self._grammar_error("Wrong number of template arguments used for {name} " + "(expected %s, got %s) (in {type2} {name2})" % (expected, actual), sym, name) + + for sym in _find_used_symbols(exp): + if sym not in self._definitions and sym not in params: + self._grammar_error("{Type} '{name}' used but not defined (in {type2} {name2})", sym, name) + + if not set(self._definitions).issuperset(self._ignore_names): + raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(self._ignore_names) - set(self._definitions))) + + def build(self): + self.validate() + rule_defs = [] + term_defs = [] + for name, (params, exp, options) in self._definitions.items(): + if self._is_term(name): + assert len(params) == 0 + term_defs.append((name, (exp, options))) + else: + rule_defs.append((name, params, exp, options)) + # resolve_term_references(term_defs) + return Grammar(rule_defs, term_defs, self._ignore_names) + + +def verify_used_files(file_hashes): + for path, old in file_hashes.items(): + text = None + if isinstance(path, str) and os.path.exists(path): + with open(path, encoding='utf8') as f: + text = f.read() + elif isinstance(path, PackageResource): + with suppress(IOError): + text = pkgutil.get_data(*path).decode('utf-8') + if text is None: # We don't know how to load the path. ignore it. 
+ continue + + current = hashlib.md5(text.encode()).hexdigest() + if old != current: + logger.info("File %r changed, rebuilding Parser" % path) + return False + return True + +def list_grammar_imports(grammar, import_paths=[]): + "Returns a list of paths to the lark grammars imported by the given grammar (recursively)" + builder = GrammarBuilder(False, import_paths) + builder.load_grammar(grammar, '<string>') + return list(builder.used_files.keys()) + +def load_grammar(grammar, source, import_paths, global_keep_all_tokens): + builder = GrammarBuilder(global_keep_all_tokens, import_paths) + builder.load_grammar(grammar, source) + return builder.build(), builder.used_files diff --git a/.venv/lib/python3.12/site-packages/lark/parse_tree_builder.py b/.venv/lib/python3.12/site-packages/lark/parse_tree_builder.py new file mode 100644 index 00000000..fa526b0c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parse_tree_builder.py @@ -0,0 +1,387 @@ +from .exceptions import GrammarError, ConfigurationError +from .lexer import Token +from .tree import Tree +from .visitors import InlineTransformer # XXX Deprecated +from .visitors import Transformer_InPlace +from .visitors import _vargs_meta, _vargs_meta_inline + +###{standalone +from functools import partial, wraps +from itertools import repeat, product + + +class ExpandSingleChild: + def __init__(self, node_builder): + self.node_builder = node_builder + + def __call__(self, children): + if len(children) == 1: + return children[0] + else: + return self.node_builder(children) + + + +class PropagatePositions: + def __init__(self, node_builder, node_filter=None): + self.node_builder = node_builder + self.node_filter = node_filter + + def __call__(self, children): + res = self.node_builder(children) + + if isinstance(res, Tree): + # Calculate positions while the tree is streaming, according to the rule: + # - nodes start at the start of their first child's container, + # and end at the end of their last child's container. + # Containers are nodes that take up space in text, but have been inlined in the tree. + + res_meta = res.meta + + first_meta = self._pp_get_meta(children) + if first_meta is not None: + if not hasattr(res_meta, 'line'): + # meta was already set, probably because the rule has been inlined (e.g. 
`?rule`) + res_meta.line = getattr(first_meta, 'container_line', first_meta.line) + res_meta.column = getattr(first_meta, 'container_column', first_meta.column) + res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos) + res_meta.empty = False + + res_meta.container_line = getattr(first_meta, 'container_line', first_meta.line) + res_meta.container_column = getattr(first_meta, 'container_column', first_meta.column) + + last_meta = self._pp_get_meta(reversed(children)) + if last_meta is not None: + if not hasattr(res_meta, 'end_line'): + res_meta.end_line = getattr(last_meta, 'container_end_line', last_meta.end_line) + res_meta.end_column = getattr(last_meta, 'container_end_column', last_meta.end_column) + res_meta.end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos) + res_meta.empty = False + + res_meta.container_end_line = getattr(last_meta, 'container_end_line', last_meta.end_line) + res_meta.container_end_column = getattr(last_meta, 'container_end_column', last_meta.end_column) + + return res + + def _pp_get_meta(self, children): + for c in children: + if self.node_filter is not None and not self.node_filter(c): + continue + if isinstance(c, Tree): + if not c.meta.empty: + return c.meta + elif isinstance(c, Token): + return c + +def make_propagate_positions(option): + if callable(option): + return partial(PropagatePositions, node_filter=option) + elif option is True: + return PropagatePositions + elif option is False: + return None + + raise ConfigurationError('Invalid option for propagate_positions: %r' % option) + + +class ChildFilter: + def __init__(self, to_include, append_none, node_builder): + self.node_builder = node_builder + self.to_include = to_include + self.append_none = append_none + + def __call__(self, children): + filtered = [] + + for i, to_expand, add_none in self.to_include: + if add_none: + filtered += [None] * add_none + if to_expand: + filtered += children[i].children + else: + filtered.append(children[i]) + + if self.append_none: + filtered += [None] * self.append_none + + return self.node_builder(filtered) + + +class ChildFilterLALR(ChildFilter): + """Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)""" + + def __call__(self, children): + filtered = [] + for i, to_expand, add_none in self.to_include: + if add_none: + filtered += [None] * add_none + if to_expand: + if filtered: + filtered += children[i].children + else: # Optimize for left-recursion + filtered = children[i].children + else: + filtered.append(children[i]) + + if self.append_none: + filtered += [None] * self.append_none + + return self.node_builder(filtered) + + +class ChildFilterLALR_NoPlaceholders(ChildFilter): + "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" + def __init__(self, to_include, node_builder): + self.node_builder = node_builder + self.to_include = to_include + + def __call__(self, children): + filtered = [] + for i, to_expand in self.to_include: + if to_expand: + if filtered: + filtered += children[i].children + else: # Optimize for left-recursion + filtered = children[i].children + else: + filtered.append(children[i]) + return self.node_builder(filtered) + + +def _should_expand(sym): + return not sym.is_term and sym.name.startswith('_') + + +def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices): + # Prepare empty_indices as: How many Nones to insert at each index? 
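+    # Worked example (illustrative): _empty_indices == [False, True, False] for an
+    # expansion of length 2 gives s == '010' and empty_indices == [0, 1, 0], i.e.
+    # one None placeholder goes between the two kept children and none at either end.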
+ if _empty_indices: + assert _empty_indices.count(False) == len(expansion) + s = ''.join(str(int(b)) for b in _empty_indices) + empty_indices = [len(ones) for ones in s.split('0')] + assert len(empty_indices) == len(expansion)+1, (empty_indices, len(expansion)) + else: + empty_indices = [0] * (len(expansion)+1) + + to_include = [] + nones_to_add = 0 + for i, sym in enumerate(expansion): + nones_to_add += empty_indices[i] + if keep_all_tokens or not (sym.is_term and sym.filter_out): + to_include.append((i, _should_expand(sym), nones_to_add)) + nones_to_add = 0 + + nones_to_add += empty_indices[len(expansion)] + + if _empty_indices or len(to_include) < len(expansion) or any(to_expand for i, to_expand,_ in to_include): + if _empty_indices or ambiguous: + return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include, nones_to_add) + else: + # LALR without placeholders + return partial(ChildFilterLALR_NoPlaceholders, [(i, x) for i,x,_ in to_include]) + + +class AmbiguousExpander: + """Deal with the case where we're expanding children ('_rule') into a parent but the children + are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself + ambiguous with as many copies as their are ambiguous children, and then copy the ambiguous children + into the right parents in the right places, essentially shifting the ambiguity up the tree.""" + def __init__(self, to_expand, tree_class, node_builder): + self.node_builder = node_builder + self.tree_class = tree_class + self.to_expand = to_expand + + def __call__(self, children): + def _is_ambig_tree(t): + return hasattr(t, 'data') and t.data == '_ambig' + + # -- When we're repeatedly expanding ambiguities we can end up with nested ambiguities. + # All children of an _ambig node should be a derivation of that ambig node, hence + # it is safe to assume that if we see an _ambig node nested within an ambig node + # it is safe to simply expand it into the parent _ambig node as an alternative derivation. + ambiguous = [] + for i, child in enumerate(children): + if _is_ambig_tree(child): + if i in self.to_expand: + ambiguous.append(i) + + child.expand_kids_by_data('_ambig') + + if not ambiguous: + return self.node_builder(children) + + expand = [iter(child.children) if i in ambiguous else repeat(child) for i, child in enumerate(children)] + return self.tree_class('_ambig', [self.node_builder(list(f[0])) for f in product(zip(*expand))]) + + +def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens): + to_expand = [i for i, sym in enumerate(expansion) + if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))] + if to_expand: + return partial(AmbiguousExpander, to_expand, tree_class) + + +class AmbiguousIntermediateExpander: + """ + Propagate ambiguous intermediate nodes and their derivations up to the + current rule. + + In general, converts + + rule + _iambig + _inter + someChildren1 + ... + _inter + someChildren2 + ... + someChildren3 + ... + + to + + _ambig + rule + someChildren1 + ... + someChildren3 + ... + rule + someChildren2 + ... + someChildren3 + ... + rule + childrenFromNestedIambigs + ... + someChildren3 + ... + ... + + propagating up any nested '_iambig' nodes along the way. 
+ """ + + def __init__(self, tree_class, node_builder): + self.node_builder = node_builder + self.tree_class = tree_class + + def __call__(self, children): + def _is_iambig_tree(child): + return hasattr(child, 'data') and child.data == '_iambig' + + def _collapse_iambig(children): + """ + Recursively flatten the derivations of the parent of an '_iambig' + node. Returns a list of '_inter' nodes guaranteed not + to contain any nested '_iambig' nodes, or None if children does + not contain an '_iambig' node. + """ + + # Due to the structure of the SPPF, + # an '_iambig' node can only appear as the first child + if children and _is_iambig_tree(children[0]): + iambig_node = children[0] + result = [] + for grandchild in iambig_node.children: + collapsed = _collapse_iambig(grandchild.children) + if collapsed: + for child in collapsed: + child.children += children[1:] + result += collapsed + else: + new_tree = self.tree_class('_inter', grandchild.children + children[1:]) + result.append(new_tree) + return result + + collapsed = _collapse_iambig(children) + if collapsed: + processed_nodes = [self.node_builder(c.children) for c in collapsed] + return self.tree_class('_ambig', processed_nodes) + + return self.node_builder(children) + + +def ptb_inline_args(func): + @wraps(func) + def f(children): + return func(*children) + return f + + +def inplace_transformer(func): + @wraps(func) + def f(children): + # function name in a Transformer is a rule name. + tree = Tree(func.__name__, children) + return func(tree) + return f + + +def apply_visit_wrapper(func, name, wrapper): + if wrapper is _vargs_meta or wrapper is _vargs_meta_inline: + raise NotImplementedError("Meta args not supported for internal transformer") + + @wraps(func) + def f(children): + return wrapper(func, name, children, None) + return f + + +class ParseTreeBuilder: + def __init__(self, rules, tree_class, propagate_positions=False, ambiguous=False, maybe_placeholders=False): + self.tree_class = tree_class + self.propagate_positions = propagate_positions + self.ambiguous = ambiguous + self.maybe_placeholders = maybe_placeholders + + self.rule_builders = list(self._init_builders(rules)) + + def _init_builders(self, rules): + propagate_positions = make_propagate_positions(self.propagate_positions) + + for rule in rules: + options = rule.options + keep_all_tokens = options.keep_all_tokens + expand_single_child = options.expand1 + + wrapper_chain = list(filter(None, [ + (expand_single_child and not rule.alias) and ExpandSingleChild, + maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None), + propagate_positions, + self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), + self.ambiguous and partial(AmbiguousIntermediateExpander, self.tree_class) + ])) + + yield rule, wrapper_chain + + def create_callback(self, transformer=None): + callbacks = {} + + for rule, wrapper_chain in self.rule_builders: + + user_callback_name = rule.alias or rule.options.template_source or rule.origin.name + try: + f = getattr(transformer, user_callback_name) + # XXX InlineTransformer is deprecated! 
+ wrapper = getattr(f, 'visit_wrapper', None) + if wrapper is not None: + f = apply_visit_wrapper(f, user_callback_name, wrapper) + else: + if isinstance(transformer, InlineTransformer): + f = ptb_inline_args(f) + elif isinstance(transformer, Transformer_InPlace): + f = inplace_transformer(f) + except AttributeError: + f = partial(self.tree_class, user_callback_name) + + for w in wrapper_chain: + f = w(f) + + if rule in callbacks: + raise GrammarError("Rule '%s' already exists" % (rule,)) + + callbacks[rule] = f + + return callbacks + +###} diff --git a/.venv/lib/python3.12/site-packages/lark/parser_frontends.py b/.venv/lib/python3.12/site-packages/lark/parser_frontends.py new file mode 100644 index 00000000..0e53dd58 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parser_frontends.py @@ -0,0 +1,239 @@ +from .exceptions import ConfigurationError, GrammarError, assert_config +from .utils import get_regexp_width, Serialize +from .parsers.grammar_analysis import GrammarAnalyzer +from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef +from .parsers import earley, xearley, cyk +from .parsers.lalr_parser import LALR_Parser +from .tree import Tree +from .common import LexerConf, ParserConf +try: + import regex +except ImportError: + regex = None +import re + +###{standalone + +def _wrap_lexer(lexer_class): + future_interface = getattr(lexer_class, '__future_interface__', False) + if future_interface: + return lexer_class + else: + class CustomLexerWrapper(Lexer): + def __init__(self, lexer_conf): + self.lexer = lexer_class(lexer_conf) + def lex(self, lexer_state, parser_state): + return self.lexer.lex(lexer_state.text) + return CustomLexerWrapper + + +class MakeParsingFrontend: + def __init__(self, parser_type, lexer_type): + self.parser_type = parser_type + self.lexer_type = lexer_type + + def __call__(self, lexer_conf, parser_conf, options): + assert isinstance(lexer_conf, LexerConf) + assert isinstance(parser_conf, ParserConf) + parser_conf.parser_type = self.parser_type + lexer_conf.lexer_type = self.lexer_type + return ParsingFrontend(lexer_conf, parser_conf, options) + + def deserialize(self, data, memo, lexer_conf, callbacks, options): + parser_conf = ParserConf.deserialize(data['parser_conf'], memo) + parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug) + parser_conf.callbacks = callbacks + return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser) + + + + +class ParsingFrontend(Serialize): + __serialize_fields__ = 'lexer_conf', 'parser_conf', 'parser', 'options' + + def __init__(self, lexer_conf, parser_conf, options, parser=None): + self.parser_conf = parser_conf + self.lexer_conf = lexer_conf + self.options = options + + # Set-up parser + if parser: # From cache + self.parser = parser + else: + create_parser = { + 'lalr': create_lalr_parser, + 'earley': create_earley_parser, + 'cyk': CYK_FrontEnd, + }[parser_conf.parser_type] + self.parser = create_parser(lexer_conf, parser_conf, options) + + # Set-up lexer + lexer_type = lexer_conf.lexer_type + self.skip_lexer = False + if lexer_type in ('dynamic', 'dynamic_complete'): + assert lexer_conf.postlex is None + self.skip_lexer = True + return + + try: + create_lexer = { + 'standard': create_traditional_lexer, + 'contextual': create_contextual_lexer, + }[lexer_type] + except KeyError: + assert issubclass(lexer_type, Lexer), lexer_type + self.lexer = _wrap_lexer(lexer_type)(lexer_conf) + else: + self.lexer = create_lexer(lexer_conf, self.parser, 
lexer_conf.postlex) + + if lexer_conf.postlex: + self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex) + + def _verify_start(self, start=None): + if start is None: + start_decls = self.parser_conf.start + if len(start_decls) > 1: + raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls) + start ,= start_decls + elif start not in self.parser_conf.start: + raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start)) + return start + + def parse(self, text, start=None, on_error=None): + chosen_start = self._verify_start(start) + stream = text if self.skip_lexer else LexerThread(self.lexer, text) + kw = {} if on_error is None else {'on_error': on_error} + return self.parser.parse(stream, chosen_start, **kw) + + def parse_interactive(self, text=None, start=None): + chosen_start = self._verify_start(start) + if self.parser_conf.parser_type != 'lalr': + raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ") + stream = text if self.skip_lexer else LexerThread(self.lexer, text) + return self.parser.parse_interactive(stream, chosen_start) + + +def get_frontend(parser, lexer): + assert_config(parser, ('lalr', 'earley', 'cyk')) + if not isinstance(lexer, type): # not custom lexer? + expected = { + 'lalr': ('standard', 'contextual'), + 'earley': ('standard', 'dynamic', 'dynamic_complete'), + 'cyk': ('standard', ), + }[parser] + assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser) + + return MakeParsingFrontend(parser, lexer) + + +def _get_lexer_callbacks(transformer, terminals): + result = {} + for terminal in terminals: + callback = getattr(transformer, terminal.name, None) + if callback is not None: + result[terminal.name] = callback + return result + +class PostLexConnector: + def __init__(self, lexer, postlexer): + self.lexer = lexer + self.postlexer = postlexer + + def make_lexer_state(self, text): + return self.lexer.make_lexer_state(text) + + def lex(self, lexer_state, parser_state): + i = self.lexer.lex(lexer_state, parser_state) + return self.postlexer.process(i) + + + +def create_traditional_lexer(lexer_conf, parser, postlex): + return TraditionalLexer(lexer_conf) + +def create_contextual_lexer(lexer_conf, parser, postlex): + states = {idx:list(t.keys()) for idx, t in parser._parse_table.states.items()} + always_accept = postlex.always_accept if postlex else () + return ContextualLexer(lexer_conf, states, always_accept=always_accept) + +def create_lalr_parser(lexer_conf, parser_conf, options=None): + debug = options.debug if options else False + return LALR_Parser(parser_conf, debug=debug) + + +create_earley_parser = NotImplemented +CYK_FrontEnd = NotImplemented +###} + +class EarleyRegexpMatcher: + def __init__(self, lexer_conf): + self.regexps = {} + for t in lexer_conf.terminals: + if t.priority != 1: + raise GrammarError("Dynamic Earley doesn't support weights on terminals", t, t.priority) + regexp = t.pattern.to_regexp() + try: + width = get_regexp_width(regexp)[0] + except ValueError: + raise GrammarError("Bad regexp in token %s: %s" % (t.name, regexp)) + else: + if width == 0: + raise GrammarError("Dynamic Earley doesn't allow zero-width regexps", t) + if lexer_conf.use_bytes: + regexp = regexp.encode('utf-8') + + self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags) + + def match(self, term, text, index=0): + return 
self.regexps[term.name].match(text, index) + + +def create_earley_parser__dynamic(lexer_conf, parser_conf, options=None, **kw): + earley_matcher = EarleyRegexpMatcher(lexer_conf) + return xearley.Parser(parser_conf, earley_matcher.match, ignore=lexer_conf.ignore, **kw) + +def _match_earley_basic(term, token): + return term.name == token.type + +def create_earley_parser__basic(lexer_conf, parser_conf, options, **kw): + return earley.Parser(parser_conf, _match_earley_basic, **kw) + +def create_earley_parser(lexer_conf, parser_conf, options): + resolve_ambiguity = options.ambiguity == 'resolve' + debug = options.debug if options else False + tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None + + extra = {} + if lexer_conf.lexer_type == 'dynamic': + f = create_earley_parser__dynamic + elif lexer_conf.lexer_type == 'dynamic_complete': + extra['complete_lex'] =True + f = create_earley_parser__dynamic + else: + f = create_earley_parser__basic + + return f(lexer_conf, parser_conf, options, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class, **extra) + + + +class CYK_FrontEnd: + def __init__(self, lexer_conf, parser_conf, options=None): + self._analysis = GrammarAnalyzer(parser_conf) + self.parser = cyk.Parser(parser_conf.rules) + + self.callbacks = parser_conf.callbacks + + def parse(self, lexer_thread, start): + tokens = list(lexer_thread.lex(None)) + tree = self.parser.parse(tokens, start) + return self._transform(tree) + + def _transform(self, tree): + subtrees = list(tree.iter_subtrees()) + for subtree in subtrees: + subtree.children = [self._apply_callback(c) if isinstance(c, Tree) else c for c in subtree.children] + + return self._apply_callback(tree) + + def _apply_callback(self, tree): + return self.callbacks[tree.rule](tree.children) diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/__init__.py b/.venv/lib/python3.12/site-packages/lark/parsers/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/__init__.py diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/cyk.py b/.venv/lib/python3.12/site-packages/lark/parsers/cyk.py new file mode 100644 index 00000000..ff0924f2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/cyk.py @@ -0,0 +1,345 @@ +"""This module implements a CYK parser.""" + +# Author: https://github.com/ehudt (2018) +# +# Adapted by Erez + + +from collections import defaultdict +import itertools + +from ..exceptions import ParseError +from ..lexer import Token +from ..tree import Tree +from ..grammar import Terminal as T, NonTerminal as NT, Symbol + +try: + xrange +except NameError: + xrange = range + +def match(t, s): + assert isinstance(t, T) + return t.name == s.type + + +class Rule(object): + """Context-free grammar rule.""" + + def __init__(self, lhs, rhs, weight, alias): + super(Rule, self).__init__() + assert isinstance(lhs, NT), lhs + assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs + self.lhs = lhs + self.rhs = rhs + self.weight = weight + self.alias = alias + + def __str__(self): + return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs)) + + def __repr__(self): + return str(self) + + def __hash__(self): + return hash((self.lhs, tuple(self.rhs))) + + def __eq__(self, other): + return self.lhs == other.lhs and self.rhs == other.rhs + + def __ne__(self, other): + return not (self == other) + + +class Grammar(object): + """Context-free grammar.""" + + def __init__(self, rules): 
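+ # Editor's note (illustrative sketch, not part of the upstream Lark patch):
+ # rules are kept in a frozenset, so two Grammar objects compare equal whenever
+ # they hold the same Rule set, regardless of order. A minimal hand-built
+ # grammar, using the T/NT aliases imported above, might look like:
+ #     g = Grammar([
+ #         Rule(NT('s'), [NT('s'), T('PLUS'), NT('s')], weight=0, alias=None),
+ #         Rule(NT('s'), [T('NUMBER')], weight=0, alias=None),
+ #     ])
+ # to_cnf() further below rewrites such a grammar into Chomsky normal form.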
+ self.rules = frozenset(rules) + + def __eq__(self, other): + return self.rules == other.rules + + def __str__(self): + return '\n' + '\n'.join(sorted(repr(x) for x in self.rules)) + '\n' + + def __repr__(self): + return str(self) + + +# Parse tree data structures +class RuleNode(object): + """A node in the parse tree, which also contains the full rhs rule.""" + + def __init__(self, rule, children, weight=0): + self.rule = rule + self.children = children + self.weight = weight + + def __repr__(self): + return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', '.join(str(x) for x in self.children)) + + + +class Parser(object): + """Parser wrapper.""" + + def __init__(self, rules): + super(Parser, self).__init__() + self.orig_rules = {rule: rule for rule in rules} + rules = [self._to_rule(rule) for rule in rules] + self.grammar = to_cnf(Grammar(rules)) + + def _to_rule(self, lark_rule): + """Converts a lark rule, (lhs, rhs, callback, options), to a Rule.""" + assert isinstance(lark_rule.origin, NT) + assert all(isinstance(x, Symbol) for x in lark_rule.expansion) + return Rule( + lark_rule.origin, lark_rule.expansion, + weight=lark_rule.options.priority if lark_rule.options.priority else 0, + alias=lark_rule) + + def parse(self, tokenized, start): # pylint: disable=invalid-name + """Parses input, which is a list of tokens.""" + assert start + start = NT(start) + + table, trees = _parse(tokenized, self.grammar) + # Check if the parse succeeded. + if all(r.lhs != start for r in table[(0, len(tokenized) - 1)]): + raise ParseError('Parsing failed.') + parse = trees[(0, len(tokenized) - 1)][start] + return self._to_tree(revert_cnf(parse)) + + def _to_tree(self, rule_node): + """Converts a RuleNode parse tree to a lark Tree.""" + orig_rule = self.orig_rules[rule_node.rule.alias] + children = [] + for child in rule_node.children: + if isinstance(child, RuleNode): + children.append(self._to_tree(child)) + else: + assert isinstance(child.name, Token) + children.append(child.name) + t = Tree(orig_rule.origin, children) + t.rule=orig_rule + return t + + +def print_parse(node, indent=0): + if isinstance(node, RuleNode): + print(' ' * (indent * 2) + str(node.rule.lhs)) + for child in node.children: + print_parse(child, indent + 1) + else: + print(' ' * (indent * 2) + str(node.s)) + + +def _parse(s, g): + """Parses sentence 's' using CNF grammar 'g'.""" + # The CYK table. Indexed with a 2-tuple: (start pos, end pos) + table = defaultdict(set) + # Top-level structure is similar to the CYK table. Each cell is a dict from + # rule name to the best (lightest) tree for that rule. 
+ trees = defaultdict(dict) + # Populate base case with existing terminal production rules + for i, w in enumerate(s): + for terminal, rules in g.terminal_rules.items(): + if match(terminal, w): + for rule in rules: + table[(i, i)].add(rule) + if (rule.lhs not in trees[(i, i)] or + rule.weight < trees[(i, i)][rule.lhs].weight): + trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight) + + # Iterate over lengths of sub-sentences + for l in xrange(2, len(s) + 1): + # Iterate over sub-sentences with the given length + for i in xrange(len(s) - l + 1): + # Choose partition of the sub-sentence in [1, l) + for p in xrange(i + 1, i + l): + span1 = (i, p - 1) + span2 = (p, i + l - 1) + for r1, r2 in itertools.product(table[span1], table[span2]): + for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []): + table[(i, i + l - 1)].add(rule) + r1_tree = trees[span1][r1.lhs] + r2_tree = trees[span2][r2.lhs] + rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight + if (rule.lhs not in trees[(i, i + l - 1)] + or rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight): + trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight) + return table, trees + + +# This section implements context-free grammar converter to Chomsky normal form. +# It also implements a conversion of parse trees from its CNF to the original +# grammar. +# Overview: +# Applies the following operations in this order: +# * TERM: Eliminates non-solitary terminals from all rules +# * BIN: Eliminates rules with more than 2 symbols on their right-hand-side. +# * UNIT: Eliminates non-terminal unit rules +# +# The following grammar characteristics aren't featured: +# * Start symbol appears on RHS +# * Empty rules (epsilon rules) + + +class CnfWrapper(object): + """CNF wrapper for grammar. + + Validates that the input grammar is CNF and provides helper data structures. + """ + + def __init__(self, grammar): + super(CnfWrapper, self).__init__() + self.grammar = grammar + self.rules = grammar.rules + self.terminal_rules = defaultdict(list) + self.nonterminal_rules = defaultdict(list) + for r in self.rules: + # Validate that the grammar is CNF and populate auxiliary data structures. 
+ assert isinstance(r.lhs, NT), r + if len(r.rhs) not in [1, 2]: + raise ParseError("CYK doesn't support empty rules") + if len(r.rhs) == 1 and isinstance(r.rhs[0], T): + self.terminal_rules[r.rhs[0]].append(r) + elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs): + self.nonterminal_rules[tuple(r.rhs)].append(r) + else: + assert False, r + + def __eq__(self, other): + return self.grammar == other.grammar + + def __repr__(self): + return repr(self.grammar) + + +class UnitSkipRule(Rule): + """A rule that records NTs that were skipped during transformation.""" + + def __init__(self, lhs, rhs, skipped_rules, weight, alias): + super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias) + self.skipped_rules = skipped_rules + + def __eq__(self, other): + return isinstance(other, type(self)) and self.skipped_rules == other.skipped_rules + + __hash__ = Rule.__hash__ + + +def build_unit_skiprule(unit_rule, target_rule): + skipped_rules = [] + if isinstance(unit_rule, UnitSkipRule): + skipped_rules += unit_rule.skipped_rules + skipped_rules.append(target_rule) + if isinstance(target_rule, UnitSkipRule): + skipped_rules += target_rule.skipped_rules + return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules, + weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias) + + +def get_any_nt_unit_rule(g): + """Returns a non-terminal unit rule from 'g', or None if there is none.""" + for rule in g.rules: + if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT): + return rule + return None + + +def _remove_unit_rule(g, rule): + """Removes 'rule' from 'g' without changing the langugage produced by 'g'.""" + new_rules = [x for x in g.rules if x != rule] + refs = [x for x in g.rules if x.lhs == rule.rhs[0]] + new_rules += [build_unit_skiprule(rule, ref) for ref in refs] + return Grammar(new_rules) + + +def _split(rule): + """Splits a rule whose len(rhs) > 2 into shorter rules.""" + rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs) + rule_name = '__SP_%s' % (rule_str) + '_%d' + yield Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias) + for i in xrange(1, len(rule.rhs) - 2): + yield Rule(NT(rule_name % i), [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split') + yield Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split') + + +def _term(g): + """Applies the TERM rule on 'g' (see top comment).""" + all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)} + t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t} + new_rules = [] + for rule in g.rules: + if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs): + new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs] + new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias)) + new_rules.extend(v for k, v in t_rules.items() if k in rule.rhs) + else: + new_rules.append(rule) + return Grammar(new_rules) + + +def _bin(g): + """Applies the BIN rule to 'g' (see top comment).""" + new_rules = [] + for rule in g.rules: + if len(rule.rhs) > 2: + new_rules += _split(rule) + else: + new_rules.append(rule) + return Grammar(new_rules) + + +def _unit(g): + """Applies the UNIT rule to 'g' (see top comment).""" + nt_unit_rule = get_any_nt_unit_rule(g) + while nt_unit_rule: + g = _remove_unit_rule(g, nt_unit_rule) + nt_unit_rule = get_any_nt_unit_rule(g) + return g + + +def to_cnf(g): + """Creates a CNF grammar from a general context-free grammar 'g'.""" + g = 
_unit(_bin(_term(g))) + return CnfWrapper(g) + + +def unroll_unit_skiprule(lhs, orig_rhs, skipped_rules, children, weight, alias): + if not skipped_rules: + return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight) + else: + weight = weight - skipped_rules[0].weight + return RuleNode( + Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [ + unroll_unit_skiprule(skipped_rules[0].lhs, orig_rhs, + skipped_rules[1:], children, + skipped_rules[0].weight, skipped_rules[0].alias) + ], weight=weight) + + +def revert_cnf(node): + """Reverts a parse tree (RuleNode) to its original non-CNF form (Node).""" + if isinstance(node, T): + return node + # Reverts TERM rule. + if node.rule.lhs.name.startswith('__T_'): + return node.children[0] + else: + children = [] + for child in map(revert_cnf, node.children): + # Reverts BIN rule. + if isinstance(child, RuleNode) and child.rule.lhs.name.startswith('__SP_'): + children += child.children + else: + children.append(child) + # Reverts UNIT rule. + if isinstance(node.rule, UnitSkipRule): + return unroll_unit_skiprule(node.rule.lhs, node.rule.rhs, + node.rule.skipped_rules, children, + node.rule.weight, node.rule.alias) + else: + return RuleNode(node.rule, children) diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/earley.py b/.venv/lib/python3.12/site-packages/lark/parsers/earley.py new file mode 100644 index 00000000..5961d5d5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/earley.py @@ -0,0 +1,326 @@ +"""This module implements an Earley parser. + +The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: + https://www.sciencedirect.com/science/article/pii/S1571066108001497 + +That is probably the best reference for understanding the algorithm here. + +The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format +is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html +""" + +from collections import deque + +from ..tree import Tree +from ..exceptions import UnexpectedEOF, UnexpectedToken +from ..utils import logger +from .grammar_analysis import GrammarAnalyzer +from ..grammar import NonTerminal +from .earley_common import Item, TransitiveItem +from .earley_forest import ForestSumVisitor, SymbolNode, ForestToParseTree + +class Parser: + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, debug=False, tree_class=Tree): + analysis = GrammarAnalyzer(parser_conf) + self.parser_conf = parser_conf + self.resolve_ambiguity = resolve_ambiguity + self.debug = debug + self.tree_class = tree_class + + self.FIRST = analysis.FIRST + self.NULLABLE = analysis.NULLABLE + self.callbacks = parser_conf.callbacks + self.predictions = {} + + ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than + # the slow 'isupper' in is_terminal. + self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } + self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } + + self.forest_sum_visitor = None + for rule in parser_conf.rules: + if rule.origin not in self.predictions: + self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] + + ## Detect if any rules have priorities set. If the user specified priority = "none" then + # the priorities will be stripped from all rules before they reach us, allowing us to + # skip the extra tree walk. 
We'll also skip this if the user just didn't specify priorities + # on any rules. + if self.forest_sum_visitor is None and rule.options.priority is not None: + self.forest_sum_visitor = ForestSumVisitor + + self.term_matcher = term_matcher + + + def predict_and_complete(self, i, to_scan, columns, transitives): + """The core Earley Predictor and Completer. + + At each stage of the input, we handling any completed items (things + that matched on the last cycle) and use those to predict what should + come next in the input stream. The completions and any predicted + non-terminals are recursively processed until we reach a set of, + which can be added to the scan list for the next scanner cycle.""" + # Held Completions (H in E.Scotts paper). + node_cache = {} + held_completions = {} + + column = columns[i] + # R (items) = Ei (column.items) + items = deque(column) + while items: + item = items.pop() # remove an element, A say, from R + + ### The Earley completer + if item.is_complete: ### (item.s == string) + if item.node is None: + label = (item.s, item.start, i) + item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + item.node.add_family(item.s, item.rule, item.start, None, None) + + # create_leo_transitives(item.rule.origin, item.start) + + ###R Joop Leo right recursion Completer + if item.rule.origin in transitives[item.start]: + transitive = transitives[item.start][item.s] + if transitive.previous in transitives[transitive.column]: + root_transitive = transitives[transitive.column][transitive.previous] + else: + root_transitive = transitive + + new_item = Item(transitive.rule, transitive.ptr, transitive.start) + label = (root_transitive.s, root_transitive.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_path(root_transitive, item.node) + if new_item.expect in self.TERMINALS: + # Add (B :: aC.B, h, y) to Q + to_scan.add(new_item) + elif new_item not in column: + # Add (B :: aC.B, h, y) to Ei and R + column.add(new_item) + items.append(new_item) + ###R Regular Earley completer + else: + # Empty has 0 length. If we complete an empty symbol in a particular + # parse step, we need to be able to use that same empty symbol to complete + # any predictions that result, that themselves require empty. Avoids + # infinite recursion on empty symbols. + # held_completions is 'H' in E.Scott's paper. + is_empty_item = item.start == i + if is_empty_item: + held_completions[item.rule.origin] = item.node + + originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] + for originator in originators: + new_item = originator.advance() + label = (new_item.s, originator.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node) + if new_item.expect in self.TERMINALS: + # Add (B :: aC.B, h, y) to Q + to_scan.add(new_item) + elif new_item not in column: + # Add (B :: aC.B, h, y) to Ei and R + column.add(new_item) + items.append(new_item) + + ### The Earley predictor + elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) + new_items = [] + for rule in self.predictions[item.expect]: + new_item = Item(rule, 0, i) + new_items.append(new_item) + + # Process any held completions (H). 
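+ # Editor's note (descriptive, not part of the upstream Lark patch): a held
+ # completion records a symbol completed as empty at this position, so the
+ # predicting item can be advanced over that nullable symbol right away instead
+ # of waiting for a completion that will not fire again within this set.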
+ if item.expect in held_completions: + new_item = item.advance() + label = (new_item.s, item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) + new_items.append(new_item) + + for new_item in new_items: + if new_item.expect in self.TERMINALS: + to_scan.add(new_item) + elif new_item not in column: + column.add(new_item) + items.append(new_item) + + def _parse(self, lexer, columns, to_scan, start_symbol=None): + def is_quasi_complete(item): + if item.is_complete: + return True + + quasi = item.advance() + while not quasi.is_complete: + if quasi.expect not in self.NULLABLE: + return False + if quasi.rule.origin == start_symbol and quasi.expect == start_symbol: + return False + quasi = quasi.advance() + return True + + def create_leo_transitives(origin, start): + visited = set() + to_create = [] + trule = None + previous = None + + ### Recursively walk backwards through the Earley sets until we find the + # first transitive candidate. If this is done continuously, we shouldn't + # have to walk more than 1 hop. + while True: + if origin in transitives[start]: + previous = trule = transitives[start][origin] + break + + is_empty_rule = not self.FIRST[origin] + if is_empty_rule: + break + + candidates = [ candidate for candidate in columns[start] if candidate.expect is not None and origin == candidate.expect ] + if len(candidates) != 1: + break + originator = next(iter(candidates)) + + if originator is None or originator in visited: + break + + visited.add(originator) + if not is_quasi_complete(originator): + break + + trule = originator.advance() + if originator.start != start: + visited.clear() + + to_create.append((origin, start, originator)) + origin = originator.rule.origin + start = originator.start + + # If a suitable Transitive candidate is not found, bail. + if trule is None: + return + + #### Now walk forwards and create Transitive Items in each set we walked through; and link + # each transitive item to the next set forwards. + while to_create: + origin, start, originator = to_create.pop() + titem = None + if previous is not None: + titem = previous.next_titem = TransitiveItem(origin, trule, originator, previous.column) + else: + titem = TransitiveItem(origin, trule, originator, start) + previous = transitives[start][origin] = titem + + + + def scan(i, token, to_scan): + """The core Earley Scanner. + + This is a custom implementation of the scanner that uses the + Lark lexer to match tokens. The scan list is built by the + Earley predictor, based on the previously completed tokens. 
+ This ensures that at each phase of the parse we have a custom + lexer context, allowing for more complex ambiguities.""" + next_to_scan = set() + next_set = set() + columns.append(next_set) + transitives.append({}) + node_cache = {} + + for item in set(to_scan): + if match(item.expect, token): + new_item = item.advance() + label = (new_item.s, new_item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) + + if new_item.expect in self.TERMINALS: + # add (B ::= Aai+1.B, h, y) to Q' + next_to_scan.add(new_item) + else: + # add (B ::= Aa+1.B, h, y) to Ei+1 + next_set.add(new_item) + + if not next_set and not next_to_scan: + expect = {i.expect.name for i in to_scan} + raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan)) + + return next_to_scan + + + # Define parser functions + match = self.term_matcher + + # Cache for nodes & tokens created in a particular parse step. + transitives = [{}] + + ## The main Earley loop. + # Run the Prediction/Completion cycle for any Items in the current Earley set. + # Completions will be added to the SPPF tree, and predictions will be recursively + # processed down to terminals/empty nodes to be added to the scanner for the next + # step. + expects = {i.expect for i in to_scan} + i = 0 + for token in lexer.lex(expects): + self.predict_and_complete(i, to_scan, columns, transitives) + + to_scan = scan(i, token, to_scan) + i += 1 + + expects.clear() + expects |= {i.expect for i in to_scan} + + self.predict_and_complete(i, to_scan, columns, transitives) + + ## Column is now the final column in the parse. + assert i == len(columns)-1 + return to_scan + + def parse(self, lexer, start): + assert start, start + start_symbol = NonTerminal(start) + + columns = [set()] + to_scan = set() # The scan buffer. 'Q' in E.Scott's paper. + + ## Predict for the start_symbol. + # Add predicted items to the first Earley set (for the predictor) if they + # result in a non-terminal, or the scanner if they result in a terminal. + for rule in self.predictions[start_symbol]: + item = Item(rule, 0, 0) + if item.expect in self.TERMINALS: + to_scan.add(item) + else: + columns[0].add(item) + + to_scan = self._parse(lexer, columns, to_scan, start_symbol) + + # If the parse was successful, the start + # symbol should have been completed in the last step of the Earley cycle, and will be in + # this column. Find the item for the start_symbol, which is the root of the SPPF tree. + solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] + if not solutions: + expected_terminals = [t.expect.name for t in to_scan] + raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan)) + + if self.debug: + from .earley_forest import ForestToPyDotVisitor + try: + debug_walker = ForestToPyDotVisitor() + except ImportError: + logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image") + else: + debug_walker.visit(solutions[0], "sppf.png") + + + if len(solutions) > 1: + assert False, 'Earley should not generate multiple start symbol items!' 
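+ # Editor's note (illustrative sketch, not part of the upstream Lark patch):
+ # in normal use this parser is reached through the public API, e.g.
+ #     from lark import Lark
+ #     parser = Lark(grammar_text, parser='earley', ambiguity='explicit')
+ #     tree = parser.parse(text)
+ # where grammar_text and text are placeholder names, ambiguity='explicit'
+ # keeps '_ambig' nodes in the returned tree, and the default
+ # ambiguity='resolve' picks a single derivation by priority.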
+ + if self.tree_class is not None: + # Perform our SPPF -> AST conversion + transformer = ForestToParseTree(self.tree_class, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity) + return transformer.transform(solutions[0]) + + # return the root of the SPPF + return solutions[0] diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/earley_common.py b/.venv/lib/python3.12/site-packages/lark/parsers/earley_common.py new file mode 100644 index 00000000..6bd614ba --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/earley_common.py @@ -0,0 +1,75 @@ +"This module implements an Earley Parser" + +# The parser uses a parse-forest to keep track of derivations and ambiguations. +# When the parse ends successfully, a disambiguation stage resolves all ambiguity +# (right now ambiguity resolution is not developed beyond the needs of lark) +# Afterwards the parse tree is reduced (transformed) according to user callbacks. +# I use the no-recursion version of Transformer, because the tree might be +# deeper than Python's recursion limit (a bit absurd, but that's life) +# +# The algorithm keeps track of each state set, using a corresponding Column instance. +# Column keeps track of new items using NewsList instances. +# +# Author: Erez Shinan (2017) +# Email : erezshin@gmail.com + +from ..grammar import NonTerminal, Terminal + +class Item(object): + "An Earley Item, the atom of the algorithm." + + __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash') + def __init__(self, rule, ptr, start): + self.is_complete = len(rule.expansion) == ptr + self.rule = rule # rule + self.ptr = ptr # ptr + self.start = start # j + self.node = None # w + if self.is_complete: + self.s = rule.origin + self.expect = None + self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None + else: + self.s = (rule, ptr) + self.expect = rule.expansion[ptr] + self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None + self._hash = hash((self.s, self.start)) + + def advance(self): + return Item(self.rule, self.ptr + 1, self.start) + + def __eq__(self, other): + return self is other or (self.s == other.s and self.start == other.start) + + def __hash__(self): + return self._hash + + def __repr__(self): + before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] ) + after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] ) + symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after)) + return '%s (%d)' % (symbol, self.start) + + +class TransitiveItem(Item): + __slots__ = ('recognized', 'reduction', 'column', 'next_titem') + def __init__(self, recognized, trule, originator, start): + super(TransitiveItem, self).__init__(trule.rule, trule.ptr, trule.start) + self.recognized = recognized + self.reduction = originator + self.column = start + self.next_titem = None + self._hash = hash((self.s, self.start, self.recognized)) + + def __eq__(self, other): + if not isinstance(other, TransitiveItem): + return False + return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.recognized == other.recognized) + + def __hash__(self): + return self._hash + + def __repr__(self): + before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] ) + after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] ) + return '{} : {} -> {}* {} ({}, 
{})'.format(self.recognized.name, self.rule.origin.name, ' '.join(before), ' '.join(after), self.column, self.start) diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/earley_forest.py b/.venv/lib/python3.12/site-packages/lark/parsers/earley_forest.py new file mode 100644 index 00000000..f39f4eb1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/earley_forest.py @@ -0,0 +1,790 @@ +""""This module implements an SPPF implementation + +This is used as the primary output mechanism for the Earley parser +in order to store complex ambiguities. + +Full reference and more details is here: +http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ +""" + +from random import randint +from math import isinf +from collections import deque +from operator import attrgetter +from importlib import import_module +from functools import partial + +from ..parse_tree_builder import AmbiguousIntermediateExpander +from ..visitors import Discard +from ..lexer import Token +from ..utils import logger +from ..tree import Tree + +class ForestNode(object): + pass + +class SymbolNode(ForestNode): + """ + A Symbol Node represents a symbol (or Intermediate LR0). + + Symbol nodes are keyed by the symbol (s). For intermediate nodes + s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol + nodes, s will be a string representing the non-terminal origin (i.e. + the left hand side of the rule). + + The children of a Symbol or Intermediate Node will always be Packed Nodes; + with each Packed Node child representing a single derivation of a production. + + Hence a Symbol Node with a single child is unambiguous. + + :ivar s: A Symbol, or a tuple of (rule, ptr) for an intermediate node. + :ivar start: The index of the start of the substring matched by this + symbol (inclusive). + :ivar end: The index of the end of the substring matched by this + symbol (exclusive). + :ivar is_intermediate: True if this node is an intermediate node. + :ivar priority: The priority of the node's symbol. + """ + __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash') + def __init__(self, s, start, end): + self.s = s + self.start = start + self.end = end + self._children = set() + self.paths = set() + self.paths_loaded = False + + ### We use inf here as it can be safely negated without resorting to conditionals, + # unlike None or float('NaN'), and sorts appropriately. 
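+ # Editor's note (descriptive, not part of the upstream Lark patch): the
+ # priority stays at -inf unless a later ForestSumVisitor walk overwrites it;
+ # PackedNode.sort_key then uses that value to order ambiguous children.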
+ self.priority = float('-inf') + self.is_intermediate = isinstance(s, tuple) + self._hash = hash((self.s, self.start, self.end)) + + def add_family(self, lr0, rule, start, left, right): + self._children.add(PackedNode(self, lr0, rule, start, left, right)) + + def add_path(self, transitive, node): + self.paths.add((transitive, node)) + + def load_paths(self): + for transitive, node in self.paths: + if transitive.next_titem is not None: + vn = SymbolNode(transitive.next_titem.s, transitive.next_titem.start, self.end) + vn.add_path(transitive.next_titem, node) + self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn) + else: + self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node) + self.paths_loaded = True + + @property + def is_ambiguous(self): + """Returns True if this node is ambiguous.""" + return len(self.children) > 1 + + @property + def children(self): + """Returns a list of this node's children sorted from greatest to + least priority.""" + if not self.paths_loaded: self.load_paths() + return sorted(self._children, key=attrgetter('sort_key')) + + def __iter__(self): + return iter(self._children) + + def __eq__(self, other): + if not isinstance(other, SymbolNode): + return False + return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.end is other.end) + + def __hash__(self): + return self._hash + + def __repr__(self): + if self.is_intermediate: + rule = self.s[0] + ptr = self.s[1] + before = ( expansion.name for expansion in rule.expansion[:ptr] ) + after = ( expansion.name for expansion in rule.expansion[ptr:] ) + symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after)) + else: + symbol = self.s.name + return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority) + +class PackedNode(ForestNode): + """ + A Packed Node represents a single derivation in a symbol node. + + :ivar rule: The rule associated with this node. + :ivar parent: The parent of this node. + :ivar left: The left child of this node. ``None`` if one does not exist. + :ivar right: The right child of this node. ``None`` if one does not exist. + :ivar priority: The priority of this node. + """ + __slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash') + def __init__(self, parent, s, rule, start, left, right): + self.parent = parent + self.s = s + self.start = start + self.rule = rule + self.left = left + self.right = right + self.priority = float('-inf') + self._hash = hash((self.left, self.right)) + + @property + def is_empty(self): + return self.left is None and self.right is None + + @property + def sort_key(self): + """ + Used to sort PackedNode children of SymbolNodes. + A SymbolNode has multiple PackedNodes if it matched + ambiguously. Hence, we use the sort order to identify + the order in which ambiguous children should be considered. 
+ """ + return self.is_empty, -self.priority, self.rule.order + + @property + def children(self): + """Returns a list of this node's children.""" + return [x for x in [self.left, self.right] if x is not None] + + def __iter__(self): + yield self.left + yield self.right + + def __eq__(self, other): + if not isinstance(other, PackedNode): + return False + return self is other or (self.left == other.left and self.right == other.right) + + def __hash__(self): + return self._hash + + def __repr__(self): + if isinstance(self.s, tuple): + rule = self.s[0] + ptr = self.s[1] + before = ( expansion.name for expansion in rule.expansion[:ptr] ) + after = ( expansion.name for expansion in rule.expansion[ptr:] ) + symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after)) + else: + symbol = self.s.name + return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order) + +class ForestVisitor(object): + """ + An abstract base class for building forest visitors. + + This class performs a controllable depth-first walk of an SPPF. + The visitor will not enter cycles and will backtrack if one is encountered. + Subclasses are notified of cycles through the ``on_cycle`` method. + + Behavior for visit events is defined by overriding the + ``visit*node*`` functions. + + The walk is controlled by the return values of the ``visit*node_in`` + methods. Returning a node(s) will schedule them to be visited. The visitor + will begin to backtrack if no nodes are returned. + + :ivar single_visit: If ``True``, non-Token nodes will only be visited once. + """ + + def __init__(self, single_visit=False): + self.single_visit = single_visit + + def visit_token_node(self, node): + """Called when a ``Token`` is visited. ``Token`` nodes are always leaves.""" + pass + + def visit_symbol_node_in(self, node): + """Called when a symbol node is visited. Nodes that are returned + will be scheduled to be visited. If ``visit_intermediate_node_in`` + is not implemented, this function will be called for intermediate + nodes as well.""" + pass + + def visit_symbol_node_out(self, node): + """Called after all nodes returned from a corresponding ``visit_symbol_node_in`` + call have been visited. If ``visit_intermediate_node_out`` + is not implemented, this function will be called for intermediate + nodes as well.""" + pass + + def visit_packed_node_in(self, node): + """Called when a packed node is visited. Nodes that are returned + will be scheduled to be visited. """ + pass + + def visit_packed_node_out(self, node): + """Called after all nodes returned from a corresponding ``visit_packed_node_in`` + call have been visited.""" + pass + + def on_cycle(self, node, path): + """Called when a cycle is encountered. + + :param node: The node that causes a cycle. + :param path: The list of nodes being visited: nodes that have been + entered but not exited. The first element is the root in a forest + visit, and the last element is the node visited most recently. + ``path`` should be treated as read-only. + """ + pass + + def get_cycle_in_path(self, node, path): + """A utility function for use in ``on_cycle`` to obtain a slice of + ``path`` that only contains the nodes that make up the cycle.""" + index = len(path) - 1 + while id(path[index]) != id(node): + index -= 1 + return path[index:] + + def visit(self, root): + # Visiting is a list of IDs of all symbol/intermediate nodes currently in + # the stack. 
It serves two purposes: to detect when we 'recurse' in and out + # of a symbol/intermediate so that we can process both up and down. Also, + # since the SPPF can have cycles it allows us to detect if we're trying + # to recurse into a node that's already on the stack (infinite recursion). + visiting = set() + + # set of all nodes that have been visited + visited = set() + + # a list of nodes that are currently being visited + # used for the `on_cycle` callback + path = [] + + # We do not use recursion here to walk the Forest due to the limited + # stack size in python. Therefore input_stack is essentially our stack. + input_stack = deque([root]) + + # It is much faster to cache these as locals since they are called + # many times in large parses. + vpno = getattr(self, 'visit_packed_node_out') + vpni = getattr(self, 'visit_packed_node_in') + vsno = getattr(self, 'visit_symbol_node_out') + vsni = getattr(self, 'visit_symbol_node_in') + vino = getattr(self, 'visit_intermediate_node_out', vsno) + vini = getattr(self, 'visit_intermediate_node_in', vsni) + vtn = getattr(self, 'visit_token_node') + oc = getattr(self, 'on_cycle') + + while input_stack: + current = next(reversed(input_stack)) + try: + next_node = next(current) + except StopIteration: + input_stack.pop() + continue + except TypeError: + ### If the current object is not an iterator, pass through to Token/SymbolNode + pass + else: + if next_node is None: + continue + + if id(next_node) in visiting: + oc(next_node, path) + continue + + input_stack.append(next_node) + continue + + if not isinstance(current, ForestNode): + vtn(current) + input_stack.pop() + continue + + current_id = id(current) + if current_id in visiting: + if isinstance(current, PackedNode): + vpno(current) + elif current.is_intermediate: + vino(current) + else: + vsno(current) + input_stack.pop() + path.pop() + visiting.remove(current_id) + visited.add(current_id) + elif self.single_visit and current_id in visited: + input_stack.pop() + else: + visiting.add(current_id) + path.append(current) + if isinstance(current, PackedNode): + next_node = vpni(current) + elif current.is_intermediate: + next_node = vini(current) + else: + next_node = vsni(current) + if next_node is None: + continue + + if not isinstance(next_node, ForestNode) and \ + not isinstance(next_node, Token): + next_node = iter(next_node) + elif id(next_node) in visiting: + oc(next_node, path) + continue + + input_stack.append(next_node) + +class ForestTransformer(ForestVisitor): + """The base class for a bottom-up forest transformation. Most users will + want to use ``TreeForestTransformer`` instead as it has a friendlier + interface and covers most use cases. + + Transformations are applied via inheritance and overriding of the + ``transform*node`` methods. + + ``transform_token_node`` receives a ``Token`` as an argument. + All other methods receive the node that is being transformed and + a list of the results of the transformations of that node's children. + The return value of these methods are the resulting transformations. + + If ``Discard`` is raised in a node's transformation, no data from that node + will be passed to its parent's transformation. 
+ """ + + def __init__(self): + super(ForestTransformer, self).__init__() + # results of transformations + self.data = dict() + # used to track parent nodes + self.node_stack = deque() + + def transform(self, root): + """Perform a transformation on an SPPF.""" + self.node_stack.append('result') + self.data['result'] = [] + self.visit(root) + assert len(self.data['result']) <= 1 + if self.data['result']: + return self.data['result'][0] + + def transform_symbol_node(self, node, data): + """Transform a symbol node.""" + return node + + def transform_intermediate_node(self, node, data): + """Transform an intermediate node.""" + return node + + def transform_packed_node(self, node, data): + """Transform a packed node.""" + return node + + def transform_token_node(self, node): + """Transform a ``Token``.""" + return node + + def visit_symbol_node_in(self, node): + self.node_stack.append(id(node)) + self.data[id(node)] = [] + return node.children + + def visit_packed_node_in(self, node): + self.node_stack.append(id(node)) + self.data[id(node)] = [] + return node.children + + def visit_token_node(self, node): + try: + transformed = self.transform_token_node(node) + except Discard: + pass + else: + self.data[self.node_stack[-1]].append(transformed) + + def visit_symbol_node_out(self, node): + self.node_stack.pop() + try: + transformed = self.transform_symbol_node(node, self.data[id(node)]) + except Discard: + pass + else: + self.data[self.node_stack[-1]].append(transformed) + finally: + del self.data[id(node)] + + def visit_intermediate_node_out(self, node): + self.node_stack.pop() + try: + transformed = self.transform_intermediate_node(node, self.data[id(node)]) + except Discard: + pass + else: + self.data[self.node_stack[-1]].append(transformed) + finally: + del self.data[id(node)] + + def visit_packed_node_out(self, node): + self.node_stack.pop() + try: + transformed = self.transform_packed_node(node, self.data[id(node)]) + except Discard: + pass + else: + self.data[self.node_stack[-1]].append(transformed) + finally: + del self.data[id(node)] + +class ForestSumVisitor(ForestVisitor): + """ + A visitor for prioritizing ambiguous parts of the Forest. + + This visitor is used when support for explicit priorities on + rules is requested (whether normal, or invert). It walks the + forest (or subsets thereof) and cascades properties upwards + from the leaves. + + It would be ideal to do this during parsing, however this would + require processing each Earley item multiple times. That's + a big performance drawback; so running a forest walk is the + lesser of two evils: there can be significantly more Earley + items created during parsing than there are SPPF nodes in the + final tree. + """ + def __init__(self): + super(ForestSumVisitor, self).__init__(single_visit=True) + + def visit_packed_node_in(self, node): + yield node.left + yield node.right + + def visit_symbol_node_in(self, node): + return iter(node.children) + + def visit_packed_node_out(self, node): + priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options.priority else 0 + priority += getattr(node.right, 'priority', 0) + priority += getattr(node.left, 'priority', 0) + node.priority = priority + + def visit_symbol_node_out(self, node): + node.priority = max(child.priority for child in node.children) + +class PackedData(): + """Used in transformationss of packed nodes to distinguish the data + that comes from the left child and the right child. 
+ """ + + class _NoData(): + pass + + NO_DATA = _NoData() + + def __init__(self, node, data): + self.left = self.NO_DATA + self.right = self.NO_DATA + if data: + if node.left is not None: + self.left = data[0] + if len(data) > 1: + self.right = data[1] + else: + self.right = data[0] + +class ForestToParseTree(ForestTransformer): + """Used by the earley parser when ambiguity equals 'resolve' or + 'explicit'. Transforms an SPPF into an (ambiguous) parse tree. + + tree_class: The tree class to use for construction + callbacks: A dictionary of rules to functions that output a tree + prioritizer: A ``ForestVisitor`` that manipulates the priorities of + ForestNodes + resolve_ambiguity: If True, ambiguities will be resolved based on + priorities. Otherwise, `_ambig` nodes will be in the resulting + tree. + use_cache: If True, the results of packed node transformations will be + cached. + """ + + def __init__(self, tree_class=Tree, callbacks=dict(), prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=True): + super(ForestToParseTree, self).__init__() + self.tree_class = tree_class + self.callbacks = callbacks + self.prioritizer = prioritizer + self.resolve_ambiguity = resolve_ambiguity + self._use_cache = use_cache + self._cache = {} + self._on_cycle_retreat = False + self._cycle_node = None + self._successful_visits = set() + + def visit(self, root): + if self.prioritizer: + self.prioritizer.visit(root) + super(ForestToParseTree, self).visit(root) + self._cache = {} + + def on_cycle(self, node, path): + logger.debug("Cycle encountered in the SPPF at node: %s. " + "As infinite ambiguities cannot be represented in a tree, " + "this family of derivations will be discarded.", node) + self._cycle_node = node + self._on_cycle_retreat = True + + def _check_cycle(self, node): + if self._on_cycle_retreat: + if id(node) == id(self._cycle_node) or id(node) in self._successful_visits: + self._cycle_node = None + self._on_cycle_retreat = False + return + raise Discard() + + def _collapse_ambig(self, children): + new_children = [] + for child in children: + if hasattr(child, 'data') and child.data == '_ambig': + new_children += child.children + else: + new_children.append(child) + return new_children + + def _call_rule_func(self, node, data): + # called when transforming children of symbol nodes + # data is a list of trees or tokens that correspond to the + # symbol's rule expansion + return self.callbacks[node.rule](data) + + def _call_ambig_func(self, node, data): + # called when transforming a symbol node + # data is a list of trees where each tree's data is + # equal to the name of the symbol or one of its aliases. 
+ if len(data) > 1: + return self.tree_class('_ambig', data) + elif data: + return data[0] + raise Discard() + + def transform_symbol_node(self, node, data): + if id(node) not in self._successful_visits: + raise Discard() + self._check_cycle(node) + self._successful_visits.remove(id(node)) + data = self._collapse_ambig(data) + return self._call_ambig_func(node, data) + + def transform_intermediate_node(self, node, data): + if id(node) not in self._successful_visits: + raise Discard() + self._check_cycle(node) + self._successful_visits.remove(id(node)) + if len(data) > 1: + children = [self.tree_class('_inter', c) for c in data] + return self.tree_class('_iambig', children) + return data[0] + + def transform_packed_node(self, node, data): + self._check_cycle(node) + if self.resolve_ambiguity and id(node.parent) in self._successful_visits: + raise Discard() + if self._use_cache and id(node) in self._cache: + return self._cache[id(node)] + children = [] + assert len(data) <= 2 + data = PackedData(node, data) + if data.left is not PackedData.NO_DATA: + if node.left.is_intermediate and isinstance(data.left, list): + children += data.left + else: + children.append(data.left) + if data.right is not PackedData.NO_DATA: + children.append(data.right) + if node.parent.is_intermediate: + return self._cache.setdefault(id(node), children) + return self._cache.setdefault(id(node), self._call_rule_func(node, children)) + + def visit_symbol_node_in(self, node): + super(ForestToParseTree, self).visit_symbol_node_in(node) + if self._on_cycle_retreat: + return + return node.children + + def visit_packed_node_in(self, node): + self._on_cycle_retreat = False + to_visit = super(ForestToParseTree, self).visit_packed_node_in(node) + if not self.resolve_ambiguity or id(node.parent) not in self._successful_visits: + if not self._use_cache or id(node) not in self._cache: + return to_visit + + def visit_packed_node_out(self, node): + super(ForestToParseTree, self).visit_packed_node_out(node) + if not self._on_cycle_retreat: + self._successful_visits.add(id(node.parent)) + +def handles_ambiguity(func): + """Decorator for methods of subclasses of ``TreeForestTransformer``. + Denotes that the method should receive a list of transformed derivations.""" + func.handles_ambiguity = True + return func + +class TreeForestTransformer(ForestToParseTree): + """A ``ForestTransformer`` with a tree ``Transformer``-like interface. + By default, it will construct a tree. + + Methods provided via inheritance are called based on the rule/symbol + names of nodes in the forest. + + Methods that act on rules will receive a list of the results of the + transformations of the rule's children. By default, trees and tokens. + + Methods that act on tokens will receive a token. + + Alternatively, methods that act on rules may be annotated with + ``handles_ambiguity``. In this case, the function will receive a list + of all the transformations of all the derivations of the rule. + By default, a list of trees where each tree.data is equal to the + rule name or one of its aliases. + + Non-tree transformations are made possible by override of + ``__default__``, ``__default_token__``, and ``__default_ambig__``. + + .. note:: + + Tree shaping features such as inlined rules and token filtering are + not built into the transformation. Positions are also not + propagated. + + :param tree_class: The tree class to use for construction + :param prioritizer: A ``ForestVisitor`` that manipulates the priorities of + nodes in the SPPF. 
+ :param resolve_ambiguity: If True, ambiguities will be resolved based on + priorities. + :param use_cache: If True, caches the results of some transformations, + potentially improving performance when ``resolve_ambiguity==False``. + Only use if you know what you are doing: i.e. All transformation + functions are pure and referentially transparent. + """ + + def __init__(self, tree_class=Tree, prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=False): + super(TreeForestTransformer, self).__init__(tree_class, dict(), prioritizer, resolve_ambiguity, use_cache) + + def __default__(self, name, data): + """Default operation on tree (for override). + + Returns a tree with name with data as children. + """ + return self.tree_class(name, data) + + def __default_ambig__(self, name, data): + """Default operation on ambiguous rule (for override). + + Wraps data in an '_ambig_' node if it contains more than + one element. + """ + if len(data) > 1: + return self.tree_class('_ambig', data) + elif data: + return data[0] + raise Discard() + + def __default_token__(self, node): + """Default operation on ``Token`` (for override). + + Returns ``node``. + """ + return node + + def transform_token_node(self, node): + return getattr(self, node.type, self.__default_token__)(node) + + def _call_rule_func(self, node, data): + name = node.rule.alias or node.rule.options.template_source or node.rule.origin.name + user_func = getattr(self, name, self.__default__) + if user_func == self.__default__ or hasattr(user_func, 'handles_ambiguity'): + user_func = partial(self.__default__, name) + if not self.resolve_ambiguity: + wrapper = partial(AmbiguousIntermediateExpander, self.tree_class) + user_func = wrapper(user_func) + return user_func(data) + + def _call_ambig_func(self, node, data): + name = node.s.name + user_func = getattr(self, name, self.__default_ambig__) + if user_func == self.__default_ambig__ or not hasattr(user_func, 'handles_ambiguity'): + user_func = partial(self.__default_ambig__, name) + return user_func(data) + +class ForestToPyDotVisitor(ForestVisitor): + """ + A Forest visitor which writes the SPPF to a PNG. + + The SPPF can get really large, really quickly because + of the amount of meta-data it stores, so this is probably + only useful for trivial trees and learning how the SPPF + is structured. 
+ """ + def __init__(self, rankdir="TB"): + super(ForestToPyDotVisitor, self).__init__(single_visit=True) + self.pydot = import_module('pydot') + self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir) + + def visit(self, root, filename): + super(ForestToPyDotVisitor, self).visit(root) + try: + self.graph.write_png(filename) + except FileNotFoundError as e: + logger.error("Could not write png: ", e) + + def visit_token_node(self, node): + graph_node_id = str(id(node)) + graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"')) + graph_node_color = 0x808080 + graph_node_style = "\"filled,rounded\"" + graph_node_shape = "diamond" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + + def visit_packed_node_in(self, node): + graph_node_id = str(id(node)) + graph_node_label = repr(node) + graph_node_color = 0x808080 + graph_node_style = "filled" + graph_node_shape = "diamond" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + yield node.left + yield node.right + + def visit_packed_node_out(self, node): + graph_node_id = str(id(node)) + graph_node = self.graph.get_node(graph_node_id)[0] + for child in [node.left, node.right]: + if child is not None: + child_graph_node_id = str(id(child)) + child_graph_node = self.graph.get_node(child_graph_node_id)[0] + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) + else: + #### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay. + child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890)) + child_graph_node_style = "invis" + child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None") + child_edge_style = "invis" + self.graph.add_node(child_graph_node) + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style)) + + def visit_symbol_node_in(self, node): + graph_node_id = str(id(node)) + graph_node_label = repr(node) + graph_node_color = 0x808080 + graph_node_style = "\"filled\"" + if node.is_intermediate: + graph_node_shape = "ellipse" + else: + graph_node_shape = "rectangle" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + return iter(node.children) + + def visit_symbol_node_out(self, node): + graph_node_id = str(id(node)) + graph_node = self.graph.get_node(graph_node_id)[0] + for child in node.children: + child_graph_node_id = str(id(child)) + child_graph_node = self.graph.get_node(child_graph_node_id)[0] + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/grammar_analysis.py b/.venv/lib/python3.12/site-packages/lark/parsers/grammar_analysis.py new file mode 100644 index 00000000..737cb02a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/grammar_analysis.py @@ -0,0 +1,185 @@ +from collections import Counter, defaultdict + +from ..utils import bfs, fzset, classify +from ..exceptions import GrammarError +from ..grammar import Rule, Terminal, NonTerminal + + +class RulePtr(object): + __slots__ = ('rule', 'index') + + def __init__(self, 
rule, index): + assert isinstance(rule, Rule) + assert index <= len(rule.expansion) + self.rule = rule + self.index = index + + def __repr__(self): + before = [x.name for x in self.rule.expansion[:self.index]] + after = [x.name for x in self.rule.expansion[self.index:]] + return '<%s : %s * %s>' % (self.rule.origin.name, ' '.join(before), ' '.join(after)) + + @property + def next(self): + return self.rule.expansion[self.index] + + def advance(self, sym): + assert self.next == sym + return RulePtr(self.rule, self.index+1) + + @property + def is_satisfied(self): + return self.index == len(self.rule.expansion) + + def __eq__(self, other): + return self.rule == other.rule and self.index == other.index + def __hash__(self): + return hash((self.rule, self.index)) + + +# state generation ensures no duplicate LR0ItemSets +class LR0ItemSet(object): + __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads') + + def __init__(self, kernel, closure): + self.kernel = fzset(kernel) + self.closure = fzset(closure) + self.transitions = {} + self.lookaheads = defaultdict(set) + + def __repr__(self): + return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure])) + + +def update_set(set1, set2): + if not set2 or set1 > set2: + return False + + copy = set(set1) + set1 |= set2 + return set1 != copy + +def calculate_sets(rules): + """Calculate FOLLOW sets. + + Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets""" + symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules} + + # foreach grammar rule X ::= Y(1) ... Y(k) + # if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then + # NULLABLE = NULLABLE union {X} + # for i = 1 to k + # if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then + # FIRST(X) = FIRST(X) union FIRST(Y(i)) + # for j = i+1 to k + # if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then + # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X) + # if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then + # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j)) + # until none of NULLABLE,FIRST,FOLLOW changed in last iteration + + NULLABLE = set() + FIRST = {} + FOLLOW = {} + for sym in symbols: + FIRST[sym]={sym} if sym.is_term else set() + FOLLOW[sym]=set() + + # Calculate NULLABLE and FIRST + changed = True + while changed: + changed = False + + for rule in rules: + if set(rule.expansion) <= NULLABLE: + if update_set(NULLABLE, {rule.origin}): + changed = True + + for i, sym in enumerate(rule.expansion): + if set(rule.expansion[:i]) <= NULLABLE: + if update_set(FIRST[rule.origin], FIRST[sym]): + changed = True + else: + break + + # Calculate FOLLOW + changed = True + while changed: + changed = False + + for rule in rules: + for i, sym in enumerate(rule.expansion): + if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE: + if update_set(FOLLOW[sym], FOLLOW[rule.origin]): + changed = True + + for j in range(i+1, len(rule.expansion)): + if set(rule.expansion[i+1:j]) <= NULLABLE: + if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]): + changed = True + + return FIRST, FOLLOW, NULLABLE + + +class GrammarAnalyzer(object): + def __init__(self, parser_conf, debug=False): + self.debug = debug + + root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')]) + for start in parser_conf.start} + + rules = parser_conf.rules + list(root_rules.values()) + self.rules_by_origin = classify(rules, lambda r: r.origin) + + if len(rules) != len(set(rules)): + duplicates = [item for 
item, count in Counter(rules).items() if count > 1] + raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates)) + + for r in rules: + for sym in r.expansion: + if not (sym.is_term or sym in self.rules_by_origin): + raise GrammarError("Using an undefined rule: %s" % sym) + + self.start_states = {start: self.expand_rule(root_rule.origin) + for start, root_rule in root_rules.items()} + + self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))}) + for start, root_rule in root_rules.items()} + + lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)]) + for start in parser_conf.start} + + lr0_rules = parser_conf.rules + list(lr0_root_rules.values()) + assert(len(lr0_rules) == len(set(lr0_rules))) + + self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin) + + # cache RulePtr(r, 0) in r (no duplicate RulePtr objects) + self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin)) + for start, root_rule in lr0_root_rules.items()} + + self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules) + + def expand_rule(self, source_rule, rules_by_origin=None): + "Returns all init_ptrs accessible by rule (recursive)" + + if rules_by_origin is None: + rules_by_origin = self.rules_by_origin + + init_ptrs = set() + def _expand_rule(rule): + assert not rule.is_term, rule + + for r in rules_by_origin[rule]: + init_ptr = RulePtr(r, 0) + init_ptrs.add(init_ptr) + + if r.expansion: # if not empty rule + new_r = init_ptr.next + if not new_r.is_term: + yield new_r + + for _ in bfs([source_rule], _expand_rule): + pass + + return fzset(init_ptrs) diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/lalr_analysis.py b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_analysis.py new file mode 100644 index 00000000..f6a993b9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_analysis.py @@ -0,0 +1,304 @@ +"""This module builds a LALR(1) transition-table for lalr_parser.py + +For now, shift/reduce conflicts are automatically resolved as shifts. 
+""" + +# Author: Erez Shinan (2017) +# Email : erezshin@gmail.com + +from collections import defaultdict + +from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger +from ..exceptions import GrammarError + +from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet +from ..grammar import Rule + +###{standalone + +class Action: + def __init__(self, name): + self.name = name + def __str__(self): + return self.name + def __repr__(self): + return str(self) + +Shift = Action('Shift') +Reduce = Action('Reduce') + + +class ParseTable: + def __init__(self, states, start_states, end_states): + self.states = states + self.start_states = start_states + self.end_states = end_states + + def serialize(self, memo): + tokens = Enumerator() + rules = Enumerator() + + states = { + state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg)) + for token, (action, arg) in actions.items()} + for state, actions in self.states.items() + } + + return { + 'tokens': tokens.reversed(), + 'states': states, + 'start_states': self.start_states, + 'end_states': self.end_states, + } + + @classmethod + def deserialize(cls, data, memo): + tokens = data['tokens'] + states = { + state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg)) + for token, (action, arg) in actions.items()} + for state, actions in data['states'].items() + } + return cls(states, data['start_states'], data['end_states']) + + +class IntParseTable(ParseTable): + + @classmethod + def from_ParseTable(cls, parse_table): + enum = list(parse_table.states) + state_to_idx = {s:i for i,s in enumerate(enum)} + int_states = {} + + for s, la in parse_table.states.items(): + la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v + for k,v in la.items()} + int_states[ state_to_idx[s] ] = la + + + start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()} + end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()} + return cls(int_states, start_states, end_states) + +###} + + +# digraph and traverse, see The Theory and Practice of Compiler Writing + +# computes F(x) = G(x) union (union { G(y) | x R y }) +# X: nodes +# R: relation (function mapping node -> list of nodes that satisfy the relation) +# G: set valued function +def digraph(X, R, G): + F = {} + S = [] + N = {} + for x in X: + N[x] = 0 + for x in X: + # this is always true for the first iteration, but N[x] may be updated in traverse below + if N[x] == 0: + traverse(x, S, N, X, R, G, F) + return F + +# x: single node +# S: stack +# N: weights +# X: nodes +# R: relation (see above) +# G: set valued function +# F: set valued function we are computing (map of input -> output) +def traverse(x, S, N, X, R, G, F): + S.append(x) + d = len(S) + N[x] = d + F[x] = G[x] + for y in R[x]: + if N[y] == 0: + traverse(y, S, N, X, R, G, F) + n_x = N[x] + assert(n_x > 0) + n_y = N[y] + assert(n_y != 0) + if (n_y > 0) and (n_y < n_x): + N[x] = n_y + F[x].update(F[y]) + if N[x] == d: + f_x = F[x] + while True: + z = S.pop() + N[z] = -1 + F[z] = f_x + if z == x: + break + + +class LALR_Analyzer(GrammarAnalyzer): + def __init__(self, parser_conf, debug=False): + GrammarAnalyzer.__init__(self, parser_conf, debug) + self.nonterminal_transitions = [] + self.directly_reads = defaultdict(set) + self.reads = defaultdict(set) + self.includes = defaultdict(set) + self.lookback = defaultdict(set) + + + def compute_lr0_states(self): + self.lr0_states = set() + # map of kernels to LR0ItemSets + 
cache = {} + + def step(state): + _, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied) + + d = classify(unsat, lambda rp: rp.next) + for sym, rps in d.items(): + kernel = fzset({rp.advance(sym) for rp in rps}) + new_state = cache.get(kernel, None) + if new_state is None: + closure = set(kernel) + for rp in kernel: + if not rp.is_satisfied and not rp.next.is_term: + closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin) + new_state = LR0ItemSet(kernel, closure) + cache[kernel] = new_state + + state.transitions[sym] = new_state + yield new_state + + self.lr0_states.add(state) + + for _ in bfs(self.lr0_start_states.values(), step): + pass + + def compute_reads_relations(self): + # handle start state + for root in self.lr0_start_states.values(): + assert(len(root.kernel) == 1) + for rp in root.kernel: + assert(rp.index == 0) + self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ]) + + for state in self.lr0_states: + seen = set() + for rp in state.closure: + if rp.is_satisfied: + continue + s = rp.next + # if s is a not a nonterminal + if s not in self.lr0_rules_by_origin: + continue + if s in seen: + continue + seen.add(s) + nt = (state, s) + self.nonterminal_transitions.append(nt) + dr = self.directly_reads[nt] + r = self.reads[nt] + next_state = state.transitions[s] + for rp2 in next_state.closure: + if rp2.is_satisfied: + continue + s2 = rp2.next + # if s2 is a terminal + if s2 not in self.lr0_rules_by_origin: + dr.add(s2) + if s2 in self.NULLABLE: + r.add((next_state, s2)) + + def compute_includes_lookback(self): + for nt in self.nonterminal_transitions: + state, nonterminal = nt + includes = [] + lookback = self.lookback[nt] + for rp in state.closure: + if rp.rule.origin != nonterminal: + continue + # traverse the states for rp(.rule) + state2 = state + for i in range(rp.index, len(rp.rule.expansion)): + s = rp.rule.expansion[i] + nt2 = (state2, s) + state2 = state2.transitions[s] + if nt2 not in self.reads: + continue + for j in range(i + 1, len(rp.rule.expansion)): + if not rp.rule.expansion[j] in self.NULLABLE: + break + else: + includes.append(nt2) + # state2 is at the final state for rp.rule + if rp.index == 0: + for rp2 in state2.closure: + if (rp2.rule == rp.rule) and rp2.is_satisfied: + lookback.add((state2, rp2.rule)) + for nt2 in includes: + self.includes[nt2].add(nt) + + def compute_lookaheads(self): + read_sets = digraph(self.nonterminal_transitions, self.reads, self.directly_reads) + follow_sets = digraph(self.nonterminal_transitions, self.includes, read_sets) + + for nt, lookbacks in self.lookback.items(): + for state, rule in lookbacks: + for s in follow_sets[nt]: + state.lookaheads[s].add(rule) + + def compute_lalr1_states(self): + m = {} + reduce_reduce = [] + for state in self.lr0_states: + actions = {} + for la, next_state in state.transitions.items(): + actions[la] = (Shift, next_state.closure) + for la, rules in state.lookaheads.items(): + if len(rules) > 1: + # Try to resolve conflict based on priority + p = [(r.options.priority or 0, r) for r in rules] + p.sort(key=lambda r: r[0], reverse=True) + best, second_best = p[:2] + if best[0] > second_best[0]: + rules = [best[1]] + else: + reduce_reduce.append((state, la, rules)) + if la in actions: + if self.debug: + logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) + logger.warning(' * %s', list(rules)[0]) + else: + actions[la] = (Reduce, list(rules)[0]) + m[state] = { k.name: v for k, v in actions.items() } + + if reduce_reduce: + msgs = [] + for 
state, la, rules in reduce_reduce: + msg = 'Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t- ' + str(r) for r in rules ])) + if self.debug: + msg += '\n collision occurred in state: {%s\n }' % ''.join(['\n\t' + str(x) for x in state.closure]) + msgs.append(msg) + raise GrammarError('\n\n'.join(msgs)) + + states = { k.closure: v for k, v in m.items() } + + # compute end states + end_states = {} + for state in states: + for rp in state: + for start in self.lr0_start_states: + if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied: + assert(not start in end_states) + end_states[start] = state + + _parse_table = ParseTable(states, { start: state.closure for start, state in self.lr0_start_states.items() }, end_states) + + if self.debug: + self.parse_table = _parse_table + else: + self.parse_table = IntParseTable.from_ParseTable(_parse_table) + + def compute_lalr(self): + self.compute_lr0_states() + self.compute_reads_relations() + self.compute_includes_lookback() + self.compute_lookaheads() + self.compute_lalr1_states() diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/lalr_interactive_parser.py b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_interactive_parser.py new file mode 100644 index 00000000..d6780cba --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_interactive_parser.py @@ -0,0 +1,132 @@ +# This module provides a LALR interactive parser, which is used for debugging and error handling + +from copy import copy + +from .. import Token +from ..exceptions import UnexpectedToken + + +class InteractiveParser(object): + """InteractiveParser gives you advanced control over parsing and error handling when parsing with LALR. + + For a simpler interface, see the ``on_error`` argument to ``Lark.parse()``. + """ + def __init__(self, parser, parser_state, lexer_state): + self.parser = parser + self.parser_state = parser_state + self.lexer_state = lexer_state + + def feed_token(self, token): + """Feed the parser with a token, and advance it to the next state, as if it received it from the lexer. + + Note that ``token`` has to be an instance of ``Token``. + """ + return self.parser_state.feed_token(token, token.type == '$END') + + def exhaust_lexer(self): + """Try to feed the rest of the lexer state into the interactive parser. + + Note that this modifies the instance in place and does not feed an '$END' Token""" + for token in self.lexer_state.lex(self.parser_state): + self.parser_state.feed_token(token) + + def feed_eof(self, last_token=None): + """Feed a '$END' Token. Borrows from 'last_token' if given.""" + eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else Token('$END', '', 0, 1, 1) + return self.feed_token(eof) + + + def __copy__(self): + """Create a new interactive parser with a separate state. + + Calls to feed_token() won't affect the old instance, and vice-versa. 
+ """ + return type(self)( + self.parser, + copy(self.parser_state), + copy(self.lexer_state), + ) + + def copy(self): + return copy(self) + + def __eq__(self, other): + if not isinstance(other, InteractiveParser): + return False + + return self.parser_state == other.parser_state and self.lexer_state == other.lexer_state + + def as_immutable(self): + """Convert to an ``ImmutableInteractiveParser``.""" + p = copy(self) + return ImmutableInteractiveParser(p.parser, p.parser_state, p.lexer_state) + + def pretty(self): + """Print the output of ``choices()`` in a way that's easier to read.""" + out = ["Parser choices:"] + for k, v in self.choices().items(): + out.append('\t- %s -> %r' % (k, v)) + out.append('stack size: %s' % len(self.parser_state.state_stack)) + return '\n'.join(out) + + def choices(self): + """Returns a dictionary of token types, matched to their action in the parser. + + Only returns token types that are accepted by the current state. + + Updated by ``feed_token()``. + """ + return self.parser_state.parse_conf.parse_table.states[self.parser_state.position] + + def accepts(self): + """Returns the set of possible tokens that will advance the parser into a new valid state.""" + accepts = set() + for t in self.choices(): + if t.isupper(): # is terminal? + new_cursor = copy(self) + try: + new_cursor.feed_token(Token(t, '')) + except UnexpectedToken: + pass + else: + accepts.add(t) + return accepts + + def resume_parse(self): + """Resume automated parsing from the current state.""" + return self.parser.parse_from_state(self.parser_state) + + + +class ImmutableInteractiveParser(InteractiveParser): + """Same as ``InteractiveParser``, but operations create a new instance instead + of changing it in-place. + """ + + result = None + + def __hash__(self): + return hash((self.parser_state, self.lexer_state)) + + def feed_token(self, token): + c = copy(self) + c.result = InteractiveParser.feed_token(c, token) + return c + + def exhaust_lexer(self): + """Try to feed the rest of the lexer state into the parser. 
+ + Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token""" + cursor = self.as_mutable() + cursor.exhaust_lexer() + return cursor.as_immutable() + + def as_mutable(self): + """Convert to an ``InteractiveParser``.""" + p = copy(self) + return InteractiveParser(p.parser, p.parser_state, p.lexer_state) + + +# Deprecated class names for the interactive parser +ParserPuppet = InteractiveParser +ImmutableParserPuppet = ImmutableInteractiveParser diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py new file mode 100644 index 00000000..d916b463 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py @@ -0,0 +1,200 @@ +"""This module implements a LALR(1) Parser +""" +# Author: Erez Shinan (2017) +# Email : erezshin@gmail.com +from copy import deepcopy, copy +from ..exceptions import UnexpectedInput, UnexpectedToken +from ..lexer import Token +from ..utils import Serialize + +from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable +from .lalr_interactive_parser import InteractiveParser +from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken + +###{standalone + +class LALR_Parser(Serialize): + def __init__(self, parser_conf, debug=False): + analysis = LALR_Analyzer(parser_conf, debug=debug) + analysis.compute_lalr() + callbacks = parser_conf.callbacks + + self._parse_table = analysis.parse_table + self.parser_conf = parser_conf + self.parser = _Parser(analysis.parse_table, callbacks, debug) + + @classmethod + def deserialize(cls, data, memo, callbacks, debug=False): + inst = cls.__new__(cls) + inst._parse_table = IntParseTable.deserialize(data, memo) + inst.parser = _Parser(inst._parse_table, callbacks, debug) + return inst + + def serialize(self, memo): + return self._parse_table.serialize(memo) + + def parse_interactive(self, lexer, start): + return self.parser.parse(lexer, start, start_interactive=True) + + def parse(self, lexer, start, on_error=None): + try: + return self.parser.parse(lexer, start) + except UnexpectedInput as e: + if on_error is None: + raise + + while True: + if isinstance(e, UnexpectedCharacters): + s = e.interactive_parser.lexer_state.state + p = s.line_ctr.char_pos + + if not on_error(e): + raise e + + if isinstance(e, UnexpectedCharacters): + # If user didn't change the character position, then we should + if p == s.line_ctr.char_pos: + s.line_ctr.feed(s.text[p:p+1]) + + try: + return e.interactive_parser.resume_parse() + except UnexpectedToken as e2: + if (isinstance(e, UnexpectedToken) + and e.token.type == e2.token.type == '$END' + and e.interactive_parser == e2.interactive_parser): + # Prevent infinite loop + raise e2 + e = e2 + except UnexpectedCharacters as e2: + e = e2 + + +class ParseConf(object): + __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states' + + def __init__(self, parse_table, callbacks, start): + self.parse_table = parse_table + + self.start_state = self.parse_table.start_states[start] + self.end_state = self.parse_table.end_states[start] + self.states = self.parse_table.states + + self.callbacks = callbacks + self.start = start + + +class ParserState(object): + __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack' + + def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None): + self.parse_conf = parse_conf + self.lexer = lexer + self.state_stack = state_stack or [self.parse_conf.start_state] + 
self.value_stack = value_stack or [] + + @property + def position(self): + return self.state_stack[-1] + + # Necessary for match_examples() to work + def __eq__(self, other): + if not isinstance(other, ParserState): + return NotImplemented + return len(self.state_stack) == len(other.state_stack) and self.position == other.position + + def __copy__(self): + return type(self)( + self.parse_conf, + self.lexer, # XXX copy + copy(self.state_stack), + deepcopy(self.value_stack), + ) + + def copy(self): + return copy(self) + + def feed_token(self, token, is_end=False): + state_stack = self.state_stack + value_stack = self.value_stack + states = self.parse_conf.states + end_state = self.parse_conf.end_state + callbacks = self.parse_conf.callbacks + + while True: + state = state_stack[-1] + try: + action, arg = states[state][token.type] + except KeyError: + expected = {s for s in states[state].keys() if s.isupper()} + raise UnexpectedToken(token, expected, state=self, interactive_parser=None) + + assert arg != end_state + + if action is Shift: + # shift once and return + assert not is_end + state_stack.append(arg) + value_stack.append(token if token.type not in callbacks else callbacks[token.type](token)) + return + else: + # reduce+shift as many times as necessary + rule = arg + size = len(rule.expansion) + if size: + s = value_stack[-size:] + del state_stack[-size:] + del value_stack[-size:] + else: + s = [] + + value = callbacks[rule](s) + + _action, new_state = states[state_stack[-1]][rule.origin.name] + assert _action is Shift + state_stack.append(new_state) + value_stack.append(value) + + if is_end and state_stack[-1] == end_state: + return value_stack[-1] + +class _Parser(object): + def __init__(self, parse_table, callbacks, debug=False): + self.parse_table = parse_table + self.callbacks = callbacks + self.debug = debug + + def parse(self, lexer, start, value_stack=None, state_stack=None, start_interactive=False): + parse_conf = ParseConf(self.parse_table, self.callbacks, start) + parser_state = ParserState(parse_conf, lexer, state_stack, value_stack) + if start_interactive: + return InteractiveParser(self, parser_state, parser_state.lexer) + return self.parse_from_state(parser_state) + + + def parse_from_state(self, state): + # Main LALR-parser loop + try: + token = None + for token in state.lexer.lex(state): + state.feed_token(token) + + end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) + return state.feed_token(end_token, True) + except UnexpectedInput as e: + try: + e.interactive_parser = InteractiveParser(self, state, state.lexer) + except NameError: + pass + raise e + except Exception as e: + if self.debug: + print("") + print("STATE STACK DUMP") + print("----------------") + for i, s in enumerate(state.state_stack): + print('%d)' % i , s) + print("") + + raise +###} + diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/lalr_puppet.py b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_puppet.py new file mode 100644 index 00000000..6ea6d894 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/lalr_puppet.py @@ -0,0 +1,3 @@ +# Deprecated
+
+from .lalr_interactive_parser import ParserPuppet, ImmutableParserPuppet
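A minimal usage sketch for the interactive LALR parser defined above, assuming the lark 0.12 public API as vendored here (``Lark.parse_interactive`` returns the ``InteractiveParser`` from lalr_interactive_parser.py); the grammar and input are illustrative only.

from lark import Lark, Token

grammar = r"""
    start: A B C
    A: "a"
    B: "b"
    C: "c"
    %import common.WS
    %ignore WS
"""

parser = Lark(grammar, parser='lalr')

# parse_interactive() returns an InteractiveParser instead of parsing to completion.
ip = parser.parse_interactive("a b")
ip.exhaust_lexer()              # feed every token the lexer can produce from "a b"
print(ip.accepts())             # terminals that would advance the parser, e.g. {'C'}

ip.feed_token(Token('C', 'c'))  # feed the missing token by hand
tree = ip.feed_eof()            # feed '$END' and obtain the finished parse tree
print(tree.pretty())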
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/lark/parsers/xearley.py b/.venv/lib/python3.12/site-packages/lark/parsers/xearley.py new file mode 100644 index 00000000..335d9ad4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/parsers/xearley.py @@ -0,0 +1,157 @@ +"""This module implements an experimental Earley parser with a dynamic lexer + +The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: + https://www.sciencedirect.com/science/article/pii/S1571066108001497 + +That is probably the best reference for understanding the algorithm here. + +The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format +is better documented here: + http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ + +Instead of running a lexer beforehand, or using a costy char-by-char method, this parser +uses regular expressions by necessity, achieving high-performance while maintaining all of +Earley's power in parsing any CFG. +""" + +from collections import defaultdict + +from ..tree import Tree +from ..exceptions import UnexpectedCharacters +from ..lexer import Token +from ..grammar import Terminal +from .earley import Parser as BaseParser +from .earley_forest import SymbolNode + + +class Parser(BaseParser): + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False, debug=False, tree_class=Tree): + BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity, debug, tree_class) + self.ignore = [Terminal(t) for t in ignore] + self.complete_lex = complete_lex + + def _parse(self, stream, columns, to_scan, start_symbol=None): + + def scan(i, to_scan): + """The core Earley Scanner. + + This is a custom implementation of the scanner that uses the + Lark lexer to match tokens. The scan list is built by the + Earley predictor, based on the previously completed tokens. + This ensures that at each phase of the parse we have a custom + lexer context, allowing for more complex ambiguities.""" + + node_cache = {} + + # 1) Loop the expectations and ask the lexer to match. + # Since regexp is forward looking on the input stream, and we only + # want to process tokens when we hit the point in the stream at which + # they complete, we push all tokens into a buffer (delayed_matches), to + # be held possibly for a later parse step when we reach the point in the + # input stream at which they complete. + for item in set(to_scan): + m = match(item.expect, stream, i) + if m: + t = Token(item.expect.name, m.group(0), i, text_line, text_column) + delayed_matches[m.end()].append( (item, i, t) ) + + if self.complete_lex: + s = m.group(0) + for j in range(1, len(s)): + m = match(item.expect, s[:-j]) + if m: + t = Token(item.expect.name, m.group(0), i, text_line, text_column) + delayed_matches[i+m.end()].append( (item, i, t) ) + + # XXX The following 3 lines were commented out for causing a bug. See issue #768 + # # Remove any items that successfully matched in this pass from the to_scan buffer. + # # This ensures we don't carry over tokens that already matched, if we're ignoring below. + # to_scan.remove(item) + + # 3) Process any ignores. This is typically used for e.g. whitespace. + # We carry over any unmatched items from the to_scan buffer to be matched again after + # the ignore. This should allow us to use ignored symbols in non-terminals to implement + # e.g. mandatory spacing. 
+ for x in self.ignore: + m = match(x, stream, i) + if m: + # Carry over any items still in the scan buffer, to past the end of the ignored items. + delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ]) + + # If we're ignoring up to the end of the file, # carry over the start symbol if it already completed. + delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol]) + + next_to_scan = set() + next_set = set() + columns.append(next_set) + transitives.append({}) + + ## 4) Process Tokens from delayed_matches. + # This is the core of the Earley scanner. Create an SPPF node for each Token, + # and create the symbol node in the SPPF tree. Advance the item that completed, + # and add the resulting new item to either the Earley set (for processing by the + # completer/predictor) or the to_scan buffer for the next parse step. + for item, start, token in delayed_matches[i+1]: + if token is not None: + token.end_line = text_line + token.end_column = text_column + 1 + token.end_pos = i + 1 + + new_item = item.advance() + label = (new_item.s, new_item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) + else: + new_item = item + + if new_item.expect in self.TERMINALS: + # add (B ::= Aai+1.B, h, y) to Q' + next_to_scan.add(new_item) + else: + # add (B ::= Aa+1.B, h, y) to Ei+1 + next_set.add(new_item) + + del delayed_matches[i+1] # No longer needed, so unburden memory + + if not next_set and not delayed_matches and not next_to_scan: + considered_rules = list(sorted(to_scan, key=lambda key: key.rule.origin.name)) + raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, + set(to_scan), state=frozenset(i.s for i in to_scan), + considered_rules=considered_rules + ) + + return next_to_scan + + + delayed_matches = defaultdict(list) + match = self.term_matcher + + # Cache for nodes & tokens created in a particular parse step. + transitives = [{}] + + text_line = 1 + text_column = 1 + + ## The main Earley loop. + # Run the Prediction/Completion cycle for any Items in the current Earley set. + # Completions will be added to the SPPF tree, and predictions will be recursively + # processed down to terminals/empty nodes to be added to the scanner for the next + # step. + i = 0 + for token in stream: + self.predict_and_complete(i, to_scan, columns, transitives) + + to_scan = scan(i, to_scan) + + if token == '\n': + text_line += 1 + text_column = 1 + else: + text_column += 1 + i += 1 + + self.predict_and_complete(i, to_scan, columns, transitives) + + ## Column is now the final column in the parse. 
+ assert i == len(columns)-1 + return to_scan diff --git a/.venv/lib/python3.12/site-packages/lark/reconstruct.py b/.venv/lib/python3.12/site-packages/lark/reconstruct.py new file mode 100644 index 00000000..ab2fb38b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/reconstruct.py @@ -0,0 +1,101 @@ +"""Reconstruct text from a tree, based on Lark grammar""" + +import unicodedata + +from .tree import Tree +from .visitors import Transformer_InPlace +from .lexer import Token, PatternStr +from .grammar import Terminal, NonTerminal + +from .tree_matcher import TreeMatcher, is_discarded_terminal +from .utils import is_id_continue + +def is_iter_empty(i): + try: + _ = next(i) + return False + except StopIteration: + return True + + +class WriteTokensTransformer(Transformer_InPlace): + "Inserts discarded tokens into their correct place, according to the rules of grammar" + + def __init__(self, tokens, term_subs): + self.tokens = tokens + self.term_subs = term_subs + + def __default__(self, data, children, meta): + if not getattr(meta, 'match_tree', False): + return Tree(data, children) + + iter_args = iter(children) + to_write = [] + for sym in meta.orig_expansion: + if is_discarded_terminal(sym): + try: + v = self.term_subs[sym.name](sym) + except KeyError: + t = self.tokens[sym.name] + if not isinstance(t.pattern, PatternStr): + raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t) + + v = t.pattern.value + to_write.append(v) + else: + x = next(iter_args) + if isinstance(x, list): + to_write += x + else: + if isinstance(x, Token): + assert Terminal(x.type) == sym, x + else: + assert NonTerminal(x.data) == sym, (sym, x) + to_write.append(x) + + assert is_iter_empty(iter_args) + return to_write + + +class Reconstructor(TreeMatcher): + """ + A Reconstructor that will, given a full parse Tree, generate source code. + + Note: + The reconstructor cannot generate values from regexps. If you need to produce discarded + regexes, such as newlines, use `term_subs` and provide default values for them. 
+ + Paramters: + parser: a Lark instance + term_subs: a dictionary of [Terminal name as str] to [output text as str] + """ + + def __init__(self, parser, term_subs=None): + TreeMatcher.__init__(self, parser) + + self.write_tokens = WriteTokensTransformer({t.name:t for t in self.tokens}, term_subs or {}) + + def _reconstruct(self, tree): + unreduced_tree = self.match_tree(tree, tree.data) + + res = self.write_tokens.transform(unreduced_tree) + for item in res: + if isinstance(item, Tree): + # TODO use orig_expansion.rulename to support templates + for x in self._reconstruct(item): + yield x + else: + yield item + + def reconstruct(self, tree, postproc=None, insert_spaces=True): + x = self._reconstruct(tree) + if postproc: + x = postproc(x) + y = [] + prev_item = '' + for item in x: + if insert_spaces and prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]): + y.append(' ') + y.append(item) + prev_item = item + return ''.join(y) diff --git a/.venv/lib/python3.12/site-packages/lark/tools/__init__.py b/.venv/lib/python3.12/site-packages/lark/tools/__init__.py new file mode 100644 index 00000000..4ecf13d4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/tools/__init__.py @@ -0,0 +1,65 @@ +import sys +from argparse import ArgumentParser, FileType +try: + from textwrap import indent +except ImportError: + def indent(text, prefix): + return ''.join(prefix + line for line in text.splitlines(True)) +from logging import DEBUG, INFO, WARN, ERROR +import warnings + +from lark import Lark, logger + +lalr_argparser = ArgumentParser(add_help=False, epilog='Look at the Lark documentation for more info on the options') + +flags = [ + ('d', 'debug'), + 'keep_all_tokens', + 'regex', + 'propagate_positions', + 'maybe_placeholders', + 'use_bytes' +] + +options = ['start', 'lexer'] + +lalr_argparser.add_argument('-v', '--verbose', action='count', default=0, help="Increase Logger output level, up to three times") +lalr_argparser.add_argument('-s', '--start', action='append', default=[]) +lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('standard', 'contextual')) +k = {'encoding': 'utf-8'} if sys.version_info > (3, 4) else {} +lalr_argparser.add_argument('-o', '--out', type=FileType('w', **k), default=sys.stdout, help='the output file (default=stdout)') +lalr_argparser.add_argument('grammar_file', type=FileType('r', **k), help='A valid .lark file') + +for f in flags: + if isinstance(f, tuple): + options.append(f[1]) + lalr_argparser.add_argument('-' + f[0], '--' + f[1], action='store_true') + else: + options.append(f) + lalr_argparser.add_argument('--' + f, action='store_true') + + +def build_lalr(namespace): + logger.setLevel((ERROR, WARN, INFO, DEBUG)[min(namespace.verbose, 3)]) + if len(namespace.start) == 0: + namespace.start.append('start') + kwargs = {n: getattr(namespace, n) for n in options} + return Lark(namespace.grammar_file, parser='lalr', **kwargs), namespace.out + + +def showwarning_as_comment(message, category, filename, lineno, file=None, line=None): + # Based on warnings._showwarnmsg_impl + text = warnings.formatwarning(message, category, filename, lineno, line) + text = indent(text, '# ') + if file is None: + file = sys.stderr + if file is None: + return + try: + file.write(text) + except OSError: + pass + + +def make_warnings_comments(): + warnings.showwarning = showwarning_as_comment diff --git a/.venv/lib/python3.12/site-packages/lark/tools/nearley.py b/.venv/lib/python3.12/site-packages/lark/tools/nearley.py new file mode 
100644 index 00000000..f0779dc5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/tools/nearley.py @@ -0,0 +1,201 @@ +"Converts Nearley grammars to Lark" + +import os.path +import sys +import codecs +import argparse + + +from lark import Lark, InlineTransformer + +nearley_grammar = r""" + start: (ruledef|directive)+ + + directive: "@" NAME (STRING|NAME) + | "@" JS -> js_code + ruledef: NAME "->" expansions + | NAME REGEXP "->" expansions -> macro + expansions: expansion ("|" expansion)* + + expansion: expr+ js + + ?expr: item (":" /[+*?]/)? + + ?item: rule|string|regexp|null + | "(" expansions ")" + + rule: NAME + string: STRING + regexp: REGEXP + null: "null" + JS: /{%.*?%}/s + js: JS? + + NAME: /[a-zA-Z_$]\w*/ + COMMENT: /#[^\n]*/ + REGEXP: /\[.*?\]/ + + STRING: _STRING "i"? + + %import common.ESCAPED_STRING -> _STRING + %import common.WS + %ignore WS + %ignore COMMENT + + """ + +nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='standard') + +def _get_rulename(name): + name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name) + return 'n_' + name.replace('$', '__DOLLAR__').lower() + +class NearleyToLark(InlineTransformer): + def __init__(self): + self._count = 0 + self.extra_rules = {} + self.extra_rules_rev = {} + self.alias_js_code = {} + + def _new_function(self, code): + name = 'alias_%d' % self._count + self._count += 1 + + self.alias_js_code[name] = code + return name + + def _extra_rule(self, rule): + if rule in self.extra_rules_rev: + return self.extra_rules_rev[rule] + + name = 'xrule_%d' % len(self.extra_rules) + assert name not in self.extra_rules + self.extra_rules[name] = rule + self.extra_rules_rev[rule] = name + return name + + def rule(self, name): + return _get_rulename(name) + + def ruledef(self, name, exps): + return '!%s: %s' % (_get_rulename(name), exps) + + def expr(self, item, op): + rule = '(%s)%s' % (item, op) + return self._extra_rule(rule) + + def regexp(self, r): + return '/%s/' % r + + def null(self): + return '' + + def string(self, s): + return self._extra_rule(s) + + def expansion(self, *x): + x, js = x[:-1], x[-1] + if js.children: + js_code ,= js.children + js_code = js_code[2:-2] + alias = '-> ' + self._new_function(js_code) + else: + alias = '' + return ' '.join(x) + alias + + def expansions(self, *x): + return '%s' % ('\n |'.join(x)) + + def start(self, *rules): + return '\n'.join(filter(None, rules)) + +def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes): + rule_defs = [] + + tree = nearley_grammar_parser.parse(g) + for statement in tree.children: + if statement.data == 'directive': + directive, arg = statement.children + if directive in ('builtin', 'include'): + folder = builtin_path if directive == 'builtin' else folder_path + path = os.path.join(folder, arg[1:-1]) + if path not in includes: + includes.add(path) + with codecs.open(path, encoding='utf8') as f: + text = f.read() + rule_defs += _nearley_to_lark(text, builtin_path, n2l, js_code, os.path.abspath(os.path.dirname(path)), includes) + else: + assert False, directive + elif statement.data == 'js_code': + code ,= statement.children + code = code[2:-2] + js_code.append(code) + elif statement.data == 'macro': + pass # TODO Add support for macros! 
+ elif statement.data == 'ruledef': + rule_defs.append( n2l.transform(statement) ) + else: + raise Exception("Unknown statement: %s" % statement) + + return rule_defs + + +def create_code_for_nearley_grammar(g, start, builtin_path, folder_path, es6=False): + import js2py + + emit_code = [] + def emit(x=None): + if x: + emit_code.append(x) + emit_code.append('\n') + + js_code = ['function id(x) {return x[0];}'] + n2l = NearleyToLark() + rule_defs = _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, set()) + lark_g = '\n'.join(rule_defs) + lark_g += '\n'+'\n'.join('!%s: %s' % item for item in n2l.extra_rules.items()) + + emit('from lark import Lark, Transformer') + emit() + emit('grammar = ' + repr(lark_g)) + emit() + + for alias, code in n2l.alias_js_code.items(): + js_code.append('%s = (%s);' % (alias, code)) + + if es6: + emit(js2py.translate_js6('\n'.join(js_code))) + else: + emit(js2py.translate_js('\n'.join(js_code))) + emit('class TransformNearley(Transformer):') + for alias in n2l.alias_js_code: + emit(" %s = var.get('%s').to_python()" % (alias, alias)) + emit(" __default__ = lambda self, n, c, m: c if c else None") + + emit() + emit('parser = Lark(grammar, start="n_%s", maybe_placeholders=False)' % start) + emit('def parse(text):') + emit(' return TransformNearley().transform(parser.parse(text))') + + return ''.join(emit_code) + +def main(fn, start, nearley_lib, es6=False): + with codecs.open(fn, encoding='utf8') as f: + grammar = f.read() + return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)), es6=es6) + +def get_arg_parser(): + parser = argparse.ArgumentParser(description='Reads a Nearley grammar (with js functions), and outputs an equivalent lark parser.') + parser.add_argument('nearley_grammar', help='Path to the file containing the nearley grammar') + parser.add_argument('start_rule', help='Rule within the nearley grammar to make the base rule') + parser.add_argument('nearley_lib', help='Path to root directory of nearley codebase (used for including builtins)') + parser.add_argument('--es6', help='Enable experimental ES6 support', action='store_true') + return parser + +if __name__ == '__main__': + parser = get_arg_parser() + if len(sys.argv)==1: + parser.print_help(sys.stderr) + sys.exit(1) + args = parser.parse_args() + print(main(fn=args.nearley_grammar, start=args.start_rule, nearley_lib=args.nearley_lib, es6=args.es6)) diff --git a/.venv/lib/python3.12/site-packages/lark/tools/serialize.py b/.venv/lib/python3.12/site-packages/lark/tools/serialize.py new file mode 100644 index 00000000..61540242 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/tools/serialize.py @@ -0,0 +1,34 @@ +import codecs +import sys +import json + +from lark import Lark +from lark.grammar import RuleOptions, Rule +from lark.lexer import TerminalDef +from lark.tools import lalr_argparser, build_lalr + +import argparse + +argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize', parents=[lalr_argparser], + description="Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file", + epilog='Look at the Lark documentation for more info on the options') + + +def serialize(lark_inst, outfile): + data, memo = lark_inst.memo_serialize([TerminalDef, Rule]) + outfile.write('{\n') + outfile.write(' "data": %s,\n' % json.dumps(data)) + outfile.write(' "memo": %s\n' % json.dumps(memo)) + outfile.write('}\n') + + +def main(): + if len(sys.argv)==1: + 
argparser.print_help(sys.stderr) + sys.exit(1) + ns = argparser.parse_args() + serialize(*build_lalr(ns)) + + +if __name__ == '__main__': + main() diff --git a/.venv/lib/python3.12/site-packages/lark/tools/standalone.py b/.venv/lib/python3.12/site-packages/lark/tools/standalone.py new file mode 100644 index 00000000..c86d7d7a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/tools/standalone.py @@ -0,0 +1,196 @@ +from __future__ import print_function + +###{standalone +# +# +# Lark Stand-alone Generator Tool +# ---------------------------------- +# Generates a stand-alone LALR(1) parser with a standard lexer +# +# Git: https://github.com/erezsh/lark +# Author: Erez Shinan (erezshin@gmail.com) +# +# +# >>> LICENSE +# +# This tool and its generated code use a separate license from Lark, +# and are subject to the terms of the Mozilla Public License, v. 2.0. +# If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# +# If you wish to purchase a commercial license for this tool and its +# generated code, you may contact me via email or otherwise. +# +# If MPL2 is incompatible with your free or open-source project, +# contact me and we'll work it out. +# +# + +from io import open +###} + +import sys +import token, tokenize +import os +from os import path +from collections import defaultdict +from functools import partial +from argparse import ArgumentParser, SUPPRESS +from warnings import warn + +import lark +from lark import Lark +from lark.tools import lalr_argparser, build_lalr, make_warnings_comments + + +from lark.grammar import RuleOptions, Rule +from lark.lexer import TerminalDef + +_dir = path.dirname(__file__) +_larkdir = path.join(_dir, path.pardir) + + +EXTRACT_STANDALONE_FILES = [ + 'tools/standalone.py', + 'exceptions.py', + 'utils.py', + 'tree.py', + 'visitors.py', + 'grammar.py', + 'lexer.py', + 'common.py', + 'parse_tree_builder.py', + 'parsers/lalr_parser.py', + 'parsers/lalr_analysis.py', + 'parser_frontends.py', + 'lark.py', + 'indenter.py', +] + +def extract_sections(lines): + section = None + text = [] + sections = defaultdict(list) + for l in lines: + if l.startswith('###'): + if l[3] == '{': + section = l[4:].strip() + elif l[3] == '}': + sections[section] += text + section = None + text = [] + else: + raise ValueError(l) + elif section: + text.append(l) + + return {name:''.join(text) for name, text in sections.items()} + + +def strip_docstrings(line_gen): + """ Strip comments and docstrings from a file. + Based on code from: https://stackoverflow.com/questions/1769332/script-to-remove-python-comments-docstrings + """ + res = [] + + prev_toktype = token.INDENT + last_lineno = -1 + last_col = 0 + + tokgen = tokenize.generate_tokens(line_gen) + for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen: + if slineno > last_lineno: + last_col = 0 + if scol > last_col: + res.append(" " * (scol - last_col)) + if toktype == token.STRING and prev_toktype == token.INDENT: + # Docstring + res.append("#--") + elif toktype == tokenize.COMMENT: + # Comment + res.append("##\n") + else: + res.append(ttext) + prev_toktype = toktype + last_col = ecol + last_lineno = elineno + + return ''.join(res) + + +def main(fobj, start, print=print): + warn('`lark.tools.standalone.main` is being redesigned. 
Use `gen_standalone`', DeprecationWarning) + lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start) + gen_standalone(lark_inst, print) + +def gen_standalone(lark_inst, output=None, out=sys.stdout, compress=False): + if output is None: + output = partial(print, file=out) + + import pickle, zlib, base64 + def compressed_output(obj): + s = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL) + c = zlib.compress(s) + output(repr(base64.b64encode(c))) + + def output_decompress(name): + output('%(name)s = pickle.loads(zlib.decompress(base64.b64decode(%(name)s)))' % locals()) + + output('# The file was automatically generated by Lark v%s' % lark.__version__) + output('__version__ = "%s"' % lark.__version__) + output() + + for i, pyfile in enumerate(EXTRACT_STANDALONE_FILES): + with open(os.path.join(_larkdir, pyfile)) as f: + code = extract_sections(f)['standalone'] + if i: # if not this file + code = strip_docstrings(partial(next, iter(code.splitlines(True)))) + output(code) + + data, m = lark_inst.memo_serialize([TerminalDef, Rule]) + output('import pickle, zlib, base64') + if compress: + output('DATA = (') + compressed_output(data) + output(')') + output_decompress('DATA') + output('MEMO = (') + compressed_output(m) + output(')') + output_decompress('MEMO') + else: + output('DATA = (') + output(data) + output(')') + output('MEMO = (') + output(m) + output(')') + + + output('Shift = 0') + output('Reduce = 1') + output("def Lark_StandAlone(**kwargs):") + output(" return Lark._load_from_dict(DATA, MEMO, **kwargs)") + + + + +def main(): + make_warnings_comments() + parser = ArgumentParser(prog="prog='python -m lark.tools.standalone'", description="Lark Stand-alone Generator Tool", + parents=[lalr_argparser], epilog='Look at the Lark documentation for more info on the options') + parser.add_argument("old_start", nargs='?', help=SUPPRESS) + parser.add_argument('-c', '--compress', action='store_true', default=0, help="Enable compression") + if len(sys.argv)==1: + parser.print_help(sys.stderr) + sys.exit(1) + ns = parser.parse_args() + if ns.old_start is not None: + warn('The syntax `python -m lark.tools.standalone <grammar-file> <start>` is deprecated. Use the -s option') + ns.start.append(ns.old_start) + + lark_inst, out = build_lalr(ns) + gen_standalone(lark_inst, out=out, compress=ns.compress) + +if __name__ == '__main__': + main()
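A short sketch of how the standalone generator above is typically driven, assuming the CLI entry point and the ``Lark_StandAlone`` function emitted by ``gen_standalone``; the grammar and module names (my_grammar.lark, my_parser.py) are hypothetical.

# Shell: generate a self-contained parser module from a grammar file
#   python -m lark.tools.standalone my_grammar.lark -o my_parser.py

# The generated module embeds the serialized grammar and LALR tables (DATA/MEMO)
# together with the extracted runtime sections, so it can parse input without
# the lark package being installed.
from my_parser import Lark_StandAlone   # hypothetical module produced by the command above

parser = Lark_StandAlone()
tree = parser.parse("text matching my_grammar")
print(tree.pretty())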
\ No newline at end of file diff --git a/.venv/lib/python3.12/site-packages/lark/tree.py b/.venv/lib/python3.12/site-packages/lark/tree.py new file mode 100644 index 00000000..0937b859 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/tree.py @@ -0,0 +1,231 @@ +try: + from future_builtins import filter +except ImportError: + pass + +from copy import deepcopy + + +###{standalone +from collections import OrderedDict + + +class Meta: + def __init__(self): + self.empty = True + + +class Tree(object): + """The main tree class. + + Creates a new tree, and stores "data" and "children" in attributes of the same name. + Trees can be hashed and compared. + + Parameters: + data: The name of the rule or alias + children: List of matched sub-rules and terminals + meta: Line & Column numbers (if ``propagate_positions`` is enabled). + meta attributes: line, column, start_pos, end_line, end_column, end_pos + """ + def __init__(self, data, children, meta=None): + self.data = data + self.children = children + self._meta = meta + + @property + def meta(self): + if self._meta is None: + self._meta = Meta() + return self._meta + + def __repr__(self): + return 'Tree(%r, %r)' % (self.data, self.children) + + def _pretty_label(self): + return self.data + + def _pretty(self, level, indent_str): + if len(self.children) == 1 and not isinstance(self.children[0], Tree): + return [indent_str*level, self._pretty_label(), '\t', '%s' % (self.children[0],), '\n'] + + l = [indent_str*level, self._pretty_label(), '\n'] + for n in self.children: + if isinstance(n, Tree): + l += n._pretty(level+1, indent_str) + else: + l += [indent_str*(level+1), '%s' % (n,), '\n'] + + return l + + def pretty(self, indent_str=' '): + """Returns an indented string representation of the tree. + + Great for debugging. + """ + return ''.join(self._pretty(0, indent_str)) + + def __eq__(self, other): + try: + return self.data == other.data and self.children == other.children + except AttributeError: + return False + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return hash((self.data, tuple(self.children))) + + def iter_subtrees(self): + """Depth-first iteration. + + Iterates over all the subtrees, never returning to the same node twice (Lark's parse-tree is actually a DAG). + """ + queue = [self] + subtrees = OrderedDict() + for subtree in queue: + subtrees[id(subtree)] = subtree + queue += [c for c in reversed(subtree.children) + if isinstance(c, Tree) and id(c) not in subtrees] + + del queue + return reversed(list(subtrees.values())) + + def find_pred(self, pred): + """Returns all nodes of the tree that evaluate pred(node) as true.""" + return filter(pred, self.iter_subtrees()) + + def find_data(self, data): + """Returns all nodes of the tree whose data equals the given data.""" + return self.find_pred(lambda t: t.data == data) + +###} + + def expand_kids_by_index(self, *indices): + """Expand (inline) children at the given indices""" + for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices + kid = self.children[i] + self.children[i:i+1] = kid.children + + def expand_kids_by_data(self, *data_values): + """Expand (inline) children with any of the given data values. 
Returns True if anything changed""" + changed = False + for i in range(len(self.children)-1, -1, -1): + child = self.children[i] + if isinstance(child, Tree) and child.data in data_values: + self.children[i:i+1] = child.children + changed = True + return changed + + + def scan_values(self, pred): + """Return all values in the tree that evaluate pred(value) as true. + + This can be used to find all the tokens in the tree. + + Example: + >>> all_tokens = tree.scan_values(lambda v: isinstance(v, Token)) + """ + for c in self.children: + if isinstance(c, Tree): + for t in c.scan_values(pred): + yield t + else: + if pred(c): + yield c + + def iter_subtrees_topdown(self): + """Breadth-first iteration. + + Iterates over all the subtrees, return nodes in order like pretty() does. + """ + stack = [self] + while stack: + node = stack.pop() + if not isinstance(node, Tree): + continue + yield node + for n in reversed(node.children): + stack.append(n) + + def __deepcopy__(self, memo): + return type(self)(self.data, deepcopy(self.children, memo), meta=self._meta) + + def copy(self): + return type(self)(self.data, self.children) + + def set(self, data, children): + self.data = data + self.children = children + + # XXX Deprecated! Here for backwards compatibility <0.6.0 + @property + def line(self): + return self.meta.line + + @property + def column(self): + return self.meta.column + + @property + def end_line(self): + return self.meta.end_line + + @property + def end_column(self): + return self.meta.end_column + + +class SlottedTree(Tree): + __slots__ = 'data', 'children', 'rule', '_meta' + + +def pydot__tree_to_png(tree, filename, rankdir="LR", **kwargs): + graph = pydot__tree_to_graph(tree, rankdir, **kwargs) + graph.write_png(filename) + + +def pydot__tree_to_dot(tree, filename, rankdir="LR", **kwargs): + graph = pydot__tree_to_graph(tree, rankdir, **kwargs) + graph.write(filename) + + +def pydot__tree_to_graph(tree, rankdir="LR", **kwargs): + """Creates a colorful image that represents the tree (data+children, without meta) + + Possible values for `rankdir` are "TB", "LR", "BT", "RL", corresponding to + directed graphs drawn from top to bottom, from left to right, from bottom to + top, and from right to left, respectively. + + `kwargs` can be any graph attribute (e. g. `dpi=200`). For a list of + possible attributes, see https://www.graphviz.org/doc/info/attrs.html. + """ + + import pydot + graph = pydot.Dot(graph_type='digraph', rankdir=rankdir, **kwargs) + + i = [0] + + def new_leaf(leaf): + node = pydot.Node(i[0], label=repr(leaf)) + i[0] += 1 + graph.add_node(node) + return node + + def _to_pydot(subtree): + color = hash(subtree.data) & 0xffffff + color |= 0x808080 + + subnodes = [_to_pydot(child) if isinstance(child, Tree) else new_leaf(child) + for child in subtree.children] + node = pydot.Node(i[0], style="filled", fillcolor="#%x" % color, label=subtree.data) + i[0] += 1 + graph.add_node(node) + + for subnode in subnodes: + graph.add_edge(pydot.Edge(node, subnode)) + + return node + + _to_pydot(tree) + return graph diff --git a/.venv/lib/python3.12/site-packages/lark/tree_matcher.py b/.venv/lib/python3.12/site-packages/lark/tree_matcher.py new file mode 100644 index 00000000..02f95885 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/tree_matcher.py @@ -0,0 +1,186 @@ +"""Tree matcher based on Lark grammar""" + +import re +from collections import defaultdict + +from . 
import Tree, Token +from .common import ParserConf +from .parsers import earley +from .grammar import Rule, Terminal, NonTerminal + + +def is_discarded_terminal(t): + return t.is_term and t.filter_out + + +class _MakeTreeMatch: + def __init__(self, name, expansion): + self.name = name + self.expansion = expansion + + def __call__(self, args): + t = Tree(self.name, args) + t.meta.match_tree = True + t.meta.orig_expansion = self.expansion + return t + + +def _best_from_group(seq, group_key, cmp_key): + d = {} + for item in seq: + key = group_key(item) + if key in d: + v1 = cmp_key(item) + v2 = cmp_key(d[key]) + if v2 > v1: + d[key] = item + else: + d[key] = item + return list(d.values()) + + +def _best_rules_from_group(rules): + rules = _best_from_group(rules, lambda r: r, lambda r: -len(r.expansion)) + rules.sort(key=lambda r: len(r.expansion)) + return rules + + +def _match(term, token): + if isinstance(token, Tree): + name, _args = parse_rulename(term.name) + return token.data == name + elif isinstance(token, Token): + return term == Terminal(token.type) + assert False, (term, token) + + +def make_recons_rule(origin, expansion, old_expansion): + return Rule(origin, expansion, alias=_MakeTreeMatch(origin.name, old_expansion)) + + +def make_recons_rule_to_term(origin, term): + return make_recons_rule(origin, [Terminal(term.name)], [term]) + + +def parse_rulename(s): + "Parse rule names that may contain a template syntax (like rule{a, b, ...})" + name, args_str = re.match(r'(\w+)(?:{(.+)})?', s).groups() + args = args_str and [a.strip() for a in args_str.split(',')] + return name, args + + + +class ChildrenLexer: + def __init__(self, children): + self.children = children + + def lex(self, parser_state): + return self.children + +class TreeMatcher: + """Match the elements of a tree node, based on an ontology + provided by a Lark grammar. + + Supports templates and inlined rules (`rule{a, b,..}` and `_rule`) + + Initiialize with an instance of Lark. + """ + + def __init__(self, parser): + # XXX TODO calling compile twice returns different results! + assert parser.options.maybe_placeholders == False + # XXX TODO: we just ignore the potential existence of a postlexer + self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set()) + + self.rules_for_root = defaultdict(list) + + self.rules = list(self._build_recons_rules(rules)) + self.rules.reverse() + + # Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation. 
+ self.rules = _best_rules_from_group(self.rules) + + self.parser = parser + self._parser_cache = {} + + def _build_recons_rules(self, rules): + "Convert tree-parsing/construction rules to tree-matching rules" + expand1s = {r.origin for r in rules if r.options.expand1} + + aliases = defaultdict(list) + for r in rules: + if r.alias: + aliases[r.origin].append(r.alias) + + rule_names = {r.origin for r in rules} + nonterminals = {sym for sym in rule_names + if sym.name.startswith('_') or sym in expand1s or sym in aliases} + + seen = set() + for r in rules: + recons_exp = [sym if sym in nonterminals else Terminal(sym.name) + for sym in r.expansion if not is_discarded_terminal(sym)] + + # Skip self-recursive constructs + if recons_exp == [r.origin] and r.alias is None: + continue + + sym = NonTerminal(r.alias) if r.alias else r.origin + rule = make_recons_rule(sym, recons_exp, r.expansion) + + if sym in expand1s and len(recons_exp) != 1: + self.rules_for_root[sym.name].append(rule) + + if sym.name not in seen: + yield make_recons_rule_to_term(sym, sym) + seen.add(sym.name) + else: + if sym.name.startswith('_') or sym in expand1s: + yield rule + else: + self.rules_for_root[sym.name].append(rule) + + for origin, rule_aliases in aliases.items(): + for alias in rule_aliases: + yield make_recons_rule_to_term(origin, NonTerminal(alias)) + yield make_recons_rule_to_term(origin, origin) + + def match_tree(self, tree, rulename): + """Match the elements of `tree` to the symbols of rule `rulename`. + + Parameters: + tree (Tree): the tree node to match + rulename (str): The expected full rule name (including template args) + + Returns: + Tree: an unreduced tree that matches `rulename` + + Raises: + UnexpectedToken: If no match was found. + + Note: + It's the callers' responsibility match the tree recursively. + """ + if rulename: + # validate + name, _args = parse_rulename(rulename) + assert tree.data == name + else: + rulename = tree.data + + # TODO: ambiguity? + try: + parser = self._parser_cache[rulename] + except KeyError: + rules = self.rules + _best_rules_from_group(self.rules_for_root[rulename]) + + # TODO pass callbacks through dict, instead of alias? 
+ callbacks = {rule: rule.alias for rule in rules} + conf = ParserConf(rules, callbacks, [rulename]) + parser = earley.Parser(conf, _match, resolve_ambiguity=True) + self._parser_cache[rulename] = parser + + # find a full derivation + unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename) + assert unreduced_tree.data == rulename + return unreduced_tree diff --git a/.venv/lib/python3.12/site-packages/lark/utils.py b/.venv/lib/python3.12/site-packages/lark/utils.py new file mode 100644 index 00000000..051adfa1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/utils.py @@ -0,0 +1,387 @@ +import hashlib +import unicodedata +import os +from functools import reduce +from collections import deque + +###{standalone +import sys, re +import logging +from io import open +logger = logging.getLogger("lark") +logger.addHandler(logging.StreamHandler()) +# Set to highest level, since we have some warnings amongst the code +# By default, we should not output any log messages +logger.setLevel(logging.CRITICAL) + +if sys.version_info[0]>2: + from abc import ABC, abstractmethod +else: + from abc import ABCMeta, abstractmethod + class ABC(object): # Provide Python27 compatibility + __slots__ = () + __metclass__ = ABCMeta + + +Py36 = (sys.version_info[:2] >= (3, 6)) + +NO_VALUE = object() + + +def classify(seq, key=None, value=None): + d = {} + for item in seq: + k = key(item) if (key is not None) else item + v = value(item) if (value is not None) else item + if k in d: + d[k].append(v) + else: + d[k] = [v] + return d + + +def _deserialize(data, namespace, memo): + if isinstance(data, dict): + if '__type__' in data: # Object + class_ = namespace[data['__type__']] + return class_.deserialize(data, memo) + elif '@' in data: + return memo[data['@']] + return {key:_deserialize(value, namespace, memo) for key, value in data.items()} + elif isinstance(data, list): + return [_deserialize(value, namespace, memo) for value in data] + return data + + +class Serialize(object): + """Safe-ish serialization interface that doesn't rely on Pickle + + Attributes: + __serialize_fields__ (List[str]): Fields (aka attributes) to serialize. + __serialize_namespace__ (list): List of classes that deserialization is allowed to instantiate. + Should include all field types that aren't builtin types. 
+ """ + + def memo_serialize(self, types_to_memoize): + memo = SerializeMemoizer(types_to_memoize) + return self.serialize(memo), memo.serialize() + + def serialize(self, memo=None): + if memo and memo.in_types(self): + return {'@': memo.memoized.get(self)} + + fields = getattr(self, '__serialize_fields__') + res = {f: _serialize(getattr(self, f), memo) for f in fields} + res['__type__'] = type(self).__name__ + if hasattr(self, '_serialize'): + self._serialize(res, memo) + return res + + @classmethod + def deserialize(cls, data, memo): + namespace = getattr(cls, '__serialize_namespace__', []) + namespace = {c.__name__:c for c in namespace} + + fields = getattr(cls, '__serialize_fields__') + + if '@' in data: + return memo[data['@']] + + inst = cls.__new__(cls) + for f in fields: + try: + setattr(inst, f, _deserialize(data[f], namespace, memo)) + except KeyError as e: + raise KeyError("Cannot find key for class", cls, e) + + if hasattr(inst, '_deserialize'): + inst._deserialize() + + return inst + + +class SerializeMemoizer(Serialize): + "A version of serialize that memoizes objects to reduce space" + + __serialize_fields__ = 'memoized', + + def __init__(self, types_to_memoize): + self.types_to_memoize = tuple(types_to_memoize) + self.memoized = Enumerator() + + def in_types(self, value): + return isinstance(value, self.types_to_memoize) + + def serialize(self): + return _serialize(self.memoized.reversed(), None) + + @classmethod + def deserialize(cls, data, namespace, memo): + return _deserialize(data, namespace, memo) + + +try: + STRING_TYPE = basestring +except NameError: # Python 3 + STRING_TYPE = str + + +import types +from functools import wraps, partial +from contextlib import contextmanager + +Str = type(u'') +try: + classtype = types.ClassType # Python2 +except AttributeError: + classtype = type # Python3 + + +def smart_decorator(f, create_decorator): + if isinstance(f, types.FunctionType): + return wraps(f)(create_decorator(f, True)) + + elif isinstance(f, (classtype, type, types.BuiltinFunctionType)): + return wraps(f)(create_decorator(f, False)) + + elif isinstance(f, types.MethodType): + return wraps(f)(create_decorator(f.__func__, True)) + + elif isinstance(f, partial): + # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445 + return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True)) + + else: + return create_decorator(f.__func__.__call__, True) + + +try: + import regex +except ImportError: + regex = None + +import sre_parse +import sre_constants +categ_pattern = re.compile(r'\\p{[A-Za-z_]+}') + +def get_regexp_width(expr): + if regex: + # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with + # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex + # match here below. + regexp_final = re.sub(categ_pattern, 'A', expr) + else: + if re.search(categ_pattern, expr): + raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr) + regexp_final = expr + try: + return [int(x) for x in sre_parse.parse(regexp_final).getwidth()] + except sre_constants.error: + if not regex: + raise ValueError(expr) + else: + # sre_parse does not support the new features in regex. 
To not completely fail in that case, + # we manually test for the most important info (whether the empty string is matched) + c = regex.compile(regexp_final) + if c.match('') is None: + return 1, sre_constants.MAXREPEAT + else: + return 0, sre_constants.MAXREPEAT + +###} + + +_ID_START = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc' +_ID_CONTINUE = _ID_START + ('Nd', 'Nl',) + +def _test_unicode_category(s, categories): + if len(s) != 1: + return all(_test_unicode_category(char, categories) for char in s) + return s == '_' or unicodedata.category(s) in categories + +def is_id_continue(s): + """ + Checks if all characters in `s` are alphanumeric characters (Unicode standard, so diacritics, indian vowels, non-latin + numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. See PEP 3131 for details. + """ + return _test_unicode_category(s, _ID_CONTINUE) + +def is_id_start(s): + """ + Checks if all characters in `s` are alphabetic characters (Unicode standard, so diacritics, indian vowels, non-latin + numbers, etc. all pass). Synonymous with a Python `ID_START` identifier. See PEP 3131 for details. + """ + return _test_unicode_category(s, _ID_START) + + +def dedup_list(l): + """Given a list (l) will removing duplicates from the list, + preserving the original order of the list. Assumes that + the list entries are hashable.""" + dedup = set() + return [x for x in l if not (x in dedup or dedup.add(x))] + + +try: + from contextlib import suppress # Python 3 +except ImportError: + @contextmanager + def suppress(*excs): + '''Catch and dismiss the provided exception + + >>> x = 'hello' + >>> with suppress(IndexError): + ... x = x[10] + >>> x + 'hello' + ''' + try: + yield + except excs: + pass + + +class Enumerator(Serialize): + def __init__(self): + self.enums = {} + + def get(self, item): + if item not in self.enums: + self.enums[item] = len(self.enums) + return self.enums[item] + + def __len__(self): + return len(self.enums) + + def reversed(self): + r = {v: k for k, v in self.enums.items()} + assert len(r) == len(self.enums) + return r + + + +def combine_alternatives(lists): + """ + Accepts a list of alternatives, and enumerates all their possible concatinations. 
+ + Examples: + >>> combine_alternatives([range(2), [4,5]]) + [[0, 4], [0, 5], [1, 4], [1, 5]] + + >>> combine_alternatives(["abc", "xy", '$']) + [['a', 'x', '$'], ['a', 'y', '$'], ['b', 'x', '$'], ['b', 'y', '$'], ['c', 'x', '$'], ['c', 'y', '$']] + + >>> combine_alternatives([]) + [[]] + """ + if not lists: + return [[]] + assert all(l for l in lists), lists + init = [[x] for x in lists[0]] + return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init) + + +try: + import atomicwrites +except ImportError: + atomicwrites = None + +class FS: + exists = staticmethod(os.path.exists) + + @staticmethod + def open(name, mode="r", **kwargs): + if atomicwrites and "w" in mode: + return atomicwrites.atomic_write(name, mode=mode, overwrite=True, **kwargs) + else: + return open(name, mode, **kwargs) + + + +def isascii(s): + """ str.isascii only exists in python3.7+ """ + try: + return s.isascii() + except AttributeError: + try: + s.encode('ascii') + return True + except (UnicodeDecodeError, UnicodeEncodeError): + return False + + +class fzset(frozenset): + def __repr__(self): + return '{%s}' % ', '.join(map(repr, self)) + + +def classify_bool(seq, pred): + true_elems = [] + false_elems = [] + + for elem in seq: + if pred(elem): + true_elems.append(elem) + else: + false_elems.append(elem) + + return true_elems, false_elems + + +def bfs(initial, expand): + open_q = deque(list(initial)) + visited = set(open_q) + while open_q: + node = open_q.popleft() + yield node + for next_node in expand(node): + if next_node not in visited: + visited.add(next_node) + open_q.append(next_node) + +def bfs_all_unique(initial, expand): + "bfs, but doesn't keep track of visited (aka seen), because there can be no repetitions" + open_q = deque(list(initial)) + while open_q: + node = open_q.popleft() + yield node + open_q += expand(node) + + +def _serialize(value, memo): + if isinstance(value, Serialize): + return value.serialize(memo) + elif isinstance(value, list): + return [_serialize(elem, memo) for elem in value] + elif isinstance(value, frozenset): + return list(value) # TODO reversible? + elif isinstance(value, dict): + return {key:_serialize(elem, memo) for key, elem in value.items()} + # assert value is None or isinstance(value, (int, float, str, tuple)), value + return value + + + + +def small_factors(n, max_factor): + """ + Splits n up into smaller factors and summands <= max_factor. + Returns a list of [(a, b), ...] + so that the following code returns n: + + n = 1 + for a, b in values: + n = n * a + b + + Currently, we also keep a + b <= max_factor, but that might change + """ + assert n >= 0 + assert max_factor > 2 + if n <= max_factor: + return [(n, 0)] + + for a in range(max_factor, 1, -1): + r, b = divmod(n, a) + if a + b <= max_factor: + return small_factors(r, max_factor) + [(a, b)] + assert False, "Failed to factorize %s" % n diff --git a/.venv/lib/python3.12/site-packages/lark/visitors.py b/.venv/lib/python3.12/site-packages/lark/visitors.py new file mode 100644 index 00000000..d45bb193 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/lark/visitors.py @@ -0,0 +1,529 @@ +from functools import wraps + +from .utils import smart_decorator, combine_alternatives +from .tree import Tree +from .exceptions import VisitError, GrammarError +from .lexer import Token + +###{standalone +from inspect import getmembers, getmro + + +class Discard(Exception): + """When raising the Discard exception in a transformer callback, + that node is discarded and won't appear in the parent. 
+ """ + pass + +# Transformers + + +class _Decoratable: + "Provides support for decorating methods with @v_args" + + @classmethod + def _apply_decorator(cls, decorator, **kwargs): + mro = getmro(cls) + assert mro[0] is cls + libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)} + for name, value in getmembers(cls): + + # Make sure the function isn't inherited (unless it's overwritten) + if name.startswith('_') or (name in libmembers and name not in cls.__dict__): + continue + if not callable(value): + continue + + # Skip if v_args already applied (at the function level) + if hasattr(cls.__dict__[name], 'vargs_applied') or hasattr(value, 'vargs_applied'): + continue + + static = isinstance(cls.__dict__[name], (staticmethod, classmethod)) + setattr(cls, name, decorator(value, static=static, **kwargs)) + return cls + + def __class_getitem__(cls, _): + return cls + + +class Transformer(_Decoratable): + """Transformers visit each node of the tree, and run the appropriate method on it according to the node's data. + + Methods are provided by the user via inheritance, and called according to ``tree.data``. + The returned value from each method replaces the node in the tree structure. + + Transformers work bottom-up (or depth-first), starting with the leaves and ending at the root of the tree. + Transformers can be used to implement map & reduce patterns. Because nodes are reduced from leaf to root, + at any point the callbacks may assume the children have already been transformed (if applicable). + + ``Transformer`` can do anything ``Visitor`` can do, but because it reconstructs the tree, + it is slightly less efficient. + + All these classes implement the transformer interface: + + - ``Transformer`` - Recursively transforms the tree. This is the one you probably want. + - ``Transformer_InPlace`` - Non-recursive. Changes the tree in-place instead of returning new instances + - ``Transformer_InPlaceRecursive`` - Recursive. Changes the tree in-place instead of returning new instances + + Parameters: + visit_tokens (bool, optional): Should the transformer visit tokens in addition to rules. + Setting this to ``False`` is slightly faster. Defaults to ``True``. + (For processing ignored tokens, use the ``lexer_callbacks`` options) + + NOTE: A transformer without methods essentially performs a non-memoized partial deepcopy. 
+ """ + __visit_tokens__ = True # For backwards compatibility + + def __init__(self, visit_tokens=True): + self.__visit_tokens__ = visit_tokens + + def _call_userfunc(self, tree, new_children=None): + # Assumes tree is already transformed + children = new_children if new_children is not None else tree.children + try: + f = getattr(self, tree.data) + except AttributeError: + return self.__default__(tree.data, children, tree.meta) + else: + try: + wrapper = getattr(f, 'visit_wrapper', None) + if wrapper is not None: + return f.visit_wrapper(f, tree.data, children, tree.meta) + else: + return f(children) + except (GrammarError, Discard): + raise + except Exception as e: + raise VisitError(tree.data, tree, e) + + def _call_userfunc_token(self, token): + try: + f = getattr(self, token.type) + except AttributeError: + return self.__default_token__(token) + else: + try: + return f(token) + except (GrammarError, Discard): + raise + except Exception as e: + raise VisitError(token.type, token, e) + + def _transform_children(self, children): + for c in children: + try: + if isinstance(c, Tree): + yield self._transform_tree(c) + elif self.__visit_tokens__ and isinstance(c, Token): + yield self._call_userfunc_token(c) + else: + yield c + except Discard: + pass + + def _transform_tree(self, tree): + children = list(self._transform_children(tree.children)) + return self._call_userfunc(tree, children) + + def transform(self, tree): + "Transform the given tree, and return the final result" + return self._transform_tree(tree) + + def __mul__(self, other): + """Chain two transformers together, returning a new transformer. + """ + return TransformerChain(self, other) + + def __default__(self, data, children, meta): + """Default function that is called if there is no attribute matching ``data`` + + Can be overridden. Defaults to creating a new copy of the tree node (i.e. ``return Tree(data, children, meta)``) + """ + return Tree(data, children, meta) + + def __default_token__(self, token): + """Default function that is called if there is no attribute matching ``token.type`` + + Can be overridden. Defaults to returning the token as-is. + """ + return token + + +def merge_transformers(base_transformer=None, **transformers_to_merge): + """Merge a collection of transformers into the base_transformer, each into its own 'namespace'. + + When called, it will collect the methods from each transformer, and assign them to base_transformer, + with their name prefixed with the given keyword, as ``prefix__methodname``. + + This function is especially useful for processing grammars that import other grammars, + thereby creating some of their rules in a 'namespace'. (i.e with a consistent name prefix). + In this case, the key for the transformer should match the name of the imported grammar. + + Parameters: + base_transformer (Transformer, optional): The transformer that all other transformers will be added to. + **transformers_to_merge: Keyword arguments, in the form of ``name_prefix = transformer``. 
+ + Raises: + AttributeError: In case of a name collision in the merged methods + + Example: + :: + + class TBase(Transformer): + def start(self, children): + return children[0] + 'bar' + + class TImportedGrammar(Transformer): + def foo(self, children): + return "foo" + + composed_transformer = merge_transformers(TBase(), imported=TImportedGrammar()) + + t = Tree('start', [ Tree('imported__foo', []) ]) + + assert composed_transformer.transform(t) == 'foobar' + + """ + if base_transformer is None: + base_transformer = Transformer() + for prefix, transformer in transformers_to_merge.items(): + for method_name in dir(transformer): + method = getattr(transformer, method_name) + if not callable(method): + continue + if method_name.startswith("_") or method_name == "transform": + continue + prefixed_method = prefix + "__" + method_name + if hasattr(base_transformer, prefixed_method): + raise AttributeError("Cannot merge: method '%s' appears more than once" % prefixed_method) + + setattr(base_transformer, prefixed_method, method) + + return base_transformer + + +class InlineTransformer(Transformer): # XXX Deprecated + def _call_userfunc(self, tree, new_children=None): + # Assumes tree is already transformed + children = new_children if new_children is not None else tree.children + try: + f = getattr(self, tree.data) + except AttributeError: + return self.__default__(tree.data, children, tree.meta) + else: + return f(*children) + + +class TransformerChain(object): + def __init__(self, *transformers): + self.transformers = transformers + + def transform(self, tree): + for t in self.transformers: + tree = t.transform(tree) + return tree + + def __mul__(self, other): + return TransformerChain(*self.transformers + (other,)) + + +class Transformer_InPlace(Transformer): + """Same as Transformer, but non-recursive, and changes the tree in-place instead of returning new instances + + Useful for huge trees. Conservative in memory. + """ + def _transform_tree(self, tree): # Cancel recursion + return self._call_userfunc(tree) + + def transform(self, tree): + for subtree in tree.iter_subtrees(): + subtree.children = list(self._transform_children(subtree.children)) + + return self._transform_tree(tree) + + +class Transformer_NonRecursive(Transformer): + """Same as Transformer but non-recursive. + + Like Transformer, it doesn't change the original tree. + + Useful for huge trees. 
+ """ + + def transform(self, tree): + # Tree to postfix + rev_postfix = [] + q = [tree] + while q: + t = q.pop() + rev_postfix.append(t) + if isinstance(t, Tree): + q += t.children + + # Postfix to tree + stack = [] + for x in reversed(rev_postfix): + if isinstance(x, Tree): + size = len(x.children) + if size: + args = stack[-size:] + del stack[-size:] + else: + args = [] + stack.append(self._call_userfunc(x, args)) + elif self.__visit_tokens__ and isinstance(x, Token): + stack.append(self._call_userfunc_token(x)) + else: + stack.append(x) + + t ,= stack # We should have only one tree remaining + return t + + +class Transformer_InPlaceRecursive(Transformer): + "Same as Transformer, recursive, but changes the tree in-place instead of returning new instances" + def _transform_tree(self, tree): + tree.children = list(self._transform_children(tree.children)) + return self._call_userfunc(tree) + + +# Visitors + +class VisitorBase: + def _call_userfunc(self, tree): + return getattr(self, tree.data, self.__default__)(tree) + + def __default__(self, tree): + """Default function that is called if there is no attribute matching ``tree.data`` + + Can be overridden. Defaults to doing nothing. + """ + return tree + + def __class_getitem__(cls, _): + return cls + + +class Visitor(VisitorBase): + """Tree visitor, non-recursive (can handle huge trees). + + Visiting a node calls its methods (provided by the user via inheritance) according to ``tree.data`` + """ + + def visit(self, tree): + "Visits the tree, starting with the leaves and finally the root (bottom-up)" + for subtree in tree.iter_subtrees(): + self._call_userfunc(subtree) + return tree + + def visit_topdown(self,tree): + "Visit the tree, starting at the root, and ending at the leaves (top-down)" + for subtree in tree.iter_subtrees_topdown(): + self._call_userfunc(subtree) + return tree + + +class Visitor_Recursive(VisitorBase): + """Bottom-up visitor, recursive. + + Visiting a node calls its methods (provided by the user via inheritance) according to ``tree.data`` + + Slightly faster than the non-recursive version. + """ + + def visit(self, tree): + "Visits the tree, starting with the leaves and finally the root (bottom-up)" + for child in tree.children: + if isinstance(child, Tree): + self.visit(child) + + self._call_userfunc(tree) + return tree + + def visit_topdown(self,tree): + "Visit the tree, starting at the root, and ending at the leaves (top-down)" + self._call_userfunc(tree) + + for child in tree.children: + if isinstance(child, Tree): + self.visit_topdown(child) + + return tree + + +def visit_children_decor(func): + "See Interpreter" + @wraps(func) + def inner(cls, tree): + values = cls.visit_children(tree) + return func(cls, values) + return inner + + +class Interpreter(_Decoratable): + """Interpreter walks the tree starting at the root. + + Visits the tree, starting with the root and finally the leaves (top-down) + + For each tree node, it calls its methods (provided by user via inheritance) according to ``tree.data``. + + Unlike ``Transformer`` and ``Visitor``, the Interpreter doesn't automatically visit its sub-branches. + The user has to explicitly call ``visit``, ``visit_children``, or use the ``@visit_children_decor``. + This allows the user to implement branching and loops. 
+ """ + + def visit(self, tree): + f = getattr(self, tree.data) + wrapper = getattr(f, 'visit_wrapper', None) + if wrapper is not None: + return f.visit_wrapper(f, tree.data, tree.children, tree.meta) + else: + return f(tree) + + def visit_children(self, tree): + return [self.visit(child) if isinstance(child, Tree) else child + for child in tree.children] + + def __getattr__(self, name): + return self.__default__ + + def __default__(self, tree): + return self.visit_children(tree) + + +# Decorators + +def _apply_decorator(obj, decorator, **kwargs): + try: + _apply = obj._apply_decorator + except AttributeError: + return decorator(obj, **kwargs) + else: + return _apply(decorator, **kwargs) + + +def _inline_args__func(func): + @wraps(func) + def create_decorator(_f, with_self): + if with_self: + def f(self, children): + return _f(self, *children) + else: + def f(self, children): + return _f(*children) + return f + + return smart_decorator(func, create_decorator) + + +def inline_args(obj): # XXX Deprecated + return _apply_decorator(obj, _inline_args__func) + + +def _visitor_args_func_dec(func, visit_wrapper=None, static=False): + def create_decorator(_f, with_self): + if with_self: + def f(self, *args, **kwargs): + return _f(self, *args, **kwargs) + else: + def f(self, *args, **kwargs): + return _f(*args, **kwargs) + return f + + if static: + f = wraps(func)(create_decorator(func, False)) + else: + f = smart_decorator(func, create_decorator) + f.vargs_applied = True + f.visit_wrapper = visit_wrapper + return f + + +def _vargs_inline(f, _data, children, _meta): + return f(*children) +def _vargs_meta_inline(f, _data, children, meta): + return f(meta, *children) +def _vargs_meta(f, _data, children, meta): + return f(children, meta) # TODO swap these for consistency? Backwards incompatible! +def _vargs_tree(f, data, children, meta): + return f(Tree(data, children, meta)) + + +def v_args(inline=False, meta=False, tree=False, wrapper=None): + """A convenience decorator factory for modifying the behavior of user-supplied visitor methods. + + By default, callback methods of transformers/visitors accept one argument - a list of the node's children. + + ``v_args`` can modify this behavior. When used on a transformer/visitor class definition, + it applies to all the callback methods inside it. + + ``v_args`` can be applied to a single method, or to an entire class. When applied to both, + the options given to the method take precedence. + + Parameters: + inline (bool, optional): Children are provided as ``*args`` instead of a list argument (not recommended for very long lists). + meta (bool, optional): Provides two arguments: ``children`` and ``meta`` (instead of just the first) + tree (bool, optional): Provides the entire tree as the argument, instead of the children. + wrapper (function, optional): Provide a function to decorate all methods. 
+ + Example: + :: + + @v_args(inline=True) + class SolveArith(Transformer): + def add(self, left, right): + return left + right + + + class ReverseNotation(Transformer_InPlace): + @v_args(tree=True) + def tree_node(self, tree): + tree.children = tree.children[::-1] + """ + if tree and (meta or inline): + raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.") + + func = None + if meta: + if inline: + func = _vargs_meta_inline + else: + func = _vargs_meta + elif inline: + func = _vargs_inline + elif tree: + func = _vargs_tree + + if wrapper is not None: + if func is not None: + raise ValueError("Cannot use 'wrapper' along with 'tree', 'meta' or 'inline'.") + func = wrapper + + def _visitor_args_dec(obj): + return _apply_decorator(obj, _visitor_args_func_dec, visit_wrapper=func) + return _visitor_args_dec + + +###} + + +# --- Visitor Utilities --- + +class CollapseAmbiguities(Transformer): + """ + Transforms a tree that contains any number of _ambig nodes into a list of trees, + each one containing an unambiguous tree. + + The length of the resulting list is the product of the length of all _ambig nodes. + + Warning: This may quickly explode for highly ambiguous trees. + + """ + def _ambig(self, options): + return sum(options, []) + + def __default__(self, data, children_lists, meta): + return [Tree(data, children, meta) for children in combine_alternatives(children_lists)] + + def __default_token__(self, t): + return [t] |
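To round out the transformer interface collected in visitors.py above, a minimal end-to-end sketch, closely following the SolveArith example from the v_args docstring; the grammar, the rule alias and the input string are assumptions for illustration only.

from lark import Lark, Transformer, v_args

# v_args(inline=True) passes each node's children as positional arguments,
# so callback signatures can name them directly instead of taking a list.
@v_args(inline=True)
class CalcTransformer(Transformer):
    def add(self, left, right):          # called for the 'add'-aliased rule
        return int(left) + int(right)

parser = Lark('start: NUMBER "+" NUMBER -> add\nNUMBER: /[0-9]+/', parser='lalr')
tree = parser.parse("2+3")               # Tree('add', [NUMBER '2', NUMBER '3'])
assert CalcTransformer().transform(tree) == 5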