aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/lark/tools/nearley.py
blob: f0779dc58e89e87658a503e60168a91cf54e123e (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"Converts Nearley grammars to Lark"

import os.path
import sys
import codecs
import argparse


from lark import Lark, InlineTransformer

# Lark grammar describing Nearley source files. A file is a sequence of
# directives ("@name arg", or a bare "@{% ... %}" JS block aliased to js_code)
# and rule definitions ("name -> expansions"). A definition whose name is
# followed by a bracketed REGEXP token is aliased to "macro". Every expansion
# ends with an optional "{% ... %}" JS block (the "js" rule).
nearley_grammar = r"""
    start: (ruledef|directive)+

    directive: "@" NAME (STRING|NAME)
             | "@" JS  -> js_code
    ruledef: NAME "->" expansions
           | NAME REGEXP "->" expansions -> macro
    expansions: expansion ("|" expansion)*

    expansion: expr+ js

    ?expr: item (":" /[+*?]/)?

    ?item: rule|string|regexp|null
         | "(" expansions ")"

    rule: NAME
    string: STRING
    regexp: REGEXP
    null: "null"
    JS: /{%.*?%}/s
    js: JS?

    NAME: /[a-zA-Z_$]\w*/
    COMMENT: /#[^\n]*/
    REGEXP: /\[.*?\]/

    STRING: _STRING "i"?

    %import common.ESCAPED_STRING -> _STRING
    %import common.WS
    %ignore WS
    %ignore COMMENT

    """

# Parser for Nearley source text, built once when this module is imported.
nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='standard')

def _get_rulename(name):
    name = {'_': '_ws_maybe', '__':'_ws'}.get(name, name)
    return 'n_' + name.replace('$', '__DOLLAR__').lower()

class NearleyToLark(InlineTransformer):
    """Rewrite a parsed Nearley rule definition as Lark grammar text.

    Each callback is named after a rule of ``nearley_grammar`` and returns
    a string. While transforming, the instance collects:

    * ``extra_rules`` / ``extra_rules_rev`` -- generated ``xrule_N`` helper
      rules, indexed by name and by body respectively;
    * ``alias_js_code`` -- the JS snippet attached to each generated
      ``alias_N`` alias.
    """

    def __init__(self):
        self.alias_js_code = {}
        self.extra_rules_rev = {}
        self.extra_rules = {}
        self._count = 0

    def _new_function(self, code):
        """Store ``code`` under a fresh ``alias_N`` name and return that name."""
        index = self._count
        self._count = index + 1

        name = 'alias_%d' % index
        self.alias_js_code[name] = code
        return name

    def _extra_rule(self, rule):
        """Return the ``xrule_N`` name for ``rule``, creating it on first use."""
        try:
            return self.extra_rules_rev[rule]
        except KeyError:
            pass

        # Names are numbered by how many extra rules exist so far.
        name = 'xrule_%d' % len(self.extra_rules)
        assert name not in self.extra_rules
        self.extra_rules_rev[rule] = name
        self.extra_rules[name] = rule
        return name

    def rule(self, name):
        """A reference to another rule becomes that rule's Lark name."""
        return _get_rulename(name)

    def ruledef(self, name, exps):
        """Emit one Lark rule definition line for a Nearley rule."""
        lark_name = _get_rulename(name)
        return '!%s: %s' % (lark_name, exps)

    def expr(self, item, op):
        """An item followed by an operator is moved into an extra rule."""
        return self._extra_rule('(%s)%s' % (item, op))

    def regexp(self, r):
        """Wrap the matched REGEXP text in slashes."""
        return '/%s/' % (r,)

    def null(self):
        """The ``null`` keyword contributes no text."""
        return ''

    def string(self, s):
        """A string literal is moved into an extra rule."""
        return self._extra_rule(s)

    def expansion(self, *x):
        """Join the items of one alternative, appending an alias for its JS.

        The last child is always the ``js`` subtree; it has a child only
        when a ``{% ... %}`` block was present.
        """
        js = x[-1]
        items = x[:-1]
        if not js.children:
            alias = ''
        else:
            (js_code,) = js.children
            # Drop the two-character "{%" / "%}" delimiters.
            alias = '-> ' + self._new_function(js_code[2:-2])
        return ' '.join(items) + alias

    def expansions(self, *x):
        """Join alternatives, one per line, separated by ``|``."""
        return '\n    |'.join(x)

    def start(self, *rules):
        """Join the non-empty rule texts, one per line."""
        return '\n'.join(text for text in rules if text)

def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes):
    """Convert Nearley source text into a list of Lark rule definitions.

    Parameters:
        g: Nearley grammar source text.
        builtin_path: folder that ``@builtin`` file names are joined to.
        n2l: transformer applied to every rule definition; the same object
            is passed down to included files.
        js_code: list that top-level ``@{% ... %}`` code blocks are
            appended to (mutated in place).
        folder_path: folder that ``@include`` file names are joined to.
        includes: set of paths already loaded (mutated in place); a path
            found in it is not loaded again.

    Returns:
        A list of Lark rule definition strings, including those of any
        included files, in the order they were encountered.

    Raises:
        AssertionError: for a directive other than ``builtin``/``include``.
        Exception: for a statement kind that is not recognised.
    """
    rule_defs = []

    tree = nearley_grammar_parser.parse(g)
    for statement in tree.children:
        if statement.data == 'directive':
            directive, arg = statement.children
            if directive not in ('builtin', 'include'):
                # Raised explicitly (instead of ``assert False``) so the check
                # still fires when Python runs with -O, which strips asserts.
                raise AssertionError('Unsupported directive: %s' % directive)

            folder = builtin_path if directive == 'builtin' else folder_path
            # arg[1:-1] drops the first and last character of the argument
            # token (the surrounding quotes of a string argument).
            path = os.path.join(folder, arg[1:-1])
            if path not in includes:
                includes.add(path)
                with codecs.open(path, encoding='utf8') as f:
                    text = f.read()
                # Nested includes are resolved relative to the included file.
                rule_defs += _nearley_to_lark(text, builtin_path, n2l, js_code, os.path.abspath(os.path.dirname(path)), includes)
        elif statement.data == 'js_code':
            code ,= statement.children
            # Drop the two-character "{%" / "%}" delimiters.
            code = code[2:-2]
            js_code.append(code)
        elif statement.data == 'macro':
            pass    # TODO Add support for macros!
        elif statement.data == 'ruledef':
            rule_defs.append( n2l.transform(statement) )
        else:
            raise Exception("Unknown statement: %s" % statement)

    return rule_defs


def create_code_for_nearley_grammar(g, start, builtin_path, folder_path, es6=False):
    """Generate Python source for a Lark parser equivalent to a Nearley grammar.

    Parameters:
        g: Nearley grammar source text.
        start: name of the Nearley rule to use as the start rule. It is
            converted with the same renaming that is applied to every rule
            of the grammar, so the original Nearley spelling can be given.
        builtin_path: folder that ``@builtin`` file names are joined to.
        folder_path: folder that ``@include`` file names are joined to.
        es6: translate the collected JS with ``js2py.translate_js6``
            instead of ``js2py.translate_js``.

    Returns:
        The generated source as one string. It defines ``grammar``, the
        translated JS code, a ``TransformNearley`` transformer, ``parser``
        and a ``parse(text)`` function.
    """
    import js2py

    emit_code = []
    def emit(x=None):
        # Append x (when non-empty) followed by a newline.
        if x:
            emit_code.append(x)
        emit_code.append('\n')

    js_code = ['function id(x) {return x[0];}']
    n2l = NearleyToLark()
    rule_defs = _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, set())
    lark_g = '\n'.join(rule_defs)
    # Helper rules created during the transformation go after the user rules.
    lark_g += '\n'+'\n'.join('!%s: %s' % item for item in n2l.extra_rules.items())

    emit('from lark import Lark, Transformer')
    emit()
    emit('grammar = ' + repr(lark_g))
    emit()

    # Each alias becomes a JS assignment so it exists after translation.
    for alias, code in n2l.alias_js_code.items():
        js_code.append('%s = (%s);' % (alias, code))

    if es6:
        emit(js2py.translate_js6('\n'.join(js_code)))
    else:
        emit(js2py.translate_js('\n'.join(js_code)))
    emit('class TransformNearley(Transformer):')
    for alias in n2l.alias_js_code:
        emit("    %s = var.get('%s').to_python()" % (alias, alias))
    emit("    __default__ = lambda self, n, c, m: c if c else None")

    emit()
    # Rules in the generated grammar are named by _get_rulename (lower-cased,
    # "$" and "_"/"__" rewritten), so the start rule must be named the same
    # way; otherwise a start rule such as "Main" would not exist.
    emit('parser = Lark(grammar, start="%s", maybe_placeholders=False)' % _get_rulename(start))
    emit('def parse(text):')
    emit('    return TransformNearley().transform(parser.parse(text))')

    return ''.join(emit_code)

def main(fn, start, nearley_lib, es6=False):
    """Read the Nearley grammar file ``fn`` and return the generated code.

    ``@builtin`` files are looked up in the ``builtin`` folder under
    ``nearley_lib``; ``@include`` files are looked up next to ``fn``.
    """
    with codecs.open(fn, encoding='utf8') as grammar_file:
        grammar = grammar_file.read()

    builtin_dir = os.path.join(nearley_lib, 'builtin')
    grammar_dir = os.path.abspath(os.path.dirname(fn))
    return create_code_for_nearley_grammar(grammar, start, builtin_dir, grammar_dir, es6=es6)

def get_arg_parser():
    """Build the command-line argument parser for this tool."""
    arg_parser = argparse.ArgumentParser(
        description='Reads a Nearley grammar (with js functions), and outputs an equivalent lark parser.',
    )

    # Positional arguments, in the order they must appear on the command line.
    positionals = (
        ('nearley_grammar', 'Path to the file containing the nearley grammar'),
        ('start_rule', 'Rule within the nearley grammar to make the base rule'),
        ('nearley_lib', 'Path to root directory of nearley codebase (used for including builtins)'),
    )
    for dest, help_text in positionals:
        arg_parser.add_argument(dest, help=help_text)

    arg_parser.add_argument('--es6', action='store_true', help='Enable experimental ES6 support')
    return arg_parser

if __name__ == '__main__':
    parser = get_arg_parser()
    # Called with no arguments at all: show the help text on stderr and fail.
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    # The generated parser code is written to stdout.
    print(main(
        fn=args.nearley_grammar,
        start=args.start_rule,
        nearley_lib=args.nearley_lib,
        es6=args.es6,
    ))