blob: 9ef5eb15272631ee7304a5097b0710c86fcea406 [file] [log] [blame]
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Routines to generate a shift-reduce parser from the module_ir module."""
import pkgutil
from front_end import lr1
from front_end import module_ir
from front_end import tokenizer
from util import simple_memoizer
class ParserGenerationError(Exception):
  """Raised when a parser cannot be generated.

  Used in this module for malformed error example text, grammar conflicts, and
  error examples that cannot be marked on the generated parser.
  """
def parse_error_examples(error_example_text):
  """Parses error examples from error_example_text.

  The text is split into sections by lines consisting of 80 "=" characters.
  Everything before the first such line is explanatory text and is ignored.
  Each remaining section must contain an error message, a line of 80 "-"
  characters, and then one or more examples separated by "---" lines.

  Within an example, two "BadWord" tokens are treated as markers:
    $ANY: replaced in the token list by lr1.ANY_TOKEN.
    $ERR: removed from the token list; the token that immediately followed it
        is recorded as the error token for the example.

  Arguments:
    error_example_text: The text of an error example file.

  Returns:
    A list of (tokens, error token, error message, example text) tuples,
    suitable for passing into generate_parser.

  Raises:
    ParserGenerationError: There is a problem parsing the error examples: a
        section does not have exactly one message and one example part, an
        example fails to tokenize, or an example has no usable '$ERR' marker.
  """
  section_separator = "\n" + "=" * 80 + "\n"
  message_separator = "\n" + "-" * 80 + "\n"
  result = []
  # Everything before the first "======" line is explanatory text: ignore it.
  for error_example in error_example_text.split(section_separator)[1:]:
    message_and_examples = error_example.split(message_separator)
    if len(message_and_examples) != 2:
      raise ParserGenerationError(
          "Expected one error message and one example section in:\n" +
          error_example)
    message, example_text = message_and_examples
    for example in example_text.split("\n---\n"):
      # TODO(bolms): feed a line number into tokenize, so that tokenization
      # failures refer to the correct line within error_example_text.
      tokens, errors = tokenizer.tokenize(example, "")
      if errors:
        raise ParserGenerationError(str(errors))
      # Swap every $ANY marker for the parser's wildcard token.
      tokens = [
          lr1.ANY_TOKEN
          if token.symbol == "BadWord" and token.text == "$ANY" else token
          for token in tokens
      ]
      for i, token in enumerate(tokens):
        if token.symbol == "BadWord" and token.text == "$ERR":
          if i + 1 >= len(tokens):
            # Without this check a trailing $ERR would surface as a bare
            # IndexError rather than the documented ParserGenerationError.
            raise ParserGenerationError(
                "Error token marker '$ERR' is not followed by a token in:\n" +
                error_example)
          error_token = tokens[i + 1]
          # Drop the marker itself; iteration stops right away, so mutating
          # the list here is safe.
          del tokens[i]
          break
      else:
        # The loop finished without finding a $ERR marker.
        raise ParserGenerationError(
            "No error token marker '$ERR' in:\n" + error_example)
      result.append((tokens, error_token, message.strip(), example))
  return result
def generate_parser(start_symbol, productions, error_examples):
  """Builds a parser for the given grammar and marks error_examples on it.

  Arguments:
    start_symbol: the start symbol of the grammar (a string)
    productions: a list of parser_types.Production in the grammar
    error_examples: A list of tuples whose first four elements are (source
        tokens, error token, error message, source text).  The first three
        are handed to the parser's mark_error; the source text is only used
        when reporting a failure to mark the example.

  Returns:
    A parser.

  Raises:
    ParserGenerationError: The grammar has conflicts, or one of the error
        examples could not be marked on the parser.
  """
  grammar = lr1.Grammar(start_symbol, productions)
  parser = grammar.parser()
  if parser.conflicts:
    conflict_report = "\n".join(map(str, parser.conflicts))
    raise ParserGenerationError(conflict_report)
  for example in error_examples:
    failure = parser.mark_error(example[0], example[1], example[2])
    if not failure:
      continue
    # A truthy result from mark_error describes why marking failed.
    raise ParserGenerationError(
        "error marking example: {}\nExample:\n{}".format(failure, example[3]))
  return parser
@simple_memoizer.memoize
def _load_module_parser():
  """Returns the parser for complete Emboss modules.

  The "error_examples" resource of the "front_end" package is decoded as
  UTF-8, parsed, and applied to a parser generated from
  module_ir.START_SYMBOL and module_ir.PRODUCTIONS.
  """
  raw_examples = pkgutil.get_data("front_end", "error_examples")
  examples = parse_error_examples(raw_examples.decode("utf-8"))
  return generate_parser(
      module_ir.START_SYMBOL, module_ir.PRODUCTIONS, examples)
@simple_memoizer.memoize
def _load_expression_parser():
  """Returns the parser for stand-alone Emboss expressions.

  The parser is generated from module_ir.EXPRESSION_START_SYMBOL and
  module_ir.PRODUCTIONS, with no error examples applied.
  """
  no_error_examples = []
  return generate_parser(
      module_ir.EXPRESSION_START_SYMBOL, module_ir.PRODUCTIONS,
      no_error_examples)
def parse_module(tokens):
  """Parses an Emboss token list with the module parser.

  Arguments:
    tokens: The Emboss token list to parse.

  Returns:
    Whatever the module parser's parse method returns for tokens.
  """
  module_parser = _load_module_parser()
  return module_parser.parse(tokens)
def parse_expression(tokens):
  """Parses an Emboss token list with the expression parser.

  Arguments:
    tokens: The Emboss token list to parse.

  Returns:
    Whatever the expression parser's parse method returns for tokens.
  """
  expression_parser = _load_expression_parser()
  return expression_parser.parse(tokens)