blob: 52db8434ae4f54d3b6e9705125ed21b7ccd770b9 [file] [log] [blame]
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""module_ir contains code for generating module-level IRs from parse trees.
The primary export is build_ir(), which takes a parse tree (as returned by a
parser from lr1.py), and returns a module-level intermediate representation
("module IR").
This module also notably exports PRODUCTIONS and START_SYMBOL, which should be
fed to lr1.Grammar in order to create a parser for the Emboss language.
"""
import re
import sys
from compiler.util import ir_pb2
from compiler.util import name_conversion
from compiler.util import parser_types
# Intermediate types; should not be found in the final IR.
class _List(object):
"""A list with source location information."""
__slots__ = ('list', 'source_location')
def __init__(self, l):
assert isinstance(l, list), "_List object must wrap list, not '%r'" % l
self.list = l
self.source_location = ir_pb2.Location()
class _ExpressionTail(object):
"""A fragment of an expression with an operator and right-hand side.
_ExpressionTail is the tail of an expression, consisting of an operator and
the right-hand argument to the operator; for example, in the expression (6+8),
the _ExpressionTail would be "+8".
This is used as a temporary object while converting the right-recursive
"expression" and "times-expression" productions into left-associative
Expressions.
Attributes:
operator: An ir_pb2.Word of the operator's name.
expression: The expression on the right side of the operator.
source_location: The source location of the operation fragment.
"""
__slots__ = ('operator', 'expression', 'source_location')
def __init__(self, operator, expression):
self.operator = operator
self.expression = expression
self.source_location = ir_pb2.Location()
class _FieldWithType(object):
"""A field with zero or more types defined inline with that field."""
__slots__ = ('field', 'subtypes', 'source_location')
def __init__(self, field, subtypes=None):
self.field = field
self.subtypes = subtypes or []
self.source_location = ir_pb2.Location()
def build_ir(parse_tree, used_productions=None):
r"""Builds a module-level intermediate representation from a valid parse tree.
The parse tree is precisely dictated by the exact productions in the grammar
used by the parser, with no semantic information. _really_build_ir transforms
this "raw" form into a stable, cooked representation, thereby isolating
subsequent steps from the exact details of the grammar.
(Probably incomplete) list of transformations:
* ParseResult and Token nodes are replaced with Module, Attribute, Struct,
Type, etc. objects.
* Purely syntactic tokens ('"["', '"struct"', etc.) are discarded.
* Repeated elements are transformed from tree form to list form:
a*
/ \
b a*
/ \
c a*
/ \
d a*
(where b, c, and d are nodes of type "a") becomes [b, c, d].
* The values of numeric constants (Number, etc. tokens) are parsed.
* Different classes of names (snake_names, CamelNames, ShoutyNames) are
folded into a single "Name" type, since they are guaranteed to appear in
the correct places in the parse tree.
Arguments:
parse_tree: A parse tree. Each leaf node should be a parser_types.Token
object, and each non-leaf node should have a 'symbol' attribute specifying
which grammar symbol it represents, and a 'children' attribute containing
a list of child nodes. This is the format returned by the parsers
produced by the lr1 module, when run against tokens from the tokenizer
module.
used_productions: If specified, used_productions.add() will be called with
each production actually used in parsing. This can be useful when
developing the grammar and writing tests; in particular, it can be used to
figure out which productions are *not* used when parsing a particular
file.
Returns:
A module-level intermediate representation (module IR) for an Emboss module
(source file). This IR will not have symbols resolved; that must be done on
a forest of module IRs so that names from other modules can be resolved.
"""
# TODO(b/140259131): Refactor _really_build_ir to be less recursive/use an
# explicit stack.
old_recursion_limit = sys.getrecursionlimit()
sys.setrecursionlimit(16 * 1024) # ~8000 top-level entities in one module.
try:
result = _really_build_ir(parse_tree, used_productions)
finally:
sys.setrecursionlimit(old_recursion_limit)
return result
def _really_build_ir(parse_tree, used_productions):
"""Real implementation of build_ir()."""
if used_productions is None:
used_productions = set()
if hasattr(parse_tree, 'children'):
parsed_children = [_really_build_ir(child, used_productions)
for child in parse_tree.children]
used_productions.add(parse_tree.production)
result = _handlers[parse_tree.production](*parsed_children)
if parse_tree.source_location is not None:
result.source_location.CopyFrom(parse_tree.source_location)
return result
else:
# For leaf nodes, the temporary "IR" is just the token. Higher-level rules
# will translate it to a real IR.
assert isinstance(parse_tree, parser_types.Token), str(parse_tree)
return parse_tree
# Map of productions to their handlers.
_handlers = {}
_anonymous_name_counter = 0
def _get_anonymous_field_name():
global _anonymous_name_counter
_anonymous_name_counter += 1
return 'emboss_reserved_anonymous_field_{}'.format(_anonymous_name_counter)
def _handles(production_text):
"""_handles marks a function as the handler for a particular production."""
production = parser_types.Production.parse(production_text)
def handles(f):
_handlers[production] = f
return f
return handles
def _make_prelude_import(position):
"""Helper function to construct a synthetic ir_pb2.Import for the prelude."""
location = parser_types.make_location(position, position)
return ir_pb2.Import(
file_name=ir_pb2.String(text='', source_location=location),
local_name=ir_pb2.Word(text='', source_location=location),
source_location=location)
def _text_to_operator(text):
"""Converts an operator's textual name to its corresponding enum."""
operations = {
'+': ir_pb2.FunctionMapping.ADDITION,
'-': ir_pb2.FunctionMapping.SUBTRACTION,
'*': ir_pb2.FunctionMapping.MULTIPLICATION,
'==': ir_pb2.FunctionMapping.EQUALITY,
'!=': ir_pb2.FunctionMapping.INEQUALITY,
'&&': ir_pb2.FunctionMapping.AND,
'||': ir_pb2.FunctionMapping.OR,
'>': ir_pb2.FunctionMapping.GREATER,
'>=': ir_pb2.FunctionMapping.GREATER_OR_EQUAL,
'<': ir_pb2.FunctionMapping.LESS,
'<=': ir_pb2.FunctionMapping.LESS_OR_EQUAL,
}
return operations[text]
def _text_to_function(text):
"""Converts a function's textual name to its corresponding enum."""
functions = {
'$max': ir_pb2.FunctionMapping.MAXIMUM,
'$present': ir_pb2.FunctionMapping.PRESENCE,
'$upper_bound': ir_pb2.FunctionMapping.UPPER_BOUND,
'$lower_bound': ir_pb2.FunctionMapping.LOWER_BOUND,
}
return functions[text]
################################################################################
# Grammar & parse tree to IR translation.
#
# From here to (almost) the end of the file are functions which recursively
# build an IR. The @_handles annotations indicate the exact grammar
# production(s) handled by each function. The handler function should take
# exactly one argument for each symbol in the production's RHS.
#
# The actual Emboss grammar is extracted directly from the @_handles
# annotations, so this is also the grammar definition. For convenience, the
# grammar can be viewed separately in g3doc/grammar.md.
#
# At the end, symbols whose names end in "*", "+", or "?" are extracted from the
# grammar, and appropriate productions are added for zero-or-more, one-or-more,
# or zero-or-one lists, respectively. (This is analogous to the *, +, and ?
# operators in regex.) It is necessary for this to happen here (and not in
# lr1.py) because the generated productions must be associated with
# IR-generation functions.
# A module file is a list of documentation, then imports, then top-level
# attributes, then type definitions. Any section may be missing.
# TODO(bolms): Should Emboss disallow completely empty files?
@_handles('module -> comment-line* doc-line* import-line* attribute-line*'
' type-definition*')
def _file(leading_newlines, docs, imports, attributes, type_definitions):
"""Assembles the top-level IR for a module."""
del leading_newlines # Unused.
# Figure out the best synthetic source_location for the synthesized prelude
# import.
if imports.list:
position = imports.list[0].source_location.start
elif docs.list:
position = docs.list[0].source_location.end
elif attributes.list:
position = attributes.list[0].source_location.start
elif type_definitions.list:
position = type_definitions.list[0].source_location.start
else:
position = 1, 1
# If the source file is completely empty, build_ir won't automatically
# populate the source_location attribute for the module.
if (not docs.list and not imports.list and not attributes.list and
not type_definitions.list):
module_source_location = parser_types.make_location((1, 1), (1, 1))
else:
module_source_location = None
return ir_pb2.Module(
documentation=docs.list,
foreign_import=[_make_prelude_import(position)] + imports.list,
attribute=attributes.list,
type=type_definitions.list,
source_location=module_source_location)
@_handles('import-line ->'
' "import" string-constant "as" snake-word Comment? eol')
def _import(import_, file_name, as_, local_name, comment, eol):
del import_, as_, comment, eol # Unused
return ir_pb2.Import(file_name=file_name, local_name=local_name)
@_handles('doc-line -> doc Comment? eol')
def _doc_line(doc, comment, eol):
del comment, eol # Unused.
return doc
@_handles('doc -> Documentation')
def _doc(documentation):
# As a special case, an empty documentation string may omit the trailing
# space.
if documentation.text == '--':
doc_text = '-- '
else:
doc_text = documentation.text
assert doc_text[0:3] == '-- ', (
"Documentation token '{}' in unknown format.".format(
documentation.text))
return ir_pb2.Documentation(text=doc_text[3:])
# A attribute-line is just a attribute on its own line.
@_handles('attribute-line -> attribute Comment? eol')
def _attribute_line(attr, comment, eol):
del comment, eol # Unused.
return attr
# A attribute is [name = value].
@_handles('attribute -> "[" attribute-context? "$default"?'
' snake-word ":" attribute-value "]"')
def _attribute(open_bracket, context_specifier, default_specifier, name, colon,
attribute_value, close_bracket):
del open_bracket, colon, close_bracket # Unused.
if context_specifier.list:
return ir_pb2.Attribute(name=name,
value=attribute_value,
is_default=bool(default_specifier.list),
back_end=context_specifier.list[0])
else:
return ir_pb2.Attribute(name=name,
value=attribute_value,
is_default=bool(default_specifier.list))
@_handles('attribute-context -> "(" snake-word ")"')
def _attribute_context(open_paren, context_name, close_paren):
del open_paren, close_paren # Unused.
return context_name
@_handles('attribute-value -> expression')
def _attribute_value_expression(expression):
return ir_pb2.AttributeValue(expression=expression)
@_handles('attribute-value -> string-constant')
def _attribute_value_string(string):
return ir_pb2.AttributeValue(string_constant=string)
@_handles('boolean-constant -> BooleanConstant')
def _boolean_constant(boolean):
return ir_pb2.BooleanConstant(value=(boolean.text == 'true'))
@_handles('string-constant -> String')
def _string_constant(string):
"""Turns a String token into an ir_pb2.String, with proper unescaping.
Arguments:
string: A String token.
Returns:
An ir_pb2.String with the "text" field set to the unescaped value of
string.text.
"""
# TODO(bolms): If/when this logic becomes more complex (e.g., to handle \NNN
# or \xNN escapes), extract this into a separate module with separate tests.
assert string.text[0] == '"'
assert string.text[-1] == '"'
assert len(string.text) >= 2
result = []
for substring in re.split(r'(\\.)', string.text[1:-1]):
if substring and substring[0] == '\\':
assert len(substring) == 2
result.append({'\\': '\\', '"': '"', 'n': '\n'}[substring[1]])
else:
result.append(substring)
return ir_pb2.String(text=''.join(result))
# In Emboss, '&&' and '||' may not be mixed without parentheses. These are all
# fine:
#
# x && y && z
# x || y || z
# (x || y) && z
# x || (y && z)
#
# These are syntax errors:
#
# x || y && z
# x && y || z
#
# This is accomplished by making && and || separate-but-equal in the precedence
# hierarchy. Instead of the more traditional:
#
# logical-expression -> or-expression
# or-expression -> and-expression or-expression-right*
# or-expression-right -> '||' and-expression
# and-expression -> equality-expression and-expression-right*
# and-expression-right -> '&&' equality-expression
#
# Or, using yacc-style precedence specifiers:
#
# %left "||"
# %left "&&"
# expression -> expression
# | expression '||' expression
# | expression '&&' expression
#
# Emboss uses a slightly more complex grammar, in which '&&' and '||' are
# parallel, but unmixable:
#
# logical-expression -> and-expression
# | or-expression
# | equality-expression
# or-expression -> equality-expression or-expression-right+
# or-expression-right -> '||' equality-expression
# and-expression -> equality-expression and-expression-right+
# and-expression-right -> '&&' equality-expression
#
# In either case, explicit parenthesization is handled elsewhere in the grammar.
@_handles('logical-expression -> and-expression')
@_handles('logical-expression -> or-expression')
@_handles('logical-expression -> comparison-expression')
@_handles('choice-expression -> logical-expression')
@_handles('expression -> choice-expression')
def _expression(expression):
return expression
# The `logical-expression`s here means that ?: can't be chained without
# parentheses. `x < 0 ? -1 : (x == 0 ? 0 : 1)` is OK, but `x < 0 ? -1 : x == 0
# ? 0 : 1` is not. Parentheses are also needed in the middle: `x <= 0 ? x < 0 ?
# -1 : 0 : 1` is not syntactically valid.
@_handles('choice-expression -> logical-expression "?" logical-expression'
' ":" logical-expression')
def _choice_expression(condition, question, if_true, colon, if_false):
location = parser_types.make_location(
condition.source_location.start, if_false.source_location.end)
operator_location = parser_types.make_location(
question.source_location.start, colon.source_location.end)
# The function_name is a bit weird, but should suffice for any error messages
# that might need it.
return ir_pb2.Expression(
function=ir_pb2.Function(function=ir_pb2.FunctionMapping.CHOICE,
args=[condition, if_true, if_false],
function_name=ir_pb2.Word(
text='?:',
source_location=operator_location),
source_location=location))
@_handles('comparison-expression -> additive-expression')
def _no_op_comparative_expression(expression):
return expression
@_handles('comparison-expression ->'
' additive-expression inequality-operator additive-expression')
def _comparative_expression(left, operator, right):
location = parser_types.make_location(
left.source_location.start, right.source_location.end)
return ir_pb2.Expression(
function=ir_pb2.Function(function=_text_to_operator(operator.text),
args=[left, right],
function_name=operator,
source_location=location))
@_handles('additive-expression -> times-expression additive-expression-right*')
@_handles('times-expression -> negation-expression times-expression-right*')
@_handles('and-expression -> comparison-expression and-expression-right+')
@_handles('or-expression -> comparison-expression or-expression-right+')
def _binary_operator_expression(expression, expression_right):
"""Builds the IR for a chain of equal-precedence left-associative operations.
_binary_operator_expression transforms a right-recursive list of expression
tails into a left-associative Expression tree. For example, given the
arguments:
6, (Tail("+", 7), Tail("-", 8), Tail("+", 10))
_expression produces a structure like:
Expression(Expression(Expression(6, "+", 7), "-", 8), "+", 10)
This transformation is necessary because strict LR(1) grammars do not allow
left recursion.
Note that this method is used for several productions; each of those
productions handles a different precedence level, but are identical in form.
Arguments:
expression: An ir_pb2.Expression which is the head of the (expr, operator,
expr, operator, expr, ...) list.
expression_right: A list of _ExpressionTails corresponding to the (operator,
expr, operator, expr, ...) list that comes after expression.
Returns:
An ir_pb2.Expression with the correct recursive structure to represent a
list of left-associative operations.
"""
e = expression
for right in expression_right.list:
location = parser_types.make_location(
e.source_location.start, right.source_location.end)
e = ir_pb2.Expression(
function=ir_pb2.Function(
function=_text_to_operator(right.operator.text),
args=[e, right.expression],
function_name=right.operator,
source_location=location),
source_location=location)
return e
@_handles('comparison-expression ->'
' additive-expression equality-expression-right+')
@_handles('comparison-expression ->'
' additive-expression less-expression-right-list')
@_handles('comparison-expression ->'
' additive-expression greater-expression-right-list')
def _chained_comparison_expression(expression, expression_right):
"""Builds the IR for a chain of comparisons, like a == b == c.
Like _binary_operator_expression, _chained_comparison_expression transforms a
right-recursive list of expression tails into a left-associative Expression
tree. Unlike _binary_operator_expression, extra AND nodes are added. For
example, the following expression:
0 <= b <= 64
must be translated to the conceptually-equivalent expression:
0 <= b && b <= 64
(The middle subexpression is duplicated -- this would be a problem in a
programming language like C where expressions like `x++` have side effects,
but side effects do not make sense in a data definition language like Emboss.)
_chained_comparison_expression receives a left-hand head expression and a list
of tails, like:
6, (Tail("<=", b), Tail("<=", 64))
which it translates to a structure like:
Expression(Expression(6, "<=", b), "&&", Expression(b, "<=", 64))
The Emboss grammar is constructed such that sequences of "<", "<=", and "=="
comparisons may be chained, and sequences of ">", ">=", and "==" can be
chained, but greater and less-than comparisons may not; e.g., "b < 64 > a" is
not allowed.
Arguments:
expression: An ir_pb2.Expression which is the head of the (expr, operator,
expr, operator, expr, ...) list.
expression_right: A list of _ExpressionTails corresponding to the (operator,
expr, operator, expr, ...) list that comes after expression.
Returns:
An ir_pb2.Expression with the correct recursive structure to represent a
chain of left-associative comparison operations.
"""
sequence = [expression]
for right in expression_right.list:
sequence.append(right.operator)
sequence.append(right.expression)
comparisons = []
for i in range(0, len(sequence) - 1, 2):
left, operator, right = sequence[i:i+3]
location = parser_types.make_location(
left.source_location.start, right.source_location.end)
comparisons.append(ir_pb2.Expression(
function=ir_pb2.Function(
function=_text_to_operator(operator.text),
args=[left, right],
function_name=operator,
source_location=location),
source_location=location))
e = comparisons[0]
for comparison in comparisons[1:]:
location = parser_types.make_location(
e.source_location.start, comparison.source_location.end)
e = ir_pb2.Expression(
function=ir_pb2.Function(
function=ir_pb2.FunctionMapping.AND,
args=[e, comparison],
function_name=ir_pb2.Word(
text='&&',
source_location=comparison.function.args[0].source_location),
source_location=location),
source_location=location)
return e
# _chained_comparison_expression, above, handles three types of chains: `a == b
# == c`, `a < b <= c`, and `a > b >= c`.
#
# This requires a bit of subtlety in the productions for
# `x-expression-right-list`, because the `==` operator may be freely mixed into
# greater-than or less-than chains, like `a < b == c <= d` or `a > b == c >= d`,
# but greater-than and less-than may not be mixed; i.e., `a < b >= c` is
# disallowed.
#
# In order to keep the grammar unambiguous -- that is, in order to ensure that
# every valid input can only be parsed in exactly one way -- the languages
# defined by `equality-expression-right*`, `greater-expression-right-list`, and
# `less-expression-right-list` cannot overlap.
#
# `equality-expression-right*`, by definition, only contains `== n` elements.
# By forcing `greater-expression-right-list` to contain at least one
# `greater-expression-right`, we can ensure that a chain like `== n == m` cannot
# be parsed as a `greater-expression-right-list`. Similar logic applies in the
# less-than case.
#
# There is another potential source of ambiguity here: if
# `greater-expression-right-list` were
#
# greater-expression-right-list ->
# equality-or-greater-expression-right* greater-expression-right
# equality-or-greater-expression-right*
#
# then a sequence like '> b > c > d' could be parsed as any of:
#
# () (> b) ((> c) (> d))
# ((> b)) (> c) ((> d))
# ((> b) (> c)) (> d) ()
#
# By using `equality-expression-right*` for the first symbol, only the first
# parse is possible.
@_handles('greater-expression-right-list ->'
' equality-expression-right* greater-expression-right'
' equality-or-greater-expression-right*')
@_handles('less-expression-right-list ->'
' equality-expression-right* less-expression-right'
' equality-or-less-expression-right*')
def _chained_comparison_tails(start, middle, end):
return _List(start.list + [middle] + end.list)
@_handles('equality-or-greater-expression-right -> equality-expression-right')
@_handles('equality-or-greater-expression-right -> greater-expression-right')
@_handles('equality-or-less-expression-right -> equality-expression-right')
@_handles('equality-or-less-expression-right -> less-expression-right')
def _equality_or_less_or_greater(right):
return right
@_handles('and-expression-right -> and-operator comparison-expression')
@_handles('or-expression-right -> or-operator comparison-expression')
@_handles('additive-expression-right -> additive-operator times-expression')
@_handles('equality-expression-right -> equality-operator additive-expression')
@_handles('greater-expression-right -> greater-operator additive-expression')
@_handles('less-expression-right -> less-operator additive-expression')
@_handles('times-expression-right ->'
' multiplicative-operator negation-expression')
def _expression_right_production(operator, expression):
return _ExpressionTail(operator, expression)
# This supports a single layer of unary plus/minus, so "+5" and "-value" are
# allowed, but "+-5" or "-+-something" are not.
@_handles('negation-expression -> additive-operator bottom-expression')
def _negation_expression_with_operator(operator, expression):
phantom_zero_location = ir_pb2.Location(start=operator.source_location.start,
end=operator.source_location.start)
return ir_pb2.Expression(
function=ir_pb2.Function(
function=_text_to_operator(operator.text),
args=[ir_pb2.Expression(
constant=ir_pb2.NumericConstant(
value='0',
source_location=phantom_zero_location),
source_location=phantom_zero_location), expression],
function_name=operator,
source_location=ir_pb2.Location(
start=operator.source_location.start,
end=expression.source_location.end)))
@_handles('negation-expression -> bottom-expression')
def _negation_expression(expression):
return expression
@_handles('bottom-expression -> "(" expression ")"')
def _bottom_expression_parentheses(open_paren, expression, close_paren):
del open_paren, close_paren # Unused.
return expression
@_handles('bottom-expression -> function-name "(" argument-list ")"')
def _bottom_expression_function(function, open_paren, arguments, close_paren):
del open_paren # Unused.
return ir_pb2.Expression(
function=ir_pb2.Function(
function=_text_to_function(function.text),
args=arguments.list,
function_name=function,
source_location=ir_pb2.Location(
start=function.source_location.start,
end=close_paren.source_location.end)))
@_handles('comma-then-expression -> "," expression')
def _comma_then_expression(comma, expression):
del comma # Unused.
return expression
@_handles('argument-list -> expression comma-then-expression*')
def _argument_list(head, tail):
tail.list.insert(0, head)
return tail
@_handles('argument-list ->')
def _empty_argument_list():
return _List([])
@_handles('bottom-expression -> numeric-constant')
def _bottom_expression_from_numeric_constant(constant):
return ir_pb2.Expression(constant=constant)
@_handles('bottom-expression -> constant-reference')
def _bottom_expression_from_constant_reference(reference):
return ir_pb2.Expression(constant_reference=reference)
@_handles('bottom-expression -> builtin-reference')
def _bottom_expression_from_builtin(reference):
return ir_pb2.Expression(builtin_reference=reference)
@_handles('bottom-expression -> boolean-constant')
def _bottom_expression_from_boolean_constant(boolean):
return ir_pb2.Expression(boolean_constant=boolean)
@_handles('bottom-expression -> field-reference')
def _bottom_expression_from_reference(reference):
return reference
@_handles('field-reference -> snake-reference field-reference-tail*')
def _indirect_field_reference(field_reference, field_references):
if field_references.source_location.HasField('end'):
end_location = field_references.source_location.end
else:
end_location = field_reference.source_location.end
return ir_pb2.Expression(field_reference=ir_pb2.FieldReference(
path=[field_reference] + field_references.list,
source_location=parser_types.make_location(
field_reference.source_location.start, end_location)))
# If "Type.field" ever becomes syntactically valid, it will be necessary to
# check that enum values are compile-time constants.
@_handles('field-reference-tail -> "." snake-reference')
def _field_reference_tail(dot, reference):
del dot # Unused.
return reference
@_handles('numeric-constant -> Number')
def _numeric_constant(number):
# All types of numeric constant tokenize to the same symbol, because they are
# interchangeable in source code.
if number.text[0:2] == '0b':
n = int(number.text.replace('_', '')[2:], 2)
elif number.text[0:2] == '0x':
n = int(number.text.replace('_', '')[2:], 16)
else:
n = int(number.text.replace('_', ''), 10)
return ir_pb2.NumericConstant(value=str(n))
@_handles('type-definition -> struct')
@_handles('type-definition -> bits')
@_handles('type-definition -> enum')
@_handles('type-definition -> external')
def _type_definition(type_definition):
return type_definition
# struct StructureName:
# ... fields ...
# bits BitName:
# ... fields ...
@_handles('struct -> "struct" type-name delimited-parameter-definition-list?'
' ":" Comment? eol struct-body')
@_handles('bits -> "bits" type-name delimited-parameter-definition-list? ":"'
' Comment? eol bits-body')
def _structure(struct, name, parameters, colon, comment, newline, struct_body):
"""Composes the top-level IR for an Emboss structure."""
del colon, comment, newline # Unused.
struct_body.structure.source_location.start.CopyFrom(
struct.source_location.start)
struct_body.structure.source_location.end.CopyFrom(
struct_body.source_location.end)
struct_body.name.CopyFrom(name)
if parameters.list:
struct_body.runtime_parameter.extend(parameters.list[0].list)
return struct_body
@_handles('delimited-parameter-definition-list ->'
' "(" parameter-definition-list ")"')
def _delimited_parameter_definition_list(open_paren, parameters, close_paren):
del open_paren, close_paren # Unused
return parameters
@_handles('parameter-definition -> snake-name ":" type')
def _parameter_definition(name, double_colon, parameter_type):
del double_colon # Unused
return ir_pb2.RuntimeParameter(name=name, physical_type_alias=parameter_type)
@_handles('parameter-definition-list-tail -> "," parameter-definition')
def _parameter_definition_list_tail(comma, parameter):
del comma # Unused.
return parameter
@_handles('parameter-definition-list -> parameter-definition'
' parameter-definition-list-tail*')
def _parameter_definition_list(head, tail):
tail.list.insert(0, head)
return tail
@_handles('parameter-definition-list ->')
def _empty_parameter_definition_list():
return _List([])
# The body of a struct: basically, the part after the first line.
@_handles('struct-body -> Indent doc-line* attribute-line*'
' type-definition* struct-field-block Dedent')
def _struct_body(indent, docs, attributes, types, fields, dedent):
del indent, dedent # Unused.
return _structure_body(docs, attributes, types, fields,
ir_pb2.AddressableUnit.BYTE)
def _structure_body(docs, attributes, types, fields, addressable_unit):
"""Constructs the body of a structure (bits or struct) definition."""
return ir_pb2.TypeDefinition(
structure=ir_pb2.Structure(field=[field.field for field in fields.list]),
documentation=docs.list,
attribute=attributes.list,
subtype=types.list + [subtype for field in fields.list for subtype in
field.subtypes],
addressable_unit=addressable_unit)
@_handles('struct-field-block ->')
@_handles('bits-field-block ->')
@_handles('anonymous-bits-field-block ->')
def _empty_field_block():
return _List([])
@_handles('struct-field-block ->'
' conditional-struct-field-block struct-field-block')
@_handles('bits-field-block ->'
' conditional-bits-field-block bits-field-block')
@_handles('anonymous-bits-field-block -> conditional-anonymous-bits-field-block'
' anonymous-bits-field-block')
def _conditional_block_plus_field_block(conditional_block, block):
return _List(conditional_block.list + block.list)
@_handles('struct-field-block ->'
' unconditional-struct-field struct-field-block')
@_handles('bits-field-block ->'
' unconditional-bits-field bits-field-block')
@_handles('anonymous-bits-field-block ->'
' unconditional-anonymous-bits-field anonymous-bits-field-block')
def _unconditional_block_plus_field_block(field, block):
"""Prepends an unconditional field to block."""
field.field.existence_condition.source_location.CopyFrom(
field.source_location)
field.field.existence_condition.boolean_constant.source_location.CopyFrom(
field.source_location)
field.field.existence_condition.boolean_constant.value = True
return _List([field] + block.list)
# Struct "fields" are regular fields, inline enums, bits, or structs, anonymous
# inline bits, or virtual fields.
@_handles('unconditional-struct-field -> field')
@_handles('unconditional-struct-field -> inline-enum-field-definition')
@_handles('unconditional-struct-field -> inline-bits-field-definition')
@_handles('unconditional-struct-field -> inline-struct-field-definition')
@_handles('unconditional-struct-field -> anonymous-bits-field-definition')
@_handles('unconditional-struct-field -> virtual-field')
# Bits fields are "regular" fields, inline enums or bits, or virtual fields.
#
# Inline structs and anonymous inline bits are not allowed inside of bits:
# anonymous inline bits are pointless, and inline structs do not make sense,
# since a struct cannot be a part of a bits.
#
# Anonymous inline bits may not include virtual fields; instead, the virtual
# field should be a direct part of the enclosing structure.
@_handles('unconditional-anonymous-bits-field -> field')
@_handles('unconditional-anonymous-bits-field -> inline-enum-field-definition')
@_handles('unconditional-anonymous-bits-field -> inline-bits-field-definition')
@_handles('unconditional-bits-field -> unconditional-anonymous-bits-field')
@_handles('unconditional-bits-field -> virtual-field')
def _unconditional_field(field):
"""Handles the unifying grammar production for a struct or bits field."""
return field
# TODO(bolms): Add 'elif' and 'else' support.
# TODO(bolms): Should nested 'if' blocks be allowed?
@_handles('conditional-struct-field-block ->'
' "if" expression ":" Comment? eol'
' Indent unconditional-struct-field+ Dedent')
@_handles('conditional-bits-field-block ->'
' "if" expression ":" Comment? eol'
' Indent unconditional-bits-field+ Dedent')
@_handles('conditional-anonymous-bits-field-block ->'
' "if" expression ":" Comment? eol'
' Indent unconditional-anonymous-bits-field+ Dedent')
def _conditional_field_block(if_keyword, expression, colon, comment, newline,
indent, fields, dedent):
"""Applies an existence_condition to each element of fields."""
del if_keyword, newline, colon, comment, indent, dedent # Unused.
for field in fields.list:
condition = field.field.existence_condition
condition.CopyFrom(expression)
condition.source_location.is_disjoint_from_parent = True
return fields
# The body of a bit field definition: basically, the part after the first line.
@_handles('bits-body -> Indent doc-line* attribute-line*'
' type-definition* bits-field-block Dedent')
def _bits_body(indent, docs, attributes, types, fields, dedent):
del indent, dedent # Unused.
return _structure_body(docs, attributes, types, fields,
ir_pb2.AddressableUnit.BIT)
# Inline bits (defined as part of a field) are more restricted than standalone
# bits.
@_handles('anonymous-bits-body ->'
' Indent attribute-line* anonymous-bits-field-block Dedent')
def _anonymous_bits_body(indent, attributes, fields, dedent):
del indent, dedent # Unused.
return _structure_body(_List([]), attributes, _List([]), fields,
ir_pb2.AddressableUnit.BIT)
# A field is:
# range type name (abbr) [attr: value] [attr2: value] -- doc
# -- doc
# -- doc
# [attr3: value]
# [attr4: value]
@_handles('field ->'
' field-location type snake-name abbreviation? attribute* doc?'
' Comment? eol field-body?')
def _field(location, field_type, name, abbreviation, attributes, doc, comment,
newline, field_body):
"""Constructs an ir_pb2.Field from the given components."""
del comment # Unused
field = ir_pb2.Field(location=location,
type=field_type,
name=name,
attribute=attributes.list,
documentation=doc.list)
if field_body.list:
field.attribute.extend(field_body.list[0].attribute)
field.documentation.extend(field_body.list[0].documentation)
if abbreviation.list:
field.abbreviation.CopyFrom(abbreviation.list[0])
field.source_location.start.CopyFrom(location.source_location.start)
if field_body.source_location.HasField('end'):
field.source_location.end.CopyFrom(field_body.source_location.end)
else:
field.source_location.end.CopyFrom(newline.source_location.end)
return _FieldWithType(field=field)
# A "virtual field" is:
# let name = value
# -- doc
# -- doc
# [attr1: value]
# [attr2: value]
@_handles('virtual-field ->'
' "let" snake-name "=" expression Comment? eol field-body?')
def _virtual_field(let, name, equals, value, comment, newline, field_body):
"""Constructs an ir_pb2.Field from the given components."""
del equals, comment # Unused
field = ir_pb2.Field(read_transform=value, name=name)
if field_body.list:
field.attribute.extend(field_body.list[0].attribute)
field.documentation.extend(field_body.list[0].documentation)
field.source_location.start.CopyFrom(let.source_location.start)
if field_body.source_location.HasField('end'):
field.source_location.end.CopyFrom(field_body.source_location.end)
else:
field.source_location.end.CopyFrom(newline.source_location.end)
return _FieldWithType(field=field)
# An inline enum is:
# range "enum" name (abbr):
# -- doc
# -- doc
# [attr3: value]
# [attr4: value]
# NAME = 10
# NAME2 = 20
@_handles('inline-enum-field-definition ->'
' field-location "enum" snake-name abbreviation? ":" Comment? eol'
' enum-body')
def _inline_enum_field(location, enum, name, abbreviation, colon, comment,
newline, enum_body):
"""Constructs an ir_pb2.Field for an inline enum field."""
del enum, colon, comment, newline # Unused.
return _inline_type_field(location, name, abbreviation, enum_body)
@_handles(
'inline-struct-field-definition ->'
' field-location "struct" snake-name abbreviation? ":" Comment? eol'
' struct-body')
def _inline_struct_field(location, struct, name, abbreviation, colon, comment,
newline, struct_body):
del struct, colon, comment, newline # Unused.
return _inline_type_field(location, name, abbreviation, struct_body)
@_handles('inline-bits-field-definition ->'
' field-location "bits" snake-name abbreviation? ":" Comment? eol'
' bits-body')
def _inline_bits_field(location, bits, name, abbreviation, colon, comment,
newline, bits_body):
del bits, colon, comment, newline # Unused.
return _inline_type_field(location, name, abbreviation, bits_body)
def _inline_type_field(location, name, abbreviation, body):
"""Shared implementation of _inline_enum_field and _anonymous_bit_field."""
field = ir_pb2.Field(location=location,
name=name,
attribute=body.attribute,
documentation=body.documentation)
# All attributes should be attached to the field, not the type definition: if
# the user wants to use type attributes, they should create a separate type
# definition and reference it.
del body.attribute[:]
type_name = ir_pb2.NameDefinition()
type_name.CopyFrom(name)
type_name.name.text = name_conversion.snake_to_camel(type_name.name.text)
field.type.atomic_type.reference.source_name.extend([type_name.name])
field.type.atomic_type.reference.source_location.CopyFrom(
type_name.source_location)
field.type.atomic_type.reference.is_local_name = True
field.type.atomic_type.source_location.CopyFrom(type_name.source_location)
field.type.source_location.CopyFrom(type_name.source_location)
if abbreviation.list:
field.abbreviation.CopyFrom(abbreviation.list[0])
field.source_location.start.CopyFrom(location.source_location.start)
body.source_location.start.CopyFrom(location.source_location.start)
if body.HasField('enumeration'):
body.enumeration.source_location.CopyFrom(body.source_location)
else:
assert body.HasField('structure')
body.structure.source_location.CopyFrom(body.source_location)
body.name.CopyFrom(type_name)
field.source_location.end.CopyFrom(body.source_location.end)
subtypes = [body] + list(body.subtype)
del body.subtype[:]
return _FieldWithType(field=field, subtypes=subtypes)
@_handles('anonymous-bits-field-definition ->'
' field-location "bits" ":" Comment? eol anonymous-bits-body')
def _anonymous_bit_field(location, bits_keyword, colon, comment, newline,
bits_body):
"""Constructs an ir_pb2.Field for an anonymous bit field."""
del colon, comment, newline # Unused.
name = ir_pb2.NameDefinition(
name=ir_pb2.Word(
text=_get_anonymous_field_name(),
source_location=bits_keyword.source_location),
source_location=bits_keyword.source_location,
is_anonymous=True)
return _inline_type_field(location, name, _List([]), bits_body)
@_handles('field-body -> Indent doc-line* attribute-line* Dedent')
def _field_body(indent, docs, attributes, dedent):
del indent, dedent # Unused.
return ir_pb2.Field(documentation=docs.list, attribute=attributes.list)
# A parenthetically-denoted abbreviation.
@_handles('abbreviation -> "(" snake-word ")"')
def _abbreviation(open_paren, word, close_paren):
del open_paren, close_paren # Unused.
return word
# enum EnumName:
# ... values ...
@_handles('enum -> "enum" type-name ":" Comment? eol enum-body')
def _enum(enum, name, colon, comment, newline, enum_body):
del colon, comment, newline # Unused.
enum_body.enumeration.source_location.start.CopyFrom(
enum.source_location.start)
enum_body.enumeration.source_location.end.CopyFrom(
enum_body.source_location.end)
enum_body.name.CopyFrom(name)
return enum_body
# [enum Foo:]
# name = value
# name = value
@_handles('enum-body -> Indent doc-line* attribute-line* enum-value+ Dedent')
def _enum_body(indent, docs, attributes, values, dedent):
del indent, dedent # Unused.
return ir_pb2.TypeDefinition(
enumeration=ir_pb2.Enum(value=values.list),
documentation=docs.list,
attribute=attributes.list,
addressable_unit=ir_pb2.AddressableUnit.BIT)
# name = value
@_handles('enum-value -> '
' constant-name "=" expression attribute* doc? Comment? eol enum-value-body?')
def _enum_value(name, equals, expression, attribute, documentation, comment, newline,
body):
del equals, comment, newline # Unused.
result = ir_pb2.EnumValue(name=name,
value=expression,
documentation=documentation.list,
attribute=attribute.list)
if body.list:
result.documentation.extend(body.list[0].documentation)
result.attribute.extend(body.list[0].attribute)
return result
@_handles('enum-value-body -> Indent doc-line* attribute-line* Dedent')
def _enum_value_body(indent, docs, attributes, dedent):
del indent, dedent # Unused.
return ir_pb2.EnumValue(documentation=docs.list, attribute=attributes.list)
# An external is just a declaration that a type exists and has certain
# attributes.
@_handles('external -> "external" type-name ":" Comment? eol external-body')
def _external(external, name, colon, comment, newline, external_body):
del colon, comment, newline # Unused.
external_body.source_location.start.CopyFrom(external.source_location.start)
external_body.name.CopyFrom(name)
return external_body
# This syntax implicitly requires either a documentation line or a attribute
# line, or it won't parse (because no Indent/Dedent tokens will be emitted).
@_handles('external-body -> Indent doc-line* attribute-line* Dedent')
def _external_body(indent, docs, attributes, dedent):
return ir_pb2.TypeDefinition(
external=ir_pb2.External(
# Set source_location here, since it won't be set automatically.
source_location=ir_pb2.Location(start=indent.source_location.start,
end=dedent.source_location.end)),
documentation=docs.list,
attribute=attributes.list)
@_handles('field-location -> expression "[" "+" expression "]"')
def _field_location(start, open_bracket, plus, size, close_bracket):
del open_bracket, plus, close_bracket # Unused.
return ir_pb2.FieldLocation(start=start, size=size)
@_handles('delimited-argument-list -> "(" argument-list ")"')
def _type_argument_list(open_paren, arguments, close_paren):
del open_paren, close_paren # Unused
return arguments
# A type is "TypeName" or "TypeName[length]" or "TypeName[length][length]", etc.
# An array type may have an empty length ("Type[]"). This is only valid for the
# outermost length (the last set of brackets), but that must be checked
# elsewhere.
@_handles('type -> type-reference delimited-argument-list? type-size-specifier?'
' array-length-specifier*')
def _type(reference, parameters, size, array_spec):
"""Builds the IR for a type specifier."""
base_type_source_location_end = reference.source_location.end
atomic_type_source_location_end = reference.source_location.end
if parameters.list:
base_type_source_location_end = parameters.source_location.end
atomic_type_source_location_end = parameters.source_location.end
if size.list:
base_type_source_location_end = size.source_location.end
base_type_location = parser_types.make_location(
reference.source_location.start,
base_type_source_location_end)
atomic_type_location = parser_types.make_location(
reference.source_location.start,
atomic_type_source_location_end)
t = ir_pb2.Type(
atomic_type=ir_pb2.AtomicType(
reference=reference,
source_location=atomic_type_location,
runtime_parameter=parameters.list[0].list if parameters.list else []),
size_in_bits=size.list[0] if size.list else None,
source_location=base_type_location)
for length in array_spec.list:
location = parser_types.make_location(
t.source_location.start, length.source_location.end)
if isinstance(length, ir_pb2.Expression):
t = ir_pb2.Type(
array_type=ir_pb2.ArrayType(base_type=t,
element_count=length,
source_location=location),
source_location=location)
elif isinstance(length, ir_pb2.Empty):
t = ir_pb2.Type(
array_type=ir_pb2.ArrayType(base_type=t,
automatic=length,
source_location=location),
source_location=location)
else:
assert False, "Shouldn't be here."
return t
# TODO(bolms): Should symbolic names or expressions be allowed? E.g.,
# UInt:FIELD_SIZE or UInt:(16 + 16)?
@_handles('type-size-specifier -> ":" numeric-constant')
def _type_size_specifier(colon, numeric_constant):
"""handles the ":32" part of a type specifier like "UInt:32"."""
del colon
return ir_pb2.Expression(constant=numeric_constant)
# The distinctions between different formats of NameDefinitions, Words, and
# References are enforced during parsing, but not propagated to the IR.
@_handles('type-name -> type-word')
@_handles('snake-name -> snake-word')
@_handles('constant-name -> constant-word')
def _name(word):
return ir_pb2.NameDefinition(name=word)
@_handles('type-word -> CamelWord')
@_handles('snake-word -> SnakeWord')
@_handles('builtin-field-word -> "$size_in_bits"')
@_handles('builtin-field-word -> "$size_in_bytes"')
@_handles('builtin-field-word -> "$max_size_in_bits"')
@_handles('builtin-field-word -> "$max_size_in_bytes"')
@_handles('builtin-field-word -> "$min_size_in_bits"')
@_handles('builtin-field-word -> "$min_size_in_bytes"')
@_handles('builtin-word -> "$is_statically_sized"')
@_handles('builtin-word -> "$static_size_in_bits"')
@_handles('builtin-word -> "$next"')
@_handles('constant-word -> ShoutyWord')
@_handles('and-operator -> "&&"')
@_handles('or-operator -> "||"')
@_handles('less-operator -> "<="')
@_handles('less-operator -> "<"')
@_handles('greater-operator -> ">="')
@_handles('greater-operator -> ">"')
@_handles('equality-operator -> "=="')
@_handles('inequality-operator -> "!="')
@_handles('additive-operator -> "+"')
@_handles('additive-operator -> "-"')
@_handles('multiplicative-operator -> "*"')
@_handles('function-name -> "$max"')
@_handles('function-name -> "$present"')
@_handles('function-name -> "$upper_bound"')
@_handles('function-name -> "$lower_bound"')
def _word(word):
return ir_pb2.Word(text=word.text)
@_handles('type-reference -> type-reference-tail')
@_handles('constant-reference -> constant-reference-tail')
def _un_module_qualified_type_reference(reference):
return reference
@_handles('constant-reference-tail -> constant-word')
@_handles('type-reference-tail -> type-word')
@_handles('snake-reference -> snake-word')
@_handles('snake-reference -> builtin-field-word')
def _reference(word):
return ir_pb2.Reference(source_name=[word])
@_handles('builtin-reference -> builtin-word')
def _builtin_reference(word):
return ir_pb2.Reference(source_name=[word],
canonical_name=ir_pb2.CanonicalName(
object_path=[word.text]))
# Because constant-references ("Enum.NAME") are used in the same contexts as
# field-references ("field.subfield"), module-qualified constant references
# ("module.Enum.VALUE") have to take snake-reference, not snake-word, on the
# left side of the dot. Otherwise, when a "snake_word" is followed by a "." in
# an expression context, the LR(1) parser cannot determine whether to reduce the
# snake-word to snake-reference (to eventually become field-reference), or to
# shift the dot onto the stack (to eventually become constant-reference). By
# using snake-reference as the head of both, the parser can always reduce, then
# shift the dot, then determine whether to proceed with constant-reference if it
# sees "snake_name.TypeName" or field-reference if it sees
# "snake_name.snake_name".
@_handles('constant-reference -> snake-reference "." constant-reference-tail')
def _module_qualified_constant_reference(new_head, dot, reference):
del dot # Unused.
new_source_name = list(new_head.source_name) + list(reference.source_name)
del reference.source_name[:]
reference.source_name.extend(new_source_name)
return reference
@_handles('constant-reference-tail -> type-word "." constant-reference-tail')
# module.Type.SubType.name is a reference to something that *must* be a
# constant.
@_handles('constant-reference-tail -> type-word "." snake-reference')
@_handles('type-reference-tail -> type-word "." type-reference-tail')
@_handles('type-reference -> snake-word "." type-reference-tail')
def _qualified_reference(word, dot, reference):
"""Adds a name. or Type. qualification to the head of a reference."""
del dot # Unused.
new_source_name = [word] + list(reference.source_name)
del reference.source_name[:]
reference.source_name.extend(new_source_name)
return reference
# Arrays are properly translated to IR in _type().
@_handles('array-length-specifier -> "[" expression "]"')
def _array_length_specifier(open_bracket, length, close_bracket):
del open_bracket, close_bracket # Unused.
return length
# An array specifier can end with empty brackets ("arr[3][]"), in which case the
# array's size is inferred from the size of its enclosing field.
@_handles('array-length-specifier -> "[" "]"')
def _auto_array_length_specifier(open_bracket, close_bracket):
# Note that the Void's source_location is the space between the brackets (if
# any).
return ir_pb2.Empty(
source_location=ir_pb2.Location(start=open_bracket.source_location.end,
end=close_bracket.source_location.start))
@_handles('eol -> "\\n" comment-line*')
def _eol(eol, comments):
del comments # Unused
return eol
@_handles('comment-line -> Comment? "\\n"')
def _comment_line(comment, eol):
del comment # Unused
return eol
def _finalize_grammar():
"""_Finalize adds productions for foo*, foo+, and foo? symbols."""
star_symbols = set()
plus_symbols = set()
option_symbols = set()
for production in _handlers:
for symbol in production.rhs:
if symbol[-1] == '*':
star_symbols.add(symbol[:-1])
elif symbol[-1] == '+':
# symbol+ relies on the rule for symbol*
star_symbols.add(symbol[:-1])
plus_symbols.add(symbol[:-1])
elif symbol[-1] == '?':
option_symbols.add(symbol[:-1])
for symbol in star_symbols:
_handles('{s}* -> {s} {s}*'.format(s=symbol))(
lambda e, r: _List([e] + r.list))
_handles('{s}* ->'.format(s=symbol))(lambda: _List([]))
for symbol in plus_symbols:
_handles('{s}+ -> {s} {s}*'.format(s=symbol))(
lambda e, r: _List([e] + r.list))
for symbol in option_symbols:
_handles('{s}? -> {s}'.format(s=symbol))(lambda e: _List([e]))
_handles('{s}? ->'.format(s=symbol))(lambda: _List([]))
_finalize_grammar()
# End of grammar.
################################################################################
# These export the grammar used by module_ir so that parser_generator can build
# a parser for the same language.
START_SYMBOL = 'module'
EXPRESSION_START_SYMBOL = 'expression'
PRODUCTIONS = list(_handlers.keys())