blob: a80b898cd65d30beb4056a927632f8b9736ad921 [file] [log] [blame]
# mako/lexer.py
# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
"""provides the Lexer class for parsing template strings into parse trees."""
import re
import codecs
from mako import parsetree, exceptions, compat
from mako.pygen import adjust_whitespace
_regexp_cache = {}
class Lexer(object):
def __init__(self, text, filename=None,
disable_unicode=False,
input_encoding=None, preprocessor=None):
self.text = text
self.filename = filename
self.template = parsetree.TemplateNode(self.filename)
self.matched_lineno = 1
self.matched_charpos = 0
self.lineno = 1
self.match_position = 0
self.tag = []
self.control_line = []
self.ternary_stack = []
self.disable_unicode = disable_unicode
self.encoding = input_encoding
if compat.py3k and disable_unicode:
raise exceptions.UnsupportedError(
"Mako for Python 3 does not "
"support disabling Unicode")
if preprocessor is None:
self.preprocessor = []
elif not hasattr(preprocessor, '__iter__'):
self.preprocessor = [preprocessor]
else:
self.preprocessor = preprocessor
@property
def exception_kwargs(self):
return {'source': self.text,
'lineno': self.matched_lineno,
'pos': self.matched_charpos,
'filename': self.filename}
def match(self, regexp, flags=None):
"""compile the given regexp, cache the reg, and call match_reg()."""
try:
reg = _regexp_cache[(regexp, flags)]
except KeyError:
if flags:
reg = re.compile(regexp, flags)
else:
reg = re.compile(regexp)
_regexp_cache[(regexp, flags)] = reg
return self.match_reg(reg)
def match_reg(self, reg):
"""match the given regular expression object to the current text
position.
if a match occurs, update the current text and line position.
"""
mp = self.match_position
match = reg.match(self.text, self.match_position)
if match:
(start, end) = match.span()
if end == start:
self.match_position = end + 1
else:
self.match_position = end
self.matched_lineno = self.lineno
lines = re.findall(r"\n", self.text[mp:self.match_position])
cp = mp - 1
while (cp >= 0 and cp < self.textlength and self.text[cp] != '\n'):
cp -= 1
self.matched_charpos = mp - cp
self.lineno += len(lines)
# print "MATCHED:", match.group(0), "LINE START:",
# self.matched_lineno, "LINE END:", self.lineno
# print "MATCH:", regexp, "\n", self.text[mp : mp + 15], \
# (match and "TRUE" or "FALSE")
return match
def parse_until_text(self, watch_nesting, *text):
startpos = self.match_position
text_re = r'|'.join(text)
brace_level = 0
paren_level = 0
bracket_level = 0
while True:
match = self.match(r'#.*\n')
if match:
continue
match = self.match(r'(\"\"\"|\'\'\'|\"|\')[^\\]*?(\\.[^\\]*?)*\1',
re.S)
if match:
continue
match = self.match(r'(%s)' % text_re)
if match and not (watch_nesting
and (brace_level > 0 or paren_level > 0
or bracket_level > 0)):
return \
self.text[startpos:
self.match_position - len(match.group(1))],\
match.group(1)
elif not match:
match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S)
if match:
brace_level += match.group(1).count('{')
brace_level -= match.group(1).count('}')
paren_level += match.group(1).count('(')
paren_level -= match.group(1).count(')')
bracket_level += match.group(1).count('[')
bracket_level -= match.group(1).count(']')
continue
raise exceptions.SyntaxException(
"Expected: %s" %
','.join(text),
**self.exception_kwargs)
def append_node(self, nodecls, *args, **kwargs):
kwargs.setdefault('source', self.text)
kwargs.setdefault('lineno', self.matched_lineno)
kwargs.setdefault('pos', self.matched_charpos)
kwargs['filename'] = self.filename
node = nodecls(*args, **kwargs)
if len(self.tag):
self.tag[-1].nodes.append(node)
else:
self.template.nodes.append(node)
# build a set of child nodes for the control line
# (used for loop variable detection)
# also build a set of child nodes on ternary control lines
# (used for determining if a pass needs to be auto-inserted
if self.control_line:
control_frame = self.control_line[-1]
control_frame.nodes.append(node)
if not (isinstance(node, parsetree.ControlLine) and
control_frame.is_ternary(node.keyword)):
if self.ternary_stack and self.ternary_stack[-1]:
self.ternary_stack[-1][-1].nodes.append(node)
if isinstance(node, parsetree.Tag):
if len(self.tag):
node.parent = self.tag[-1]
self.tag.append(node)
elif isinstance(node, parsetree.ControlLine):
if node.isend:
self.control_line.pop()
self.ternary_stack.pop()
elif node.is_primary:
self.control_line.append(node)
self.ternary_stack.append([])
elif self.control_line and \
self.control_line[-1].is_ternary(node.keyword):
self.ternary_stack[-1].append(node)
elif self.control_line and \
not self.control_line[-1].is_ternary(node.keyword):
raise exceptions.SyntaxException(
"Keyword '%s' not a legal ternary for keyword '%s'" %
(node.keyword, self.control_line[-1].keyword),
**self.exception_kwargs)
_coding_re = re.compile(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')
def decode_raw_stream(self, text, decode_raw, known_encoding, filename):
"""given string/unicode or bytes/string, determine encoding
from magic encoding comment, return body as unicode
or raw if decode_raw=False
"""
if isinstance(text, compat.text_type):
m = self._coding_re.match(text)
encoding = m and m.group(1) or known_encoding or 'ascii'
return encoding, text
if text.startswith(codecs.BOM_UTF8):
text = text[len(codecs.BOM_UTF8):]
parsed_encoding = 'utf-8'
m = self._coding_re.match(text.decode('utf-8', 'ignore'))
if m is not None and m.group(1) != 'utf-8':
raise exceptions.CompileException(
"Found utf-8 BOM in file, with conflicting "
"magic encoding comment of '%s'" % m.group(1),
text.decode('utf-8', 'ignore'),
0, 0, filename)
else:
m = self._coding_re.match(text.decode('utf-8', 'ignore'))
if m:
parsed_encoding = m.group(1)
else:
parsed_encoding = known_encoding or 'ascii'
if decode_raw:
try:
text = text.decode(parsed_encoding)
except UnicodeDecodeError:
raise exceptions.CompileException(
"Unicode decode operation of encoding '%s' failed" %
parsed_encoding,
text.decode('utf-8', 'ignore'),
0, 0, filename)
return parsed_encoding, text
def parse(self):
self.encoding, self.text = self.decode_raw_stream(
self.text,
not self.disable_unicode,
self.encoding,
self.filename)
for preproc in self.preprocessor:
self.text = preproc(self.text)
# push the match marker past the
# encoding comment.
self.match_reg(self._coding_re)
self.textlength = len(self.text)
while (True):
if self.match_position > self.textlength:
break
if self.match_end():
break
if self.match_expression():
continue
if self.match_control_line():
continue
if self.match_comment():
continue
if self.match_tag_start():
continue
if self.match_tag_end():
continue
if self.match_python_block():
continue
if self.match_text():
continue
if self.match_position > self.textlength:
break
raise exceptions.CompileException("assertion failed")
if len(self.tag):
raise exceptions.SyntaxException("Unclosed tag: <%%%s>" %
self.tag[-1].keyword,
**self.exception_kwargs)
if len(self.control_line):
raise exceptions.SyntaxException(
"Unterminated control keyword: '%s'" %
self.control_line[-1].keyword,
self.text,
self.control_line[-1].lineno,
self.control_line[-1].pos, self.filename)
return self.template
def match_tag_start(self):
match = self.match(r'''
\<% # opening tag
([\w\.\:]+) # keyword
((?:\s+\w+|\s*=\s*|".*?"|'.*?')*) # attrname, = \
# sign, string expression
\s* # more whitespace
(/)?> # closing
''',
re.I | re.S | re.X)
if match:
keyword, attr, isend = match.groups()
self.keyword = keyword
attributes = {}
if attr:
for att in re.findall(
r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
key, val1, val2 = att
text = val1 or val2
text = text.replace('\r\n', '\n')
attributes[key] = text
self.append_node(parsetree.Tag, keyword, attributes)
if isend:
self.tag.pop()
else:
if keyword == 'text':
match = self.match(r'(.*?)(?=\</%text>)', re.S)
if not match:
raise exceptions.SyntaxException(
"Unclosed tag: <%%%s>" %
self.tag[-1].keyword,
**self.exception_kwargs)
self.append_node(parsetree.Text, match.group(1))
return self.match_tag_end()
return True
else:
return False
def match_tag_end(self):
match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
if match:
if not len(self.tag):
raise exceptions.SyntaxException(
"Closing tag without opening tag: </%%%s>" %
match.group(1),
**self.exception_kwargs)
elif self.tag[-1].keyword != match.group(1):
raise exceptions.SyntaxException(
"Closing tag </%%%s> does not match tag: <%%%s>" %
(match.group(1), self.tag[-1].keyword),
**self.exception_kwargs)
self.tag.pop()
return True
else:
return False
def match_end(self):
match = self.match(r'\Z', re.S)
if match:
string = match.group()
if string:
return string
else:
return True
else:
return False
def match_text(self):
match = self.match(r"""
(.*?) # anything, followed by:
(
(?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based
# comment preceded by a
# consumed newline and whitespace
|
(?=\${) # an expression
|
(?=</?[%&]) # a substitution or block or call start or end
# - don't consume
|
(\\\r?\n) # an escaped newline - throw away
|
\Z # end of string
)""", re.X | re.S)
if match:
text = match.group(1)
if text:
self.append_node(parsetree.Text, text)
return True
else:
return False
def match_python_block(self):
match = self.match(r"<%(!)?")
if match:
line, pos = self.matched_lineno, self.matched_charpos
text, end = self.parse_until_text(False, r'%>')
# the trailing newline helps
# compiler.parse() not complain about indentation
text = adjust_whitespace(text) + "\n"
self.append_node(
parsetree.Code,
text,
match.group(1) == '!', lineno=line, pos=pos)
return True
else:
return False
def match_expression(self):
match = self.match(r"\${")
if match:
line, pos = self.matched_lineno, self.matched_charpos
text, end = self.parse_until_text(True, r'\|', r'}')
if end == '|':
escapes, end = self.parse_until_text(True, r'}')
else:
escapes = ""
text = text.replace('\r\n', '\n')
self.append_node(
parsetree.Expression,
text, escapes.strip(),
lineno=line, pos=pos)
return True
else:
return False
def match_control_line(self):
match = self.match(
r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)"
r"(?:\r?\n|\Z)", re.M)
if match:
operator = match.group(1)
text = match.group(2)
if operator == '%':
m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
if not m2:
raise exceptions.SyntaxException(
"Invalid control line: '%s'" %
text,
**self.exception_kwargs)
isend, keyword = m2.group(1, 2)
isend = (isend is not None)
if isend:
if not len(self.control_line):
raise exceptions.SyntaxException(
"No starting keyword '%s' for '%s'" %
(keyword, text),
**self.exception_kwargs)
elif self.control_line[-1].keyword != keyword:
raise exceptions.SyntaxException(
"Keyword '%s' doesn't match keyword '%s'" %
(text, self.control_line[-1].keyword),
**self.exception_kwargs)
self.append_node(parsetree.ControlLine, keyword, isend, text)
else:
self.append_node(parsetree.Comment, text)
return True
else:
return False
def match_comment(self):
"""matches the multiline version of a comment"""
match = self.match(r"<%doc>(.*?)</%doc>", re.S)
if match:
self.append_node(parsetree.Comment, match.group(1))
return True
else:
return False