| # lexer.py |
| # Copyright (C) 2006, 2007, 2008 Michael Bayer mike_mp@zzzcomputing.com |
| # |
| # This module is part of Mako and is released under |
| # the MIT License: http://www.opensource.org/licenses/mit-license.php |
| |
| """provides the Lexer class for parsing template strings into parse trees.""" |
| |
| import re, codecs |
| from mako import parsetree, exceptions |
| from mako.pygen import adjust_whitespace |
| |
# Module-wide cache of compiled patterns, keyed by (pattern string, flags),
# so that each expression handed to Lexer.match() is compiled only once.
_regexp_cache = {}


class Lexer(object):
    """Parse a template string into a tree of parsetree nodes.

    The lexer keeps a cursor (match_position) into the template text and
    repeatedly tries a fixed series of match_* methods at that cursor; each
    successful matcher advances the cursor and appends nodes to the tree
    rooted at self.template.
    """

    def __init__(self, text, filename=None, disable_unicode=False, input_encoding=None, preprocessor=None):
        """Set up lexer state for one template.

        text -- the template source.
        filename -- passed to the root TemplateNode, to every node created
            and to raised exceptions.
        disable_unicode -- when true, parse() does not decode the text and
            escape_code() returns code fragments untouched.
        input_encoding -- initial encoding; parse() replaces it when a
            magic encoding comment or a utf-8 BOM is found.
        preprocessor -- None, a single callable, or an iterable of
            callables; each is applied to the text, in order, by parse().
        """
        self.text = text
        self.filename = filename
        self.template = parsetree.TemplateNode(self.filename)
        # line / column at which the most recent successful match started
        self.matched_lineno = 1
        self.matched_charpos = 0
        # line number at the current cursor position
        self.lineno = 1
        # offset into self.text where the next match is attempted
        self.match_position = 0
        # stack of Tag nodes that have been opened but not yet closed
        self.tag = []
        # stack of primary control lines that have not yet been ended
        self.control_line = []
        self.disable_unicode = disable_unicode
        self.encoding = input_encoding
        if preprocessor is None:
            self.preprocessor = []
        elif not hasattr(preprocessor, '__iter__'):
            # a lone preprocessor: wrap it so parse() can always iterate
            self.preprocessor = [preprocessor]
        else:
            self.preprocessor = preprocessor

    # keyword arguments describing the current match location, in the form
    # the exception constructors are called with throughout this class
    exception_kwargs = property(
        lambda self: {
            'source': self.text,
            'lineno': self.matched_lineno,
            'pos': self.matched_charpos,
            'filename': self.filename,
        }
    )

    def match(self, regexp, flags=None):
        """match the given regular expression string and flags to the current text position.

        if a match occurs, update the current text and line position.

        Returns the match object, or None when the expression does not
        match at the cursor (in which case no state is changed).
        """
        mp = self.match_position
        try:
            reg = _regexp_cache[(regexp, flags)]
        except KeyError:
            if flags:
                reg = re.compile(regexp, flags)
            else:
                reg = re.compile(regexp)
            _regexp_cache[(regexp, flags)] = reg

        match = reg.match(self.text, mp)
        if match:
            (start, end) = match.span()
            if end == start:
                # step over a zero-width match so the caller's loop cannot
                # see the same empty match again at the same position
                self.match_position = end + 1
            else:
                self.match_position = end
            self.matched_lineno = self.lineno
            if mp > len(self.text):
                # the cursor was already stepped past the end of the text
                self.matched_charpos = 1
            else:
                # distance from the previous newline (rfind gives -1 when
                # there is none, which yields a 1-based column)
                self.matched_charpos = mp - self.text.rfind('\n', 0, mp)
            self.lineno += self.text.count('\n', mp, self.match_position)
        return match

    def parse_until_text(self, *text):
        """Consume text up to the first of the given terminators.

        Each argument is a regular expression fragment.  Comments running
        from '#' to the end of the line and quoted strings are consumed
        whole, so a terminator inside them is ignored.

        Returns a (consumed_text, terminator) tuple; the cursor is left
        just after the terminator.  Raises SyntaxException when a quote is
        never closed or no terminator can be found.
        """
        startpos = self.match_position
        terminators = r'|'.join(text)
        while True:
            # a comment up to and including its newline
            match = self.match(r'#.*\n')
            if match:
                continue
            # an opening quote: skip to the matching closing quote
            match = self.match(r'(\"\"\"|\'\'\'|\"|\')')
            if match:
                m = self.match(r'.*?%s' % match.group(1), re.S)
                if not m:
                    raise exceptions.SyntaxException("Unmatched '%s'" % match.group(1), **self.exception_kwargs)
            else:
                match = self.match(r'(%s)' % terminators)
                if match:
                    return (self.text[startpos:self.match_position - len(match.group(1))], match.group(1))
                else:
                    # plain code: advance to the next quote, '#' or terminator
                    match = self.match(r".*?(?=\"|\'|#|%s)" % terminators, re.S)
                    if not match:
                        raise exceptions.SyntaxException("Expected: %s" % ','.join(text), **self.exception_kwargs)

    def append_node(self, nodecls, *args, **kwargs):
        """Instantiate nodecls and attach the node to the parse tree.

        source, lineno and pos default to the current match location
        unless supplied by the caller; filename is always set from the
        lexer.  The node is appended to the innermost open tag, or to the
        template root when no tag is open.  Tag nodes are pushed onto the
        tag stack; ControlLine nodes push or pop the control-line stack.
        Raises SyntaxException for a non-primary control keyword that the
        innermost open control line does not accept.
        """
        kwargs.setdefault('source', self.text)
        kwargs.setdefault('lineno', self.matched_lineno)
        kwargs.setdefault('pos', self.matched_charpos)
        kwargs['filename'] = self.filename
        node = nodecls(*args, **kwargs)
        if self.tag:
            self.tag[-1].nodes.append(node)
        else:
            self.template.nodes.append(node)
        if isinstance(node, parsetree.Tag):
            if self.tag:
                node.parent = self.tag[-1]
            self.tag.append(node)
        elif isinstance(node, parsetree.ControlLine):
            if node.isend:
                self.control_line.pop()
            elif node.is_primary:
                self.control_line.append(node)
            elif self.control_line and not self.control_line[-1].is_ternary(node.keyword):
                raise exceptions.SyntaxException("Keyword '%s' not a legal ternary for keyword '%s'" % (node.keyword, self.control_line[-1].keyword), **self.exception_kwargs)

    def escape_code(self, text):
        """Return text ASCII-encoded with backslash escapes.

        The encoding is applied only when unicode handling is enabled and
        an encoding is known; otherwise text is returned unchanged.
        """
        if not self.disable_unicode and self.encoding:
            return text.encode('ascii', 'backslashreplace')
        else:
            return text

    def parse(self):
        """Run preprocessors, decode the text and lex it into a node tree.

        Returns the root TemplateNode.  Raises CompileException when a
        utf-8 BOM conflicts with the magic encoding comment or when the
        text cannot be decoded, and SyntaxException when a tag or control
        keyword is still open at the end of the text.
        """
        for preproc in self.preprocessor:
            self.text = preproc(self.text)
        if not isinstance(self.text, unicode) and self.text.startswith(codecs.BOM_UTF8):
            self.text = self.text[len(codecs.BOM_UTF8):]
            parsed_encoding = 'utf-8'
            me = self.match_encoding()
            if me is not None and me != 'utf-8':
                raise exceptions.CompileException("Found utf-8 BOM in file, with conflicting magic encoding comment of '%s'" % me, self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)
        else:
            parsed_encoding = self.match_encoding()
        if parsed_encoding:
            self.encoding = parsed_encoding
        if not self.disable_unicode and not isinstance(self.text, unicode):
            if self.encoding:
                try:
                    self.text = self.text.decode(self.encoding)
                except UnicodeDecodeError:
                    raise exceptions.CompileException("Unicode decode operation of encoding '%s' failed" % self.encoding, self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)
            else:
                try:
                    self.text = self.text.decode()
                except UnicodeDecodeError:
                    raise exceptions.CompileException("Could not read template using encoding of 'ascii'. Did you forget a magic encoding comment?", self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)

        self.textlength = len(self.text)

        while True:
            if self.match_position > self.textlength:
                break

            if self.match_end():
                break
            # the order of these attempts is significant: the first
            # matcher that succeeds consumes the text at the cursor
            if self.match_expression():
                continue
            if self.match_control_line():
                continue
            if self.match_comment():
                continue
            if self.match_tag_start():
                continue
            if self.match_tag_end():
                continue
            if self.match_python_block():
                continue
            if self.match_text():
                continue

            if self.match_position > self.textlength:
                break
            # no matcher accepted the input: report it with the same
            # location arguments the other CompileException raises supply
            raise exceptions.CompileException("assertion failed", self.text, self.matched_lineno, self.matched_charpos, self.filename)

        if self.tag:
            raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % self.tag[-1].keyword, **self.exception_kwargs)
        if self.control_line:
            raise exceptions.SyntaxException("Unterminated control keyword: '%s'" % self.control_line[-1].keyword, self.text, self.control_line[-1].lineno, self.control_line[-1].pos, self.filename)
        return self.template

    def match_encoding(self):
        """Match a magic encoding comment at the cursor.

        Returns the encoding name, or None when the cursor is not at such
        a comment.
        """
        match = self.match(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')
        if match:
            return match.group(1)
        else:
            return None

    def match_tag_start(self):
        """Match an opening <%keyword ...> tag and append a Tag node.

        Attribute values are newline-normalized and passed through
        escape_code().  A self-closing tag is popped from the tag stack
        immediately.  For <%text>, the body up to </%text> is appended
        verbatim as a Text node and the closing tag is consumed as well.
        Returns True when a tag was matched, False otherwise.
        """
        match = self.match(r'''
            \<%                 # opening tag

            ([\w\.\:]+)         # keyword

            ((?:\s+\w+|=|".*?"|'.*?')*)  # attrname, = sign, string expression

            \s*                 # more whitespace

            (/)?>               # closing

            ''',

            re.I | re.S | re.X)

        if match:
            (keyword, attr, isend) = (match.group(1).lower(), match.group(2), match.group(3))
            self.keyword = keyword
            attributes = {}
            if attr:
                for att in re.findall(r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
                    # val1 is the single-quoted form, val2 the double-quoted
                    (key, val1, val2) = att
                    text = val1 or val2
                    text = text.replace('\r\n', '\n')
                    attributes[key] = self.escape_code(text)
            self.append_node(parsetree.Tag, keyword, attributes)
            if isend:
                self.tag.pop()
            else:
                if keyword == 'text':
                    match = self.match(r'(.*?)(?=\</%text>)', re.S)
                    if not match:
                        raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % self.tag[-1].keyword, **self.exception_kwargs)
                    self.append_node(parsetree.Text, match.group(1))
                    return self.match_tag_end()
            return True
        else:
            return False

    def match_tag_end(self):
        """Match a closing </%keyword> tag and pop the tag stack.

        Raises SyntaxException when no tag is open or when the keyword
        differs from that of the innermost open tag.  Returns True when a
        closing tag was matched, False otherwise.
        """
        match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
        if match:
            if not self.tag:
                raise exceptions.SyntaxException("Closing tag without opening tag: </%%%s>" % match.group(1), **self.exception_kwargs)
            elif self.tag[-1].keyword != match.group(1):
                raise exceptions.SyntaxException("Closing tag </%%%s> does not match tag: <%%%s>" % (match.group(1), self.tag[-1].keyword), **self.exception_kwargs)
            self.tag.pop()
            return True
        else:
            return False

    def match_end(self):
        """Return True when the cursor is at the end of the text, else False."""
        # the end-of-string anchor is zero-width, so the matched string is
        # always empty and only the fact of the match is of interest
        if self.match(r'\Z', re.S):
            return True
        else:
            return False

    def match_text(self):
        """Match plain template text up to the next construct.

        Appends a Text node holding everything before the next control or
        comment line, expression, multiline comment, tag, escaped newline
        or end of string.  Returns True when matched, False otherwise.
        """
        match = self.match(r"""
                (.*?)         # anything, followed by:
                (
                 (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based comment preceded by a consumed \n and whitespace
                 |
                 (?=\${)      # an expression
                 |
                 (?=\#\*)     # multiline comment
                 |
                 (?=</?[%&])  # a substitution or block or call start or end
                              # - don't consume
                 |
                 (\\\r?\n)    # an escaped newline - throw away
                 |
                 \Z           # end of string
                )""", re.X | re.S)

        if match:
            text = match.group(1)
            self.append_node(parsetree.Text, text)
            return True
        else:
            return False

    def match_python_block(self):
        """Match a <% ... %> or <%! ... %> block and append a Code node.

        The third argument given to the Code node is True for the <%! form.
        Returns True when a block was matched, False otherwise.
        """
        match = self.match(r"<%(!)?")
        if match:
            # remember where the block opened; parse_until_text() moves
            # the matched position forward
            (line, pos) = (self.matched_lineno, self.matched_charpos)
            (text, end) = self.parse_until_text(r'%>')
            text = adjust_whitespace(text) + "\n"   # the trailing newline helps compiler.parse() not complain about indentation
            self.append_node(parsetree.Code, self.escape_code(text), match.group(1) == '!', lineno=line, pos=pos)
            return True
        else:
            return False

    def match_expression(self):
        """Match a ${expression | escapes} and append an Expression node.

        The text after a '|' is passed, stripped, as the escapes argument;
        it is the empty string when there is no '|'.  Returns True when an
        expression was matched, False otherwise.
        """
        match = self.match(r"\${")
        if match:
            (line, pos) = (self.matched_lineno, self.matched_charpos)
            (text, end) = self.parse_until_text(r'\|', r'}')
            if end == '|':
                (escapes, end) = self.parse_until_text(r'}')
            else:
                escapes = ""
            text = text.replace('\r\n', '\n')
            self.append_node(parsetree.Expression, self.escape_code(text), escapes.strip(), lineno=line, pos=pos)
            return True
        else:
            return False

    def match_control_line(self):
        """Match a '%' control line or a '##' comment line.

        A '%' line is split into an optional 'end' prefix and a keyword
        and appended as a ControlLine node; an end line must correspond to
        the innermost open control line or SyntaxException is raised.  A
        '##' line is appended as a Comment node.  Returns True when a line
        was matched, False otherwise.
        """
        # NOTE(review): the continuation alternative below is written
        # \\r?\n (a backslash, an optional literal 'r', a newline), while
        # match_text() uses \\\r?\n for an escaped newline - confirm intent.
        match = self.match(r"(?<=^)[\t ]*(%|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)(?:\r?\n|\Z)", re.M)
        if match:
            operator = match.group(1)
            text = match.group(2)
            if operator == '%':
                m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
                if not m2:
                    raise exceptions.SyntaxException("Invalid control line: '%s'" % text, **self.exception_kwargs)
                (isend, keyword) = m2.group(1, 2)
                isend = (isend is not None)

                if isend:
                    if not self.control_line:
                        raise exceptions.SyntaxException("No starting keyword '%s' for '%s'" % (keyword, text), **self.exception_kwargs)
                    elif self.control_line[-1].keyword != keyword:
                        raise exceptions.SyntaxException("Keyword '%s' doesn't match keyword '%s'" % (text, self.control_line[-1].keyword), **self.exception_kwargs)
                self.append_node(parsetree.ControlLine, keyword, isend, self.escape_code(text))
            else:
                self.append_node(parsetree.Comment, text)
            return True
        else:
            return False

    def match_comment(self):
        """matches the multiline version of a comment"""
        match = self.match(r"<%doc>(.*?)</%doc>", re.S)
        if match:
            self.append_node(parsetree.Comment, match.group(1))
            return True
        else:
            return False
| |