system/host/banjo/lib/lexer.cpp - zircon - Git at Google

 // Copyright 2017 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "banjo/lexer.h"

 #include <assert.h>
 #include <ctype.h>

 namespace banjo {

 namespace {

 // Returns true if |c| may appear inside an identifier: an alphanumeric
 // character (as classified by isalnum), '_' or '*'.
 bool IsIdentifierBody(char c) {
     // The <ctype.h> classifiers require a value representable as unsigned
     // char (or EOF). Passing a negative plain char is undefined behavior, so
     // convert before the call.
     return isalnum(static_cast<unsigned char>(c)) != 0 || c == '_' || c == '*';
 }

 // IsIdentifierValid disallows identifiers (escaped, and unescaped) from
 // starting or ending with underscore. An empty view is rejected as well:
 // it has no first or last character, so indexing it would read outside the
 // view (size() - 1 wraps around for an empty view).
 bool IsIdentifierValid(StringView source_data) {
     const auto length = source_data.size();
     if (length == 0)
         return false;
     return source_data[0] != '_' && source_data[length - 1] != '_';
 }

 // Reports whether |c| can be part of a numeric literal: a decimal digit, a
 // hexadecimal digit of either case, the letters 'x'/'X', or one of '-', '_'
 // and '.'.
 bool IsNumericLiteralBody(char c) {
     if (c >= '0' && c <= '9')
         return true;
     if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))
         return true;
     return c == 'x' || c == 'X' || c == '-' || c == '_' || c == '.';
 }

 } // namespace

 // Returns the character at the current read position without advancing.
 constexpr char Lexer::Peek() const {
     return current_[0];
 }

 // Steps over one character that belongs to no token: both the read position
 // and the start of the pending token move forward by one.
 void Lexer::Skip() {
     current_ += 1;
     token_start_ += 1;
 }

 char Lexer::Consume() {
     auto current = *current_;
     ++current_;
     ++token_size_;
     return current;
 }

 // Returns the text of the pending token and starts a new, empty token at
 // the current read position. previous_end_ is moved to the end of the
 // finished token unless |kind| is kComment.
 StringView Lexer::Reset(Token::Kind kind) {
     StringView finished(token_start_, token_size_);
     const bool is_comment = (kind == Token::Kind::kComment);
     if (!is_comment)
         previous_end_ = token_start_ + token_size_;
     token_start_ = current_;
     token_size_ = 0u;
     return finished;
 }

 // Packages the pending token as a Token of the given |kind| and resets the
 // lexer state for the next token. The token's "previous" location covers the
 // text between previous_end_ and the start of this token. Identifiers must
 // not be finished through this path (enforced by the assert).
 Token Lexer::Finish(Token::Kind kind) {
     assert(kind != Token::Kind::kIdentifier);
     // Capture the preceding gap before calling Reset(), which moves
     // token_start_ and (for non-comment kinds) previous_end_.
     StringView previous(previous_end_, token_start_ - previous_end_);
     SourceLocation previous_location(previous, source_file_);
     return Token(previous_location,
                  SourceLocation(Reset(kind), source_file_), kind, Token::Subkind::kNone);
 }

 // Produces the end-of-file token from whatever is currently pending.
 Token Lexer::LexEndOfStream() {
     const auto end_kind = Token::Kind::kEndOfFile;
     return Finish(end_kind);
 }

 // Extends the pending token over every following numeric-literal character
 // and emits it as a numeric literal.
 Token Lexer::LexNumericLiteral() {
     for (;;) {
         if (!IsNumericLiteralBody(Peek()))
             break;
         Consume();
     }
     return Finish(Token::Kind::kNumericLiteral);
 }

 // Lexes an unescaped identifier: extends the pending token over all
 // identifier-body characters. A name rejected by IsIdentifierValid yields
 // kNotAToken; otherwise the token is built by the identifier table.
 Token Lexer::LexIdentifier() {
     for (char next = Peek(); IsIdentifierBody(next); next = Peek()) {
         Consume();
     }
     // Record the gap preceding the identifier before Reset() moves
     // previous_end_ and token_start_.
     StringView gap(previous_end_, token_start_ - previous_end_);
     SourceLocation gap_location(gap, source_file_);
     StringView name = Reset(Token::Kind::kNotAToken);
     if (IsIdentifierValid(name)) {
         return identifier_table_->MakeIdentifier(
             gap_location, name, source_file_, /* escaped */ false);
     }
     return Finish(Token::Kind::kNotAToken);
 }

 // Lexes an identifier introduced by '@'. The '@' itself is excluded from the
 // identifier text. A name rejected by IsIdentifierValid yields kNotAToken;
 // otherwise the identifier table builds the token, flagged as escaped.
 Token Lexer::LexEscapedIdentifier() {
     // Discard the already-consumed '@'. Passing kComment keeps previous_end_
     // where it was, so the gap computed below still includes the '@'.
     Reset(Token::Kind::kComment);

     for (char next = Peek(); IsIdentifierBody(next); next = Peek()) {
         Consume();
     }
     // Record the gap preceding the identifier before the second Reset()
     // moves previous_end_ and token_start_.
     StringView gap(previous_end_, token_start_ - previous_end_);
     SourceLocation gap_location(gap, source_file_);
     StringView name = Reset(Token::Kind::kNotAToken);
     if (IsIdentifierValid(name)) {
         return identifier_table_->MakeIdentifier(
             gap_location, name, source_file_, /* escaped */ true);
     }
     return Finish(Token::Kind::kNotAToken);
 }

 // Lexes a string literal up to the matching unescaped '"'. The opening
 // quote is already part of the pending token when this is called. A
 // backslash escapes the character that follows it, so an escaped backslash
 // directly before the closing quote no longer hides that quote, and an
 // escaped quote does not end the literal. Reaching the end of input before
 // the closing quote yields kNotAToken.
 Token Lexer::LexStringLiteral() {
     bool escaped = false;
     for (;;) {
         // Stop at the terminating NUL without consuming it, so the read
         // position never moves past the end of the input and the next Lex()
         // call reports end of file.
         if (Peek() == 0)
             return Finish(Token::Kind::kNotAToken);

         const char next = Consume();
         if (escaped) {
             // Whatever follows a backslash is taken literally.
             escaped = false;
         } else if (next == '\\') {
             escaped = true;
         } else if (next == '"') {
             return Finish(Token::Kind::kStringLiteral);
         }
     }
 }

 // Lexes a "//" comment whose first '/' has already been consumed. Exactly
 // three leading slashes mark a doc comment; two slashes, or four and more
 // (likely a section-break line), give a regular comment. The token runs up
 // to, but not including, the end of the line or of the input.
 Token Lexer::LexCommentOrDocComment() {
     // Consume the second /.
     assert(Peek() == '/');
     Consume();

     Token::Kind comment_type = Token::Kind::kComment;
     if (Peek() == '/') {
         // Third slash: a doc comment unless a fourth slash follows.
         Consume();
         if (Peek() != '/')
             comment_type = Token::Kind::kDocComment;
     }

     for (char next = Peek(); next != 0 && next != '\n'; next = Peek()) {
         Consume();
     }
     return Finish(comment_type);
 }

 // Skips over spaces, newlines, carriage returns and tabs so that the next
 // token starts at the first non-whitespace character.
 void Lexer::SkipWhitespace() {
     for (char next = Peek();
          next == ' ' || next == '\n' || next == '\r' || next == '\t';
          next = Peek()) {
         Skip();
     }
 }

 // Returns the next token that is not a regular comment. Only kComment
 // tokens are dropped; every other kind is handed back to the caller.
 Token Lexer::LexNoComments() {
     for (;;) {
         auto candidate = Lex();
         if (candidate.kind() != Token::Kind::kComment)
             return candidate;
     }
 }

 // Lexes and returns the next token, after skipping leading whitespace. The
 // first character of the token selects its kind; multi-character tokens are
 // completed by the dedicated Lex* helpers. Characters that start no known
 // token produce kNotAToken.
 Token Lexer::Lex() {
     SkipWhitespace();

     switch (Consume()) {
     case 0:
         return LexEndOfStream();

     case ' ': case '\n': case '\r': case '\t':
         assert(false && "Should have been handled by SkipWhitespace!");
         // With asserts compiled out, do not fall into the '-' handling
         // below; report the stray character as an invalid token instead.
         return Finish(Token::Kind::kNotAToken);

     case '-':
         // Maybe the start of an arrow.
         if (Peek() == '>') {
             Consume();
             return Finish(Token::Kind::kArrow);
         }
         // Otherwise a leading '-' starts a numeric literal.
         return LexNumericLiteral();

     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
         return LexNumericLiteral();

     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
     case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
     case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
     case 'v': case 'w': case 'x': case 'y': case 'z':
     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
     case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
     case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
     case 'V': case 'W': case 'X': case 'Y': case 'Z':
         return LexIdentifier();

     case '@':
         return LexEscapedIdentifier();

     case '"':
         return LexStringLiteral();

     case '/':
         // Only "//" starts a comment; a lone '/' is not a token.
         if (Peek() == '/')
             return LexCommentOrDocComment();
         return Finish(Token::Kind::kNotAToken);

     case '(':
         return Finish(Token::Kind::kLeftParen);
     case ')':
         return Finish(Token::Kind::kRightParen);
     case '[':
         return Finish(Token::Kind::kLeftSquare);
     case ']':
         return Finish(Token::Kind::kRightSquare);
     case '{':
         return Finish(Token::Kind::kLeftCurly);
     case '}':
         return Finish(Token::Kind::kRightCurly);
     case '<':
         return Finish(Token::Kind::kLeftAngle);
     case '>':
         return Finish(Token::Kind::kRightAngle);

     case '.':
         return Finish(Token::Kind::kDot);
     case ',':
         return Finish(Token::Kind::kComma);
     case ';':
         return Finish(Token::Kind::kSemicolon);
     case ':':
         return Finish(Token::Kind::kColon);
     case '?':
         return Finish(Token::Kind::kQuestion);
     case '=':
         return Finish(Token::Kind::kEqual);
     case '&':
         return Finish(Token::Kind::kAmpersand);

     default:
         return Finish(Token::Kind::kNotAToken);
     }
 }

 } // namespace banjo
	// Copyright 2017 The Fuchsia Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "banjo/lexer.h"

	#include <assert.h>
	#include <ctype.h>

	namespace banjo {

	namespace {

	// Returns true if |c| may appear inside an identifier: an alphanumeric
	// character (as classified by isalnum), '_' or '*'.
	bool IsIdentifierBody(char c) {
	    // The <ctype.h> classifiers require a value representable as unsigned
	    // char (or EOF). Passing a negative plain char is undefined behavior, so
	    // convert before the call.
	    return isalnum(static_cast<unsigned char>(c)) != 0 || c == '_' || c == '*';
	}

	// IsIdentifierValid disallows identifiers (escaped, and unescaped) from
	// starting or ending with underscore. An empty view is rejected as well:
	// it has no first or last character, so indexing it would read outside the
	// view (size() - 1 wraps around for an empty view).
	bool IsIdentifierValid(StringView source_data) {
	    const auto length = source_data.size();
	    if (length == 0)
	        return false;
	    return source_data[0] != '_' && source_data[length - 1] != '_';
	}

	// Reports whether |c| can be part of a numeric literal: a decimal digit, a
	// hexadecimal digit of either case, the letters 'x'/'X', or one of '-', '_'
	// and '.'.
	bool IsNumericLiteralBody(char c) {
	    if (c >= '0' && c <= '9')
	        return true;
	    if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))
	        return true;
	    return c == 'x' || c == 'X' || c == '-' || c == '_' || c == '.';
	}

	} // namespace

	// Returns the character at the current read position without advancing.
	constexpr char Lexer::Peek() const {
	    return current_[0];
	}

	// Steps over one character that belongs to no token: both the read position
	// and the start of the pending token move forward by one.
	void Lexer::Skip() {
	    current_ += 1;
	    token_start_ += 1;
	}

	char Lexer::Consume() {
	auto current = *current_;
	++current_;
	++token_size_;
	return current;
	}

	// Returns the text of the pending token and starts a new, empty token at
	// the current read position. previous_end_ is moved to the end of the
	// finished token unless |kind| is kComment.
	StringView Lexer::Reset(Token::Kind kind) {
	    StringView finished(token_start_, token_size_);
	    const bool is_comment = (kind == Token::Kind::kComment);
	    if (!is_comment)
	        previous_end_ = token_start_ + token_size_;
	    token_start_ = current_;
	    token_size_ = 0u;
	    return finished;
	}

	// Packages the pending token as a Token of the given |kind| and resets the
	// lexer state for the next token. The token's "previous" location covers the
	// text between previous_end_ and the start of this token. Identifiers must
	// not be finished through this path (enforced by the assert).
	Token Lexer::Finish(Token::Kind kind) {
	    assert(kind != Token::Kind::kIdentifier);
	    // Capture the preceding gap before calling Reset(), which moves
	    // token_start_ and (for non-comment kinds) previous_end_.
	    StringView previous(previous_end_, token_start_ - previous_end_);
	    SourceLocation previous_location(previous, source_file_);
	    return Token(previous_location,
	                 SourceLocation(Reset(kind), source_file_), kind, Token::Subkind::kNone);
	}

	// Produces the end-of-file token from whatever is currently pending.
	Token Lexer::LexEndOfStream() {
	    const auto end_kind = Token::Kind::kEndOfFile;
	    return Finish(end_kind);
	}

	// Extends the pending token over every following numeric-literal character
	// and emits it as a numeric literal.
	Token Lexer::LexNumericLiteral() {
	    for (;;) {
	        if (!IsNumericLiteralBody(Peek()))
	            break;
	        Consume();
	    }
	    return Finish(Token::Kind::kNumericLiteral);
	}

	// Lexes an unescaped identifier: extends the pending token over all
	// identifier-body characters. A name rejected by IsIdentifierValid yields
	// kNotAToken; otherwise the token is built by the identifier table.
	Token Lexer::LexIdentifier() {
	    for (char next = Peek(); IsIdentifierBody(next); next = Peek()) {
	        Consume();
	    }
	    // Record the gap preceding the identifier before Reset() moves
	    // previous_end_ and token_start_.
	    StringView gap(previous_end_, token_start_ - previous_end_);
	    SourceLocation gap_location(gap, source_file_);
	    StringView name = Reset(Token::Kind::kNotAToken);
	    if (IsIdentifierValid(name)) {
	        return identifier_table_->MakeIdentifier(
	            gap_location, name, source_file_, /* escaped */ false);
	    }
	    return Finish(Token::Kind::kNotAToken);
	}

	// Lexes an identifier introduced by '@'. The '@' itself is excluded from the
	// identifier text. A name rejected by IsIdentifierValid yields kNotAToken;
	// otherwise the identifier table builds the token, flagged as escaped.
	Token Lexer::LexEscapedIdentifier() {
	    // Discard the already-consumed '@'. Passing kComment keeps previous_end_
	    // where it was, so the gap computed below still includes the '@'.
	    Reset(Token::Kind::kComment);

	    for (char next = Peek(); IsIdentifierBody(next); next = Peek()) {
	        Consume();
	    }
	    // Record the gap preceding the identifier before the second Reset()
	    // moves previous_end_ and token_start_.
	    StringView gap(previous_end_, token_start_ - previous_end_);
	    SourceLocation gap_location(gap, source_file_);
	    StringView name = Reset(Token::Kind::kNotAToken);
	    if (IsIdentifierValid(name)) {
	        return identifier_table_->MakeIdentifier(
	            gap_location, name, source_file_, /* escaped */ true);
	    }
	    return Finish(Token::Kind::kNotAToken);
	}

	// Lexes a string literal up to the matching unescaped '"'. The opening
	// quote is already part of the pending token when this is called. A
	// backslash escapes the character that follows it, so an escaped backslash
	// directly before the closing quote no longer hides that quote, and an
	// escaped quote does not end the literal. Reaching the end of input before
	// the closing quote yields kNotAToken.
	Token Lexer::LexStringLiteral() {
	    bool escaped = false;
	    for (;;) {
	        // Stop at the terminating NUL without consuming it, so the read
	        // position never moves past the end of the input and the next Lex()
	        // call reports end of file.
	        if (Peek() == 0)
	            return Finish(Token::Kind::kNotAToken);

	        const char next = Consume();
	        if (escaped) {
	            // Whatever follows a backslash is taken literally.
	            escaped = false;
	        } else if (next == '\\') {
	            escaped = true;
	        } else if (next == '"') {
	            return Finish(Token::Kind::kStringLiteral);
	        }
	    }
	}

	// Lexes a "//" comment whose first '/' has already been consumed. Exactly
	// three leading slashes mark a doc comment; two slashes, or four and more
	// (likely a section-break line), give a regular comment. The token runs up
	// to, but not including, the end of the line or of the input.
	Token Lexer::LexCommentOrDocComment() {
	    // Consume the second /.
	    assert(Peek() == '/');
	    Consume();

	    Token::Kind comment_type = Token::Kind::kComment;
	    if (Peek() == '/') {
	        // Third slash: a doc comment unless a fourth slash follows.
	        Consume();
	        if (Peek() != '/')
	            comment_type = Token::Kind::kDocComment;
	    }

	    for (char next = Peek(); next != 0 && next != '\n'; next = Peek()) {
	        Consume();
	    }
	    return Finish(comment_type);
	}

	// Skips over spaces, newlines, carriage returns and tabs so that the next
	// token starts at the first non-whitespace character.
	void Lexer::SkipWhitespace() {
	    for (char next = Peek();
	         next == ' ' || next == '\n' || next == '\r' || next == '\t';
	         next = Peek()) {
	        Skip();
	    }
	}

	// Returns the next token that is not a regular comment. Only kComment
	// tokens are dropped; every other kind is handed back to the caller.
	Token Lexer::LexNoComments() {
	    for (;;) {
	        auto candidate = Lex();
	        if (candidate.kind() != Token::Kind::kComment)
	            return candidate;
	    }
	}

	// Lexes and returns the next token, after skipping leading whitespace. The
	// first character of the token selects its kind; multi-character tokens are
	// completed by the dedicated Lex* helpers. Characters that start no known
	// token produce kNotAToken.
	Token Lexer::Lex() {
	    SkipWhitespace();

	    switch (Consume()) {
	    case 0:
	        return LexEndOfStream();

	    case ' ': case '\n': case '\r': case '\t':
	        assert(false && "Should have been handled by SkipWhitespace!");
	        // With asserts compiled out, do not fall into the '-' handling
	        // below; report the stray character as an invalid token instead.
	        return Finish(Token::Kind::kNotAToken);

	    case '-':
	        // Maybe the start of an arrow.
	        if (Peek() == '>') {
	            Consume();
	            return Finish(Token::Kind::kArrow);
	        }
	        // Otherwise a leading '-' starts a numeric literal.
	        return LexNumericLiteral();

	    case '0': case '1': case '2': case '3': case '4':
	    case '5': case '6': case '7': case '8': case '9':
	        return LexNumericLiteral();

	    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
	    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
	    case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
	    case 'v': case 'w': case 'x': case 'y': case 'z':
	    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
	    case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
	    case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
	    case 'V': case 'W': case 'X': case 'Y': case 'Z':
	        return LexIdentifier();

	    case '@':
	        return LexEscapedIdentifier();

	    case '"':
	        return LexStringLiteral();

	    case '/':
	        // Only "//" starts a comment; a lone '/' is not a token.
	        if (Peek() == '/')
	            return LexCommentOrDocComment();
	        return Finish(Token::Kind::kNotAToken);

	    case '(':
	        return Finish(Token::Kind::kLeftParen);
	    case ')':
	        return Finish(Token::Kind::kRightParen);
	    case '[':
	        return Finish(Token::Kind::kLeftSquare);
	    case ']':
	        return Finish(Token::Kind::kRightSquare);
	    case '{':
	        return Finish(Token::Kind::kLeftCurly);
	    case '}':
	        return Finish(Token::Kind::kRightCurly);
	    case '<':
	        return Finish(Token::Kind::kLeftAngle);
	    case '>':
	        return Finish(Token::Kind::kRightAngle);

	    case '.':
	        return Finish(Token::Kind::kDot);
	    case ',':
	        return Finish(Token::Kind::kComma);
	    case ';':
	        return Finish(Token::Kind::kSemicolon);
	    case ':':
	        return Finish(Token::Kind::kColon);
	    case '?':
	        return Finish(Token::Kind::kQuestion);
	    case '=':
	        return Finish(Token::Kind::kEqual);
	    case '&':
	        return Finish(Token::Kind::kAmpersand);

	    default:
	        return Finish(Token::Kind::kNotAToken);
	    }
	}

	} // namespace banjo