blob: f10cfc442ca4d0d3cbef78b2cc2ad6f7b748866f [file] [log] [blame]
// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "tools/fidl/fidlc/src/lexer.h"
#include <assert.h>
#include <zircon/assert.h>
#include <map>
#include "tools/fidl/fidlc/src/diagnostics.h"
namespace fidlc {
namespace {
// Returns true if |c| may appear inside an identifier: an ASCII letter, an
// ASCII digit, or an underscore.
bool IsIdentifierBody(char c) {
  const bool is_lowercase = c >= 'a' && c <= 'z';
  const bool is_uppercase = c >= 'A' && c <= 'Z';
  const bool is_digit = c >= '0' && c <= '9';
  return is_lowercase || is_uppercase || is_digit || c == '_';
}
// Returns true if |c| may appear inside a numeric literal. This is a permissive
// character class (decimal digits, hex digits, 'x'/'X', '-', '_' and '.'); it
// does not check that the characters form a well-formed number.
bool IsNumericLiteralBody(char c) {
  if (c >= '0' && c <= '9') {
    return true;
  }
  if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
    return true;
  }
  return c == 'x' || c == 'X' || c == '-' || c == '_' || c == '.';
}
} // namespace
// Returns the character under the cursor without advancing, or 0 once the
// cursor is at or beyond end_of_file_.
constexpr char Lexer::Peek() const {
  if (current_ >= end_of_file_) {
    return 0;
  }
  return static_cast<char>(*current_);
}
// Steps over one character without making it part of the pending token: the
// token start moves forward together with the cursor.
void Lexer::Skip() {
  ++token_start_;
  ++current_;
}
// Adds the character under the cursor to the pending token and returns it.
// The cursor and the token size are advanced even when Peek() yields 0.
char Lexer::Consume() {
  const char consumed = Peek();
  ++token_size_;
  ++current_;
  return consumed;
}
// Closes out the pending token: captures its text and the count of newlines
// that preceded it, then starts a new, empty token at the cursor.
// Note: |kind| is currently not consulted by this function.
Lexer::ResetResult Lexer::Reset(Token::Kind kind) {
  // Snapshot the finished token before any state is cleared.
  const std::string_view finished_text(token_start_, token_size_);
  const uint16_t pending_newlines = leading_newlines_;

  leading_newlines_ = 0;
  token_size_ = 0u;
  token_start_ = current_;

  return ResetResult{
      .leading_newlines = pending_newlines,
      .data = finished_text,
  };
}
// Finishes the pending token as a non-identifier token of the given |kind|
// and resets the lexer state for the next token. Identifiers must go through
// LexIdentifier() instead, because they need a subkind lookup.
Token Lexer::Finish(Token::Kind kind) {
  ZX_ASSERT(kind != Token::Kind::kIdentifier);
  // Forward the real kind rather than a hard-coded kIdentifier (which this
  // function has just asserted it is not), so Reset() sees the truth if it
  // ever starts consulting its argument.
  ResetResult result = Reset(kind);
  return Token(SourceSpan(result.data, source_file_), result.leading_newlines, kind,
               Token::Subkind::kNone);
}
// Finishes the pending token as the end-of-file token.
Token Lexer::LexEndOfStream() {
  const Token::Kind kind = Token::Kind::kEndOfFile;
  return Finish(kind);
}
// Extends the pending token over every following numeric-literal character and
// finishes it as a numeric literal. The first character was already consumed
// by the caller.
Token Lexer::LexNumericLiteral() {
  for (char next = Peek(); IsNumericLiteralBody(next); next = Peek()) {
    Consume();
  }
  return Finish(Token::Kind::kNumericLiteral);
}
// Extends the pending token over every following identifier character and
// returns it as an identifier token. If the identifier text has an entry in
// token_subkinds, the token carries that subkind; otherwise Subkind::kNone.
Token Lexer::LexIdentifier() {
  for (;;) {
    if (!IsIdentifierBody(Peek())) {
      break;
    }
    Consume();
  }

  const ResetResult finished = Reset(Token::Kind::kIdentifier);

  Token::Subkind subkind = Token::Subkind::kNone;
  if (const auto entry = token_subkinds.find(finished.data); entry != token_subkinds.end()) {
    subkind = entry->second;
  }

  return Token(SourceSpan(finished.data, source_file_), finished.leading_newlines,
               Token::Kind::kIdentifier, subkind);
}
// Returns true if |c| is an ASCII hexadecimal digit (0-9, A-F or a-f).
static bool IsHexDigit(char c) {
  const bool is_decimal = c >= '0' && c <= '9';
  const bool is_upper_hex = c >= 'A' && c <= 'F';
  const bool is_lower_hex = c >= 'a' && c <= 'f';
  return is_decimal || is_upper_hex || is_lower_hex;
}
// Lexes the remainder of a string literal; the opening '"' has already been
// consumed. Returns a kStringLiteral token once the closing '"' is reached, or
// the result of LexEndOfStream() if a 0 character is consumed first. Malformed
// content (line breaks, control characters, bad escape sequences) is reported
// through reporter_ and lexing continues with the next character.
Token Lexer::LexStringLiteral() {
// Where we are relative to a backslash escape sequence.
enum State {
kNormal,
kEscaped, // saw "\"
kUnicode, // saw "\u"
kUnicodeBrace, // saw "\u{"
};
auto state = kNormal;
// Number of hex digits seen so far inside the current "\u{...}" escape.
auto unicode_hex_digits = 0;
// We've already consumed the opening '"'. Consume until the closing '"'.
for (;;) {
auto curr = Consume();
// Note for all spans below: Consume() has already advanced current_, so
// current_ - 1 is the position of curr.
// Check for EOF and invalid characters.
switch (curr) {
case 0:
// Consume() returned 0: either the end of the input or a NUL byte was
// reached before a closing quote.
return LexEndOfStream();
case '\n':
case '\r': {
SourceSpan span(std::string_view(current_ - 1, 1), source_file_);
reporter_->Fail(ErrUnexpectedLineBreak, span);
// Abandon any escape sequence that was in progress.
state = kNormal;
break;
}
default:
if (curr >= 0 && curr <= 0x1f) {
SourceSpan span(std::string_view(current_ - 1, 1), source_file_);
// Two hex digits plus the terminator suffice: curr is at most 0x1f here.
char buf[3];
snprintf(buf, sizeof buf, "%x", curr);
reporter_->Fail(ErrUnexpectedControlCharacter, span, std::string_view(buf));
// Abandon any escape sequence that was in progress.
state = kNormal;
}
break;
}
// Main state machine.
switch (state) {
case kNormal:
if (curr == '"')
return Finish(Token::Kind::kStringLiteral);
if (curr == '\\')
state = kEscaped;
break;
case kEscaped:
switch (curr) {
case 'u':
state = kUnicode;
break;
case '\\':
case '"':
case 'n':
case 'r':
case 't':
state = kNormal;
break;
default:
// Span covers the backslash and the character that followed it.
SourceSpan span(std::string_view(current_ - 2, 2), source_file_);
reporter_->Fail(ErrInvalidEscapeSequence, span, span.data());
state = kNormal;
}
break;
case kUnicode:
if (curr == '{') {
// Saw "\u{", now switch to lexing the hex digits.
state = kUnicodeBrace;
unicode_hex_digits = 0;
} else {
// Saw something like "\ua" which is invalid.
// Span covers just the "\u" that precedes curr.
SourceSpan span(std::string_view(current_ - 3, 2), source_file_);
reporter_->Fail(ErrUnicodeEscapeMissingBraces, span);
// curr itself may be the closing quote of the literal.
if (curr == '"') {
return Finish(Token::Kind::kStringLiteral);
}
state = kNormal;
}
break;
case kUnicodeBrace:
if (IsHexDigit(curr)) {
// Saw a hex digit like "\u{a" or "\u{a3".
++unicode_hex_digits;
} else if (curr == '"') {
// The string literal ended before the closing "}".
// Span covers "\u{" plus the hex digits, excluding the closing quote.
SourceSpan span(
std::string_view(current_ - 4 - unicode_hex_digits, unicode_hex_digits + 3),
source_file_);
reporter_->Fail(ErrUnicodeEscapeUnterminated, span);
return Finish(Token::Kind::kStringLiteral);
} else if (curr == '}') {
// Saw "\u{...}", now validate the "..." part.
if (unicode_hex_digits == 0) {
// Span covers the whole escape, i.e. "\u{}".
SourceSpan span(
std::string_view(current_ - 4 - unicode_hex_digits, unicode_hex_digits + 4),
source_file_);
reporter_->Fail(ErrUnicodeEscapeEmpty, span);
} else if (unicode_hex_digits > 6) {
// Span covers only the hex digits between the braces.
SourceSpan span(std::string_view(current_ - 1 - unicode_hex_digits, unicode_hex_digits),
source_file_);
reporter_->Fail(ErrUnicodeEscapeTooLong, span);
} else {
// Span covers only the hex digits between the braces.
SourceSpan span(std::string_view(current_ - 1 - unicode_hex_digits, unicode_hex_digits),
source_file_);
// NOTE(review): DecodeUnicodeHex is defined elsewhere; presumably it
// converts the hex digits to a numeric code point -- confirm.
auto codepoint = DecodeUnicodeHex(span.data());
if (codepoint > 0x10ffff) {
reporter_->Fail(ErrUnicodeEscapeTooLarge, span, span.data());
}
}
state = kNormal;
} else {
// Anything else inside the braces is reported as an invalid hex digit and
// ends the escape sequence.
SourceSpan span(std::string_view(current_ - 1, 1), source_file_);
reporter_->Fail(ErrInvalidHexDigit, span, curr);
state = kNormal;
}
break;
}
}
}
// Lexes a "//" comment; the first '/' has already been consumed. Exactly three
// leading slashes make a doc comment; two, or four and more, make a regular
// comment. The token extends up to, but not including, the next '\n' or the
// point where Peek() yields 0.
Token Lexer::LexCommentOrDocComment() {
  // Consume the second '/'.
  ZX_ASSERT(Peek() == '/');
  Consume();

  Token::Kind kind = Token::Kind::kComment;
  if (Peek() == '/') {
    // Third slash. Anything with more than three slashes is likely a section
    // break comment, so only exactly three counts as a doc comment.
    Consume();
    kind = (Peek() == '/') ? Token::Kind::kComment : Token::Kind::kDocComment;
  }

  // Take the rest of the line.
  for (char next = Peek(); next != 0 && next != '\n'; next = Peek()) {
    Consume();
  }
  return Finish(kind);
}
// Skips spaces, tabs, carriage returns and newlines, counting each '\n' in
// leading_newlines_. Stops at the first character that is none of these.
void Lexer::SkipWhitespace() {
  for (;;) {
    const char next = Peek();
    if (next == '\n') {
      ++leading_newlines_;
    } else if (next != '\r' && next != ' ' && next != '\t') {
      return;
    }
    Skip();
  }
}
// Produces the next token from the input.
//
// The very first call returns a kStartOfFile token. After that, each call
// skips leading whitespace and dispatches on the first character of the
// token. Characters that cannot start any token are reported through
// reporter_ as ErrInvalidCharacter and lexing continues with the following
// character, so this function always returns a token.
Token Lexer::Lex() {
  ZX_ASSERT_MSG(token_start_ <= end_of_file_, "already reached EOF");
  ZX_ASSERT_MSG(current_ <= end_of_file_ + 1, "current_ is past null terminator");
  if (start_of_file_) {
    start_of_file_ = false;
    return Finish(Token::Kind::kStartOfFile);
  }

  // Reports the pending token text as an invalid character and then drops it
  // from the pending token. Without the drop, token_start_/token_size_ would
  // still describe the rejected character: the next token's span would include
  // it, and SkipWhitespace() (which moves token_start_ in lockstep with
  // current_) would leave the two out of sync. leading_newlines_ is kept
  // so the next real token still sees the newlines that preceded it.
  const auto fail_invalid_character = [this]() {
    SourceSpan span(std::string_view(token_start_, token_size_), source_file_);
    reporter_->Fail(ErrInvalidCharacter, span, span.data());
    token_start_ = current_;
    token_size_ = 0u;
  };

  do {
    SkipWhitespace();
    switch (Consume()) {
      case 0:
        return LexEndOfStream();
      case ' ':
      case '\n':
      case '\r':
      case '\t':
        ZX_PANIC("should have been handled by SkipWhitespace");
      case '-':
        // Maybe the start of an arrow.
        if (Peek() == '>') {
          Consume();
          return Finish(Token::Kind::kArrow);
        }
        // Otherwise treat the '-' as the start of a numeric literal.
        [[fallthrough]];
      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':
        return LexNumericLiteral();
      case 'a':
      case 'A':
      case 'b':
      case 'B':
      case 'c':
      case 'C':
      case 'd':
      case 'D':
      case 'e':
      case 'E':
      case 'f':
      case 'F':
      case 'g':
      case 'G':
      case 'h':
      case 'H':
      case 'i':
      case 'I':
      case 'j':
      case 'J':
      case 'k':
      case 'K':
      case 'l':
      case 'L':
      case 'm':
      case 'M':
      case 'n':
      case 'N':
      case 'o':
      case 'O':
      case 'p':
      case 'P':
      case 'q':
      case 'Q':
      case 'r':
      case 'R':
      case 's':
      case 'S':
      case 't':
      case 'T':
      case 'u':
      case 'U':
      case 'v':
      case 'V':
      case 'w':
      case 'W':
      case 'x':
      case 'X':
      case 'y':
      case 'Y':
      case 'z':
      case 'Z':
        return LexIdentifier();
      case '"':
        return LexStringLiteral();
      case '/':
        // Maybe the start of a comment.
        switch (Peek()) {
          case '/':
            return LexCommentOrDocComment();
          default: {
            // A lone '/' is not a token.
            fail_invalid_character();
            continue;
          }
        }  // switch
      case '(':
        return Finish(Token::Kind::kLeftParen);
      case ')':
        return Finish(Token::Kind::kRightParen);
      case '[':
        return Finish(Token::Kind::kLeftSquare);
      case ']':
        return Finish(Token::Kind::kRightSquare);
      case '{':
        return Finish(Token::Kind::kLeftCurly);
      case '}':
        return Finish(Token::Kind::kRightCurly);
      case '<':
        return Finish(Token::Kind::kLeftAngle);
      case '>':
        return Finish(Token::Kind::kRightAngle);
      case '@':
        return Finish(Token::Kind::kAt);
      case '.':
        return Finish(Token::Kind::kDot);
      case ',':
        return Finish(Token::Kind::kComma);
      case ';':
        return Finish(Token::Kind::kSemicolon);
      case ':':
        return Finish(Token::Kind::kColon);
      case '?':
        return Finish(Token::Kind::kQuestion);
      case '=':
        return Finish(Token::Kind::kEqual);
      case '&':
        return Finish(Token::Kind::kAmpersand);
      case '|':
        return Finish(Token::Kind::kPipe);
      default: {
        fail_invalid_character();
        continue;
      }
    }  // switch
  } while (true);
}
} // namespace fidlc