blob: bd5250974b95b98450b1b7e9e53c67a6414030b8 [file] [log] [blame]
//===-- Lexer.cpp ---------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
#include "llbuild/Ninja/Lexer.h"
#include "llbuild/Basic/LLVM.h"
#include <cstring>
#include <string>
#include <iostream>
#include <iomanip>
using namespace llbuild;
using namespace llbuild::ninja;
///
/// Returns the name of this token's kind enumerator as a C string.
///
/// If the stored kind matches none of the enumerators handled below, the
/// placeholder "<invalid token kind>" is returned instead.
const char* Token::getKindName() const {
  switch (tokenKind) {
  case Kind::Colon:       return "Colon";
  case Kind::Comment:     return "Comment";
  case Kind::EndOfFile:   return "EndOfFile";
  case Kind::Equals:      return "Equals";
  case Kind::Identifier:  return "Identifier";
  case Kind::Indentation: return "Indentation";
  case Kind::KWBuild:     return "KWBuild";
  case Kind::KWDefault:   return "KWDefault";
  case Kind::KWInclude:   return "KWInclude";
  case Kind::KWPool:      return "KWPool";
  case Kind::KWRule:      return "KWRule";
  case Kind::KWSubninja:  return "KWSubninja";
  case Kind::Newline:     return "Newline";
  case Kind::Pipe:        return "Pipe";
  case Kind::PipePipe:    return "PipePipe";
  case Kind::String:      return "String";
  case Kind::Unknown:     return "Unknown";
  }

  // Only reached when tokenKind holds a value not covered by the switch.
  return "<invalid token kind>";
}
#ifndef NDEBUG
/// Writes a one-line description of this token to stderr, in the form
///   (Token "<kind name>" <start pointer> <length> <line> <column>)
void Token::dump() {
  // Print the start pointer as an address rather than as a C string.
  const void* const address = static_cast<const void*>(start);

  std::cerr << "(Token \"" << getKindName() << "\" ";
  std::cerr << address << " " << length << " ";
  std::cerr << line << " " << column << ")\n";
}
#endif
///
/// Creates a lexer over \p buffer, positioned at its first byte.
///
/// Line numbering starts at 1 and column numbering at 0; the lexer begins in
/// LexingMode::None.
Lexer::Lexer(StringRef buffer)
    : buffer(buffer),
      bufferPos(buffer.data()),
      lineNumber(1),
      columnNumber(0),
      mode(LexingMode::None) {}
/// Nothing to release explicitly; member destructors do all the work.
Lexer::~Lexer() = default;
/// Returns the next unconsumed character without advancing the lexer, or -1
/// when the lexer is positioned at the end of the buffer.
///
/// Bytes are reported in the range 0..255, and a '\r' is reported as '\n' (see
/// the comments in the body).
int Lexer::peekNextChar() {
  if (bufferPos == buffer.end())
    return -1;

  // Widen through unsigned char so that bytes >= 0x80 come back as 128..255.
  // With a plain (possibly signed) char, 0xFF would sign-extend to -1 and be
  // mistaken for the end-of-buffer value, and every other high byte would be a
  // negative int, which is not a valid argument for isspace().
  const int next = static_cast<unsigned char>(*bufferPos);

  // getNextChar() treats '\r' (alone or paired with '\n') as a newline and
  // returns '\n' for it. Report it the same way here so that callers comparing
  // the peeked value against '\n' recognize DOS/Mac line endings instead of
  // consuming them as ordinary whitespace or string content.
  return next == '\r' ? '\n' : next;
}
/// Consumes and returns the next character, or returns -1 (consuming nothing)
/// when the lexer is at the end of the buffer.
///
/// Line endings are normalized: "\n", "\r", "\r\n" and "\n\r" are each
/// consumed as a unit and returned as a single '\n'. The line and column
/// counters are updated to reflect the consumed character.
int Lexer::getNextChar() {
  if (bufferPos == buffer.end())
    return -1;

  // Read the byte as unsigned char so values >= 0x80 are returned as 128..255.
  // Through a plain (possibly signed) char, byte 0xFF would be returned as -1,
  // which is the value reserved for end-of-buffer.
  int result = static_cast<unsigned char>(*bufferPos++);

  // Handle DOS/Mac newlines here, by stripping duplicates and by returning '\n'
  // for both.
  if (result == '\n' || result == '\r') {
    // '\n' + '\r' - result is "the other" newline character of the pair.
    if (bufferPos != buffer.end() && *bufferPos == ('\n' + '\r' - result))
      ++bufferPos;
    result = '\n';
  }

  if (result == '\n') {
    ++lineNumber;
    columnNumber = 0;
  } else {
    ++columnNumber;
  }

  return result;
}
/// Finalizes \p result: stores \p kind as its token kind and sets its length
/// to the distance from the token's start to the current buffer position.
///
/// \returns \p result, to allow `return setTokenKind(...)` in the lexing
/// routines.
Token& Lexer::setTokenKind(Token& result, Token::Kind kind) const {
  const auto consumed = bufferPos - result.start;
  result.length = consumed;
  result.tokenKind = kind;
  return result;
}
/// Consumes characters up to, but not including, the next newline (or the end
/// of the buffer). The newline itself is left in place so that it is lexed as
/// a Newline token afterwards.
void Lexer::skipToEndOfLine() {
  int upcoming = peekNextChar();
  while (upcoming != -1 && upcoming != '\n') {
    getNextChar();
    upcoming = peekNextChar();
  }
}
/// Classifies the identifier spanning [result.start, bufferPos).
///
/// If the identifier's text is exactly one of the keywords "rule", "pool",
/// "build", "default", "include" or "subninja", the matching keyword kind is
/// assigned; otherwise the token becomes a plain Identifier.
///
/// \returns \p result with its kind and length set.
Token& Lexer::setIdentifierTokenKind(Token& result) const {
  const unsigned length = bufferPos - result.start;

  // Compares the entire token text against \p keyword. It is only called with
  // keywords that are exactly `length` characters long (guaranteed by the
  // enclosing switch case), so every byte of the token takes part in the
  // comparison; comparing fewer bytes would, for example, accept any
  // 8-character identifier starting with "subninj" as the subninja keyword.
  const auto spells = [&](const char* keyword) {
    return memcmp(keyword, result.start, length) == 0;
  };

  // Dispatch on the length first so at most two comparisons are performed.
  switch (length) {
  case 4:
    if (spells("rule"))
      return setTokenKind(result, Token::Kind::KWRule);
    if (spells("pool"))
      return setTokenKind(result, Token::Kind::KWPool);
    break;

  case 5:
    if (spells("build"))
      return setTokenKind(result, Token::Kind::KWBuild);
    break;

  case 7:
    if (spells("default"))
      return setTokenKind(result, Token::Kind::KWDefault);
    if (spells("include"))
      return setTokenKind(result, Token::Kind::KWInclude);
    break;

  case 8:
    if (spells("subninja"))
      return setTokenKind(result, Token::Kind::KWSubninja);
    break;
  }

  return setTokenKind(result, Token::Kind::Identifier);
}
/// Lexes the remainder of an identifier whose first character has already
/// been consumed, then classifies it.
///
/// In LexingMode::IdentifierSpecific the result is always an Identifier;
/// in every other mode keywords are recognized and given their own kinds.
Token& Lexer::lexIdentifier(Token& result) {
  // Extend the token for as long as the upcoming character belongs to it.
  for (int next = peekNextChar(); Lexer::isIdentifierChar(next);
       next = peekNextChar()) {
    getNextChar();
  }

  const bool keywordsEnabled = mode != Lexer::LexingMode::IdentifierSpecific;
  if (keywordsEnabled)
    return setIdentifierTokenKind(result);

  return setTokenKind(result, Token::Kind::Identifier);
}
/// Returns true if \p c is a whitespace character other than '\n'.
///
/// \p c may be any int, including the -1 value the lexer uses for
/// end-of-buffer; values that are not a valid character are never whitespace.
static bool isNonNewlineSpace(int c) {
  // isspace() is only defined for EOF and for values representable as
  // unsigned char. Reject everything else up front (this also covers -1)
  // rather than passing it through.
  if (c != static_cast<unsigned char>(c))
    return false;

  return c != '\n' && isspace(c);
}
/// Lexes a String token in a path context.
///
/// The token extends until whitespace, ':', '|' or the end of the buffer. A
/// '$' escapes the character that follows it, so an escaped terminator does
/// not end the token; after an escaped newline, the leading whitespace of the
/// following line is consumed as well.
Token &Lexer::lexPathString(Token &result) {
  for (;;) {
    const int next = peekNextChar();

    if (next != '$') {
      // Unescaped character: stop at a terminator, otherwise take it.
      const bool terminates =
          isspace(next) || next == ':' || next == '|' || next == -1;
      if (terminates)
        break;
      getNextChar();
      continue;
    }

    // Escape sequence: consume the '$' and then the character it escapes.
    getNextChar();
    const int escaped = getNextChar();
    if (escaped != '\n')
      continue;

    // An escaped newline also swallows the next line's leading whitespace.
    while (isNonNewlineSpace(peekNextChar()))
      getNextChar();
  }

  return setTokenKind(result, Token::Kind::String);
}
/// Lexes a String token in a variable-assignment context.
///
/// The token runs to the end of the line (the newline is not consumed) or to
/// the end of the buffer. A '$' escapes the character that follows it, so an
/// escaped newline does not end the token.
Token& Lexer::lexVariableString(Token& result) {
  for (;;) {
    const int next = peekNextChar();
    if (next == '\n' || next == -1)
      break;

    // For an escape, consume the '$' here; the statement below then consumes
    // the escaped character. Ordinary characters are consumed once.
    if (next == '$')
      getNextChar();
    getNextChar();
  }

  return setTokenKind(result, Token::Kind::String);
}
/// Lexes the next token from the buffer into \p result and returns it.
///
/// The steps, in order:
///  - Whitespace at column 0 is returned as an Indentation token.
///  - Otherwise, whitespace and "$<newline>" continuations are skipped.
///  - A newline yields Newline and the end of the buffer yields EndOfFile,
///    regardless of the lexing mode.
///  - In LexingMode::VariableString the rest is lexed by lexVariableString().
///  - In LexingMode::PathString the rest is lexed by lexPathString(), unless
///    the next character is ':' or '|', which stay regular tokens.
///  - Everything else is lexed from the regular token set; an unrecognized
///    character yields an Unknown token.
Token& Lexer::lex(Token& result) {
  int c = peekNextChar();

  // If we are at the start of a line, then any leading whitespace should be
  // parsed as an indentation token.
  //
  // We do not need to handle "$\n" sequences here because they will be
  // consumed next, and the exact length of the indentation token is never
  // used.
  if (isNonNewlineSpace(c) && columnNumber == 0) {
    result.start = bufferPos;
    result.line = lineNumber;
    result.column = columnNumber;
    do {
      getNextChar();
    } while (isNonNewlineSpace(peekNextChar()));
    return setTokenKind(result, Token::Kind::Indentation);
  }

  // Otherwise, consume any leading whitespace or "$<newline>" escape sequences
  // (except at the start of lines, which Ninja does not recognize).
  while (true) {
    if (c == '$' && columnNumber != 0) {
      // This is a line continuation only if the '$' is directly followed by a
      // line ending. getNextChar() treats '\r' and "\r\n" as newlines too, so
      // the raw lookahead has to accept '\r' as well as '\n'; otherwise a
      // continuation in a DOS/Mac-style file would not be recognized.
      const bool escapesNewline =
          bufferPos + 1 != buffer.end() &&
          (bufferPos[1] == '\n' || bufferPos[1] == '\r');
      if (!escapesNewline) {
        // Some other '$' sequence: break out and lex normally.
        break;
      }
      getNextChar(); // The '$'.
      getNextChar(); // The (normalized) line ending.
    } else if (isNonNewlineSpace(c)) {
      getNextChar();
    } else {
      break;
    }

    c = peekNextChar();
  }

  // Initialize the token position.
  result.start = bufferPos;
  result.line = lineNumber;
  result.column = columnNumber;

  // Check if we are at a string mode independent token.
  if (c == '\n') {
    getNextChar();
    return setTokenKind(result, Token::Kind::Newline);
  }
  if (c == -1)
    return setTokenKind(result, Token::Kind::EndOfFile);

  // If we are in string lexing mode, delegate immediately if appropriate.
  if (mode == LexingMode::VariableString)
    return lexVariableString(result);
  if (mode == LexingMode::PathString) {
    // Only delegate for characters not special to path lexing.
    if (c != ':' && c != '|')
      return lexPathString(result);
  }

  // Otherwise, consume the character and lex from the regular token set.
  getNextChar();
  switch (c) {
  case ':': return setTokenKind(result, Token::Kind::Colon);
  case '=': return setTokenKind(result, Token::Kind::Equals);

  case '#': {
    // A comment runs to the end of the line; the newline is not part of it.
    skipToEndOfLine();
    return setTokenKind(result, Token::Kind::Comment);
  }

  case '|': {
    // "||" is a single token; a lone '|' is a Pipe.
    if (peekNextChar() == '|') {
      (void) getNextChar();
      return setTokenKind(result, Token::Kind::PipePipe);
    }
    return setTokenKind(result, Token::Kind::Pipe);
  }

  default:
    if (Lexer::isIdentifierChar(c))
      return lexIdentifier(result);
    return setTokenKind(result, Token::Kind::Unknown);
  }
}