lib/Ninja/Lexer.cpp - third_party/swift-llbuild - Git at Google

 //===-- Lexer.cpp ---------------------------------------------------------===//
 //
 // This source file is part of the Swift.org open source project
 //
 // Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
 // Licensed under Apache License v2.0 with Runtime Library Exception
 //
 // See http://swift.org/LICENSE.txt for license information
 // See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
 //
 //===----------------------------------------------------------------------===//

 #include "llbuild/Ninja/Lexer.h"

 #include "llbuild/Basic/LLVM.h"

 #include <cstring>
 #include <string>
 #include <iostream>
 #include <iomanip>

 using namespace llbuild;
 using namespace llbuild::ninja;

 ///

 /// Return the enumerator spelling of this token's kind as a C string.
 const char* Token::getKindName() const {
   switch (tokenKind) {
   case Kind::Colon:       return "Colon";
   case Kind::Comment:     return "Comment";
   case Kind::EndOfFile:   return "EndOfFile";
   case Kind::Equals:      return "Equals";
   case Kind::Identifier:  return "Identifier";
   case Kind::Indentation: return "Indentation";
   case Kind::KWBuild:     return "KWBuild";
   case Kind::KWDefault:   return "KWDefault";
   case Kind::KWInclude:   return "KWInclude";
   case Kind::KWPool:      return "KWPool";
   case Kind::KWRule:      return "KWRule";
   case Kind::KWSubninja:  return "KWSubninja";
   case Kind::Newline:     return "Newline";
   case Kind::Pipe:        return "Pipe";
   case Kind::PipePipe:    return "PipePipe";
   case Kind::String:      return "String";
   case Kind::Unknown:     return "Unknown";
   }

   // Reached only when tokenKind matches none of the cases above.
   return "<invalid token kind>";
 }

 #ifndef NDEBUG
 /// Write a one-line description of this token to stderr, in the form
 /// `(Token "<kind>" <start address> <length> <line> <column>)`.
 void Token::dump() {
   // Print the start pointer as an address rather than as a C string.
   const void* startAddress = static_cast<const void*>(start);

   std::cerr << "(Token \"" << getKindName() << "\" ";
   std::cerr << startAddress << " " << length << " ";
   std::cerr << line << " " << column << ")\n";
 }
 #endif

 ///

 /// Create a lexer over \p buffer. Lexing starts at the first byte of the
 /// buffer, on line 1 and column 0, with no special lexing mode selected.
 Lexer::Lexer(StringRef buffer)
     : buffer(buffer),
       bufferPos(buffer.data()),
       lineNumber(1),
       columnNumber(0),
       mode(LexingMode::None) {}

 /// The destructor has no work of its own to perform.
 Lexer::~Lexer() = default;

 /// Return the next byte of the buffer without consuming it, or -1 once the
 /// end of the buffer has been reached.
 ///
 /// The byte is widened through `unsigned char`, so the result is always in the
 /// range 0..255. Returning the plain `char` would sign-extend bytes >= 0x80
 /// where `char` is signed: the byte 0xFF would then compare equal to the -1
 /// end-of-buffer sentinel, and callers would hand negative values to
 /// isspace().
 ///
 /// Unlike getNextChar(), no newline normalization is applied here.
 int Lexer::peekNextChar() {
   if (bufferPos == buffer.end())
     return -1;
   return static_cast<unsigned char>(*bufferPos);
 }

 /// Consume the next character and return it, or return -1 at the end of the
 /// buffer (in which case nothing is consumed).
 ///
 /// A '\n' or '\r', optionally followed by the other one of the two, is
 /// consumed as a single unit and reported as '\n'. The line and column
 /// counters are updated for the consumed character. Any other byte is returned
 /// in the range 0..255 so that it can never be mistaken for the -1 sentinel.
 int Lexer::getNextChar() {
   if (bufferPos == buffer.end())
     return -1;

   char result = *bufferPos++;
   if (result == '\n' || result == '\r') {
     // Handle DOS/Mac newlines: swallow the complementary half of a
     // two-character line ending and report every flavor as '\n'.
     const char partner = (result == '\n') ? '\r' : '\n';
     if (bufferPos != buffer.end() && *bufferPos == partner)
       ++bufferPos;
     result = '\n';
   }

   if (result == '\n') {
     ++lineNumber;
     columnNumber = 0;
   } else {
     ++columnNumber;
   }

   // Widen through unsigned char so bytes >= 0x80 are not sign-extended into
   // negative values (0xFF would otherwise equal the end-of-buffer sentinel).
   return static_cast<unsigned char>(result);
 }

 /// Finish \p result as a token of kind \p kind and return it. The token's
 /// length is the distance from its recorded start to the current buffer
 /// position.
 Token& Lexer::setTokenKind(Token& result, Token::Kind kind) const {
   result.length = bufferPos - result.start;
   result.tokenKind = kind;
   return result;
 }

 /// Consume characters up to, but not including, the next '\n' or the end of
 /// the buffer. The newline itself is left in place so that it is later lexed
 /// as a Newline token.
 void Lexer::skipToEndOfLine() {
   int next = peekNextChar();
   while (next != '\n' && next != -1) {
     getNextChar();
     next = peekNextChar();
   }
 }

 /// Classify the identifier text that runs from `result.start` to the current
 /// buffer position: if it is exactly one of the keywords `rule`, `pool`,
 /// `build`, `default`, `include` or `subninja`, the matching keyword kind is
 /// assigned; otherwise the token becomes a plain Identifier.
 ///
 /// \returns \p result, with its kind and length set.
 Token& Lexer::setIdentifierTokenKind(Token& result) const {
   const unsigned length = bufferPos - result.start;

   // True when the token text is exactly `keyword`. The full spelling is always
   // compared, so an identifier that merely shares a prefix with a keyword
   // (e.g. "subninjX") stays an identifier.
   auto isKeyword = [&](const char* keyword) {
     return strlen(keyword) == length &&
            memcmp(keyword, result.start, length) == 0;
   };

   if (isKeyword("rule"))
     return setTokenKind(result, Token::Kind::KWRule);
   if (isKeyword("pool"))
     return setTokenKind(result, Token::Kind::KWPool);
   if (isKeyword("build"))
     return setTokenKind(result, Token::Kind::KWBuild);
   if (isKeyword("default"))
     return setTokenKind(result, Token::Kind::KWDefault);
   if (isKeyword("include"))
     return setTokenKind(result, Token::Kind::KWInclude);
   if (isKeyword("subninja"))
     return setTokenKind(result, Token::Kind::KWSubninja);

   return setTokenKind(result, Token::Kind::Identifier);
 }

 /// Consume the remaining identifier characters and finish \p result either as
 /// a keyword or as an Identifier token.
 Token& Lexer::lexIdentifier(Token& result) {
   // Advance past every character that can be part of an identifier.
   for (;;) {
     if (!Lexer::isIdentifierChar(peekNextChar()))
       break;
     getNextChar();
   }

   // Keywords are only recognized outside of identifier specific mode.
   const bool recognizeKeywords = mode != Lexer::LexingMode::IdentifierSpecific;
   return recognizeKeywords
              ? setIdentifierTokenKind(result)
              : setTokenKind(result, Token::Kind::Identifier);
 }

 /// Return true if \p c is a whitespace character other than '\n'.
 ///
 /// \param c A character value, or -1 for the lexer's end-of-buffer sentinel.
 ///
 /// The sentinel is rejected up front and the value is narrowed to
 /// `unsigned char` before it reaches isspace(): passing isspace() a negative
 /// value other than EOF is undefined behavior, which is what a sign-extended
 /// byte >= 0x80 would otherwise be.
 static bool isNonNewlineSpace(int c) {
   if (c == -1 || c == '\n')
     return false;
   return isspace(static_cast<unsigned char>(c)) != 0;
 }

 /// Lex a String token in a path context: characters are consumed until
 /// whitespace, ':', '|', or the end of the buffer is reached.
 ///
 /// A '$' escapes the character that follows it, so that character never
 /// terminates the string. When the escaped character is a newline, the
 /// non-newline whitespace that follows it is consumed as well.
 ///
 /// \returns \p result, finished as a String token.
 Token &Lexer::lexPathString(Token &result) {
   while (true) {
     int c = peekNextChar();

     // If this is an escape character, skip the next character.
     if (c == '$') {
       getNextChar(); // Consume the actual '$'.

       // Consume the escaped character.
       c = getNextChar();

       // If the character was a newline, consume any leading spaces.
       if (c == '\n') {
         while (isNonNewlineSpace(peekNextChar()))
           getNextChar();
       }

       continue;
     }

     // Stop at the end of the buffer or at a path delimiter. The -1 sentinel is
     // tested before isspace(), and the byte is narrowed to unsigned char,
     // because isspace() is undefined for negative values other than EOF.
     if (c == -1 || c == ':' || c == '|' ||
         isspace(static_cast<unsigned char>(c)))
       break;

     getNextChar();
   }

   return setTokenKind(result, Token::Kind::String);
 }

 /// Lex a String token in a variable assignment context: everything up to, but
 /// not including, the next unescaped newline or the end of the buffer.
 ///
 /// \returns \p result, finished as a String token.
 Token& Lexer::lexVariableString(Token& result) {
   for (int next = peekNextChar(); next != '\n' && next != -1;
        next = peekNextChar()) {
     // Consume the character itself.
     getNextChar();

     // A '$' escapes whatever follows it, so consume that character too.
     if (next == '$')
       getNextChar();
   }

   return setTokenKind(result, Token::Kind::String);
 }

 /// Lex the next token from the buffer into \p result and return it.
 ///
 /// Whitespace that begins at column 0 is reported as an Indentation token.
 /// Otherwise leading whitespace and "$\n" escapes are skipped, the token's
 /// start, line and column are recorded, and the token is produced according to
 /// the current lexing mode.
 Token& Lexer::lex(Token& result) {
   int next = peekNextChar();

   // Whitespace at the start of a line is reported as one Indentation token.
   //
   // "$\n" sequences need no handling here because they will be consumed on
   // the next call, and the exact length of the indentation token is never
   // used.
   if (columnNumber == 0 && isNonNewlineSpace(next)) {
     result.start = bufferPos;
     result.line = lineNumber;
     result.column = columnNumber;

     getNextChar();
     while (isNonNewlineSpace(peekNextChar()))
       getNextChar();

     return setTokenKind(result, Token::Kind::Indentation);
   }

   // Skip leading whitespace and "$\n" escape sequences. The escape is not
   // recognized at the start of a line, which Ninja does not recognize either.
   for (;; next = peekNextChar()) {
     if (next == '$' && columnNumber != 0) {
       const bool isNewlineEscape =
           bufferPos + 1 != buffer.end() && bufferPos[1] == '\n';

       // Any other '$' is lexed normally below.
       if (!isNewlineEscape)
         break;

       getNextChar(); // The '$'.
       getNextChar(); // The newline.
       continue;
     }

     if (!isNonNewlineSpace(next))
       break;
     getNextChar();
   }

   // The token proper starts here.
   result.start = bufferPos;
   result.line = lineNumber;
   result.column = columnNumber;

   // End of file and newlines are lexed the same way in every mode.
   if (next == -1)
     return setTokenKind(result, Token::Kind::EndOfFile);
   if (next == '\n') {
     getNextChar();
     return setTokenKind(result, Token::Kind::Newline);
   }

   // In a string lexing mode, hand off to the matching string lexer. Path
   // strings leave ':' and '|' to the regular token set below.
   const bool isPathDelimiter = next == ':' || next == '|';
   if (mode == LexingMode::VariableString)
     return lexVariableString(result);
   if (mode == LexingMode::PathString && !isPathDelimiter)
     return lexPathString(result);

   // Otherwise, consume the character and lex from the regular token set.
   getNextChar();

   if (next == ':')
     return setTokenKind(result, Token::Kind::Colon);
   if (next == '=')
     return setTokenKind(result, Token::Kind::Equals);

   if (next == '#') {
     skipToEndOfLine();
     return setTokenKind(result, Token::Kind::Comment);
   }

   if (next == '|') {
     if (peekNextChar() != '|')
       return setTokenKind(result, Token::Kind::Pipe);
     getNextChar();
     return setTokenKind(result, Token::Kind::PipePipe);
   }

   if (Lexer::isIdentifierChar(next))
     return lexIdentifier(result);

   return setTokenKind(result, Token::Kind::Unknown);
 }
	//===-- Lexer.cpp ---------------------------------------------------------===//
	//
	// This source file is part of the Swift.org open source project
	//
	// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
	// Licensed under Apache License v2.0 with Runtime Library Exception
	//
	// See http://swift.org/LICENSE.txt for license information
	// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
	//
	//===----------------------------------------------------------------------===//

	#include "llbuild/Ninja/Lexer.h"

	#include "llbuild/Basic/LLVM.h"

	#include <cstring>
	#include <string>
	#include <iostream>
	#include <iomanip>

	using namespace llbuild;
	using namespace llbuild::ninja;

	///

	/// Return the enumerator spelling of this token's kind as a C string.
	const char* Token::getKindName() const {
	  switch (tokenKind) {
	  case Kind::Colon:       return "Colon";
	  case Kind::Comment:     return "Comment";
	  case Kind::EndOfFile:   return "EndOfFile";
	  case Kind::Equals:      return "Equals";
	  case Kind::Identifier:  return "Identifier";
	  case Kind::Indentation: return "Indentation";
	  case Kind::KWBuild:     return "KWBuild";
	  case Kind::KWDefault:   return "KWDefault";
	  case Kind::KWInclude:   return "KWInclude";
	  case Kind::KWPool:      return "KWPool";
	  case Kind::KWRule:      return "KWRule";
	  case Kind::KWSubninja:  return "KWSubninja";
	  case Kind::Newline:     return "Newline";
	  case Kind::Pipe:        return "Pipe";
	  case Kind::PipePipe:    return "PipePipe";
	  case Kind::String:      return "String";
	  case Kind::Unknown:     return "Unknown";
	  }

	  // Reached only when tokenKind matches none of the cases above.
	  return "<invalid token kind>";
	}

	#ifndef NDEBUG
	/// Write a one-line description of this token to stderr, in the form
	/// `(Token "<kind>" <start address> <length> <line> <column>)`.
	void Token::dump() {
	  // Print the start pointer as an address rather than as a C string.
	  const void* startAddress = static_cast<const void*>(start);

	  std::cerr << "(Token \"" << getKindName() << "\" ";
	  std::cerr << startAddress << " " << length << " ";
	  std::cerr << line << " " << column << ")\n";
	}
	#endif

	///

	/// Create a lexer over \p buffer. Lexing starts at the first byte of the
	/// buffer, on line 1 and column 0, with no special lexing mode selected.
	Lexer::Lexer(StringRef buffer)
	    : buffer(buffer),
	      bufferPos(buffer.data()),
	      lineNumber(1),
	      columnNumber(0),
	      mode(LexingMode::None) {}

	/// The destructor has no work of its own to perform.
	Lexer::~Lexer() = default;

	/// Return the next byte of the buffer without consuming it, or -1 once the
	/// end of the buffer has been reached.
	///
	/// The byte is widened through `unsigned char`, so the result is always in the
	/// range 0..255. Returning the plain `char` would sign-extend bytes >= 0x80
	/// where `char` is signed: the byte 0xFF would then compare equal to the -1
	/// end-of-buffer sentinel, and callers would hand negative values to
	/// isspace().
	///
	/// Unlike getNextChar(), no newline normalization is applied here.
	int Lexer::peekNextChar() {
	  if (bufferPos == buffer.end())
	    return -1;
	  return static_cast<unsigned char>(*bufferPos);
	}

	/// Consume the next character and return it, or return -1 at the end of the
	/// buffer (in which case nothing is consumed).
	///
	/// A '\n' or '\r', optionally followed by the other one of the two, is
	/// consumed as a single unit and reported as '\n'. The line and column
	/// counters are updated for the consumed character. Any other byte is returned
	/// in the range 0..255 so that it can never be mistaken for the -1 sentinel.
	int Lexer::getNextChar() {
	  if (bufferPos == buffer.end())
	    return -1;

	  char result = *bufferPos++;
	  if (result == '\n' || result == '\r') {
	    // Handle DOS/Mac newlines: swallow the complementary half of a
	    // two-character line ending and report every flavor as '\n'.
	    const char partner = (result == '\n') ? '\r' : '\n';
	    if (bufferPos != buffer.end() && *bufferPos == partner)
	      ++bufferPos;
	    result = '\n';
	  }

	  if (result == '\n') {
	    ++lineNumber;
	    columnNumber = 0;
	  } else {
	    ++columnNumber;
	  }

	  // Widen through unsigned char so bytes >= 0x80 are not sign-extended into
	  // negative values (0xFF would otherwise equal the end-of-buffer sentinel).
	  return static_cast<unsigned char>(result);
	}

	/// Finish \p result as a token of kind \p kind and return it. The token's
	/// length is the distance from its recorded start to the current buffer
	/// position.
	Token& Lexer::setTokenKind(Token& result, Token::Kind kind) const {
	  result.length = bufferPos - result.start;
	  result.tokenKind = kind;
	  return result;
	}

	/// Consume characters up to, but not including, the next '\n' or the end of
	/// the buffer. The newline itself is left in place so that it is later lexed
	/// as a Newline token.
	void Lexer::skipToEndOfLine() {
	  for (;;) {
	    const int c = peekNextChar();
	    if (c == -1 || c == '\n')
	      break;
	    getNextChar();
	  }
	}

	/// Classify the identifier text that runs from `result.start` to the current
	/// buffer position: if it is exactly one of the keywords `rule`, `pool`,
	/// `build`, `default`, `include` or `subninja`, the matching keyword kind is
	/// assigned; otherwise the token becomes a plain Identifier.
	///
	/// \returns \p result, with its kind and length set.
	Token& Lexer::setIdentifierTokenKind(Token& result) const {
	  const unsigned length = bufferPos - result.start;

	  // True when the token text is exactly `keyword`. The full spelling is always
	  // compared, so an identifier that merely shares a prefix with a keyword
	  // (e.g. "subninjX") stays an identifier.
	  auto isKeyword = [&](const char* keyword) {
	    return strlen(keyword) == length &&
	           memcmp(keyword, result.start, length) == 0;
	  };

	  if (isKeyword("rule"))
	    return setTokenKind(result, Token::Kind::KWRule);
	  if (isKeyword("pool"))
	    return setTokenKind(result, Token::Kind::KWPool);
	  if (isKeyword("build"))
	    return setTokenKind(result, Token::Kind::KWBuild);
	  if (isKeyword("default"))
	    return setTokenKind(result, Token::Kind::KWDefault);
	  if (isKeyword("include"))
	    return setTokenKind(result, Token::Kind::KWInclude);
	  if (isKeyword("subninja"))
	    return setTokenKind(result, Token::Kind::KWSubninja);

	  return setTokenKind(result, Token::Kind::Identifier);
	}

	/// Consume the remaining identifier characters and finish \p result either as
	/// a keyword or as an Identifier token.
	Token& Lexer::lexIdentifier(Token& result) {
	  // Advance past every character that can be part of an identifier.
	  for (;;) {
	    if (!Lexer::isIdentifierChar(peekNextChar()))
	      break;
	    getNextChar();
	  }

	  // Keywords are only recognized outside of identifier specific mode.
	  const bool recognizeKeywords = mode != Lexer::LexingMode::IdentifierSpecific;
	  return recognizeKeywords
	             ? setIdentifierTokenKind(result)
	             : setTokenKind(result, Token::Kind::Identifier);
	}

	/// Return true if \p c is a whitespace character other than '\n'.
	///
	/// \param c A character value, or -1 for the lexer's end-of-buffer sentinel.
	///
	/// The sentinel is rejected up front and the value is narrowed to
	/// `unsigned char` before it reaches isspace(): passing isspace() a negative
	/// value other than EOF is undefined behavior, which is what a sign-extended
	/// byte >= 0x80 would otherwise be.
	static bool isNonNewlineSpace(int c) {
	  if (c == -1 || c == '\n')
	    return false;
	  return isspace(static_cast<unsigned char>(c)) != 0;
	}

	/// Lex a String token in a path context: characters are consumed until
	/// whitespace, ':', '|', or the end of the buffer is reached.
	///
	/// A '$' escapes the character that follows it, so that character never
	/// terminates the string. When the escaped character is a newline, the
	/// non-newline whitespace that follows it is consumed as well.
	///
	/// \returns \p result, finished as a String token.
	Token &Lexer::lexPathString(Token &result) {
	  while (true) {
	    int c = peekNextChar();

	    // If this is an escape character, skip the next character.
	    if (c == '$') {
	      getNextChar(); // Consume the actual '$'.

	      // Consume the escaped character.
	      c = getNextChar();

	      // If the character was a newline, consume any leading spaces.
	      if (c == '\n') {
	        while (isNonNewlineSpace(peekNextChar()))
	          getNextChar();
	      }

	      continue;
	    }

	    // Stop at the end of the buffer or at a path delimiter. The -1 sentinel is
	    // tested before isspace(), and the byte is narrowed to unsigned char,
	    // because isspace() is undefined for negative values other than EOF.
	    if (c == -1 || c == ':' || c == '|' ||
	        isspace(static_cast<unsigned char>(c)))
	      break;

	    getNextChar();
	  }

	  return setTokenKind(result, Token::Kind::String);
	}

	/// Lex a String token in a variable assignment context: everything up to, but
	/// not including, the next unescaped newline or the end of the buffer.
	///
	/// \returns \p result, finished as a String token.
	Token& Lexer::lexVariableString(Token& result) {
	  while (true) {
	    const int c = peekNextChar();

	    // If this is an escape character, skip the next character.
	    if (c == '$') {
	      getNextChar(); // Consume the actual '$'.
	      getNextChar(); // Consume the escaped character.
	      continue;
	    }

	    // Otherwise, continue only if this is not the EOL or EOF.
	    if (c == '\n' || c == -1)
	      break;

	    getNextChar();
	  }

	  return setTokenKind(result, Token::Kind::String);
	}

	/// Lex the next token from the buffer into \p result and return it.
	///
	/// Whitespace that begins at column 0 is reported as an Indentation token.
	/// Otherwise leading whitespace and "$\n" escapes are skipped, the token's
	/// start, line and column are recorded, and the token is produced according to
	/// the current lexing mode.
	Token& Lexer::lex(Token& result) {
	  int next = peekNextChar();

	  // Whitespace at the start of a line is reported as one Indentation token.
	  //
	  // "$\n" sequences need no handling here because they will be consumed on
	  // the next call, and the exact length of the indentation token is never
	  // used.
	  if (columnNumber == 0 && isNonNewlineSpace(next)) {
	    result.start = bufferPos;
	    result.line = lineNumber;
	    result.column = columnNumber;

	    getNextChar();
	    while (isNonNewlineSpace(peekNextChar()))
	      getNextChar();

	    return setTokenKind(result, Token::Kind::Indentation);
	  }

	  // Skip leading whitespace and "$\n" escape sequences. The escape is not
	  // recognized at the start of a line, which Ninja does not recognize either.
	  for (;; next = peekNextChar()) {
	    if (next == '$' && columnNumber != 0) {
	      const bool isNewlineEscape =
	          bufferPos + 1 != buffer.end() && bufferPos[1] == '\n';

	      // Any other '$' is lexed normally below.
	      if (!isNewlineEscape)
	        break;

	      getNextChar(); // The '$'.
	      getNextChar(); // The newline.
	      continue;
	    }

	    if (!isNonNewlineSpace(next))
	      break;
	    getNextChar();
	  }

	  // The token proper starts here.
	  result.start = bufferPos;
	  result.line = lineNumber;
	  result.column = columnNumber;

	  // End of file and newlines are lexed the same way in every mode.
	  if (next == -1)
	    return setTokenKind(result, Token::Kind::EndOfFile);
	  if (next == '\n') {
	    getNextChar();
	    return setTokenKind(result, Token::Kind::Newline);
	  }

	  // In a string lexing mode, hand off to the matching string lexer. Path
	  // strings leave ':' and '|' to the regular token set below.
	  const bool isPathDelimiter = next == ':' || next == '|';
	  if (mode == LexingMode::VariableString)
	    return lexVariableString(result);
	  if (mode == LexingMode::PathString && !isPathDelimiter)
	    return lexPathString(result);

	  // Otherwise, consume the character and lex from the regular token set.
	  getNextChar();

	  if (next == ':')
	    return setTokenKind(result, Token::Kind::Colon);
	  if (next == '=')
	    return setTokenKind(result, Token::Kind::Equals);

	  if (next == '#') {
	    skipToEndOfLine();
	    return setTokenKind(result, Token::Kind::Comment);
	  }

	  if (next == '|') {
	    if (peekNextChar() != '|')
	      return setTokenKind(result, Token::Kind::Pipe);
	    getNextChar();
	    return setTokenKind(result, Token::Kind::PipePipe);
	  }

	  if (Lexer::isIdentifierChar(next))
	    return lexIdentifier(result);

	  return setTokenKind(result, Token::Kind::Unknown);
	}