| //===--- Lexer.cpp - Swift Language Lexer ---------------------------------===// |
| // |
| // This source file is part of the Swift.org open source project |
| // |
| // Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors |
| // Licensed under Apache License v2.0 with Runtime Library Exception |
| // |
| // See https://swift.org/LICENSE.txt for license information |
| // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements the Lexer and Token interfaces. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "swift/Parse/Confusables.h" |
| #include "swift/Parse/Lexer.h" |
| #include "swift/AST/DiagnosticsParse.h" |
| #include "swift/AST/Identifier.h" |
| #include "swift/Basic/LangOptions.h" |
| #include "swift/Basic/SourceManager.h" |
| #include "swift/Syntax/SyntaxParsingContext.h" |
| #include "swift/Syntax/RawTokenSyntax.h" |
| #include "llvm/Support/Compiler.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/MemoryBuffer.h" |
| #include "llvm/ADT/SmallString.h" |
| #include "llvm/ADT/StringSwitch.h" |
| #include "llvm/ADT/Twine.h" |
| // FIXME: Figure out if this can be migrated to LLVM. |
| #include "clang/Basic/CharInfo.h" |
| |
| #include <limits> |
| |
| using namespace swift; |
| using namespace swift::syntax; |
| |
| // clang::isIdentifierHead and clang::isIdentifierBody are deliberately not in |
| // this list as a reminder that they are using C rules for identifiers. |
| // (Admittedly these are the same as Swift's right now.) |
| using clang::isAlphanumeric; |
| using clang::isDigit; |
| using clang::isHexDigit; |
| using clang::isHorizontalWhitespace; |
| using clang::isPrintable; |
| using clang::isWhitespace; |
| |
| //===----------------------------------------------------------------------===// |
| // UTF8 Validation/Encoding/Decoding helper functions |
| //===----------------------------------------------------------------------===// |
| |
| /// EncodeToUTF8 - Encode the specified code point into a UTF8 stream. Return |
| /// true if it is an erroneous code point. |
| static bool EncodeToUTF8(unsigned CharValue, |
| SmallVectorImpl<char> &Result) { |
| // Number of bits in the value, ignoring leading zeros. |
| unsigned NumBits = 32-llvm::countLeadingZeros(CharValue); |
| |
| // Handle the leading byte, based on the number of bits in the value. |
| unsigned NumTrailingBytes; |
| if (NumBits <= 5+6) { |
| // Encoding is 0x110aaaaa 10bbbbbb |
| Result.push_back(char(0xC0 | (CharValue >> 6))); |
| NumTrailingBytes = 1; |
| } else if (NumBits <= 4+6+6) { |
| // Encoding is 0x1110aaaa 10bbbbbb 10cccccc |
| Result.push_back(char(0xE0 | (CharValue >> (6+6)))); |
| NumTrailingBytes = 2; |
| |
| // UTF-16 surrogate pair values are not valid code points. |
| if (CharValue >= 0xD800 && CharValue <= 0xDFFF) |
| return true; |
| // U+FDD0...U+FDEF are also reserved |
| if (CharValue >= 0xFDD0 && CharValue <= 0xFDEF) |
| return true; |
| } else if (NumBits <= 3+6+6+6) { |
| // Encoding is 0x11110aaa 10bbbbbb 10cccccc 10dddddd |
| Result.push_back(char(0xF0 | (CharValue >> (6+6+6)))); |
| NumTrailingBytes = 3; |
| // Reject over-large code points. These cannot be encoded as UTF-16 |
| // surrogate pairs, so UTF-32 doesn't allow them. |
| if (CharValue > 0x10FFFF) |
| return true; |
| } else { |
| return true; // UTF8 can encode these, but they aren't valid code points. |
| } |
| |
| // Emit all of the trailing bytes. |
| while (NumTrailingBytes--) |
| Result.push_back(char(0x80 | (0x3F & (CharValue >> (NumTrailingBytes*6))))); |
| return false; |
| } |
| |
| |
/// CLO8 - Return the number of leading ones in the specified 8-bit value.
static unsigned CLO8(unsigned char C) {
  // Walk the bits from the most-significant end, stopping at the first zero.
  unsigned Count = 0;
  for (unsigned Mask = 0x80; Mask != 0 && (C & Mask); Mask >>= 1)
    ++Count;
  return Count;
}
| |
/// isStartOfUTF8Character - Return true if this isn't a UTF8 continuation
/// character, which will be of the form 0b10XXXXXX
static bool isStartOfUTF8Character(unsigned char C) {
  // RFC 2279: The octet values FE and FF never appear.
  // RFC 3629: The octet values C0, C1, F5 to FF never appear.
  //
  // Note the strict '<': 0x80 itself is 0b10000000, i.e. a continuation
  // byte, so it must NOT be classified as the start of a character.
  return C < 0x80 || (C >= 0xC2 && C < 0xF5);
}
| |
/// validateUTF8CharacterAndAdvance - Given a pointer to the starting byte of a
/// UTF8 character, validate it and advance the lexer past it. This returns the
/// encoded character or ~0U if the encoding is invalid.
///
/// On failure, \p Ptr is left past the malformed sequence (skipping stray
/// continuation bytes) so the caller can resynchronize at the next character.
uint32_t swift::validateUTF8CharacterAndAdvance(const char *&Ptr,
                                                const char *End) {
  // An empty range contains no character.
  if (Ptr >= End)
    return ~0U;

  unsigned char CurByte = *Ptr++;
  // 7-bit ASCII is always a valid single-byte character.
  if (CurByte < 0x80)
    return CurByte;

  // Read the number of high bits set, which indicates the number of bytes in
  // the character.
  unsigned EncodedBytes = CLO8(CurByte);

  // If this is 0b10XXXXXX, then it is a continuation character.
  if (EncodedBytes == 1 ||
      !isStartOfUTF8Character(CurByte)) {
    // Skip until we get the start of another character. This is guaranteed to
    // at least stop at the nul at the end of the buffer.
    while (Ptr < End && !isStartOfUTF8Character(*Ptr))
      ++Ptr;
    return ~0U;
  }

  // Drop the high bits indicating the # bytes of the result.
  unsigned CharValue = (unsigned char)(CurByte << EncodedBytes) >> EncodedBytes;

  // Read and validate the continuation bytes, accumulating 6 payload bits
  // from each.
  for (unsigned i = 1; i != EncodedBytes; ++i) {
    if (Ptr >= End)
      return ~0U;
    CurByte = *Ptr;
    // If the high bit isn't set or the second bit isn't clear, then this is not
    // a continuation byte!
    if (CurByte < 0x80 || CurByte >= 0xC0) return ~0U;

    // Accumulate our result.
    CharValue <<= 6;
    CharValue |= CurByte & 0x3F;
    ++Ptr;
  }

  // UTF-16 surrogate pair values are not valid code points.
  if (CharValue >= 0xD800 && CharValue <= 0xDFFF)
    return ~0U;

  // If we got here, we read the appropriate number of accumulated bytes.
  // Verify that the encoding was actually minimal: the value must not have
  // fit in a shorter sequence than the one used (rejects over-long forms).
  // Number of bits in the value, ignoring leading zeros.
  unsigned NumBits = 32-llvm::countLeadingZeros(CharValue);

  if (NumBits <= 5+6)
    return EncodedBytes == 2 ? CharValue : ~0U;
  if (NumBits <= 4+6+6)
    return EncodedBytes == 3 ? CharValue : ~0U;
  return EncodedBytes == 4 ? CharValue : ~0U;
}
| |
| //===----------------------------------------------------------------------===// |
| // Setup and Helper Methods |
| //===----------------------------------------------------------------------===// |
| |
| Lexer::Lexer(const LangOptions &Options, |
| const SourceManager &SM, DiagnosticEngine *Diags, |
| unsigned BufferID, bool InSILMode, |
| CommentRetentionMode RetainComments, |
| TriviaRetentionMode TriviaRetention) |
| : LangOpts(Options), SourceMgr(SM), Diags(Diags), BufferID(BufferID), |
| InSILMode(InSILMode), RetainComments(RetainComments), |
| TriviaRetention(TriviaRetention) { |
| // Initialize buffer pointers. |
| StringRef contents = SM.extractText(SM.getRangeForBuffer(BufferID)); |
| BufferStart = contents.data(); |
| BufferEnd = contents.data() + contents.size(); |
| |
| // Check for Unicode BOM at start of file (Only UTF-8 BOM supported now). |
| size_t BOMLength = llvm::StringSwitch<size_t>(contents) |
| .StartsWith("\xEF\xBB\xBF", 3) |
| .Default(0); |
| |
| // Since the UTF-8 BOM doesn't carry information (UTF-8 has no dependency |
| // on byte order), throw it away. |
| CurPtr = BufferStart + BOMLength; |
| ContentStart = BufferStart + BOMLength; |
| |
| // Initialize code completion. |
| if (BufferID == SM.getCodeCompletionBufferID()) { |
| const char *Ptr = BufferStart + SM.getCodeCompletionOffset(); |
| if (Ptr >= BufferStart && Ptr <= BufferEnd) |
| CodeCompletionPtr = Ptr; |
| } |
| } |
| |
/// Lex the first token of the buffer into NextToken. Expected to run exactly
/// once, before any token is consumed (NextToken must still be the
/// NUM_TOKENS sentinel).
void Lexer::primeLexer() {
  // NUM_TOKENS marks "no token lexed yet"; priming twice would be a bug.
  assert(NextToken.is(tok::NUM_TOKENS));
  lexImpl();
  assert((NextToken.isAtStartOfLine() || CurPtr != BufferStart) &&
         "The token should be at the beginning of the line, "
         "or we should be lexing from the middle of the buffer");
}
| |
/// Configure this lexer as a sub-lexer of \p Parent restricted to the source
/// range from \p BeginState to \p EndState, then lex the first token.
void Lexer::initSubLexer(Lexer &Parent, State BeginState, State EndState) {
  assert(BufferID == SourceMgr.findBufferContainingLoc(BeginState.Loc) &&
         "state for the wrong buffer");
  assert(BufferID == SourceMgr.findBufferContainingLoc(EndState.Loc) &&
         "state for the wrong buffer");

  // If the parent lexer should stop prematurely, and the ArtificialEOF
  // position is in this subrange, then we should stop at that point, too.
  const char *BeginStatePtr = getBufferPtrForSourceLoc(BeginState.Loc);
  const char *EndStatePtr = getBufferPtrForSourceLoc(EndState.Loc);
  if (Parent.ArtificialEOF &&
      Parent.ArtificialEOF >= BeginStatePtr &&
      Parent.ArtificialEOF <= EndStatePtr) {
    ArtificialEOF = Parent.ArtificialEOF;
  } else
    ArtificialEOF = EndStatePtr;

  // Lex the first token, then rewind to the requested start position.
  primeLexer();
  restoreState(BeginState);
}
| |
| InFlightDiagnostic Lexer::diagnose(const char *Loc, Diagnostic Diag) { |
| if (Diags) |
| return Diags->diagnose(getSourceLoc(Loc), Diag); |
| |
| return InFlightDiagnostic(); |
| } |
| |
/// Relex and return the token that starts at \p Loc, using a fresh sub-lexer
/// (no comment or trivia retention) over the same buffer.
Token Lexer::getTokenAt(SourceLoc Loc) {
  assert(BufferID == static_cast<unsigned>(
                         SourceMgr.findBufferContainingLoc(Loc)) &&
         "location from the wrong buffer");

  // A throwaway lexer positioned at Loc; this lexer's own state is untouched.
  Lexer L(LangOpts, SourceMgr, BufferID, Diags, InSILMode,
          CommentRetentionMode::None, TriviaRetentionMode::WithoutTrivia);
  L.restoreState(State(Loc));
  Token Result;
  L.lex(Result);
  return Result;
}
| |
/// Package the token just lexed — kind \p Kind, text [TokStart, CurPtr) —
/// into NextToken, attaching any pending comment-prefix length and lexing
/// trailing trivia.
void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) {
  assert(CurPtr >= BufferStart &&
         CurPtr <= BufferEnd && "Current pointer out of range!");

  // When we are lexing a subrange from the middle of a file buffer, we will
  // run past the end of the range, but will stay within the file. Check if
  // we are past the imaginary EOF, and synthesize a tok::eof in this case.
  if (Kind != tok::eof && ArtificialEOF && TokStart >= ArtificialEOF) {
    Kind = tok::eof;
  }
  // In AttachToNextToken mode, record the preceding comment run as a prefix
  // length on this token.
  unsigned CommentLength = 0;
  if (RetainComments == CommentRetentionMode::AttachToNextToken && SeenComment)
    CommentLength = TokStart - LastCommentBlockStart;

  StringRef TokenText { TokStart, static_cast<size_t>(CurPtr - TokStart) };

  // Lex the trivia (whitespace/comments) that follows this token.
  lexTrivia(TrailingTrivia, /* IsForTrailingTrivia */ true);

  NextToken.setToken(Kind, TokenText, CommentLength, MultilineString);
}
| |
/// Compute a lexer state that, when restored, relexes the token beginning at
/// \p Loc with correct start-of-line information.
Lexer::State Lexer::getStateForBeginningOfTokenLoc(SourceLoc Loc) const {
  const char *Ptr = getBufferPtrForSourceLoc(Loc);
  // Skip whitespace backwards until we hit a newline. This is needed to
  // correctly lex the token if it is at the beginning of the line.
  while (Ptr >= ContentStart + 1) {
    char C = Ptr[-1];
    if (C == ' ' || C == '\t') {
      --Ptr;
      continue;
    }
    if (C == 0) {
      // A NUL character can be either whitespace we diagnose or a code
      // completion token.
      if (Ptr - 1 == CodeCompletionPtr)
        break;
      --Ptr;
      continue;
    }
    if (C == '\n' || C == '\r') {
      // Back up over the newline itself so the restored lexer starts a line.
      --Ptr;
      break;
    }
    break;
  }
  return State(SourceLoc(llvm::SMLoc::getFromPointer(Ptr)));
}
| |
| //===----------------------------------------------------------------------===// |
| // Lexer Subroutines |
| //===----------------------------------------------------------------------===// |
| |
| static void diagnoseEmbeddedNul(DiagnosticEngine *Diags, const char *Ptr) { |
| assert(Ptr && "invalid source location"); |
| assert(*Ptr == '\0' && "not an embedded null"); |
| |
| if (!Diags) |
| return; |
| |
| SourceLoc NulLoc = Lexer::getSourceLoc(Ptr); |
| SourceLoc NulEndLoc = Lexer::getSourceLoc(Ptr+1); |
| Diags->diagnose(NulLoc, diag::lex_nul_character) |
| .fixItRemoveChars(NulLoc, NulEndLoc); |
| } |
| |
/// Advance CurPtr to the end of the current line.
///
/// If \p EatNewline is true, the newline is consumed and the next token is
/// flagged as starting a line; otherwise CurPtr is left pointing at the
/// newline. Stops (without consuming) at the end-of-buffer NUL.
void Lexer::skipToEndOfLine(bool EatNewline) {
  while (1) {
    switch (*CurPtr++) {
    case '\n':
    case '\r':
      if (EatNewline) {
        NextToken.setAtStartOfLine(true);
      } else {
        --CurPtr;
      }
      return; // If we found the end of the line, return.
    default:
      // If this is a "high" UTF-8 character, validate it.
      if ((signed char)(CurPtr[-1]) < 0) {
        --CurPtr;
        const char *CharStart = CurPtr;
        if (validateUTF8CharacterAndAdvance(CurPtr, BufferEnd) == ~0U)
          diagnose(CharStart, diag::lex_invalid_utf8);
      }
      break; // Otherwise, eat other characters.
    case 0:
      // If this is a random nul character in the middle of a buffer, skip it as
      // whitespace.
      if (CurPtr-1 != BufferEnd) {
        diagnoseEmbeddedNul(Diags, CurPtr-1);
        break;
      }

      // Otherwise, the last line of the file does not have a newline.
      --CurPtr;
      return;
    }
  }
}
| |
/// Skip a '//' line comment; CurPtr must point at the second slash.
void Lexer::skipSlashSlashComment(bool EatNewline) {
  assert(CurPtr[-1] == '/' && CurPtr[0] == '/' && "Not a // comment");
  skipToEndOfLine(EatNewline);
}
| |
/// Skip a '#!' hashbang line; only valid at the very start of the content.
void Lexer::skipHashbang(bool EatNewline) {
  assert(CurPtr == ContentStart && CurPtr[0] == '#' && CurPtr[1] == '!' &&
         "Not a hashbang");
  skipToEndOfLine(EatNewline);
}
| |
/// skipSlashStarComment - /**/ comments are skipped (treated as whitespace).
/// Note that (unlike in C) block comments can be nested.
void Lexer::skipSlashStarComment() {
  const char *StartPtr = CurPtr-1;
  assert(CurPtr[-1] == '/' && CurPtr[0] == '*' && "Not a /* comment");
  // Make sure to advance over the * so that we don't incorrectly handle /*/ as
  // the beginning and end of the comment.
  ++CurPtr;

  // /**/ comments can be nested, keep track of how deep we've gone.
  unsigned Depth = 1;

  while (1) {
    switch (*CurPtr++) {
    case '*':
      // Check for a '*/'
      if (*CurPtr == '/') {
        ++CurPtr;
        if (--Depth == 0)
          return;
      }
      break;
    case '/':
      // Check for a '/*'
      if (*CurPtr == '*') {
        ++CurPtr;
        ++Depth;
      }
      break;

    case '\n':
    case '\r':
      // Remember that the next token starts a new line.
      NextToken.setAtStartOfLine(true);
      break;

    default:
      // If this is a "high" UTF-8 character, validate it.
      if ((signed char)(CurPtr[-1]) < 0) {
        --CurPtr;
        const char *CharStart = CurPtr;
        if (validateUTF8CharacterAndAdvance(CurPtr, BufferEnd) == ~0U)
          diagnose(CharStart, diag::lex_invalid_utf8);
      }

      break; // Otherwise, eat other characters.
    case 0:
      // If this is a random nul character in the middle of a buffer, skip it as
      // whitespace.
      if (CurPtr-1 != BufferEnd) {
        diagnoseEmbeddedNul(Diags, CurPtr-1);
        break;
      }

      // Otherwise, we have an unterminated /* comment.
      --CurPtr;

      // Count how many levels deep we are: suggest one "*/" per open level.
      llvm::SmallString<8> Terminator("*/");
      while (--Depth != 0)
        Terminator += "*/";

      // Place the fix-it before the trailing newline, if any.
      const char *EOL = (CurPtr[-1] == '\n') ? (CurPtr - 1) : CurPtr;
      diagnose(EOL, diag::lex_unterminated_block_comment)
        .fixItInsert(getSourceLoc(EOL), Terminator);
      diagnose(StartPtr, diag::lex_comment_start);
      return;
    }
  }
}
| |
/// Return true if code point \p c may appear after the first character of an
/// identifier. ASCII follows C identifier rules with '$' allowed; non-ASCII
/// follows the N1518 ranges listed below.
static bool isValidIdentifierContinuationCodePoint(uint32_t c) {
  if (c < 0x80)
    return clang::isIdentifierBody(c, /*dollar*/true);

  // N1518: Recommendations for extended identifier characters for C and C++
  // Proposed Annex X.1: Ranges of characters allowed
  return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF
    || (c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA)
    || (c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6)
    || (c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF)

    || (c >= 0x0100 && c <= 0x167F)
    || (c >= 0x1681 && c <= 0x180D)
    || (c >= 0x180F && c <= 0x1FFF)

    || (c >= 0x200B && c <= 0x200D)
    || (c >= 0x202A && c <= 0x202E)
    || (c >= 0x203F && c <= 0x2040)
    || c == 0x2054
    || (c >= 0x2060 && c <= 0x206F)

    || (c >= 0x2070 && c <= 0x218F)
    || (c >= 0x2460 && c <= 0x24FF)
    || (c >= 0x2776 && c <= 0x2793)
    || (c >= 0x2C00 && c <= 0x2DFF)
    || (c >= 0x2E80 && c <= 0x2FFF)

    || (c >= 0x3004 && c <= 0x3007)
    || (c >= 0x3021 && c <= 0x302F)
    || (c >= 0x3031 && c <= 0x303F)

    || (c >= 0x3040 && c <= 0xD7FF)

    || (c >= 0xF900 && c <= 0xFD3D)
    || (c >= 0xFD40 && c <= 0xFDCF)
    || (c >= 0xFDF0 && c <= 0xFE44)
    || (c >= 0xFE47 && c <= 0xFFF8)

    // Supplementary planes, excluding each plane's final two
    // non-character code points (U+xFFFE/U+xFFFF).
    || (c >= 0x10000 && c <= 0x1FFFD)
    || (c >= 0x20000 && c <= 0x2FFFD)
    || (c >= 0x30000 && c <= 0x3FFFD)
    || (c >= 0x40000 && c <= 0x4FFFD)
    || (c >= 0x50000 && c <= 0x5FFFD)
    || (c >= 0x60000 && c <= 0x6FFFD)
    || (c >= 0x70000 && c <= 0x7FFFD)
    || (c >= 0x80000 && c <= 0x8FFFD)
    || (c >= 0x90000 && c <= 0x9FFFD)
    || (c >= 0xA0000 && c <= 0xAFFFD)
    || (c >= 0xB0000 && c <= 0xBFFFD)
    || (c >= 0xC0000 && c <= 0xCFFFD)
    || (c >= 0xD0000 && c <= 0xDFFFD)
    || (c >= 0xE0000 && c <= 0xEFFFD);
}
| static bool isValidIdentifierStartCodePoint(uint32_t c) { |
| if (!isValidIdentifierContinuationCodePoint(c)) |
| return false; |
| if (c < 0x80 && (isDigit(c) || c == '$')) |
| return false; |
| |
| // N1518: Recommendations for extended identifier characters for C and C++ |
| // Proposed Annex X.2: Ranges of characters disallowed initially |
| if ((c >= 0x0300 && c <= 0x036F) || |
| (c >= 0x1DC0 && c <= 0x1DFF) || |
| (c >= 0x20D0 && c <= 0x20FF) || |
| (c >= 0xFE20 && c <= 0xFE2F)) |
| return false; |
| |
| return true; |
| } |
| |
| static bool advanceIf(char const *&ptr, char const *end, |
| bool (*predicate)(uint32_t)) { |
| char const *next = ptr; |
| uint32_t c = validateUTF8CharacterAndAdvance(next, end); |
| if (c == ~0U) |
| return false; |
| if (predicate(c)) { |
| ptr = next; |
| return true; |
| } |
| return false; |
| |
| } |
| |
/// Advance \p ptr over one code point if it may begin an identifier.
static bool advanceIfValidStartOfIdentifier(char const *&ptr,
                                            char const *end) {
  return advanceIf(ptr, end, isValidIdentifierStartCodePoint);
}
| |
/// Advance \p ptr over one code point if it may continue an identifier.
static bool advanceIfValidContinuationOfIdentifier(char const *&ptr,
                                                   char const *end) {
  return advanceIf(ptr, end, isValidIdentifierContinuationCodePoint);
}
| |
/// Advance \p ptr over one code point if it may begin an operator.
static bool advanceIfValidStartOfOperator(char const *&ptr,
                                          char const *end) {
  return advanceIf(ptr, end, Identifier::isOperatorStartCodePoint);
}
| |
/// Advance \p ptr over one code point if it may continue an operator.
static bool advanceIfValidContinuationOfOperator(char const *&ptr,
                                                 char const *end) {
  return advanceIf(ptr, end, Identifier::isOperatorContinuationCodePoint);
}
| |
| bool Lexer::isIdentifier(StringRef string) { |
| if (string.empty()) return false; |
| char const *p = string.data(), *end = string.end(); |
| if (!advanceIfValidStartOfIdentifier(p, end)) |
| return false; |
| while (p < end && advanceIfValidContinuationOfIdentifier(p, end)); |
| return p == end; |
| } |
| |
| /// \brief Determines if the given string is a valid operator identifier, |
| /// without escaping characters. |
| bool Lexer::isOperator(StringRef string) { |
| if (string.empty()) return false; |
| char const *p = string.data(), *end = string.end(); |
| if (!advanceIfValidStartOfOperator(p, end)) |
| return false; |
| while (p < end && advanceIfValidContinuationOfOperator(p, end)); |
| return p == end; |
| } |
| |
| |
/// Map identifier text to its token kind: the keyword kind when \p Str
/// spells a keyword (SIL-only keywords count only when \p InSILMode),
/// tok::identifier otherwise.
tok Lexer::kindOfIdentifier(StringRef Str, bool InSILMode) {
  tok Kind = llvm::StringSwitch<tok>(Str)
#define KEYWORD(kw) \
    .Case(#kw, tok::kw_##kw)
#include "swift/Syntax/TokenKinds.def"
    .Default(tok::identifier);

  // SIL keywords are only active in SIL mode.
  switch (Kind) {
#define SIL_KEYWORD(kw) \
  case tok::kw_##kw:
#include "swift/Syntax/TokenKinds.def"
    if (!InSILMode)
      Kind = tok::identifier;
    break;
  default:
    break;
  }
  return Kind;
}
| |
/// lexIdentifier - Match [a-zA-Z_][a-zA-Z_$0-9]* (plus the extended Unicode
/// identifier code points accepted by the advanceIfValid* helpers).
void Lexer::lexIdentifier() {
  const char *TokStart = CurPtr-1;
  // Back up and relex the first code point as UTF-8; the dispatcher only
  // examined its first byte.
  CurPtr = TokStart;
  bool didStart = advanceIfValidStartOfIdentifier(CurPtr, BufferEnd);
  assert(didStart && "Unexpected start");
  (void) didStart;

  // Lex [a-zA-Z_$0-9[[:XID_Continue:]]]*
  while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd));

  // Keywords get their dedicated kinds; everything else is tok::identifier.
  tok Kind = kindOfIdentifier(StringRef(TokStart, CurPtr-TokStart), InSILMode);
  return formToken(Kind, TokStart);
}
| |
/// lexHash - Handle #], #! for shebangs, and the family of #identifiers.
void Lexer::lexHash() {
  const char *TokStart = CurPtr-1;

  // NOTE: legacy punctuator. Remove in the future.
  if (*CurPtr == ']') { // #]
    ++CurPtr;
    return formToken(tok::r_square_lit, TokStart);
  }

  // Allow a hashbang #! line at the beginning of the file.
  if (CurPtr - 1 == ContentStart && *CurPtr == '!') {
    --CurPtr;
    if (BufferID != SourceMgr.getHashbangBufferID())
      diagnose(CurPtr, diag::lex_hashbang_not_allowed);
    // Skip the hashbang line entirely and lex whatever follows it.
    skipHashbang(/*EatNewline=*/true);
    return lexImpl();
  }

  // Scan for [a-zA-Z]+ to see what we match.
  const char *tmpPtr = CurPtr;
  if (clang::isIdentifierHead(*tmpPtr)) {
    do {
      ++tmpPtr;
    } while (clang::isIdentifierBody(*tmpPtr));
  }

  // Map the character sequence onto a pound keyword kind, if any.
  tok Kind = llvm::StringSwitch<tok>(StringRef(CurPtr, tmpPtr-CurPtr))
#define POUND_KEYWORD(id) \
    .Case(#id, tok::pound_##id)
#include "swift/Syntax/TokenKinds.def"
    .Default(tok::pound);

  // If we didn't find a match, then just return tok::pound. This is highly
  // dubious in terms of error recovery, but is useful for code completion and
  // SIL parsing.
  if (Kind == tok::pound)
    return formToken(tok::pound, TokStart);

  // If we found something specific, return it.
  CurPtr = tmpPtr;
  return formToken(Kind, TokStart);
}
| |
| |
/// Is the operator beginning at the given character "left-bound" — i.e.
/// directly adjacent to the token on its left?
static bool isLeftBound(const char *tokBegin, const char *bufferBegin) {
  // The very first character of the buffer has nothing on its left.
  if (tokBegin == bufferBegin)
    return false;

  const char prev = tokBegin[-1];

  // Whitespace, opening delimiters, expression separators, and NUL
  // (whitespace / last char in file) all mean "not bound".
  if (prev == ' ' || prev == '\r' || prev == '\n' || prev == '\t' ||
      prev == '(' || prev == '[' || prev == '{' ||
      prev == ',' || prev == ';' || prev == ':' ||
      prev == '\0')
    return false;

  // The trailing '/' of a just-closed block comment acts like whitespace.
  if (prev == '/' && tokBegin - 1 != bufferBegin && tokBegin[-2] == '*')
    return false;

  return true;
}
| |
/// Is the operator ending at the given character (actually one past the end)
/// "right-bound" — i.e. directly adjacent to the token on its right?
///
/// The code-completion point is considered right-bound.
static bool isRightBound(const char *tokEnd, bool isLeftBound,
                         const char *codeCompletionPtr) {
  const char next = *tokEnd;

  // Whitespace, closing delimiters, and expression separators are not bound.
  if (next == ' ' || next == '\r' || next == '\n' || next == '\t' ||
      next == ')' || next == ']' || next == '}' ||
      next == ',' || next == ';' || next == ':')
    return false;

  // NUL is right-bound only at the code-completion point; otherwise it is
  // whitespace / the last char in the file.
  if (next == '\0')
    return tokEnd == codeCompletionPtr;

  // Prefer the '^' in "x^.y" to be a postfix op, not binary, but the '^' in
  // "^.y" to be a prefix op, not binary.
  if (next == '.')
    return !isLeftBound;

  // A following comment counts as whitespace, so this token is not right
  // bound.
  if (next == '/')
    return tokEnd[1] != '/' && tokEnd[1] != '*';

  return true;
}
| |
/// Return true if [CurPtr, End) contains an editor-placeholder terminator
/// "#>" before any newline.
static bool rangeContainsPlaceholderEnd(const char *CurPtr,
                                        const char *End) {
  const char *p = CurPtr;
  // Stop one byte early so reading p[1] stays in range.
  while (p != End - 1) {
    if (*p == '\n')
      return false;  // placeholders never span lines
    if (*p == '#' && p[1] == '>')
      return true;
    ++p;
  }
  return false;
}
| |
/// lexOperatorIdentifier - Match identifiers formed out of punctuation.
///
/// Chooses among binary/prefix/postfix operator kinds from whether the
/// operator is bound to the characters on its left and right, and
/// special-cases the reserved spellings '=', '&', '.', '?', '->' and '*/'.
void Lexer::lexOperatorIdentifier() {
  const char *TokStart = CurPtr-1;
  // Back up and relex the first code point as UTF-8.
  CurPtr = TokStart;
  bool didStart = advanceIfValidStartOfOperator(CurPtr, BufferEnd);
  assert(didStart && "unexpected operator start");
  (void) didStart;

  do {
    if (CurPtr != BufferEnd && InSILBody &&
        (*CurPtr == '!' || *CurPtr == '?'))
      // When parsing SIL body, '!' and '?' are special token and can't be
      // in the middle of an operator.
      break;

    // '.' cannot appear in the middle of an operator unless the operator
    // started with a '.'.
    if (*CurPtr == '.' && *TokStart != '.')
      break;
    // Don't swallow an editor placeholder ("<#...#>") into the operator.
    if (Identifier::isEditorPlaceholder(StringRef(CurPtr, BufferEnd-CurPtr)) &&
        rangeContainsPlaceholderEnd(CurPtr + 2, BufferEnd)) {
      break;
    }
  } while (advanceIfValidContinuationOfOperator(CurPtr, BufferEnd));

  if (CurPtr-TokStart > 2) {
    // If there is a "//" or "/*" in the middle of an identifier token,
    // it starts a comment.
    for (auto Ptr = TokStart+1; Ptr != CurPtr-1; ++Ptr) {
      if (Ptr[0] == '/' && (Ptr[1] == '/' || Ptr[1] == '*')) {
        CurPtr = Ptr;
        break;
      }
    }
  }

  // Decide between the binary, prefix, and postfix cases.
  // It's binary if either both sides are bound or both sides are not bound.
  // Otherwise, it's postfix if left-bound and prefix if right-bound.
  bool leftBound = isLeftBound(TokStart, ContentStart);
  bool rightBound = isRightBound(CurPtr, leftBound, CodeCompletionPtr);

  // Match various reserved words.
  if (CurPtr-TokStart == 1) {
    switch (TokStart[0]) {
    case '=':
      // A '=' bound on only one side (e.g. "x =- 1") is suspicious; suggest
      // adding whitespace to make it a plain assignment.
      if (leftBound != rightBound) {
        auto d = diagnose(TokStart, diag::lex_unary_equal);
        if (leftBound)
          d.fixItInsert(getSourceLoc(TokStart), " ");
        else
          d.fixItInsert(getSourceLoc(TokStart+1), " ");
      }
      // always emit 'tok::equal' to avoid trickle down parse errors
      return formToken(tok::equal, TokStart);
    case '&':
      if (leftBound == rightBound || leftBound)
        break;
      return formToken(tok::amp_prefix, TokStart);
    case '.': {
      if (leftBound == rightBound)
        return formToken(tok::period, TokStart);
      if (rightBound)
        return formToken(tok::period_prefix, TokStart);

      // If left bound but not right bound, handle some likely situations.

      // If there is just some horizontal whitespace before the next token, its
      // addition is probably incorrect.
      const char *AfterHorzWhitespace = CurPtr;
      while (*AfterHorzWhitespace == ' ' || *AfterHorzWhitespace == '\t')
        ++AfterHorzWhitespace;

      // First, when we are code completing "x. <ESC>", then make sure to return
      // a tok::period, since that is what the user is wanting to know about.
      if (*AfterHorzWhitespace == '\0' &&
          AfterHorzWhitespace == CodeCompletionPtr) {
        diagnose(TokStart, diag::expected_member_name);
        return formToken(tok::period, TokStart);
      }

      if (isRightBound(AfterHorzWhitespace, leftBound, CodeCompletionPtr) &&
          // Don't consider comments to be this. A leading slash is probably
          // either // or /* and most likely occurs just in our testsuite for
          // expected-error lines.
          *AfterHorzWhitespace != '/') {
        diagnose(TokStart, diag::extra_whitespace_period)
          .fixItRemoveChars(getSourceLoc(CurPtr),
                            getSourceLoc(AfterHorzWhitespace));
        return formToken(tok::period, TokStart);
      }

      // Otherwise, it is probably a missing member.
      diagnose(TokStart, diag::expected_member_name);
      return formToken(tok::unknown, TokStart);
    }
    case '?':
      if (leftBound)
        return formToken(tok::question_postfix, TokStart);
      return formToken(tok::question_infix, TokStart);
    }
  } else if (CurPtr-TokStart == 2) {
    switch ((TokStart[0] << 8) | TokStart[1]) {
    case ('-' << 8) | '>': // ->
      return formToken(tok::arrow, TokStart);
    case ('*' << 8) | '/': // */
      diagnose(TokStart, diag::lex_unexpected_block_comment_end);
      return formToken(tok::unknown, TokStart);
    }
  } else {
    // Verify there is no "*/" in the middle of the identifier token, we reject
    // it as potentially ending a block comment.
    auto Pos = StringRef(TokStart, CurPtr-TokStart).find("*/");
    if (Pos != StringRef::npos) {
      diagnose(TokStart+Pos, diag::lex_unexpected_block_comment_end);
      return formToken(tok::unknown, TokStart);
    }
  }

  if (leftBound == rightBound)
    return formToken(leftBound ? tok::oper_binary_unspaced :
                                 tok::oper_binary_spaced, TokStart);

  return formToken(leftBound ? tok::oper_postfix : tok::oper_prefix, TokStart);
}
| |
/// lexDollarIdent - Match $[0-9a-zA-Z_$]+
void Lexer::lexDollarIdent() {
  const char *tokStart = CurPtr-1;
  assert(*tokStart == '$');

  // In a SIL function body, '$' is a token by itself.
  if (InSILBody)
    return formToken(tok::sil_dollar, tokStart);

  // Consume the identifier characters after '$', tracking whether anything
  // other than a digit appeared.
  bool isAllDigits = true;
  for (;; ++CurPtr) {
    if (isDigit(*CurPtr)) {
      // continue
    } else if (clang::isIdentifierHead(*CurPtr, /*dollar*/true)) {
      isAllDigits = false;
      // continue
    } else {
      break;
    }
  }

  // A lone '$' with no following characters.
  if (CurPtr == tokStart + 1) {
    // It is always an error to see a standalone '$' when not in Swift 3
    // compatibility mode.
    if (!LangOpts.isSwiftVersion3()) {
      // Offer to replace '$' with '`$`'.
      diagnose(tokStart, diag::standalone_dollar_identifier)
        .fixItReplaceChars(getSourceLoc(tokStart), getSourceLoc(CurPtr), "`$`");
    }
    return formToken(tok::identifier, tokStart);
  }

  // We reserve $nonNumeric for persistent bindings in the debugger.
  if (!isAllDigits) {
    if (!LangOpts.EnableDollarIdentifiers)
      diagnose(tokStart, diag::expected_dollar_numeric);

    // Even if we diagnose, we go ahead and form an identifier token,
    // in part to ensure that the basic behavior of the lexer is
    // independent of language mode.
    return formToken(tok::identifier, tokStart);
  } else {
    return formToken(tok::dollarident, tokStart);
  }
}
| |
/// The kind of digit expected in an integer literal; passed as an argument
/// to the invalid-digit diagnostics so they can name the expected radix.
enum class ExpectedDigitKind : unsigned { Binary, Octal, Decimal, Hex };
| |
| void Lexer::lexHexNumber() { |
| // We assume we're starting from the 'x' in a '0x...' floating-point literal. |
| assert(*CurPtr == 'x' && "not a hex literal"); |
| const char *TokStart = CurPtr-1; |
| assert(*TokStart == '0' && "not a hex literal"); |
| |
| auto expected_digit = [&]() { |
| while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd)); |
| return formToken(tok::unknown, TokStart); |
| }; |
| |
| auto expected_hex_digit = [&](const char *loc) { |
| diagnose(loc, diag::lex_invalid_digit_in_int_literal, StringRef(loc, 1), |
| (unsigned)ExpectedDigitKind::Hex); |
| return expected_digit(); |
| }; |
| |
| // 0x[0-9a-fA-F][0-9a-fA-F_]* |
| ++CurPtr; |
| if (!isHexDigit(*CurPtr)) |
| return expected_hex_digit(CurPtr); |
| |
| while (isHexDigit(*CurPtr) || *CurPtr == '_') |
| ++CurPtr; |
| |
| if (*CurPtr != '.' && *CurPtr != 'p' && *CurPtr != 'P') { |
| auto tmp = CurPtr; |
| if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd)) |
| return expected_hex_digit(tmp); |
| else |
| return formToken(tok::integer_literal, TokStart); |
| } |
| |
| const char *PtrOnDot = nullptr; |
| |
| // (\.[0-9A-Fa-f][0-9A-Fa-f_]*)? |
| if (*CurPtr == '.') { |
| PtrOnDot = CurPtr; |
| ++CurPtr; |
| |
| // If the character after the '.' is not a digit, assume we have an int |
| // literal followed by a dot expression. |
| if (!isHexDigit(*CurPtr)) { |
| --CurPtr; |
| return formToken(tok::integer_literal, TokStart); |
| } |
| |
| while (isHexDigit(*CurPtr) || *CurPtr == '_') |
| ++CurPtr; |
| |
| if (*CurPtr != 'p' && *CurPtr != 'P') { |
| if (!isDigit(PtrOnDot[1])) { |
| // e.g: 0xff.description |
| CurPtr = PtrOnDot; |
| return formToken(tok::integer_literal, TokStart); |
| } |
| diagnose(CurPtr, diag::lex_expected_binary_exponent_in_hex_float_literal); |
| return formToken(tok::unknown, TokStart); |
| } |
| } |
| |
| // [pP][+-]?[0-9][0-9_]* |
| assert(*CurPtr == 'p' || *CurPtr == 'P' && "not at a hex float exponent?!"); |
| ++CurPtr; |
| |
| bool signedExponent = false; |
| if (*CurPtr == '+' || *CurPtr == '-') { |
| ++CurPtr; // Eat the sign. |
| signedExponent = true; |
| } |
| |
| if (!isDigit(*CurPtr)) { |
| if (PtrOnDot && !isDigit(PtrOnDot[1]) && !signedExponent) { |
| // e.g: 0xff.fpValue, 0xff.fp |
| CurPtr = PtrOnDot; |
| return formToken(tok::integer_literal, TokStart); |
| } |
| // Note: 0xff.fp+otherExpr can be valid expression. But we don't accept it. |
| |
| // There are 3 cases to diagnose if the exponent starts with a non-digit: |
| // identifier (invalid character), underscore (invalid first character), |
| // non-identifier (empty exponent) |
| auto tmp = CurPtr; |
| if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd)) |
| diagnose(tmp, diag::lex_invalid_digit_in_fp_exponent, StringRef(tmp, 1), |
| *tmp == '_'); |
| else |
| diagnose(CurPtr, diag::lex_expected_digit_in_fp_exponent); |
| |
| return expected_digit(); |
| } |
| |
| while (isDigit(*CurPtr) || *CurPtr == '_') |
| ++CurPtr; |
| |
| auto tmp = CurPtr; |
| if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd)) { |
| diagnose(tmp, diag::lex_invalid_digit_in_fp_exponent, StringRef(tmp, 1), |
| false); |
| return expected_digit(); |
| } |
| |
| return formToken(tok::floating_literal, TokStart); |
| } |
| |
/// lexNumber:
///   integer_literal  ::= [0-9][0-9_]*
///   integer_literal  ::= 0x[0-9a-fA-F][0-9a-fA-F_]*
///   integer_literal  ::= 0o[0-7][0-7_]*
///   integer_literal  ::= 0b[01][01_]*
///   floating_literal ::= [0-9][0-9_]*\.[0-9][0-9_]*
///   floating_literal ::= [0-9][0-9_]*\.[0-9][0-9_]*[eE][+-]?[0-9][0-9_]*
///   floating_literal ::= [0-9][0-9_]*[eE][+-]?[0-9][0-9_]*
///   floating_literal ::= 0x[0-9A-Fa-f][0-9A-Fa-f_]*
///                        (\.[0-9A-Fa-f][0-9A-Fa-f_]*)?[pP][+-]?[0-9][0-9_]*
///
/// Assumes CurPtr points just past the first character of the literal.
/// On malformed input, diagnoses and forms a tok::unknown covering the whole
/// identifier-like run.
void Lexer::lexNumber() {
  const char *TokStart = CurPtr-1;
  assert((isDigit(*TokStart) || *TokStart == '.') && "Unexpected start");

  // Error recovery: swallow the rest of an identifier-like token and emit a
  // single 'unknown' token (a diagnostic has already been emitted).
  auto expected_digit = [&]() {
    while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd));
    return formToken(tok::unknown, TokStart);
  };

  // Diagnose an invalid digit for the given radix at 'loc', then recover.
  auto expected_int_digit = [&](const char *loc, ExpectedDigitKind kind) {
    diagnose(loc, diag::lex_invalid_digit_in_int_literal, StringRef(loc, 1),
             (unsigned)kind);
    return expected_digit();
  };

  // Hex literals have their own fraction/exponent grammar; hand them off.
  if (*TokStart == '0' && *CurPtr == 'x')
    return lexHexNumber();

  if (*TokStart == '0' && *CurPtr == 'o') {
    // 0o[0-7][0-7_]*
    ++CurPtr;
    if (*CurPtr < '0' || *CurPtr > '7')
      return expected_int_digit(CurPtr, ExpectedDigitKind::Octal);

    while ((*CurPtr >= '0' && *CurPtr <= '7') || *CurPtr == '_')
      ++CurPtr;

    // Reject trailing identifier characters, e.g. '0o17abc'.
    auto tmp = CurPtr;
    if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd))
      return expected_int_digit(tmp, ExpectedDigitKind::Octal);

    return formToken(tok::integer_literal, TokStart);
  }

  if (*TokStart == '0' && *CurPtr == 'b') {
    // 0b[01][01_]*
    ++CurPtr;
    if (*CurPtr != '0' && *CurPtr != '1')
      return expected_int_digit(CurPtr, ExpectedDigitKind::Binary);

    while (*CurPtr == '0' || *CurPtr == '1' || *CurPtr == '_')
      ++CurPtr;

    // Reject trailing identifier characters, e.g. '0b01xyz'.
    auto tmp = CurPtr;
    if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd))
      return expected_int_digit(tmp, ExpectedDigitKind::Binary);

    return formToken(tok::integer_literal, TokStart);
  }

  // Handle a leading [0-9]+, lexing an integer or falling through if we have a
  // floating point value.
  while (isDigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  // Lex things like 4.x as '4' followed by a tok::period.
  if (*CurPtr == '.') {
    // NextToken is the soon to be previous token
    // Therefore: x.0.1 is sub-tuple access, not x.float_literal
    if (!isDigit(CurPtr[1]) || NextToken.is(tok::period))
      return formToken(tok::integer_literal, TokStart);
  } else {
    // Floating literals must have '.', 'e', or 'E' after digits.  If it is
    // something else, then this is the end of the token.
    if (*CurPtr != 'e' && *CurPtr != 'E') {
      // Reject trailing identifier characters, e.g. '12abc'.
      auto tmp = CurPtr;
      if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd))
        return expected_int_digit(tmp, ExpectedDigitKind::Decimal);

      return formToken(tok::integer_literal, TokStart);
    }
  }

  // Lex decimal point.
  if (*CurPtr == '.') {
    ++CurPtr;

    // Lex any digits after the decimal point.
    while (isDigit(*CurPtr) || *CurPtr == '_')
      ++CurPtr;
  }

  // Lex exponent.
  if (*CurPtr == 'e' || *CurPtr == 'E') {
    ++CurPtr;  // Eat the 'e'
    if (*CurPtr == '+' || *CurPtr == '-')
      ++CurPtr;  // Eat the sign.

    if (!isDigit(*CurPtr)) {
      // There are 3 cases to diagnose if the exponent starts with a non-digit:
      // identifier (invalid character), underscore (invalid first character),
      // non-identifier (empty exponent)
      auto tmp = CurPtr;
      if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd))
        diagnose(tmp, diag::lex_invalid_digit_in_fp_exponent, StringRef(tmp, 1),
                 *tmp == '_');
      else
        diagnose(CurPtr, diag::lex_expected_digit_in_fp_exponent);

      return expected_digit();
    }

    while (isDigit(*CurPtr) || *CurPtr == '_')
      ++CurPtr;

    // Reject trailing identifier characters, e.g. '1e2f'.
    auto tmp = CurPtr;
    if (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd)) {
      diagnose(tmp, diag::lex_invalid_digit_in_fp_exponent, StringRef(tmp, 1),
               false);
      return expected_digit();
    }
  }

  return formToken(tok::floating_literal, TokStart);
}
| |
| /// unicode_character_escape ::= [\]u{hex+} |
| /// hex ::= [0-9a-fA-F] |
| unsigned Lexer::lexUnicodeEscape(const char *&CurPtr, Lexer *Diags) { |
| assert(CurPtr[0] == '{' && "Invalid unicode escape"); |
| ++CurPtr; |
| |
| const char *DigitStart = CurPtr; |
| |
| unsigned NumDigits = 0; |
| for (; isHexDigit(CurPtr[0]); ++NumDigits) |
| ++CurPtr; |
| |
| if (CurPtr[0] != '}') { |
| if (Diags) |
| Diags->diagnose(CurPtr, diag::lex_invalid_u_escape_rbrace); |
| return ~1U; |
| } |
| ++CurPtr; |
| |
| if (NumDigits < 1 || NumDigits > 8) { |
| if (Diags) |
| Diags->diagnose(CurPtr, diag::lex_invalid_u_escape); |
| return ~1U; |
| } |
| |
| unsigned CharValue = 0; |
| StringRef(DigitStart, NumDigits).getAsInteger(16, CharValue); |
| return CharValue; |
| } |
| |
/// maybeConsumeNewlineEscape - Check for a valid elided-newline escape:
/// starting at CurPtr + Offset, only horizontal whitespace may appear before
/// the line break. On success, advances CurPtr to the character after the end
/// of the line (treating "\r\n" as one line ending) and returns true;
/// otherwise CurPtr is left untouched and false is returned.
static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
  const char *Scan = CurPtr + Offset;
  for (;;) {
    char C = *Scan++;
    // Horizontal whitespace between the backslash and the newline is fine.
    if (C == ' ' || C == '\t')
      continue;
    // CR, optionally followed by LF, ends the line.
    if (C == '\r') {
      if (*Scan == '\n')
        ++Scan;
      CurPtr = Scan;
      return true;
    }
    // A bare LF ends the line too.
    if (C == '\n') {
      CurPtr = Scan;
      return true;
    }
    // Anything else (including a NUL) means this was not a newline escape.
    return false;
  }
}
| |
/// lexCharacter - Read a character and return its UTF32 code.  If this is the
/// end of enclosing string/character sequence (i.e. the character is equal to
/// 'StopQuote'), this returns ~0U and leaves 'CurPtr' pointing to the terminal
/// quote.  If this is a malformed character sequence, it emits a diagnostic
/// (when EmitDiagnostics is true) and returns ~1U.
///
///   character_escape  ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' | [\]0
///   character_escape  ::= unicode_character_escape
unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
                             bool EmitDiagnostics, bool MultilineString) {
  const char *CharStart = CurPtr;

  switch (*CurPtr++) {
  default: {// Normal characters are part of the string.
    // If this is a "high" UTF-8 character, validate it.
    if ((signed char)(CurPtr[-1]) >= 0) {
      // Plain ASCII: reject unprintable characters, except that a tab is
      // tolerated inside multiline literals.
      if (isPrintable(CurPtr[-1]) == 0)
        if (!(MultilineString && (CurPtr[-1] == '\t')))
          if (EmitDiagnostics)
            diagnose(CharStart, diag::lex_unprintable_ascii_character);
      return CurPtr[-1];
    }
    // High bit set: back up and decode/validate a multi-byte UTF-8 sequence.
    --CurPtr;
    unsigned CharValue = validateUTF8CharacterAndAdvance(CurPtr, BufferEnd);
    if (CharValue != ~0U) return CharValue;
    if (EmitDiagnostics)
      diagnose(CharStart, diag::lex_invalid_utf8);
    return ~1U;
  }
  case '"':
  case '\'':
    // If we found a closing quote character, we're done.
    if (CurPtr[-1] == StopQuote) {
      --CurPtr;
      return ~0U;
    }
    // Otherwise, this is just a character.
    return CurPtr[-1];

  case 0:
    // An embedded NUL before the end of buffer is diagnosed but kept.
    if (CurPtr-1 != BufferEnd) {
      if (EmitDiagnostics)
        diagnose(CurPtr-1, diag::lex_nul_character);
      return CurPtr[-1];
    }
    // Move the pointer back to EOF.
    --CurPtr;
    if (EmitDiagnostics)
      diagnose(CurPtr-1, diag::lex_unterminated_string);
    return ~1U;
  case '\n':  // String literals cannot have \n or \r in them.
  case '\r':
    if (MultilineString) // ... unless they are multiline
      return CurPtr[-1];
    if (EmitDiagnostics)
      diagnose(CurPtr-1, diag::lex_unterminated_string);
    return ~1U;
  case '\\':  // Escapes.
    break;
  }

  unsigned CharValue = 0;
  // Escape processing.  We already ate the "\".
  switch (*CurPtr) {
  case ' ': case '\t': case '\n': case '\r':
    // Inside a multiline literal, a backslash before a line break (with only
    // horizontal whitespace in between) elides the newline.
    if (MultilineString && maybeConsumeNewlineEscape(CurPtr, 0))
      return '\n';
    LLVM_FALLTHROUGH;
  default:  // Invalid escape.
    if (EmitDiagnostics)
      diagnose(CurPtr, diag::lex_invalid_escape);
    // If this looks like a plausible escape character, recover as though this
    // is an invalid escape.
    if (isAlphanumeric(*CurPtr)) ++CurPtr;
    return ~1U;

  // Simple single-character escapes.
  case '0': ++CurPtr; return '\0';
  case 'n': ++CurPtr; return '\n';
  case 'r': ++CurPtr; return '\r';
  case 't': ++CurPtr; return '\t';
  case '"': ++CurPtr; return '"';
  case '\'': ++CurPtr; return '\'';
  case '\\': ++CurPtr; return '\\';

  case 'u': {  //  \u HEX HEX HEX HEX
    ++CurPtr;
    if (*CurPtr != '{') {
      if (EmitDiagnostics)
        diagnose(CurPtr-1, diag::lex_unicode_escape_braces);
      return ~1U;
    }

    CharValue = lexUnicodeEscape(CurPtr, EmitDiagnostics ? this : nullptr);
    if (CharValue == ~1U) return ~1U;
    break;
  }
  }

  // Check to see if the encoding is valid.
  llvm::SmallString<64> TempString;
  if (CharValue >= 0x80 && EncodeToUTF8(CharValue, TempString)) {
    if (EmitDiagnostics)
      diagnose(CharStart, diag::lex_invalid_unicode_scalar);
    return ~1U;
  }

  return CharValue;
}
| |
/// skipToEndOfInterpolatedExpression - Given the first character after a \(
/// sequence in a string literal (the start of an interpolated expression),
/// scan forward to the end of the interpolated expression and return the end.
/// On success, the returned pointer will point to the ')' at the end of the
/// interpolated expression.  On failure, it will point to the first character
/// that cannot be lexed as part of the interpolated expression; this character
/// will never be ')'.
///
/// This function performs brace and quote matching, keeping a stack of
/// outstanding delimiters as it scans the string.
static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
                                                     const char *EndPtr,
                                                     DiagnosticEngine *Diags,
                                                     bool MultilineString) {
  // Stack of open '(' / '"' / '\'' delimiters.  AllowNewline parallels it
  // (plus one base entry for the enclosing literal itself) and records
  // whether a newline may appear at that nesting level.
  llvm::SmallVector<char, 4> OpenDelimiters;
  llvm::SmallVector<bool, 4> AllowNewline;
  AllowNewline.push_back(MultilineString);

  // True when the innermost open delimiter is a quote, i.e. we are scanning
  // inside a nested string literal rather than expression tokens.
  auto inStringLiteral = [&]() {
    return !OpenDelimiters.empty() &&
           (OpenDelimiters.back() == '"' || OpenDelimiters.back() == '\'');
  };
  while (true) {
    // This is a simple scanner, capable of recognizing nested parentheses and
    // string literals but not much else.  The implications of this include not
    // being able to break an expression over multiple lines in an interpolated
    // string.  This limitation allows us to recover from common errors though.
    //
    // On success scanning the expression body, the real lexer will be used to
    // relex the body when parsing the expressions.  We let it diagnose any
    // issues with malformed tokens or other problems.
    switch (*CurPtr++) {
    // String literals in general cannot be split across multiple lines;
    // interpolated ones are no exception - unless multiline literals.
    case '\n':
    case '\r':
      if (AllowNewline.back())
        continue;
      // Will be diagnosed as an unterminated string literal.
      return CurPtr-1;

    case '"':
    case '\'': {
      if (!AllowNewline.back() && inStringLiteral()) {
        if (OpenDelimiters.back() == CurPtr[-1]) {
          // Closing single line string literal.
          OpenDelimiters.pop_back();
          AllowNewline.pop_back();
        }
        // Otherwise, it's just a quote in string literal. e.g. "foo's".
        continue;
      }

      // Three consecutive double quotes open/close a multiline literal
      // (CurPtr[-1] is the quote just consumed).
      bool isMultilineQuote = (
        *CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr - 1) == '"');
      if (isMultilineQuote)
        CurPtr += 2;

      if (!inStringLiteral()) {
        // Open string literal
        OpenDelimiters.push_back(CurPtr[-1]);
        AllowNewline.push_back(isMultilineQuote);
        continue;
      }

      // We are in multiline string literal.
      assert(AllowNewline.back() && "other cases must be handled above");
      if (isMultilineQuote) {
        // Close multiline string literal.
        OpenDelimiters.pop_back();
        AllowNewline.pop_back();
      }

      // Otherwise, it's just a normal character in multiline string.
      continue;
    }
    case '\\':
      // Escapes only matter inside a nested string literal.
      if (inStringLiteral()) {
        char escapedChar = *CurPtr++;
        switch (escapedChar) {
        case '(':
          // Entering a recursive interpolated expression
          OpenDelimiters.push_back('(');
          continue;
        case '\n': case '\r':
          if (AllowNewline.back())
            continue;
          LLVM_FALLTHROUGH;
        case 0:
          // Don't jump over newline/EOF due to preceding backslash!
          return CurPtr-1;
        default:
          continue;
        }
      }
      continue;
    case 0:
      // If we hit EOF, we fail.
      if (CurPtr-1 == EndPtr) {
        if (Diags)
          Diags->diagnose(Lexer::getSourceLoc(CurPtr-1),
                          diag::lex_unterminated_string);
        return CurPtr-1;
      }
      // An embedded NUL that isn't EOF is treated as an ordinary character.
      continue;

    // Paren nesting deeper to support "foo = \((a+b)-(c*d)) bar".
    case '(':
      if (!inStringLiteral()) {
        OpenDelimiters.push_back('(');
      }
      continue;
    case ')':
      if (OpenDelimiters.empty()) {
        // No outstanding open delimiters; we're done.
        return CurPtr-1;
      } else if (OpenDelimiters.back() == '(') {
        // Pop the matching bracket and keep going.
        OpenDelimiters.pop_back();
        continue;
      } else {
        // It's a right parenthesis in a string literal.
        assert(inStringLiteral());
        continue;
      }
    default:
      // Normal token character.
      continue;
    }
  }
}
| |
| /// getStringLiteralContent: |
| /// Extract content of string literal from inside quotes. |
| static StringRef getStringLiteralContent(const Token &Str) { |
| StringRef Bytes = Str.getText(); |
| |
| if (Str.IsMultilineString()) |
| Bytes = Bytes.drop_front(3).drop_back(3); |
| else |
| Bytes = Bytes.drop_front().drop_back(); |
| |
| return Bytes; |
| } |
| |
| static size_t commonPrefixLength(StringRef shorter, StringRef longer) { |
| size_t offset = 0; |
| while (offset < shorter.size() && offset < longer.size() && shorter[offset] == longer[offset]) { |
| ++offset; |
| } |
| |
| return offset; |
| } |
| |
/// getMultilineTrailingIndent:
/// Determine trailing indent to be used for multiline literal indent stripping.
///
/// Returns the run of spaces/tabs between the literal's final newline and the
/// closing delimiter, with its source location.  If the closing delimiter is
/// not on its own line, returns an empty indent (diagnosing via \p Diags when
/// non-null).  Also diagnoses a backslash-escaped newline on the last line.
static std::tuple<StringRef, SourceLoc>
getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) {
  StringRef Bytes = getStringLiteralContent(Str);
  const char *begin = Bytes.begin(), *end = Bytes.end(), *start = end;
  bool sawNonWhitespace = false;

  // Work back from the end to find whitespace to strip.
  while (!sawNonWhitespace && start > begin) {
    switch (*--start) {
    case ' ':
    case '\t':
      continue;
    case '\n':
    case '\r': {
      // 'start' now points at the first character of the trailing indent.
      ++start;
      auto startLoc = Lexer::getSourceLoc(start);
      auto string = StringRef(start, end - start);

      // Disallow escaped newline in the last line.
      if (Diags) {
        // Step back over the line break ('\n', or '\r\n').
        auto *Ptr = start - 1;
        if (*Ptr == '\n') --Ptr;
        if (*Ptr == '\r') --Ptr;
        auto *LineEnd = Ptr + 1;
        // Skip horizontal whitespace preceding the line break.
        while (Ptr > begin && (*Ptr == ' ' || *Ptr == '\t')) --Ptr;
        if (*Ptr == '\\') {
          auto escapeLoc = Lexer::getSourceLoc(Ptr);
          bool invalid = true;
          // An even number of backslashes means the last one is itself
          // escaped, so the newline is not actually elided.
          while (*--Ptr == '\\') invalid = !invalid;
          if (invalid)
            Diags->diagnose(escapeLoc, diag::lex_escaped_newline_at_lastline)
              .fixItRemoveChars(escapeLoc, Lexer::getSourceLoc(LineEnd));
        }
      }

      return std::make_tuple(string, startLoc);
    }
    default:
      // Non-whitespace before any newline: the closing delimiter shares a
      // line with content.
      sawNonWhitespace = true;
    }
  }

  if (sawNonWhitespace && Diags) {
    auto loc = Lexer::getSourceLoc(start + 1);
    Diags->diagnose(loc, diag::lex_illegal_multiline_string_end)
      // FIXME: Should try to suggest indentation.
      .fixItInsert(loc, "\n");
  }

  return std::make_tuple("", Lexer::getSourceLoc(end - 1));
}
| |
| /// diagnoseInvalidMultilineIndents: |
| /// Emit errors for a group of multiline indents with the same MistakeOffset. |
| /// Note: Does not emit an error if MistakeOffset does not lie within |
| /// ExpectedIndent. |
| static void diagnoseInvalidMultilineIndents( |
| DiagnosticEngine *Diags, |
| StringRef ExpectedIndent, |
| SourceLoc IndentLoc, |
| StringRef Bytes, |
| SmallVector<size_t, 4> LineStarts, |
| size_t MistakeOffset, |
| StringRef ActualIndent) { |
| if (MistakeOffset >= ExpectedIndent.size()) { |
| // These lines were valid; there's nothing to correct. |
| return; |
| } |
| |
| assert(LineStarts.size() > 0); |
| |
| auto getLoc = [&](size_t offset) -> SourceLoc { |
| return Lexer::getSourceLoc((const char *)Bytes.bytes_begin() + offset); |
| }; |
| auto classify = [&](unsigned char ch) -> unsigned { |
| switch (ch) { |
| case ' ': |
| return 0; |
| case '\t': |
| return 1; |
| default: |
| return 2; |
| } |
| }; |
| |
| Diags->diagnose(getLoc(LineStarts[0] + MistakeOffset), |
| diag::lex_multiline_string_indent_inconsistent, |
| LineStarts.size() != 1, LineStarts.size(), |
| classify(Bytes[LineStarts[0] + MistakeOffset])); |
| |
| Diags->diagnose(IndentLoc.getAdvancedLoc(MistakeOffset), |
| diag::lex_multiline_string_indent_should_match_here, |
| classify(ExpectedIndent[MistakeOffset])); |
| |
| auto fix = Diags->diagnose(getLoc(LineStarts[0] + MistakeOffset), |
| diag::lex_multiline_string_indent_change_line, |
| LineStarts.size() != 1); |
| |
| assert(MistakeOffset <= ActualIndent.size()); |
| assert(ExpectedIndent.substr(0, MistakeOffset) == |
| ActualIndent.substr(0, MistakeOffset)); |
| |
| for (auto line : LineStarts) { |
| fix.fixItReplaceChars(getLoc(line + MistakeOffset), |
| getLoc(line + ActualIndent.size()), |
| ExpectedIndent.substr(MistakeOffset)); |
| } |
| } |
| |
/// validateMultilineIndents:
/// Diagnose contents of string literal that have inconsistent indentation.
///
/// Every line of a multiline literal must begin with the indentation of the
/// closing delimiter.  Offending lines are grouped into runs that first
/// diverge at the same offset so each run gets a single grouped diagnostic.
static void validateMultilineIndents(const Token &Str,
                                     DiagnosticEngine *Diags) {
  StringRef Indent;
  SourceLoc IndentStartLoc;
  std::tie(Indent, IndentStartLoc) = getMultilineTrailingIndent(Str, Diags);
  // A zero-length closing indent can never mismatch; nothing to validate.
  if (Indent.empty())
    return;
  
  // The offset into the previous line where it experienced its first indentation
  // error, or Indent.size() if every character matched.
  size_t lastMistakeOffset = std::numeric_limits<size_t>::max();
  // Offsets for each consecutive previous line with its first error at
  // lastMatchLength.
  SmallVector<size_t, 4> linesWithLastMistakeOffset = {};
  // Prefix of indentation that's present on all lines in linesWithLastMatchLength.
  StringRef commonIndentation = "";
  
  StringRef Bytes = getStringLiteralContent(Str);
  // Visit the start of each line (the character after every '\n').
  for (size_t pos = Bytes.find('\n'); pos != StringRef::npos; pos = Bytes.find('\n', pos + 1)) {
    size_t nextpos = pos + 1;
    auto restOfBytes = Bytes.substr(nextpos);
    
    // Ignore blank lines.
    if (restOfBytes[0] == '\n' || restOfBytes[0] == '\r') {
      continue;
    }
    
    // Where is the first difference?
    auto errorOffset = commonPrefixLength(Indent, restOfBytes);
    
    // Are we starting a new run?
    if (errorOffset != lastMistakeOffset) {
      // Diagnose problems in the just-finished run of lines.
      diagnoseInvalidMultilineIndents(Diags, Indent, IndentStartLoc, Bytes, 
                                      linesWithLastMistakeOffset, lastMistakeOffset,
                                      commonIndentation);
      
      // Set up for a new run.
      lastMistakeOffset = errorOffset;
      linesWithLastMistakeOffset = {};
      
      // To begin with, all whitespace is part of the common indentation.
      auto prefixLength = restOfBytes.find_first_not_of(" \t");
      commonIndentation = restOfBytes.substr(0, prefixLength);
    }
    else {
      // We're continuing the run, so include this line in the common prefix.
      auto prefixLength = commonPrefixLength(commonIndentation, restOfBytes);
      commonIndentation = commonIndentation.substr(0, prefixLength);
    }
    
    // Either way, add this line to the run.
    linesWithLastMistakeOffset.push_back(nextpos);
  }
  
  // Handle the last run.
  diagnoseInvalidMultilineIndents(Diags, Indent, IndentStartLoc, Bytes, 
                                  linesWithLastMistakeOffset, lastMistakeOffset,
                                  commonIndentation);
}
| |
| /// lexStringLiteral: |
| /// string_literal ::= ["]([^"\\\n\r]|character_escape)*["] |
| /// string_literal ::= ["]["]["].*["]["]["] - approximately |
| void Lexer::lexStringLiteral() { |
| const char *TokStart = CurPtr-1; |
| assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start"); |
| // NOTE: We only allow single-quote string literals so we can emit useful |
| // diagnostics about changing them to double quotes. |
| |
| bool wasErroneous = false, MultilineString = false; |
| |
| // Is this the start of a multiline string literal? |
| if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') { |
| MultilineString = true; |
| CurPtr += 2; |
| if (*CurPtr != '\n' && *CurPtr != '\r') |
| diagnose(CurPtr, diag::lex_illegal_multiline_string_start) |
| .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n"); |
| } |
| |
| while (true) { |
| if (*CurPtr == '\\' && *(CurPtr + 1) == '(') { |
| // Consume tokens until we hit the corresponding ')'. |
| CurPtr += 2; |
| const char *EndPtr = |
| skipToEndOfInterpolatedExpression(CurPtr, BufferEnd, |
| Diags, MultilineString); |
| |
| if (*EndPtr == ')') { |
| // Successfully scanned the body of the expression literal. |
| CurPtr = EndPtr+1; |
| } else { |
| CurPtr = EndPtr; |
| wasErroneous = true; |
| } |
| continue; |
| } |
| |
| // String literals cannot have \n or \r in them (unless multiline). |
| if (((*CurPtr == '\r' || *CurPtr == '\n') && !MultilineString) |
| || CurPtr == BufferEnd) { |
| diagnose(TokStart, diag::lex_unterminated_string); |
| return formToken(tok::unknown, TokStart); |
| } |
| |
| unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, MultilineString); |
| wasErroneous |= CharValue == ~1U; |
| |
| // If this is the end of string, we are done. If it is a normal character |
| // or an already-diagnosed error, just munch it. |
| if (CharValue == ~0U) { |
| ++CurPtr; |
| if (wasErroneous) |
| return formToken(tok::unknown, TokStart); |
| |
| if (*TokStart == '\'') { |
| // Complain about single-quote string and suggest replacement with |
| // double-quoted equivalent. |
| StringRef orig(TokStart, CurPtr - TokStart); |
| llvm::SmallString<32> replacement; |
| replacement += '"'; |
| std::string str = orig.slice(1, orig.size() - 1).str(); |
| std::string quot = "\""; |
| size_t pos = 0; |
| while (pos != str.length()) { |
| if (str.at(pos) == '\\') { |
| if (str.at(pos + 1) == '\'') { |
| // Un-escape escaped single quotes. |
| str.replace(pos, 2, "'"); |
| ++pos; |
| } else { |
| // Skip over escaped characters. |
| pos += 2; |
| } |
| } else if (str.at(pos) == '"') { |
| str.replace(pos, 1, "\\\""); |
| // Advance past the newly added ["\""]. |
| pos += 2; |
| } else { |
| ++pos; |
| } |
| } |
| replacement += StringRef(str); |
| replacement += '"'; |
| diagnose(TokStart, diag::lex_single_quote_string) |
| .fixItReplaceChars(getSourceLoc(TokStart), getSourceLoc(CurPtr), |
| replacement); |
| } |
| |
| // Is this the end of a multiline string literal? |
| if (MultilineString) { |
| if (*CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr + 2) != '"') { |
| CurPtr += 2; |
| formToken(tok::string_literal, TokStart, MultilineString); |
| if (Diags) |
| validateMultilineIndents(NextToken, Diags); |
| return; |
| } |
| else |
| continue; |
| } |
| |
| return formToken(tok::string_literal, TokStart, MultilineString); |
| } |
| } |
| } |
| |
| |
| /// We found an opening curly quote in the source file. Scan ahead until we |
| /// find and end-curly-quote (or straight one). If we find what looks to be a |
| /// string literal, diagnose the problem and return a pointer to the end of the |
| /// entire string literal. This helps us avoid parsing the body of the string |
| /// as program tokens, which will only lead to massive confusion. |
| const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body) { |
| |
| while (true) { |
| // Don't bother with string interpolations. |
| if (*Body == '\\' && *(Body + 1) == '(') |
| return nullptr; |
| |
| // We didn't find the end of the string literal if we ran to end of line. |
| if (*Body == '\r' || *Body == '\n' || Body == BufferEnd) |
| return nullptr; |
| |
| // Get the next character. |
| const char *CharStart = Body; |
| unsigned CharValue = lexCharacter(Body, '\0', false); |
| // If the character was incorrectly encoded, give up. |
| if (CharValue == ~1U) return nullptr; |
| |
| // If we found a straight-quote, then we're done. Just return the spot |
| // to continue. |
| if (CharValue == '"') |
| return Body; |
| |
| // If we found an ending curly quote (common since this thing started with |
| // an opening curly quote) diagnose it with a fixit and then return. |
| if (CharValue == 0x0000201D) { |
| diagnose(CharStart, diag::lex_invalid_curly_quote) |
| .fixItReplaceChars(getSourceLoc(CharStart), getSourceLoc(Body), "\""); |
| return Body; |
| } |
| |
| // Otherwise, keep scanning. |
| } |
| } |
| |
| |
| /// lexEscapedIdentifier: |
| /// identifier ::= '`' identifier '`' |
| /// |
| /// If it doesn't match this production, the leading ` is a punctuator. |
| void Lexer::lexEscapedIdentifier() { |
| assert(CurPtr[-1] == '`' && "Unexpected start of escaped identifier"); |
| |
| const char *Quote = CurPtr-1; |
| |
| // Check whether we have an identifier followed by another backtick, in which |
| // case this is an escaped identifier. |
| const char *IdentifierStart = CurPtr; |
| if (advanceIfValidStartOfIdentifier(CurPtr, BufferEnd)) { |
| // Keep continuing the identifier. |
| while (advanceIfValidContinuationOfIdentifier(CurPtr, BufferEnd)); |
| |
| // If we have the terminating "`", it's an escaped identifier. |
| if (*CurPtr == '`') { |
| ++CurPtr; |
| formToken(tok::identifier, Quote); |
| NextToken.setEscapedIdentifier(true); |
| return; |
| } |
| } |
| |
| // Special case; allow '`$`'. |
| if (Quote[1] == '$' && Quote[2] == '`') { |
| CurPtr = Quote + 3; |
| formToken(tok::identifier, Quote); |
| NextToken.setEscapedIdentifier(true); |
| return; |
| } |
| |
| // The backtick is punctuation. |
| CurPtr = IdentifierStart; |
| formToken(tok::backtick, Quote); |
| } |
| |
| /// Find the end of a version control conflict marker. |
| static const char *findConflictEnd(const char *CurPtr, const char *BufferEnd, |
| ConflictMarkerKind CMK) { |
| StringRef terminator = CMK == ConflictMarkerKind::Perforce ? "<<<<\n" |
| : ">>>>>>> "; |
| size_t termLen = terminator.size(); |
| |
| // Get a reference to the rest of the buffer minus the length of the start |
| // of the conflict marker. |
| auto restOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(termLen); |
| size_t endPos = restOfBuffer.find(terminator); |
| while (endPos != StringRef::npos) { |
| // Must occur at start of line. |
| if (endPos != 0 && |
| (restOfBuffer[endPos - 1] == '\r' || restOfBuffer[endPos - 1] == '\n')) |
| { |
| return restOfBuffer.data() + endPos; |
| } |
| restOfBuffer = restOfBuffer.substr(endPos + termLen); |
| endPos = restOfBuffer.find(terminator); |
| } |
| return nullptr; |
| } |
| |
| bool Lexer::tryLexConflictMarker(bool EatNewline) { |
| const char *Ptr = CurPtr - 1; |
| |
| // Only a conflict marker if it starts at the beginning of a line. |
| if (Ptr != ContentStart && Ptr[-1] != '\n' && Ptr[-1] != '\r') |
| return false; |
| |
| // Check to see if we have <<<<<<< or >>>>. |
| StringRef restOfBuffer(Ptr, BufferEnd - Ptr); |
| if (!restOfBuffer.startswith("<<<<<<< ") && !restOfBuffer.startswith(">>>> ")) |
| return false; |
| |
| ConflictMarkerKind Kind = *Ptr == '<' ? ConflictMarkerKind::Normal |
| : ConflictMarkerKind::Perforce; |
| if (const char *End = findConflictEnd(Ptr, BufferEnd, Kind)) { |
| // Diagnose at the conflict marker, then jump ahead to the end. |
| diagnose(CurPtr, diag::lex_conflict_marker_in_file); |
| CurPtr = End; |
| |
| // Skip ahead to the end of the marker. |
| if (CurPtr != BufferEnd) |
| skipToEndOfLine(EatNewline); |
| |
| return true; |
| } |
| |
| // No end of conflict marker found. |
| return false; |
| } |
| |
| |
| void Lexer::tryLexEditorPlaceholder() { |
| assert(CurPtr[-1] == '<' && CurPtr[0] == '#'); |
| const char *TokStart = CurPtr-1; |
| for (const char *Ptr = CurPtr+1; Ptr < BufferEnd-1; ++Ptr) { |
| if (*Ptr == '\n') |
| break; |
| if (Ptr[0] == '<' && Ptr[1] == '#') |
| break; |
| if (Ptr[0] == '#' && Ptr[1] == '>') { |
| // Found it. Flag it as error (or warning, if in playground mode) for the |
| // rest of the compiler pipeline and lex it as an identifier. |
| if (LangOpts.Playground) { |
| diagnose(TokStart, diag::lex_editor_placeholder_in_playground); |
| } else { |
| diagnose(TokStart, diag::lex_editor_placeholder); |
| } |
| CurPtr = Ptr+2; |
| formToken(tok::identifier, TokStart); |
| return; |
| } |
| } |
| |
| // Not a well-formed placeholder. |
| lexOperatorIdentifier(); |
| } |
| |
/// Decode one segment of a string literal into its runtime bytes: processes
/// escape sequences, normalizes multiline line endings to '\n', and strips
/// \p IndentToStrip bytes of leading indentation from each line.  Returns the
/// original \p Bytes when no rewriting was needed; otherwise returns a view
/// of \p TempString holding the decoded contents.
StringRef Lexer::getEncodedStringSegment(StringRef Bytes,
                                         SmallVectorImpl<char> &TempString,
                                         bool IsFirstSegment,
                                         bool IsLastSegment,
                                         unsigned IndentToStrip) {

  TempString.clear();
  // Note that it is always safe to read one over the end of "Bytes" because
  // we know that there is a terminating " character.  Use BytesPtr to avoid a
  // range check subscripting on the StringRef.
  const char *BytesPtr = Bytes.begin();
  // Set when the previous escape elided the upcoming newline.
  bool IsEscapedNewline = false;
  while (BytesPtr < Bytes.end()) {
    char CurChar = *BytesPtr++;

    // Multiline string line ending normalization and indent stripping.
    if (CurChar == '\r' || CurChar == '\n') {
      // The newline right after the opening delimiter is not part of the
      // value, nor is one that was escaped with a backslash.
      bool stripNewline = IsEscapedNewline ||
        (IsFirstSegment && BytesPtr - 1 == Bytes.begin());
      // Treat "\r\n" as a single line ending.
      if (CurChar == '\r' && *BytesPtr == '\n')
        ++BytesPtr;
      // Strip the next line's indentation, except on blank lines.
      if (*BytesPtr != '\r' && *BytesPtr != '\n')
        BytesPtr += IndentToStrip;
      // The newline before the closing delimiter is not part of the value.
      if (IsLastSegment && BytesPtr == Bytes.end())
        stripNewline = true;
      if (!stripNewline)
        TempString.push_back('\n');
      IsEscapedNewline = false;
      continue;
    }

    if (CurChar != '\\') {
      TempString.push_back(CurChar);
      continue;
    }

    // Invalid escapes are accepted by the lexer but diagnosed as an error.  We
    // just ignore them here.
    unsigned CharValue = 0; // Unicode character value for \x, \u, \U.
    switch (*BytesPtr++) {
    default:
      continue;   // Invalid escape, ignore it.

      // Simple single-character escapes.
    case '0': TempString.push_back('\0'); continue;
    case 'n': TempString.push_back('\n'); continue;
    case 'r': TempString.push_back('\r'); continue;
    case 't': TempString.push_back('\t'); continue;
    case '"': TempString.push_back('"'); continue;
    case '\'': TempString.push_back('\''); continue;
    case '\\': TempString.push_back('\\'); continue;

    case ' ': case '\t': case '\n': case '\r':
      // Backslash followed by (whitespace and) a line break: mark the escape
      // and back up one so the line-ending branch above performs the strip.
      if (maybeConsumeNewlineEscape(BytesPtr, -1)) {
        IsEscapedNewline = true;
        --BytesPtr;
      }
      continue;

    // String interpolation.
    case '(':
      llvm_unreachable("string contained interpolated segments");

      // Unicode escapes of various lengths.
    case 'u':  //  \u HEX HEX HEX HEX
      if (BytesPtr[0] != '{')
        continue;  // Ignore invalid escapes.

      CharValue = lexUnicodeEscape(BytesPtr, /*no diagnostics*/nullptr);
      // Ignore invalid escapes.
      if (CharValue == ~1U) continue;
      break;
    }

    // Emit the decoded scalar: raw byte for ASCII, UTF-8 otherwise.
    if (CharValue < 0x80)
      TempString.push_back(CharValue);
    else
      EncodeToUTF8(CharValue, TempString);
  }

  // If we didn't escape or reprocess anything, then we don't need to use the
  // temporary string, just point to the original one.  We know that this
  // is safe because unescaped strings are always shorter than their escaped
  // forms (in a valid string).
  if (TempString.size() == Bytes.size()) {
    TempString.clear();
    return Bytes;
  }
  return StringRef(TempString.begin(), TempString.size());
}
| |
/// Split the string literal token \p Str into its segments: literal text
/// pieces and interpolated-expression pieces (the `\(...)` parts), appended
/// to \p Segments in source order. Multiline literals additionally record
/// the indentation each literal segment must strip.
///
/// \param Str the string_literal token to decompose.
/// \param Segments out-parameter receiving the segments.
/// \param Diags optional diagnostic engine for errors found while skipping
///        interpolated expressions; may be null.
void Lexer::getStringLiteralSegments(
              const Token &Str,
              SmallVectorImpl<StringSegment> &Segments,
              DiagnosticEngine *Diags) {
  assert(Str.is(tok::string_literal));
  // Get the bytes behind the string literal, dropping any double quotes.
  StringRef Bytes = getStringLiteralContent(Str);

  // Are substitutions required either for indent stripping or line ending
  // normalization?
  bool MultilineString = Str.IsMultilineString(), IsFirstSegment = true;
  unsigned IndentToStrip = 0;
  if (MultilineString)
    IndentToStrip =
      std::get<0>(getMultilineTrailingIndent(Str, /*Diags=*/nullptr)).size();

  // Note that it is always safe to read one over the end of "Bytes" because
  // we know that there is a terminating " character. Use BytesPtr to avoid a
  // range check subscripting on the StringRef.
  const char *SegmentStartPtr = Bytes.begin();
  const char *BytesPtr = SegmentStartPtr;
  // FIXME: Use SSE to scan for '\'.
  while (BytesPtr != Bytes.end()) {
    char CurChar = *BytesPtr++;
    if (CurChar != '\\')
      continue;

    if (*BytesPtr++ != '(')
      continue;

    // String interpolation.

    // Push the current segment. The "-2" excludes the "\(" that starts the
    // interpolation from the literal text.
    Segments.push_back(
        StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
                                  BytesPtr-SegmentStartPtr-2,
                                  IsFirstSegment, false, IndentToStrip));
    IsFirstSegment = false;

    // Find the closing ')'.
    const char *End = skipToEndOfInterpolatedExpression(BytesPtr,
                                                        Str.getText().end(),
                                                        Diags, MultilineString);
    assert(*End == ')' && "invalid string literal interpolations should"
           " not be returned as string literals");
    ++End;

    // Add an expression segment spanning from the '(' through the ')'.
    Segments.push_back(
        StringSegment::getExpr(getSourceLoc(BytesPtr-1), End-BytesPtr+1));

    // Reset the beginning of the segment to the string that remains to be
    // consumed.
    SegmentStartPtr = BytesPtr = End;
  }

  // The remainder of the literal is the final (possibly empty) segment.
  Segments.push_back(
      StringSegment::getLiteral(getSourceLoc(SegmentStartPtr),
                                Bytes.end()-SegmentStartPtr,
                                IsFirstSegment, true, IndentToStrip));
}
| |
| |
| //===----------------------------------------------------------------------===// |
| // Main Lexer Loop |
| //===----------------------------------------------------------------------===// |
| |
/// Lex one token from the buffer into NextToken, gathering its leading
/// trivia when trivia is retained. Runs of whitespace, comments, and other
/// non-token text restart the scan via the Restart label instead of
/// producing a token.
void Lexer::lexImpl() {
  assert(CurPtr >= BufferStart &&
         CurPtr <= BufferEnd && "Current pointer out of range!");

  // Each token starts with a fresh trivia collection.
  if (TriviaRetention == TriviaRetentionMode::WithTrivia) {
    LeadingTrivia.clear();
    TrailingTrivia.clear();
  }
  NextToken.setAtStartOfLine(CurPtr == ContentStart);

  // Remember where we started so that we can find the comment range.
  LastCommentBlockStart = CurPtr;
  SeenComment = false;

Restart:
  lexTrivia(LeadingTrivia, /* IsForTrailingTrivia */ false);

  // Remember the start of the token so we can form the text range.
  const char *TokStart = CurPtr;

  // The signed char cast makes bytes >= 0x80 (UTF-8 continuation/lead bytes
  // and UTF-16 BOM bytes) negative, so they hit 'default' or the -1/-2 cases.
  switch ((signed char)*CurPtr++) {
  default: {
    char const *tmp = CurPtr-1;
    if (advanceIfValidStartOfIdentifier(tmp, BufferEnd))
      return lexIdentifier();

    if (advanceIfValidStartOfOperator(tmp, BufferEnd))
      return lexOperatorIdentifier();

    if (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd)) {
      // If this is a valid identifier continuation, but not a valid identifier
      // start, attempt to recover by eating more continuation characters.
      diagnose(CurPtr-1, diag::lex_invalid_identifier_start_character);
      while (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd));
    } else {
      // This character isn't allowed in Swift source.
      uint32_t codepoint = validateUTF8CharacterAndAdvance(tmp, BufferEnd);
      if (codepoint == ~0U) {
        diagnose(CurPtr-1, diag::lex_invalid_utf8)
          .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");
        CurPtr = tmp;
        goto Restart;  // Skip presumed whitespace.
      } else if (codepoint == 0x0000201D) {
        // If this is an end curly quote, just diagnose it with a fixit hint.
        diagnose(CurPtr-1, diag::lex_invalid_curly_quote)
          .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), "\"");
      } else if (codepoint == 0x0000201C) {
        auto endPtr = tmp;
        // If this is a start curly quote, do a fuzzy match of a string literal
        // to improve recovery.
        if (auto tmp2 = findEndOfCurlyQuoteStringLiteral(tmp))
          tmp = tmp2;

        // Note, we intentionally diagnose the end quote before the start quote,
        // so that the IDE suggests fixing the end quote before the start quote.
        // This, in turn, works better with our error recovery because we won't
        // diagnose an end curly quote in the middle of a straight quoted
        // literal.
        diagnose(CurPtr-1, diag::lex_invalid_curly_quote)
          .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(endPtr),"\"");

      } else {
        diagnose(CurPtr-1, diag::lex_invalid_character)
          .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");

        // If the character is visually confusable with an ASCII character,
        // suggest replacing it with that character.
        char expectedCodepoint;
        if ((expectedCodepoint =
            confusable::tryConvertConfusableCharacterToASCII(codepoint))) {

          llvm::SmallString<4> confusedChar;
          EncodeToUTF8(codepoint, confusedChar);
          llvm::SmallString<1> expectedChar;
          expectedChar += expectedCodepoint;
          diagnose(CurPtr-1, diag::lex_confusable_character,
                   confusedChar, expectedChar)
            .fixItReplaceChars(getSourceLoc(CurPtr-1),
                               getSourceLoc(tmp),
                               expectedChar);
        }

        CurPtr = tmp;
        goto Restart;  // Skip presumed whitespace.
      }
    }

    CurPtr = tmp;
    return formToken(tok::unknown, TokStart);
  }

  case '\n':
  case '\r':
    assert(TriviaRetention != TriviaRetentionMode::WithTrivia &&
           "newlines should be eaten by lexTrivia as LeadingTrivia");
    NextToken.setAtStartOfLine(true);
    goto Restart;  // Skip whitespace.

  case ' ':
  case '\t':
  case '\f':
  case '\v':
    goto Restart;  // Skip whitespace.

  case -1:
  case -2:
    // 0xFF / 0xFE bytes: likely a UTF-16 BOM; we only lex UTF-8.
    diagnose(CurPtr-1, diag::lex_utf16_bom_marker);
    CurPtr = BufferEnd;
    return formToken(tok::unknown, TokStart);

  case 0:
    // A NUL may be the code-completion marker planted in the buffer.
    if (CurPtr-1 == CodeCompletionPtr)
      return formToken(tok::code_complete, TokStart);

    // If this is a random nul character in the middle of a buffer, skip it as
    // whitespace.
    if (CurPtr-1 != BufferEnd) {
      diagnoseEmbeddedNul(Diags, CurPtr-1);
      goto Restart;
    }

    // Otherwise, this is the real end of the buffer.  Put CurPtr back into
    // buffer bounds.
    --CurPtr;
    // Return EOF.
    return formToken(tok::eof, TokStart);

  case '@': return formToken(tok::at_sign, TokStart);
  case '{': return formToken(tok::l_brace, TokStart);
  case '[': {
    // NOTE: Legacy punctuator for old object literal syntax.
    // Remove in the future.
    if (*CurPtr == '#') { // [#
      // NOTE: Do NOT include the '#' in the token, unlike in earlier
      // versions of Swift that supported the old object literal syntax
      // directly.  The '#' will be lexed as part of the object literal
      // keyword token itself.
      return formToken(tok::l_square_lit, TokStart);
    }
    return formToken(tok::l_square, TokStart);
  }
  case '(': return formToken(tok::l_paren, TokStart);
  case '}': return formToken(tok::r_brace, TokStart);
  case ']': return formToken(tok::r_square, TokStart);
  case ')':
    return formToken(tok::r_paren, TokStart);

  case ',': return formToken(tok::comma, TokStart);
  case ';': return formToken(tok::semi, TokStart);
  case ':': return formToken(tok::colon, TokStart);
  case '\\':
    return formToken(tok::backslash, TokStart);

  case '#':
    return lexHash();

  // Operator characters.
  case '/':
    if (CurPtr[0] == '/') {  // "//"
      skipSlashSlashComment(/*EatNewline=*/true);
      SeenComment = true;
      if (isKeepingComments())
        return formToken(tok::comment, TokStart);
      goto Restart;
    }
    if (CurPtr[0] == '*') { // "/*"
      skipSlashStarComment();
      SeenComment = true;
      if (isKeepingComments())
        return formToken(tok::comment, TokStart);
      goto Restart;
    }
    return lexOperatorIdentifier();
  case '%':
    // Lex %[0-9a-zA-Z_]+ as a local SIL value
    if (InSILBody && clang::isIdentifierBody(CurPtr[0])) {
      do {
        ++CurPtr;
      } while (clang::isIdentifierBody(CurPtr[0]));

      return formToken(tok::sil_local_name, TokStart);
    }
    return lexOperatorIdentifier();

  case '!':
    if (InSILBody)
      return formToken(tok::sil_exclamation, TokStart);
    // A left-bound '!' is the postfix force-unwrap operator.
    if (isLeftBound(TokStart, ContentStart))
      return formToken(tok::exclaim_postfix, TokStart);
    return lexOperatorIdentifier();

  case '?':
    // A left-bound '?' is the postfix optional-chaining operator.
    if (isLeftBound(TokStart, ContentStart))
      return formToken(tok::question_postfix, TokStart);
    return lexOperatorIdentifier();

  case '<':
    if (CurPtr[0] == '#')
      return tryLexEditorPlaceholder();  // "<#...#>"
    else if (CurPtr[0] == '<' && tryLexConflictMarker(/*EatNewline=*/true))
      goto Restart;
    return lexOperatorIdentifier();

  case '>':
    if (CurPtr[0] == '>' && tryLexConflictMarker(/*EatNewline=*/true))
      goto Restart;
    return lexOperatorIdentifier();

  case '=': case '-': case '+': case '*':
  case '&': case '|': case '^': case '~': case '.':
    return lexOperatorIdentifier();

  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    return lexIdentifier();

  case '$':
    return lexDollarIdent();

  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    return lexNumber();

  case '"':
  case '\'':
    return lexStringLiteral();

  case '`':
    return lexEscapedIdentifier();
  }
}
| |
| Token Lexer::getTokenAtLocation(const SourceManager &SM, SourceLoc Loc) { |
| // Don't try to do anything with an invalid location. |
| if (!Loc.isValid()) |
| return Token(); |
| |
| // Figure out which buffer contains this location. |
| int BufferID = SM.findBufferContainingLoc(Loc); |
| if (BufferID < 0) |
| return Token(); |
| |
| // Use fake language options; language options only affect validity |
| // and the exact token produced. |
| LangOptions FakeLangOpts; |
| |
| // Here we return comments as tokens because either the caller skipped |
| // comments and normally we won't be at the beginning of a comment token |
| // (making this option irrelevant), or the caller lexed comments and |
| // we need to lex just the comment token. |
| Lexer L(FakeLangOpts, SM, BufferID, nullptr, /*InSILMode=*/ false, |
| CommentRetentionMode::ReturnAsTokens); |
| L.restoreState(State(Loc)); |
| return L.peekNextToken(); |
| } |
| |
/// Consume trivia (whitespace, comments, a hashbang line, conflict markers)
/// starting at CurPtr and append the pieces to \p Pieces. Does nothing when
/// trivia is not being retained. Trailing trivia stops at line endings and
/// comments; leading trivia consumes them and restarts.
void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
  if (TriviaRetention == TriviaRetentionMode::WithoutTrivia)
    return;

Restart:
  const char *TriviaStart = CurPtr;

  // TODO: Handle random nul('\0') character in the middle of a buffer.
  // TODO: Handle invalid UTF8 sequence which is skipped in lexImpl().
  switch (*CurPtr++) {
  case '\n':
    // Trailing trivia never crosses a newline; leave it for the next token's
    // leading trivia.
    if (IsForTrailingTrivia)
      break;
    NextToken.setAtStartOfLine(true);
    Pieces.appendOrSquash(TriviaPiece::newlines(1));
    goto Restart;
  case '\r':
    if (IsForTrailingTrivia)
      break;
    NextToken.setAtStartOfLine(true);
    // Treat a "\r\n" pair as one CRLF piece rather than two.
    if (CurPtr[0] == '\n') {
      Pieces.appendOrSquash(TriviaPiece::carriageReturnLineFeeds(1));
      ++CurPtr;
    } else {
      Pieces.appendOrSquash(TriviaPiece::carriageReturns(1));
    }
    goto Restart;
  case ' ':
    Pieces.appendOrSquash(TriviaPiece::spaces(1));
    goto Restart;
  case '\t':
    Pieces.appendOrSquash(TriviaPiece::tabs(1));
    goto Restart;
  case '\v':
    Pieces.appendOrSquash(TriviaPiece::verticalTabs(1));
    goto Restart;
  case '\f':
    Pieces.appendOrSquash(TriviaPiece::formfeeds(1));
    goto Restart;
  case '/':
    if (IsForTrailingTrivia || isKeepingComments()) {
      // Don't lex comments as trailing trivias (for now).
      // Don't try to lex comments here if we are lexing comments as Tokens.
      break;
    } else if (*CurPtr == '/') {
      // '// ...' comment.
      SeenComment = true;
      bool isDocComment = CurPtr[1] == '/';  // '///' is a doc comment.
      skipSlashSlashComment(/*EatNewline=*/false);
      size_t Length = CurPtr - TriviaStart;
      Pieces.push_back(isDocComment
                           ? TriviaPiece::docLineComment({TriviaStart, Length})
                           : TriviaPiece::lineComment({TriviaStart, Length}));
      goto Restart;
    } else if (*CurPtr == '*') {
      // '/* ... */' comment.
      SeenComment = true;
      bool isDocComment = CurPtr[1] == '*';  // '/**' is a doc comment.
      skipSlashStarComment();
      size_t Length = CurPtr - TriviaStart;
      Pieces.push_back(isDocComment
                           ? TriviaPiece::docBlockComment({TriviaStart, Length})
                           : TriviaPiece::blockComment({TriviaStart, Length}));
      goto Restart;
    }
    break;
  case '#':
    // A hashbang is only trivia at the very start of the buffer content.
    if (TriviaStart == ContentStart && *CurPtr == '!') {
      // Hashbang '#!/path/to/swift'.
      --CurPtr;
      if (BufferID != SourceMgr.getHashbangBufferID())
        diagnose(TriviaStart, diag::lex_hashbang_not_allowed);
      skipHashbang(/*EatNewline=*/false);
      size_t Length = CurPtr - TriviaStart;
      Pieces.push_back(TriviaPiece::garbageText({TriviaStart, Length}));
      goto Restart;
    }
    break;
  case '<':
  case '>':
    if (tryLexConflictMarker(/*EatNewline=*/false)) {
      // Conflict marker.
      size_t Length = CurPtr - TriviaStart;
      Pieces.push_back(TriviaPiece::garbageText({TriviaStart, Length}));
      goto Restart;
    }
    break;
  default:
    break;
  }
  // Reset the cursor: the character just examined is not trivia.
  --CurPtr;
}
| |
| SourceLoc Lexer::getLocForEndOfToken(const SourceManager &SM, SourceLoc Loc) { |
| return Loc.getAdvancedLocOrInvalid(getTokenAtLocation(SM, Loc).getLength()); |
| } |
| |
| |
/// Relex the sub-range [BufferStart, BufferEnd) of the given buffer and
/// return the start location of the token that contains \p Offset,
/// recursing into interpolated-expression segments of string literals.
/// If \p Offset lands in whitespace between tokens, the location for
/// \p Offset itself is returned.
static SourceLoc getLocForStartOfTokenInBuf(SourceManager &SM,
                                            unsigned BufferID,
                                            unsigned Offset,
                                            unsigned BufferStart,
                                            unsigned BufferEnd) {
  // Use fake language options; language options only affect validity
  // and the exact token produced.
  LangOptions FakeLangOptions;

  Lexer L(FakeLangOptions, SM, BufferID, nullptr, /*InSILMode=*/false,
          CommentRetentionMode::None, TriviaRetentionMode::WithoutTrivia,
          BufferStart, BufferEnd);

  // Lex tokens until we find the token that contains the source location.
  Token Tok;
  do {
    L.lex(Tok);

    unsigned TokOffs = SM.getLocOffsetInBuffer(Tok.getLoc(), BufferID);
    if (TokOffs > Offset) {
      // We ended up skipping over the source location entirely, which means
      // that it points into whitespace. We are done here.
      break;
    }

    if (Offset < TokOffs+Tok.getLength()) {
      // Current token encompasses our source location.

      // String literals need special care: the offset may lie inside an
      // interpolated expression, whose tokens we must relex to find the
      // precise one.
      if (Tok.is(tok::string_literal)) {
        SmallVector<Lexer::StringSegment, 4> Segments;
        Lexer::getStringLiteralSegments(Tok, Segments, /*Diags=*/nullptr);
        for (auto &Seg : Segments) {
          unsigned SegOffs = SM.getLocOffsetInBuffer(Seg.Loc, BufferID);
          unsigned SegEnd = SegOffs+Seg.Length;
          if (SegOffs > Offset)
            break;

          // If the offset is inside an interpolated expr segment, re-lex.
          if (Seg.Kind == Lexer::StringSegment::Expr && Offset < SegEnd)
            return getLocForStartOfTokenInBuf(SM, BufferID, Offset,
                                              /*BufferStart=*/SegOffs,
                                              /*BufferEnd=*/SegEnd);
        }
      }

      return Tok.getLoc();
    }
  } while (Tok.isNot(tok::eof));

  // We've passed our source location; just return the original source location.
  return SM.getLocForOffset(BufferID, Offset);
}
| |
| // Find the start of the given line. |
| static const char *findStartOfLine(const char *bufStart, const char *current) { |
| while (current != bufStart) { |
| if (current[0] == '\n' || current[0] == '\r') { |
| ++current; |
| break; |
| } |
| |
| --current; |
| } |
| |
| return current; |
| } |
| |
| SourceLoc Lexer::getLocForStartOfToken(SourceManager &SM, SourceLoc Loc) { |
| Optional<unsigned> BufferIdOp = SM.getIDForBufferIdentifier(SM. |
| getBufferIdentifierForLoc(Loc)); |
| if (!BufferIdOp.hasValue()) |
| return SourceLoc(); |
| return getLocForStartOfToken(SM, BufferIdOp.getValue(), |
| SM.getLocOffsetInBuffer(Loc, BufferIdOp.getValue())); |
| } |
| |
/// Return the start location of the token containing buffer offset
/// \p Offset, relexing from the beginning of the offset's line. Offsets
/// pointing into whitespace map to themselves; out-of-range offsets yield
/// an invalid location.
SourceLoc Lexer::getLocForStartOfToken(SourceManager &SM, unsigned BufferID,
                                       unsigned Offset) {
  CharSourceRange entireRange = SM.getRangeForBuffer(BufferID);
  StringRef Buffer = SM.extractText(entireRange);

  const char *BufStart = Buffer.data();
  if (Offset > Buffer.size())
    return SourceLoc();

  // NOTE(review): when Offset == Buffer.size(), StrData[0] reads one past
  // the text -- presumably safe because the underlying buffer is
  // NUL-terminated; confirm against MemoryBuffer guarantees.
  const char *StrData = BufStart+Offset;
  // If it points to whitespace return the SourceLoc for it.
  if (StrData[0] == '\n' || StrData[0] == '\r' ||
      StrData[0] == ' ' || StrData[0] == '\t')
    return SM.getLocForOffset(BufferID, Offset);

  // Back up from the current location until we hit the beginning of a line
  // (or the buffer). We'll relex from that point.
  const char *LexStart = findStartOfLine(BufStart, StrData);

  return getLocForStartOfTokenInBuf(SM, BufferID, Offset,
                                    /*BufferStart=*/LexStart-BufStart,
                                    /*BufferEnd=*/Buffer.size());
}
| |
| SourceLoc Lexer::getLocForStartOfLine(SourceManager &SM, SourceLoc Loc) { |
| // Don't try to do anything with an invalid location. |
| if (Loc.isInvalid()) |
| return Loc; |
| |
| // Figure out which buffer contains this location. |
| int BufferID = SM.findBufferContainingLoc(Loc); |
| if (BufferID < 0) |
| return SourceLoc(); |
| |
| CharSourceRange entireRange = SM.getRangeForBuffer(BufferID); |
| StringRef Buffer = SM.extractText(entireRange); |
| |
| const char *BufStart = Buffer.data(); |
| unsigned Offset = SM.getLocOffsetInBuffer(Loc, BufferID); |
| |
| const char *StartOfLine = findStartOfLine(BufStart, BufStart + Offset); |
| return getSourceLoc(StartOfLine); |
| } |
| |
| SourceLoc Lexer::getLocForEndOfLine(SourceManager &SM, SourceLoc Loc) { |
| // Don't try to do anything with an invalid location. |
| if (Loc.isInvalid()) |
| return Loc; |
| |
| // Figure out which buffer contains this location. |
| int BufferID = SM.findBufferContainingLoc(Loc); |
| if (BufferID < 0) |
| return SourceLoc(); |
| |
| // Use fake language options; language options only affect validity |
| // and the exact token produced. |
| LangOptions FakeLangOpts; |
| |
| // Here we return comments as tokens because either the caller skipped |
| // comments and normally we won't be at the beginning of a comment token |
| // (making this option irrelevant), or the caller lexed comments and |
| // we need to lex just the comment token. |
| Lexer L(FakeLangOpts, SM, BufferID, nullptr, /*InSILMode=*/ false, |
| CommentRetentionMode::ReturnAsTokens); |
| L.restoreState(State(Loc)); |
| L.skipToEndOfLine(/*EatNewline=*/true); |
| return getSourceLoc(L.CurPtr); |
| } |
| |
| StringRef Lexer::getIndentationForLine(SourceManager &SM, SourceLoc Loc) { |
| // Don't try to do anything with an invalid location. |
| if (Loc.isInvalid()) |
| return ""; |
| |
| // Figure out which buffer contains this location. |
| int BufferID = SM.findBufferContainingLoc(Loc); |
| if (BufferID < 0) |
| return ""; |
| |
| CharSourceRange entireRange = SM.getRangeForBuffer(BufferID); |
| StringRef Buffer = SM.extractText(entireRange); |
| |
| const char *BufStart = Buffer.data(); |
| unsigned Offset = SM.getLocOffsetInBuffer(Loc, BufferID); |
| |
| const char *StartOfLine = findStartOfLine(BufStart, BufStart + Offset); |
| const char *EndOfIndentation = StartOfLine; |
| while (*EndOfIndentation && isHorizontalWhitespace(*EndOfIndentation)) |
| ++EndOfIndentation; |
| |
| return StringRef(StartOfLine, EndOfIndentation - StartOfLine); |
| } |
| |
| ArrayRef<Token> swift:: |
| slice_token_array(ArrayRef<Token> AllTokens, SourceLoc StartLoc, |
| SourceLoc EndLoc) { |
| assert(StartLoc.isValid() && EndLoc.isValid()); |
| auto StartIt = token_lower_bound(AllTokens, StartLoc); |
| auto EndIt = token_lower_bound(AllTokens, EndLoc); |
| assert(StartIt->getLoc() == StartLoc && EndIt->getLoc() == EndLoc); |
| return AllTokens.slice(StartIt - AllTokens.begin(), EndIt - StartIt + 1); |
| } |