[Syntax] add UTF-8 BOM support to libSyntax
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
index c61783c..f582123 100644
--- a/lib/Parse/Lexer.cpp
+++ b/lib/Parse/Lexer.cpp
@@ -188,9 +188,9 @@
.StartsWith("\xEF\xBB\xBF", 3)
.Default(0);
- // Since the UTF-8 BOM doesn't carry information (UTF-8 has no dependency
- // on byte order), throw it away.
- CurPtr = BufferStart + BOMLength;
+ // Keep information about the existence of the UTF-8 BOM so that libSyntax
+ // can edit source code transparently.
+ CurPtr = BufferStart;
ContentStart = BufferStart + BOMLength;
// Initialize code completion.
@@ -2036,7 +2036,20 @@
LeadingTrivia.clear();
TrailingTrivia.clear();
}
- NextToken.setAtStartOfLine(CurPtr == ContentStart);
+ if (CurPtr == BufferStart) {
+ if (BufferStart < ContentStart) {
+ size_t BOMLen = ContentStart - BufferStart;
+ assert(BOMLen == 3 && "UTF-8 BOM is 3 bytes");
+ if (TriviaRetention == TriviaRetentionMode::WithTrivia) {
+ // Add UTF-8 BOM to LeadingTrivia.
+ LeadingTrivia.push_back(TriviaPiece::garbageText({CurPtr, BOMLen}));
+ }
+ CurPtr += BOMLen;
+ }
+ NextToken.setAtStartOfLine(true);
+ } else {
+ NextToken.setAtStartOfLine(false);
+ }
// Remember where we started so that we can find the comment range.
LastCommentBlockStart = CurPtr;
diff --git a/unittests/Parse/LexerTests.cpp b/unittests/Parse/LexerTests.cpp
index 12a5fa4..76172d8 100644
--- a/unittests/Parse/LexerTests.cpp
+++ b/unittests/Parse/LexerTests.cpp
@@ -400,6 +400,7 @@
ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getCommentRange().getStart());
ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
ASSERT_EQ((syntax::Trivia{{
+ syntax::TriviaPiece::garbageText("\xEF\xBB\xBF"),
syntax::TriviaPiece::lineComment("// comment"),
syntax::TriviaPiece::newlines(1)
}}), LeadingTrivia);
@@ -440,6 +441,7 @@
ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getCommentRange().getStart());
ASSERT_EQ(10u, Tok.getCommentRange().getByteLength());
ASSERT_EQ((syntax::Trivia{{
+ syntax::TriviaPiece::garbageText("\xEF\xBB\xBF"),
syntax::TriviaPiece::lineComment("// comment"),
syntax::TriviaPiece::newlines(1)
}}), LeadingTrivia);
diff --git a/unittests/Parse/LexerTriviaTests.cpp b/unittests/Parse/LexerTriviaTests.cpp
index e1c7d08..5373691 100644
--- a/unittests/Parse/LexerTriviaTests.cpp
+++ b/unittests/Parse/LexerTriviaTests.cpp
@@ -102,9 +102,9 @@
ASSERT_EQ("aaa", Tok.getText());
ASSERT_TRUE(Tok.isAtStartOfLine());
- // FIXME: This should include UTF8-BOM as a GarbargeText trivia.
ASSERT_EQ(LeadingTrivia,
- (Trivia{{TriviaPiece::garbageText("#!/bin/swift"),
+ (Trivia{{TriviaPiece::garbageText("\xEF\xBB\xBF"),
+ TriviaPiece::garbageText("#!/bin/swift"),
TriviaPiece::newlines(1)}}));
}