Merge pull request #13626 from omochi/syntax-bom
[Syntax] add UTF-8 BOM support to libSyntax
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
index c61783c..f582123 100644
--- a/lib/Parse/Lexer.cpp
+++ b/lib/Parse/Lexer.cpp
@@ -188,9 +188,9 @@
.StartsWith("\xEF\xBB\xBF", 3)
.Default(0);
- // Since the UTF-8 BOM doesn't carry information (UTF-8 has no dependency
- // on byte order), throw it away.
- CurPtr = BufferStart + BOMLength;
+ // Keep information about the existence of a UTF-8 BOM to allow transparent
+ // source code editing with libSyntax.
+ CurPtr = BufferStart;
ContentStart = BufferStart + BOMLength;
// Initialize code completion.
@@ -2036,7 +2036,20 @@
LeadingTrivia.clear();
TrailingTrivia.clear();
}
- NextToken.setAtStartOfLine(CurPtr == ContentStart);
+ if (CurPtr == BufferStart) {
+ if (BufferStart < ContentStart) {
+ size_t BOMLen = ContentStart - BufferStart;
+ assert(BOMLen == 3 && "UTF-8 BOM is 3 bytes");
+ if (TriviaRetention == TriviaRetentionMode::WithTrivia) {
+ // Add UTF-8 BOM to LeadingTrivia.
+ LeadingTrivia.push_back(TriviaPiece::garbageText({CurPtr, BOMLen}));
+ }
+ CurPtr += BOMLen;
+ }
+ NextToken.setAtStartOfLine(true);
+ } else {
+ NextToken.setAtStartOfLine(false);
+ }
// Remember where we started so that we can find the comment range.
LastCommentBlockStart = CurPtr;
diff --git a/unittests/Parse/LexerTests.cpp b/unittests/Parse/LexerTests.cpp
index 75cc8d3..76172d8 100644
--- a/unittests/Parse/LexerTests.cpp
+++ b/unittests/Parse/LexerTests.cpp
@@ -260,6 +260,208 @@
ASSERT_TRUE(Tok.isAtStartOfLine());
}
+TEST_F(LexerTest, BOMNoCommentNoTrivia) { // BOM-prefixed input lexed with comments and trivia both discarded.
+ const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */"; // 3-byte UTF-8 BOM, then the text to lex.
+
+ LangOptions LangOpts;
+ SourceManager SourceMgr;
+ unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));
+
+ Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
+ CommentRetentionMode::None, TriviaRetentionMode::WithoutTrivia);
+
+ Token Tok;
+ syntax::Trivia LeadingTrivia, TrailingTrivia;
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect the identifier; the leading comment is not returned as a token.
+ ASSERT_EQ(tok::identifier, Tok.getKind());
+ ASSERT_EQ("aaa", Tok.getText());
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc()); // 3 (BOM) + 11 (first comment and its newline) = 14.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getCommentRange().getStart());
+ ASSERT_EQ(0u, Tok.getCommentRange().getByteLength()); // Expected comment range is empty in this mode.
+ ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia); // Expected: no trivia at all, so the BOM is not reported either.
+ ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect EOF; the two trailing comments yield no tokens.
+ ASSERT_EQ(tok::eof, Tok.getKind());
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc()); // Offset 31 is the end of the source string.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getCommentRange().getStart());
+ ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+ ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+ ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+}
+
+TEST_F(LexerTest, BOMTokenCommentNoTrivia) { // BOM-prefixed input lexed with comments returned as tokens, no trivia.
+ const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */"; // 3-byte UTF-8 BOM, then the text to lex.
+
+ LangOptions LangOpts;
+ SourceManager SourceMgr;
+ unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));
+
+ Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
+ CommentRetentionMode::ReturnAsTokens, TriviaRetentionMode::WithoutTrivia);
+
+ Token Tok;
+ syntax::Trivia LeadingTrivia, TrailingTrivia;
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect the first line comment as its own token.
+ ASSERT_EQ(tok::comment, Tok.getKind());
+ ASSERT_EQ("// comment\n", Tok.getText()); // Expected token text includes the terminating newline.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getLoc()); // Offset 3: immediately after the 3-byte BOM.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getCommentRange().getStart());
+ ASSERT_EQ(0u, Tok.getCommentRange().getByteLength()); // Expected comment range is empty when comments are tokens.
+ ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia); // Expected: no trivia at all, so the BOM is not reported either.
+ ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect the identifier.
+ ASSERT_EQ(tok::identifier, Tok.getKind());
+ ASSERT_EQ("aaa", Tok.getText());
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc()); // 3 (BOM) + 11 (first comment and its newline) = 14.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getCommentRange().getStart());
+ ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+ ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+ ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect the second line comment.
+ ASSERT_EQ(tok::comment, Tok.getKind());
+ ASSERT_EQ("//xx \n", Tok.getText());
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 18), Tok.getLoc()); // 14 + 3 ("aaa") + 1 (space) = 18.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 18), Tok.getCommentRange().getStart());
+ ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+ ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+ ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect the block comment.
+ ASSERT_EQ(tok::comment, Tok.getKind());
+ ASSERT_EQ("/* x */", Tok.getText());
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 24), Tok.getLoc()); // 18 + 6 (second comment and its newline) = 24.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 24), Tok.getCommentRange().getStart());
+ ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+ ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+ ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect EOF.
+ ASSERT_EQ(tok::eof, Tok.getKind());
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc()); // Offset 31 is the end of the source string.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getCommentRange().getStart());
+ ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+ ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+ ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+}
+
+TEST_F(LexerTest, BOMAttachCommentNoTrivia) { // BOM-prefixed input lexed with comments attached to the next token, no trivia.
+ const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */"; // 3-byte UTF-8 BOM, then the text to lex.
+
+ LangOptions LangOpts;
+ SourceManager SourceMgr;
+ unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));
+
+ Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
+ CommentRetentionMode::AttachToNextToken, TriviaRetentionMode::WithoutTrivia);
+
+ Token Tok;
+ syntax::Trivia LeadingTrivia, TrailingTrivia;
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect the identifier, carrying the preceding comment as its comment range.
+ ASSERT_EQ(tok::identifier, Tok.getKind());
+ ASSERT_EQ("aaa", Tok.getText());
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc()); // 3 (BOM) + 11 (first comment and its newline) = 14.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getCommentRange().getStart()); // Expected range starts after the BOM, not at offset 0.
+ ASSERT_EQ(10u, Tok.getCommentRange().getByteLength()); // 10 bytes: the comment text without its trailing newline.
+ ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia); // Expected: no trivia at all, so the BOM is not reported either.
+ ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect EOF, carrying both trailing comments as its comment range.
+ ASSERT_EQ(tok::eof, Tok.getKind());
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc()); // Offset 31 is the end of the source string.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 18), Tok.getCommentRange().getStart()); // Offset 18: start of the second line comment.
+ ASSERT_EQ(13u, Tok.getCommentRange().getByteLength()); // 6 (line comment and newline) + 7 (block comment) = 13.
+ ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+ ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+}
+
+TEST_F(LexerTest, BOMNoCommentTrivia) { // BOM-prefixed input lexed with trivia retained and comment retention off.
+ const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */"; // 3-byte UTF-8 BOM, then the text to lex.
+
+ LangOptions LangOpts;
+ SourceManager SourceMgr;
+ unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));
+
+ Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
+ CommentRetentionMode::None, TriviaRetentionMode::WithTrivia);
+
+ Token Tok;
+ syntax::Trivia LeadingTrivia, TrailingTrivia;
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect the identifier; BOM and comment are expected in its leading trivia.
+ ASSERT_EQ(tok::identifier, Tok.getKind());
+ ASSERT_EQ("aaa", Tok.getText());
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc()); // 3 (BOM) + 11 (first comment and its newline) = 14.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getCommentRange().getStart());
+ ASSERT_EQ(0u, Tok.getCommentRange().getByteLength()); // Expected comment range is empty in this comment mode.
+ ASSERT_EQ((syntax::Trivia{{
+ syntax::TriviaPiece::garbageText("\xEF\xBB\xBF"), // The BOM is expected as the first leading-trivia piece.
+ syntax::TriviaPiece::lineComment("// comment"),
+ syntax::TriviaPiece::newlines(1)
+ }}), LeadingTrivia);
+ ASSERT_EQ((syntax::Trivia{{
+ syntax::TriviaPiece::spaces(1) // The single space between the identifier and the next comment.
+ }}), TrailingTrivia);
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect EOF; the remaining comments are expected as its leading trivia.
+ ASSERT_EQ(tok::eof, Tok.getKind());
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc()); // Offset 31 is the end of the source string.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getCommentRange().getStart());
+ ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+ ASSERT_EQ((syntax::Trivia{{
+ syntax::TriviaPiece::lineComment("//xx "),
+ syntax::TriviaPiece::newlines(1),
+ syntax::TriviaPiece::blockComment("/* x */")
+ }}), LeadingTrivia);
+ ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+}
+
+TEST_F(LexerTest, BOMAttachCommentTrivia) { // BOM-prefixed input lexed with trivia retained and comments attached to the next token.
+ const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */"; // 3-byte UTF-8 BOM, then the text to lex.
+
+ LangOptions LangOpts;
+ SourceManager SourceMgr;
+ unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));
+
+ Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
+ CommentRetentionMode::AttachToNextToken, TriviaRetentionMode::WithTrivia);
+
+ Token Tok;
+ syntax::Trivia LeadingTrivia, TrailingTrivia;
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect the identifier with both a comment range and leading trivia.
+ ASSERT_EQ(tok::identifier, Tok.getKind());
+ ASSERT_EQ("aaa", Tok.getText());
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc()); // 3 (BOM) + 11 (first comment and its newline) = 14.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getCommentRange().getStart()); // Expected range starts after the BOM, not at offset 0.
+ ASSERT_EQ(10u, Tok.getCommentRange().getByteLength()); // 10 bytes: the comment text without its trailing newline.
+ ASSERT_EQ((syntax::Trivia{{
+ syntax::TriviaPiece::garbageText("\xEF\xBB\xBF"), // The BOM is expected as the first leading-trivia piece.
+ syntax::TriviaPiece::lineComment("// comment"),
+ syntax::TriviaPiece::newlines(1)
+ }}), LeadingTrivia);
+ ASSERT_EQ((syntax::Trivia{{
+ syntax::TriviaPiece::spaces(1) // The single space between the identifier and the next comment.
+ }}), TrailingTrivia);
+
+ L.lex(Tok, LeadingTrivia, TrailingTrivia); // Expect EOF, carrying the remaining comments as range and as trivia.
+ ASSERT_EQ(tok::eof, Tok.getKind());
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc()); // Offset 31 is the end of the source string.
+ ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 18), Tok.getCommentRange().getStart()); // Offset 18: start of the second line comment.
+ ASSERT_EQ(13u, Tok.getCommentRange().getByteLength()); // 6 (line comment and newline) + 7 (block comment) = 13.
+ ASSERT_EQ((syntax::Trivia{{
+ syntax::TriviaPiece::lineComment("//xx "),
+ syntax::TriviaPiece::newlines(1),
+ syntax::TriviaPiece::blockComment("/* x */")
+ }}), LeadingTrivia);
+ ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+}
+
TEST_F(LexerTest, RestoreBasic) {
const char *Source = "aaa \t\0 bbb ccc";
diff --git a/unittests/Parse/LexerTriviaTests.cpp b/unittests/Parse/LexerTriviaTests.cpp
index e1c7d08..5373691 100644
--- a/unittests/Parse/LexerTriviaTests.cpp
+++ b/unittests/Parse/LexerTriviaTests.cpp
@@ -102,9 +102,9 @@
ASSERT_EQ("aaa", Tok.getText());
ASSERT_TRUE(Tok.isAtStartOfLine());
- // FIXME: This should include UTF8-BOM as a GarbargeText trivia.
ASSERT_EQ(LeadingTrivia,
- (Trivia{{TriviaPiece::garbageText("#!/bin/swift"),
+ (Trivia{{TriviaPiece::garbageText("\xEF\xBB\xBF"),
+ TriviaPiece::garbageText("#!/bin/swift"),
TriviaPiece::newlines(1)}}));
}