Merge pull request #13626 from omochi/syntax-bom

[Syntax] add UTF-8 BOM support to libSyntax
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
index c61783c..f582123 100644
--- a/lib/Parse/Lexer.cpp
+++ b/lib/Parse/Lexer.cpp
@@ -188,9 +188,9 @@
     .StartsWith("\xEF\xBB\xBF", 3)
     .Default(0);
 
-  // Since the UTF-8 BOM doesn't carry information (UTF-8 has no dependency
-  // on byte order), throw it away.
-  CurPtr = BufferStart + BOMLength;
+  // Keep information about the existence of the UTF-8 BOM so that libSyntax
+  // can edit source code transparently.
+  CurPtr = BufferStart;
   ContentStart = BufferStart + BOMLength;
 
   // Initialize code completion.
@@ -2036,7 +2036,20 @@
     LeadingTrivia.clear();
     TrailingTrivia.clear();
   }
-  NextToken.setAtStartOfLine(CurPtr == ContentStart);
+  if (CurPtr == BufferStart) {
+    if (BufferStart < ContentStart) {
+      size_t BOMLen = ContentStart - BufferStart;
+      assert(BOMLen == 3 && "UTF-8 BOM is 3 bytes");
+      if (TriviaRetention == TriviaRetentionMode::WithTrivia) {
+        // Add UTF-8 BOM to LeadingTrivia.
+        LeadingTrivia.push_back(TriviaPiece::garbageText({CurPtr, BOMLen}));
+      }
+      CurPtr += BOMLen;
+    }
+    NextToken.setAtStartOfLine(true);
+  } else {
+    NextToken.setAtStartOfLine(false);
+  }
 
   // Remember where we started so that we can find the comment range.
   LastCommentBlockStart = CurPtr;
diff --git a/unittests/Parse/LexerTests.cpp b/unittests/Parse/LexerTests.cpp
index 75cc8d3..76172d8 100644
--- a/unittests/Parse/LexerTests.cpp
+++ b/unittests/Parse/LexerTests.cpp
@@ -260,6 +260,208 @@
   ASSERT_TRUE(Tok.isAtStartOfLine());
 }
 
+TEST_F(LexerTest, BOMNoCommentNoTrivia) {
+  const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";
+  // Byte offsets: BOM 0-2, line comment 3-12, aaa 14-16, line comment 18-22, block comment 24-30.
+  LangOptions LangOpts;
+  SourceManager SourceMgr;
+  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));
+  // Lex with CommentRetentionMode::None and TriviaRetentionMode::WithoutTrivia.
+  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
+          CommentRetentionMode::None, TriviaRetentionMode::WithoutTrivia);
+  
+  Token Tok;
+  syntax::Trivia LeadingTrivia, TrailingTrivia;
+  // The first token is expected to be aaa at offset 14, with an empty comment range and no trivia.
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::identifier, Tok.getKind());
+  ASSERT_EQ("aaa", Tok.getText());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getCommentRange().getStart());
+  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+  // The second token is expected to be EOF at offset 31, one past the last byte of Source.
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::eof, Tok.getKind());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getCommentRange().getStart());
+  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+}
+
+TEST_F(LexerTest, BOMTokenCommentNoTrivia) {
+  const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";
+  // Byte offsets: BOM 0-2, line comment 3-12, aaa 14-16, line comment 18-22, block comment 24-30.
+  LangOptions LangOpts;
+  SourceManager SourceMgr;
+  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));
+  // Lex with CommentRetentionMode::ReturnAsTokens and TriviaRetentionMode::WithoutTrivia.
+  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
+          CommentRetentionMode::ReturnAsTokens, TriviaRetentionMode::WithoutTrivia);
+  
+  Token Tok;
+  syntax::Trivia LeadingTrivia, TrailingTrivia;
+  // Expect the first comment token at offset 3, right after the BOM; its text includes the newline.
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::comment, Tok.getKind());
+  ASSERT_EQ("// comment\n", Tok.getText());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getCommentRange().getStart());
+  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+  // Expect the identifier aaa at offset 14.
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::identifier, Tok.getKind());
+  ASSERT_EQ("aaa", Tok.getText());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getCommentRange().getStart());
+  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+  // Expect the second line comment at offset 18, again including its newline.
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::comment, Tok.getKind());
+  ASSERT_EQ("//xx \n", Tok.getText());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 18), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 18), Tok.getCommentRange().getStart());
+  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+  // Expect the block comment at offset 24.
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::comment, Tok.getKind());
+  ASSERT_EQ("/* x */", Tok.getText());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 24), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 24), Tok.getCommentRange().getStart());
+  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+  // Expect EOF at offset 31, one past the last byte of Source.
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::eof, Tok.getKind());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getCommentRange().getStart());
+  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+}
+
+TEST_F(LexerTest, BOMAttachCommentNoTrivia) {
+  const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";
+  // Byte offsets: BOM 0-2, line comment 3-12, aaa 14-16, line comment 18-22, block comment 24-30.
+  LangOptions LangOpts;
+  SourceManager SourceMgr;
+  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));
+  // Lex with CommentRetentionMode::AttachToNextToken and TriviaRetentionMode::WithoutTrivia.
+  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
+          CommentRetentionMode::AttachToNextToken, TriviaRetentionMode::WithoutTrivia);
+  
+  Token Tok;
+  syntax::Trivia LeadingTrivia, TrailingTrivia;
+  // Expect aaa at offset 14 with a 10-byte comment range starting at offset 3, just past the BOM.
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::identifier, Tok.getKind());
+  ASSERT_EQ("aaa", Tok.getText());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getCommentRange().getStart());
+  ASSERT_EQ(10u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+  // Expect EOF at offset 31 with a 13-byte comment range starting at 18 (both remaining comments).
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::eof, Tok.getKind());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 18), Tok.getCommentRange().getStart());
+  ASSERT_EQ(13u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{}}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+}
+
+TEST_F(LexerTest, BOMNoCommentTrivia) {
+  const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";
+  // Byte offsets: BOM 0-2, line comment 3-12, aaa 14-16, line comment 18-22, block comment 24-30.
+  LangOptions LangOpts;
+  SourceManager SourceMgr;
+  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));
+  // Lex with CommentRetentionMode::None and TriviaRetentionMode::WithTrivia.
+  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
+          CommentRetentionMode::None, TriviaRetentionMode::WithTrivia);
+  
+  Token Tok;
+  syntax::Trivia LeadingTrivia, TrailingTrivia;
+  // Expect aaa at offset 14; its leading trivia should begin with the BOM as a garbage-text piece.
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::identifier, Tok.getKind());
+  ASSERT_EQ("aaa", Tok.getText());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getCommentRange().getStart());
+  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{
+    syntax::TriviaPiece::garbageText("\xEF\xBB\xBF"),
+    syntax::TriviaPiece::lineComment("// comment"),
+    syntax::TriviaPiece::newlines(1)
+  }}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{
+    syntax::TriviaPiece::spaces(1)
+  }}), TrailingTrivia);
+  // Expect EOF at offset 31; the two remaining comments should appear as its leading trivia.
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::eof, Tok.getKind());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getCommentRange().getStart());
+  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{
+    syntax::TriviaPiece::lineComment("//xx "),
+    syntax::TriviaPiece::newlines(1),
+    syntax::TriviaPiece::blockComment("/* x */")
+  }}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+}
+
+TEST_F(LexerTest, BOMAttachCommentTrivia) {
+  const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";
+  // Byte offsets: BOM 0-2, line comment 3-12, aaa 14-16, line comment 18-22, block comment 24-30.
+  LangOptions LangOpts;
+  SourceManager SourceMgr;
+  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));
+  // Lex with CommentRetentionMode::AttachToNextToken and TriviaRetentionMode::WithTrivia.
+  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, /*InSILMode=*/false,
+          CommentRetentionMode::AttachToNextToken, TriviaRetentionMode::WithTrivia);
+  
+  Token Tok;
+  syntax::Trivia LeadingTrivia, TrailingTrivia;
+  // Expect aaa at 14 with comment range 3..12 (BOM excluded); the BOM still leads the leading trivia.
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::identifier, Tok.getKind());
+  ASSERT_EQ("aaa", Tok.getText());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 3), Tok.getCommentRange().getStart());
+  ASSERT_EQ(10u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{
+    syntax::TriviaPiece::garbageText("\xEF\xBB\xBF"),
+    syntax::TriviaPiece::lineComment("// comment"),
+    syntax::TriviaPiece::newlines(1)
+  }}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{
+    syntax::TriviaPiece::spaces(1)
+  }}), TrailingTrivia);
+  // Expect EOF at 31 with a 13-byte comment range from 18; the same comments appear as leading trivia.
+  L.lex(Tok, LeadingTrivia, TrailingTrivia);
+  ASSERT_EQ(tok::eof, Tok.getKind());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc());
+  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 18), Tok.getCommentRange().getStart());
+  ASSERT_EQ(13u, Tok.getCommentRange().getByteLength());
+  ASSERT_EQ((syntax::Trivia{{
+    syntax::TriviaPiece::lineComment("//xx "),
+    syntax::TriviaPiece::newlines(1),
+    syntax::TriviaPiece::blockComment("/* x */")
+  }}), LeadingTrivia);
+  ASSERT_EQ((syntax::Trivia{{}}), TrailingTrivia);
+}
+
 TEST_F(LexerTest, RestoreBasic) {
   const char *Source = "aaa \t\0 bbb ccc";
 
diff --git a/unittests/Parse/LexerTriviaTests.cpp b/unittests/Parse/LexerTriviaTests.cpp
index e1c7d08..5373691 100644
--- a/unittests/Parse/LexerTriviaTests.cpp
+++ b/unittests/Parse/LexerTriviaTests.cpp
@@ -102,9 +102,9 @@
   ASSERT_EQ("aaa", Tok.getText());
   ASSERT_TRUE(Tok.isAtStartOfLine());
 
-  // FIXME: This should include UTF8-BOM as a GarbargeText trivia.
   ASSERT_EQ(LeadingTrivia,
-            (Trivia{{TriviaPiece::garbageText("#!/bin/swift"),
+            (Trivia{{TriviaPiece::garbageText("\xEF\xBB\xBF"),
+                     TriviaPiece::garbageText("#!/bin/swift"),
                      TriviaPiece::newlines(1)}}));
 }