Automated g4 rollback of changelist 415285962.

*** Reason for rollback ***

Crashing services with an index-out-of-bounds panic (https://coroner.corp.google.com/#/crashes?user=compliance-presubmit3p)
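
The panic is plausibly the unguarded last-element access that this rollback
removes from v2/tokenizer.go (the doc.Tokens[len(doc.Tokens)-1] check in
extractDoc): if no token has been emitted yet when that branch runs, the
index is -1. A minimal sketch of that failure mode, using simplified copies
of the token type and eol sentinel named in the diff below (this is an
inference from the diff, not a confirmed root cause):

    package main

    // Simplified stand-ins for the token type and eol sentinel that the
    // diff below references in v2/tokenizer.go.
    type token struct {
        Text string
        Line int
    }

    const eol = "\n"

    func main() {
        var tokens []*token // empty, as before any token has been appended
        // The rolled-back code indexed the last element unconditionally;
        // with an empty slice this panics at run time with
        // "index out of range [-1]".
        if tokens[len(tokens)-1].Text != eol {
            return
        }
    }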

*** Original change description ***

Fixes newline handling so that Normalize preserves the newline characters
of the original input.

The code that tokenized newlines had some minor bugs that introduced
spurious newline tokens into the token stream. This wasn't a problem
before, since the newline tokens were only used inside the tokenizer to
detect header constructs and de-hyphenate words, and were always removed
from the token stream passed to calling functions. This meant that the
token stream Normalize...

***

PiperOrigin-RevId: 416154051
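
For context before the diff: the restored Normalize (the `+` side of the
v2/classifier.go hunk) joins every token after the first with a single
space, so newlines in the input collapse to spaces, whereas the
rolled-back version re-emitted "\n" from EOL tokens. A minimal sketch of
the restored joining loop, with a plain []string standing in for the real
token stream:

    package sketch

    import "bytes"

    // joinTokens mirrors the restored loop in Classifier.Normalize: the
    // first token is written as-is and every later token is prefixed
    // with a single space, collapsing all original line breaks.
    func joinTokens(texts []string) string {
        if len(texts) == 0 {
            return ""
        }
        var buf bytes.Buffer
        buf.WriteString(texts[0])
        for _, t := range texts[1:] {
            buf.WriteString(" ")
            buf.WriteString(t)
        }
        return buf.String()
    }

For example, joinTokens([]string{"Preserving", "line", "breaks"}) yields
"Preserving line breaks", which is why the newline-preservation tests are
deleted below.
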
diff --git a/v2/classifier.go b/v2/classifier.go
index 05937de..0668254 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -223,7 +223,7 @@
 // return the same results as Match(in).
 func (c *Classifier) Normalize(in []byte) []byte {
 	text := normalizeDoc(in, false)
-	doc := extractDoc(text, false)
+	doc := extractDoc(text)
 
 	var buf bytes.Buffer
 
@@ -235,25 +235,11 @@
 		return buf.Bytes()
 	}
 
-	prevLine := 1
 	buf.WriteString(doc.Tokens[0].Text)
+
 	for _, t := range doc.Tokens[1:] {
-		// Only write out an EOL token that incremented the line
-		if t.Line == prevLine+1 {
-			buf.WriteString("\n")
-		}
-
-		// Only write tokens that aren't EOL
-		if t.Text != eol {
-			// Only put a space between tokens if the previous token was on the same
-			// line. This prevents spaces after an EOL
-			if t.Line == prevLine {
-				buf.WriteString(" ")
-			}
-			buf.WriteString(t.Text)
-		}
-
-		prevLine = t.Line
+		buf.WriteString(" ")
+		buf.WriteString(t.Text)
 	}
 	return buf.Bytes()
 }
diff --git a/v2/classifier_test.go b/v2/classifier_test.go
index f9d2c74..cb613e5 100644
--- a/v2/classifier_test.go
+++ b/v2/classifier_test.go
@@ -327,34 +327,6 @@
 			input: "   License  ",
 			want:  "License",
 		},
-		{
-			// This tests that the line breaks in the input text are properly
-			// preserved, which is important for visual diffing.
-			input: `Preserving
-line
-
-breaks is important`,
-			want: `Preserving
-line
-
-breaks is important`,
-		},
-		{
-			// This tests that soft EOL functionality doesn't affect normalized output
-			input: `This is a sentence looking construct. This is another sentence. What happens?`,
-			want:  `This is a sentence looking construct This is another sentence What happens`,
-		},
-		{
-			input: `header
-........................ This is oddly formatted`,
-			want: `header
-This is oddly formatted`,
-		},
-		{
-			input: `baseball basket-
-ball football`,
-			want: "baseball basketball\nfootball",
-		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.input, func(t *testing.T) {
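
The table entries removed above encoded newline-preserving expectations
that no longer hold after this rollback. A hedged illustration of how the
de-hyphenation case would presumably read under the restored space-joining
Normalize (inferred from the hunks in this diff; this entry is not part of
the change):

    {
        input: `baseball basket-
ball football`,
        want: "baseball basketball football", // newline collapses to a space
    },
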
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 001eaba..eaa0479 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -83,10 +83,10 @@
 func tokenize(in []byte) *document {
 	// tokenize produces a document from the input content.
 	text := normalizeDoc(in, true)
-	return extractDoc(text, true)
+	return extractDoc(text)
 }
 
-func extractDoc(text string, removeEol bool) *document {
+func extractDoc(text string) *document {
 	var doc document
 	// Iterate on a line-by-line basis.
 	i := 0
@@ -138,13 +138,9 @@
 					// follow this text. This resolves problems with licenses that are a
 					// very long line of text, motivated by
 					// https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
-					//
-					// Don't do this if the previous token was already an EOL
-					if doc.Tokens[len(doc.Tokens)-1].Text != eol {
-						doc.Tokens = append(doc.Tokens, &token{
-							Text: eol,
-							Line: i + 1})
-					}
+					doc.Tokens = append(doc.Tokens, &token{
+						Text: eol,
+						Line: i + 1})
 				}
 
 				tok := token{
@@ -159,13 +155,17 @@
 				firstInLine = false
 			}
 		}
+		tok := token{
+			Text: eol,
+			Line: i + 1,
+		}
+		doc.Tokens = append(doc.Tokens, &tok)
 	}
-
-	doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
+	doc.Tokens = cleanupTokens(doc.Tokens)
 	return &doc
 }
 
-func cleanupTokens(in []*token, removeEol bool) []*token {
+func cleanupTokens(in []*token) []*token {
 	// This routine performs sanitization of tokens. If it is a header-looking
 	// token (but not a version number) starting a line, it is removed.
 	// Hyphenated words are reassembled.
@@ -179,15 +179,6 @@
 		}
 		if tok.Text == eol {
 			firstInLine = true
-			if removeEol {
-				continue
-			}
-			// If we are reconstructing a hyphenated word, don't append the EOL
-			// now, do it when the word is reconstructed.
-			if partialWord == "" {
-				out = append(out, &token{Text: eol, Line: tok.Line, Index: tokIdx})
-				tokIdx++
-			}
 			continue
 		}
 		firstInLine = false
@@ -204,12 +195,6 @@
 			tp.Previous = ""
 			out = append(out, tp)
 			tokIdx++
-			if !removeEol {
-				// Append the EOL now that the whole word is recovered
-				out = append(out, &token{Text: eol, Line: tp.Line, Index: tokIdx})
-				tokIdx++
-			}
-
 			partialWord = ""
 		} else {
 			tok.Text = t
@@ -359,7 +344,7 @@
 // classification
 func removeIgnorableTexts(s string) string {
 	var out []string
-	lines := strings.Split(s, "\n")
+	lines := strings.Split(strings.TrimRight(s, "\n"), "\n")
 	for _, l := range lines {
 		line := strings.TrimSpace(l)
 		var match bool
@@ -375,5 +360,5 @@
 			out = append(out, "")
 		}
 	}
-	return strings.Join(out, "\n")
+	return strings.Join(out, "\n") + "\n"
 }
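
A note on the final hunk: the restored removeIgnorableTexts trims a
trailing newline before splitting and re-appends exactly one "\n" after
joining. That matters because strings.Split on a string ending in "\n"
produces a trailing empty element, which would otherwise surface as a
spurious blank line. A quick standalone illustration:

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        fmt.Printf("%q\n", strings.Split("a\nb\n", "\n"))
        // ["a" "b" ""] -- trailing empty element
        fmt.Printf("%q\n", strings.Split(strings.TrimRight("a\nb\n", "\n"), "\n"))
        // ["a" "b"]
    }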