Automated g4 rollback of changelist 415285962.

*** Reason for rollback ***

Crashing services with an index-out-of-bounds panic (https://coroner.corp.google.com/#/crashes?user=compliance-presubmit3p)
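
The panic is plausibly the unguarded last-element access that this rollback
removes from v2/tokenizer.go (the doc.Tokens[len(doc.Tokens)-1] check in
extractDoc): if no token has been emitted yet when that branch runs, the
index is -1. A minimal sketch of that failure mode, using simplified copies
of the token type and eol sentinel named in the diff below (this is an
inference from the diff, not a confirmed root cause):

    package main

    // Simplified stand-ins for the token type and eol sentinel that the
    // diff below references in v2/tokenizer.go.
    type token struct {
        Text string
        Line int
    }

    const eol = "\n"

    func main() {
        var tokens []*token // empty, as before any token has been appended
        // The rolled-back code indexed the last element unconditionally;
        // with an empty slice this panics at run time with
        // "index out of range [-1]".
        if tokens[len(tokens)-1].Text != eol {
            return
        }
    }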

*** Original change description ***

Fixes newline handling so that Normalize preserves the newline characters
of the original input.

The code that tokenized newlines had some minor bugs that introduced
spurious newline tokens into the token stream. This wasn't a problem
before, since the newline tokens were only used inside the tokenizer to
detect header constructs and de-hyphenate words, and were always removed
from the token stream passed to calling functions. This meant that the
token stream Normalize...

***

PiperOrigin-RevId: 416154051
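
For context before the diff: the restored Normalize (the `+` side of the
v2/classifier.go hunk) joins every token after the first with a single
space, so newlines in the input collapse to spaces, whereas the
rolled-back version re-emitted "\n" from EOL tokens. A minimal sketch of
the restored joining loop, with a plain []string standing in for the real
token stream:

    package sketch

    import "bytes"

    // joinTokens mirrors the restored loop in Classifier.Normalize: the
    // first token is written as-is and every later token is prefixed
    // with a single space, collapsing all original line breaks.
    func joinTokens(texts []string) string {
        if len(texts) == 0 {
            return ""
        }
        var buf bytes.Buffer
        buf.WriteString(texts[0])
        for _, t := range texts[1:] {
            buf.WriteString(" ")
            buf.WriteString(t)
        }
        return buf.String()
    }

For example, joinTokens([]string{"Preserving", "line", "breaks"}) yields
"Preserving line breaks", which is why the newline-preservation tests are
deleted below.
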
diff --git a/v2/classifier.go b/v2/classifier.go
index 05937de..0668254 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -223,7 +223,7 @@
 // return the same results as Match(in).
 func (c *Classifier) Normalize(in []byte) []byte {
 	text := normalizeDoc(in, false)
-	doc := extractDoc(text, false)
+	doc := extractDoc(text)
 
 	var buf bytes.Buffer
 
@@ -235,25 +235,11 @@
 		return buf.Bytes()
 	}
 
-	prevLine := 1
 	buf.WriteString(doc.Tokens[0].Text)
+
 	for _, t := range doc.Tokens[1:] {
-		// Only write out an EOL token that incremented the line
-		if t.Line == prevLine+1 {
-			buf.WriteString("\n")
-		}
-
-		// Only write tokens that aren't EOL
-		if t.Text != eol {
-			// Only put a space between tokens if the previous token was on the same
-			// line. This prevents spaces after an EOL
-			if t.Line == prevLine {
-				buf.WriteString(" ")
-			}
-			buf.WriteString(t.Text)
-		}
-
-		prevLine = t.Line
+		buf.WriteString(" ")
+		buf.WriteString(t.Text)
 	}
 	return buf.Bytes()
 }
diff --git a/v2/classifier_test.go b/v2/classifier_test.go
index f9d2c74..cb613e5 100644
--- a/v2/classifier_test.go
+++ b/v2/classifier_test.go
@@ -327,34 +327,6 @@
 			input: "   License  ",
 			want:  "License",
 		},
-		{
-			// This tests that the line breaks in the input text are properly
-			// preserved, which is important for visual diffing.
-			input: `Preserving
-line
-
-breaks is important`,
-			want: `Preserving
-line
-
-breaks is important`,
-		},
-		{
-			// This tests that soft EOL functionality doesn't affect normalized output
-			input: `This is a sentence looking construct. This is another sentence. What happens?`,
-			want:  `This is a sentence looking construct This is another sentence What happens`,
-		},
-		{
-			input: `header
-........................ This is oddly formatted`,
-			want: `header
-This is oddly formatted`,
-		},
-		{
-			input: `baseball basket-
-ball football`,
-			want: "baseball basketball\nfootball",
-		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.input, func(t *testing.T) {
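
The table entries removed above encoded newline-preserving expectations
that no longer hold after this rollback. A hedged illustration of how the
de-hyphenation case would presumably read under the restored space-joining
Normalize (inferred from the hunks in this diff; this entry is not part of
the change):

    {
        input: `baseball basket-
ball football`,
        want: "baseball basketball football", // newline collapses to a space
    },
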
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 001eaba..eaa0479 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -83,10 +83,10 @@
 func tokenize(in []byte) *document {
 	// tokenize produces a document from the input content.
 	text := normalizeDoc(in, true)
-	return extractDoc(text, true)
+	return extractDoc(text)
 }
 
-func extractDoc(text string, removeEol bool) *document {
+func extractDoc(text string) *document {
 	var doc document
 	// Iterate on a line-by-line basis.
 	i := 0
@@ -138,13 +138,9 @@
 					// follow this text. This resolves problems with licenses that are a
 					// very long line of text, motivated by
 					// https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109
-					//
-					// Don't do this if the previous token was already an EOL
-					if doc.Tokens[len(doc.Tokens)-1].Text != eol {
-						doc.Tokens = append(doc.Tokens, &token{
-							Text: eol,
-							Line: i + 1})
-					}
+					doc.Tokens = append(doc.Tokens, &token{
+						Text: eol,
+						Line: i + 1})
 				}
 
 				tok := token{
@@ -159,13 +155,17 @@
 				firstInLine = false
 			}
 		}
+		tok := token{
+			Text: eol,
+			Line: i + 1,
+		}
+		doc.Tokens = append(doc.Tokens, &tok)
 	}
-
-	doc.Tokens = cleanupTokens(doc.Tokens, removeEol)
+	doc.Tokens = cleanupTokens(doc.Tokens)
 	return &doc
 }
 
-func cleanupTokens(in []*token, removeEol bool) []*token {
+func cleanupTokens(in []*token) []*token {
 	// This routine performs sanitization of tokens. If it is a header-looking
 	// token (but not a version number) starting a line, it is removed.
 	// Hyphenated words are reassembled.
@@ -179,15 +179,6 @@
 		}
 		if tok.Text == eol {
 			firstInLine = true
-			if removeEol {
-				continue
-			}
-			// If we are reconstructing a hyphenated word, don't append the EOL
-			// now, do it when the word is reconstructed.
-			if partialWord == "" {
-				out = append(out, &token{Text: eol, Line: tok.Line, Index: tokIdx})
-				tokIdx++
-			}
 			continue
 		}
 		firstInLine = false
@@ -204,12 +195,6 @@
 			tp.Previous = ""
 			out = append(out, tp)
 			tokIdx++
-			if !removeEol {
-				// Append the EOL now that the whole word is recovered
-				out = append(out, &token{Text: eol, Line: tp.Line, Index: tokIdx})
-				tokIdx++
-			}
-
 			partialWord = ""
 		} else {
 			tok.Text = t
@@ -359,7 +344,7 @@
 // classification
 func removeIgnorableTexts(s string) string {
 	var out []string
-	lines := strings.Split(s, "\n")
+	lines := strings.Split(strings.TrimRight(s, "\n"), "\n")
 	for _, l := range lines {
 		line := strings.TrimSpace(l)
 		var match bool
@@ -375,5 +360,5 @@
 			out = append(out, "")
 		}
 	}
-	return strings.Join(out, "\n")
+	return strings.Join(out, "\n") + "\n"
 }
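
A note on the final hunk: the restored removeIgnorableTexts trims a
trailing newline before splitting and re-appends exactly one "\n" after
joining. That matters because strings.Split on a string ending in "\n"
produces a trailing empty element, which would otherwise surface as a
spurious blank line. A quick standalone illustration:

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        fmt.Printf("%q\n", strings.Split("a\nb\n", "\n"))
        // ["a" "b" ""] -- trailing empty element
        fmt.Printf("%q\n", strings.Split(strings.TrimRight("a\nb\n", "\n"), "\n"))
        // ["a" "b"]
    }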