Change the licenseclassifier return type to a summary structure rather than a raw list of matches. This allows us to return additional metadata about the original input to the classifier. One datum currently included in this summary is the number of lines of input presented to the classifier, which is necessary to identify ranges of text that did not contain a license. This change consolidates the uses of the third-party classifier API around a common interface for the compliance team and provides a common stub, eliminating many ad-hoc stub implementations. This CL was tested via global presubmit to ensure that uses outside of the compliance team were not negatively impacted. *** Change 388303070 PiperOrigin-RevId: 388973411

commit: 944bfc450d2ff7a35690262337efc3a3154cf94f [log] [tgz]
author: Bill Neubauer <wcn@google.com> Thu Aug 05 10:54:58 2021 -0700
committer: Bill Neubauer <wcn@google.com> Wed Mar 16 15:31:27 2022 -0700
tree: 921dc8abe87173ed269cc5c0f56bb7fe11fdb270
parent: ef8812216d51a3a43a3e9409169f8ba06a512766 [diff]
diff --git a/v2/classifier.go b/v2/classifier.go
index 89fa74c..f970543 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go

@@ -36,6 +36,13 @@
 	EndTokenIndex   int
 }
 
+// Results captures the summary information and matches detected by the
+// classifier.
+type Results struct {
+	Matches         Matches
+	TotalInputLines int
+}
+
 // Matches is a sortable slice of Match.
 type Matches []*Match
 
@@ -57,7 +64,7 @@
 }
 
 // Match reports instances of the supplied content in the corpus.
-func (c *Classifier) match(in []byte) Matches {
+func (c *Classifier) match(in []byte) Results {
 	id := c.createTargetIndexedDocument(in)
 
 	firstPass := make(map[string]*indexedDocument)
@@ -69,7 +76,10 @@
 	}
 
 	if len(firstPass) == 0 {
-		return nil
+		return Results{
+			Matches:         nil,
+			TotalInputLines: 0,
+		}
 	}
 
 	// Perform the expensive work of generating a searchset to look for token runs.
@@ -162,7 +172,10 @@
 			out = append(out, candidates[i])
 		}
 	}
-	return out
+	return Results{
+		Matches:         out,
+		TotalInputLines: id.Tokens[len(id.Tokens)-1].Line,
+	}
 }
 
 // Classifier provides methods for identifying open source licenses in text
@@ -226,15 +239,15 @@
 
 // Match finds matches within an unknown text. This will not modify the contents
 // of the supplied byte slice.
-func (c *Classifier) Match(in []byte) Matches {
+func (c *Classifier) Match(in []byte) Results {
 	return c.match(in)
 }
 
 // MatchFrom finds matches within the read content.
-func (c *Classifier) MatchFrom(in io.Reader) (Matches, error) {
+func (c *Classifier) MatchFrom(in io.Reader) (Results, error) {
 	b, err := ioutil.ReadAll(in)
 	if err != nil {
-		return nil, fmt.Errorf("classifier couldn't read: %w", err)
+		return Results{}, fmt.Errorf("classifier couldn't read: %w", err)
 	}
 	return c.Match(b), nil
 }

diff --git a/v2/classifier_test.go b/v2/classifier_test.go
index 5cb9dbf..b52b283 100644
--- a/v2/classifier_test.go
+++ b/v2/classifier_test.go

@@ -75,7 +75,7 @@
 		s := readScenario(f)
 
 		m := c.Match(s.data)
-		checkMatches(t, m, f, s.expected)
+		checkMatches(t, m.Matches, f, s.expected)
 	}
 }
 
@@ -236,7 +236,7 @@
 		if err != nil {
 			t.Errorf("unexpected error: %v", err)
 		}
-		checkMatches(t, m, f, s.expected)
+		checkMatches(t, m.Matches, f, s.expected)
 	}
 }
commit	944bfc450d2ff7a35690262337efc3a3154cf94f	[log] [tgz]
author	Bill Neubauer <wcn@google.com>	Thu Aug 05 10:54:58 2021 -0700
committer	Bill Neubauer <wcn@google.com>	Wed Mar 16 15:31:27 2022 -0700
tree	921dc8abe87173ed269cc5c0f56bb7fe11fdb270
parent	ef8812216d51a3a43a3e9409169f8ba06a512766 [diff]