blob: 5e5f8cc7d9b0dce8fa8882f1621defcfe5943fee [file] [log] [blame]
// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package classifier
import (
"bytes"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"sort"
"strings"
)
// Match is the information about a single instance of a detected match.
type Match struct {
Name string
Confidence float64
MatchType string
Variant string
StartLine int
EndLine int
StartTokenIndex int
EndTokenIndex int
}
// Results captures the summary information and matches detected by the
// classifier.
type Results struct {
Matches Matches
TotalInputLines int
}
// Matches is a sortable slice of Match.
type Matches []*Match
// Swap two elements of Matches.
func (d Matches) Swap(i, j int) { d[i], d[j] = d[j], d[i] }
func (d Matches) Len() int { return len(d) }
func (d Matches) Less(i, j int) bool {
di, dj := d[i], d[j]
// Return matches ordered by confidence
if di.Confidence != dj.Confidence {
return di.Confidence > dj.Confidence
}
// Licenses of same confidence are ordered by their appearance
if di.StartTokenIndex != dj.StartTokenIndex {
return di.StartTokenIndex < dj.StartTokenIndex
}
// Should never get here, but tiebreak based on the larger license.
return di.EndTokenIndex > dj.EndTokenIndex
}
// Match reports instances of the supplied content in the corpus.
func (c *Classifier) match(in []byte) Results {
id := c.createTargetIndexedDocument(in)
firstPass := make(map[string]*indexedDocument)
for l, d := range c.docs {
sim := id.tokenSimilarity(d)
if c.tc.traceTokenize(l) {
c.tc.trace("Token similarity for %s: %.2f", l, sim)
}
if sim >= c.threshold {
firstPass[l] = d
}
}
if len(firstPass) == 0 {
return Results{
Matches: nil,
TotalInputLines: 0,
}
}
// Perform the expensive work of generating a searchset to look for token runs.
id.generateSearchSet(c.q)
var candidates Matches
for l, d := range firstPass {
matches := c.findPotentialMatches(d.s, id.s, c.threshold)
for _, m := range matches {
startIndex := m.TargetStart
endIndex := m.TargetEnd
conf, startOffset, endOffset := c.score(l, id, d, startIndex, endIndex)
if conf >= c.threshold && (endIndex-startIndex-startOffset-endOffset) > 0 {
candidates = append(candidates, &Match{
Name: LicenseName(l),
Variant: variantName(l),
MatchType: detectionType(l),
Confidence: conf,
StartLine: id.Tokens[startIndex+startOffset].Line,
EndLine: id.Tokens[endIndex-endOffset-1].Line,
StartTokenIndex: id.Tokens[startIndex+startOffset].Index,
EndTokenIndex: id.Tokens[endIndex-endOffset-1].Index,
})
}
}
}
sort.Sort(candidates)
retain := make([]bool, len(candidates))
for i, c := range candidates {
// Filter out overlapping licenses based primarily on confidence. Since
// the candidates slice is ordered by confidence, we look for overlaps and
// decide if we retain the record c.
// For each candidate, only add it to the report unless we have a
// higher-quality hit that contains these lines. In the case of two
// licenses having overlap, we consider 'token density' to break ties. If a
// less confident match of a larger license has more matching tokens than a
// perfect match of a smaller license, we want to keep that. This handles
// licenses that include another license as a subtext. NPL contains MPL
// as a concrete example.
keep := true
proposals := make(map[int]bool)
for j, o := range candidates {
if j == i {
break
}
// Make sure to only check containment on licenses that are still in consideration at this point.
if contains(c, o) && retain[j] {
// The license here can override a previous detection, but that isn't sufficient to be kept
// on its own. Consider the licenses Xnet, MPL-1.1 and NPL-1.1 in a file that just has MPL-1.1.
// The confidence rating on NPL-1.1 will cause Xnet to not be retained, which is correct, but it
// shouldn't be retained if the token confidence for MPL is higher than NPL since the NPL-specific
// bits are missing.
ctoks := float64(c.EndTokenIndex - c.StartTokenIndex)
otoks := float64(o.EndTokenIndex - o.StartTokenIndex)
cconf := ctoks * c.Confidence
oconf := otoks * o.Confidence
// If the two licenses are exactly the same confidence, that means we
// have an ambiguous detect and should retain both, so the caller can
// see and resolve the situation.
if cconf > oconf {
proposals[j] = false
} else if oconf > cconf {
keep = false
}
} else if overlaps(c, o) && retain[j] {
// if the ending and start lines exactly overlap, it's OK to keep both
if c.StartLine != o.EndLine {
keep = false
}
}
if !keep {
break
}
}
if keep {
retain[i] = true
for p, v := range proposals {
retain[p] = v
}
}
}
var out Matches
for i, keep := range retain {
if keep {
out = append(out, candidates[i])
}
}
return Results{
Matches: out,
TotalInputLines: id.Tokens[len(id.Tokens)-1].Line,
}
}
// Classifier provides methods for identifying open source licenses in text
// content.
type Classifier struct {
tc *TraceConfiguration
dict *dictionary
docs map[string]*indexedDocument
threshold float64
q int // The value of q for q-grams in this corpus
}
// NewClassifier creates a classifier with an empty corpus.
func NewClassifier(threshold float64) *Classifier {
classifier := &Classifier{
tc: new(TraceConfiguration),
dict: newDictionary(),
docs: make(map[string]*indexedDocument),
threshold: threshold,
q: computeQ(threshold),
}
return classifier
}
// Normalize takes input content and applies the following transforms to aid in
// identifying license content. The return value of this function is
// line-separated text which is the basis for position values returned by the
// classifier.
//
//
// 1. Breaks up long lines of text. This helps with detecting licenses like in
// TODO(wcn):URL reference
//
// 2. Certain ignorable texts are removed to aid matching blocks of text.
// Introductory lines such as "The MIT License" are removed. Copyright notices
// are removed since the parties are variable and shouldn't impact matching.
//
// It is NOT necessary to call this function to simply identify licenses in a
// file. It should only be called to aid presenting this information to the user
// in context (for example, creating diffs of differences to canonical
// licenses).
//
// It is an invariant of the classifier that calling Match(Normalize(in)) will
// return the same results as Match(in).
func (c *Classifier) Normalize(in []byte) []byte {
text := normalizeDoc(in, false)
doc := extractDoc(text, false)
var buf bytes.Buffer
switch len(doc.Tokens) {
case 0:
return nil
case 1:
buf.WriteString(doc.Tokens[0].Text)
return buf.Bytes()
}
prevLine := 1
buf.WriteString(doc.Tokens[0].Text)
for _, t := range doc.Tokens[1:] {
// Only write out an EOL token that incremented the line
if t.Line == prevLine+1 {
buf.WriteString("\n")
}
// Only write tokens that aren't EOL
if t.Text != eol {
// Only put a space between tokens if the previous token was on the same
// line. This prevents spaces after an EOL
if t.Line == prevLine {
buf.WriteString(" ")
}
buf.WriteString(t.Text)
}
prevLine = t.Line
}
return buf.Bytes()
}
// LoadLicenses adds the contents of the supplied directory to the corpus of the
// classifier.
func (c *Classifier) LoadLicenses(dir string) error {
var files []string
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if err != nil {
return nil
}
if !strings.HasSuffix(path, "txt") {
return nil
}
files = append(files, path)
return nil
})
if err != nil {
return err
}
for _, f := range files {
relativePath := strings.Replace(f, dir, "", 1)
sep := fmt.Sprintf("%c", os.PathSeparator)
segments := strings.Split(relativePath, sep)
category, name, variant := segments[1], segments[2], segments[3]
b, err := ioutil.ReadFile(f)
if err != nil {
return err
}
c.AddContent(category, name, variant, []byte(string(b)))
}
return nil
}
// SetTraceConfiguration installs a tracing configuration for the classifier.
func (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) {
c.tc = in
c.tc.init()
}
// Match finds matches within an unknown text. This will not modify the contents
// of the supplied byte slice.
func (c *Classifier) Match(in []byte) Results {
return c.match(in)
}
// MatchFrom finds matches within the read content.
func (c *Classifier) MatchFrom(in io.Reader) (Results, error) {
b, err := ioutil.ReadAll(in)
if err != nil {
return Results{}, fmt.Errorf("classifier couldn't read: %w", err)
}
return c.Match(b), nil
}
func detectionType(in string) string {
splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator))
return splits[0]
}
func variantName(in string) string {
splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator))
return splits[2]
}
// LicenseName produces the output name for a license, removing the internal structure
// of the filename in use.
func LicenseName(in string) string {
splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator))
return splits[1]
}
// contains returns true iff b is completely inside a
func contains(a, b *Match) bool {
return a.StartLine <= b.StartLine && a.EndLine >= b.EndLine
}
// returns true iff b <= a <= c
func between(a, b, c int) bool {
return b <= a && a <= c
}
// returns true iff the ranges covered by a and b overlap.
func overlaps(a, b *Match) bool {
return between(a.StartLine, b.StartLine, b.EndLine) || between(a.EndLine, b.StartLine, b.EndLine)
}