v2/classifier.go - third_party/github.com/google/licenseclassifier - Git at Google

 // Copyright 2020 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package classifier

 import (
 	"bytes"
 	"fmt"
 	"io"
 	"io/ioutil"
 	"os"
 	"path"
 	"path/filepath"
 	"sort"
 	"strings"
 )

 // Match is the information about a single instance of a detected match.
 type Match struct {
 	Name            string
 	Confidence      float64
 	MatchType       string
 	StartLine       int
 	EndLine         int
 	StartTokenIndex int
 	EndTokenIndex   int
 }

 // Results captures the summary information and matches detected by the
 // classifier.
 type Results struct {
 	Matches         Matches
 	TotalInputLines int
 }

 // Matches is a sortable slice of Match.
 type Matches []*Match

 // Swap two elements of Matches.
 func (d Matches) Swap(i, j int) { d[i], d[j] = d[j], d[i] }
 func (d Matches) Len() int      { return len(d) }
 func (d Matches) Less(i, j int) bool {
 	di, dj := d[i], d[j]
 	// Return matches ordered by confidence
 	if di.Confidence != dj.Confidence {
 		return di.Confidence > dj.Confidence
 	}
 	// Licenses of same confidence are ordered by their appearance
 	if di.StartTokenIndex != dj.StartTokenIndex {
 		return di.StartTokenIndex < dj.StartTokenIndex
 	}
 	// Should never get here, but tiebreak based on the larger license.
 	return di.EndTokenIndex > dj.EndTokenIndex
 }

 // Match reports instances of the supplied content in the corpus.
 func (c *Classifier) match(in []byte) Results {
 	id := c.createTargetIndexedDocument(in)

 	firstPass := make(map[string]*indexedDocument)
 	for l, d := range c.docs {
 		sim := id.tokenSimilarity(d)
 		if sim >= c.threshold {
 			firstPass[l] = d
 		}
 	}

 	if len(firstPass) == 0 {
 		return Results{
 			Matches:         nil,
 			TotalInputLines: 0,
 		}
 	}

 	// Perform the expensive work of generating a searchset to look for token runs.
 	id.generateSearchSet(c.q)

 	var candidates Matches
 	for l, d := range firstPass {
 		matches := c.findPotentialMatches(d.s, id.s, c.threshold)
 		for _, m := range matches {
 			startIndex := m.TargetStart
 			endIndex := m.TargetEnd
 			conf, startOffset, endOffset := c.score(l, id, d, startIndex, endIndex)
 			if conf >= c.threshold && (endIndex-startIndex-startOffset-endOffset) > 0 {
 				candidates = append(candidates, &Match{
 					Name:            LicenseName(l),
 					MatchType:       detectionType(l),
 					Confidence:      conf,
 					StartLine:       id.Tokens[startIndex+startOffset].Line,
 					EndLine:         id.Tokens[endIndex-endOffset-1].Line,
 					StartTokenIndex: id.Tokens[startIndex+startOffset].Index,
 					EndTokenIndex:   id.Tokens[endIndex-endOffset-1].Index,
 				})
 			}

 		}
 	}
 	sort.Sort(candidates)
 	retain := make([]bool, len(candidates))
 	for i, c := range candidates {
 		// Filter out overlapping licenses based primarily on confidence. Since
 		// the candidates slice is ordered by confidence, we look for overlaps and
 		// decide if we retain the record c.

 		// For each candidate, only add it to the report unless we have a
 		// higher-quality hit that contains these lines. In the case of two
 		// licenses having overlap, we consider 'token density' to break ties. If a
 		// less confident match of a larger license has more matching tokens than a
 		// perfect match of a smaller license, we want to keep that. This handles
 		// licenses that include another license as a subtext. NPL contains MPL
 		// as a concrete example.

 		keep := true
 		proposals := make(map[int]bool)
 		for j, o := range candidates {
 			if j == i {
 				break
 			}
 			// Make sure to only check containment on licenses that are still in consideration at this point.
 			if contains(c, o) && retain[j] {
 				// The license here can override a previous detection, but that isn't sufficient to be kept
 				// on its own. Consider the licenses Xnet, MPL-1.1 and NPL-1.1 in a file that just has MPL-1.1.
 				// The confidence rating on NPL-1.1 will cause Xnet to not be retained, which is correct, but it
 				// shouldn't be retained if the token confidence for MPL is higher than NPL since the NPL-specific
 				// bits are missing.

 				ctoks := float64(c.EndTokenIndex - c.StartTokenIndex)
 				otoks := float64(o.EndTokenIndex - o.StartTokenIndex)
 				cconf := ctoks * c.Confidence
 				oconf := otoks * o.Confidence

 				// If the two licenses are exactly the same confidence, that means we
 				// have an ambiguous detect and should retain both, so the caller can
 				// see and resolve the situation.
 				if cconf > oconf {
 					proposals[j] = false
 				} else if oconf > cconf {
 					keep = false
 				}
 			} else if overlaps(c, o) && retain[j] {
 				// if the ending and start lines exactly overlap, it's OK to keep both
 				if c.StartLine == o.EndLine {
 					keep = true
 				} else {
 					keep = false
 				}
 			}

 		}
 		if keep {
 			retain[i] = true
 			for p, v := range proposals {
 				retain[p] = v
 			}
 		}
 	}

 	var out Matches
 	for i, keep := range retain {
 		if keep {
 			out = append(out, candidates[i])
 		}
 	}
 	return Results{
 		Matches:         out,
 		TotalInputLines: id.Tokens[len(id.Tokens)-1].Line,
 	}
 }

 // Classifier provides methods for identifying open source licenses in text
 // content.
 type Classifier struct {
 	tc        *TraceConfiguration
 	dict      *dictionary
 	docs      map[string]*indexedDocument
 	threshold float64
 	q         int // The value of q for q-grams in this corpus
 }

 // NewClassifier creates a classifier with an empty corpus.
 func NewClassifier(threshold float64) *Classifier {
 	classifier := &Classifier{
 		tc:        new(TraceConfiguration),
 		dict:      newDictionary(),
 		docs:      make(map[string]*indexedDocument),
 		threshold: threshold,
 		q:         computeQ(threshold),
 	}
 	return classifier
 }

 // Normalize takes input content and applies the following transforms to aid in
 // identifying license content. The return value of this function is
 // line-separated text which is the basis for position values returned by the
 // classifier.
 //
 //
 // 1. Breaks up long lines of text. This helps with detecting licenses like in
 // TODO(wcn):URL reference
 //
 // 2. Certain ignorable texts are removed to aid matching blocks of text.
 // Introductory lines such as "The MIT License" are removed. Copyright notices
 // are removed since the parties are variable and shouldn't impact matching.
 //
 // It is NOT necessary to call this function to simply identify licenses in a
 // file. It should only be called to aid presenting this information to the user
 // in context (for example, creating diffs of differences to canonical
 // licenses).
 //
 // It is an invariant of the classifier that calling Match(Normalize(in)) will
 // return the same results as Match(in).
 func (c *Classifier) Normalize(in []byte) []byte {
 	text := normalizeDoc(in, false)
 	doc := extractDoc(text)

 	var buf bytes.Buffer

 	switch len(doc.Tokens) {
 	case 0:
 		return nil
 	case 1:
 		buf.WriteString(doc.Tokens[0].Text)
 		return buf.Bytes()
 	}

 	buf.WriteString(doc.Tokens[0].Text)

 	for _, t := range doc.Tokens[1:] {
 		buf.WriteString(" ")
 		buf.WriteString(t.Text)
 	}
 	return buf.Bytes()
 }

 // LoadLicenses adds the contents of the supplied directory to the corpus of the
 // classifier.
 func (c *Classifier) LoadLicenses(dir string) error {
 	var files []string
 	err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
 		if err != nil {
 			return nil
 		}
 		if !strings.HasSuffix(path, "txt") {
 			return nil
 		}
 		files = append(files, path)
 		return nil
 	})
 	if err != nil {
 		return err
 	}

 	for _, f := range files {
 		_, name := path.Split(f)
 		name = strings.Replace(name, ".txt", "", 1)
 		b, err := ioutil.ReadFile(f)
 		if err != nil {
 			return err
 		}

 		c.AddContent(name, []byte(string(b)))
 	}
 	return nil
 }

 // SetTraceConfiguration installs a tracing configuration for the classifier.
 func (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) {
 	c.tc = in
 	c.tc.init()
 }

 // Match finds matches within an unknown text. This will not modify the contents
 // of the supplied byte slice.
 func (c *Classifier) Match(in []byte) Results {
 	return c.match(in)
 }

 // MatchFrom finds matches within the read content.
 func (c *Classifier) MatchFrom(in io.Reader) (Results, error) {
 	b, err := ioutil.ReadAll(in)
 	if err != nil {
 		return Results{}, fmt.Errorf("classifier couldn't read: %w", err)
 	}
 	return c.Match(b), nil
 }

 func detectionType(in string) string {
 	if strings.Index(in, ".header") != -1 {
 		return "Header"
 	}
 	return "License"
 }

 // LicenseName produces the output name for a license, removing the internal structure
 // of the filename in use.
 func LicenseName(in string) string {
 	out := in
 	if idx := strings.Index(out, ".txt"); idx != -1 {
 		out = out[0:idx]
 	}
 	if idx := strings.Index(out, "_"); idx != -1 {
 		out = out[0:idx]
 	}
 	if idx := strings.Index(out, ".header"); idx != -1 {
 		out = out[0:idx]
 	}
 	return out
 }

 // contains returns true iff b is completely inside a
 func contains(a, b *Match) bool {
 	return a.StartLine <= b.StartLine && a.EndLine >= b.EndLine
 }

 // returns true iff b <= a <= c
 func between(a, b, c int) bool {
 	return b <= a && a <= c
 }

 // returns true iff the ranges covered by a and b overlap.
 func overlaps(a, b *Match) bool {
 	return between(a.StartLine, b.StartLine, b.EndLine) || between(a.EndLine, b.StartLine, b.EndLine)
 }
	// Copyright 2020 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package classifier

	import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path"
	"path/filepath"
	"sort"
	"strings"
	)

	// Match is the information about a single instance of a detected match.
	type Match struct {
	Name string
	Confidence float64
	MatchType string
	StartLine int
	EndLine int
	StartTokenIndex int
	EndTokenIndex int
	}

	// Results captures the summary information and matches detected by the
	// classifier.
	type Results struct {
	Matches Matches
	TotalInputLines int
	}

	// Matches is a sortable slice of Match.
	type Matches []*Match

	// Swap two elements of Matches.
	func (d Matches) Swap(i, j int) { d[i], d[j] = d[j], d[i] }
	func (d Matches) Len() int { return len(d) }
	func (d Matches) Less(i, j int) bool {
	di, dj := d[i], d[j]
	// Return matches ordered by confidence
	if di.Confidence != dj.Confidence {
	return di.Confidence > dj.Confidence
	}
	// Licenses of same confidence are ordered by their appearance
	if di.StartTokenIndex != dj.StartTokenIndex {
	return di.StartTokenIndex < dj.StartTokenIndex
	}
	// Should never get here, but tiebreak based on the larger license.
	return di.EndTokenIndex > dj.EndTokenIndex
	}

	// Match reports instances of the supplied content in the corpus.
	func (c *Classifier) match(in []byte) Results {
	id := c.createTargetIndexedDocument(in)

	firstPass := make(map[string]*indexedDocument)
	for l, d := range c.docs {
	sim := id.tokenSimilarity(d)
	if sim >= c.threshold {
	firstPass[l] = d
	}
	}

	if len(firstPass) == 0 {
	return Results{
	Matches: nil,
	TotalInputLines: 0,
	}
	}

	// Perform the expensive work of generating a searchset to look for token runs.
	id.generateSearchSet(c.q)

	var candidates Matches
	for l, d := range firstPass {
	matches := c.findPotentialMatches(d.s, id.s, c.threshold)
	for _, m := range matches {
	startIndex := m.TargetStart
	endIndex := m.TargetEnd
	conf, startOffset, endOffset := c.score(l, id, d, startIndex, endIndex)
	if conf >= c.threshold && (endIndex-startIndex-startOffset-endOffset) > 0 {
	candidates = append(candidates, &Match{
	Name: LicenseName(l),
	MatchType: detectionType(l),
	Confidence: conf,
	StartLine: id.Tokens[startIndex+startOffset].Line,
	EndLine: id.Tokens[endIndex-endOffset-1].Line,
	StartTokenIndex: id.Tokens[startIndex+startOffset].Index,
	EndTokenIndex: id.Tokens[endIndex-endOffset-1].Index,
	})
	}

	}
	}
	sort.Sort(candidates)
	retain := make([]bool, len(candidates))
	for i, c := range candidates {
	// Filter out overlapping licenses based primarily on confidence. Since
	// the candidates slice is ordered by confidence, we look for overlaps and
	// decide if we retain the record c.

	// For each candidate, only add it to the report unless we have a
	// higher-quality hit that contains these lines. In the case of two
	// licenses having overlap, we consider 'token density' to break ties. If a
	// less confident match of a larger license has more matching tokens than a
	// perfect match of a smaller license, we want to keep that. This handles
	// licenses that include another license as a subtext. NPL contains MPL
	// as a concrete example.

	keep := true
	proposals := make(map[int]bool)
	for j, o := range candidates {
	if j == i {
	break
	}
	// Make sure to only check containment on licenses that are still in consideration at this point.
	if contains(c, o) && retain[j] {
	// The license here can override a previous detection, but that isn't sufficient to be kept
	// on its own. Consider the licenses Xnet, MPL-1.1 and NPL-1.1 in a file that just has MPL-1.1.
	// The confidence rating on NPL-1.1 will cause Xnet to not be retained, which is correct, but it
	// shouldn't be retained if the token confidence for MPL is higher than NPL since the NPL-specific
	// bits are missing.

	ctoks := float64(c.EndTokenIndex - c.StartTokenIndex)
	otoks := float64(o.EndTokenIndex - o.StartTokenIndex)
	cconf := ctoks * c.Confidence
	oconf := otoks * o.Confidence

	// If the two licenses are exactly the same confidence, that means we
	// have an ambiguous detect and should retain both, so the caller can
	// see and resolve the situation.
	if cconf > oconf {
	proposals[j] = false
	} else if oconf > cconf {
	keep = false
	}
	} else if overlaps(c, o) && retain[j] {
	// if the ending and start lines exactly overlap, it's OK to keep both
	if c.StartLine == o.EndLine {
	keep = true
	} else {
	keep = false
	}
	}

	}
	if keep {
	retain[i] = true
	for p, v := range proposals {
	retain[p] = v
	}
	}
	}

	var out Matches
	for i, keep := range retain {
	if keep {
	out = append(out, candidates[i])
	}
	}
	return Results{
	Matches: out,
	TotalInputLines: id.Tokens[len(id.Tokens)-1].Line,
	}
	}

	// Classifier provides methods for identifying open source licenses in text
	// content.
	type Classifier struct {
	tc *TraceConfiguration
	dict *dictionary
	docs map[string]*indexedDocument
	threshold float64
	q int // The value of q for q-grams in this corpus
	}

	// NewClassifier creates a classifier with an empty corpus.
	func NewClassifier(threshold float64) *Classifier {
	classifier := &Classifier{
	tc: new(TraceConfiguration),
	dict: newDictionary(),
	docs: make(map[string]*indexedDocument),
	threshold: threshold,
	q: computeQ(threshold),
	}
	return classifier
	}

	// Normalize takes input content and applies the following transforms to aid in
	// identifying license content. The return value of this function is
	// line-separated text which is the basis for position values returned by the
	// classifier.
	//
	//
	// 1. Breaks up long lines of text. This helps with detecting licenses like in
	// TODO(wcn):URL reference
	//
	// 2. Certain ignorable texts are removed to aid matching blocks of text.
	// Introductory lines such as "The MIT License" are removed. Copyright notices
	// are removed since the parties are variable and shouldn't impact matching.
	//
	// It is NOT necessary to call this function to simply identify licenses in a
	// file. It should only be called to aid presenting this information to the user
	// in context (for example, creating diffs of differences to canonical
	// licenses).
	//
	// It is an invariant of the classifier that calling Match(Normalize(in)) will
	// return the same results as Match(in).
	func (c *Classifier) Normalize(in []byte) []byte {
	text := normalizeDoc(in, false)
	doc := extractDoc(text)

	var buf bytes.Buffer

	switch len(doc.Tokens) {
	case 0:
	return nil
	case 1:
	buf.WriteString(doc.Tokens[0].Text)
	return buf.Bytes()
	}

	buf.WriteString(doc.Tokens[0].Text)

	for _, t := range doc.Tokens[1:] {
	buf.WriteString(" ")
	buf.WriteString(t.Text)
	}
	return buf.Bytes()
	}

	// LoadLicenses adds the contents of the supplied directory to the corpus of the
	// classifier.
	func (c *Classifier) LoadLicenses(dir string) error {
	var files []string
	err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
	if err != nil {
	return nil
	}
	if !strings.HasSuffix(path, "txt") {
	return nil
	}
	files = append(files, path)
	return nil
	})
	if err != nil {
	return err
	}

	for _, f := range files {
	_, name := path.Split(f)
	name = strings.Replace(name, ".txt", "", 1)
	b, err := ioutil.ReadFile(f)
	if err != nil {
	return err
	}

	c.AddContent(name, []byte(string(b)))
	}
	return nil
	}

	// SetTraceConfiguration installs a tracing configuration for the classifier.
	func (c Classifier) SetTraceConfiguration(in TraceConfiguration) {
	c.tc = in
	c.tc.init()
	}

	// Match finds matches within an unknown text. This will not modify the contents
	// of the supplied byte slice.
	func (c *Classifier) Match(in []byte) Results {
	return c.match(in)
	}

	// MatchFrom finds matches within the read content.
	func (c *Classifier) MatchFrom(in io.Reader) (Results, error) {
	b, err := ioutil.ReadAll(in)
	if err != nil {
	return Results{}, fmt.Errorf("classifier couldn't read: %w", err)
	}
	return c.Match(b), nil
	}

	func detectionType(in string) string {
	if strings.Index(in, ".header") != -1 {
	return "Header"
	}
	return "License"
	}

	// LicenseName produces the output name for a license, removing the internal structure
	// of the filename in use.
	func LicenseName(in string) string {
	out := in
	if idx := strings.Index(out, ".txt"); idx != -1 {
	out = out[0:idx]
	}
	if idx := strings.Index(out, "_"); idx != -1 {
	out = out[0:idx]
	}
	if idx := strings.Index(out, ".header"); idx != -1 {
	out = out[0:idx]
	}
	return out
	}

	// contains returns true iff b is completely inside a
	func contains(a, b *Match) bool {
	return a.StartLine <= b.StartLine && a.EndLine >= b.EndLine
	}

	// returns true iff b <= a <= c
	func between(a, b, c int) bool {
	return b <= a && a <= c
	}

	// returns true iff the ranges covered by a and b overlap.
	func overlaps(a, b *Match) bool {
	return between(a.StartLine, b.StartLine, b.EndLine) \|\| between(a.EndLine, b.StartLine, b.EndLine)
	}