blob: 34dffb5e92cfd18e7cc5b3b9bb531cc481fa5904 [file] [log] [blame]
// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package classifier
import (
"strings"
"unicode"
"github.com/davecgh/go-spew/spew"
"github.com/sergi/go-diff/diffmatchpatch"
)
// return values for the distance function that explain why a diff
// is not an acceptable match for the source document.
const (
versionChange = -1
introducedPhraseChange = -2
lesserGPLChange = -3
)
// score computes a metric of similarity between the known and unknown
// document, including the offsets into the unknown that yield the content
// generating the computed similarity.
func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) {
if c.tc.traceScoring(known.s.origin) {
c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd)
}
knownLength := known.size()
diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)
start, end := diffRange(known.norm, diffs)
distance := scoreDiffs(id, diffs[start:end])
if c.tc.traceScoring(known.s.origin) {
c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end]))
}
if distance < 0 {
// If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match.
if c.tc.traceScoring(known.s.origin) {
c.tc.trace("Distance result %v, rejected match", distance)
}
return 0.0, 0, 0
}
// Applying the diffRange-generated offsets provides the run of text from the
// target most closely correlated to the source. This is the process for
// compensating for errors from the searchset. With the reduced text, we
// compute the final confidence score and exact text locations for the
// matched content.
// The diff slice consists of three regions: an optional deletion diff for
// target text before the source, n elements that represent the diff between
// the source and target, and an optional deletion diff for text after the
// target. The helper functions identify the portions of the slice
// corresponding to those regions. This results in a more accurate
// confidence score and better position detection of the source in the
// target.
conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:])
if c.tc.traceScoring(known.s.origin) {
c.tc.trace("Score result: %v [%d-%d]", conf, so, eo)
}
return conf, so, eo
}
// confidencePercentage computes a confidence match score for the lengths,
// handling the cases where source and target lengths differ.
func confidencePercentage(klen, distance int) float64 {
// No text is matched at 100% confidence (avoid divide by zero).
if klen == 0 {
return 1.0
}
// Return a computed fractional match against the known text.
return 1.0 - float64(distance)/float64(klen)
}
// diffLevenshteinWord computes word-based Levenshtein count.
func diffLevenshteinWord(diffs []diffmatchpatch.Diff) int {
levenshtein := 0
insertions := 0
deletions := 0
for _, aDiff := range diffs {
switch aDiff.Type {
case diffmatchpatch.DiffInsert:
insertions += wordLen(aDiff.Text)
case diffmatchpatch.DiffDelete:
deletions += wordLen(aDiff.Text)
case diffmatchpatch.DiffEqual:
// A deletion and an insertion is one substitution.
levenshtein += max(insertions, deletions)
insertions = 0
deletions = 0
}
}
levenshtein += max(insertions, deletions)
return levenshtein
}
func isVersionNumber(in string) bool {
for _, r := range in {
if !unicode.IsDigit(r) && r != '.' {
return false
}
}
return true
}
// scoreDiffs returns a score rating the acceptability of these diffs. A
// negative value means that the changes represented by the diff are not an
// acceptable transformation since it would change the underlying license. A
// positive value indicates the Levenshtein word distance.
func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
// We make a pass looking for unacceptable substitutions
// Delete diffs are always ordered before insert diffs. This is leveraged to
// analyze a change by checking an insert against the delete text that was
// previously cached.
prevText := ""
prevDelete := ""
for i, diff := range diffs {
text := diff.Text
switch diff.Type {
case diffmatchpatch.DiffInsert:
num := text
if i := strings.Index(num, " "); i != -1 {
num = num[0:i]
}
if isVersionNumber(num) && strings.HasSuffix(prevText, "version") {
if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") {
return versionChange
}
}
// There are certain phrases that can't be introduced to make a license
// hit. TODO: would like to generate this programmatically. Most of
// these are words or phrases that appear in a single/small number of
// licenses. Can we leverage frequency analysis to identify these
// interesting words/phrases and auto-extract them?
inducedPhrases := map[string][]string{
"AGPL": {"affero"},
"Atmel": {"atmel"},
"Apache": {"apache"},
"BSD": {"bsd"},
"BSD-3-Clause-Attribution": {"acknowledgment"},
"bzip2": {"seward"},
"GPL-2.0-with-GCC-exception": {"gcc linking exception"},
"GPL-2.0-with-autoconf-exception": {"autoconf exception"},
"GPL-2.0-with-bison-exception": {"bison exception"},
"GPL-2.0-with-classpath-exception": {"class path exception"},
"GPL-2.0-with-font-exception": {"font exception"},
"LGPL-2.0": {"library"},
"ImageMagick": {"imagemagick"},
"PHP": {"php"},
"SISSL": {"sun standards"},
"SGI-B": {"silicon graphics"},
"SunPro": {"sunpro"},
"X11": {"x consortium"},
}
for k, ps := range inducedPhrases {
if strings.HasPrefix(LicenseName(id), k) {
for _, p := range ps {
if strings.Index(text, p) != -1 {
// Check to make sure there isn't a corresponding diff for this
// insert that also contains the text. This prevents against diff
// blocks that are too big and force a false hit on this check,
// which usually happens with URLs since they are stored in one
// token but can happen in other cases as well. We don't look just
// for delete diffs because the subsequent text may reference the
// content in case a URL was truncated.
if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 {
continue
}
return introducedPhraseChange
}
}
}
}
// Ignore changes between "library" and "lesser" in a GNU context as they
// changed the terms, but look for introductions of Lesser that would
// otherwise disqualify a match.
if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" {
// The LGPL 3.0 doesn't have a standard header, so people tend to craft
// their own. As a result, sometimes the warranty clause refers to the
// GPL instead of the LGPL. This is fine from a licensing perspective,
// but we need to tweak matching to ignore that particular case. In
// other circumstances, inserting or removing the word Lesser in the
// GPL context is not an acceptable change. There is also a reference to
// it when suggesting to use the LGPL.
if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
return lesserGPLChange
}
}
case diffmatchpatch.DiffEqual:
prevText = text
prevDelete = ""
case diffmatchpatch.DiffDelete:
// Avoid substitution in most cases. The two exceptions are with usage
// statements that are talking about *another* license, and don't affect
// the detection of the current license.
if (text == "lesser" || text == "library") && strings.HasSuffix(prevText, "gnu") {
// Same as above to avoid matching GPL instead of LGPL here.
if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
return lesserGPLChange
}
}
prevDelete = text
}
}
return diffLevenshteinWord(diffs)
}