v2/scoring.go - third_party/github.com/google/licenseclassifier - Git at Google

 // Copyright 2020 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package classifier

 import (
 	"strings"
 	"unicode"

 	"github.com/davecgh/go-spew/spew"
 	"github.com/sergi/go-diff/diffmatchpatch"
 )

 // return values for the distance function that explain why a diff
 // is not an acceptable match for the source document.
 const (
 	versionChange          = -1
 	introducedPhraseChange = -2
 	lesserGPLChange        = -3
 )

 // score computes a metric of similarity between the known and unknown
 // document, including the offsets into the unknown that yield the content
 // generating the computed similarity.
 func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) {
 	if c.tc.traceScoring(known.s.origin) {
 		c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd)
 	}

 	knownLength := known.size()
 	diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)

 	start, end := diffRange(known.norm, diffs)
 	distance := scoreDiffs(id, diffs[start:end])

 	if c.tc.traceScoring(known.s.origin) {
 		c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end]))
 	}

 	if distance < 0 {
 		// If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match.
 		if c.tc.traceScoring(known.s.origin) {
 			c.tc.trace("Distance result %v, rejected match", distance)
 		}
 		return 0.0, 0, 0
 	}

 	// Applying the diffRange-generated offsets provides the run of text from the
 	// target most closely correlated to the source.  This is the process for
 	// compensating for errors from the searchset. With the reduced text, we
 	// compute the final confidence score and exact text locations for the
 	// matched content.
 	// The diff slice consists of three regions: an optional deletion diff for
 	// target text before the source, n elements that represent the diff between
 	// the source and target, and an optional deletion diff for text after the
 	// target.  The helper functions identify the portions of the slice
 	// corresponding to those regions.  This results in a more accurate
 	// confidence score and better position detection of the source in the
 	// target.
 	conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:])

 	if c.tc.traceScoring(known.s.origin) {
 		c.tc.trace("Score result: %v [%d-%d]", conf, so, eo)
 	}
 	return conf, so, eo
 }

 // confidencePercentage computes a confidence match score for the lengths,
 // handling the cases where source and target lengths differ.
 func confidencePercentage(klen, distance int) float64 {
 	// No text is matched at 100% confidence (avoid divide by zero).
 	if klen == 0 {
 		return 1.0
 	}

 	// Return a computed fractional match against the known text.
 	return 1.0 - float64(distance)/float64(klen)
 }

 // diffLevenshteinWord computes word-based Levenshtein count.
 func diffLevenshteinWord(diffs []diffmatchpatch.Diff) int {
 	levenshtein := 0
 	insertions := 0
 	deletions := 0

 	for _, aDiff := range diffs {
 		switch aDiff.Type {
 		case diffmatchpatch.DiffInsert:
 			insertions += wordLen(aDiff.Text)
 		case diffmatchpatch.DiffDelete:
 			deletions += wordLen(aDiff.Text)
 		case diffmatchpatch.DiffEqual:
 			// A deletion and an insertion is one substitution.
 			levenshtein += max(insertions, deletions)
 			insertions = 0
 			deletions = 0
 		}
 	}

 	levenshtein += max(insertions, deletions)
 	return levenshtein
 }

 func isVersionNumber(in string) bool {
 	for _, r := range in {
 		if !unicode.IsDigit(r) && r != '.' {
 			return false
 		}
 	}
 	return true
 }

 // scoreDiffs returns a score rating the acceptability of these diffs.  A
 // negative value means that the changes represented by the diff are not an
 // acceptable transformation since it would change the underlying license.  A
 // positive value indicates the Levenshtein word distance.
 func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
 	// We make a pass looking for unacceptable substitutions
 	// Delete diffs are always ordered before insert diffs. This is leveraged to
 	// analyze a change by checking an insert against the delete text that was
 	// previously cached.
 	prevText := ""
 	prevDelete := ""
 	for i, diff := range diffs {
 		text := diff.Text
 		switch diff.Type {
 		case diffmatchpatch.DiffInsert:
 			num := text
 			if i := strings.Index(num, " "); i != -1 {
 				num = num[0:i]
 			}
 			if isVersionNumber(num) && strings.HasSuffix(prevText, "version") {
 				if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") {
 					return versionChange
 				}
 			}
 			// There are certain phrases that can't be introduced to make a license
 			// hit.  TODO: would like to generate this programmatically. Most of
 			// these are words or phrases that appear in a single/small number of
 			// licenses. Can we leverage frequency analysis to identify these
 			// interesting words/phrases and auto-extract them?

 			inducedPhrases := map[string][]string{
 				"AGPL":                             {"affero"},
 				"Atmel":                            {"atmel"},
 				"Apache":                           {"apache"},
 				"BSD":                              {"bsd"},
 				"BSD-3-Clause-Attribution":         {"acknowledgment"},
 				"bzip2":                            {"seward"},
 				"GPL-2.0-with-GCC-exception":       {"gcc linking exception"},
 				"GPL-2.0-with-autoconf-exception":  {"autoconf exception"},
 				"GPL-2.0-with-bison-exception":     {"bison exception"},
 				"GPL-2.0-with-classpath-exception": {"class path exception"},
 				"GPL-2.0-with-font-exception":      {"font exception"},
 				"LGPL-2.0":                         {"library"},
 				"ImageMagick":                      {"imagemagick"},
 				"PHP":                              {"php"},
 				"SISSL":                            {"sun standards"},
 				"SGI-B":                            {"silicon graphics"},
 				"SunPro":                           {"sunpro"},
 				"X11":                              {"x consortium"},
 			}

 			for k, ps := range inducedPhrases {
 				if strings.HasPrefix(LicenseName(id), k) {
 					for _, p := range ps {
 						if strings.Index(text, p) != -1 {
 							// Check to make sure there isn't a corresponding diff for this
 							// insert that also contains the text. This prevents against diff
 							// blocks that are too big and force a false hit on this check,
 							// which usually happens with URLs since they are stored in one
 							// token but can happen in other cases as well. We don't look just
 							// for delete diffs because the subsequent text may reference the
 							// content in case a URL was truncated.
 							if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 {
 								continue
 							}
 							return introducedPhraseChange
 						}
 					}
 				}
 			}

 			// Ignore changes between "library" and "lesser" in a GNU context as they
 			// changed the terms, but look for introductions of Lesser that would
 			// otherwise disqualify a match.
 			if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" {
 				// The LGPL 3.0 doesn't have a standard header, so people tend to craft
 				// their own. As a result, sometimes the warranty clause refers to the
 				// GPL instead of the LGPL. This is fine from a licensing perspective,
 				// but we need to tweak matching to ignore that particular case. In
 				// other circumstances, inserting or removing the word Lesser in the
 				// GPL context is not an acceptable change. There is also a reference to
 				// it when suggesting to use the LGPL.
 				if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
 					return lesserGPLChange
 				}
 			}
 		case diffmatchpatch.DiffEqual:
 			prevText = text
 			prevDelete = ""

 		case diffmatchpatch.DiffDelete:
 			// Avoid substitution in most cases. The two exceptions are with usage
 			// statements that are talking about *another* license, and don't affect
 			// the detection of the current license.
 			if (text == "lesser" || text == "library") && strings.HasSuffix(prevText, "gnu") {
 				// Same as above to avoid matching GPL instead of LGPL here.
 				if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
 					return lesserGPLChange
 				}
 			}
 			prevDelete = text
 		}
 	}
 	return diffLevenshteinWord(diffs)
 }
	// Copyright 2020 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package classifier

	import (
	"strings"
	"unicode"

	"github.com/davecgh/go-spew/spew"
	"github.com/sergi/go-diff/diffmatchpatch"
	)

	// return values for the distance function that explain why a diff
	// is not an acceptable match for the source document.
	const (
	versionChange = -1
	introducedPhraseChange = -2
	lesserGPLChange = -3
	)

	// score computes a metric of similarity between the known and unknown
	// document, including the offsets into the unknown that yield the content
	// generating the computed similarity.
	func (c Classifier) score(id string, unknown, known indexedDocument, unknownStart, unknownEnd int) (float64, int, int) {
	if c.tc.traceScoring(known.s.origin) {
	c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd)
	}

	knownLength := known.size()
	diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)

	start, end := diffRange(known.norm, diffs)
	distance := scoreDiffs(id, diffs[start:end])

	if c.tc.traceScoring(known.s.origin) {
	c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end]))
	}

	if distance < 0 {
	// If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match.
	if c.tc.traceScoring(known.s.origin) {
	c.tc.trace("Distance result %v, rejected match", distance)
	}
	return 0.0, 0, 0
	}

	// Applying the diffRange-generated offsets provides the run of text from the
	// target most closely correlated to the source. This is the process for
	// compensating for errors from the searchset. With the reduced text, we
	// compute the final confidence score and exact text locations for the
	// matched content.
	// The diff slice consists of three regions: an optional deletion diff for
	// target text before the source, n elements that represent the diff between
	// the source and target, and an optional deletion diff for text after the
	// target. The helper functions identify the portions of the slice
	// corresponding to those regions. This results in a more accurate
	// confidence score and better position detection of the source in the
	// target.
	conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:])

	if c.tc.traceScoring(known.s.origin) {
	c.tc.trace("Score result: %v [%d-%d]", conf, so, eo)
	}
	return conf, so, eo
	}

	// confidencePercentage computes a confidence match score for the lengths,
	// handling the cases where source and target lengths differ.
	func confidencePercentage(klen, distance int) float64 {
	// No text is matched at 100% confidence (avoid divide by zero).
	if klen == 0 {
	return 1.0
	}

	// Return a computed fractional match against the known text.
	return 1.0 - float64(distance)/float64(klen)
	}

	// diffLevenshteinWord computes word-based Levenshtein count.
	func diffLevenshteinWord(diffs []diffmatchpatch.Diff) int {
	levenshtein := 0
	insertions := 0
	deletions := 0

	for _, aDiff := range diffs {
	switch aDiff.Type {
	case diffmatchpatch.DiffInsert:
	insertions += wordLen(aDiff.Text)
	case diffmatchpatch.DiffDelete:
	deletions += wordLen(aDiff.Text)
	case diffmatchpatch.DiffEqual:
	// A deletion and an insertion is one substitution.
	levenshtein += max(insertions, deletions)
	insertions = 0
	deletions = 0
	}
	}

	levenshtein += max(insertions, deletions)
	return levenshtein
	}

	func isVersionNumber(in string) bool {
	for _, r := range in {
	if !unicode.IsDigit(r) && r != '.' {
	return false
	}
	}
	return true
	}

	// scoreDiffs returns a score rating the acceptability of these diffs. A
	// negative value means that the changes represented by the diff are not an
	// acceptable transformation since it would change the underlying license. A
	// positive value indicates the Levenshtein word distance.
	func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
	// We make a pass looking for unacceptable substitutions
	// Delete diffs are always ordered before insert diffs. This is leveraged to
	// analyze a change by checking an insert against the delete text that was
	// previously cached.
	prevText := ""
	prevDelete := ""
	for i, diff := range diffs {
	text := diff.Text
	switch diff.Type {
	case diffmatchpatch.DiffInsert:
	num := text
	if i := strings.Index(num, " "); i != -1 {
	num = num[0:i]
	}
	if isVersionNumber(num) && strings.HasSuffix(prevText, "version") {
	if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") {
	return versionChange
	}
	}
	// There are certain phrases that can't be introduced to make a license
	// hit. TODO: would like to generate this programmatically. Most of
	// these are words or phrases that appear in a single/small number of
	// licenses. Can we leverage frequency analysis to identify these
	// interesting words/phrases and auto-extract them?

	inducedPhrases := map[string][]string{
	"AGPL": {"affero"},
	"Atmel": {"atmel"},
	"Apache": {"apache"},
	"BSD": {"bsd"},
	"BSD-3-Clause-Attribution": {"acknowledgment"},
	"bzip2": {"seward"},
	"GPL-2.0-with-GCC-exception": {"gcc linking exception"},
	"GPL-2.0-with-autoconf-exception": {"autoconf exception"},
	"GPL-2.0-with-bison-exception": {"bison exception"},
	"GPL-2.0-with-classpath-exception": {"class path exception"},
	"GPL-2.0-with-font-exception": {"font exception"},
	"LGPL-2.0": {"library"},
	"ImageMagick": {"imagemagick"},
	"PHP": {"php"},
	"SISSL": {"sun standards"},
	"SGI-B": {"silicon graphics"},
	"SunPro": {"sunpro"},
	"X11": {"x consortium"},
	}

	for k, ps := range inducedPhrases {
	if strings.HasPrefix(LicenseName(id), k) {
	for _, p := range ps {
	if strings.Index(text, p) != -1 {
	// Check to make sure there isn't a corresponding diff for this
	// insert that also contains the text. This prevents against diff
	// blocks that are too big and force a false hit on this check,
	// which usually happens with URLs since they are stored in one
	// token but can happen in other cases as well. We don't look just
	// for delete diffs because the subsequent text may reference the
	// content in case a URL was truncated.
	if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 {
	continue
	}
	return introducedPhraseChange
	}
	}
	}
	}

	// Ignore changes between "library" and "lesser" in a GNU context as they
	// changed the terms, but look for introductions of Lesser that would
	// otherwise disqualify a match.
	if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" {
	// The LGPL 3.0 doesn't have a standard header, so people tend to craft
	// their own. As a result, sometimes the warranty clause refers to the
	// GPL instead of the LGPL. This is fine from a licensing perspective,
	// but we need to tweak matching to ignore that particular case. In
	// other circumstances, inserting or removing the word Lesser in the
	// GPL context is not an acceptable change. There is also a reference to
	// it when suggesting to use the LGPL.
	if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
	return lesserGPLChange
	}
	}
	case diffmatchpatch.DiffEqual:
	prevText = text
	prevDelete = ""

	case diffmatchpatch.DiffDelete:
	// Avoid substitution in most cases. The two exceptions are with usage
	// statements that are talking about another license, and don't affect
	// the detection of the current license.
	if (text == "lesser" \|\| text == "library") && strings.HasSuffix(prevText, "gnu") {
	// Same as above to avoid matching GPL instead of LGPL here.
	if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
	return lesserGPLChange
	}
	}
	prevDelete = text
	}
	}
	return diffLevenshteinWord(diffs)
	}