| // Copyright 2020 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package classifier |
| |
| import ( |
| "strings" |
| "unicode" |
| |
| "github.com/sergi/go-diff/diffmatchpatch" |
| ) |
| |
| // return values for the distance function that explain why a diff |
| // is not an acceptable match for the source document. |
| const ( |
| versionChange = -1 |
| introducedPhraseChange = -2 |
| lesserGPLChange = -3 |
| ) |
| |
| // score computes a metric of similarity between the known and unknown |
| // document, including the offsets into the unknown that yield the content |
| // generating the computed similarity. |
| func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) { |
| if c.tc.traceScoring(known.s.origin) { |
| c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd) |
| } |
| |
| knownLength := known.size() |
| diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength) |
| |
| start, end := diffRange(known.norm, diffs) |
| distance := scoreDiffs(id, diffs[start:end]) |
| if distance < 0 { |
| // If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match. |
| if c.tc.traceScoring(known.s.origin) { |
| c.tc.trace("Distance result %v, rejected match", distance) |
| } |
| return 0.0, 0, 0 |
| } |
| |
| // Applying the diffRange-generated offsets provides the run of text from the |
| // target most closely correlated to the source. This is the process for |
| // compensating for errors from the searchset. With the reduced text, we |
| // compute the final confidence score and exact text locations for the |
| // matched content. |
| // The diff slice consists of three regions: an optional deletion diff for |
| // target text before the source, n elements that represent the diff between |
| // the source and target, and an optional deletion diff for text after the |
| // target. The helper functions identify the portions of the slice |
| // corresponding to those regions. This results in a more accurate |
| // confidence score and better position detection of the source in the |
| // target. |
| conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:]) |
| |
| if c.tc.traceScoring(known.s.origin) { |
| c.tc.trace("Score result: %v [%d-%d]", conf, so, eo) |
| } |
| return conf, so, eo |
| } |
| |
| // confidencePercentage computes a confidence match score for the lengths, |
| // handling the cases where source and target lengths differ. |
| func confidencePercentage(klen, distance int) float64 { |
| // No text is matched at 100% confidence (avoid divide by zero). |
| if klen == 0 { |
| return 1.0 |
| } |
| |
| // Return a computed fractional match against the known text. |
| return 1.0 - float64(distance)/float64(klen) |
| } |
| |
| // diffLevenshteinWord computes word-based Levenshtein count. |
| func diffLevenshteinWord(diffs []diffmatchpatch.Diff) int { |
| levenshtein := 0 |
| insertions := 0 |
| deletions := 0 |
| |
| for _, aDiff := range diffs { |
| switch aDiff.Type { |
| case diffmatchpatch.DiffInsert: |
| insertions += wordLen(aDiff.Text) |
| case diffmatchpatch.DiffDelete: |
| deletions += wordLen(aDiff.Text) |
| case diffmatchpatch.DiffEqual: |
| // A deletion and an insertion is one substitution. |
| levenshtein += max(insertions, deletions) |
| insertions = 0 |
| deletions = 0 |
| } |
| } |
| |
| levenshtein += max(insertions, deletions) |
| return levenshtein |
| } |
| |
| func isVersionNumber(in string) bool { |
| for _, r := range in { |
| if !unicode.IsDigit(r) && r != '.' { |
| return false |
| } |
| } |
| return true |
| } |
| |
| // scoreDiffs returns a score rating the acceptability of these diffs. A |
| // negative value means that the changes represented by the diff are not an |
| // acceptable transformation since it would change the underlying license. A |
| // positive value indicates the Levenshtein word distance. |
| func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int { |
| // We make a pass looking for unacceptable substitutions |
| // Delete diffs are always ordered before insert diffs. This is leveraged to |
| // analyze a change by checking an insert against the delete text that was |
| // previously cached. |
| prevText := "" |
| prevDelete := "" |
| for _, diff := range diffs { |
| text := diff.Text |
| switch diff.Type { |
| case diffmatchpatch.DiffInsert: |
| num := text |
| if i := strings.Index(num, " "); i != -1 { |
| num = num[0:i] |
| } |
| if isVersionNumber(num) && strings.HasSuffix(prevText, "version") { |
| if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") { |
| return versionChange |
| } |
| } |
| // There are certain phrases that can't be introduced to make a license |
| // hit. TODO: would like to generate this programmatically. Most of |
| // these are words or phrases that appear in a single/small number of |
| // licenses. Can we leverage frequency analysis to identify these |
| // interesting words/phrases and auto-extract them? |
| |
| inducedPhrases := map[string][]string{ |
| "AGPL": {"affero"}, |
| "Atmel": {"atmel"}, |
| "Apache": {"apache"}, |
| "BSD": {"bsd"}, |
| "BSD-3-Clause-Attribution": {"acknowledgment"}, |
| "bzip2": {"seward"}, |
| "GPL-2.0-with-GCC-exception": {"gcc linking exception"}, |
| "GPL-2.0-with-autoconf-exception": {"autoconf exception"}, |
| "GPL-2.0-with-bison-exception": {"bison exception"}, |
| "GPL-2.0-with-classpath-exception": {"class path exception"}, |
| "GPL-2.0-with-font-exception": {"font exception"}, |
| "LGPL-2.0": {"library"}, |
| "ImageMagick": {"imagemagick"}, |
| "PHP": {"php"}, |
| "SISSL": {"sun standards"}, |
| "SGI-B": {"silicon graphics"}, |
| "X11": {"x consortium"}, |
| } |
| |
| for k, ps := range inducedPhrases { |
| if strings.HasPrefix(id, k) { |
| for _, p := range ps { |
| if strings.Index(text, p) != -1 { |
| return introducedPhraseChange |
| } |
| } |
| } |
| } |
| |
| // Ignore changes between "library" and "lesser" in a GNU context as they |
| // changed the terms, but look for introductions of Lesser that would |
| // otherwise disqualify a match. |
| if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" { |
| // The LGPL 3.0 doesn't have a standard header, so people tend to craft |
| // their own. As a result, sometimes the warranty clause refers to the |
| // GPL instead of the LGPL. This is fine from a licensing perspective, |
| // but we need to tweak matching to ignore that particular case. In |
| // other circumstances, inserting or removing the word Lesser in the |
| // GPL context is not an acceptable change. |
| if !strings.Contains(prevText, "warranty") { |
| return lesserGPLChange |
| } |
| } |
| case diffmatchpatch.DiffEqual: |
| prevText = text |
| prevDelete = "" |
| |
| case diffmatchpatch.DiffDelete: |
| if text == "lesser" && strings.HasSuffix(prevText, "gnu") { |
| // Same as above to avoid matching GPL instead of LGPL here. |
| if !strings.Contains(prevText, "warranty") { |
| return lesserGPLChange |
| } |
| } |
| prevDelete = text |
| } |
| } |
| return diffLevenshteinWord(diffs) |
| } |