blob: af6378236066310eb7fd7fc40f9df803e7b7452b [file] [log] [blame]
// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package classifier
import (
"fmt"
"strings"
"testing"
"github.com/sergi/go-diff/diffmatchpatch"
)
// TestLevenshteinDiff feeds hand-built diff sequences to diffLevenshteinWord
// and verifies the distance it reports for each sequence.
func TestLevenshteinDiff(t *testing.T) {
	// Tiny constructors, one per diff operation, keep the table readable.
	same := func(text string) diffmatchpatch.Diff {
		return diffmatchpatch.Diff{Type: diffmatchpatch.DiffEqual, Text: text}
	}
	ins := func(text string) diffmatchpatch.Diff {
		return diffmatchpatch.Diff{Type: diffmatchpatch.DiffInsert, Text: text}
	}
	del := func(text string) diffmatchpatch.Diff {
		return diffmatchpatch.Diff{Type: diffmatchpatch.DiffDelete, Text: text}
	}

	type levenshteinCase struct {
		name  string
		diffs []diffmatchpatch.Diff
		want  int
	}

	cases := []levenshteinCase{
		{
			name:  "identical text",
			diffs: []diffmatchpatch.Diff{same("equivalent text")},
			want:  0,
		},
		{
			// Adjacent inverse changes get scored with the maximum of the 2
			// change scores.
			name: "changed text",
			diffs: []diffmatchpatch.Diff{
				del("removed words"),
				ins("inserted text here"),
			},
			want: 3,
		},
		{
			name: "inserted text",
			diffs: []diffmatchpatch.Diff{
				same("identical words"),
				ins("inserted"),
			},
			want: 1,
		},
		{
			name: "deleted text",
			diffs: []diffmatchpatch.Diff{
				del("many extraneous deleted words"),
				same("before the equivalent text"),
			},
			want: 4,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := diffLevenshteinWord(tc.diffs)
			if got == tc.want {
				return
			}
			t.Errorf("got %d wanted %d", got, tc.want)
		})
	}
}
// TestScoreDiffs runs scoreDiffs over small diff sequences attributed to a
// license asset path and checks the returned score. Plain word changes are
// expected to yield a numeric count, while the version, "lesser" and
// license-name cases are expected to yield the package's sentinel values
// (versionChange, lesserGPLChange, introducedPhraseChange).
func TestScoreDiffs(t *testing.T) {
	// Most cases are attributed to the same asset path.
	const mitPath = "License/MIT/license.txt"

	type scoreCase struct {
		name    string
		license string
		diffs   []diffmatchpatch.Diff
		want    int
	}

	cases := []scoreCase{
		{
			name:    "identical text",
			license: mitPath,
			diffs:   nil,
			want:    0,
		},
		{
			name:    "acceptable change",
			license: mitPath,
			diffs: []diffmatchpatch.Diff{
				{Type: diffmatchpatch.DiffEqual, Text: "license"},
				{Type: diffmatchpatch.DiffInsert, Text: "as needed"},
				{Type: diffmatchpatch.DiffDelete, Text: "when necessary"},
			},
			want: 2,
		},
		{
			name:    "version change",
			license: mitPath,
			diffs: []diffmatchpatch.Diff{
				{Type: diffmatchpatch.DiffEqual, Text: "version"},
				{Type: diffmatchpatch.DiffInsert, Text: "2"},
			},
			want: versionChange,
		},
		{
			name:    "license name change by deletion",
			license: mitPath,
			diffs: []diffmatchpatch.Diff{
				{Type: diffmatchpatch.DiffEqual, Text: "gnu"},
				{Type: diffmatchpatch.DiffDelete, Text: "lesser"},
			},
			want: lesserGPLChange,
		},
		{
			name:    "license name change by insertion",
			license: mitPath,
			diffs: []diffmatchpatch.Diff{
				{Type: diffmatchpatch.DiffEqual, Text: "gnu"},
				{Type: diffmatchpatch.DiffInsert, Text: "lesser"},
			},
			want: lesserGPLChange,
		},
		{
			name:    "license name change by name insertion",
			license: "License/ImageMagick/license.txt",
			diffs: []diffmatchpatch.Diff{
				{Type: diffmatchpatch.DiffEqual, Text: "license"},
				{Type: diffmatchpatch.DiffInsert, Text: "imagemagick"},
			},
			want: introducedPhraseChange,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := scoreDiffs(tc.license, tc.diffs)
			if got == tc.want {
				return
			}
			t.Errorf("got %d, want %d", got, tc.want)
		})
	}
}
// TestConfidencePercentage calls confidencePercentage with a length and a
// distance and compares the result against the expected fraction.
func TestConfidencePercentage(t *testing.T) {
	type scenario struct {
		name     string
		klen     int
		distance int
		want     float64
	}

	scenarios := []scenario{
		{name: "empty text", klen: 0, distance: 0, want: 1.0},
		{name: "99% match", klen: 100, distance: 1, want: 0.99},
	}

	for i := range scenarios {
		sc := scenarios[i]
		t.Run(sc.name, func(t *testing.T) {
			got := confidencePercentage(sc.klen, sc.distance)
			if got == sc.want {
				return
			}
			t.Errorf("got %v want %v", got, sc.want)
		})
	}
}
// TestScore registers a single "known" text with a fresh classifier, indexes
// an "unknown" text as the target, and checks the confidence and the start/end
// offsets returned by c.score for the pair.
//
// Every trace message emitted by the classifier is collected per subtest and
// only printed when one of the checks fails, to help diagnose the mismatch.
func TestScore(t *testing.T) {
	tests := []struct {
		name                       string
		known, unknown             string
		expectedConf               float64
		expectedStart, expectedEnd int
	}{
		{
			name:          "identical text",
			known:         "here is some sample text",
			unknown:       "here is some sample text",
			expectedConf:  1.00,
			expectedStart: 0,
			expectedEnd:   0,
		},
		{
			name:          "close match with matching sizes",
			known:         "here is some sample text",
			unknown:       "here is different sample text",
			expectedConf:  .8,
			expectedStart: 0,
			expectedEnd:   0,
		},
		{
			name:          "close match with different sizes",
			known:         "here is some sample text",
			unknown:       "padding before here is different sample text",
			expectedConf:  .8,
			expectedStart: 2,
			expectedEnd:   0,
		},
		{
			name:          "no match due to unacceptable diff",
			known:         "here is some sample text for version 2 of the license",
			unknown:       "padding before here is different sample text for version 3 of the licenses",
			expectedConf:  0.0,
			expectedStart: 0,
			expectedEnd:   0,
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			// Capture all trace output so it can be dumped on failure.
			var trace strings.Builder
			c := NewClassifier(.8)
			c.SetTraceConfiguration(&TraceConfiguration{
				TraceLicenses: "*",
				TracePhases:   "*",
				Tracer: func(f string, args ...interface{}) {
					// Format straight into the builder; no intermediate string.
					fmt.Fprintf(&trace, f, args...)
				},
			})
			c.AddContent("", "known", "", []byte(test.known))
			kd := c.getIndexedDocument("", "known", "")
			ud := c.createTargetIndexedDocument([]byte(test.unknown))
			// The name for the test needs to look like an asset path so we prepend
			// the directory.
			conf, so, eo := c.score("License/"+test.name, ud, kd, 0, ud.size())
			if conf != test.expectedConf {
				t.Errorf("conf: got %v want %v", conf, test.expectedConf)
			}
			if so != test.expectedStart {
				t.Errorf("start offset: got %v want %v", so, test.expectedStart)
			}
			if eo != test.expectedEnd {
				// Report the end offset itself (the original printed the start offset).
				t.Errorf("end offset: got %v want %v", eo, test.expectedEnd)
			}
			// Only dump the trace when one of the checks above failed.
			if t.Failed() {
				t.Errorf("Trace:\n%s", trace.String())
			}
		})
	}
}