blob: e4db8130b38f5fecf3d63a45781304a8144be571 [file] [log] [blame]
// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package classifier
import (
"fmt"
"reflect"
"strings"
"testing"
"github.com/davecgh/go-spew/spew"
"github.com/google/go-cmp/cmp"
)
// hundredLicenseText is a baseline for debugging precise ordering issues to demonstrate that the token-run assembly process can identify maximally fragmented entries successfully.
var hundredLicenseText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`
// prefixMissingText exercises the error margin detection case at the beginning of a body of text.
var prefixMissingText = `
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`
// suffixMissingText exercises the error margin detection case at the beginning of a body of text.
var suffixMissingText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80`
// fragmentedText is worst-case fragmentation that requires maximum reassembly with full error tolerance.
var fragmentedText = `
1 2 3 4 X 6 7 8 9 X 11 12 13 14 X 16 17 18 19 X 21 22 23 24 X 26 27 28 29 X 31 32 33 34 X 36 37 38 39 X 41 42 43 44 X 46 47 48 49 X 51 52 53 54 X 56 57 58 59 X 61 62 63 64 X 66 67 68 69 X 71 72 73 74 X 76 77 78 79 X 81 82 83 84 X 86 87 88 89 X 91 92 93 94 X 96 97 98 99 X`
// bigChunkText has a gap of maximal length to ensure that reassembly works.
var bigChunkText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 X X X X X X X X X X X X X X X X X X X X 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`
// The 50 license text leverages repeated ordered tokens to exercise reassembly options of repeated text in a worst-case situation.
var fiftyLicenseText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50`
var repeatedWithBreakagesText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 X X X X X X X X X X X X X X X X X 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 X X X`
func TestSearchSet_New(t *testing.T) {
tests := []struct {
description string
text string
q int
want *searchSet
}{
{
description: "Empty string",
text: "",
q: 4,
want: &searchSet{
Tokens: []indexedToken{},
Hashes: make(hash),
Checksums: nil,
ChecksumRanges: nil,
},
},
{
description: "Small string",
text: "Hello world",
q: 4,
want: &searchSet{
Tokens: []indexedToken{
{Index: 0, Line: 1, ID: 1},
{Index: 1, Line: 1, ID: 2},
},
Hashes: hash{1957950203: tokenRanges{&tokenRange{Start: 0, End: 2}}},
Checksums: []uint32{1957950203},
ChecksumRanges: tokenRanges{&tokenRange{Start: 0, End: 2}},
nodes: []*node{{1957950203, &tokenRange{Start: 0, End: 2}}},
q: 2,
},
},
}
for _, tt := range tests {
var trace strings.Builder
c := NewClassifier(.8) // This value doesn't affect the test.
c.SetTraceConfiguration(&TraceConfiguration{
TraceLicenses: "*",
TracePhases: "*",
Tracer: func(f string, args ...interface{}) {
trace.WriteString(fmt.Sprintf(f, args...))
},
})
c.AddContent("", "text", "", []byte(tt.text))
if got := newSearchSet(c.getIndexedDocument("", "text", ""), tt.q); !reflect.DeepEqual(got, tt.want) {
t.Errorf("New(%q) = %+v, want %+v", tt.description, spew.Sdump(got), spew.Sdump(tt.want))
t.Errorf("Trace:\n%s", trace.String())
}
}
}
func TestFindPotentialMatches(t *testing.T) {
tests := []struct {
name string
src string
target string
confidence float64
expectedHits int
}{
{
name: "maximally fragmented",
src: hundredLicenseText,
target: fragmentedText,
confidence: .8,
expectedHits: 1,
},
{
name: "prefix missing",
src: hundredLicenseText,
target: prefixMissingText,
confidence: .8,
expectedHits: 1,
},
{
name: "suffix missing",
src: hundredLicenseText,
target: suffixMissingText,
confidence: .8,
expectedHits: 1,
},
{
name: "maximum-length error",
src: hundredLicenseText,
target: bigChunkText,
confidence: .8,
expectedHits: 1,
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
var trace strings.Builder
c := NewClassifier(test.confidence)
c.SetTraceConfiguration(&TraceConfiguration{
TraceLicenses: "*",
TracePhases: "*",
Tracer: func(f string, args ...interface{}) {
trace.WriteString(fmt.Sprintf(f, args...))
},
})
c.AddContent("", "source", "", []byte(test.src))
doc := c.createTargetIndexedDocument([]byte(test.target))
doc.generateSearchSet(c.q)
hits := c.findPotentialMatches(c.getIndexedDocument("", "source", "").s, doc.s, test.confidence)
if actual := len(hits); actual != test.expectedHits {
t.Errorf("got %d hits, wanted %d", actual, test.expectedHits)
t.Errorf("Trace:\n%s", trace.String())
}
})
}
}
func TestFuseRanges(t *testing.T) {
// This test verifies that target ordering doesn't affect how ranges
// get fused. The data that is input for fuseRanges is not always
// ordered deterministically since it's the product of a map iteration.
// This was a bug that occurred during development.
tests := []struct {
name string
in matchRanges
out matchRanges
conf float64
size int
}{
{
name: "in target order",
conf: .8,
size: 100,
in: matchRanges{
&matchRange{
SrcStart: 50,
SrcEnd: 93,
TargetStart: 0,
TargetEnd: 43,
TokensClaimed: 43,
},
&matchRange{
SrcStart: 0,
SrcEnd: 43,
TargetStart: 0,
TargetEnd: 43,
TokensClaimed: 43,
},
&matchRange{
SrcStart: 10,
SrcEnd: 47,
TargetStart: 60,
TargetEnd: 97,
TokensClaimed: 37,
},
&matchRange{
SrcStart: 60,
SrcEnd: 97,
TargetStart: 60,
TargetEnd: 97,
TokensClaimed: 37,
},
},
out: matchRanges{
&matchRange{
SrcStart: 0,
SrcEnd: 97,
TargetStart: 0,
TargetEnd: 97,
TokensClaimed: 80,
},
},
},
{
name: "not in-target order",
conf: .8,
size: 100,
in: matchRanges{
&matchRange{
SrcStart: 0,
SrcEnd: 43,
TargetStart: 0,
TargetEnd: 43,
TokensClaimed: 43,
},
&matchRange{
SrcStart: 50,
SrcEnd: 93,
TargetStart: 0,
TargetEnd: 43,
TokensClaimed: 43,
},
&matchRange{
SrcStart: 60,
SrcEnd: 97,
TargetStart: 60,
TargetEnd: 97,
TokensClaimed: 37,
},
&matchRange{
SrcStart: 10,
SrcEnd: 47,
TargetStart: 60,
TargetEnd: 97,
TokensClaimed: 37,
},
},
out: matchRanges{
&matchRange{
SrcStart: 0,
SrcEnd: 97,
TargetStart: 0,
TargetEnd: 97,
TokensClaimed: 80,
},
},
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
var trace strings.Builder
c := NewClassifier(.8)
c.SetTraceConfiguration(&TraceConfiguration{
TraceLicenses: "*",
TracePhases: "*",
Tracer: func(f string, args ...interface{}) {
trace.WriteString(fmt.Sprintf(f, args...))
},
})
runs := c.detectRuns(test.name, test.in, 100, 100, test.conf, 4)
actual := c.fuseRanges(test.name, test.in, test.conf, test.size, runs, 100)
if !cmp.Equal(actual, test.out) {
t.Errorf("%v: %v", test.name, cmp.Diff(actual, test.out))
t.Errorf("Trace:\n%s", trace.String())
}
})
}
}
func TestDetectRuns(t *testing.T) {
tests := []struct {
name string
matched matchRanges
targetLength, subsetLength, q int
threshold float64
expected []matchRange
}{
{
// For an exact match on 100 accurate tokens, the first q-gram
// is the only possible location hit we can return.
name: "precise matching on perfect runs",
threshold: 1.0,
targetLength: 100,
subsetLength: 100,
q: 4,
matched: matchRanges{
&matchRange{TargetStart: 0, TargetEnd: 100},
},
expected: []matchRange{
{SrcStart: 0, SrcEnd: 4},
},
},
{
// For an 80% match on 100 accurate tokens, the first 20 token
// positions represent possible matches.
name: "approximate matching on perfect runs",
threshold: 0.8,
targetLength: 100,
subsetLength: 100,
q: 4,
matched: matchRanges{
&matchRange{TargetStart: 0, TargetEnd: 100},
},
expected: []matchRange{
{SrcStart: 0, SrcEnd: 24},
},
},
{
name: "multiple runs in a single target",
threshold: 0.8,
targetLength: 100,
subsetLength: 10,
q: 4,
matched: matchRanges{
&matchRange{TargetStart: 0, TargetEnd: 10},
&matchRange{TargetStart: 20, TargetEnd: 25},
&matchRange{TargetStart: 50, TargetEnd: 60},
&matchRange{TargetStart: 70, TargetEnd: 77},
},
expected: []matchRange{
// Runs end on 4-gram boundaries
{SrcStart: 0, SrcEnd: 6},
// The run starts early because of error tolerance
{SrcStart: 48, SrcEnd: 56},
},
},
{
name: "bridge broken runs in a single target",
threshold: 0.8,
targetLength: 100,
subsetLength: 10,
q: 4,
matched: matchRanges{
&matchRange{TargetStart: 20, TargetEnd: 25},
&matchRange{TargetStart: 26, TargetEnd: 30},
&matchRange{TargetStart: 60, TargetEnd: 67},
&matchRange{TargetStart: 68, TargetEnd: 72},
},
expected: []matchRange{
// Runs end on 4-gram boundaries and start early because
// of error tolerance.
{SrcStart: 19, SrcEnd: 25},
{SrcStart: 59, SrcEnd: 67},
},
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
var trace strings.Builder
c := NewClassifier(.8)
c.SetTraceConfiguration(&TraceConfiguration{
TraceLicenses: "*",
TracePhases: "*",
Tracer: func(f string, args ...interface{}) {
trace.WriteString(fmt.Sprintf(f, args...))
},
})
if got := c.detectRuns(test.name, test.matched, test.targetLength, test.subsetLength, test.threshold, test.q); !cmp.Equal(got, test.expected) {
t.Errorf(cmp.Diff(got, test.expected))
t.Errorf("Trace:\n%s", trace.String())
}
})
}
}