| // Copyright 2020 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package classifier |
| |
| import ( |
| "fmt" |
| "reflect" |
| "strings" |
| "testing" |
| |
| "github.com/davecgh/go-spew/spew" |
| "github.com/google/go-cmp/cmp" |
| ) |
| |
| // hundredLicenseText is a baseline for debugging precise ordering issues to demonstrate that the token-run assembly process can identify maximally fragmented entries successfully. |
| var hundredLicenseText = ` |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100` |
| |
| // prefixMissingText exercises the error margin detection case at the beginning of a body of text. |
| var prefixMissingText = ` |
| 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100` |
| |
| // suffixMissingText exercises the error margin detection case at the beginning of a body of text. |
| var suffixMissingText = ` |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80` |
| |
| // fragmentedText is worst-case fragmentation that requires maximum reassembly with full error tolerance. |
| var fragmentedText = ` |
| 1 2 3 4 X 6 7 8 9 X 11 12 13 14 X 16 17 18 19 X 21 22 23 24 X 26 27 28 29 X 31 32 33 34 X 36 37 38 39 X 41 42 43 44 X 46 47 48 49 X 51 52 53 54 X 56 57 58 59 X 61 62 63 64 X 66 67 68 69 X 71 72 73 74 X 76 77 78 79 X 81 82 83 84 X 86 87 88 89 X 91 92 93 94 X 96 97 98 99 X` |
| |
| // bigChunkText has a gap of maximal length to ensure that reassembly works. |
| var bigChunkText = ` |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 X X X X X X X X X X X X X X X X X X X X 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100` |
| |
| // The 50 license text leverages repeated ordered tokens to exercise reassembly options of repeated text in a worst-case situation. |
| var fiftyLicenseText = ` |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50` |
| |
| var repeatedWithBreakagesText = ` |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 X X X X X X X X X X X X X X X X X 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 X X X` |
| |
| func TestSearchSet_New(t *testing.T) { |
| tests := []struct { |
| description string |
| text string |
| q int |
| want *searchSet |
| }{ |
| { |
| description: "Empty string", |
| text: "", |
| q: 4, |
| want: &searchSet{ |
| Tokens: []indexedToken{}, |
| Hashes: make(hash), |
| Checksums: nil, |
| ChecksumRanges: nil, |
| }, |
| }, |
| { |
| description: "Small string", |
| text: "Hello world", |
| q: 4, |
| want: &searchSet{ |
| Tokens: []indexedToken{ |
| {Index: 0, Line: 1, ID: 1}, |
| {Index: 1, Line: 1, ID: 2}, |
| }, |
| Hashes: hash{1957950203: tokenRanges{&tokenRange{Start: 0, End: 2}}}, |
| Checksums: []uint32{1957950203}, |
| ChecksumRanges: tokenRanges{&tokenRange{Start: 0, End: 2}}, |
| nodes: []*node{{1957950203, &tokenRange{Start: 0, End: 2}}}, |
| q: 2, |
| }, |
| }, |
| } |
| |
| for _, tt := range tests { |
| var trace strings.Builder |
| c := NewClassifier(.8) // This value doesn't affect the test. |
| c.SetTraceConfiguration(&TraceConfiguration{ |
| TraceLicenses: "*", |
| TracePhases: "*", |
| Tracer: func(f string, args ...interface{}) { |
| trace.WriteString(fmt.Sprintf(f, args...)) |
| }, |
| }) |
| c.AddContent("", "text", "", []byte(tt.text)) |
| if got := newSearchSet(c.getIndexedDocument("", "text", ""), tt.q); !reflect.DeepEqual(got, tt.want) { |
| t.Errorf("New(%q) = %+v, want %+v", tt.description, spew.Sdump(got), spew.Sdump(tt.want)) |
| t.Errorf("Trace:\n%s", trace.String()) |
| } |
| } |
| } |
| func TestFindPotentialMatches(t *testing.T) { |
| tests := []struct { |
| name string |
| src string |
| target string |
| confidence float64 |
| expectedHits int |
| }{ |
| { |
| name: "maximally fragmented", |
| src: hundredLicenseText, |
| target: fragmentedText, |
| confidence: .8, |
| expectedHits: 1, |
| }, |
| { |
| name: "prefix missing", |
| src: hundredLicenseText, |
| target: prefixMissingText, |
| confidence: .8, |
| expectedHits: 1, |
| }, |
| { |
| name: "suffix missing", |
| src: hundredLicenseText, |
| target: suffixMissingText, |
| confidence: .8, |
| expectedHits: 1, |
| }, |
| { |
| name: "maximum-length error", |
| src: hundredLicenseText, |
| target: bigChunkText, |
| confidence: .8, |
| expectedHits: 1, |
| }, |
| } |
| |
| for _, test := range tests { |
| t.Run(test.name, func(t *testing.T) { |
| var trace strings.Builder |
| c := NewClassifier(test.confidence) |
| c.SetTraceConfiguration(&TraceConfiguration{ |
| TraceLicenses: "*", |
| TracePhases: "*", |
| Tracer: func(f string, args ...interface{}) { |
| trace.WriteString(fmt.Sprintf(f, args...)) |
| }, |
| }) |
| c.AddContent("", "source", "", []byte(test.src)) |
| |
| doc := c.createTargetIndexedDocument([]byte(test.target)) |
| doc.generateSearchSet(c.q) |
| hits := c.findPotentialMatches(c.getIndexedDocument("", "source", "").s, doc.s, test.confidence) |
| if actual := len(hits); actual != test.expectedHits { |
| t.Errorf("got %d hits, wanted %d", actual, test.expectedHits) |
| t.Errorf("Trace:\n%s", trace.String()) |
| } |
| }) |
| } |
| } |
| |
| func TestFuseRanges(t *testing.T) { |
| // This test verifies that target ordering doesn't affect how ranges |
| // get fused. The data that is input for fuseRanges is not always |
| // ordered deterministically since it's the product of a map iteration. |
| // This was a bug that occurred during development. |
| tests := []struct { |
| name string |
| in matchRanges |
| out matchRanges |
| conf float64 |
| size int |
| }{ |
| { |
| name: "in target order", |
| conf: .8, |
| size: 100, |
| in: matchRanges{ |
| &matchRange{ |
| SrcStart: 50, |
| SrcEnd: 93, |
| TargetStart: 0, |
| TargetEnd: 43, |
| TokensClaimed: 43, |
| }, |
| &matchRange{ |
| SrcStart: 0, |
| SrcEnd: 43, |
| TargetStart: 0, |
| TargetEnd: 43, |
| TokensClaimed: 43, |
| }, |
| &matchRange{ |
| SrcStart: 10, |
| SrcEnd: 47, |
| TargetStart: 60, |
| TargetEnd: 97, |
| TokensClaimed: 37, |
| }, |
| &matchRange{ |
| SrcStart: 60, |
| SrcEnd: 97, |
| TargetStart: 60, |
| TargetEnd: 97, |
| TokensClaimed: 37, |
| }, |
| }, |
| out: matchRanges{ |
| &matchRange{ |
| SrcStart: 0, |
| SrcEnd: 97, |
| TargetStart: 0, |
| TargetEnd: 97, |
| TokensClaimed: 80, |
| }, |
| }, |
| }, |
| { |
| name: "not in-target order", |
| conf: .8, |
| size: 100, |
| in: matchRanges{ |
| &matchRange{ |
| SrcStart: 0, |
| SrcEnd: 43, |
| TargetStart: 0, |
| TargetEnd: 43, |
| TokensClaimed: 43, |
| }, |
| &matchRange{ |
| SrcStart: 50, |
| SrcEnd: 93, |
| TargetStart: 0, |
| TargetEnd: 43, |
| TokensClaimed: 43, |
| }, |
| &matchRange{ |
| SrcStart: 60, |
| SrcEnd: 97, |
| TargetStart: 60, |
| TargetEnd: 97, |
| TokensClaimed: 37, |
| }, |
| &matchRange{ |
| SrcStart: 10, |
| SrcEnd: 47, |
| TargetStart: 60, |
| TargetEnd: 97, |
| TokensClaimed: 37, |
| }, |
| }, |
| out: matchRanges{ |
| &matchRange{ |
| SrcStart: 0, |
| SrcEnd: 97, |
| TargetStart: 0, |
| TargetEnd: 97, |
| TokensClaimed: 80, |
| }, |
| }, |
| }, |
| } |
| for _, test := range tests { |
| t.Run(test.name, func(t *testing.T) { |
| var trace strings.Builder |
| c := NewClassifier(.8) |
| c.SetTraceConfiguration(&TraceConfiguration{ |
| TraceLicenses: "*", |
| TracePhases: "*", |
| Tracer: func(f string, args ...interface{}) { |
| trace.WriteString(fmt.Sprintf(f, args...)) |
| }, |
| }) |
| runs := c.detectRuns(test.name, test.in, 100, 100, test.conf, 4) |
| actual := c.fuseRanges(test.name, test.in, test.conf, test.size, runs, 100) |
| if !cmp.Equal(actual, test.out) { |
| t.Errorf("%v: %v", test.name, cmp.Diff(actual, test.out)) |
| t.Errorf("Trace:\n%s", trace.String()) |
| } |
| }) |
| } |
| } |
| |
| func TestDetectRuns(t *testing.T) { |
| tests := []struct { |
| name string |
| matched matchRanges |
| targetLength, subsetLength, q int |
| threshold float64 |
| expected []matchRange |
| }{ |
| { |
| // For an exact match on 100 accurate tokens, the first q-gram |
| // is the only possible location hit we can return. |
| name: "precise matching on perfect runs", |
| threshold: 1.0, |
| targetLength: 100, |
| subsetLength: 100, |
| q: 4, |
| matched: matchRanges{ |
| &matchRange{TargetStart: 0, TargetEnd: 100}, |
| }, |
| expected: []matchRange{ |
| {SrcStart: 0, SrcEnd: 4}, |
| }, |
| }, |
| { |
| // For an 80% match on 100 accurate tokens, the first 20 token |
| // positions represent possible matches. |
| name: "approximate matching on perfect runs", |
| threshold: 0.8, |
| targetLength: 100, |
| subsetLength: 100, |
| q: 4, |
| matched: matchRanges{ |
| &matchRange{TargetStart: 0, TargetEnd: 100}, |
| }, |
| expected: []matchRange{ |
| {SrcStart: 0, SrcEnd: 24}, |
| }, |
| }, |
| { |
| name: "multiple runs in a single target", |
| threshold: 0.8, |
| targetLength: 100, |
| subsetLength: 10, |
| q: 4, |
| matched: matchRanges{ |
| &matchRange{TargetStart: 0, TargetEnd: 10}, |
| &matchRange{TargetStart: 20, TargetEnd: 25}, |
| &matchRange{TargetStart: 50, TargetEnd: 60}, |
| &matchRange{TargetStart: 70, TargetEnd: 77}, |
| }, |
| expected: []matchRange{ |
| // Runs end on 4-gram boundaries |
| {SrcStart: 0, SrcEnd: 6}, |
| // The run starts early because of error tolerance |
| {SrcStart: 48, SrcEnd: 56}, |
| }, |
| }, |
| { |
| name: "bridge broken runs in a single target", |
| threshold: 0.8, |
| targetLength: 100, |
| subsetLength: 10, |
| q: 4, |
| matched: matchRanges{ |
| &matchRange{TargetStart: 20, TargetEnd: 25}, |
| &matchRange{TargetStart: 26, TargetEnd: 30}, |
| &matchRange{TargetStart: 60, TargetEnd: 67}, |
| &matchRange{TargetStart: 68, TargetEnd: 72}, |
| }, |
| expected: []matchRange{ |
| // Runs end on 4-gram boundaries and start early because |
| // of error tolerance. |
| {SrcStart: 19, SrcEnd: 25}, |
| {SrcStart: 59, SrcEnd: 67}, |
| }, |
| }, |
| } |
| |
| for _, test := range tests { |
| t.Run(test.name, func(t *testing.T) { |
| var trace strings.Builder |
| c := NewClassifier(.8) |
| c.SetTraceConfiguration(&TraceConfiguration{ |
| TraceLicenses: "*", |
| TracePhases: "*", |
| Tracer: func(f string, args ...interface{}) { |
| trace.WriteString(fmt.Sprintf(f, args...)) |
| }, |
| }) |
| if got := c.detectRuns(test.name, test.matched, test.targetLength, test.subsetLength, test.threshold, test.q); !cmp.Equal(got, test.expected) { |
| t.Errorf(cmp.Diff(got, test.expected)) |
| t.Errorf("Trace:\n%s", trace.String()) |
| } |
| |
| }) |
| } |
| } |