// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package stringclassifier finds the nearest match between a string and a set of known values. It
// uses the Levenshtein Distance (LD) algorithm to do this: a match with a large LD is less likely
// to be correct than one with a small LD. Each match carries a confidence percentage indicating
// how confident the algorithm is that the match is correct; the higher the percentage, the greater
// the confidence.
//
// Example Usage:
//
// type Text struct {
// Name string
// Text string
// }
//
// func NewClassifier(knownTexts []Text) (*stringclassifier.Classifier, error) {
// sc := stringclassifier.New(stringclassifier.DefaultConfidenceThreshold, stringclassifier.FlattenWhitespace)
// for _, known := range knownTexts {
// if err := sc.AddValue(known.Name, known.Text); err != nil {
// return nil, err
// }
// }
// return sc, nil
// }
//
// func IdentifyTexts(sc *stringclassifier.Classifier, unknownTexts []*Text) {
// for _, unknown := range unknownTexts {
// m := sc.NearestMatch(unknown.Text)
// log.Printf("The nearest match to %q is %q (confidence: %v)",
// unknown.Name, m.Name, m.Confidence)
// }
// }
package stringclassifier
import (
"fmt"
"log"
"math"
"regexp"
"sort"
"sync"
"github.com/google/licenseclassifier/stringclassifier/internal/pq"
"github.com/google/licenseclassifier/stringclassifier/searchset"
"github.com/sergi/go-diff/diffmatchpatch"
)
// dmp is the diff/match/patch object used to compute diffs and edit distances.
var dmp = diffmatchpatch.New()
const (
// DefaultConfidenceThreshold is the minimum ratio threshold between
// the matching range and the full source range that we're willing to
// accept in order to say that the matching range will produce a
// sufficiently good edit distance. I.e., if the matching range is
// below this threshold we won't run the Levenshtein Distance algorithm
// on it.
DefaultConfidenceThreshold float64 = 0.80
defaultMinDiffRatio float64 = 0.75
)
// A Classifier matches a string to a set of known values.
type Classifier struct {
muValues sync.RWMutex
values map[string]*knownValue
normalizers []NormalizeFunc
threshold float64
// MinDiffRatio defines the minimum ratio between the lengths of the
// unknown string and a known value for the known value to be considered
// a possible match. This is used as a performance optimization to
// eliminate values that are unlikely to match.
//
// For example, a value of 0.75 means that the shorter string must be
// at least 75% the length of the longer string to consider it a
// possible match.
//
// Setting this to 1.0 will require that the strings are of identical length.
// Setting this to 0 will consider all known values as possible
// matches.
MinDiffRatio float64
}
// NormalizeFunc is a function that is used to normalize a string prior to comparison.
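// As an illustrative sketch (not part of this package), a caller could define
// a custom normalizer that lower-cases input using the standard strings
// package and pass it to New alongside FlattenWhitespace:
//
//   var lowercase stringclassifier.NormalizeFunc = func(s string) string {
//           return strings.ToLower(s)
//   }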
type NormalizeFunc func(string) string
// New creates a new Classifier with the provided NormalizeFuncs. Each
// NormalizeFunc is applied in order to a string before comparison.
func New(threshold float64, funcs ...NormalizeFunc) *Classifier {
return &Classifier{
values: make(map[string]*knownValue),
normalizers: append([]NormalizeFunc(nil), funcs...),
threshold: threshold,
MinDiffRatio: defaultMinDiffRatio,
}
}
// knownValue identifies a value in the corpus to match against.
type knownValue struct {
key string
normalizedValue string
reValue *regexp.Regexp
set *searchset.SearchSet
}
// AddValue adds a known value to be matched against. If a value already exists
// for key, an error is returned.
func (c *Classifier) AddValue(key, value string) error {
c.muValues.Lock()
defer c.muValues.Unlock()
if _, ok := c.values[key]; ok {
return fmt.Errorf("value already registered with key %q", key)
}
norm := c.normalize(value)
c.values[key] = &knownValue{
key: key,
normalizedValue: norm,
reValue: regexp.MustCompile(regexp.QuoteMeta(norm)),
}
return nil
}
// AddPrecomputedValue adds a known value to be matched against. The value has
// already been normalized and the SearchSet object deserialized, so no
// processing is necessary.
func (c *Classifier) AddPrecomputedValue(key, value string, set *searchset.SearchSet) error {
c.muValues.Lock()
defer c.muValues.Unlock()
if _, ok := c.values[key]; ok {
return fmt.Errorf("value already registered with key %q", key)
}
set.GenerateNodeList()
c.values[key] = &knownValue{
key: key,
normalizedValue: value,
reValue: regexp.MustCompile(regexp.QuoteMeta(value)),
set: set,
}
return nil
}
// normalize a string by applying each of the registered NormalizeFuncs.
func (c *Classifier) normalize(s string) string {
for _, fn := range c.normalizers {
s = fn(s)
}
return s
}
// Match identifies the result of matching a string against a knownValue.
type Match struct {
Name string // Name of knownValue that was matched
Confidence float64 // Confidence percentage
Offset int // The offset into the unknown string at which the match begins
Extent int // The length of the match, starting at Offset, in the unknown string
}
// Matches is a list of Match values. It exists mainly so that the list can
// be sorted.
type Matches []*Match
func (m Matches) Len() int { return len(m) }
func (m Matches) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
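// Less reports whether the match at index i sorts before the match at index
// j: by descending confidence, then by name, then by ascending offset, then
// by descending extent.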
func (m Matches) Less(i, j int) bool {
if math.Abs(m[j].Confidence-m[i].Confidence) < math.SmallestNonzeroFloat64 {
if m[i].Name == m[j].Name {
if m[i].Offset > m[j].Offset {
return false
}
if m[i].Offset == m[j].Offset {
return m[i].Extent > m[j].Extent
}
return true
}
return m[i].Name < m[j].Name
}
return m[i].Confidence > m[j].Confidence
}
// Names returns an unsorted slice of the names of the matched known values.
func (m Matches) Names() []string {
var names []string
for _, n := range m {
names = append(names, n.Name)
}
return names
}
// uniquify goes through the matches and removes any that are contained within
// one with a higher confidence. This assumes that Matches is sorted.
func (m Matches) uniquify() Matches {
type matchedRange struct {
offset, extent int
}
var matched []matchedRange
var matches Matches
OUTER:
for _, match := range m {
for _, mr := range matched {
if match.Offset >= mr.offset && match.Offset <= mr.offset+mr.extent {
continue OUTER
}
}
matched = append(matched, matchedRange{match.Offset, match.Extent})
matches = append(matches, match)
}
return matches
}
// NearestMatch returns a Match identifying the known value that most closely
// matches the unknown string, along with a confidence percentage indicating
// how confident the classifier is in the result. A confidence of "1.0"
// indicates an exact match, while a confidence of "0.0" indicates a complete
// mismatch.
//
// If the string is equidistant from multiple known values, it is undefined
// which will be returned.
func (c *Classifier) NearestMatch(s string) *Match {
pq := c.nearestMatch(s)
if pq.Len() == 0 {
return &Match{}
}
return pq.Pop().(*Match)
}
// MultipleMatch tries to determine which known strings are found within an
// unknown string. This differs from "NearestMatch" in that it looks only at
// those areas within the unknown string that are likely to match. A list of
// potential matches is returned; it's up to the caller to determine which
// ones are acceptable.
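//
// An illustrative sketch of consuming the results, where "sc" is an existing
// *Classifier and "unknownText" is the text to classify (the 0.90 cutoff is
// an arbitrary example, not a package default):
//
//   for _, m := range sc.MultipleMatch(unknownText) {
//           if m.Confidence >= 0.90 {
//                   log.Printf("%q matches at offset %d (length %d)", m.Name, m.Offset, m.Extent)
//           }
//   }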
func (c *Classifier) MultipleMatch(s string) (matches Matches) {
pq := c.multipleMatch(s)
if pq == nil {
return matches
}
// A map to remove duplicate entries.
m := make(map[Match]bool)
for pq.Len() != 0 {
v := pq.Pop().(*Match)
if _, ok := m[*v]; !ok {
m[*v] = true
matches = append(matches, v)
}
}
sort.Sort(matches)
return matches.uniquify()
}
// possibleMatch identifies a known value and its diffRatio to a given string.
type possibleMatch struct {
value *knownValue
diffRatio float64
}
// likelyMatches is a slice of possibleMatches that can be sorted by their
// diffRatio to a given string, such that the most likely matches (based on
// length) are at the beginning.
type likelyMatches []possibleMatch
func (m likelyMatches) Len() int { return len(m) }
func (m likelyMatches) Less(i, j int) bool { return m[i].diffRatio > m[j].diffRatio }
func (m likelyMatches) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
// nearestMatch returns a Queue of the known values that the unknown string
// may match. The values are compared via their Levenshtein Distance and
// ranked with the nearest match at the beginning.
func (c *Classifier) nearestMatch(unknown string) *pq.Queue {
var mu sync.Mutex // Protect the priority queue.
pq := pq.NewQueue(func(x, y interface{}) bool {
return x.(*Match).Confidence > y.(*Match).Confidence
}, nil)
unknown = c.normalize(unknown)
if len(unknown) == 0 {
return pq
}
c.muValues.RLock()
var likely likelyMatches
for _, v := range c.values {
dr := diffRatio(unknown, v.normalizedValue)
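// Skip known values whose length differs too much from the unknown
// string to be a plausible match (see MinDiffRatio).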
if dr < c.MinDiffRatio {
continue
}
if unknown == v.normalizedValue {
// We found an exact match.
pq.Push(&Match{Name: v.key, Confidence: 1.0, Offset: 0, Extent: len(unknown)})
c.muValues.RUnlock()
return pq
}
likely = append(likely, possibleMatch{value: v, diffRatio: dr})
}
c.muValues.RUnlock()
sort.Sort(likely)
var wg sync.WaitGroup
classifyString := func(name, unknown, known string) {
defer wg.Done()
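// Compute the full diff between the unknown and known texts and convert
// its Levenshtein Distance into a confidence score.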
diffs := dmp.DiffMain(unknown, known, true)
distance := dmp.DiffLevenshtein(diffs)
confidence := confidencePercentage(len(unknown), len(known), distance)
if confidence > 0.0 {
mu.Lock()
pq.Push(&Match{Name: name, Confidence: confidence, Offset: 0, Extent: len(unknown)})
mu.Unlock()
}
}
wg.Add(len(likely))
for _, known := range likely {
go classifyString(known.value.key, unknown, known.value.normalizedValue)
}
wg.Wait()
return pq
}
// matcher finds all potential matches of "known" in "unknown". The results are
// placed in "queue".
type matcher struct {
unknown *searchset.SearchSet
normUnknown string
threshold float64
mu sync.Mutex
queue *pq.Queue
}
// newMatcher creates a "matcher" object.
func newMatcher(unknown string, threshold float64) *matcher {
return &matcher{
unknown: searchset.New(unknown, searchset.DefaultGranularity),
normUnknown: unknown,
threshold: threshold,
queue: pq.NewQueue(func(x, y interface{}) bool {
return x.(*Match).Confidence > y.(*Match).Confidence
}, nil),
}
}
// findMatches takes a known text and finds all potential instances of it in
// the unknown text. The resulting matches can then be filtered to determine
// which are the best matches.
func (m *matcher) findMatches(known *knownValue) {
var mrs []searchset.MatchRanges
if all := known.reValue.FindAllStringIndex(m.normUnknown, -1); all != nil {
// We found exact matches. Just use those!
for _, a := range all {
var start, end int
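// Map the byte range of the regexp match onto token indices within the
// unknown text's search set.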
for i, tok := range m.unknown.Tokens {
if tok.Offset == a[0] {
start = i
} else if tok.Offset >= a[len(a)-1]-len(tok.Text) {
end = i
break
}
}
mrs = append(mrs, searchset.MatchRanges{{
SrcStart: 0,
SrcEnd: len(known.set.Tokens),
TargetStart: start,
TargetEnd: end + 1,
}})
}
} else {
// No exact match. Perform a more thorough match.
mrs = searchset.FindPotentialMatches(known.set, m.unknown)
}
var wg sync.WaitGroup
for _, mr := range mrs {
if !m.withinConfidenceThreshold(known.set, mr) {
continue
}
wg.Add(1)
go func(mr searchset.MatchRanges) {
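// Compare the candidate range of the unknown text against the full
// known text and record the resulting confidence.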
start, end := mr.TargetRange(m.unknown)
conf := levDist(m.normUnknown[start:end], known.normalizedValue)
if conf > 0.0 {
m.mu.Lock()
m.queue.Push(&Match{Name: known.key, Confidence: conf, Offset: start, Extent: end - start})
m.mu.Unlock()
}
wg.Done()
}(mr)
}
wg.Wait()
}
// withinConfidenceThreshold reports whether we have enough confidence in the
// potential match to run the Levenshtein Distance algorithm on it. It does
// this by comparing the ratio of the matching range to the full known text
// against the configured threshold.
func (m *matcher) withinConfidenceThreshold(known *searchset.SearchSet, mr searchset.MatchRanges) bool {
return float64(mr.Size())/float64(len(known.Tokens)) >= m.threshold
}
// multipleMatch returns a Queue of values that might be within the unknown
// string. The values are compared via their Levenshtein Distance and ranked
// with the nearest match at the beginning.
func (c *Classifier) multipleMatch(unknown string) *pq.Queue {
normUnknown := c.normalize(unknown)
if normUnknown == "" {
return nil
}
m := newMatcher(normUnknown, c.threshold)
c.muValues.RLock()
var kvals []*knownValue
for _, known := range c.values {
kvals = append(kvals, known)
}
c.muValues.RUnlock()
var wg sync.WaitGroup
wg.Add(len(kvals))
for _, known := range kvals {
go func(known *knownValue) {
if known.set == nil {
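// Lazily build the search set for this value. "known" is the same
// pointer stored in c.values, so this assignment also populates
// known.set for the findMatches call below.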
k := searchset.New(known.normalizedValue, searchset.DefaultGranularity)
c.muValues.Lock()
c.values[known.key].set = k
c.muValues.Unlock()
}
m.findMatches(known)
wg.Done()
}(known)
}
wg.Wait()
return m.queue
}
// levDist runs the Levenshtein Distance algorithm on the known and unknown
// texts to measure how well they match.
func levDist(unknown, known string) float64 {
if len(known) == 0 || len(unknown) == 0 {
log.Printf("Zero-sized texts in Levenshtein Distance algorithm: known==%d, unknown==%d", len(known), len(unknown))
return 0.0
}
// Calculate the differences between the potentially matching known
// text and the unknown text.
diffs := dmp.DiffMain(unknown, known, false)
end := diffRangeEnd(known, diffs)
// Now execute the Levenshtein Distance algorithm to see how much it
// does match.
distance := dmp.DiffLevenshtein(diffs[:end])
return confidencePercentage(unknownTextLength(unknown, diffs), len(known), distance)
}
// unknownTextLength returns the length of the unknown text based on the diff range.
func unknownTextLength(unknown string, diffs []diffmatchpatch.Diff) int {
last := len(diffs) - 1
for ; last >= 0; last-- {
if diffs[last].Type == diffmatchpatch.DiffEqual {
break
}
}
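// Sum the lengths of the diff runs that come from the unknown text (equal
// and deleted runs) up to and including the last matching region.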
ulen := 0
for i := 0; i < last+1; i++ {
switch diffs[i].Type {
case diffmatchpatch.DiffEqual, diffmatchpatch.DiffDelete:
ulen += len(diffs[i].Text)
}
}
return ulen
}
// diffRangeEnd returns the end index for the "Diff" objects that construct
// (or nearly construct) the "known" value.
func diffRangeEnd(known string, diffs []diffmatchpatch.Diff) (end int) {
var seen string
for end = 0; end < len(diffs); end++ {
if seen == known {
// Once we've constructed the "known" value, then we've
// reached the point in the diff list where more
// "Diff"s would just make the Levenshtein Distance
// less valid. There shouldn't be further "DiffEqual"
// nodes, because there's nothing further to match in
// the "known" text.
break
}
switch diffs[end].Type {
case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert:
seen += diffs[end].Text
}
}
return end
}
// confidencePercentage calculates how confident we are in the result of the
// match. A confidence of "1.0" means an identical match, while a confidence
// of "0.0" means a complete mismatch.
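// For example, an unknown length of 100, a known length of 100, and an edit
// distance of 20 yield a confidence of 1.0 - 20/100 = 0.80.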
func confidencePercentage(ulen, klen, distance int) float64 {
if ulen == 0 && klen == 0 {
return 1.0
}
if ulen == 0 || klen == 0 || (distance > ulen && distance > klen) {
return 0.0
}
return 1.0 - float64(distance)/float64(max(ulen, klen))
}
// diffRatio calculates the ratio of the lengths of s1 and s2, returned as a
// fraction of the length of the longer string. E.g., diffRatio("abcd", "e")
// would return 0.25 because "e" is 25% of the length of "abcd". Comparing
// strings of equal (nonzero) length will return 1.
func diffRatio(s1, s2 string) float64 {
x, y := len(s1), len(s2)
if x == 0 && y == 0 {
// Both strings are zero length
return 1.0
}
if x < y {
return float64(x) / float64(y)
}
return float64(y) / float64(x)
}
func max(a, b int) int {
if a > b {
return a
}
return b
}
func min(a, b int) int {
if a < b {
return a
}
return b
}
// wsRegexp is a regexp used to identify blocks of whitespace.
var wsRegexp = regexp.MustCompile(`\s+`)
// FlattenWhitespace will flatten contiguous blocks of whitespace down to a single space.
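// For example, it turns "hello \t\n world" into "hello world".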
var FlattenWhitespace NormalizeFunc = func(s string) string {
return wsRegexp.ReplaceAllString(s, " ")
}