// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package stringclassifier finds the nearest match between a string and a set of known values. It
// uses the Levenshtein Distance (LD) algorithm to do this: a match with a large LD is less likely
// to be correct than one with a small LD. Each match carries a confidence percentage indicating
// how confident the algorithm is that the match is correct; the higher the percentage, the greater
// the confidence.
//
// Example Usage:
//
// type Text struct {
// Name string
// Text string
// }
//
// func NewClassifier(knownTexts []Text) (*stringclassifier.Classifier, error) {
// sc := stringclassifier.New(stringclassifier.DefaultConfidenceThreshold, stringclassifier.FlattenWhitespace)
// for _, known := range knownTexts {
// if err := sc.AddValue(known.Name, known.Text); err != nil {
// return nil, err
// }
// }
// return sc, nil
// }
//
// func IdentifyTexts(sc *stringclassifier.Classifier, unknownTexts []*Text) {
// for _, unknown := range unknownTexts {
// m := sc.NearestMatch(unknown.Text)
// log.Printf("The nearest match to %q is %q (confidence: %v)",
// unknown.Name, m.Name, m.Confidence)
// }
// }
package stringclassifier
import (
"fmt"
"log"
"math"
"regexp"
"sort"
"sync"
"github.com/google/licenseclassifier/stringclassifier/internal/pq"
"github.com/google/licenseclassifier/stringclassifier/searchset"
"github.com/sergi/go-diff/diffmatchpatch"
)
// dmp is the diff/match/patch object used to compute diffs and edit distances.
var dmp = diffmatchpatch.New()
const (
// DefaultConfidenceThreshold is the minimum ratio threshold between
// the matching range and the full source range that we're willing to
// accept in order to say that the matching range will produce a
// sufficiently good edit distance. I.e., if the matching range is
// below this threshold we won't run the Levenshtein Distance algorithm
// on it.
DefaultConfidenceThreshold float64 = 0.80
defaultMinDiffRatio float64 = 0.75
)
// A Classifier matches a string to a set of known values.
type Classifier struct {
muValues sync.RWMutex
values map[string]*knownValue
normalizers []NormalizeFunc
threshold float64
// MinDiffRatio defines the minimum ratio between the lengths of the
// unknown string and a known value for the known value to be considered
// a possible match. This is used as a performance optimization to
// eliminate values that are unlikely to match.
//
// For example, a value of 0.75 means that the shorter string must be
// at least 75% the length of the longer string to consider it a
// possible match.
//
// Setting this to 1.0 will require that the strings are of identical length.
// Setting this to 0 will consider all known values as possible
// matches.
MinDiffRatio float64
}
// NormalizeFunc is a function that is used to normalize a string prior to comparison.
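// As an illustrative sketch (not part of this package), a caller could define
// a custom normalizer that lower-cases input using the standard strings
// package and pass it to New alongside FlattenWhitespace:
//
//   var lowercase stringclassifier.NormalizeFunc = func(s string) string {
//           return strings.ToLower(s)
//   }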
type NormalizeFunc func(string) string
// New creates a new Classifier with the provided NormalizeFuncs. Each
// NormalizeFunc is applied in order to a string before comparison.
func New(threshold float64, funcs ...NormalizeFunc) *Classifier {
return &Classifier{
values: make(map[string]*knownValue),
normalizers: append([]NormalizeFunc(nil), funcs...),
threshold: threshold,
MinDiffRatio: defaultMinDiffRatio,
}
}
// knownValue identifies a value in the corpus to match against.
type knownValue struct {
key string
normalizedValue string
reValue *regexp.Regexp
set *searchset.SearchSet
}
// AddValue adds a known value to be matched against. If a value already exists
// for key, an error is returned.
func (c *Classifier) AddValue(key, value string) error {
c.muValues.Lock()
defer c.muValues.Unlock()
if _, ok := c.values[key]; ok {
return fmt.Errorf("value already registered with key %q", key)
}
norm := c.normalize(value)
c.values[key] = &knownValue{
key: key,
normalizedValue: norm,
reValue: regexp.MustCompile(regexp.QuoteMeta(norm)),
}
return nil
}
// AddPrecomputedValue adds a known value to be matched against. The value has
// already been normalized and the SearchSet object deserialized, so no
// processing is necessary.
func (c *Classifier) AddPrecomputedValue(key, value string, set *searchset.SearchSet) error {
c.muValues.Lock()
defer c.muValues.Unlock()
if _, ok := c.values[key]; ok {
return fmt.Errorf("value already registered with key %q", key)
}
set.GenerateNodeList()
c.values[key] = &knownValue{
key: key,
normalizedValue: value,
reValue: regexp.MustCompile(regexp.QuoteMeta(value)),
set: set,
}
return nil
}
// normalize a string by applying each of the registered NormalizeFuncs.
func (c *Classifier) normalize(s string) string {
for _, fn := range c.normalizers {
s = fn(s)
}
return s
}
// Match identifies the result of matching a string against a knownValue.
type Match struct {
Name string // Name of knownValue that was matched
Confidence float64 // Confidence percentage
Offset int // The offset into the unknown string at which the match begins
Extent int // The length of the match, starting at Offset, in the unknown string
}
// Matches is a list of Match values. It exists mainly so that the list can
// be sorted.
type Matches []*Match
func (m Matches) Len() int { return len(m) }
func (m Matches) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
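// Less reports whether the match at index i sorts before the match at index
// j: by descending confidence, then by name, then by ascending offset, then
// by descending extent.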
func (m Matches) Less(i, j int) bool {
if math.Abs(m[j].Confidence-m[i].Confidence) < math.SmallestNonzeroFloat64 {
if m[i].Name == m[j].Name {
if m[i].Offset > m[j].Offset {
return false
}
if m[i].Offset == m[j].Offset {
return m[i].Extent > m[j].Extent
}
return true
}
return m[i].Name < m[j].Name
}
return m[i].Confidence > m[j].Confidence
}
// Names returns an unsorted slice of the names of the matched known values.
func (m Matches) Names() []string {
var names []string
for _, n := range m {
names = append(names, n.Name)
}
return names
}
// uniquify goes through the matches and removes any that are contained within
// one with a higher confidence. This assumes that Matches is sorted.
func (m Matches) uniquify() Matches {
type matchedRange struct {
offset, extent int
}
var matched []matchedRange
var matches Matches
OUTER:
for _, match := range m {
for _, mr := range matched {
if match.Offset >= mr.offset && match.Offset <= mr.offset+mr.extent {
continue OUTER
}
}
matched = append(matched, matchedRange{match.Offset, match.Extent})
matches = append(matches, match)
}
return matches
}
// NearestMatch returns a Match identifying the known value that most closely
// matches the unknown string, along with a confidence percentage indicating
// how confident the classifier is in the result. A confidence of "1.0"
// indicates an exact match, while a confidence of "0.0" indicates a complete
// mismatch.
//
// If the string is equidistant from multiple known values, it is undefined
// which will be returned.
func (c *Classifier) NearestMatch(s string) *Match {
pq := c.nearestMatch(s)
if pq.Len() == 0 {
return &Match{}
}
return pq.Pop().(*Match)
}
// MultipleMatch tries to determine which known strings are found within an
// unknown string. This differs from "NearestMatch" in that it looks only at
// those areas within the unknown string that are likely to match. A list of
// potential matches is returned; it's up to the caller to determine which
// ones are acceptable.
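//
// An illustrative sketch of consuming the results, where "sc" is an existing
// *Classifier and "unknownText" is the text to classify (the 0.90 cutoff is
// an arbitrary example, not a package default):
//
//   for _, m := range sc.MultipleMatch(unknownText) {
//           if m.Confidence >= 0.90 {
//                   log.Printf("%q matches at offset %d (length %d)", m.Name, m.Offset, m.Extent)
//           }
//   }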
func (c *Classifier) MultipleMatch(s string) (matches Matches) {
pq := c.multipleMatch(s)
if pq == nil {
return matches
}
// A map to remove duplicate entries.
m := make(map[Match]bool)
for pq.Len() != 0 {
v := pq.Pop().(*Match)
if _, ok := m[*v]; !ok {
m[*v] = true
matches = append(matches, v)
}
}
sort.Sort(matches)
return matches.uniquify()
}
// possibleMatch identifies a known value and its diffRatio to a given string.
type possibleMatch struct {
value *knownValue
diffRatio float64
}
// likelyMatches is a slice of possibleMatches that can be sorted by their
// diffRatio to a given string, such that the most likely matches (based on
// length) are at the beginning.
type likelyMatches []possibleMatch
func (m likelyMatches) Len() int { return len(m) }
func (m likelyMatches) Less(i, j int) bool { return m[i].diffRatio > m[j].diffRatio }
func (m likelyMatches) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
// nearestMatch returns a Queue of the known values that the unknown string
// may match. The values are compared via their Levenshtein Distance and
// ranked with the nearest match at the beginning.
func (c *Classifier) nearestMatch(unknown string) *pq.Queue {
var mu sync.Mutex // Protect the priority queue.
pq := pq.NewQueue(func(x, y interface{}) bool {
return x.(*Match).Confidence > y.(*Match).Confidence
}, nil)
unknown = c.normalize(unknown)
if len(unknown) == 0 {
return pq
}
c.muValues.RLock()
var likely likelyMatches
for _, v := range c.values {
dr := diffRatio(unknown, v.normalizedValue)
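// Skip known values whose length differs too much from the unknown
// string to be a plausible match (see MinDiffRatio).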
if dr < c.MinDiffRatio {
continue
}
if unknown == v.normalizedValue {
// We found an exact match.
pq.Push(&Match{Name: v.key, Confidence: 1.0, Offset: 0, Extent: len(unknown)})
c.muValues.RUnlock()
return pq
}
likely = append(likely, possibleMatch{value: v, diffRatio: dr})
}
c.muValues.RUnlock()
sort.Sort(likely)
var wg sync.WaitGroup
classifyString := func(name, unknown, known string) {
defer wg.Done()
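// Compute the full diff between the unknown and known texts and convert
// its Levenshtein Distance into a confidence score.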
diffs := dmp.DiffMain(unknown, known, true)
distance := dmp.DiffLevenshtein(diffs)
confidence := confidencePercentage(len(unknown), len(known), distance)
if confidence > 0.0 {
mu.Lock()
pq.Push(&Match{Name: name, Confidence: confidence, Offset: 0, Extent: len(unknown)})
mu.Unlock()
}
}
wg.Add(len(likely))
for _, known := range likely {
go classifyString(known.value.key, unknown, known.value.normalizedValue)
}
wg.Wait()
return pq
}
// matcher finds all potential matches of "known" in "unknown". The results are
// placed in "queue".
type matcher struct {
unknown *searchset.SearchSet
normUnknown string
threshold float64
mu sync.Mutex
queue *pq.Queue
}
// newMatcher creates a "matcher" object.
func newMatcher(unknown string, threshold float64) *matcher {
return &matcher{
unknown: searchset.New(unknown, searchset.DefaultGranularity),
normUnknown: unknown,
threshold: threshold,
queue: pq.NewQueue(func(x, y interface{}) bool {
return x.(*Match).Confidence > y.(*Match).Confidence
}, nil),
}
}
// findMatches takes a known text and finds all potential instances of it in
// the unknown text. The resulting matches can then be filtered to determine
// which are the best matches.
func (m *matcher) findMatches(known *knownValue) {
var mrs []searchset.MatchRanges
if all := known.reValue.FindAllStringIndex(m.normUnknown, -1); all != nil {
// We found exact matches. Just use those!
for _, a := range all {
var start, end int
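// Map the byte range of the regexp match onto token indices within the
// unknown text's search set.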
for i, tok := range m.unknown.Tokens {
if tok.Offset == a[0] {
start = i
} else if tok.Offset >= a[len(a)-1]-len(tok.Text) {
end = i
break
}
}
mrs = append(mrs, searchset.MatchRanges{{
SrcStart: 0,
SrcEnd: len(known.set.Tokens),
TargetStart: start,
TargetEnd: end + 1,
}})
}
} else {
// No exact match. Perform a more thorough match.
mrs = searchset.FindPotentialMatches(known.set, m.unknown)
}
var wg sync.WaitGroup
for _, mr := range mrs {
if !m.withinConfidenceThreshold(known.set, mr) {
continue
}
wg.Add(1)
go func(mr searchset.MatchRanges) {
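// Compare the candidate range of the unknown text against the full
// known text and record the resulting confidence.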
start, end := mr.TargetRange(m.unknown)
conf := levDist(m.normUnknown[start:end], known.normalizedValue)
if conf > 0.0 {
m.mu.Lock()
m.queue.Push(&Match{Name: known.key, Confidence: conf, Offset: start, Extent: end - start})
m.mu.Unlock()
}
wg.Done()
}(mr)
}
wg.Wait()
}
// withinConfidenceThreshold reports whether we have enough confidence in the
// potential match to run the Levenshtein Distance algorithm on it. It does
// this by comparing the ratio of the matching range to the full known text
// against the configured threshold.
func (m *matcher) withinConfidenceThreshold(known *searchset.SearchSet, mr searchset.MatchRanges) bool {
return float64(mr.Size())/float64(len(known.Tokens)) >= m.threshold
}
// multipleMatch returns a Queue of values that might be within the unknown
// string. The values are compared via their Levenshtein Distance and ranked
// with the nearest match at the beginning.
func (c *Classifier) multipleMatch(unknown string) *pq.Queue {
normUnknown := c.normalize(unknown)
if normUnknown == "" {
return nil
}
m := newMatcher(normUnknown, c.threshold)
c.muValues.RLock()
var kvals []*knownValue
for _, known := range c.values {
kvals = append(kvals, known)
}
c.muValues.RUnlock()
var wg sync.WaitGroup
wg.Add(len(kvals))
for _, known := range kvals {
go func(known *knownValue) {
if known.set == nil {
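// Lazily build the search set for this value. "known" is the same
// pointer stored in c.values, so this assignment also populates
// known.set for the findMatches call below.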
k := searchset.New(known.normalizedValue, searchset.DefaultGranularity)
c.muValues.Lock()
c.values[known.key].set = k
c.muValues.Unlock()
}
m.findMatches(known)
wg.Done()
}(known)
}
wg.Wait()
return m.queue
}
// levDist runs the Levenshtein Distance algorithm on the known and unknown
// texts to measure how well they match.
func levDist(unknown, known string) float64 {
if len(known) == 0 || len(unknown) == 0 {
log.Printf("Zero-sized texts in Levenshtein Distance algorithm: known==%d, unknown==%d", len(known), len(unknown))
return 0.0
}
// Calculate the differences between the potentially matching known
// text and the unknown text.
diffs := dmp.DiffMain(unknown, known, false)
end := diffRangeEnd(known, diffs)
// Now execute the Levenshtein Distance algorithm to see how much it
// does match.
distance := dmp.DiffLevenshtein(diffs[:end])
return confidencePercentage(unknownTextLength(unknown, diffs), len(known), distance)
}
// unknownTextLength returns the length of the unknown text based on the diff range.
func unknownTextLength(unknown string, diffs []diffmatchpatch.Diff) int {
last := len(diffs) - 1
for ; last >= 0; last-- {
if diffs[last].Type == diffmatchpatch.DiffEqual {
break
}
}
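// Sum the lengths of the diff runs that come from the unknown text (equal
// and deleted runs) up to and including the last matching region.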
ulen := 0
for i := 0; i < last+1; i++ {
switch diffs[i].Type {
case diffmatchpatch.DiffEqual, diffmatchpatch.DiffDelete:
ulen += len(diffs[i].Text)
}
}
return ulen
}
// diffRangeEnd returns the end index for the "Diff" objects that construct
// (or nearly construct) the "known" value.
func diffRangeEnd(known string, diffs []diffmatchpatch.Diff) (end int) {
var seen string
for end = 0; end < len(diffs); end++ {
if seen == known {
// Once we've constructed the "known" value, then we've
// reached the point in the diff list where more
// "Diff"s would just make the Levenshtein Distance
// less valid. There shouldn't be further "DiffEqual"
// nodes, because there's nothing further to match in
// the "known" text.
break
}
switch diffs[end].Type {
case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert:
seen += diffs[end].Text
}
}
return end
}
// confidencePercentage calculates how confident we are in the result of the
// match. A confidence of "1.0" means an identical match, while a confidence
// of "0.0" means a complete mismatch.
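// For example, an unknown length of 100, a known length of 100, and an edit
// distance of 20 yield a confidence of 1.0 - 20/100 = 0.80.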
func confidencePercentage(ulen, klen, distance int) float64 {
if ulen == 0 && klen == 0 {
return 1.0
}
if ulen == 0 || klen == 0 || (distance > ulen && distance > klen) {
return 0.0
}
return 1.0 - float64(distance)/float64(max(ulen, klen))
}
// diffRatio calculates the ratio of the lengths of s1 and s2, returned as a
// fraction of the length of the longer string. E.g., diffRatio("abcd", "e")
// would return 0.25 because "e" is 25% of the length of "abcd". Comparing
// strings of equal (nonzero) length will return 1.
func diffRatio(s1, s2 string) float64 {
x, y := len(s1), len(s2)
if x == 0 && y == 0 {
// Both strings are zero length
return 1.0
}
if x < y {
return float64(x) / float64(y)
}
return float64(y) / float64(x)
}
func max(a, b int) int {
if a > b {
return a
}
return b
}
func min(a, b int) int {
if a < b {
return a
}
return b
}
// wsRegexp is a regexp used to identify blocks of whitespace.
var wsRegexp = regexp.MustCompile(`\s+`)
// FlattenWhitespace will flatten contiguous blocks of whitespace down to a single space.
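// For example, it turns "hello \t\n world" into "hello world".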
var FlattenWhitespace NormalizeFunc = func(s string) string {
return wsRegexp.ReplaceAllString(s, " ")
}