| // Copyright 2020 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package classifier |
| |
| import ( |
| "fmt" |
| "hash/crc32" |
| "math" |
| "sort" |
| |
| "github.com/davecgh/go-spew/spew" |
| ) |
| |
| // searchSet is a set of q-grams that have hashes associated with them, |
| // making it fast to search for potential matches. |
| type searchSet struct { |
| // Tokens is a tokenized list of the original input string. |
| Tokens []indexedToken |
// Hashes is a map from checksums to the token ranges that produced them.
| Hashes hash |
| // Checksums is a list of checksums ordered from longest range to |
| // shortest. |
| Checksums []uint32 |
| // ChecksumRanges are the token ranges for the above checksums. |
| ChecksumRanges tokenRanges |
origin string // A debugging identifier labeling what this searchset is associated with.
| |
| nodes []*node |
| q int // The length of q-grams in this searchset. |
| } |
| |
| // node consists of a range of tokens along with the checksum for those tokens. |
| type node struct { |
| checksum uint32 |
| tokens *tokenRange |
| } |
| |
| func (n *node) String() string { |
| return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End) |
| } |
| |
// newSearchSet creates a new searchSet object. A searchSet generates all
// possible q-grams of tokens from the input document. These q-grams can be
// correlated to determine where a section of text from one source may appear
// in another source.
| func newSearchSet(s *indexedDocument, q int) *searchSet { |
| // Start generating hash values for all q-grams within the text. |
| h := make(hash) |
| if len(s.Tokens) < q { |
// We can't have a q larger than the number of tokens; shrink q to fit.
| q = len(s.Tokens) |
| } |
| checksums, tokenRanges := generateHashes(h, q, s.Tokens, s.dict) |
| sset := &searchSet{ |
| Tokens: s.Tokens, |
| Hashes: h, |
| Checksums: checksums, |
| ChecksumRanges: tokenRanges, |
| q: q, |
| } |
| sset.generateNodeList() |
| return sset |
| } |
| |
| // tokenRange indicates the range of tokens that map to a particular checksum. |
| type tokenRange struct { |
| Start int |
| End int |
| } |
| |
| func (t *tokenRange) String() string { |
| return fmt.Sprintf("[%v, %v)", t.Start, t.End) |
| } |
| |
// tokenRanges is a sortable slice of tokenRange values.
| type tokenRanges []*tokenRange |
| |
| // generateHashes computes a hash using CRC-32 for each q-gram encountered in the provided tokens. |
| func generateHashes(h hash, q int, toks []indexedToken, dict *dictionary) ([]uint32, tokenRanges) { |
| if q == 0 { |
| return nil, nil |
| } |
| var css []uint32 |
| var tr tokenRanges |
| crc := crc32.NewIEEE() |
| for offset := 0; offset+q <= len(toks); offset++ { |
| crc.Reset() |
| for i := 0; i < q; i++ { |
| crc.Write([]byte(dict.getWord(toks[offset+i].ID))) |
| crc.Write([]byte{' '}) |
| } |
| cs := crc.Sum32() |
| css = append(css, cs) |
| tr = append(tr, &tokenRange{offset, offset + q}) |
| h.add(cs, offset, offset+q) |
| } |
| |
| return css, tr |
| } |
| |
| // generateNodeList creates a node list out of the search set. |
| func (s *searchSet) generateNodeList() { |
| if len(s.Tokens) == 0 { |
| return |
| } |
| |
| for i := 0; i < len(s.Checksums); i++ { |
| s.nodes = append(s.nodes, &node{ |
| checksum: s.Checksums[i], |
| tokens: s.ChecksumRanges[i], |
| }) |
| } |
| } |
| |
// matchRange pairs a range within the source text with the range in the
// target text that it matches.
| type matchRange struct { |
| // Offsets into the source tokens. |
| SrcStart, SrcEnd int |
| // Offsets into the target tokens. |
| TargetStart, TargetEnd int |
// TokensClaimed tracks the number of positively matched tokens in this
// range. For freshly created matchRanges, this equals the extent of the
// range. However, as matchRanges get merged together and error is
// potentially introduced, it records the number of tokens actually
// matched rather than the extent of the range.
| TokensClaimed int |
| } |
| |
// in reports whether m's target range is entirely enclosed within the
// target range of other.
| func (m *matchRange) in(other *matchRange) bool { |
| return m.TargetStart >= other.TargetStart && m.TargetEnd <= other.TargetEnd |
| } |
| |
| func (m *matchRange) String() string { |
| return fmt.Sprintf("S: [%v, %v)-> T: [%v, %v) => %v [%v]", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd, m.TargetStart-m.SrcStart, m.TokensClaimed) |
| } |
| |
// matchRanges is a list of matchRange values. Each range indicates a single
// potential occurrence of the source text in the target text. The ranges
// sort with the most tokens claimed first, supporting greedy matching with
// the longest runs of q-grams appearing first.
| type matchRanges []*matchRange |
| |
| func (m matchRanges) Len() int { return len(m) } |
| func (m matchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] } |
| func (m matchRanges) Less(i, j int) bool { |
| if m[i].TokensClaimed != m[j].TokensClaimed { |
| return m[i].TokensClaimed > m[j].TokensClaimed |
| } |
| |
| if m[i].TargetStart != m[j].TargetStart { |
| return m[i].TargetStart < m[j].TargetStart |
| } |
| return m[i].SrcStart < m[j].SrcStart |
| } |
| |
| // findPotentialMatches returns the ranges in the target (unknown) text that |
| // are best potential matches to the source (known) text. |
| func (c *Classifier) findPotentialMatches(src, target *searchSet, confidence float64) matchRanges { |
| matchedRanges := c.getMatchedRanges(src, target, confidence, src.q) |
| if c.tc.traceSearchset(src.origin) { |
| c.tc.trace("matchedRanges = %s", spew.Sdump(matchedRanges)) |
| } |
| if len(matchedRanges) == 0 { |
| return nil |
| } |
| |
// After computing all potential matches, we only output ranges that contain
// enough tokens to clear the confidence threshold. Because merging can
// overcount claimed tokens, this filter can pass false positives, but it
// cannot produce false negatives.
| threshold := int(confidence * float64(len(src.Tokens))) |
| |
| for i, m := range matchedRanges { |
| if m.TokensClaimed < threshold { |
| matchedRanges = matchedRanges[:i] |
| break |
| } |
| } |
| |
| return matchedRanges |
| } |
| |
// fuseRanges analyzes the source matches, attempting to combine hits without
// errors into larger hits with tolerable amounts of error to produce matches
// that contain enough tokens to be considered for exact matching against a
// target document. This routine intentionally does not accurately track
// error contributions from merging runs, trading false positives (but not
// false negatives) for faster performance.
| func (c *Classifier) fuseRanges(origin string, matched matchRanges, confidence float64, size int, runs []matchRange, targetSize int) matchRanges { |
| var claimed matchRanges |
| errorMargin := int(math.Round(float64(size) * (1.0 - confidence))) |
| |
| filter := make([]bool, targetSize) |
| for _, m := range runs { |
| for i := m.SrcStart; i < m.SrcEnd; i++ { |
// Only consider these offsets if they fit into the target document,
// since the target may be smaller than the source being analyzed.
| if i < targetSize { |
| filter[i] = true |
| } |
| } |
| } |
| |
| filterDrops := 0 |
| filterPasses := 0 |
| |
// For each hit detected, compare it against all previous hits to see if it
// can be part of a match or represents a group that is eligible for matching
// and having other hits contribute to it.
| for i, m := range matched { |
| off := m.TargetStart - m.SrcStart |
| |
| // If the offset is negative, but within error margins, we associate it |
| // with the first index to see if it could contribute to a run. If the |
| // absolute offset is larger than the error margin, it can't possibly |
| // contribute and will be dropped. This puts more potential error into the zero |
| // index, but that just slightly increases the rate of false positives. In |
| // practice, this would only be an issue if there are major substrings of a |
| // source in a target that aren't part of a real hit. We see many small |
| // references (the name of a license) but not large chunks of the license. |
| if off < 0 { |
| if -off <= errorMargin { |
| off = 0 |
| } else { |
| continue |
| } |
| } |
| |
| // If the filter is false, there was not sufficient token density in that |
| // part of the target document for a viable match, so this match is a |
| // spurious hit and can be discarded. |
| if !filter[off] { |
| filterDrops++ |
| continue |
| } |
| |
| filterPasses++ |
| unclaimed := true |
| |
for _, cl := range claimed {
moff := m.TargetStart - m.SrcStart
coff := cl.TargetStart - cl.SrcStart
sampleError := int(math.Round(math.Abs(float64(moff - coff))))
withinError := sampleError < errorMargin

// The contribution needs to add more value than the accumulated error. This
// prevents a spurious match on a license reference from incorrectly
// overextending the match range.
if withinError && m.TokensClaimed > sampleError {
if m.in(cl) {
// This can cause too many tokens to be claimed, but that's fine since we want to avoid
// undercounting and missing content.
cl.TokensClaimed += m.TokensClaimed
unclaimed = false
} else {
// See if the claims can be merged. If the error tolerances allow for it,
// we merge the claims and the ranges. This doesn't accumulate error
// accurately, so it's possible that repeated merges introduce too much
// error to actually make a match, but we won't get false negatives from
// this approach. Even when the error tolerances allow a merge, we only
// merge if it increases the range of tokens being covered. If this match
// is already handled by an existing (stronger by definition) claim, we
// don't merge it, but treat it as a new claim. This allows for the case
// where a highly fragmented text is matched by a long series of
// low-score matches.
if m.TargetStart < cl.TargetStart && m.SrcStart < cl.SrcStart {
cl.TargetStart = m.TargetStart
cl.SrcStart = m.SrcStart
cl.TokensClaimed += m.TokensClaimed
unclaimed = false
} else if m.TargetEnd > cl.TargetEnd && m.SrcEnd > cl.SrcEnd {
cl.TargetEnd = m.TargetEnd
cl.SrcEnd = m.SrcEnd
cl.TokensClaimed += m.TokensClaimed
unclaimed = false
}
// This claim does not extend any existing block, and it may be claimed in its own
// right.
}
}
| if !unclaimed { |
| break |
| } |
| } |
// Only create a new claim if this hit is likely relevant: here, a hit must
// claim more than a tenth of the tokens of the strongest hit. If we have
// already seen much higher quality hits, this one is likely spurious noise;
// otherwise, we keep it around.
| if unclaimed && m.TokensClaimed*10 > matched[0].TokensClaimed { |
| claimed = append(claimed, m) |
| } |
| if c.tc.traceSearchset(origin) { |
| c.tc.trace("after %d ranges, claimed is %s", i, spew.Sdump(claimed)) |
| } |
| } |
| sort.Sort(claimed) |
| if c.tc.traceSearchset(origin) { |
| c.tc.trace("filterPasses = %+v", filterPasses) |
| c.tc.trace("filterDrops = %+v", filterDrops) |
| c.tc.trace("claimed = %s", spew.Sdump(claimed)) |
| } |
| return claimed |
| } |
| |
| // getMatchedRanges finds the ranges in the target text that match the source |
| // text. The ranges returned are ordered from the entries with the most matched |
| // tokens to the least. |
| func (c *Classifier) getMatchedRanges(src, target *searchSet, confidence float64, q int) matchRanges { |
| shouldTrace := c.tc.traceSearchset(src.origin) |
| |
| if shouldTrace { |
| c.tc.trace("src.origin = %+v", src.origin) |
| } |
| // Assemble a list of all the matched q-grams without any consideration to |
| // error tolerances. |
| matched := targetMatchedRanges(src, target) |
| if shouldTrace { |
| c.tc.trace("matched = %s", spew.Sdump(matched)) |
| } |
| if len(matched) == 0 { |
| return nil |
| } |
| |
// Perform neighborhood matching to figure out which clusters of q-grams have
// sufficient likelihood to be a potential match to the source. For a
// confidence threshold of X (X <= 1.0), we require that a sequence of N
// target tokens contain at least N*X ordered source tokens in order to be a
// viable match.
| // |
| // It's much easier to compute potential ranges in the target if we disregard |
| // proper ordering of source tokens initially, and see which source q-grams |
| // appear in sufficient quantities to be a potential match. We can then |
| // disregard any q-gram that falls outside of that range. This helps |
| // significantly since processing token matches is an N^2 (or worse) |
| // operation, so reducing N is a big win. |
| |
| runs := c.detectRuns(src.origin, matched, len(target.Tokens), len(src.Tokens), confidence, q) |
| |
| if shouldTrace { |
| c.tc.trace("runs = %d: %s", len(runs), spew.Sdump(runs)) |
| } |
| |
| // If there are no target runs of source tokens, we're done. |
| if len(runs) == 0 { |
| return nil |
| } |
| |
| // Using the runs as a filter to ignore irrelevant matches, fuse the source |
| // match ranges into larger matches (with possible errors) to see if we can |
| // produce large enough runs that pass the confidence threshold. |
| |
| fr := c.fuseRanges(src.origin, matched, confidence, len(src.Tokens), runs, len(target.Tokens)) |
| if shouldTrace { |
| c.tc.trace("fr = %s", spew.Sdump(fr)) |
| } |
| return fr |
| } |
| |
| func (c *Classifier) detectRuns(origin string, matched matchRanges, targetLength, subsetLength int, threshold float64, q int) []matchRange { |
| shouldTrace := c.tc.traceSearchset(origin) |
| hits := make([]bool, targetLength) |
| for _, m := range matched { |
| for idx := m.TargetStart; idx < m.TargetEnd; idx++ { |
| hits[idx] = true |
| } |
| } |
| |
| if len(hits) == 0 { |
| return nil |
| } |
| var out []int |
| |
| total := 0 |
| target := int(float64(subsetLength) * threshold) |
| if shouldTrace { |
| c.tc.trace("target = %+v", target) |
| c.tc.trace("targetLength = %+v", targetLength) |
| c.tc.trace("subsetLength = %+v", subsetLength) |
| } |
| |
// If we don't have at least one full subset (i.e. the target is shorter
// than the source), just analyze what we have.
| if len(hits) < subsetLength { |
| if shouldTrace { |
| c.tc.trace("trimmed search length from %d to %d", subsetLength, len(hits)) |
| } |
| subsetLength = len(hits) |
| } |
| // Initialize our sliding window value. |
| for i := 0; i < subsetLength; i++ { |
| if hits[i] { |
| total++ |
| } |
| } |
| |
| if total >= target { |
| out = append(out, 0) |
| } |
| |
// Now slide the window through the hits, updating the total by subtracting
// the hit that falls off the left edge and adding the hit that enters on
// the right.
| for i := 1; i < len(hits); i++ { |
| if hits[i-1] { |
| total-- |
| } |
end := i + subsetLength - 1
if end < len(hits) && hits[end] {
| total++ |
| } |
| if total >= target { |
| out = append(out, i) |
| } |
| } |
| if len(out) == 0 { |
| return nil |
| } |
| |
| final := []matchRange{ |
| { |
| SrcStart: out[0], |
| SrcEnd: out[0] + q, |
| }, |
| } |
| for i := 1; i < len(out); i++ { |
| if out[i] != 1+out[i-1] { |
| final = append(final, matchRange{ |
| SrcStart: out[i], |
| SrcEnd: out[i] + q, |
| }) |
| } else { |
| final[len(final)-1].SrcEnd = out[i] + q |
| } |
| } |
| |
| return final |
| } |
| |
| func targetMatchedRanges(src, target *searchSet) matchRanges { |
| offsetMappings := make(map[int][]*matchRange) |
| |
| var matched matchRanges |
| for _, tgtNode := range target.nodes { |
| sr, ok := src.Hashes[tgtNode.checksum] |
| if !ok { |
| continue |
| } |
| |
| tv := tgtNode.tokens |
| for _, sv := range sr { |
| offset := tv.Start - sv.Start |
| if om, ok := offsetMappings[offset]; ok { |
// See if this extends the most recent existing mapping.
lastIdx := len(om) - 1
if om[lastIdx].TargetEnd == tv.End-1 {
// The new q-gram picks up where the previous one left off, shifted
// by one token. Extend the existing mapping in place.
| om[lastIdx].SrcEnd = sv.End |
| om[lastIdx].TargetEnd = tv.End |
| continue |
| } |
| } |
| offsetMappings[offset] = append(offsetMappings[offset], &matchRange{ |
| SrcStart: sv.Start, |
| SrcEnd: sv.End, |
| TargetStart: tv.Start, |
| TargetEnd: tv.End, |
| }) |
| } |
| } |
| |
| // Compute the number of tokens claimed in each run and flatten into a single slice. |
| for _, mr := range offsetMappings { |
| for _, m := range mr { |
| m.TokensClaimed = m.TargetEnd - m.TargetStart |
| } |
| matched = append(matched, mr...) |
| } |
| sort.Sort(matched) |
| return matched |
| } |
| |
// hash maps the CRC-32 checksum of a q-gram to the token ranges whose
// contents produced that checksum.
type hash map[uint32]tokenRanges
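// For example, h.add(0xdeadbeef, 3, 7) records that the q-gram spanning
// tokens [3, 7) hashed to 0xdeadbeef; looking up that checksum later
// returns every token range that produced it.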
| |
// add records that the tokens in the range [start, end) hash to checksum.
func (h hash) add(checksum uint32, start, end int) {
| h[checksum] = append(h[checksum], &tokenRange{Start: start, End: end}) |
| } |