v2/frequencies.go - third_party/github.com/google/licenseclassifier - Git at Google

 // Copyright 2020 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package classifier

 // frequencyTable tallies how many times each token occurs.
 type frequencyTable struct {
 	// counts maps a token ID to the number of instances of that token.
 	counts map[tokenID]int
 }

 // newFrequencyTable returns an empty frequency table whose counts map is
 // allocated and ready to be written to.
 func newFrequencyTable() *frequencyTable {
 	table := new(frequencyTable)
 	table.counts = map[tokenID]int{}
 	return table
 }

 // update adds one to the tally of each token occurrence found in d.Tokens.
 // Counts already present in f are kept, so repeated calls accumulate.
 func (f *frequencyTable) update(d *indexedDocument) {
 	for i := range d.Tokens {
 		id := d.Tokens[i].ID
 		f.counts[id]++
 	}
 }

 // generateFrequencies replaces d.f with a fresh frequency table built from
 // d's own tokens. Any previously stored table is discarded.
 func (d *indexedDocument) generateFrequencies() {
 	table := newFrequencyTable()
 	table.update(d)
 	d.f = table
 }

 // tokenSimilarity returns a confidence score of how well d contains
 // the tokens of o. This is used as a fast similarity metric to
 // avoid running more expensive classifiers.
 //
 // The score is the fraction of o's unique tokens that occur in d at least
 // as many times as they occur in o. 1.0 means every token of o is present
 // in d in sufficient quantity. If o has no tokens the result is 0 rather
 // than the NaN that 0/0 would produce.
 //
 // Both documents must have their frequency tables populated (see
 // generateFrequencies); a nil table causes a panic.
 func (d *indexedDocument) tokenSimilarity(o *indexedDocument) float64 {
 	total := len(o.f.counts)
 	if total == 0 {
 		// There is nothing to compare against, so report no similarity
 		// instead of dividing zero by zero.
 		return 0
 	}
 	// A token of o counts as a hit when d holds at least as many instances
 	// of it as o does. If the resulting ratio is lower than the desired
 	// matching threshold, o can't possibly match d.
 	// Profiling indicates a significant amount of time is spent here.
 	// Avoiding checking (or storing) "uninteresting" tokens (common English
 	// words) could help.
 	hits := 0
 	for t, c := range o.f.counts {
 		if d.f.counts[t] >= c {
 			hits++
 		}
 	}
 	return float64(hits) / float64(total)
 }
	// Copyright 2020 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package classifier

	// frequencyTable tallies how many times each token occurs.
	type frequencyTable struct {
		// counts maps a token ID to the number of instances of that token.
		counts map[tokenID]int
	}

	// newFrequencyTable returns an empty frequency table whose counts map is
	// allocated and ready to be written to.
	func newFrequencyTable() *frequencyTable {
		table := new(frequencyTable)
		table.counts = map[tokenID]int{}
		return table
	}

	// update adds one to the tally of each token occurrence found in d.Tokens.
	// Counts already present in f are kept, so repeated calls accumulate.
	//
	// Both the receiver and d are pointers: generateFrequencies passes its
	// *indexedDocument receiver straight through, and the document is not
	// copied on each call.
	func (f *frequencyTable) update(d *indexedDocument) {
		for _, tok := range d.Tokens {
			f.counts[tok.ID]++
		}
	}

	// generateFrequencies replaces d.f with a fresh frequency table built from
	// d's own tokens. Any previously stored table is discarded.
	func (d *indexedDocument) generateFrequencies() {
		table := newFrequencyTable()
		table.update(d)
		d.f = table
	}

	// tokenSimilarity returns a confidence score of how well d contains
	// the tokens of o. This is used as a fast similarity metric to
	// avoid running more expensive classifiers.
	//
	// The score is the fraction of o's unique tokens that occur in d at least
	// as many times as they occur in o. 1.0 means every token of o is present
	// in d in sufficient quantity. If o has no tokens the result is 0 rather
	// than the NaN that 0/0 would produce.
	//
	// Both documents are taken by pointer, matching generateFrequencies, and
	// must have their frequency tables populated; a nil table causes a panic.
	func (d *indexedDocument) tokenSimilarity(o *indexedDocument) float64 {
		total := len(o.f.counts)
		if total == 0 {
			// There is nothing to compare against, so report no similarity
			// instead of dividing zero by zero.
			return 0
		}
		// A token of o counts as a hit when d holds at least as many instances
		// of it as o does. If the resulting ratio is lower than the desired
		// matching threshold, o can't possibly match d.
		// Profiling indicates a significant amount of time is spent here.
		// Avoiding checking (or storing) "uninteresting" tokens (common English
		// words) could help.
		hits := 0
		for t, c := range o.f.counts {
			if d.f.counts[t] >= c {
				hits++
			}
		}
		return float64(hits) / float64(total)
	}