| // Copyright 2020 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // Package classifier provides the implementation of the v2 license classifier. |
| package classifier |
| |
| import ( |
| "fmt" |
| "os" |
| "strings" |
| ) |
| |
| type tokenID int // type to ensure safety when manipulating token identifiers. |
| |
| // token provides detailed information about a single textual token in the document. |
| type token struct { |
| Text string // normalized text of the token |
| Index int // the token's location in the tokenized document |
| Line int // line position of this token in the source |
| Previous string // for the first token in a line, any previous text. |
| } |
| |
| // document is the representation of the input text for downstream filtering and matching. |
| type document struct { |
| Tokens []*token // ordered tokens of the document |
| } |
| |
| type indexedToken struct { |
| Index int // the token's location in the tokenized document |
| Line int // line position of this token in the source |
| ID tokenID // identifier of the text in the dictionary |
| } |
| |
type indexedDocument struct {
	Tokens []indexedToken  // ordered tokens of the document
	f      *frequencyTable // frequencies computed for this document
	dict   *dictionary     // the corpus dictionary for this document
	s      *searchSet      // the search set for this document
	runes  []rune          // the token IDs encoded as runes, used by the diff algorithm
	norm   string          // the normalized token sequence
}
| |
// generateSearchSet computes the q-gram search set used to locate this
// document during matching.
func (d *indexedDocument) generateSearchSet(q int) {
	d.s = newSearchSet(d, q)
}
| |
// size returns the number of tokens in the document.
func (d *indexedDocument) size() int {
	return len(d.Tokens)
}
| |
| // normalized returns a string of the normalized tokens concatenated with a |
| // single space. This is used by the diff algorithm. |
| // TODO: it'd be more efficient to have the diff algorithm work with the raw tokens directly |
| // and avoid these ephemeral allocations. |
| func (d *indexedDocument) normalized() string { |
| var w strings.Builder |
| for i, t := range d.Tokens { |
| w.WriteString(d.dict.getWord(t.ID)) |
| if (i + 1) != d.size() { |
| w.WriteString(" ") |
| } |
| } |
| return w.String() |
| } |
| |
| func computeQ(threshold float64) int { |
| // q is the lower bound for token runs (q-grams) that must exist |
| // in content that can be recognized at the specified threshold. |
	// Imagine a document with 100 tokens and a threshold of 80%. In the
	// worst case, the 20 errors are evenly distributed to create the
	// shortest possible token runs: a repeating sequence of 4 good tokens
	// followed by 1 errored token, occurring 20 times. This function returns
	// that minimum run length, clamped to at least 1 (since a threshold
	// below 50% would otherwise yield a meaningless run length of 0).
	if threshold == 1.0 {
		return 10 // avoid dividing by zero
	}

	return max(1, int(threshold/(1.0-threshold)))
| } |
| |
| func max(a, b int) int { |
| if a > b { |
| return a |
| } |
| return b |
| } |
| |
| // AddContent incorporates the provided textual content into the classifier for |
| // matching. This will not modify the supplied content. |
| func (c *Classifier) AddContent(category, name, variant string, content []byte) { |
| doc := tokenize(content) |
| c.addDocument(category, name, variant, doc) |
| } |
| |
| // addDocument takes a textual document and incorporates it into the classifier for matching. |
| func (c *Classifier) addDocument(category, name, variant string, doc *document) { |
| // For documents that are part of the corpus, we add them to the dictionary and |
| // compute their associated search data eagerly so they are ready for matching against |
| // candidates. |
| indexName := c.generateDocName(category, name, variant) |
| id := c.generateIndexedDocument(doc, true) |
| id.generateFrequencies() |
| id.generateSearchSet(c.q) |
| id.s.origin = indexName |
| c.docs[indexName] = id |
| } |
| |
// generateIndexedDocument creates an indexedDocument from the supplied document. If addWords
// is true, the classifier dictionary is updated with new tokens encountered in the document.
| func (c *Classifier) generateIndexedDocument(d *document, addWords bool) *indexedDocument { |
| id := &indexedDocument{ |
| Tokens: make([]indexedToken, 0, len(d.Tokens)), |
| dict: c.dict, |
| } |
| |
| for _, t := range d.Tokens { |
| var tokID tokenID |
| if addWords { |
| tokID = id.dict.add(t.Text) |
| } else { |
| tokID = id.dict.getIndex(t.Text) |
| } |
| |
| id.Tokens = append(id.Tokens, indexedToken{ |
| Index: t.Index, |
| Line: t.Line, |
| ID: tokID, |
| }) |
| |
| } |
| id.generateFrequencies() |
| id.runes = diffWordsToRunes(id, 0, id.size()) |
| id.norm = id.normalized() |
| return id |
| } |
| |
| // createTargetIndexedDocument creates an indexed document without adding the |
| // words to the classifier dictionary. This should be used for matching targets, not |
| // populating the corpus. |
| func (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument { |
| doc := tokenize(in) |
| return c.generateIndexedDocument(doc, false) |
| } |
| |
// generateDocName produces the unique corpus key for a document by joining
// its category, name, and variant with the OS path separator.
func (c *Classifier) generateDocName(category, name, variant string) string {
	return fmt.Sprintf("%s%c%s%c%s", category, os.PathSeparator, name, os.PathSeparator, variant)
}

// getIndexedDocument retrieves a corpus document by its category, name, and
// variant, returning nil if no such document was added.
func (c *Classifier) getIndexedDocument(category, name, variant string) *indexedDocument {
	return c.docs[c.generateDocName(category, name, variant)]
}
| |
// dictionary interns all the token words encountered in the text corpus.
// The words and indices fields are inverse mappings of each other; the type
// is just a convenience wrapper over a pair of correlated maps.
| type dictionary struct { |
| words map[tokenID]string |
| indices map[string]tokenID |
| } |
| |
| func newDictionary() *dictionary { |
| return &dictionary{ |
| words: make(map[tokenID]string), |
| indices: make(map[string]tokenID), |
| } |
| } |
| |
// add returns the tokenID for the provided word, inserting the word into the
// dictionary if it is not already present.
| func (d *dictionary) add(word string) tokenID { |
| if idx := d.getIndex(word); idx != unknownIndex { |
| return idx |
| } |
| // token IDs start from 1, 0 is reserved for the invalid ID |
| idx := tokenID(len(d.words) + 1) |
| d.words[idx] = word |
| d.indices[word] = idx |
| return idx |
| } |
| |
// unknownWord and unknownIndex are the reserved sentinels returned for
// lookups that miss the dictionary.
var (
	unknownWord  = "UNKNOWN"
	unknownIndex = tokenID(0)
)
| |
| // getIndex returns the index of the supplied word, or 0 if the word is not in the dictionary. |
| func (d *dictionary) getIndex(word string) tokenID { |
| if idx, found := d.indices[word]; found { |
| return idx |
| } |
| return unknownIndex |
| } |
| |
// getWord returns the word associated with the index, or unknownWord if the
// index was never assigned.
| func (d *dictionary) getWord(index tokenID) string { |
| if word, found := d.words[index]; found { |
| return word |
| } |
| return unknownWord |
| } |