blob: b60c140f39915ef5d4d84ec006e4d3c4c6024013 [file] [log] [blame]
// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package classifier
import (
"strings"
"github.com/sergi/go-diff/diffmatchpatch"
)
// This file contains word-diffing routines that build on top of the go-diff package.
// The algorithm implemented here is from the suggested word diffing technique in
// https://github.com/google/diff-match-patch/wiki/Line-or-Word-Diffs
// diffRange returns the indices of the beginning and end locations of the diff
// that reconstruct (as best possible) the source value.
//
// The diffs are scanned in order. The text of every equal or insert diff is
// accumulated, each followed by a single space, until the accumulated text
// (minus its trailing space) is exactly known. Delete diffs contribute no
// text. start is the index of the first equal or insert diff encountered (0 if
// there is none). end is the index at which the scan stopped, i.e. one past
// the last diff consumed; it is len(diffs) when known was never reconstructed.
func diffRange(known string, diffs []diffmatchpatch.Diff) (start, end int) {
	var foundStart bool
	// A strings.Builder is used instead of += so that each diff appends to the
	// accumulated text rather than re-copying the whole prefix.
	var seen strings.Builder
	for end = 0; end < len(diffs); end++ {
		// Builder.String does not copy, so checking on every iteration is cheap.
		// The trailing separator is dropped before comparing against known.
		if n := seen.Len(); n > 1 && seen.String()[:n-1] == known {
			break
		}
		switch diffs[end].Type {
		case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert:
			if !foundStart {
				start = end
				foundStart = true
			}
			seen.WriteString(diffs[end].Text)
			seen.WriteByte(' ')
		}
	}
	return start, end
}
// docDiff diffs the rune range [doc1Start, doc1End) of doc1 against the rune
// range [doc2Start, doc2End) of doc2 and returns the diffs with their text
// converted back into words through doc1's dictionary. The id parameter is
// not used by this function.
func docDiff(id string, doc1 *indexedDocument, doc1Start, doc1End int, doc2 *indexedDocument, doc2Start, doc2End int) []diffmatchpatch.Diff {
	left := doc1.runes[doc1Start:doc1End]
	right := doc2.runes[doc2Start:doc2End]

	// The final argument is go-diff's checklines flag; it is left off here.
	runeDiffs := diffmatchpatch.New().DiffMainRunes(left, right, false)

	// The diff text is still rune-encoded at this point; turn it back into
	// readable words before handing it to the caller.
	return diffRunesToWords(runeDiffs, doc1.dict)
}
// diffWordsToRunes encodes the tokens doc.Tokens[start:end] as a slice of
// runes, one rune per token, using each token's ID as the rune value.
//
// The go-diff code does essentially the same thing with ephemeral
// dictionaries built for each input string; reusing the persistent token IDs
// makes the operation cheaper.
// TODO: perhaps we should cache these in the corpus?
func diffWordsToRunes(doc *indexedDocument, start, end int) []rune {
	encoded := make([]rune, end-start)
	for i, tok := range doc.Tokens[start:end] {
		encoded[i] = rune(tok.ID)
	}
	return encoded
}
// diffRunesToWords rehydrates the text of each diff from a string of
// rune-encoded token IDs into real words. Every rune is looked up in dict and
// the resulting words are joined with single spaces. The input slice is left
// untouched; a new slice holding the rewritten diffs is returned.
func diffRunesToWords(diffs []diffmatchpatch.Diff, dict *dictionary) []diffmatchpatch.Diff {
	out := make([]diffmatchpatch.Diff, len(diffs))
	for i, d := range diffs {
		var text strings.Builder
		needSep := false
		for _, id := range d.Text {
			// Separate words with a space, but never lead or trail with one.
			if needSep {
				text.WriteByte(' ')
			}
			text.WriteString(dict.getWord(tokenID(id)))
			needSep = true
		}
		// d is a copy of the element, so only the returned slice sees the new text.
		d.Text = text.String()
		out[i] = d
	}
	return out
}
// wordLen reports how many words text contains; it is used by the scoring and
// distance functions. The count is the number of spaces plus one, so it relies
// on the tokenizer producing strings whose words are separated by exactly one
// space with no leading or trailing whitespace. The empty string has zero
// words.
func wordLen(text string) int {
	var words int
	if text != "" {
		words = 1 + strings.Count(text, " ")
	}
	return words
}
// textLength returns the total number of tokens across all of the given
// diffs. The value is used to adjust the detection offset, since it is the
// number of tokens discarded while matching a diff. Because of how it is
// called there are no "change" diffs (a paired insert/delete), so the scan
// simply sums the word count of every diff.
func textLength(diffs []diffmatchpatch.Diff) int {
	total := 0
	for i := range diffs {
		total += wordLen(diffs[i].Text)
	}
	return total
}