v2/diff.go - third_party/github.com/google/licenseclassifier - Git at Google

 // Copyright 2020 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package classifier

 import (
 	"strings"

 	"github.com/sergi/go-diff/diffmatchpatch"
 )

 // This file contains word-diffing routines that build on top of the go-diff package.
 // The algorithm implemented here is from the suggested word diffing technique in
 // https://github.com/google/diff-match-patch/wiki/Line-or-Word-Diffs

 // diffRange returns the indices of the beginning and end locations of the diff
 // that reconstruct (as best possible) the source value.
 func diffRange(known string, diffs []diffmatchpatch.Diff) (start, end int) {
 	var foundStart bool
 	var seen string
 	for end = 0; end < len(diffs); end++ {
 		if len(seen) > 1 && seen[:len(seen)-1] == known {
 			break
 		}
 		switch diffs[end].Type {
 		case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert:
 			if !foundStart {
 				start = end
 				foundStart = true
 			}
 			seen += diffs[end].Text + " "
 		}
 	}
 	return start, end
 }

 func docDiff(id string, doc1 *indexedDocument, doc1Start, doc1End int, doc2 *indexedDocument, doc2Start, doc2End int) []diffmatchpatch.Diff {
 	chars1 := doc1.runes[doc1Start:doc1End]
 	chars2 := doc2.runes[doc2Start:doc2End]

 	dmp := diffmatchpatch.New()
 	diffs := dmp.DiffMainRunes(chars1, chars2, false)

 	// Recover the words from the previous rune encoding and return the textual diffs.
 	diffs = diffRunesToWords(diffs, doc1.dict)
 	return diffs
 }

 func diffWordsToRunes(doc *indexedDocument, start, end int) []rune {
 	// Creates a slice of runes using the indexed values as a basis for runes.
 	// The go-diff code basically does exactly this using ephemeral dictionaries
 	// for each input string. We leverage the fact we have a persistent dictionary
 	// to make this operation cheaper.
 	// TODO: perhaps we should cache these in the corpus?
 	runes := make([]rune, 0, end-start)

 	for _, t := range doc.Tokens[start:end] {
 		runes = append(runes, rune(t.ID))
 	}
 	return runes
 }

 // diffRunesToWords rehydrates the text in a diff from a string of word hashes to real words of text.
 func diffRunesToWords(diffs []diffmatchpatch.Diff, dict *dictionary) []diffmatchpatch.Diff {
 	hydrated := make([]diffmatchpatch.Diff, 0, len(diffs))
 	for _, aDiff := range diffs {
 		chars := []rune(aDiff.Text)
 		var sb strings.Builder

 		for i, r := range chars {
 			sb.WriteString(dict.getWord(tokenID(r)))
 			if (i + 1) < len(chars) {
 				sb.WriteByte(' ')
 			}
 		}

 		aDiff.Text = sb.String()
 		hydrated = append(hydrated, aDiff)
 	}
 	return hydrated
 }

 // Returns the number of words in the input string. Used by scoring and distance functions.
 // This function depends on the behavior of the tokenizer such that strings are separated
 // by exactly one space and don't start or end with whitespace.
 func wordLen(text string) int {
 	if text == "" {
 		return 0
 	}
 	return strings.Count(text, " ") + 1
 }

 // textLength returns the number of tokens in the diff. This value is used to
 // adjust the offset for detection, since this is the number of tokens
 // discarded while matching a diff.  By virtue of how it's called, there won't
 // be "change" diffs (a paired insert/delete) so we can simplify the scan to
 // just count up everything.
 func textLength(diffs []diffmatchpatch.Diff) int {
 	l := 0
 	for _, d := range diffs {
 		l += wordLen(d.Text)
 	}
 	return l
 }
	// Copyright 2020 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package classifier

	import (
	"strings"

	"github.com/sergi/go-diff/diffmatchpatch"
	)

	// This file contains word-diffing routines that build on top of the go-diff package.
	// The algorithm implemented here is from the suggested word diffing technique in
	// https://github.com/google/diff-match-patch/wiki/Line-or-Word-Diffs

	// diffRange returns the indices of the beginning and end locations of the diff
	// that reconstruct (as best possible) the source value.
	func diffRange(known string, diffs []diffmatchpatch.Diff) (start, end int) {
	var foundStart bool
	var seen string
	for end = 0; end < len(diffs); end++ {
	if len(seen) > 1 && seen[:len(seen)-1] == known {
	break
	}
	switch diffs[end].Type {
	case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert:
	if !foundStart {
	start = end
	foundStart = true
	}
	seen += diffs[end].Text + " "
	}
	}
	return start, end
	}

	func docDiff(id string, doc1 indexedDocument, doc1Start, doc1End int, doc2 indexedDocument, doc2Start, doc2End int) []diffmatchpatch.Diff {
	chars1 := doc1.runes[doc1Start:doc1End]
	chars2 := doc2.runes[doc2Start:doc2End]

	dmp := diffmatchpatch.New()
	diffs := dmp.DiffMainRunes(chars1, chars2, false)

	// Recover the words from the previous rune encoding and return the textual diffs.
	diffs = diffRunesToWords(diffs, doc1.dict)
	return diffs
	}

	func diffWordsToRunes(doc *indexedDocument, start, end int) []rune {
	// Creates a slice of runes using the indexed values as a basis for runes.
	// The go-diff code basically does exactly this using ephemeral dictionaries
	// for each input string. We leverage the fact we have a persistent dictionary
	// to make this operation cheaper.
	// TODO: perhaps we should cache these in the corpus?
	runes := make([]rune, 0, end-start)

	for _, t := range doc.Tokens[start:end] {
	runes = append(runes, rune(t.ID))
	}
	return runes
	}

	// diffRunesToWords rehydrates the text in a diff from a string of word hashes to real words of text.
	func diffRunesToWords(diffs []diffmatchpatch.Diff, dict *dictionary) []diffmatchpatch.Diff {
	hydrated := make([]diffmatchpatch.Diff, 0, len(diffs))
	for _, aDiff := range diffs {
	chars := []rune(aDiff.Text)
	var sb strings.Builder

	for i, r := range chars {
	sb.WriteString(dict.getWord(tokenID(r)))
	if (i + 1) < len(chars) {
	sb.WriteByte(' ')
	}
	}

	aDiff.Text = sb.String()
	hydrated = append(hydrated, aDiff)
	}
	return hydrated
	}

	// Returns the number of words in the input string. Used by scoring and distance functions.
	// This function depends on the behavior of the tokenizer such that strings are separated
	// by exactly one space and don't start or end with whitespace.
	func wordLen(text string) int {
	if text == "" {
	return 0
	}
	return strings.Count(text, " ") + 1
	}

	// textLength returns the number of tokens in the diff. This value is used to
	// adjust the offset for detection, since this is the number of tokens
	// discarded while matching a diff. By virtue of how it's called, there won't
	// be "change" diffs (a paired insert/delete) so we can simplify the scan to
	// just count up everything.
	func textLength(diffs []diffmatchpatch.Diff) int {
	l := 0
	for _, d := range diffs {
	l += wordLen(d.Text)
	}
	return l
	}