Define the behavior for invalid UTF-8 sequences
Fixes #21
diff --git a/diffmatchpatch/diff.go b/diffmatchpatch/diff.go
index 3b39e08..59b8851 100644
--- a/diffmatchpatch/diff.go
+++ b/diffmatchpatch/diff.go
@@ -45,11 +45,13 @@
}
// DiffMain finds the differences between two texts.
+// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character (U+FFFD).
func (dmp *DiffMatchPatch) DiffMain(text1, text2 string, checklines bool) []Diff {
return dmp.DiffMainRunes([]rune(text1), []rune(text2), checklines)
}
// DiffMainRunes finds the differences between two rune sequences.
+// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character (U+FFFD).
func (dmp *DiffMatchPatch) DiffMainRunes(text1, text2 []rune, checklines bool) []Diff {
var deadline time.Time
if dmp.DiffTimeout > 0 {
@@ -209,6 +211,7 @@
}
// DiffBisect finds the 'middle snake' of a diff, split the problem in two and return the recursively constructed diff.
+// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character (U+FFFD).
// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
func (dmp *DiffMatchPatch) DiffBisect(text1, text2 string, deadline time.Time) []Diff {
// Unused in this code, but retained for interface compatibility.
@@ -353,7 +356,7 @@
return string(chars1), string(chars2), lineArray
}
-// DiffLinesToRunes splits two texts into a list of runes. Each rune represents one line.
+// DiffLinesToRunes splits two texts into a list of runes. Each rune represents one line.
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) {
// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character.
lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n'
diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go
index 5459dcf..9eaf99a 100644
--- a/diffmatchpatch/diff_test.go
+++ b/diffmatchpatch/diff_test.go
@@ -1167,6 +1167,11 @@
actual := dmp.DiffBisect("cat", "map", tc.Time)
assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %s", i, tc.Name))
}
+
+ // Test for invalid UTF-8 sequences
+ assert.Equal(t, []Diff{
+ Diff{DiffEqual, "��"},
+ }, dmp.DiffBisect("\xe0\xe5", "\xe0\xe5", time.Now().Add(time.Minute)))
}
func TestDiffMain(t *testing.T) {
@@ -1297,6 +1302,11 @@
actual := dmp.DiffMain(tc.Text1, tc.Text2, false)
assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %#v", i, tc))
}
+
+ // Test for invalid UTF-8 sequences
+ assert.Equal(t, []Diff{
+ Diff{DiffDelete, "��"},
+ }, dmp.DiffMain("\xe0\xe5", "", false))
}
func TestDiffMainWithTimeout(t *testing.T) {