Merge pull request #12 from goojba/utf8bug
Use []rune in diff internals
diff --git a/diffmatchpatch/dmp.go b/diffmatchpatch/dmp.go
index a8975f4..dba0133 100644
--- a/diffmatchpatch/dmp.go
+++ b/diffmatchpatch/dmp.go
@@ -366,10 +366,13 @@
// and return the recursively constructed diff.
// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
func (dmp *DiffMatchPatch) DiffBisect(text1, text2 string, deadline time.Time) []Diff {
+ // Convert to runes to avoid utf8 slicing bugs.
+ runes1 := []rune(text1)
+ runes2 := []rune(text2)
// Cache the text lengths to prevent multiple calls.
- text1_len, text2_len := len(text1), len(text2)
+ runes1_len, runes2_len := len(runes1), len(runes2)
- max_d := (text1_len + text2_len + 1) / 2
+ max_d := (runes1_len + runes2_len + 1) / 2
v_offset := max_d
v_length := 2 * max_d
@@ -382,7 +385,7 @@
v1[v_offset+1] = 0
v2[v_offset+1] = 0
- delta := text1_len - text2_len
+ delta := runes1_len - runes2_len
// If the total number of characters is odd, then the front path will collide
// with the reverse path.
front := (delta%2 != 0)
@@ -410,30 +413,28 @@
}
y1 := x1 - k1
- for x1 < text1_len && y1 < text2_len {
- r1, size := utf8.DecodeRuneInString(text1[x1:])
- r2, _ := utf8.DecodeRuneInString(text2[y1:])
- if r1 != r2 {
+ for x1 < runes1_len && y1 < runes2_len {
+ if runes1[x1] != runes2[y1] {
break
}
- x1 += size
- y1 += size
+ x1++
+ y1++
}
v1[k1_offset] = x1
- if x1 > text1_len {
+ if x1 > runes1_len {
// Ran off the right of the graph.
k1end += 2
- } else if y1 > text2_len {
+ } else if y1 > runes2_len {
// Ran off the bottom of the graph.
k1start += 2
} else if front {
k2_offset := v_offset + delta - k1
if k2_offset >= 0 && k2_offset < v_length && v2[k2_offset] != -1 {
// Mirror x2 onto top-left coordinate system.
- x2 := text1_len - v2[k2_offset]
+ x2 := runes1_len - v2[k2_offset]
if x1 >= x2 {
// Overlap detected.
- return dmp.diffBisectSplit_(text1, text2, x1, y1, deadline)
+ return dmp.diffBisectSplit_(runes1, runes2, x1, y1, deadline)
}
}
}
@@ -448,20 +449,18 @@
x2 = v2[k2_offset-1] + 1
}
var y2 = x2 - k2
- for x2 < text1_len && y2 < text2_len {
- r1, size := utf8.DecodeLastRuneInString(text1[:text1_len-x2])
- r2, _ := utf8.DecodeLastRuneInString(text2[:text2_len-y2])
- if r1 != r2 {
+ for x2 < runes1_len && y2 < runes2_len {
+ if runes1[runes1_len-x2-1] != runes2[runes2_len-y2-1] {
break
}
- x2 += size
- y2 += size
+ x2++
+ y2++
}
v2[k2_offset] = x2
- if x2 > text1_len {
+ if x2 > runes1_len {
// Ran off the left of the graph.
k2end += 2
- } else if y2 > text2_len {
+ } else if y2 > runes2_len {
// Ran off the top of the graph.
k2start += 2
} else if !front {
@@ -470,10 +469,10 @@
x1 := v1[k1_offset]
y1 := v_offset + x1 - k1_offset
// Mirror x2 onto top-left coordinate system.
- x2 = text1_len - x2
+ x2 = runes1_len - x2
if x1 >= x2 {
// Overlap detected.
- return dmp.diffBisectSplit_(text1, text2, x1, y1, deadline)
+ return dmp.diffBisectSplit_(runes1, runes2, x1, y1, deadline)
}
}
}
@@ -487,16 +486,16 @@
}
}
-func (dmp *DiffMatchPatch) diffBisectSplit_(text1, text2 string, x, y int,
+func (dmp *DiffMatchPatch) diffBisectSplit_(runes1, runes2 []rune, x, y int,
deadline time.Time) []Diff {
- text1a := text1[:x]
- text2a := text2[:y]
- text1b := text1[x:]
- text2b := text2[y:]
+ runes1a := runes1[:x]
+ runes2a := runes2[:y]
+ runes1b := runes1[x:]
+ runes2b := runes2[y:]
// Compute both diffs serially.
- diffs := dmp.diffMain(text1a, text2a, false, deadline)
- diffsb := dmp.diffMain(text1b, text2b, false, deadline)
+ diffs := dmp.diffMain(string(runes1a), string(runes2a), false, deadline)
+ diffsb := dmp.diffMain(string(runes1b), string(runes2b), false, deadline)
return append(diffs, diffsb...)
}
diff --git a/diffmatchpatch/dmp_test.go b/diffmatchpatch/dmp_test.go
index 98dc2cf..a6af9ae 100644
--- a/diffmatchpatch/dmp_test.go
+++ b/diffmatchpatch/dmp_test.go
@@ -193,6 +193,17 @@
assert.True(t, dmp.DiffHalfMatch("qHilloHelloHew", "xHelloHeHulloy") == nil, "")
}
+func Test_diffBisectSplit(t *testing.T) {
+ // As originally written, this can produce invalid utf8 strings.
+ dmp := New()
+ diffs := dmp.diffBisectSplit_([]rune("STUV\x05WX\x05YZ\x05["),
+ []rune("WĺĻļ\x05YZ\x05ĽľĿŀZ"), 7, 6, time.Now().Add(time.Hour))
+ for _, d := range diffs {
+ assert.True(t, utf8.ValidString(d.Text))
+ }
+}
+
+
func Test_diffLinesToChars(t *testing.T) {
dmp := New()
// Convert lines down to characters.