Merge pull request #139 from michaelcheah/fix_newline_diffs_colored_text
Fix colored display for diffs with newlines
diff --git a/diffmatchpatch/diff.go b/diffmatchpatch/diff.go
index a312c1a..34ce7ce 100644
--- a/diffmatchpatch/diff.go
+++ b/diffmatchpatch/diff.go
@@ -34,8 +34,6 @@
DiffInsert Operation = 1
// DiffEqual item represents an equal diff.
DiffEqual Operation = 0
- //IndexSeparator is used to seperate the array indexes in an index string
- IndexSeparator = ","
)
// Diff represents one diff operation
@@ -406,14 +404,11 @@
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff {
hydrated := make([]Diff, 0, len(diffs))
for _, aDiff := range diffs {
- chars := strings.Split(aDiff.Text, IndexSeparator)
- text := make([]string, len(chars))
+ runes := []rune(aDiff.Text)
+ text := make([]string, len(runes))
- for i, r := range chars {
- i1, err := strconv.Atoi(r)
- if err == nil {
- text[i] = lineArray[i1]
- }
+ for i, r := range runes {
+ text[i] = lineArray[runeToInt(r)]
}
aDiff.Text = strings.Join(text, "")
diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go
index 04329d6..f63bf98 100644
--- a/diffmatchpatch/diff_test.go
+++ b/diffmatchpatch/diff_test.go
@@ -314,12 +314,12 @@
dmp := New()
for i, tc := range []TestCase{
- {"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
- {"a", "b", "1", "2", []string{"", "a", "b"}},
+ {"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\x01\x02\x03\x03", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
+ {"a", "b", "\x01", "\x02", []string{"", "a", "b"}},
// Omit final newline.
- {"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}},
+ {"alpha\nbeta\nalpha", "", "\x01\x02\x03", "", []string{"", "alpha\n", "beta\n", "alpha"}},
// Same lines in Text1 and Text2
- {"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
+ {"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "\x01\x02\x03", "\x01\x04\x03\x05", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
} {
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2)
assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc))
@@ -332,14 +332,13 @@
lineList := []string{
"", // Account for the initial empty element of the lines array.
}
- var charList []string
+ var charList []rune
for x := 1; x < n+1; x++ {
lineList = append(lineList, strconv.Itoa(x)+"\n")
- charList = append(charList, strconv.Itoa(x))
+ charList = append(charList, rune(x))
}
lines := strings.Join(lineList, "")
- chars := strings.Join(charList[:], ",")
- assert.Equal(t, n, len(strings.Split(chars, ",")))
+ chars := string(charList)
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "")
assert.Equal(t, chars, actualChars1)
@@ -360,8 +359,8 @@
for i, tc := range []TestCase{
{
Diffs: []Diff{
- {DiffEqual, "1,2,1"},
- {DiffInsert, "2,1,2"},
+ {DiffEqual, "\x01\x02\x01"},
+ {DiffInsert, "\x02\x01\x02"},
},
Lines: []string{"", "alpha\n", "beta\n"},
@@ -380,13 +379,12 @@
lineList := []string{
"", // Account for the initial empty element of the lines array.
}
- charList := []string{}
+ charList := []rune{}
for x := 1; x <= n; x++ {
lineList = append(lineList, strconv.Itoa(x)+"\n")
- charList = append(charList, strconv.Itoa(x))
+ charList = append(charList, rune(x))
}
- assert.Equal(t, n, len(charList))
- chars := strings.Join(charList[:], ",")
+ chars := string(charList)
actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList)
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual)
@@ -1479,6 +1477,38 @@
assert.NotEmpty(t, diffs)
}
+func TestDiffPartialLineIndex(t *testing.T) {
+ dmp := New()
+ t1, t2, tt := dmp.DiffLinesToChars(
+ `line 1
+line 2
+line 3
+line 4
+line 5
+line 6
+line 7
+line 8
+line 9
+line 10 text1`,
+ `line 1
+line 2
+line 3
+line 4
+line 5
+line 6
+line 7
+line 8
+line 9
+line 10 text2`)
+ diffs := dmp.DiffMain(t1, t2, false)
+ diffs = dmp.DiffCharsToLines(diffs, tt)
+ assert.Equal(t, []Diff{
+ Diff{DiffEqual, "line 1\nline 2\nline 3\nline 4\nline 5\nline 6\nline 7\nline 8\nline 9\n"},
+ Diff{DiffDelete, "line 10 text1"},
+ Diff{DiffInsert, "line 10 text2"},
+ }, diffs)
+}
+
func BenchmarkDiffMain(bench *testing.B) {
s1 := "`Twas brillig, and the slithy toves\nDid gyre and gimble in the wabe:\nAll mimsy were the borogoves,\nAnd the mome raths outgrabe.\n"
s2 := "I am the very model of a modern major general,\nI've information vegetable, animal, and mineral,\nI know the kings of England, and I quote the fights historical,\nFrom Marathon to Waterloo, in order categorical.\n"
diff --git a/diffmatchpatch/stringutil.go b/diffmatchpatch/stringutil.go
index 44c4359..eb727bb 100644
--- a/diffmatchpatch/stringutil.go
+++ b/diffmatchpatch/stringutil.go
@@ -9,11 +9,16 @@
package diffmatchpatch
import (
- "strconv"
+ "fmt"
"strings"
"unicode/utf8"
)
+const UNICODE_INVALID_RANGE_START = 0xD800
+const UNICODE_INVALID_RANGE_END = 0xDFFF
+const UNICODE_INVALID_RANGE_DELTA = UNICODE_INVALID_RANGE_END - UNICODE_INVALID_RANGE_START + 1
+const UNICODE_RANGE_MAX = 0x10FFFF
+
// unescaper unescapes selected chars for compatibility with JavaScript's encodeURI.
// In speed critical applications this could be dropped since the receiving application will certainly decode these fine. Note that this function is case-sensitive. Thus "%3F" would not be unescaped. But this is ok because it is only called with the output of HttpUtility.UrlEncode which returns lowercase hex. Example: "%3f" -> "?", "%24" -> "$", etc.
var unescaper = strings.NewReplacer(
@@ -93,14 +98,93 @@
return ""
}
- indexSeparator := IndexSeparator[0]
-
- // Appr. 3 chars per num plus the comma.
- b := []byte{}
+ b := []rune{}
for _, n := range ns {
- b = strconv.AppendInt(b, int64(n), 10)
- b = append(b, indexSeparator)
+ b = append(b, intToRune(n))
}
- b = b[:len(b)-1]
return string(b)
}
+
+// These constants define the number of bits representable
+// in 1,2,3,4 byte utf8 sequences, respectively.
+const ONE_BYTE_BITS = 7
+const TWO_BYTE_BITS = 11
+const THREE_BYTE_BITS = 16
+const FOUR_BYTE_BITS = 21
+
+// Helper for getting a sequence of bits from an integer.
+func getBits(i uint32, cnt byte, from byte) byte {
+ return byte((i >> from) & ((1 << cnt) - 1))
+}
+
+// Converts an integer in the range 0~1112060 into a rune.
+// Based on the ranges table in https://en.wikipedia.org/wiki/UTF-8
+func intToRune(i uint32) rune {
+ if i < (1 << ONE_BYTE_BITS) {
+ return rune(i)
+ }
+
+ if i < (1 << TWO_BYTE_BITS) {
+ r, size := utf8.DecodeRune([]byte{0b11000000 | getBits(i, 5, 6), 0b10000000 | getBits(i, 6, 0)})
+ if size != 2 || r == utf8.RuneError {
+ panic(fmt.Sprintf("Error encoding an int %d with size 2, got rune %v and size %d", size, r, i))
+ }
+ return r
+ }
+
+ // Last -3 here needed because for some reason 3rd to last codepoint 65533 in this range
+ // was returning utf8.RuneError during encoding.
+ if i < ((1 << THREE_BYTE_BITS) - UNICODE_INVALID_RANGE_DELTA - 3) {
+ if i >= UNICODE_INVALID_RANGE_START {
+ i += UNICODE_INVALID_RANGE_DELTA
+ }
+
+ r, size := utf8.DecodeRune([]byte{0b11100000 | getBits(i, 4, 12), 0b10000000 | getBits(i, 6, 6), 0b10000000 | getBits(i, 6, 0)})
+ if size != 3 || r == utf8.RuneError {
+ panic(fmt.Sprintf("Error encoding an int %d with size 3, got rune %v and size %d", size, r, i))
+ }
+ return r
+ }
+
+ if i < (1<<FOUR_BYTE_BITS - UNICODE_INVALID_RANGE_DELTA - 3) {
+ i += UNICODE_INVALID_RANGE_DELTA + 3
+ r, size := utf8.DecodeRune([]byte{0b11110000 | getBits(i, 3, 18), 0b10000000 | getBits(i, 6, 12), 0b10000000 | getBits(i, 6, 6), 0b10000000 | getBits(i, 6, 0)})
+ if size != 4 || r == utf8.RuneError {
+ panic(fmt.Sprintf("Error encoding an int %d with size 4, got rune %v and size %d", size, r, i))
+ }
+ return r
+ }
+ panic(fmt.Sprintf("The integer %d is too large for runeToInt()", i))
+}
+
+// Converts a rune generated by intToRune back to an integer
+func runeToInt(r rune) uint32 {
+ i := uint32(r)
+ if i < (1 << ONE_BYTE_BITS) {
+ return i
+ }
+
+ bytes := []byte{0, 0, 0, 0}
+
+ size := utf8.EncodeRune(bytes, r)
+
+ if size == 2 {
+ return uint32(bytes[0]&0b11111)<<6 | uint32(bytes[1]&0b111111)
+ }
+
+ if size == 3 {
+ result := uint32(bytes[0]&0b1111)<<12 | uint32(bytes[1]&0b111111)<<6 | uint32(bytes[2]&0b111111)
+ if result >= UNICODE_INVALID_RANGE_END {
+ return result - UNICODE_INVALID_RANGE_DELTA
+ }
+
+ return result
+ }
+
+ if size == 4 {
+ result := uint32(bytes[0]&0b111)<<18 | uint32(bytes[1]&0b111111)<<12 | uint32(bytes[2]&0b111111)<<6 | uint32(bytes[3]&0b111111)
+ return result - UNICODE_INVALID_RANGE_DELTA - 3
+ }
+
+ panic(fmt.Sprintf("Unexpected state decoding rune=%v size=%d", r, size))
+}
diff --git a/diffmatchpatch/stringutil_test.go b/diffmatchpatch/stringutil_test.go
index ab2bc10..73ab6ca 100644
--- a/diffmatchpatch/stringutil_test.go
+++ b/diffmatchpatch/stringutil_test.go
@@ -114,3 +114,21 @@
assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %#v", i, tc))
}
}
+
+// Exhaustive check for all ints from 0 to 1112060 for correctness of implementation
+// of `intToRune() -> runeToInt()`.
+// This test is slow and runs longer than 5 seconds but it does provide a safety
+// guarantee that these 2 functions are correct for the ranges we support.
+func TestRuneToInt(t *testing.T) {
+
+ for i := uint32(0); i <= UNICODE_RANGE_MAX-UNICODE_INVALID_RANGE_DELTA-3; i += 1 {
+ r := intToRune(i)
+ ic := runeToInt(r)
+
+ assert.Equal(t, i, ic, fmt.Sprintf("intToRune(%d)=%d and runeToInt(%d)=%d", i, r, r, ic))
+ }
+
+ assert.Panics(t, func() {
+ intToRune(UNICODE_RANGE_MAX - UNICODE_INVALID_RANGE_DELTA - 2)
+ })
+}
diff --git a/go.mod b/go.mod
index c731033..c7886ce 100644
--- a/go.mod
+++ b/go.mod
@@ -8,4 +8,4 @@
gopkg.in/yaml.v2 v2.4.0 // indirect
)
-go 1.12
+go 1.13