internal/export/unicode: add table generator

This is used to update tables in core unicode.

Change-Id: I6fb34eba45842e38426b1ca54e79b74c361195ec
Reviewed-on: https://go-review.googlesource.com/c/154439
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/gen.go b/gen.go
index cf2640e..224eef9 100644
--- a/gen.go
+++ b/gen.go
@@ -97,7 +97,8 @@
 	var unicode = &dependency{}
 	if updateCore {
 		fmt.Printf("Updating core to version %s...\n", gen.UnicodeVersion())
-		unicode = generate("unicode")
+		unicodeInternal := generate("./internal/export/unicode")
+		unicode = generate("unicode", unicodeInternal)
 
 		// Test some users of the unicode packages, especially the ones that
 		// keep a mirrored table. These may need to be corrected by hand.
diff --git a/internal/export/unicode/doc.go b/internal/export/unicode/doc.go
new file mode 100644
index 0000000..c49ab6e
--- /dev/null
+++ b/internal/export/unicode/doc.go
@@ -0,0 +1,13 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package unicode generates the Unicode tables in core.
+package unicode
+
+// This package is defined here, instead of core, as Go does not allow any
+// standard packages to have non-standard imports, even if imported in files
+// with a build ignore tag.
+
+//go:generate go run gen.go -tables=all -output tables.go
+//go:generate mv tables.go $GOROOT/src/unicode
diff --git a/internal/export/unicode/maketables.go b/internal/export/unicode/gen.go
similarity index 90%
rename from internal/export/unicode/maketables.go
rename to internal/export/unicode/gen.go
index a1f1586..c93e695 100644
--- a/internal/export/unicode/maketables.go
+++ b/internal/export/unicode/gen.go
@@ -24,6 +24,8 @@
 	"strconv"
 	"strings"
 	"unicode"
+
+	"golang.org/x/text/unicode/rangetable"
 )
 
 func main() {
@@ -450,9 +452,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Code generated by maketables; DO NOT EDIT.
-// To regenerate, run:
-//	maketables --tables=%s --data=%s --casefolding=%s
+// Code generated by go generate; DO NOT EDIT.
 
 package unicode
 
@@ -504,7 +504,7 @@
 		fullCategoryTest(list)
 		return
 	}
-	printf(progHeader, *tablelist, *dataURL, *casefoldingURL)
+	printf(progHeader)
 
 	println("// Version is the Unicode edition from which the tables are derived.")
 	printf("const Version = %q\n\n", version())
@@ -596,91 +596,38 @@
 
 type Op func(code rune) bool
 
-const format = "\t\t{0x%04x, 0x%04x, %d},\n"
-
 func dumpRange(header string, inCategory Op) {
-	print(header)
-	next := rune(0)
-	latinOffset := 0
-	print("\tR16: []Range16{\n")
-	// one Range for each iteration
-	count := &range16Count
-	size := 16
-	for {
-		// look for start of range
-		for next < rune(len(chars)) && !inCategory(next) {
-			next++
+	runes := []rune{}
+	for i := range chars {
+		r := rune(i)
+		if inCategory(r) {
+			runes = append(runes, r)
 		}
-		if next >= rune(len(chars)) {
-			// no characters remain
-			break
-		}
-
-		// start of range
-		lo := next
-		hi := next
-		stride := rune(1)
-		// accept lo
-		next++
-		// look for another character to set the stride
-		for next < rune(len(chars)) && !inCategory(next) {
-			next++
-		}
-		if next >= rune(len(chars)) {
-			// no more characters
-			printf(format, lo, hi, stride)
-			break
-		}
-		// set stride
-		stride = next - lo
-		// check for length of run. next points to first jump in stride
-		for i := next; i < rune(len(chars)); i++ {
-			if inCategory(i) == (((i - lo) % stride) == 0) {
-				// accept
-				if inCategory(i) {
-					hi = i
-				}
-			} else {
-				// no more characters in this run
-				break
-			}
-		}
-		if uint32(hi) <= unicode.MaxLatin1 {
-			latinOffset++
-		}
-		size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count)
-		// next range: start looking where this range ends
-		next = hi + 1
 	}
-	print("\t},\n")
-	if latinOffset > 0 {
-		printf("\tLatinOffset: %d,\n", latinOffset)
-	}
-	print("}\n\n")
+	printRangeTable(header, runes)
 }
 
-func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) {
-	if size == 16 && hi >= 1<<16 {
-		if lo < 1<<16 {
-			if lo+stride != hi {
-				logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride)
-			}
-			// No range contains U+FFFF as an instance, so split
-			// the range into two entries. That way we can maintain
-			// the invariant that R32 contains only >= 1<<16.
-			printf(format, lo, lo, 1)
-			lo = hi
-			stride = 1
-			*count++
-		}
-		print("\t},\n")
-		print("\tR32: []Range32{\n")
-		size = 32
-		count = &range32Count
+func printRangeTable(header string, runes []rune) {
+	rt := rangetable.New(runes...)
+	print(header)
+	println("\tR16: []Range16{")
+	for _, r := range rt.R16 {
+		printf("\t\t{%#04x, %#04x, %d},\n", r.Lo, r.Hi, r.Stride)
+		range16Count++
 	}
-	printf(format, lo, hi, stride)
-	*count++
-	return size, count
+	println("\t},")
+	if len(rt.R32) > 0 {
+		println("\tR32: []Range32{")
+		for _, r := range rt.R32 {
+			printf("\t\t{%#x, %#x, %d},\n", r.Lo, r.Hi, r.Stride)
+			range32Count++
+		}
+		println("\t},")
+	}
+	if rt.LatinOffset > 0 {
+		printf("\tLatinOffset: %d,\n", rt.LatinOffset)
+	}
+	printf("}\n\n")
 }
 
 func fullCategoryTest(list []string) {
@@ -751,26 +698,6 @@
 	scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name})
 }
 
-// The script tables have a lot of adjacent elements. Fold them together.
-func foldAdjacent(r []Script) []unicode.Range32 {
-	s := make([]unicode.Range32, 0, len(r))
-	j := 0
-	for i := 0; i < len(r); i++ {
-		if j > 0 && r[i].lo == s[j-1].Hi+1 {
-			s[j-1].Hi = r[i].hi
-		} else {
-			s = s[0 : j+1]
-			s[j] = unicode.Range32{
-				Lo:     uint32(r[i].lo),
-				Hi:     uint32(r[i].hi),
-				Stride: 1,
-			}
-			j++
-		}
-	}
-	return s
-}
-
 func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
 	for _, name := range list {
 		if _, ok := scripts[name]; !ok {
@@ -796,13 +723,11 @@
 
 // PropList.txt has the same format as Scripts.txt so we can share its parser.
 func printScriptOrProperty(doProps bool) {
-	flag := "scripts"
 	flaglist := *scriptlist
 	file := "Scripts.txt"
 	table := scripts
 	installed := unicode.Scripts
 	if doProps {
-		flag = "props"
 		flaglist = *proplist
 		file = "PropList.txt"
 		table = props
@@ -831,13 +756,6 @@
 		return
 	}
 
-	printf(
-		"// Generated by running\n"+
-			"//	maketables --%s=%s --url=%s\n"+
-			"// DO NOT EDIT\n\n",
-		flag,
-		flaglist,
-		*url)
 	if flaglist == "all" {
 		if doProps {
 			println("// Properties is the set of Unicode property tables.")
@@ -874,19 +792,14 @@
 				alias, name)
 			ndecl++
 		}
-		printf("var _%s = &RangeTable {\n", name)
-		ranges := foldAdjacent(table[name])
-		print("\tR16: []Range16{\n")
-		size := 16
-		count := &range16Count
-		for _, s := range ranges {
-			size, count = printRange(s.Lo, s.Hi, s.Stride, size, count)
+		decl := fmt.Sprintf("var _%s = &RangeTable {\n", name)
+		runes := []rune{}
+		for _, scr := range table[name] {
+			for r := scr.lo; r <= scr.hi; r++ {
+				runes = append(runes, rune(r))
+			}
 		}
-		print("\t},\n")
-		if off := findLatinOffset(ranges); off > 0 {
-			printf("\tLatinOffset: %d,\n", off)
-		}
-		print("}\n\n")
+		printRangeTable(decl, runes)
 	}
 	decl.Sort()
 	println("// These variables have type *RangeTable.")
@@ -897,14 +810,6 @@
 	print(")\n\n")
 }
 
-func findLatinOffset(ranges []unicode.Range32) int {
-	i := 0
-	for i < len(ranges) && ranges[i].Hi <= unicode.MaxLatin1 {
-		i++
-	}
-	return i
-}
-
 const (
 	CaseUpper = 1 << iota
 	CaseLower
@@ -1054,14 +959,10 @@
 		return
 	}
 	printf(
-		"// Generated by running\n"+
-			"//	maketables --data=%s --casefolding=%s\n"+
-			"// DO NOT EDIT\n\n"+
-			"// CaseRanges is the table describing case mappings for all letters with\n"+
-			"// non-self mappings.\n"+
-			"var CaseRanges = _CaseRanges\n"+
-			"var _CaseRanges = []CaseRange {\n",
-		*dataURL, *casefoldingURL)
+		"// CaseRanges is the table describing case mappings for all letters with\n" +
+			"// non-self mappings.\n" +
+			"var CaseRanges = _CaseRanges\n" +
+			"var _CaseRanges = []CaseRange {\n")
 
 	var startState *caseState    // the start of a run; nil for not active
 	var prevState = &caseState{} // the state of the previous character