internal/export/unicode: add table generator
This is used to update tables in core unicode.
Change-Id: I6fb34eba45842e38426b1ca54e79b74c361195ec
Reviewed-on: https://go-review.googlesource.com/c/154439
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/gen.go b/gen.go
index cf2640e..224eef9 100644
--- a/gen.go
+++ b/gen.go
@@ -97,7 +97,8 @@
var unicode = &dependency{}
if updateCore {
fmt.Printf("Updating core to version %s...\n", gen.UnicodeVersion())
- unicode = generate("unicode")
+ unicodeInternal := generate("./internal/export/unicode")
+ unicode = generate("unicode", unicodeInternal)
// Test some users of the unicode packages, especially the ones that
// keep a mirrored table. These may need to be corrected by hand.
diff --git a/internal/export/unicode/doc.go b/internal/export/unicode/doc.go
new file mode 100644
index 0000000..c49ab6e
--- /dev/null
+++ b/internal/export/unicode/doc.go
@@ -0,0 +1,13 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package unicode generates the Unicode tables in core.
+package unicode
+
+// This package is defined here, instead of core, as Go does not allow any
+// standard packages to have non-standard imports, even if imported in files
+// with a build ignore tag.
+
+//go:generate go run gen.go -tables=all -output tables.go
+//go:generate mv tables.go $GOROOT/src/unicode
diff --git a/internal/export/unicode/maketables.go b/internal/export/unicode/gen.go
similarity index 90%
rename from internal/export/unicode/maketables.go
rename to internal/export/unicode/gen.go
index a1f1586..c93e695 100644
--- a/internal/export/unicode/maketables.go
+++ b/internal/export/unicode/gen.go
@@ -24,6 +24,8 @@
"strconv"
"strings"
"unicode"
+
+ "golang.org/x/text/unicode/rangetable"
)
func main() {
@@ -450,9 +452,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// Code generated by maketables; DO NOT EDIT.
-// To regenerate, run:
-// maketables --tables=%s --data=%s --casefolding=%s
+// Code generated by go generate; DO NOT EDIT.
package unicode
@@ -504,7 +504,7 @@
fullCategoryTest(list)
return
}
- printf(progHeader, *tablelist, *dataURL, *casefoldingURL)
+ printf(progHeader)
println("// Version is the Unicode edition from which the tables are derived.")
printf("const Version = %q\n\n", version())
@@ -596,91 +596,38 @@
type Op func(code rune) bool
-const format = "\t\t{0x%04x, 0x%04x, %d},\n"
-
func dumpRange(header string, inCategory Op) {
- print(header)
- next := rune(0)
- latinOffset := 0
- print("\tR16: []Range16{\n")
- // one Range for each iteration
- count := &range16Count
- size := 16
- for {
- // look for start of range
- for next < rune(len(chars)) && !inCategory(next) {
- next++
+ runes := []rune{}
+ for i := range chars {
+ r := rune(i)
+ if inCategory(r) {
+ runes = append(runes, r)
}
- if next >= rune(len(chars)) {
- // no characters remain
- break
- }
-
- // start of range
- lo := next
- hi := next
- stride := rune(1)
- // accept lo
- next++
- // look for another character to set the stride
- for next < rune(len(chars)) && !inCategory(next) {
- next++
- }
- if next >= rune(len(chars)) {
- // no more characters
- printf(format, lo, hi, stride)
- break
- }
- // set stride
- stride = next - lo
- // check for length of run. next points to first jump in stride
- for i := next; i < rune(len(chars)); i++ {
- if inCategory(i) == (((i - lo) % stride) == 0) {
- // accept
- if inCategory(i) {
- hi = i
- }
- } else {
- // no more characters in this run
- break
- }
- }
- if uint32(hi) <= unicode.MaxLatin1 {
- latinOffset++
- }
- size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count)
- // next range: start looking where this range ends
- next = hi + 1
}
- print("\t},\n")
- if latinOffset > 0 {
- printf("\tLatinOffset: %d,\n", latinOffset)
- }
- print("}\n\n")
+ printRangeTable(header, runes)
}
-func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) {
- if size == 16 && hi >= 1<<16 {
- if lo < 1<<16 {
- if lo+stride != hi {
- logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride)
- }
- // No range contains U+FFFF as an instance, so split
- // the range into two entries. That way we can maintain
- // the invariant that R32 contains only >= 1<<16.
- printf(format, lo, lo, 1)
- lo = hi
- stride = 1
- *count++
- }
- print("\t},\n")
- print("\tR32: []Range32{\n")
- size = 32
- count = &range32Count
+func printRangeTable(header string, runes []rune) {
+ rt := rangetable.New(runes...)
+ print(header)
+ println("\tR16: []Range16{")
+ for _, r := range rt.R16 {
+ printf("\t\t{%#04x, %#04x, %d},\n", r.Lo, r.Hi, r.Stride)
+ range16Count++
}
- printf(format, lo, hi, stride)
- *count++
- return size, count
+ println("\t},")
+ if len(rt.R32) > 0 {
+ println("\tR32: []Range32{")
+ for _, r := range rt.R32 {
+ printf("\t\t{%#x, %#x, %d},\n", r.Lo, r.Hi, r.Stride)
+ range32Count++
+ }
+ println("\t},")
+ }
+ if rt.LatinOffset > 0 {
+ printf("\tLatinOffset: %d,\n", rt.LatinOffset)
+ }
+ printf("}\n\n")
}
func fullCategoryTest(list []string) {
@@ -751,26 +698,6 @@
scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name})
}
-// The script tables have a lot of adjacent elements. Fold them together.
-func foldAdjacent(r []Script) []unicode.Range32 {
- s := make([]unicode.Range32, 0, len(r))
- j := 0
- for i := 0; i < len(r); i++ {
- if j > 0 && r[i].lo == s[j-1].Hi+1 {
- s[j-1].Hi = r[i].hi
- } else {
- s = s[0 : j+1]
- s[j] = unicode.Range32{
- Lo: uint32(r[i].lo),
- Hi: uint32(r[i].hi),
- Stride: 1,
- }
- j++
- }
- }
- return s
-}
-
func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
for _, name := range list {
if _, ok := scripts[name]; !ok {
@@ -796,13 +723,11 @@
// PropList.txt has the same format as Scripts.txt so we can share its parser.
func printScriptOrProperty(doProps bool) {
- flag := "scripts"
flaglist := *scriptlist
file := "Scripts.txt"
table := scripts
installed := unicode.Scripts
if doProps {
- flag = "props"
flaglist = *proplist
file = "PropList.txt"
table = props
@@ -831,13 +756,6 @@
return
}
- printf(
- "// Generated by running\n"+
- "// maketables --%s=%s --url=%s\n"+
- "// DO NOT EDIT\n\n",
- flag,
- flaglist,
- *url)
if flaglist == "all" {
if doProps {
println("// Properties is the set of Unicode property tables.")
@@ -874,19 +792,14 @@
alias, name)
ndecl++
}
- printf("var _%s = &RangeTable {\n", name)
- ranges := foldAdjacent(table[name])
- print("\tR16: []Range16{\n")
- size := 16
- count := &range16Count
- for _, s := range ranges {
- size, count = printRange(s.Lo, s.Hi, s.Stride, size, count)
+ decl := fmt.Sprintf("var _%s = &RangeTable {\n", name)
+ runes := []rune{}
+ for _, scr := range table[name] {
+ for r := scr.lo; r <= scr.hi; r++ {
+ runes = append(runes, rune(r))
+ }
}
- print("\t},\n")
- if off := findLatinOffset(ranges); off > 0 {
- printf("\tLatinOffset: %d,\n", off)
- }
- print("}\n\n")
+ printRangeTable(decl, runes)
}
decl.Sort()
println("// These variables have type *RangeTable.")
@@ -897,14 +810,6 @@
print(")\n\n")
}
-func findLatinOffset(ranges []unicode.Range32) int {
- i := 0
- for i < len(ranges) && ranges[i].Hi <= unicode.MaxLatin1 {
- i++
- }
- return i
-}
-
const (
CaseUpper = 1 << iota
CaseLower
@@ -1054,14 +959,10 @@
return
}
printf(
- "// Generated by running\n"+
- "// maketables --data=%s --casefolding=%s\n"+
- "// DO NOT EDIT\n\n"+
- "// CaseRanges is the table describing case mappings for all letters with\n"+
- "// non-self mappings.\n"+
- "var CaseRanges = _CaseRanges\n"+
- "var _CaseRanges = []CaseRange {\n",
- *dataURL, *casefoldingURL)
+ "// CaseRanges is the table describing case mappings for all letters with\n" +
+ "// non-self mappings.\n" +
+ "var CaseRanges = _CaseRanges\n" +
+ "var _CaseRanges = []CaseRange {\n")
var startState *caseState // the start of a run; nil for not active
var prevState = &caseState{} // the state of the previous character