language: implement paradigm locales

This allows deleting the parent distance tie-breaker (parentDistance).

This also completes support for letting the user
order dialects of a language non-contiguously.

Change-Id: I09a8d21a6c6e18edc24db136b280ba960eb72370
Reviewed-on: https://go-review.googlesource.com/55911
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
diff --git a/language/gen.go b/language/gen.go
index 50f772d..7c260e5 100644
--- a/language/gen.go
+++ b/language/gen.go
@@ -1417,6 +1417,22 @@
 	}
 	b.writeSlice("regionToGroups", regionToGroups)
 
+	// maps language id to in- and out-of-group region.
+	paradigmLocales := [][3]uint16{}
+	locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
+	for i := 0; i < len(locales); i += 2 {
+		x := [3]uint16{}
+		for j := 0; j < 2; j++ {
+			pc := strings.SplitN(locales[i+j], "-", 2)
+			x[0] = b.langIndex(pc[0])
+			if len(pc) == 2 {
+				x[1+j] = uint16(b.region.index(pc[1]))
+			}
+		}
+		paradigmLocales = append(paradigmLocales, x)
+	}
+	b.writeSlice("paradigmLocales", paradigmLocales)
+
 	b.writeType(mutualIntelligibility{})
 	b.writeType(scriptIntelligibility{})
 	b.writeType(regionIntelligibility{})
diff --git a/language/match.go b/language/match.go
index 2f14436..b7779ed 100644
--- a/language/match.go
+++ b/language/match.go
@@ -559,9 +559,6 @@
 		}
 	}
 
-	// TODO: include alt script.
-	// - don't replace regions, but allow regions to be made more specific.
-
 	// update is used to add indexes in the map for equivalent languages.
 	// update will only add entries to original indexes, thus not computing any
 	// transitive relations.
@@ -687,16 +684,18 @@
 
 // bestMatch accumulates the best match so far.
 type bestMatch struct {
-	have        *haveTag
-	want        Tag
-	conf        Confidence
-	pinLanguage bool
+	have            *haveTag
+	want            Tag
+	conf            Confidence
+	pinnedRegion    regionID
+	pinLanguage     bool
+	sameRegionGroup bool
 	// Cached results from applying tie-breaking rules.
 	origLang     bool
 	origReg      bool
+	paradigmReg  bool
 	regGroupDist uint8
 	origScript   bool
-	parentDist   uint8 // 255 if have is not an ancestor of want tag.
 }
 
 // update updates the existing best match if the new pair is considered to be a
@@ -723,12 +722,20 @@
 	if m.pinLanguage && tag.lang != m.want.lang {
 		return
 	}
-	if c == Exact && have.tag.script == tag.script {
+	// Pin the region group if we are comparing tags for the same language.
+	if tag.lang == m.want.lang && m.sameRegionGroup {
+		_, sameGroup := regionGroupDist(m.pinnedRegion, have.maxRegion, have.maxScript, m.want.lang)
+		if !sameGroup {
+			return
+		}
+	}
+	if c == Exact && have.maxScript == maxScript {
+		// If there is another language and then another entry of this language,
+		// don't pin anything, otherwise pin the language.
 		m.pinLanguage = pin
 	}
 	if have.tag.equalsRest(tag) {
 	} else if have.maxScript != maxScript {
-		// fmt.Println("FFFFF", maxScript, have.maxScript)
 		// There is usually very little comprehension between different scripts.
 		// In a few cases there may still be Low comprehension. This possibility
 		// is pre-computed and stored in have.altScript.
@@ -737,9 +744,8 @@
 		}
 		c = Low
 	} else if have.maxRegion != maxRegion {
-		// There is usually a small difference between languages across regions.
-		// We use the region distance (below) to disambiguate between equal matches.
 		if High < c {
+			// There is usually a small difference between languages across regions.
 			c = High
 		}
 	}
@@ -766,7 +772,16 @@
 		beaten = true
 	}
 
-	regGroupDist := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.lang)
+	// We prefer if the pre-maximized region was specified and identical.
+	origReg := have.tag.region == tag.region && tag.region != 0
+	if !beaten && m.origReg != origReg {
+		if m.origReg {
+			return
+		}
+		beaten = true
+	}
+
+	regGroupDist, sameGroup := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.lang)
 	if !beaten && m.regGroupDist != regGroupDist {
 		if regGroupDist > m.regGroupDist {
 			return
@@ -774,10 +789,9 @@
 		beaten = true
 	}
 
-	// We prefer if the pre-maximized region was specified and identical.
-	origReg := have.tag.region == tag.region && tag.region != 0
-	if !beaten && m.origReg != origReg {
-		if m.origReg {
+	paradigmReg := isParadigmLocale(tag.lang, have.maxRegion)
+	if !beaten && m.paradigmReg != paradigmReg {
+		if !paradigmReg {
 			return
 		}
 		beaten = true
@@ -792,48 +806,35 @@
 		beaten = true
 	}
 
-	// TODO: remove parent distance once primary locales are implemented.
-	parentDist := parentDistance(have.tag.region, tag)
-	if !beaten && m.parentDist != parentDist {
-		if parentDist > m.parentDist {
-			return
-		}
-		beaten = true
-	}
-
 	// Update m to the newly found best match.
 	if beaten {
 		m.have = have
 		m.want = tag
 		m.conf = c
+		m.pinnedRegion = maxRegion
+		m.sameRegionGroup = sameGroup
 		m.origLang = origLang
 		m.origReg = origReg
+		m.paradigmReg = paradigmReg
 		m.origScript = origScript
 		m.regGroupDist = regGroupDist
-		m.parentDist = parentDist
 	}
 }
 
-// parentDistance returns the number of times Parent must be called before the
-// regions match. It is assumed that it has already been checked that lang and
-// script are identical. If haveRegion does not occur in the ancestor chain of
-// tag, it returns 255.
-func parentDistance(haveRegion regionID, tag Tag) uint8 {
-	p := tag.Parent()
-	d := uint8(1)
-	for haveRegion != p.region {
-		if p.region == 0 {
-			return 255
+func isParadigmLocale(lang langID, r regionID) bool {
+	for _, e := range paradigmLocales {
+		if langID(e[0]) == lang && (r == regionID(e[1]) || r == regionID(e[2])) {
+			return true
 		}
-		p = p.Parent()
-		d++
 	}
-	return d
+	return false
 }
 
 // regionGroupDist computes the distance between two regions based on their
 // CLDR grouping.
-func regionGroupDist(a, b regionID, script scriptID, lang langID) uint8 {
+func regionGroupDist(a, b regionID, script scriptID, lang langID) (dist uint8, same bool) {
+	const defaultDistance = 4
+
 	aGroup := uint(regionToGroups[a]) << 1
 	bGroup := uint(regionToGroups[b]) << 1
 	for _, ri := range matchRegion {
@@ -841,17 +842,16 @@
 			group := uint(1 << (ri.group &^ 0x80))
 			if 0x80&ri.group == 0 {
 				if aGroup&bGroup&group != 0 { // Both regions are in the group.
-					return ri.distance
+					return ri.distance, ri.distance == defaultDistance
 				}
 			} else {
 				if (aGroup|bGroup)&group == 0 { // Both regions are not in the group.
-					return ri.distance
+					return ri.distance, ri.distance == defaultDistance
 				}
 			}
 		}
 	}
-	const defaultDistance = 4
-	return defaultDistance
+	return defaultDistance, true
 }
 
 func (t Tag) variants() string {
@@ -898,4 +898,14 @@
 			notEquivalent = append(notEquivalent, langID(lm.from))
 		}
 	}
+	// Maximize undefined regions of paradigm locales.
+	for i, v := range paradigmLocales {
+		max, _ := addTags(Tag{lang: langID(v[0])})
+		if v[1] == 0 {
+			paradigmLocales[i][1] = uint16(max.region)
+		}
+		if v[2] == 0 {
+			paradigmLocales[i][2] = uint16(max.region)
+		}
+	}
 }
diff --git a/language/match_test.go b/language/match_test.go
index ca77115..f1b9010 100644
--- a/language/match_test.go
+++ b/language/match_test.go
@@ -64,15 +64,6 @@
 	// combined: got "en-GB-u-ca-buddhist-nu-arab"; want "en-GB-fonipa-t-m0-iso-i0-pinyin-u-ca-buddhist-nu-arab"
 	"und,en-GB-u-sd-gbsct/en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin": true,
 
-	// Go prefers exact matches over less exact preferred ones.
-	// Preferring desired ones might be better.
-	// NOTE: allow users to distinguish languages is a good solution.
-	//       the remaining cases are due to preferred locale rules.
-	"pt-PT,pt-BR,es,es-419/pt-US,pt-PT": true, // match: got "pt-PT"; want "pt-BR"
-	"pt-PT,pt,es,es-419/pt-US,pt-PT,pt": true, // match: got "pt-PT"; want "pt"
-	// TODO: implement prefer primary locales.
-	"und,en,en-GU,en-IN,en-GB/en-ZA": true, // match: got "en-IN"; want "en-GB"
-
 	// Inconsistencies with Mark Davis' implementation where it is not clear
 	// which is better.
 
@@ -259,6 +250,12 @@
 	}{
 		{"zh-TW", "zh-HK", 5},
 		{"zh-MO", "zh-HK", 4},
+		{"es-ES", "es-AR", 5},
+		{"es-ES", "es", 4},
+		{"es-419", "es-MX", 4},
+		{"es-AR", "es-MX", 4},
+		{"es-ES", "es-MX", 5},
+		{"es-PT", "es-MX", 5},
 	}
 	for _, tc := range testCases {
 		a := MustParse(tc.a)
@@ -270,33 +267,27 @@
 			t.Errorf("scripts differ: %q vs %q", aScript, bScript)
 			continue
 		}
-		d := regionGroupDist(a.region, b.region, aScript.scriptID, a.lang)
+		d, _ := regionGroupDist(a.region, b.region, aScript.scriptID, a.lang)
 		if d != tc.distance {
 			t.Errorf("got %q; want %q", d, tc.distance)
 		}
 	}
 }
 
-func TestParentDistance(t *testing.T) {
-	tests := []struct {
-		parent string
-		tag    string
-		d      uint8
-	}{
-		{"en-001", "en-AU", 1},
-		{"pt-PT", "pt-AO", 1},
-		{"pt", "pt-AO", 2},
-		{"en-AU", "en-GB", 255},
-		{"en-NL", "en-AU", 255},
-		// Note that pt-BR and en-US are not automatically minimized.
-		{"pt-BR", "pt-AO", 255},
-		{"en-US", "en-AU", 255},
+func TestIsParadigmLocale(t *testing.T) {
+	testCases := map[string]bool{
+		"en-US":  true,
+		"en-GB":  true,
+		"en-VI":  false,
+		"es-GB":  false,
+		"es-ES":  true,
+		"es-419": true,
 	}
-	for _, tt := range tests {
-		r := Raw.MustParse(tt.parent).region
-		tag := Raw.MustParse(tt.tag)
-		if d := parentDistance(r, tag); d != tt.d {
-			t.Errorf("d(%s, %s) was %d; want %d", r, tag, d, tt.d)
+	for str, want := range testCases {
+		tag := Make(str)
+		got := isParadigmLocale(tag.lang, tag.region)
+		if got != want {
+			t.Errorf("isPL(%q) = %v; want %v", str, got, want)
 		}
 	}
 }
diff --git a/language/tables.go b/language/tables.go
index a5e59d3..ec17f97 100644
--- a/language/tables.go
+++ b/language/tables.go
@@ -3335,6 +3335,13 @@
 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 }
 
+// Size: 18 bytes, 3 elements
+var paradigmLocales = [3][3]uint16{
+	0: [3]uint16{0x138, 0x0, 0x7b},
+	1: [3]uint16{0x13d, 0x0, 0x1f},
+	2: [3]uint16{0x3be, 0x41, 0xee},
+}
+
 type mutualIntelligibility struct {
 	want     uint16
 	have     uint16
@@ -3665,4 +3672,4 @@
 	4: {lang: 0x527, script: 0x38, maxScript: 0x38, toRegion: 0x8d, fromRegion: []uint16{0xc6}},
 }
 
-// Total table size 27157 bytes (26KiB); checksum: F21EE307
+// Total table size 27175 bytes (26KiB); checksum: 569649CD