language: distinguish regional variant in matching
Change-Id: I1d997d625726e4b8152a7d039fc761dfcad45b7a
Reviewed-on: https://go-review.googlesource.com/95832
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
diff --git a/language/httpexample_test.go b/language/httpexample_test.go
index 40d0663..03c0ab9 100644
--- a/language/httpexample_test.go
+++ b/language/httpexample_test.go
@@ -24,7 +24,7 @@
t, q, err := language.ParseAcceptLanguage(r.Header.Get("Accept-Language"))
// We ignore the error: the default language will be selected for t == nil.
tag, _, _ := matcher.Match(t...)
- fmt.Printf("%5v (t: %6v; q: %3v; err: %v)\n", tag, t, q, err)
+ fmt.Printf("%17v (t: %6v; q: %3v; err: %v)\n", tag, t, q, err)
}
func ExampleParseAcceptLanguage() {
@@ -41,8 +41,8 @@
}
// Output:
- // en-GB (t: [ en en-US nn]; q: [ 1 0.8 0.3]; err: <nil>)
- // en-GB (t: [ gsw en-US en]; q: [ 1 0.8 0.7]; err: <nil>)
- // de (t: [ gsw nl da]; q: [ 1 1 1]; err: <nil>)
- // en-GB (t: []; q: []; err: language: tag is not well-formed)
+ // en-GB (t: [ en en-US nn]; q: [ 1 0.8 0.3]; err: <nil>)
+ // en-GB-u-rg-uszzzz (t: [ gsw en-US en]; q: [ 1 0.8 0.7]; err: <nil>)
+ // de (t: [ gsw nl da]; q: [ 1 1 1]; err: <nil>)
+ // en-GB (t: []; q: []; err: language: tag is not well-formed)
}
diff --git a/language/language.go b/language/language.go
index 9ddff22..b254794 100644
--- a/language/language.go
+++ b/language/language.go
@@ -30,10 +30,11 @@
}
func makeTag(t language.Tag) (tag Tag) {
- if region := t.TypeForKey("rg"); len(region) > 2 {
+ if region := t.TypeForKey("rg"); len(region) == 6 && region[2:] == "zzzz" {
if r, err := language.ParseRegion(region[:2]); err == nil {
tFull := t
t, _ = t.SetTypeForKey("rg", "")
+ // TODO: should we not consider "va" for the language tag?
var exact1, exact2 bool
tag.language, exact1 = compactIndex(t)
t.RegionID = r
@@ -60,7 +61,7 @@
tag := t.language.tag()
if t.language != t.locale {
loc := t.locale.tag()
- tag.SetTypeForKey("rg", strings.ToLower(loc.RegionID.String())+"zzzz")
+ tag, _ = tag.SetTypeForKey("rg", strings.ToLower(loc.RegionID.String())+"zzzz")
}
return tag
}
@@ -483,12 +484,56 @@
}
// CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
-// for which data exists in the text repository. The index will change over time
+// for which data exists in the text repository. The index will change over time
// and should not be stored in persistent storage. If t does not match a compact
// index, exact will be false and the compact index will be returned for the
// first match after repeatedly taking the Parent of t.
func CompactIndex(t Tag) (index int, exact bool) {
- return int(t.locale), t.language == t.locale && t.full == nil
+ return int(t.language), t.full == nil
+}
+
+// TODO: make these functions and methods public once we settle on the API.
+
+// regionalCompactIndex reports the compact index of the regional variant of
+// t, i.e. the index held in t.locale rather than the one held in t.language.
+// That index selects region-specific overrides such as default currency,
+// default calendar and week data, default time cycle, and default measurement
+// system and unit preferences.
+//
+// Example: en-GB-u-rg-uszzzz is British English with US regional settings.
+// CompactIndex yields the index for en-GB for that tag, whereas
+// regionalCompactIndex yields the index for en-US.
+//
+// exact is true only when t does not carry a full tag (t.full == nil).
+func regionalCompactIndex(t Tag) (index int, exact bool) {
+	index = int(t.locale)
+	exact = t.full == nil
+	return index, exact
+}
+
+// languageTag returns t stripped of regional variant indicators.
+//
+// At the moment this means that the region override key "rg" and the variant
+// key "va" are removed from the "u" extension.
+func (t Tag) languageTag() Tag {
+	if t.full == nil {
+		// No full tag is stored: the language index alone describes the
+		// result, so use it for both the language and the locale.
+		return Tag{language: t.language, locale: t.language}
+	}
+	tt := t.tag()
+	// SetTypeForKey hands back the modified tag; the result has to be assigned
+	// or the key is never removed (the other call sites in this package assign
+	// it the same way). The error is ignored on purpose: clearing a key is
+	// best effort here, matching those call sites.
+	tt, _ = tt.SetTypeForKey("rg", "")
+	tt, _ = tt.SetTypeForKey("va", "")
+	return makeTag(tt)
+}
+
+// regionalTag returns the regional variant of the tag.
+//
+// Currently the result is built from the locale index of t, so its region is
+// the one given by the "rg" key of the "u" extension; the "rg" key itself is
+// removed from the returned tag.
+func (t Tag) regionalTag() Tag {
+	base := Tag{language: t.locale, locale: t.locale}
+	if t.full == nil {
+		return base
+	}
+	// Re-attach the variants and extensions of t to the locale-based tag, then
+	// clear the "rg" key. Errors from either step are ignored.
+	composed, _ := Raw.Compose(base, t.Variants(), t.Extensions())
+	stripped, _ := composed.SetTypeForKey("rg", "")
+	return stripped
}
func compactIndex(t language.Tag) (index compactID, exact bool) {
diff --git a/language/language_test.go b/language/language_test.go
index 273f61f..168af38 100644
--- a/language/language_test.go
+++ b/language/language_test.go
@@ -46,35 +46,60 @@
}
}
-func TestCompactIndex(t *testing.T) {
- tests := []struct {
- tag string
- index compactID
- ok bool
- }{
- // TODO: these values will change with each CLDR update. This issue
- // will be solved if we decide to fix the indexes.
- {"und", undIndex, true},
- {"ca-ES-valencia", caESvalenciaIndex, true},
- {"ca-ES-valencia-u-va-posix", caESvalenciaIndex, false},
- {"ca-ES-valencia-u-co-phonebk", caESvalenciaIndex, false},
- {"ca-ES-valencia-u-co-phonebk-va-posix", caESvalenciaIndex, false},
- {"x-klingon", 0, false},
- {"en-US", enUSIndex, true},
- {"en-US-u-va-posix", enUSuvaposixIndex, true},
- {"en", enIndex, true},
- {"en-u-co-phonebk", enIndex, false},
- {"en-001", en001Index, true},
- {"zh-Hant-HK", zhHantHKIndex, true},
- {"zh-HK", zhHantHKIndex, false}, // maximized to zh-Hant-HK
- {"nl-Beng", 0, false}, // parent skips script
- {"nl-NO", nlIndex, false}, // region is ignored
- {"nl-Latn-NO", nlIndex, false},
- {"nl-Latn-NO-u-co-phonebk", nlIndex, false},
- {"nl-Latn-NO-valencia", nlIndex, false},
- {"nl-Latn-NO-oxendict", nlIndex, false},
- {"sh", shIndex, true}, // From plural rules.
+// TestString checks that a tag built with Make from each listed string
+// renders back to that same string.
+func TestString(t *testing.T) {
+	tests := []string{
+		"no-u-rg-dkzzzz",
}
+	for i, want := range tests {
+		tag := Make(want)
+		if got := tag.String(); got != want {
+			t.Errorf("%d:%s: got %s: want %s (%#v)", i, want, got, want, tag)
+		}
+	}
+}
+
+type compactTest struct {
+ tag string
+ index compactID
+ ok bool
+}
+
+var compactTests = []compactTest{
+ // TODO: these values will change with each CLDR update. This issue
+ // will be solved if we decide to fix the indexes.
+ {"und", undIndex, true},
+ {"ca-ES-valencia", caESvalenciaIndex, true},
+ {"ca-ES-valencia-u-va-posix", caESvalenciaIndex, false},
+ {"ca-ES-valencia-u-co-phonebk", caESvalenciaIndex, false},
+ {"ca-ES-valencia-u-co-phonebk-va-posix", caESvalenciaIndex, false},
+ {"x-klingon", 0, false},
+ {"en-US", enUSIndex, true},
+ {"en-US-u-va-posix", enUSuvaposixIndex, true},
+ {"en", enIndex, true},
+ {"en-u-co-phonebk", enIndex, false},
+ {"en-001", en001Index, true},
+ {"zh-Hant-HK", zhHantHKIndex, true},
+ {"zh-HK", zhHantHKIndex, false}, // maximized to zh-Hant-HK
+ {"nl-Beng", 0, false}, // parent skips script
+ {"nl-NO", nlIndex, false}, // region is ignored
+ {"nl-Latn-NO", nlIndex, false},
+ {"nl-Latn-NO-u-co-phonebk", nlIndex, false},
+ {"nl-Latn-NO-valencia", nlIndex, false},
+ {"nl-Latn-NO-oxendict", nlIndex, false},
+ {"sh", shIndex, true}, // From plural rules.
+}
+
+func TestCompactIndex(t *testing.T) {
+ tests := append(compactTests, []compactTest{
+ {"en-GB", enGBIndex, true},
+ {"en-GB-u-rg-uszzzz", enGBIndex, true},
+ {"en-GB-u-rg-USZZZZ", enGBIndex, true},
+ {"en-GB-u-rg-uszzzz-va-posix", enGBIndex, false},
+ {"en-GB-u-co-phonebk-rg-uszzzz", enGBIndex, false},
+ // Invalid region specifications are ignored.
+ {"en-GB-u-rg-usz-va-posix", enGBIndex, false},
+ {"en-GB-u-co-phonebk-rg-usz", enGBIndex, false},
+ }...)
for _, tt := range tests {
x, ok := CompactIndex(Raw.MustParse(tt.tag))
if compactID(x) != tt.index || ok != tt.ok {
@@ -83,6 +108,27 @@
}
}
+// TestRegionalCompactIndex runs regionalCompactIndex over the shared
+// compactTests table plus cases that carry an "rg" region override.
+func TestRegionalCompactIndex(t *testing.T) {
+	regional := []compactTest{
+		{"en-GB", enGBIndex, true},
+		{"en-GB-u-rg-uszzzz", enUSIndex, true},
+		{"en-GB-u-rg-USZZZZ", enUSIndex, true},
+		// TODO: use different exact values for language and regional tag?
+		{"en-GB-u-rg-uszzzz-va-posix", enUSuvaposixIndex, false},
+		{"en-GB-u-co-phonebk-rg-uszzzz-va-posix", enUSuvaposixIndex, false},
+		{"en-GB-u-co-phonebk-rg-uszzzz", enUSIndex, false},
+		// Invalid region specifications are ignored.
+		{"en-GB-u-rg-usz-va-posix", enGBIndex, false},
+		{"en-GB-u-co-phonebk-rg-usz", enGBIndex, false},
+	}
+	tests := append(compactTests, regional...)
+	for _, tc := range tests {
+		got, exact := regionalCompactIndex(Raw.MustParse(tc.tag))
+		if compactID(got) == tc.index && exact == tc.ok {
+			continue
+		}
+		t.Errorf("%s: got %d, %v; want %d %v", tc.tag, got, exact, tc.index, tc.ok)
+	}
+}
+
func TestMarshal(t *testing.T) {
testCases := []string{
// TODO: these values will change with each CLDR update. This issue
@@ -99,6 +145,12 @@
"en-u-co-phonebk",
"en-001",
"sh",
+
+ "en-GB-u-rg-uszzzz",
+ "en-GB-u-rg-uszzzz-va-posix",
+ "en-GB-u-co-phonebk-rg-uszzzz",
+ // Invalid tags should also roundtrip.
+ "en-GB-u-co-phonebk-rg-uszz",
}
for _, tc := range testCases {
var tag Tag
@@ -532,6 +584,16 @@
{"und-Qaai", "und-Zinh", DeprecatedScript},
{"und-Qaai", "und-Qaai", DeprecatedBase},
{"drh", "mn", All}, // drh -> khk -> mn
+
+ {"en-GB-u-rg-uszzzz", "en-GB-u-rg-uszzzz", Raw},
+ {"en-GB-u-rg-USZZZZ", "en-GB-u-rg-uszzzz", Raw},
+ // TODO: use different exact values for language and regional tag?
+ {"en-GB-u-rg-uszzzz-va-posix", "en-GB-u-rg-uszzzz-va-posix", Raw},
+ {"en-GB-u-rg-uszzzz-co-phonebk", "en-GB-u-co-phonebk-rg-uszzzz", Raw},
+ // Invalid region specifications are left as is.
+ {"en-GB-u-rg-usz", "en-GB-u-rg-usz", Raw},
+ {"en-GB-u-rg-usz-va-posix", "en-GB-u-rg-usz-va-posix", Raw},
+ {"en-GB-u-rg-usz-co-phonebk", "en-GB-u-co-phonebk-rg-usz", Raw},
}
for i, tt := range tests {
in, _ := Raw.Parse(tt.in)
@@ -558,6 +620,8 @@
{"co", "en-u-co-phonebk", "phonebk"},
{"co", "en-u-co-phonebk-cu-aud", "phonebk"},
{"co", "x-foo-u-co-phonebk", ""},
+ {"va", "en-US-u-va-posix", "posix"},
+ {"rg", "en-u-rg-gbzzzz", "gbzzzz"},
{"nu", "en-u-co-phonebk-nu-arabic", "arabic"},
{"kc", "cmn-u-co-stroke", ""},
}
@@ -656,6 +720,19 @@
{"pt-MZ", "pt-PT"},
{"pt-ST", "pt-PT"},
{"pt-TL", "pt-PT"},
+
+ {"en-GB-u-co-phonebk-rg-uszzzz", "en-GB"},
+ {"en-GB-u-rg-uszzzz", "en-GB"},
+ {"en-US-u-va-posix", "en-US"},
+
+ // Difference between language and regional tag.
+ {"ca-ES-valencia", "ca-ES"},
+ {"ca-ES-valencia-u-rg-ptzzzz", "ca-ES"},
+ {"en-US-u-va-variant", "en-US"},
+ {"en-u-va-variant", "en"},
+ {"en-u-rg-gbzzzz", "en"},
+ {"en-US-u-rg-gbzzzz", "en-US"},
+ {"nl-US-u-rg-gbzzzz", "nl-US"},
}
for _, tt := range tests {
tag := Raw.MustParse(tt.in)
diff --git a/language/match.go b/language/match.go
index d84e972..0edbc9d 100644
--- a/language/match.go
+++ b/language/match.go
@@ -6,6 +6,7 @@
import (
"errors"
+ "strings"
"golang.org/x/text/language/internal"
)
@@ -104,9 +105,14 @@
}
// TODO: select first language tag based on script.
}
- if w.RegionID != 0 && tt.RegionID != 0 && tt.RegionID.Contains(w.RegionID) {
- tt.RegionID = w.RegionID
- tt.RemakeString()
+ if w.RegionID != tt.RegionID && w.RegionID != 0 {
+ if w.RegionID != 0 && tt.RegionID != 0 && tt.RegionID.Contains(w.RegionID) {
+ tt.RegionID = w.RegionID
+ tt.RemakeString()
+ } else if r := w.RegionID.String(); len(r) == 2 {
+ // TODO: also filter macro and deprecated.
+ tt, _ = tt.SetTypeForKey("rg", strings.ToLower(r)+"zzzz")
+ }
}
// Copy options from the user-provided tag into the result tag. This is hard
// to do after the fact, so we do it here.
diff --git a/language/match_test.go b/language/match_test.go
index 3bfefff..c21b863 100644
--- a/language/match_test.go
+++ b/language/match_test.go
@@ -40,13 +40,15 @@
gotCombined, index, conf := NewMatcher(supported).Match(desired...)
gotMatch := supported[index]
- wantMatch := mk(p.String(2))
+ wantMatch := Raw.Make(p.String(2)) // wantMatch may be null
if gotMatch != wantMatch {
t.Fatalf("match: got %q; want %q (%v)", gotMatch, wantMatch, conf)
}
- wantCombined, err := Raw.Parse(p.String(3))
- if err == nil && gotCombined != wantCombined {
- t.Errorf("combined: got %q; want %q (%v)", gotCombined, wantCombined, conf)
+ if tag := strings.TrimSpace(p.String(3)); tag != "" {
+ wantCombined := Raw.MustParse(tag)
+ if err == nil && gotCombined != wantCombined {
+ t.Errorf("combined: got %q; want %q (%v)", gotCombined, wantCombined, conf)
+ }
}
})
})
@@ -68,7 +70,7 @@
// which is better.
// Inconsistencies in combined. I think the Go approach is more appropriate.
- // We could use -u-rg- and -u-va- as alternative.
+ // We could use -u-rg- as alternative.
"und,fr/fr-BE-fonipa": true, // combined: got "fr"; want "fr-BE-fonipa"
"und,fr-CA/fr-BE-fonipa": true, // combined: got "fr-CA"; want "fr-BE-fonipa"
"und,fr-fonupa/fr-BE-fonipa": true, // combined: got "fr-fonupa"; want "fr-BE-fonipa"
diff --git a/language/parse.go b/language/parse.go
index e54a0a4..f0e0b64 100644
--- a/language/parse.go
+++ b/language/parse.go
@@ -74,10 +74,10 @@
// than once, the latter will overwrite the former. Variants and Extensions are
// accumulated, but if two extensions of the same type are passed, the latter
// will replace the former. For -u extensions, though, the key-type pairs are
-// added, where later values overwrite older ones. A Tag overwrites all former values and typically
-// only makes sense as the first argument. The resulting tag is returned after
-// canonicalizing using CanonType c. If one or more errors are encountered,
-// one of the errors is returned.
+// added, where later values overwrite older ones. A Tag overwrites all former
+// values and typically only makes sense as the first argument. The resulting
+// tag is returned after canonicalizing using CanonType c. If one or more errors
+// are encountered, one of the errors is returned.
func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
var b language.Builder
if err = update(&b, part...); err != nil {
diff --git a/language/parse_test.go b/language/parse_test.go
index 7a5b54b..3ac1642 100644
--- a/language/parse_test.go
+++ b/language/parse_test.go
@@ -36,6 +36,11 @@
{in: "root", lang: "und"},
{in: "und", lang: "und"},
{in: "en", lang: "en"},
+
+ {in: "en-US-u-va-posix", lang: "en", region: "US", ext: "u-va-posix"},
+ {in: "ca-ES-valencia", lang: "ca", region: "ES", variants: "valencia"},
+ {in: "en-US-u-rg-gbzzzz", lang: "en", region: "US", ext: "u-rg-gbzzzz"},
+
{in: "xy", lang: "und", invalid: true},
{in: "en-ZY", lang: "en", invalid: true},
{in: "gsw", lang: "gsw"},
diff --git a/language/testdata/GoLocaleMatcherTest.txt b/language/testdata/GoLocaleMatcherTest.txt
index 4f4c609..32a649f 100644
--- a/language/testdata/GoLocaleMatcherTest.txt
+++ b/language/testdata/GoLocaleMatcherTest.txt
@@ -9,7 +9,7 @@
zh-CN, zh-TW, iw ; zh ; zh-CN
zh-CN, zh-TW, iw ; zh-Hans-CN ; zh-CN
zh-CN, zh-TW, iw ; zh-Hant-HK ; zh-TW
-zh-CN, zh-TW, iw ; he-IT ; iw ; iw
+zh-CN, zh-TW, iw ; he-IT ; iw ; iw-u-rg-itzzzz
# language-specific script fallbacks 1
en, sr, nl ; sr-Latn ; sr
@@ -125,7 +125,7 @@
pt, pt-PT ; pt-ES ; pt-PT
# if no preferred locale specified, pick top language, not regional
-en, fr, fr-CA, fr-CH ; fr-US ; fr #TODO: ; fr-u-rg-US
+en, fr, fr-CA, fr-CH ; fr-US ; fr ; fr-u-rg-uszzzz
# region distance German
de-AT, de-DE, de-CH ; de ; de-DE
@@ -218,9 +218,14 @@
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; pt-TL ; pt-PT
# preserve extensions
-en, de, sl-nedis ; de-FR-u-co-phonebk ; de ; de-u-co-phonebk
+en, de, sl-nedis ; de-FR-u-co-phonebk ; de ; de-u-co-phonebk-rg-frzzzz
en, de, sl-nedis ; sl-nedis-u-cu-eur ; sl-nedis ; sl-nedis-u-cu-eur
en, de, sl-nedis ; sl-u-cu-eur ; sl-nedis ; sl-nedis-u-cu-eur
-en, de, sl-nedis ; sl-HR-nedis-u-cu-eur ; sl-nedis ; sl-nedis-u-cu-eur
+en, de, sl-nedis ; sl-HR-nedis-u-cu-eur ; sl-nedis ; sl-nedis-u-cu-eur-rg-hrzzzz
en, de, sl-nedis ; de-t-m0-iso-i0-pinyin ; de ; de-t-m0-iso-i0-pinyin
+und, nl ; nl-BE-fonipa ; nl ; nl-u-rg-bezzzz
+und, nl-CA ; nl-BE-fonipa ; nl-CA ; nl-CA-u-rg-bezzzz
+und, nl-fonupa ; nl-BE-fonipa ; nl-fonupa ; nl-fonupa-u-rg-bezzzz
+und, no ; nn-DK-fonipa ; no ; no-u-rg-dkzzzz
+und, en-GB-u-sd-usca ; en-US-fonipa-u-nu-Arab-ca-buddhist-sd-usdc-t-m0-iso-i0-pinyin ; en-GB-u-sd-usca ; en-GB-t-m0-iso-i0-pinyin-u-ca-buddhist-nu-Arab-rg-uszzzz-sd-usca
\ No newline at end of file
diff --git a/message/catalog/catalog_test.go b/message/catalog/catalog_test.go
index 08bfdc7..3de4c52 100644
--- a/message/catalog/catalog_test.go
+++ b/message/catalog/catalog_test.go
@@ -63,7 +63,7 @@
},
match: []string{
"gr -> en",
- "en-US -> en",
+ "en-US -> en-u-rg-uszzzz",
},
tags: langs("en"),
}, {
diff --git a/message/catalog_test.go b/message/catalog_test.go
index 7a2301c..ff409a8 100644
--- a/message/catalog_test.go
+++ b/message/catalog_test.go
@@ -23,7 +23,7 @@
want string
}{{
args: "de-CH",
- want: "de",
+ want: "de-u-rg-chzzzz",
}, {
args: "bn-u-nu-latn|en-US,en;q=0.9,de;q=0.8,nl;q=0.7",
want: "bn-u-nu-latn",