| diff --git a/source/common/Makefile.in b/source/common/Makefile.in |
| index 79e371b0..d21f5d06 100644 |
| --- a/source/common/Makefile.in |
| +++ b/source/common/Makefile.in |
| @@ -88,8 +88,9 @@ ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \ |
| ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_set.o ucnv_ct.o \ |
| resource.o uresbund.o ures_cnv.o uresdata.o resbund.o resbund_cnv.o \ |
| ucurr.o \ |
| -localebuilder.o \ |
| +localebuilder.o localeprioritylist.o \ |
| messagepattern.o ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o locdspnm.o loclikely.o locresdata.o \ |
| +lsr.o loclikelysubtags.o locdistance.o localematcher.o \ |
| bytestream.o stringpiece.o bytesinkutil.o \ |
| stringtriebuilder.o bytestriebuilder.o \ |
| bytestrie.o bytestrieiterator.o \ |
| diff --git a/source/common/charstr.cpp b/source/common/charstr.cpp |
| index 852cc539..dda29dac 100644 |
| --- a/source/common/charstr.cpp |
| +++ b/source/common/charstr.cpp |
| @@ -35,6 +35,17 @@ CharString& CharString::operator=(CharString&& src) U_NOEXCEPT { |
| return *this; |
| } |
| |
| +char *CharString::cloneData(UErrorCode &errorCode) const { |
| + if (U_FAILURE(errorCode)) { return nullptr; } |
| + char *copy = static_cast<char *>(uprv_malloc(len + 1)); |
| + if (copy != nullptr) { |
| + uprv_memcpy(copy, buffer.getAlias(), len + 1); // len + 1 also copies the terminating NUL |
| + } else { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + } |
| + return copy; |
| +} |
| + |
| CharString &CharString::copyFrom(const CharString &s, UErrorCode &errorCode) { |
| if(U_SUCCESS(errorCode) && this!=&s && ensureCapacity(s.len+1, 0, errorCode)) { |
| len=s.len; |
| @@ -52,6 +63,18 @@ int32_t CharString::lastIndexOf(char c) const { |
| return -1; |
| } |
| |
| +bool CharString::contains(StringPiece s) const { |
| + const int32_t needleLength = s.length(); |
| + if (needleLength == 0) { return false; } // an empty needle is reported as not contained |
| + const char *haystack = buffer.getAlias(); |
| + for (int32_t start = 0; start + needleLength <= len; ++start) { |
| + if (uprv_memcmp(haystack + start, s.data(), needleLength) == 0) { |
| + return true; |
| + } |
| + } |
| + return false; |
| +} |
| + |
| CharString &CharString::truncate(int32_t newLength) { |
| if(newLength<0) { |
| newLength=0; |
| diff --git a/source/common/charstr.h b/source/common/charstr.h |
| index 1a97e019..23b950ed 100644 |
| --- a/source/common/charstr.h |
| +++ b/source/common/charstr.h |
| @@ -82,10 +82,24 @@ public: |
| |
| const char *data() const { return buffer.getAlias(); } |
| char *data() { return buffer.getAlias(); } |
| + /** |
| + * Allocates length()+1 chars and copies the NUL-terminated data(). |
| + * The caller must uprv_free() the result. |
| + */ |
| + char *cloneData(UErrorCode &errorCode) const; |
| + |
| + bool operator==(StringPiece other) const { |
| + return len == other.length() && (len == 0 || uprv_memcmp(data(), other.data(), len) == 0); |
| + } |
| + bool operator!=(StringPiece other) const { |
| + return !operator==(other); |
| + } |
| |
| /** @return last index of c, or -1 if c is not in this string */ |
| int32_t lastIndexOf(char c) const; |
| |
| + bool contains(StringPiece s) const; |
| + |
| CharString &clear() { len=0; buffer[0]=0; return *this; } |
| CharString &truncate(int32_t newLength); |
| |
| diff --git a/source/common/localebuilder.cpp b/source/common/localebuilder.cpp |
| index fe931fcf..837b92f1 100644 |
| --- a/source/common/localebuilder.cpp |
| +++ b/source/common/localebuilder.cpp |
| @@ -157,13 +157,18 @@ _isKeywordValue(const char* key, const char* value, int32_t value_len) |
| } |
| |
| static void |
| -_copyExtensions(const Locale& from, Locale* to, bool validate, UErrorCode& errorCode) |
| +_copyExtensions(const Locale& from, icu::StringEnumeration *keywords, |
| + Locale& to, bool validate, UErrorCode& errorCode) |
| { |
| if (U_FAILURE(errorCode)) { return; } |
| - LocalPointer<icu::StringEnumeration> iter(from.createKeywords(errorCode)); |
| - if (U_FAILURE(errorCode) || iter.isNull()) { return; } |
| + LocalPointer<icu::StringEnumeration> ownedKeywords; |
| + if (keywords == nullptr) { |
| + ownedKeywords.adoptInstead(from.createKeywords(errorCode)); |
| + if (U_FAILURE(errorCode) || ownedKeywords.isNull()) { return; } |
| + keywords = ownedKeywords.getAlias(); |
| + } |
| const char* key; |
| - while ((key = iter->next(nullptr, errorCode)) != nullptr) { |
| + while ((key = keywords->next(nullptr, errorCode)) != nullptr) { |
| CharString value; |
| CharStringByteSink sink(&value); |
| from.getKeywordValue(key, sink, errorCode); |
| @@ -176,34 +181,34 @@ _copyExtensions(const Locale& from, Locale* to, bool validate, UErrorCode& error |
| errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| - to->setKeywordValue(key, value.data(), errorCode); |
| + to.setKeywordValue(key, value.data(), errorCode); |
| if (U_FAILURE(errorCode)) { return; } |
| } |
| } |
| |
| void static |
| -_clearUAttributesAndKeyType(Locale* locale, UErrorCode& errorCode) |
| +_clearUAttributesAndKeyType(Locale& locale, UErrorCode& errorCode) |
| { |
| // Clear Unicode attributes |
| - locale->setKeywordValue(kAttributeKey, "", errorCode); |
| + locale.setKeywordValue(kAttributeKey, "", errorCode); |
| |
| // Clear all Unicode keyword values |
| - LocalPointer<icu::StringEnumeration> iter(locale->createUnicodeKeywords(errorCode)); |
| + LocalPointer<icu::StringEnumeration> iter(locale.createUnicodeKeywords(errorCode)); |
| if (U_FAILURE(errorCode) || iter.isNull()) { return; } |
| const char* key; |
| while ((key = iter->next(nullptr, errorCode)) != nullptr) { |
| - locale->setUnicodeKeywordValue(key, nullptr, errorCode); |
| + locale.setUnicodeKeywordValue(key, nullptr, errorCode); |
| } |
| } |
| |
| static void |
| -_setUnicodeExtensions(Locale* locale, const CharString& value, UErrorCode& errorCode) |
| +_setUnicodeExtensions(Locale& locale, const CharString& value, UErrorCode& errorCode) |
| { |
| // Add the unicode extensions to extensions_ |
| CharString locale_str("und-u-", errorCode); |
| locale_str.append(value, errorCode); |
| _copyExtensions( |
| - Locale::forLanguageTag(locale_str.data(), errorCode), |
| + Locale::forLanguageTag(locale_str.data(), errorCode), nullptr, |
| locale, false, errorCode); |
| } |
| |
| @@ -235,10 +240,10 @@ LocaleBuilder& LocaleBuilder::setExtension(char key, StringPiece value) |
| status_); |
| return *this; |
| } |
| - _clearUAttributesAndKeyType(extensions_, status_); |
| + _clearUAttributesAndKeyType(*extensions_, status_); |
| if (U_FAILURE(status_)) { return *this; } |
| if (!value.empty()) { |
| - _setUnicodeExtensions(extensions_, value_str, status_); |
| + _setUnicodeExtensions(*extensions_, value_str, status_); |
| } |
| return *this; |
| } |
| @@ -401,6 +406,24 @@ Locale makeBogusLocale() { |
| return bogus; |
| } |
| |
| +void LocaleBuilder::copyExtensionsFrom(const Locale& src, UErrorCode& errorCode) |
| +{ |
| + // Sets each keyword of src on this builder's extensions_ (created on first use). |
| + // Does nothing if errorCode already indicates a failure or src has no keywords. |
| + if (U_FAILURE(errorCode)) { return; } |
| + LocalPointer<icu::StringEnumeration> keywords(src.createKeywords(errorCode)); |
| + if (U_FAILURE(errorCode) || keywords.isNull() || keywords->count(errorCode) == 0) { |
| + return; // Error, or no extensions to copy. |
| + } |
| + if (extensions_ == nullptr) { extensions_ = new Locale(); } |
| + if (extensions_ == nullptr) { |
| + // Fail the builder and also tell the caller, who only looks at errorCode. |
| + errorCode = status_ = U_MEMORY_ALLOCATION_ERROR; |
| + return; |
| + } |
| + _copyExtensions(src, keywords.getAlias(), *extensions_, false, errorCode); |
| +} |
| + |
| Locale LocaleBuilder::build(UErrorCode& errorCode) |
| { |
| if (U_FAILURE(errorCode)) { |
| @@ -425,7 +448,7 @@ Locale LocaleBuilder::build(UErrorCode& errorCode) |
| } |
| Locale product(locale_str.data()); |
| if (extensions_ != nullptr) { |
| - _copyExtensions(*extensions_, &product, true, errorCode); |
| + _copyExtensions(*extensions_, nullptr, product, true, errorCode); |
| } |
| if (U_FAILURE(errorCode)) { |
| return makeBogusLocale(); |
| diff --git a/source/common/localematcher.cpp b/source/common/localematcher.cpp |
| new file mode 100644 |
| index 00000000..d975fe75 |
| --- /dev/null |
| +++ b/source/common/localematcher.cpp |
| @@ -0,0 +1,720 @@ |
| +// © 2019 and later: Unicode, Inc. and others. |
| +// License & terms of use: http://www.unicode.org/copyright.html#License |
| + |
| +// localematcher.cpp |
| +// created: 2019may08 Markus W. Scherer |
| + |
| +#ifndef __LOCMATCHER_H__ |
| +#define __LOCMATCHER_H__ |
| + |
| +#include "unicode/utypes.h" |
| +#include "unicode/localebuilder.h" |
| +#include "unicode/localematcher.h" |
| +#include "unicode/locid.h" |
| +#include "unicode/stringpiece.h" |
| +#include "unicode/uobject.h" |
| +#include "cstring.h" |
| +#include "localeprioritylist.h" |
| +#include "loclikelysubtags.h" |
| +#include "locdistance.h" |
| +#include "lsr.h" |
| +#include "uassert.h" |
| +#include "uhash.h" |
| +#include "uvector.h" |
| + |
| +#define UND_LSR LSR("und", "", "") |
| + |
| +/** |
| + * Indicator for the lifetime of desired-locale objects passed into the LocaleMatcher. |
| + * |
| + * @draft ICU 65 |
| + */ |
| +enum ULocMatchLifetime { |
| + /** |
| + * Locale objects are temporary. |
| + * The matcher will make a copy of a locale that will be used beyond one function call. |
| + * |
| + * @draft ICU 65 |
| + */ |
| + ULOCMATCH_TEMPORARY_LOCALES, |
| + /** |
| + * Locale objects are stored at least as long as the matcher is used. |
| + * The matcher will keep only a pointer to a locale that will be used beyond one function call, |
| + * avoiding a copy. |
| + * |
| + * @draft ICU 65 |
| + */ |
| + ULOCMATCH_STORED_LOCALES // TODO: permanent? cached? clone? |
| +}; |
| +#ifndef U_IN_DOXYGEN |
| +typedef enum ULocMatchLifetime ULocMatchLifetime; |
| +#endif |
| + |
| +U_NAMESPACE_BEGIN |
| + |
| +LocaleMatcher::Result::Result(LocaleMatcher::Result &&src) U_NOEXCEPT : |
| + desiredLocale(src.desiredLocale), |
| + supportedLocale(src.supportedLocale), |
| + desiredIndex(src.desiredIndex), |
| + supportedIndex(src.supportedIndex), |
| + desiredIsOwned(src.desiredIsOwned) { |
| + // Only an owned desired locale has to be detached from src; an alias is left in place. |
| + if (!desiredIsOwned) { return; } |
| + src.desiredLocale = nullptr; |
| + src.desiredIndex = -1; |
| + src.desiredIsOwned = FALSE; |
| +} |
| + |
| +LocaleMatcher::Result::~Result() { |
| + if (desiredIsOwned) { |
| + delete desiredLocale; |
| + } |
| +} |
| + |
| +LocaleMatcher::Result &LocaleMatcher::Result::operator=(LocaleMatcher::Result &&src) U_NOEXCEPT { |
| + if (this == &src) { return *this; } // self-move: the cleanup below would free what is then copied back |
| + this->~Result(); // releases an owned desiredLocale before the fields are overwritten |
| + desiredLocale = src.desiredLocale; |
| + supportedLocale = src.supportedLocale; |
| + desiredIndex = src.desiredIndex; |
| + supportedIndex = src.supportedIndex; |
| + desiredIsOwned = src.desiredIsOwned; |
| + |
| + if (desiredIsOwned) { // ownership moved here, so src must no longer delete the locale |
| + src.desiredLocale = nullptr; |
| + src.desiredIndex = -1; |
| + src.desiredIsOwned = FALSE; |
| + } |
| + return *this; |
| +} |
| + |
| +Locale LocaleMatcher::Result::makeResolvedLocale(UErrorCode &errorCode) const { |
| + if (U_FAILURE(errorCode) || supportedLocale == nullptr) { |
| + return Locale::getRoot(); |
| + } |
| + const Locale *bestDesired = getDesiredLocale(); |
| + if (bestDesired == nullptr || *supportedLocale == *bestDesired) { |
| + return *supportedLocale; |
| + } |
| + LocaleBuilder b; |
| + b.setLocale(*supportedLocale); |
| + |
| + // Copy the region from bestDesired, if there is one. |
| + const char *region = bestDesired->getCountry(); |
| + if (*region != 0) { |
| + b.setRegion(region); |
| + } |
| + |
| + // Copy the variants from bestDesired, if there are any. |
| + // Note that this will override any supportedLocale variants. |
| + // For example, "sco-ulster-fonipa" + "...-fonupa" => "sco-fonupa" (replacing ulster). |
| + const char *variants = bestDesired->getVariant(); |
| + if (*variants != 0) { |
| + b.setVariant(variants); |
| + } |
| + |
| + // Copy the extensions from bestDesired, if there are any. |
| + // C++ note: The following note, copied from Java, may not be true, |
| + // as long as C++ copies by legacy ICU keyword, not by extension singleton. |
| + // Note that this will override any supportedLocale extensions. |
| + // For example, "th-u-nu-latn-ca-buddhist" + "...-u-nu-native" => "th-u-nu-native" |
| + // (replacing calendar). |
| + b.copyExtensionsFrom(*bestDesired, errorCode); |
| + return b.build(errorCode); |
| +} |
| + |
| +LocaleMatcher::Builder::Builder(LocaleMatcher::Builder &&src) U_NOEXCEPT : |
| + errorCode_(src.errorCode_), |
| + supportedLocales_(src.supportedLocales_), |
| + thresholdDistance_(src.thresholdDistance_), |
| + demotion_(src.demotion_), |
| + defaultLocale_(src.defaultLocale_), |
| + favor_(src.favor_) { |
| + src.supportedLocales_ = nullptr; |
| + src.defaultLocale_ = nullptr; |
| +} |
| + |
| +LocaleMatcher::Builder::~Builder() { |
| + delete supportedLocales_; |
| + delete defaultLocale_; |
| +} |
| + |
| +LocaleMatcher::Builder &LocaleMatcher::Builder::operator=(LocaleMatcher::Builder &&src) U_NOEXCEPT { |
| + if (this == &src) { return *this; } // self-move: the cleanup below would delete the data being moved |
| + this->~Builder(); // deletes the supported-locales vector and default locale about to be replaced |
| + errorCode_ = src.errorCode_; |
| + supportedLocales_ = src.supportedLocales_; |
| + thresholdDistance_ = src.thresholdDistance_; |
| + demotion_ = src.demotion_; |
| + defaultLocale_ = src.defaultLocale_; |
| + favor_ = src.favor_; |
| + |
| + src.supportedLocales_ = nullptr; // the owned pointers now belong to *this |
| + src.defaultLocale_ = nullptr; |
| + return *this; |
| +} |
| + |
| +void LocaleMatcher::Builder::clearSupportedLocales() { |
| + if (supportedLocales_ != nullptr) { |
| + supportedLocales_->removeAllElements(); |
| + } |
| +} |
| + |
| +bool LocaleMatcher::Builder::ensureSupportedLocaleVector() { |
| + if (U_FAILURE(errorCode_)) { return false; } |
| + if (supportedLocales_ != nullptr) { return true; } |
| + supportedLocales_ = new UVector(uprv_deleteUObject, nullptr, errorCode_); |
| + if (U_FAILURE(errorCode_)) { return false; } |
| + if (supportedLocales_ == nullptr) { |
| + errorCode_ = U_MEMORY_ALLOCATION_ERROR; |
| + return false; |
| + } |
| + return true; |
| +} |
| + |
| +LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListString( |
| + StringPiece locales) { |
| + LocalePriorityList list(locales, errorCode_); |
| + if (U_FAILURE(errorCode_)) { return *this; } |
| + clearSupportedLocales(); |
| + if (!ensureSupportedLocaleVector()) { return *this; } |
| + int32_t length = list.getLengthIncludingRemoved(); |
| + for (int32_t i = 0; i < length; ++i) { |
| + Locale *locale = list.orphanLocaleAt(i); |
| + if (locale == nullptr) { continue; } |
| + supportedLocales_->addElement(locale, errorCode_); |
| + if (U_FAILURE(errorCode_)) { |
| + delete locale; |
| + break; |
| + } |
| + } |
| + return *this; |
| +} |
| + |
| +LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator &locales) { |
| + if (U_FAILURE(errorCode_)) { return *this; } |
| + clearSupportedLocales(); |
| + if (!ensureSupportedLocaleVector()) { return *this; } |
| + while (locales.hasNext()) { |
| + const Locale &locale = locales.next(); |
| + Locale *clone = locale.clone(); |
| + if (clone == nullptr) { |
| + errorCode_ = U_MEMORY_ALLOCATION_ERROR; |
| + break; |
| + } |
| + supportedLocales_->addElement(clone, errorCode_); |
| + if (U_FAILURE(errorCode_)) { |
| + delete clone; |
| + break; |
| + } |
| + } |
| + return *this; |
| +} |
| + |
| +LocaleMatcher::Builder &LocaleMatcher::Builder::addSupportedLocale(const Locale &locale) { |
| + if (!ensureSupportedLocaleVector()) { return *this; } |
| + Locale *clone = locale.clone(); |
| + if (clone == nullptr) { |
| + errorCode_ = U_MEMORY_ALLOCATION_ERROR; |
| + return *this; |
| + } |
| + supportedLocales_->addElement(clone, errorCode_); |
| + if (U_FAILURE(errorCode_)) { |
| + delete clone; |
| + } |
| + return *this; |
| +} |
| + |
| +LocaleMatcher::Builder &LocaleMatcher::Builder::setDefaultLocale(const Locale *defaultLocale) { |
| + if (U_FAILURE(errorCode_)) { return *this; } |
| + Locale *clone = nullptr; |
| + if (defaultLocale != nullptr) { |
| + clone = defaultLocale->clone(); |
| + if (clone == nullptr) { |
| + errorCode_ = U_MEMORY_ALLOCATION_ERROR; |
| + return *this; |
| + } |
| + } |
| + delete defaultLocale_; |
| + defaultLocale_ = clone; |
| + return *this; |
| +} |
| + |
| +LocaleMatcher::Builder &LocaleMatcher::Builder::setFavorSubtag(ULocMatchFavorSubtag subtag) { |
| + if (U_FAILURE(errorCode_)) { return *this; } |
| + favor_ = subtag; |
| + return *this; |
| +} |
| + |
| +LocaleMatcher::Builder &LocaleMatcher::Builder::setDemotionPerDesiredLocale(ULocMatchDemotion demotion) { |
| + if (U_FAILURE(errorCode_)) { return *this; } |
| + demotion_ = demotion; |
| + return *this; |
| +} |
| + |
| +#if 0 |
| +/** |
| + * <i>Internal only!</i> |
| + * |
| + * @param thresholdDistance the thresholdDistance to set, with -1 = default |
| + * @return this Builder object |
| + * @internal |
| + * @deprecated This API is ICU internal only. |
| + */ |
| +@Deprecated |
| +LocaleMatcher::Builder &LocaleMatcher::Builder::internalSetThresholdDistance(int32_t thresholdDistance) { |
| + if (U_FAILURE(errorCode_)) { return *this; } |
| + if (thresholdDistance > 100) { |
| + thresholdDistance = 100; |
| + } |
| + thresholdDistance_ = thresholdDistance; |
| + return *this; |
| +} |
| +#endif |
| + |
| +UBool LocaleMatcher::Builder::copyErrorTo(UErrorCode &outErrorCode) const { |
| + // A failure already in outErrorCode is reported as TRUE and never overwritten. |
| + const bool alreadyFailed = U_FAILURE(outErrorCode); |
| + if (!alreadyFailed && U_FAILURE(errorCode_)) { outErrorCode = errorCode_; } |
| + return U_FAILURE(outErrorCode) ? TRUE : FALSE; |
| +} |
| + |
| +LocaleMatcher LocaleMatcher::Builder::build(UErrorCode &errorCode) const { |
| + if (U_SUCCESS(errorCode) && U_FAILURE(errorCode_)) { |
| + errorCode = errorCode_; |
| + } |
| + return LocaleMatcher(*this, errorCode); |
| +} |
| + |
| +namespace { |
| + |
| +LSR getMaximalLsrOrUnd(const XLikelySubtags &likelySubtags, const Locale &locale, |
| + UErrorCode &errorCode) { |
| + if (U_FAILURE(errorCode) || locale.isBogus() || *locale.getName() == 0 /* "und" */) { |
| + return UND_LSR; |
| + } else { |
| + return likelySubtags.makeMaximizedLsrFrom(locale, errorCode); |
| + } |
| +} |
| + |
| +int32_t hashLSR(const UHashTok token) { |
| + const LSR *lsr = static_cast<const LSR *>(token.pointer); |
| + return lsr->hashCode; |
| +} |
| + |
| +UBool compareLSRs(const UHashTok t1, const UHashTok t2) { |
| + const LSR *lsr1 = static_cast<const LSR *>(t1.pointer); |
| + const LSR *lsr2 = static_cast<const LSR *>(t2.pointer); |
| + return *lsr1 == *lsr2; |
| +} |
| + |
| +// Maps lsr to i unless the table already has a value for an equal LSR. |
| +// Returns true only if a new entry was added without error. |
| +bool putIfAbsent(UHashtable *lsrToIndex, const LSR &lsr, int32_t i, UErrorCode &errorCode) { |
| + if (U_FAILURE(errorCode)) { return false; } |
| + // A lookup result of 0 is treated as "not found", so 0 cannot be stored as a value. |
| + U_ASSERT(i > 0); |
| + const bool alreadyPresent = uhash_geti(lsrToIndex, &lsr) != 0; |
| + if (alreadyPresent) { return false; } |
| + uhash_puti(lsrToIndex, const_cast<LSR *>(&lsr), i, &errorCode); |
| + return U_SUCCESS(errorCode); |
| +} |
| + |
| +} // namespace |
| + |
| +LocaleMatcher::LocaleMatcher(const Builder &builder, UErrorCode &errorCode) : |
| + likelySubtags(*XLikelySubtags::getSingleton(errorCode)), |
| + localeDistance(*LocaleDistance::getSingleton(errorCode)), |
| + thresholdDistance(builder.thresholdDistance_), |
| + demotionPerDesiredLocale(0), |
| + favorSubtag(builder.favor_), |
| + supportedLocales(nullptr), lsrs(nullptr), supportedLocalesLength(0), |
| + supportedLsrToIndex(nullptr), |
| + supportedLSRs(nullptr), supportedIndexes(nullptr), supportedLSRsLength(0), |
| + ownedDefaultLocale(nullptr), defaultLocale(nullptr), defaultLocaleIndex(-1) { |
| + if (U_FAILURE(errorCode)) { return; } // includes errors reported by the getSingleton() calls above |
| + if (thresholdDistance < 0) { |
| + thresholdDistance = localeDistance.getDefaultScriptDistance(); |
| + } |
| + supportedLocalesLength = builder.supportedLocales_ != nullptr ? |
| + builder.supportedLocales_->size() : 0; |
| + const Locale *def = builder.defaultLocale_; |
| + int32_t idef = -1; |
| + if (supportedLocalesLength > 0) { |
| + // Store the supported locales in input order, |
| + // so that when different types are used (e.g., language tag strings) |
| + // we can return those by parallel index. |
| + supportedLocales = static_cast<const Locale **>( |
| + uprv_malloc(supportedLocalesLength * sizeof(const Locale *))); |
| + // Supported LSRs in input order. In C++, we store these permanently to simplify |
| + // ownership management in the hash tables. Duplicate LSRs (if any) are unused overhead. |
| + lsrs = new LSR[supportedLocalesLength]; |
| + if (supportedLocales == nullptr || lsrs == nullptr) { |
| + supportedLocalesLength = 0; // keeps the destructor off a missing or not-yet-zeroed array |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return; |
| + } |
| + // If the constructor fails partway, we need null pointers for destructibility. |
| + uprv_memset(supportedLocales, 0, supportedLocalesLength * sizeof(const Locale *)); |
| + // Also find the first supported locale whose LSR is |
| + // the same as that for the default locale. |
| + LSR builderDefaultLSR; |
| + const LSR *defLSR = nullptr; |
| + if (def != nullptr) { |
| + builderDefaultLSR = getMaximalLsrOrUnd(likelySubtags, *def, errorCode); |
| + if (U_FAILURE(errorCode)) { return; } |
| + defLSR = &builderDefaultLSR; |
| + } |
| + for (int32_t i = 0; i < supportedLocalesLength; ++i) { |
| + const Locale &locale = *static_cast<Locale *>(builder.supportedLocales_->elementAt(i)); |
| + supportedLocales[i] = locale.clone(); |
| + if (supportedLocales[i] == nullptr) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return; |
| + } |
| + const Locale &supportedLocale = *supportedLocales[i]; |
| + LSR &lsr = lsrs[i] = getMaximalLsrOrUnd(likelySubtags, supportedLocale, errorCode); |
| + lsr.setHashCode(); |
| + if (U_FAILURE(errorCode)) { return; } |
| + if (idef < 0 && defLSR != nullptr && lsr == *defLSR) { |
| + idef = i; |
| + defLSR = &lsr; // owned pointer to put into supportedLsrToIndex |
| + if (*def == supportedLocale) { |
| + def = &supportedLocale; // owned pointer to keep |
| + } |
| + } |
| + } |
| + |
| + // We need an unordered map from LSR to first supported locale with that LSR, |
| + // and an ordered list of (LSR, supported index). |
| + // We insert the supported locales in the following order: |
| + // 1. Default locale, if it is supported. |
| + // 2. Priority locales (aka "paradigm locales") in builder order. |
| + // 3. Remaining locales in builder order. |
| + // In Java, we use a LinkedHashMap for both map & ordered lists. |
| + // In C++, we use separate structures. |
| + // We over-allocate arrays of LSRs and indexes for simplicity. |
| + // We reserve slots at the array starts for the default and paradigm locales, |
| + // plus enough for all supported locales. |
| + // If there are few paradigm locales and few duplicate supported LSRs, |
| + // then the amount of wasted space is small. |
| + supportedLsrToIndex = uhash_openSize(hashLSR, compareLSRs, uhash_compareLong, |
| + supportedLocalesLength, &errorCode); |
| + if (U_FAILURE(errorCode)) { return; } |
| + int32_t paradigmLimit = 1 + localeDistance.getParadigmLSRsLength(); |
| + int32_t suppLSRsCapacity = paradigmLimit + supportedLocalesLength; |
| + supportedLSRs = static_cast<const LSR **>( |
| + uprv_malloc(suppLSRsCapacity * sizeof(const LSR *))); |
| + supportedIndexes = static_cast<int32_t *>( |
| + uprv_malloc(suppLSRsCapacity * sizeof(int32_t))); |
| + if (supportedLSRs == nullptr || supportedIndexes == nullptr) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return; |
| + } |
| + int32_t paradigmIndex = 0; |
| + int32_t otherIndex = paradigmLimit; |
| + if (idef >= 0) { |
| + uhash_puti(supportedLsrToIndex, const_cast<LSR *>(defLSR), idef + 1, &errorCode); |
| + supportedLSRs[0] = defLSR; |
| + supportedIndexes[0] = idef; |
| + paradigmIndex = 1; |
| + } |
| + for (int32_t i = 0; i < supportedLocalesLength; ++i) { |
| + if (i == idef) { continue; } |
| + const Locale &locale = *supportedLocales[i]; |
| + const LSR &lsr = lsrs[i]; |
| + if (defLSR == nullptr) { |
| + U_ASSERT(i == 0); |
| + def = &locale; |
| + defLSR = &lsr; |
| + idef = 0; |
| + uhash_puti(supportedLsrToIndex, const_cast<LSR *>(&lsr), 0 + 1, &errorCode); |
| + supportedLSRs[0] = &lsr; |
| + supportedIndexes[0] = 0; |
| + paradigmIndex = 1; |
| + } else if (idef >= 0 && lsr == *defLSR) { |
| + // lsr == *defLSR means that this supported locale is |
| + // a duplicate of the default locale. |
| + // Either an explicit default locale is supported, and we added it before the loop, |
| + // or there is no explicit default locale, and this is |
| + // a duplicate of the first supported locale. |
| + // In both cases, idef >= 0 now, so otherwise we can skip the comparison. |
| + // For a duplicate, putIfAbsent() is a no-op, so nothing to do. |
| + } else { |
| + if (putIfAbsent(supportedLsrToIndex, lsr, i + 1, errorCode)) { |
| + if (localeDistance.isParadigmLSR(lsr)) { |
| + supportedLSRs[paradigmIndex] = &lsr; |
| + supportedIndexes[paradigmIndex++] = i; |
| + } else { |
| + supportedLSRs[otherIndex] = &lsr; |
| + supportedIndexes[otherIndex++] = i; |
| + } |
| + } |
| + } |
| + if (U_FAILURE(errorCode)) { return; } |
| + } |
| + // Squeeze out unused array slots. |
| + if (paradigmIndex < paradigmLimit && paradigmLimit < otherIndex) { |
| + uprv_memmove(supportedLSRs + paradigmIndex, supportedLSRs + paradigmLimit, |
| + (otherIndex - paradigmLimit) * sizeof(const LSR *)); |
| + uprv_memmove(supportedIndexes + paradigmIndex, supportedIndexes + paradigmLimit, |
| + (otherIndex - paradigmLimit) * sizeof(int32_t)); |
| + } |
| + supportedLSRsLength = otherIndex - (paradigmLimit - paradigmIndex); |
| + } |
| + |
| + if (def != nullptr && (idef < 0 || def != supportedLocales[idef])) { |
| + ownedDefaultLocale = def->clone(); |
| + if (ownedDefaultLocale == nullptr) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return; |
| + } |
| + def = ownedDefaultLocale; |
| + } |
| + defaultLocale = def; |
| + defaultLocaleIndex = idef; |
| + |
| + if (builder.demotion_ == ULOCMATCH_DEMOTION_REGION) { |
| + demotionPerDesiredLocale = localeDistance.getDefaultDemotionPerDesiredLocale(); |
| + } |
| +} |
| + |
| +LocaleMatcher::LocaleMatcher(LocaleMatcher &&src) U_NOEXCEPT : |
| + likelySubtags(src.likelySubtags), |
| + localeDistance(src.localeDistance), |
| + thresholdDistance(src.thresholdDistance), |
| + demotionPerDesiredLocale(src.demotionPerDesiredLocale), |
| + favorSubtag(src.favorSubtag), |
| + supportedLocales(src.supportedLocales), lsrs(src.lsrs), |
| + supportedLocalesLength(src.supportedLocalesLength), |
| + supportedLsrToIndex(src.supportedLsrToIndex), |
| + supportedLSRs(src.supportedLSRs), |
| + supportedIndexes(src.supportedIndexes), |
| + supportedLSRsLength(src.supportedLSRsLength), |
| + ownedDefaultLocale(src.ownedDefaultLocale), defaultLocale(src.defaultLocale), |
| + defaultLocaleIndex(src.defaultLocaleIndex) { |
| + src.supportedLocales = nullptr; |
| + src.lsrs = nullptr; |
| + src.supportedLocalesLength = 0; |
| + src.supportedLsrToIndex = nullptr; |
| + src.supportedLSRs = nullptr; |
| + src.supportedIndexes = nullptr; |
| + src.supportedLSRsLength = 0; |
| + src.ownedDefaultLocale = nullptr; |
| + src.defaultLocale = nullptr; |
| + src.defaultLocaleIndex = -1; |
| +} |
| + |
| +LocaleMatcher::~LocaleMatcher() { |
| + for (int32_t i = 0; i < supportedLocalesLength; ++i) { |
| + delete supportedLocales[i]; |
| + } |
| + uprv_free(supportedLocales); |
| + delete[] lsrs; |
| + uhash_close(supportedLsrToIndex); |
| + uprv_free(supportedLSRs); |
| + uprv_free(supportedIndexes); |
| + delete ownedDefaultLocale; |
| +} |
| + |
| +LocaleMatcher &LocaleMatcher::operator=(LocaleMatcher &&src) U_NOEXCEPT { |
| + if (this == &src) { return *this; } // self-move: the cleanup below would free the data being moved |
| + this->~LocaleMatcher(); // releases everything this matcher currently owns |
| + thresholdDistance = src.thresholdDistance; |
| + demotionPerDesiredLocale = src.demotionPerDesiredLocale; |
| + favorSubtag = src.favorSubtag; |
| + supportedLocales = src.supportedLocales; |
| + lsrs = src.lsrs; |
| + supportedLocalesLength = src.supportedLocalesLength; |
| + supportedLsrToIndex = src.supportedLsrToIndex; |
| + supportedLSRs = src.supportedLSRs; |
| + supportedIndexes = src.supportedIndexes; |
| + supportedLSRsLength = src.supportedLSRsLength; |
| + ownedDefaultLocale = src.ownedDefaultLocale; |
| + defaultLocale = src.defaultLocale; |
| + defaultLocaleIndex = src.defaultLocaleIndex; |
| + |
| + src.supportedLocales = nullptr; // leave src empty so its destructor frees nothing that moved |
| + src.lsrs = nullptr; |
| + src.supportedLocalesLength = 0; |
| + src.supportedLsrToIndex = nullptr; |
| + src.supportedLSRs = nullptr; |
| + src.supportedIndexes = nullptr; |
| + src.supportedLSRsLength = 0; |
| + src.ownedDefaultLocale = nullptr; |
| + src.defaultLocale = nullptr; |
| + src.defaultLocaleIndex = -1; |
| + return *this; |
| +} |
| + |
| +class LocaleLsrIterator { /* Wraps a Locale::Iterator: yields each locale's LSR and can remember one locale. */ |
| +public: |
| + LocaleLsrIterator(const XLikelySubtags &likelySubtags, Locale::Iterator &locales, |
| + ULocMatchLifetime lifetime) : |
| + likelySubtags(likelySubtags), locales(locales), lifetime(lifetime) {} |
| + |
| + ~LocaleLsrIterator() { |
| + if (lifetime == ULOCMATCH_TEMPORARY_LOCALES) { /* only then is remembered a copy made by rememberCurrent() */ |
| + delete remembered; |
| + } |
| + } |
| + |
| + bool hasNext() const { |
| + return locales.hasNext(); |
| + } |
| + |
| + LSR next(UErrorCode &errorCode) { /* advances the wrapped iterator; call only when hasNext() is true -- TODO confirm */ |
| + current = &locales.next(); |
| + return getMaximalLsrOrUnd(likelySubtags, *current, errorCode); |
| + } |
| + |
| + void rememberCurrent(int32_t desiredIndex, UErrorCode &errorCode) { /* records the locale last returned via next() */ |
| + if (U_FAILURE(errorCode)) { return; } |
| + bestDesiredIndex = desiredIndex; |
| + if (lifetime == ULOCMATCH_STORED_LOCALES) { |
| + remembered = current; /* alias only; no copy is made */ |
| + } else { |
| + // ULOCMATCH_TEMPORARY_LOCALES |
| + delete remembered; /* drop the copy from an earlier call */ |
| + remembered = new Locale(*current); |
| + if (remembered == nullptr) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + } |
| + } |
| + } |
| + |
| + const Locale *orphanRemembered() { /* returns the remembered pointer and clears it here, so the destructor will not delete it */ |
| + const Locale *rem = remembered; |
| + remembered = nullptr; |
| + return rem; |
| + } |
| + |
| + int32_t getBestDesiredIndex() const { /* the index passed to the last successful rememberCurrent(), or -1 if none */ |
| + return bestDesiredIndex; |
| + } |
| + |
| +private: |
| + const XLikelySubtags &likelySubtags; |
| + Locale::Iterator &locales; |
| + ULocMatchLifetime lifetime; |
| + const Locale *current = nullptr, *remembered = nullptr; |
| + int32_t bestDesiredIndex = -1; |
| +}; |
| + |
| +const Locale *LocaleMatcher::getBestMatch(const Locale &desiredLocale, UErrorCode &errorCode) const { |
| + if (U_FAILURE(errorCode)) { return nullptr; } |
| + int32_t suppIndex = getBestSuppIndex( |
| + getMaximalLsrOrUnd(likelySubtags, desiredLocale, errorCode), |
| + nullptr, errorCode); |
| + return U_SUCCESS(errorCode) && suppIndex >= 0 ? supportedLocales[suppIndex] : defaultLocale; |
| +} |
| + |
| +const Locale *LocaleMatcher::getBestMatch(Locale::Iterator &desiredLocales, |
| + UErrorCode &errorCode) const { |
| + if (U_FAILURE(errorCode)) { return nullptr; } |
| + if (!desiredLocales.hasNext()) { |
| + return defaultLocale; |
| + } |
| + LocaleLsrIterator lsrIter(likelySubtags, desiredLocales, ULOCMATCH_TEMPORARY_LOCALES); |
| + int32_t suppIndex = getBestSuppIndex(lsrIter.next(errorCode), &lsrIter, errorCode); |
| + return U_SUCCESS(errorCode) && suppIndex >= 0 ? supportedLocales[suppIndex] : defaultLocale; |
| +} |
| + |
| +const Locale *LocaleMatcher::getBestMatchForListString( |
| + StringPiece desiredLocaleList, UErrorCode &errorCode) const { |
| + LocalePriorityList list(desiredLocaleList, errorCode); |
| + LocalePriorityList::Iterator iter = list.iterator(); |
| + return getBestMatch(iter, errorCode); |
| +} |
| + |
| +LocaleMatcher::Result LocaleMatcher::getBestMatchResult( |
| + const Locale &desiredLocale, UErrorCode &errorCode) const { |
| + if (U_FAILURE(errorCode)) { |
| + return Result(nullptr, defaultLocale, -1, defaultLocaleIndex, FALSE); |
| + } |
| + int32_t suppIndex = getBestSuppIndex( |
| + getMaximalLsrOrUnd(likelySubtags, desiredLocale, errorCode), |
| + nullptr, errorCode); |
| + if (U_FAILURE(errorCode) || suppIndex < 0) { |
| + return Result(nullptr, defaultLocale, -1, defaultLocaleIndex, FALSE); |
| + } else { |
| + return Result(&desiredLocale, supportedLocales[suppIndex], 0, suppIndex, FALSE); |
| + } |
| +} |
| + |
| +LocaleMatcher::Result LocaleMatcher::getBestMatchResult( |
| +        Locale::Iterator &desiredLocales, UErrorCode &errorCode) const { |
| +    if (U_SUCCESS(errorCode) && desiredLocales.hasNext()) { |
| +        LocaleLsrIterator lsrIter(likelySubtags, desiredLocales, ULOCMATCH_TEMPORARY_LOCALES); |
| +        int32_t suppIndex = getBestSuppIndex(lsrIter.next(errorCode), &lsrIter, errorCode); |
| +        if (U_SUCCESS(errorCode) && suppIndex >= 0) { |
| +            // The remembered desired locale is released from the iterator into the Result. |
| +            return Result(lsrIter.orphanRemembered(), supportedLocales[suppIndex], |
| +                          lsrIter.getBestDesiredIndex(), suppIndex, TRUE); |
| +        } |
| +    } |
| +    // Prior error, empty input, matching error, or no match: report the default locale. |
| +    return Result(nullptr, defaultLocale, -1, defaultLocaleIndex, FALSE); |
| +} |
| + |
| +int32_t LocaleMatcher::getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remainingIter, |
| +                                        UErrorCode &errorCode) const { |
| +    // Returns an index into supportedLocales, or -1 if there is no good match or on failure. |
| +    if (U_FAILURE(errorCode)) { return -1; } |
| +    int32_t desiredIndex = 0; |
| +    int32_t bestSupportedLsrIndex = -1; |
| +    for (int32_t bestDistance = thresholdDistance;;) { |
| +        // Quick check for exact maximized LSR; the table yields suppIndex+1 where 0 means not found. |
| +        if (supportedLsrToIndex != nullptr) { |
| +            desiredLSR.setHashCode(); |
| +            int32_t index = uhash_geti(supportedLsrToIndex, &desiredLSR); |
| +            if (index != 0) { |
| +                if (remainingIter != nullptr) { |
| +                    remainingIter->rememberCurrent(desiredIndex, errorCode); |
| +                    if (U_FAILURE(errorCode)) { return -1; }  // same handling as the distance path below |
| +                } |
| +                return index - 1; |
| +            } |
| +        } |
| +        int32_t bestIndexAndDistance = localeDistance.getBestIndexAndDistance( |
| +            desiredLSR, supportedLSRs, supportedLSRsLength, bestDistance, favorSubtag); |
| +        if (bestIndexAndDistance >= 0) {  // negative: nothing below the current threshold |
| +            bestDistance = bestIndexAndDistance & 0xff;  // distance is in bits 7..0 |
| +            if (remainingIter != nullptr) { |
| +                remainingIter->rememberCurrent(desiredIndex, errorCode); |
| +                if (U_FAILURE(errorCode)) { return -1; } |
| +            } |
| +            bestSupportedLsrIndex = bestIndexAndDistance >> 8;  // index is in bits 31..8 |
| +        } |
| +        if ((bestDistance -= demotionPerDesiredLocale) <= 0) {  // later desired locales must match better |
| +            break; |
| +        } |
| +        if (remainingIter == nullptr || !remainingIter->hasNext()) { |
| +            break; |
| +        } |
| +        desiredLSR = remainingIter->next(errorCode); |
| +        if (U_FAILURE(errorCode)) { return -1; } |
| +        ++desiredIndex; |
| +    } |
| +    if (bestSupportedLsrIndex < 0) { |
| +        // no good match |
| +        return -1; |
| +    } |
| +    return supportedIndexes[bestSupportedLsrIndex];  // map the LSR index back to a supported-locale index |
| +} |
| + |
| +double LocaleMatcher::internalMatch(const Locale &desired, const Locale &supported, UErrorCode &errorCode) const { |
| +    // Computes the inverse of the distance, 1-distance(desired, supported), as a fraction of 100. |
| +    LSR supportedLSR = getMaximalLsrOrUnd(likelySubtags, supported, errorCode); |
| +    if (U_FAILURE(errorCode)) { return 0; } |
| +    // getBestIndexAndDistance() takes an array of LSR pointers; pass a one-element array. |
| +    const LSR *supportedPtr = &supportedLSR; |
| +    int32_t indexAndDistance = localeDistance.getBestIndexAndDistance( |
| +        getMaximalLsrOrUnd(likelySubtags, desired, errorCode), |
| +        &supportedPtr, 1, thresholdDistance, favorSubtag); |
| +    return (100 - (indexAndDistance & 0xff)) / 100.0; |
| +} |
| + |
| +U_NAMESPACE_END |
| + |
| +#endif // __LOCMATCHER_H__ |
| diff --git a/source/common/localeprioritylist.cpp b/source/common/localeprioritylist.cpp |
| new file mode 100644 |
| index 00000000..06442fb4 |
| --- /dev/null |
| +++ b/source/common/localeprioritylist.cpp |
| @@ -0,0 +1,239 @@ |
| +// © 2019 and later: Unicode, Inc. and others. |
| +// License & terms of use: http://www.unicode.org/copyright.html#License |
| + |
| +// localeprioritylist.cpp |
| +// created: 2019jul11 Markus W. Scherer |
| + |
| +#include "unicode/utypes.h" |
| +#include "unicode/localpointer.h" |
| +#include "unicode/locid.h" |
| +#include "unicode/stringpiece.h" |
| +#include "unicode/uobject.h" |
| +#include "charstr.h" |
| +#include "cmemory.h" |
| +#include "localeprioritylist.h" |
| +#include "uarrsort.h" |
| +#include "uassert.h" |
| +#include "uhash.h" |
| + |
| +U_NAMESPACE_BEGIN |
| + |
| +namespace { |
| + |
| +int32_t hashLocale(const UHashTok token) { |
| +    // Hash callback passed to uhash_open(): the key pointer refers to a Locale. |
| +    return static_cast<const Locale *>(token.pointer)->hashCode(); |
| +} |
| + |
| +UBool compareLocales(const UHashTok t1, const UHashTok t2) { |
| +    const Locale &first = *static_cast<const Locale *>(t1.pointer); |
| +    const Locale &second = *static_cast<const Locale *>(t2.pointer); |
| +    return first == second;  // compares the Locale objects, not the key pointers |
| +} |
| + |
| +constexpr int32_t WEIGHT_ONE = 1000; |
| + |
| +struct LocaleAndWeight { |
| +    Locale *locale; |
| +    int32_t weight;  // 0..1000 = 0.0..1.0 |
| +    int32_t index;  // force stable sort |
| + |
| +    // Orders by descending weight first, then by ascending index. |
| +    int32_t compare(const LocaleAndWeight &other) const { |
| +        if (weight != other.weight) { return other.weight - weight; } |
| +        return index - other.index; |
| +    } |
| +}; |
| + |
| +int32_t U_CALLCONV |
| +compareLocaleAndWeight(const void * /*context*/, const void *left, const void *right) { |
| +    const auto &lhs = *static_cast<const LocaleAndWeight *>(left); |
| +    return lhs.compare(*static_cast<const LocaleAndWeight *>(right));  // adapter for uprv_sortArray() |
| +} |
| + |
| +const char *skipSpaces(const char *p, const char *limit) { |
| +    for (; p < limit && *p == ' '; ++p) {}  // only U+0020 counts as a space; never reads at limit |
| +    return p; |
| +} |
| + |
| +int32_t findTagLength(const char *p, const char *limit) { |
| +    // Look for accept-language delimiters. |
| +    // Leave other validation up to the Locale constructor. |
| +    // The tag ends at the first space, comma, or semicolon, or at limit. |
| +    const char *end = p; |
| +    while (end < limit && *end != ' ' && *end != ',' && *end != ';') { |
| +        ++end; |
| +    } |
| +    return static_cast<int32_t>(end - p); |
| +} |
| + |
| +/** |
| + * Parses a qvalue ("0", "1", or either followed by "." and fraction digits) |
| + * and returns it in millis (0..1000), rounding on the fourth fraction digit. |
| + * Skips leading spaces and advances p to just after the parsed substring. |
| + * Returns a negative value if parsing fails or the value is greater than 1.0. |
| + */ |
| +int32_t parseWeight(const char *&p, const char *limit) { |
| +    p = skipSpaces(p, limit); |
| +    if (p == limit || (*p != '0' && *p != '1')) { return -1; } |
| +    int32_t millis = (*p - '0') * 1000; |
| +    ++p; |
| +    if (p == limit || *p != '.') { return millis; } |
| +    ++p;  // step over the decimal point |
| +    int32_t scale = 100;  // value of the next fraction digit; 0: rounding digit; -1: ignore the rest |
| +    for (; p != limit && '0' <= *p && *p <= '9'; ++p) { |
| +        int32_t digit = *p - '0'; |
| +        if (scale > 0) { |
| +            millis += digit * scale; |
| +            scale /= 10; |
| +        } else if (scale == 0) { |
| +            if (digit >= 5) { ++millis; }  // round up |
| +            scale = -1; |
| +        } |
| +    } |
| +    return millis > WEIGHT_ONE ? -1 : millis;  // bad if > 1.0 |
| +} |
| + |
| +} // namespace |
| + |
| +/** |
| + * Nothing but a wrapper over a MaybeStackArray of LocaleAndWeight. |
| + * |
| + * This wrapper exists (and is not in an anonymous namespace) |
| + * so that we can forward-declare it in the header file and |
| + * don't have to expose the MaybeStackArray specialization and |
| + * the LocaleAndWeight to code (like the test) that #includes localeprioritylist.h. |
| + * Also, otherwise we would have to do a platform-specific |
| + * template export declaration of some kind for the MaybeStackArray specialization |
| + * to be properly exported from the common DLL. |
| + */ |
| +struct LocaleAndWeightArray : public UMemory { |
| + MaybeStackArray<LocaleAndWeight, 20> array;  // 20: presumably the initial capacity; add() grows it via resize() |
| +}; |
| + |
| +LocalePriorityList::LocalePriorityList(StringPiece s, UErrorCode &errorCode) {  // parses s into the list, then sorts by weight |
| + if (U_FAILURE(errorCode)) { return; } |
| + list = new LocaleAndWeightArray(); |
| + if (list == nullptr) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return; |
| + } |
| + const char *p = s.data(); |
| + const char *limit = p + s.length(); |
| + while ((p = skipSpaces(p, limit)) != limit) {  // one iteration per comma-separated range field |
| + if (*p == ',') { // empty range field |
| + ++p; |
| + continue; |
| + } |
| + int32_t tagLength = findTagLength(p, limit); |
| + if (tagLength == 0) {  // a field that starts with ';' has no language tag |
| + errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| + return; |
| + } |
| + CharString tag(p, tagLength, errorCode);  // NUL-terminated copy for the Locale constructor |
| + if (U_FAILURE(errorCode)) { return; } |
| + Locale locale = Locale(tag.data()); |
| + if (locale.isBogus()) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR;  // NOTE(review): assumes only allocation failure yields a bogus Locale; confirm |
| + return; |
| + } |
| + int32_t weight = WEIGHT_ONE;  // default when there is no ";q=" part |
| + if ((p = skipSpaces(p + tagLength, limit)) != limit && *p == ';') { |
| + if ((p = skipSpaces(p + 1, limit)) == limit || *p != 'q' ||  // expect 'q' |
| + (p = skipSpaces(p + 1, limit)) == limit || *p != '=' ||  // expect '=' |
| + (++p, (weight = parseWeight(p, limit)) < 0)) {  // parseWeight() advances p past the qvalue |
| + errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| + return; |
| + } |
| + p = skipSpaces(p, limit); |
| + } |
| + if (p != limit && *p != ',') { // trailing junk |
| + errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| + return; |
| + } |
| + add(locale, weight, errorCode);  // on failure errorCode is set; later add() and sort() calls do nothing |
| + if (p == limit) { break; } |
| + ++p;  // skip the comma |
| + } |
| + sort(errorCode); |
| +} |
| + |
| +LocalePriorityList::~LocalePriorityList() { |
| +    // Delete the Locale objects still held by the list, then the array wrapper, then the map. |
| +    // Slots emptied by add() or orphanLocaleAt() hold nullptr, which delete ignores. |
| +    for (int32_t i = 0; list != nullptr && i < listLength; ++i) { |
| +        delete list->array[i].locale; |
| +    } |
| +    delete list; |
| +    uhash_close(map); |
| +} |
| + |
| +const Locale *LocalePriorityList::localeAt(int32_t i) const {  // no null/bounds check on list or i |
| + return list->array[i].locale;  // nullptr for a slot emptied by add() or orphanLocaleAt() |
| +} |
| + |
| +Locale *LocalePriorityList::orphanLocaleAt(int32_t i) { |
| +    if (list == nullptr) { return nullptr; } |
| +    Locale *&slot = list->array[i].locale; |
| +    Locale *orphan = slot; |
| +    slot = nullptr;  // the destructor will no longer delete this Locale |
| +    return orphan; |
| +} |
| + |
| +bool LocalePriorityList::add(const Locale &locale, int32_t weight, UErrorCode &errorCode) {  // returns false only on failure |
| + if (U_FAILURE(errorCode)) { return false; } |
| + if (map == nullptr) {  // the map is created lazily, with the first item that has weight > 0 |
| + if (weight <= 0) { return true; } // do not add q=0 |
| + map = uhash_open(hashLocale, compareLocales, uhash_compareLong, &errorCode); |
| + if (U_FAILURE(errorCode)) { return false; } |
| + } |
| + LocalPointer<Locale> clone; |
| + int32_t index = uhash_geti(map, &locale);  // list index + 1, or 0 if the locale is not in the map |
| + if (index != 0) { |
| + // Duplicate: Remove the old item and append it anew. |
| + LocaleAndWeight &lw = list->array[index - 1]; |
| + clone.adoptInstead(lw.locale);  // reuse the existing Locale object instead of cloning again |
| + lw.locale = nullptr; |
| + lw.weight = 0;  // weight 0: the emptied slot sorts after all remaining items |
| + ++numRemoved; |
| + } |
| + if (weight <= 0) { // do not add q=0 |
| + if (index != 0) { |
| + // Not strictly necessary but cleaner. |
| + uhash_removei(map, &locale);  // NOTE(review): the stored key is the Locale held by clone, presumably freed on return |
| + } |
| + return true; |
| + } |
| + if (clone.isNull()) { |
| + clone.adoptInstead(locale.clone()); |
| + if (clone.isNull() || (clone->isBogus() && !locale.isBogus())) {  // clone() failed or produced a bogus copy |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return false; |
| + } |
| + } |
| + if (listLength == list->array.getCapacity()) { |
| + int32_t newCapacity = listLength < 50 ? 100 : 4 * listLength; |
| + if (list->array.resize(newCapacity, listLength) == nullptr) {  // second argument: number of items to keep |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return false; |
| + } |
| + } |
| + uhash_puti(map, clone.getAlias(), listLength + 1, &errorCode);  // value is index + 1 so that 0 can mean "absent" |
| + if (U_FAILURE(errorCode)) { return false; } |
| + LocaleAndWeight &lw = list->array[listLength]; |
| + lw.locale = clone.orphan();  // the list slot now owns the Locale |
| + lw.weight = weight; |
| + lw.index = listLength++;  // insertion order, used by compare() as the tie-breaker |
| + if (weight < WEIGHT_ONE) { hasWeights = true; } |
| + U_ASSERT(uhash_count(map) == getLength()); |
| + return true; |
| +} |
| + |
| +void LocalePriorityList::sort(UErrorCode &errorCode) { |
| +    // Sort by descending weight, only if some weight is below 1.0; the item index keeps it stable. |
| +    if (U_SUCCESS(errorCode) && hasWeights && getLength() > 1) { |
| +        uprv_sortArray(list->array.getAlias(), listLength, sizeof(LocaleAndWeight), |
| +                       compareLocaleAndWeight, nullptr, FALSE, &errorCode); |
| +    } |
| +} |
| + |
| +U_NAMESPACE_END |
| diff --git a/source/common/localeprioritylist.h b/source/common/localeprioritylist.h |
| new file mode 100644 |
| index 00000000..80ca38a7 |
| --- /dev/null |
| +++ b/source/common/localeprioritylist.h |
| @@ -0,0 +1,115 @@ |
| +// © 2019 and later: Unicode, Inc. and others. |
| +// License & terms of use: http://www.unicode.org/copyright.html#License |
| + |
| +// localeprioritylist.h |
| +// created: 2019jul11 Markus W. Scherer |
| + |
| +#ifndef __LOCALEPRIORITYLIST_H__ |
| +#define __LOCALEPRIORITYLIST_H__ |
| + |
| +#include "unicode/utypes.h" |
| +#include "unicode/locid.h" |
| +#include "unicode/stringpiece.h" |
| +#include "unicode/uobject.h" |
| + |
| +struct UHashtable; |
| + |
| +U_NAMESPACE_BEGIN |
| + |
| +struct LocaleAndWeightArray; |
| + |
| +/** |
| + * Parses a list of locales from an accept-language string. |
| + * We are a bit more lenient than the spec: |
| + * We accept extra whitespace in more places, empty range fields, |
| + * and any number of qvalue fraction digits. |
| + * |
| + * https://tools.ietf.org/html/rfc2616#section-14.4 |
| + * 14.4 Accept-Language |
| + * |
| + * Accept-Language = "Accept-Language" ":" |
| + * 1#( language-range [ ";" "q" "=" qvalue ] ) |
| + * language-range = ( ( 1*8ALPHA *( "-" 1*8ALPHA ) ) | "*" ) |
| + * |
| + * Each language-range MAY be given an associated quality value which |
| + * represents an estimate of the user's preference for the languages |
| + * specified by that range. The quality value defaults to "q=1". For |
| + * example, |
| + * |
| + * Accept-Language: da, en-gb;q=0.8, en;q=0.7 |
| + * |
| + * https://tools.ietf.org/html/rfc2616#section-3.9 |
| + * 3.9 Quality Values |
| + * |
| + * HTTP content negotiation (section 12) uses short "floating point" |
| + * numbers to indicate the relative importance ("weight") of various |
| + * negotiable parameters. A weight is normalized to a real number in |
| + * the range 0 through 1, where 0 is the minimum and 1 the maximum |
| + * value. If a parameter has a quality value of 0, then content with |
| + * this parameter is `not acceptable' for the client. HTTP/1.1 |
| + * applications MUST NOT generate more than three digits after the |
| + * decimal point. User configuration of these values SHOULD also be |
| + * limited in this fashion. |
| + * |
| + * qvalue = ( "0" [ "." 0*3DIGIT ] ) |
| + * | ( "1" [ "." 0*3("0") ] ) |
| + */ |
| +class U_COMMON_API LocalePriorityList : public UMemory { |
| +public: |
| + class Iterator : public Locale::Iterator {  // iterates over the non-removed locales of a list |
| + public: |
| + UBool hasNext() const override { return count < length; } |
| + |
| + const Locale &next() override {  // must only be called while hasNext() is true |
| + for(;;) { |
| + const Locale *locale = list.localeAt(index++); |
| + if (locale != nullptr) {  // skip slots whose locale was removed |
| + ++count; |
| + return *locale; |
| + } |
| + } |
| + } |
| + |
| + private: |
| + friend class LocalePriorityList; |
| + |
| + Iterator(const LocalePriorityList &list) : list(list), length(list.getLength()) {} |
| + |
| + const LocalePriorityList &list;  // not owned; the list must outlive the iterator |
| + int32_t index = 0;  // next slot to look at, including removed slots |
| + int32_t count = 0;  // number of locales returned so far |
| + const int32_t length;  // number of non-removed locales when the iterator was created |
| + }; |
| + |
| + LocalePriorityList(StringPiece s, UErrorCode &errorCode);  // parses an accept-language style string |
| + |
| + ~LocalePriorityList(); |
| + |
| + int32_t getLength() const { return listLength - numRemoved; }  // excludes removed slots |
| + |
| + int32_t getLengthIncludingRemoved() const { return listLength; }  // valid index range for localeAt() |
| + |
| + Iterator iterator() const { return Iterator(*this); } |
| + |
| + const Locale *localeAt(int32_t i) const;  // nullptr for a removed or orphaned slot |
| + |
| + Locale *orphanLocaleAt(int32_t i);  // caller takes ownership; the slot becomes nullptr |
| + |
| +private: |
| + LocalePriorityList(const LocalePriorityList &) = delete; |
| + LocalePriorityList &operator=(const LocalePriorityList &) = delete; |
| + |
| + bool add(const Locale &locale, int32_t weight, UErrorCode &errorCode); |
| + |
| + void sort(UErrorCode &errorCode); |
| + |
| + LocaleAndWeightArray *list = nullptr;  // owned |
| + int32_t listLength = 0;  // number of used slots, including removed ones |
| + int32_t numRemoved = 0;  // slots emptied by add() for duplicates or q=0 |
| + bool hasWeights = false; // other than 1.0 |
| + UHashtable *map = nullptr;  // owned; maps a Locale to its slot index + 1 |
| +}; |
| + |
| +U_NAMESPACE_END |
| + |
| +#endif // __LOCALEPRIORITYLIST_H__ |
| diff --git a/source/common/locdistance.cpp b/source/common/locdistance.cpp |
| new file mode 100644 |
| index 00000000..800d0eac |
| --- /dev/null |
| +++ b/source/common/locdistance.cpp |
| @@ -0,0 +1,364 @@ |
| +// © 2019 and later: Unicode, Inc. and others. |
| +// License & terms of use: http://www.unicode.org/copyright.html#License |
| + |
| +// locdistance.cpp |
| +// created: 2019may08 Markus W. Scherer |
| + |
| +#include "unicode/utypes.h" |
| +#include "unicode/bytestrie.h" |
| +#include "unicode/localematcher.h" |
| +#include "unicode/locid.h" |
| +#include "unicode/uobject.h" |
| +#include "unicode/ures.h" |
| +#include "cstring.h" |
| +#include "locdistance.h" |
| +#include "loclikelysubtags.h" |
| +#include "uassert.h" |
| +#include "ucln_cmn.h" |
| +#include "uinvchar.h" |
| +#include "umutex.h" |
| + |
| +U_NAMESPACE_BEGIN |
| + |
| +namespace { |
| + |
| +/** |
| + * Bit flag used on the last character of a subtag in the trie. |
| + * Must be set consistently by the builder and the lookup code. |
| + */ |
| +constexpr int32_t END_OF_SUBTAG = 0x80; |
| +/** Distance value bit flag, set by the builder. */ |
| +constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80; |
| +/** Distance value bit flag, set by trieNext(). */ |
| +constexpr int32_t DISTANCE_IS_FINAL = 0x100; |
| +constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT; |
| + |
| +constexpr int32_t ABOVE_THRESHOLD = 100; |
| + |
| +// Indexes into array of distances. |
| +enum { |
| + IX_DEF_LANG_DISTANCE, |
| + IX_DEF_SCRIPT_DISTANCE, |
| + IX_DEF_REGION_DISTANCE, |
| + IX_MIN_REGION_DISTANCE, |
| + IX_LIMIT |
| +}; |
| + |
| +LocaleDistance *gLocaleDistance = nullptr; |
| +UInitOnce gInitOnce = U_INITONCE_INITIALIZER; |
| + |
| +UBool U_CALLCONV cleanup() {  // registered via ucln_common_registerCleanup() in initLocaleDistance() |
| + delete gLocaleDistance; |
| + gLocaleDistance = nullptr; |
| + gInitOnce.reset();  // allow the singleton to be initialized again later |
| + return TRUE; |
| +} |
| + |
| +} // namespace |
| + |
| +void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) { |
| +    // This function is invoked only via umtx_initOnce(). |
| +    U_ASSERT(gLocaleDistance == nullptr); |
| +    const XLikelySubtags *likely = XLikelySubtags::getSingleton(errorCode); |
| +    if (U_FAILURE(errorCode)) { return; }  // likely is presumably nullptr on failure: check before dereferencing |
| +    const LocaleDistanceData &data = likely->getDistanceData(); |
| +    if (data.distanceTrieBytes == nullptr || |
| +            data.regionToPartitions == nullptr || data.partitions == nullptr || |
| +            // ok if no paradigms |
| +            data.distances == nullptr) { |
| +        errorCode = U_MISSING_RESOURCE_ERROR; |
| +        return; |
| +    } |
| +    gLocaleDistance = new LocaleDistance(data); |
| +    if (gLocaleDistance == nullptr) { |
| +        errorCode = U_MEMORY_ALLOCATION_ERROR; |
| +        return; |
| +    } |
| +    ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup);  // cleanup() deletes the singleton |
| +} |
| + |
| +const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) { |
| + if (U_FAILURE(errorCode)) { return nullptr; } |
| + umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode);  // runs the initializer at most once |
| + return gLocaleDistance;  // stays nullptr if initLocaleDistance() failed |
| +} |
| + |
| +LocaleDistance::LocaleDistance(const LocaleDistanceData &data) :  // data arrays are referenced, not copied |
| + trie(data.distanceTrieBytes), |
| + regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions), |
| + paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength), |
| + defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]), |
| + defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]), |
| + defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]), |
| + minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) { |
| + // For the default demotion value, use the |
| + // default region distance between unrelated Englishes. |
| + // Thus, unless demotion is turned off, |
| + // a mere region difference for one desired locale |
| + // is as good as a perfect match for the next following desired locale. |
| + // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>. |
| + LSR en("en", "Latn", "US"); |
| + LSR enGB("en", "Latn", "GB"); |
| + const LSR *p_enGB = &enGB;  // one-element array of supported LSR pointers |
| + defaultDemotionPerDesiredLocale = getBestIndexAndDistance(en, &p_enGB, 1, |
| + 50, ULOCMATCH_FAVOR_LANGUAGE) & 0xff;  // 50 = threshold; & 0xff keeps only the distance bits |
| +} |
| + |
| +int32_t LocaleDistance::getBestIndexAndDistance(  // returns (best index << 8) | distance; negative if no match |
| + const LSR &desired, |
| + const LSR **supportedLSRs, int32_t supportedLSRsLength, |
| + int32_t threshold, ULocMatchFavorSubtag favorSubtag) const { |
| + BytesTrie iter(trie); |
| + // Look up the desired language only once for all supported LSRs. |
| + // Its "distance" is either a match point value of 0, or a non-match negative value. |
| + // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. |
| + int32_t desLangDistance = trieNext(iter, desired.language, false); |
| + uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;  // saved only if it will be reused |
| + // Index of the supported LSR with the lowest distance. |
| + int32_t bestIndex = -1; |
| + for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) { |
| + const LSR &supported = *supportedLSRs[slIndex]; |
| + bool star = false;  // true once we fall back to the <*, *> default distances |
| + int32_t distance = desLangDistance; |
| + if (distance >= 0) { |
| + U_ASSERT((distance & DISTANCE_IS_FINAL) == 0); |
| + if (slIndex != 0) { |
| + iter.resetToState64(desLangState);  // back to the state just after the desired language |
| + } |
| + distance = trieNext(iter, supported.language, true); |
| + } |
| + // Note: The data builder verifies that there are no rules with "any" (*) language and |
| + // real (non *) script or region subtags. |
| + // This means that if the lookup for either language fails we can use |
| + // the default distances without further lookups. |
| + int32_t flags; |
| + if (distance >= 0) { |
| + flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;  // split the flag bits from the distance value |
| + distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; |
| + } else { // <*, *> |
| + if (uprv_strcmp(desired.language, supported.language) == 0) { |
| + distance = 0; |
| + } else { |
| + distance = defaultLanguageDistance; |
| + } |
| + flags = 0; |
| + star = true; |
| + } |
| + U_ASSERT(0 <= distance && distance <= 100); |
| + // We implement "favor subtag" by reducing the language subtag distance |
| + // (unscientifically reducing it to a quarter of the normal value), |
| + // so that the script distance is relatively more important. |
| + // For example, given a default language distance of 80, we reduce it to 20, |
| + // which is below the default threshold of 50, which is the default script distance. |
| + if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) { |
| + distance >>= 2; |
| + } |
| + if (distance >= threshold) {  // already too far: skip this supported LSR |
| + continue; |
| + } |
| + |
| + int32_t scriptDistance; |
| + if (star || flags != 0) {  // no script-level trie lookup: same script = 0, otherwise the default |
| + if (uprv_strcmp(desired.script, supported.script) == 0) { |
| + scriptDistance = 0; |
| + } else { |
| + scriptDistance = defaultScriptDistance; |
| + } |
| + } else { |
| + scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(), |
| + desired.script, supported.script); |
| + flags = scriptDistance & DISTANCE_IS_FINAL; |
| + scriptDistance &= ~DISTANCE_IS_FINAL; |
| + } |
| + distance += scriptDistance; |
| + if (distance >= threshold) { |
| + continue; |
| + } |
| + |
| + if (uprv_strcmp(desired.region, supported.region) == 0) { |
| + // regionDistance = 0 |
| + } else if (star || (flags & DISTANCE_IS_FINAL) != 0) { |
| + distance += defaultRegionDistance; |
| + } else { |
| + int32_t remainingThreshold = threshold - distance; |
| + if (minRegionDistance >= remainingThreshold) {  // even the smallest region distance would reach the threshold |
| + continue; |
| + } |
| + |
| + // From here on we know the regions are not equal. |
| + // Map each region to zero or more partitions. (zero = one non-matching string) |
| + // (Each array of single-character partition strings is encoded as one string.) |
| + // If either side has more than one, then we find the maximum distance. |
| + // This could be optimized by adding some more structure, but probably not worth it. |
| + distance += getRegionPartitionsDistance( |
| + iter, iter.getState64(), |
| + partitionsForRegion(desired), |
| + partitionsForRegion(supported), |
| + remainingThreshold); |
| + } |
| + if (distance < threshold) { |
| + if (distance == 0) { |
| + return slIndex << 8;  // perfect match: stop searching |
| + } |
| + bestIndex = slIndex; |
| + threshold = distance;  // later candidates must be strictly better |
| + } |
| + } |
| + return bestIndex >= 0 ? (bestIndex << 8) | threshold : 0xffffff00 | ABOVE_THRESHOLD;  // no match: unsigned value converted to a negative int32_t |
| +} |
| + |
| +int32_t LocaleDistance::getDesSuppScriptDistance(  // returns the script distance, possibly with DISTANCE_IS_FINAL set |
| + BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) { |
| + // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. |
| + int32_t distance = trieNext(iter, desired, false); |
| + if (distance >= 0) { |
| + distance = trieNext(iter, supported, true); |
| + } |
| + if (distance < 0) {  // no specific rule for this script pair: use the <*, *> entry |
| + UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *> |
| + U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); |
| + if (uprv_strcmp(desired, supported) == 0) { |
| + distance = 0; // same script |
| + } else { |
| + distance = iter.getValue(); |
| + U_ASSERT(distance >= 0); |
| + } |
| + if (result == USTRINGTRIE_FINAL_VALUE) {  // the trie has nothing further below this node |
| + distance |= DISTANCE_IS_FINAL; |
| + } |
| + } |
| + return distance; |
| +} |
| + |
| +int32_t LocaleDistance::getRegionPartitionsDistance(  // maximum distance over all partition pairs, or an early value >= threshold |
| + BytesTrie &iter, uint64_t startState, |
| + const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) { |
| + char desired = *desiredPartitions++; |
| + char supported = *supportedPartitions++; |
| + U_ASSERT(desired != 0 && supported != 0); |
| + // See if we have single desired/supported partitions, from NUL-terminated |
| + // partition strings without explicit length. |
| + bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character |
| + // equivalent to: if (desLength == 1 && suppLength == 1) |
| + if (*desiredPartitions == 0 && !suppLengthGt1) { |
| + // Fastpath for single desired/supported partitions. |
| + UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); |
| + if (USTRINGTRIE_HAS_NEXT(result)) { |
| + result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); |
| + if (USTRINGTRIE_HAS_VALUE(result)) { |
| + return iter.getValue(); |
| + } |
| + } |
| + return getFallbackRegionDistance(iter, startState); |
| + } |
| + |
| + const char *supportedStart = supportedPartitions - 1; // for restart of inner loop |
| + int32_t regionDistance = 0;  // running maximum over all pairs |
| + // Fall back to * only once, not for each pair of partition strings. |
| + bool star = false; |
| + for (;;) {  // outer loop: one iteration per desired partition |
| + // Look up each desired-partition string only once, |
| + // not for each (desired, supported) pair. |
| + UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); |
| + if (USTRINGTRIE_HAS_NEXT(result)) { |
| + uint64_t desState = suppLengthGt1 ? iter.getState64() : 0;  // saved only if the inner loop will restart |
| + for (;;) {  // inner loop: one iteration per supported partition |
| + result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); |
| + int32_t d; |
| + if (USTRINGTRIE_HAS_VALUE(result)) { |
| + d = iter.getValue(); |
| + } else if (star) { |
| + d = 0;  // the fallback distance was already folded into regionDistance |
| + } else { |
| + d = getFallbackRegionDistance(iter, startState); |
| + star = true; |
| + } |
| + if (d >= threshold) {  // early exit: the result cannot get below the threshold |
| + return d; |
| + } else if (regionDistance < d) { |
| + regionDistance = d; |
| + } |
| + if ((supported = *supportedPartitions++) != 0) { |
| + iter.resetToState64(desState); |
| + } else { |
| + break; |
| + } |
| + } |
| + } else if (!star) {  // no rules for this desired partition: use the fallback once |
| + int32_t d = getFallbackRegionDistance(iter, startState); |
| + if (d >= threshold) { |
| + return d; |
| + } else if (regionDistance < d) { |
| + regionDistance = d; |
| + } |
| + star = true; |
| + } |
| + if ((desired = *desiredPartitions++) != 0) { |
| + iter.resetToState64(startState); |
| + supportedPartitions = supportedStart;  // rewind to the first supported partition |
| + supported = *supportedPartitions++; |
| + } else { |
| + break; |
| + } |
| + } |
| + return regionDistance; |
| +} |
| + |
| +int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) {  // value of the <*, *> region entry |
| +#if U_DEBUG |
| + UStringTrieResult result =  // declared only in debug builds, where U_ASSERT below reads it |
| +#endif |
| + iter.resetToState64(startState).next(u'*'); // <*, *> |
| + U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); |
| + int32_t distance = iter.getValue(); |
| + U_ASSERT(distance >= 0); |
| + return distance; |
| +} |
| + |
| +int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) {  // feeds subtag s into iter; -1 if no match |
| + uint8_t c; |
| + if ((c = *s) == 0) { |
| + return -1; // no empty subtags in the distance data |
| + } |
| + for (;;) { |
| + c = uprv_invCharToAscii(c); |
| + // EBCDIC: If *s is not an invariant character, |
| + // then c is now 0 and will simply not match anything, which is harmless. |
| + uint8_t next = *++s;  // look ahead to detect the last character of the subtag |
| + if (next != 0) { |
| + if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) { |
| + return -1; |
| + } |
| + } else { |
| + // last character of this subtag |
| + UStringTrieResult result = iter.next(c | END_OF_SUBTAG); |
| + if (wantValue) {  // caller wants the distance value stored at this node |
| + if (USTRINGTRIE_HAS_VALUE(result)) { |
| + int32_t value = iter.getValue(); |
| + if (result == USTRINGTRIE_FINAL_VALUE) { |
| + value |= DISTANCE_IS_FINAL; |
| + } |
| + return value; |
| + } |
| + } else {  // caller only needs to know whether the trie continues after this subtag |
| + if (USTRINGTRIE_HAS_NEXT(result)) { |
| + return 0; |
| + } |
| + } |
| + return -1; |
| + } |
| + c = next; |
| + } |
| +} |
| + |
| +UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const { |
| +    // Linear scan of a very short list (length 6 as of 2019); switch to a hash set if it grows. |
| +    U_ASSERT(paradigmLSRsLength <= 15); |
| +    const LSR *limit = paradigmLSRs + paradigmLSRsLength; |
| +    for (const LSR *candidate = paradigmLSRs; candidate < limit; ++candidate) { |
| +        if (lsr == *candidate) { return true; } |
| +    } |
| +    return false; |
| +} |
| + |
| +U_NAMESPACE_END |
| diff --git a/source/common/locdistance.h b/source/common/locdistance.h |
| new file mode 100644 |
| index 00000000..7439f51c |
| --- /dev/null |
| +++ b/source/common/locdistance.h |
| @@ -0,0 +1,109 @@ |
| +// © 2019 and later: Unicode, Inc. and others. |
| +// License & terms of use: http://www.unicode.org/copyright.html#License |
| + |
| +// locdistance.h |
| +// created: 2019may08 Markus W. Scherer |
| + |
| +#ifndef __LOCDISTANCE_H__ |
| +#define __LOCDISTANCE_H__ |
| + |
| +#include "unicode/utypes.h" |
| +#include "unicode/bytestrie.h" |
| +#include "unicode/localematcher.h" |
| +#include "unicode/locid.h" |
| +#include "unicode/uobject.h" |
| +#include "lsr.h" |
| + |
| +U_NAMESPACE_BEGIN |
| + |
| +struct LocaleDistanceData; |
| + |
| +/** |
| + * Computes distances between maximized locales (LSRs) for LocaleMatcher, |
| + * using an offline-built distance trie and region partition tables. |
| + */ |
| +class LocaleDistance final : public UMemory { |
| +public: |
| + static const LocaleDistance *getSingleton(UErrorCode &errorCode); |
| + |
| + /** |
| + * Finds the supported LSR with the smallest distance from the desired one. |
| + * Equivalent LSR subtags must be normalized into a canonical form. |
| + * |
| + * <p>Returns the index of the lowest-distance supported LSR in bits 31..8 |
| + * (negative if none has a distance below the threshold), |
| + * and its distance (0..ABOVE_THRESHOLD) in bits 7..0. |
| + */ |
| + int32_t getBestIndexAndDistance(const LSR &desired, |
| + const LSR **supportedLSRs, int32_t supportedLSRsLength, |
| + int32_t threshold, ULocMatchFavorSubtag favorSubtag) const; |
| + |
| + int32_t getParadigmLSRsLength() const { return paradigmLSRsLength; } |
| + |
| + UBool isParadigmLSR(const LSR &lsr) const;  // linear search in paradigmLSRs |
| + |
| + int32_t getDefaultScriptDistance() const { |
| + return defaultScriptDistance; |
| + } |
| + |
| + int32_t getDefaultDemotionPerDesiredLocale() const { |
| + return defaultDemotionPerDesiredLocale; |
| + } |
| + |
| +private: |
| + LocaleDistance(const LocaleDistanceData &data); |
| + LocaleDistance(const LocaleDistance &other) = delete; |
| + LocaleDistance &operator=(const LocaleDistance &other) = delete; |
| + |
| + static void initLocaleDistance(UErrorCode &errorCode);  // one-time initializer used by getSingleton() |
| + |
| + static int32_t getDesSuppScriptDistance(BytesTrie &iter, uint64_t startState, |
| + const char *desired, const char *supported); |
| + |
| + static int32_t getRegionPartitionsDistance( |
| + BytesTrie &iter, uint64_t startState, |
| + const char *desiredPartitions, const char *supportedPartitions, |
| + int32_t threshold); |
| + |
| + static int32_t getFallbackRegionDistance(BytesTrie &iter, uint64_t startState); |
| + |
| + static int32_t trieNext(BytesTrie &iter, const char *s, bool wantValue); |
| + |
| + const char *partitionsForRegion(const LSR &lsr) const { |
| + // ill-formed region -> one non-matching string |
| + int32_t pIndex = regionToPartitionsIndex[lsr.regionIndex];  // no bounds check on regionIndex |
| + return partitionArrays[pIndex]; |
| + } |
| + |
| + int32_t getDefaultRegionDistance() const { |
| + return defaultRegionDistance; |
| + } |
| + |
| + // The trie maps each dlang+slang+dscript+sscript+dregion+sregion |
| + // (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance. |
| + // There is also a trie value for each subsequence of whole subtags. |
| + // One '*' is used for a (desired, supported) pair of "und", "Zzzz"/"", or "ZZ"/"". |
| + BytesTrie trie; |
| + |
| + /** |
| + * Maps each region to zero or more single-character partitions. |
| + */ |
| + const uint8_t *regionToPartitionsIndex;  // indexed by LSR::regionIndex; yields an index into partitionArrays |
| + const char **partitionArrays;  // each entry: a NUL-terminated string of partition characters |
| + |
| + /** |
| + * Used to get the paradigm region for a cluster, if there is one. |
| + */ |
| + const LSR *paradigmLSRs; |
| + int32_t paradigmLSRsLength; |
| + |
| + int32_t defaultLanguageDistance; |
| + int32_t defaultScriptDistance; |
| + int32_t defaultRegionDistance; |
| + int32_t minRegionDistance; |
| + int32_t defaultDemotionPerDesiredLocale;  // computed in the constructor as the en-US vs. en-GB distance |
| +}; |
| + |
| +U_NAMESPACE_END |
| + |
| +#endif // __LOCDISTANCE_H__ |
| diff --git a/source/common/locid.cpp b/source/common/locid.cpp |
| index caffdb8b..93f3d3cb 100644 |
| --- a/source/common/locid.cpp |
| +++ b/source/common/locid.cpp |
| @@ -1399,5 +1399,7 @@ Locale::getBaseName() const { |
| return baseName; |
| } |
| |
| +// Out-of-line, compiler-generated destructor for Locale::Iterator. |
| +Locale::Iterator::~Iterator() = default; |
| + |
| //eof |
| U_NAMESPACE_END |
| diff --git a/source/common/loclikelysubtags.cpp b/source/common/loclikelysubtags.cpp |
| new file mode 100644 |
| index 00000000..d7f5e124 |
| --- /dev/null |
| +++ b/source/common/loclikelysubtags.cpp |
| @@ -0,0 +1,638 @@ |
| +// © 2019 and later: Unicode, Inc. and others. |
| +// License & terms of use: http://www.unicode.org/copyright.html#License |
| + |
| +// loclikelysubtags.cpp |
| +// created: 2019may08 Markus W. Scherer |
| + |
| +#include <utility> |
| +#include "unicode/utypes.h" |
| +#include "unicode/bytestrie.h" |
| +#include "unicode/localpointer.h" |
| +#include "unicode/locid.h" |
| +#include "unicode/uobject.h" |
| +#include "unicode/ures.h" |
| +#include "charstr.h" |
| +#include "cstring.h" |
| +#include "loclikelysubtags.h" |
| +#include "lsr.h" |
| +#include "uassert.h" |
| +#include "ucln_cmn.h" |
| +#include "uhash.h" |
| +#include "uinvchar.h" |
| +#include "umutex.h" |
| +#include "uresdata.h" |
| +#include "uresimp.h" |
| + |
| +U_NAMESPACE_BEGIN |
| + |
| +namespace { |
| + |
| +// One-character markers for pseudolocales. |
| +// makeMaximizedLsr() passes one of them as the prefix argument of the LSR constructor |
| +// that prepends the prefix to the language and script subtags. |
| +constexpr char PSEUDO_ACCENTS_PREFIX = '\''; // -XA, -PSACCENT |
| +constexpr char PSEUDO_BIDI_PREFIX = '+'; // -XB, -PSBIDI |
| +constexpr char PSEUDO_CRACKED_PREFIX = ','; // -XC, -PSCRACK |
| + |
| +/** |
| + * Accumulates NUL-terminated invariant-character strings in a single CharString, |
| + * storing each distinct input string only once. |
| + * Inputs are UTF-16 strings; duplicates are found via a hash table keyed by the |
| + * char16_t buffer (uhash_hashUChars/uhash_compareUChars) whose value is the |
| + * offset of the converted string inside the CharString. |
| + * |
| + * Usage: add() all strings, then freeze(), then get() pointers by offset. |
| + */ |
| +class UniqueCharStrings { |
| +public: |
| +    /** Sets errorCode if the hash table or the CharString cannot be created. */ |
| +    UniqueCharStrings(UErrorCode &errorCode) : strings(nullptr) { |
| +        uhash_init(&map, uhash_hashUChars, uhash_compareUChars, uhash_compareLong, &errorCode); |
| +        if (U_SUCCESS(errorCode)) { |
| +            strings = new CharString(); |
| +            if (strings == nullptr) { |
| +                errorCode = U_MEMORY_ALLOCATION_ERROR; |
| +            } |
| +        } |
| +    } |
| +    ~UniqueCharStrings() { |
| +        uhash_close(&map); |
| +        delete strings; |
| +    } |
| + |
| +    /** |
| +     * Hands the CharString with all collected strings over to the caller. |
| +     * This object no longer owns (or references) it afterwards. |
| +     */ |
| +    CharString *orphanCharStrings() { |
| +        CharString *orphan = strings; |
| +        strings = nullptr; |
| +        return orphan; |
| +    } |
| + |
| +    /** |
| +     * Adds a string and returns its number (the offset of its first character). |
| +     * Returns the previously assigned number for a duplicate. |
| +     * Returns 0 on failure; sets U_NO_WRITE_PERMISSION once frozen. |
| +     */ |
| +    int32_t add(const UnicodeString &s, UErrorCode &errorCode) { |
| +        if (U_FAILURE(errorCode)) { return 0; } |
| +        if (isFrozen) { |
| +            errorCode = U_NO_WRITE_PERMISSION; |
| +            return 0; |
| +        } |
| +        // The buffer lives in the resource bundle, so it can serve as a stable hash key. |
| +        const char16_t *key = s.getBuffer(); |
| +        const int32_t knownOffset = uhash_geti(&map, key); |
| +        if (knownOffset != 0) { |
| +            return knownOffset;  // seen before |
| +        } |
| +        // Terminate the previous string explicitly; |
| +        // the CharString itself supplies one implicit NUL at its end. |
| +        strings->append(0, errorCode); |
| +        const int32_t offset = strings->length(); |
| +        strings->appendInvariantChars(s, errorCode); |
| +        uhash_puti(&map, const_cast<char16_t *>(key), offset, &errorCode); |
| +        return offset; |
| +    } |
| + |
| +    /** Ends the collection phase: add() fails from now on, get() works. */ |
| +    void freeze() { isFrozen = true; } |
| + |
| +    /** |
| +     * Maps a number returned by add() to its string. |
| +     * Returns nullptr if this object is not frozen yet or if i is not positive. |
| +     */ |
| +    const char *get(int32_t i) const { |
| +        U_ASSERT(isFrozen); |
| +        if (!isFrozen || i <= 0) { |
| +            return nullptr; |
| +        } |
| +        return strings->data() + i; |
| +    } |
| + |
| +private: |
| +    UHashtable map; |
| +    CharString *strings; |
| +    bool isFrozen = false; |
| +}; |
| + |
| +} // namespace |
| + |
| +// Move constructor: copies every pointer and length, then clears the two pointers |
| +// that the destructor releases (partitions, paradigms) in the source object, |
| +// so that they are released exactly once. |
| +LocaleDistanceData::LocaleDistanceData(LocaleDistanceData &&data) : |
| +        distanceTrieBytes(data.distanceTrieBytes), |
| +        regionToPartitions(data.regionToPartitions), |
| +        partitions(data.partitions), |
| +        paradigms(data.paradigms), |
| +        paradigmsLength(data.paradigmsLength), |
| +        distances(data.distances) { |
| +    data.paradigms = nullptr; |
| +    data.partitions = nullptr; |
| +} |
| + |
| +// Releases the two arrays this struct owns. |
| +// paradigms was allocated with new[], partitions with uprv_malloc(). |
| +LocaleDistanceData::~LocaleDistanceData() { |
| +    delete[] paradigms; |
| +    uprv_free(partitions); |
| +} |
| + |
| +// TODO(ICU-20777): Rename to just LikelySubtagsData. |
| +/** |
| + * Temporary holder for everything read from the "langInfo" resource bundle. |
| + * load() fills the fields; the XLikelySubtags constructor then takes over the bundle, |
| + * the string storage, the alias maps, the LSR array and the distance data. |
| + * Whatever is not taken over is released by this struct's destructor. |
| + */ |
| +struct XLikelySubtagsData { |
| + UResourceBundle *langInfoBundle = nullptr; |
| + UniqueCharStrings strings; |
| + CharStringMap languageAliases; |
| + CharStringMap regionAliases; |
| + const uint8_t *trieBytes = nullptr; |
| + LSR *lsrs = nullptr; |
| + int32_t lsrsLength = 0; |
| + |
| + LocaleDistanceData distanceData; |
| + |
| + XLikelySubtagsData(UErrorCode &errorCode) : strings(errorCode) {} |
| + |
| + ~XLikelySubtagsData() { |
| + ures_close(langInfoBundle); |
| + delete[] lsrs; |
| + } |
| + |
| + /** |
| + * Opens "langInfo" and reads the "likely" table (languageAliases, regionAliases, lsrs, trie) |
| + * and, if present, the "match" table (trie, regionToPartitions, partitions, paradigms, distances). |
| + * |
| + * Sets U_INVALID_FORMAT_ERROR for arrays of unexpected length, |
| + * U_MISSING_RESOURCE_ERROR if there are no lsrs or no likely-subtags trie, |
| + * and U_MEMORY_ALLOCATION_ERROR if an array cannot be allocated. |
| + * A missing "match" table is not an error. |
| + */ |
| + void load(UErrorCode &errorCode) { |
| + langInfoBundle = ures_openDirect(nullptr, "langInfo", &errorCode); |
| + if (U_FAILURE(errorCode)) { return; } |
| + StackUResourceBundle stackTempBundle; |
| + ResourceDataValue value; |
| + ures_getValueWithFallback(langInfoBundle, "likely", stackTempBundle.getAlias(), |
| + value, errorCode); |
| + ResourceTable likelyTable = value.getTable(errorCode); |
| + if (U_FAILURE(errorCode)) { return; } |
| + |
| + // Read all strings in the resource bundle and convert them to invariant char *. |
| + LocalMemory<int32_t> languageIndexes, regionIndexes, lsrSubtagIndexes; |
| + int32_t languagesLength = 0, regionsLength = 0, lsrSubtagsLength = 0; |
| + if (!readStrings(likelyTable, "languageAliases", value, |
| + languageIndexes, languagesLength, errorCode) || |
| + !readStrings(likelyTable, "regionAliases", value, |
| + regionIndexes, regionsLength, errorCode) || |
| + !readStrings(likelyTable, "lsrs", value, |
| + lsrSubtagIndexes,lsrSubtagsLength, errorCode)) { |
| + return; |
| + } |
| + // Aliases are stored as (alias, canonical) pairs, LSRs as (language, script, region) triples. |
| + if ((languagesLength & 1) != 0 || |
| + (regionsLength & 1) != 0 || |
| + (lsrSubtagsLength % 3) != 0) { |
| + errorCode = U_INVALID_FORMAT_ERROR; |
| + return; |
| + } |
| + if (lsrSubtagsLength == 0) { |
| + errorCode = U_MISSING_RESOURCE_ERROR; |
| + return; |
| + } |
| + |
| + if (!likelyTable.findValue("trie", value)) { |
| + errorCode = U_MISSING_RESOURCE_ERROR; |
| + return; |
| + } |
| + int32_t length; |
| + trieBytes = value.getBinary(length, errorCode); |
| + if (U_FAILURE(errorCode)) { return; } |
| + |
| + // Also read distance/matcher data if available, |
| + // to open & keep only one resource bundle pointer |
| + // and to use one single UniqueCharStrings. |
| + UErrorCode matchErrorCode = U_ZERO_ERROR; |
| + ures_getValueWithFallback(langInfoBundle, "match", stackTempBundle.getAlias(), |
| + value, matchErrorCode); |
| + LocalMemory<int32_t> partitionIndexes, paradigmSubtagIndexes; |
| + int32_t partitionsLength = 0, paradigmSubtagsLength = 0; |
| + if (U_SUCCESS(matchErrorCode)) { |
| + ResourceTable matchTable = value.getTable(errorCode); |
| + if (U_FAILURE(errorCode)) { return; } |
| + |
| + if (matchTable.findValue("trie", value)) { |
| + distanceData.distanceTrieBytes = value.getBinary(length, errorCode); |
| + if (U_FAILURE(errorCode)) { return; } |
| + } |
| + |
| + if (matchTable.findValue("regionToPartitions", value)) { |
| + distanceData.regionToPartitions = value.getBinary(length, errorCode); |
| + if (U_FAILURE(errorCode)) { return; } |
| + // The table is indexed by LSR::regionIndex, so it must cover all region indexes. |
| + if (length < LSR::REGION_INDEX_LIMIT) { |
| + errorCode = U_INVALID_FORMAT_ERROR; |
| + return; |
| + } |
| + } |
| + |
| + if (!readStrings(matchTable, "partitions", value, |
| + partitionIndexes, partitionsLength, errorCode) || |
| + !readStrings(matchTable, "paradigms", value, |
| + paradigmSubtagIndexes, paradigmSubtagsLength, errorCode)) { |
| + return; |
| + } |
| + if ((paradigmSubtagsLength % 3) != 0) { |
| + errorCode = U_INVALID_FORMAT_ERROR; |
| + return; |
| + } |
| + |
| + if (matchTable.findValue("distances", value)) { |
| + distanceData.distances = value.getIntVector(length, errorCode); |
| + if (U_FAILURE(errorCode)) { return; } |
| + if (length < 4) { // LocaleDistance IX_LIMIT |
| + errorCode = U_INVALID_FORMAT_ERROR; |
| + return; |
| + } |
| + } |
| + } else if (matchErrorCode == U_MISSING_RESOURCE_ERROR) { |
| + // ok for likely subtags |
| + } else { // error other than missing resource |
| + errorCode = matchErrorCode; |
| + return; |
| + } |
| + |
| + // Fetch & store invariant-character versions of strings |
| + // only after we have collected and de-duplicated all of them. |
| + strings.freeze(); |
| + |
| + languageAliases = CharStringMap(languagesLength / 2, errorCode); |
| + for (int32_t i = 0; i < languagesLength; i += 2) { |
| + languageAliases.put(strings.get(languageIndexes[i]), |
| + strings.get(languageIndexes[i + 1]), errorCode); |
| + } |
| + |
| + regionAliases = CharStringMap(regionsLength / 2, errorCode); |
| + for (int32_t i = 0; i < regionsLength; i += 2) { |
| + regionAliases.put(strings.get(regionIndexes[i]), |
| + strings.get(regionIndexes[i + 1]), errorCode); |
| + } |
| + if (U_FAILURE(errorCode)) { return; } |
| + |
| + // Build the LSR array; each LSR aliases three of the collected strings. |
| + lsrsLength = lsrSubtagsLength / 3; |
| + lsrs = new LSR[lsrsLength]; |
| + if (lsrs == nullptr) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return; |
| + } |
| + for (int32_t i = 0, j = 0; i < lsrSubtagsLength; i += 3, ++j) { |
| + lsrs[j] = LSR(strings.get(lsrSubtagIndexes[i]), |
| + strings.get(lsrSubtagIndexes[i + 1]), |
| + strings.get(lsrSubtagIndexes[i + 2])); |
| + } |
| + |
| + if (partitionsLength > 0) { |
| + distanceData.partitions = static_cast<const char **>( |
| + uprv_malloc(partitionsLength * sizeof(const char *))); |
| + if (distanceData.partitions == nullptr) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return; |
| + } |
| + for (int32_t i = 0; i < partitionsLength; ++i) { |
| + distanceData.partitions[i] = strings.get(partitionIndexes[i]); |
| + } |
| + } |
| + |
| + if (paradigmSubtagsLength > 0) { |
| + distanceData.paradigmsLength = paradigmSubtagsLength / 3; |
| + LSR *paradigms = new LSR[distanceData.paradigmsLength]; |
| + if (paradigms == nullptr) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return; |
| + } |
| + for (int32_t i = 0, j = 0; i < paradigmSubtagsLength; i += 3, ++j) { |
| + paradigms[j] = LSR(strings.get(paradigmSubtagIndexes[i]), |
| + strings.get(paradigmSubtagIndexes[i + 1]), |
| + strings.get(paradigmSubtagIndexes[i + 2])); |
| + } |
| + distanceData.paradigms = paradigms; |
| + } |
| + } |
| + |
| +private: |
| + /** |
| + * Reads the string array stored under key in table, adds each string to this->strings, |
| + * and stores the numbers returned by add() in a newly allocated indexes array. |
| + * |
| + * Returns false (with errorCode set) on failure. |
| + * Returns true with indexes and length untouched if the key is absent, |
| + * and true with length == 0 and indexes untouched if the array is empty. |
| + * The value parameter is scratch space and is overwritten. |
| + */ |
| + bool readStrings(const ResourceTable &table, const char *key, ResourceValue &value, |
| + LocalMemory<int32_t> &indexes, int32_t &length, UErrorCode &errorCode) { |
| + if (table.findValue(key, value)) { |
| + ResourceArray stringArray = value.getArray(errorCode); |
| + if (U_FAILURE(errorCode)) { return false; } |
| + length = stringArray.getSize(); |
| + if (length == 0) { return true; } |
| + int32_t *rawIndexes = indexes.allocateInsteadAndCopy(length); |
| + if (rawIndexes == nullptr) { |
| + errorCode = U_MEMORY_ALLOCATION_ERROR; |
| + return false; |
| + } |
| + for (int i = 0; i < length; ++i) { |
| + stringArray.getValue(i, value); // returns TRUE because i < length |
| + rawIndexes[i] = strings.add(value.getUnicodeString(errorCode), errorCode); |
| + if (U_FAILURE(errorCode)) { return false; } |
| + } |
| + } |
| + return true; |
| + } |
| +}; |
| + |
| +namespace { |
| + |
| +// Lazily created singleton and the init-once guard that protects its creation. |
| +XLikelySubtags *gLikelySubtags = nullptr; |
| +UInitOnce gInitOnce = U_INITONCE_INITIALIZER; |
| + |
| +// Library cleanup callback: destroys the singleton and re-arms the init-once guard. |
| +UBool U_CALLCONV cleanup() { |
| +    XLikelySubtags *old = gLikelySubtags; |
| +    gLikelySubtags = nullptr; |
| +    delete old; |
| +    gInitOnce.reset(); |
| +    return TRUE; |
| +} |
| + |
| +} // namespace |
| + |
| +// One-time initializer, invoked only via umtx_initOnce(). |
| +// Loads the resource data, creates the singleton from it, and registers cleanup(). |
| +// On failure errorCode is set and gLikelySubtags stays nullptr. |
| +void U_CALLCONV XLikelySubtags::initLikelySubtags(UErrorCode &errorCode) { |
| +    U_ASSERT(gLikelySubtags == nullptr); |
| +    XLikelySubtagsData data(errorCode); |
| +    data.load(errorCode); |
| +    if (U_SUCCESS(errorCode)) { |
| +        gLikelySubtags = new XLikelySubtags(data); |
| +        if (gLikelySubtags != nullptr) { |
| +            ucln_common_registerCleanup(UCLN_COMMON_LIKELY_SUBTAGS, cleanup); |
| +        } else { |
| +            errorCode = U_MEMORY_ALLOCATION_ERROR; |
| +        } |
| +    } |
| +} |
| + |
| +// Returns the shared instance, creating it on first use. |
| +// Returns nullptr if errorCode already indicates a failure; |
| +// otherwise returns gLikelySubtags as left by the one-time initialization. |
| +const XLikelySubtags *XLikelySubtags::getSingleton(UErrorCode &errorCode) { |
| +    if (U_SUCCESS(errorCode)) { |
| +        umtx_initOnce(gInitOnce, &XLikelySubtags::initLikelySubtags, errorCode); |
| +        return gLikelySubtags; |
| +    } |
| +    return nullptr; |
| +} |
| + |
| +// Takes over the loaded data: the resource bundle, the string storage, the alias maps, |
| +// the LSR array and the distance data. The pointers that both objects would otherwise |
| +// release (bundle, lsrs) are cleared in data. |
| +// Then caches a few trie states that maximize() uses as shortcuts. |
| +XLikelySubtags::XLikelySubtags(XLikelySubtagsData &data) : |
| +        langInfoBundle(data.langInfoBundle), |
| +        strings(data.strings.orphanCharStrings()), |
| +        languageAliases(std::move(data.languageAliases)), |
| +        regionAliases(std::move(data.regionAliases)), |
| +        trie(data.trieBytes), |
| +        lsrs(data.lsrs), |
| +#if U_DEBUG |
| +        lsrsLength(data.lsrsLength), |
| +#endif |
| +        distanceData(std::move(data.distanceData)) { |
| +    data.langInfoBundle = nullptr; |
| +    data.lsrs = nullptr; |
| + |
| +    // Cache the result of looking up language="und" encoded as "*", and "und-Zzzz" ("**"). |
| +    UStringTrieResult result = trie.next(u'*'); |
| +    U_ASSERT(USTRINGTRIE_HAS_NEXT(result)); |
| +    trieUndState = trie.getState64(); |
| +    result = trie.next(u'*'); |
| +    U_ASSERT(USTRINGTRIE_HAS_NEXT(result)); |
| +    trieUndZzzzState = trie.getState64(); |
| +    result = trie.next(u'*'); |
| +    U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); |
| +    defaultLsrIndex = trie.getValue(); |
| +    trie.reset(); |
| + |
| +    // Cache the trie state after each possible first language letter. |
| +    // Every entry must be written: maximize() reads trieFirstLetterStates[c0] |
| +    // and treats 0 as "no cached state". The array has no default initializer, |
| +    // so a letter without a usable state must be set to 0 explicitly. |
| +    for (char16_t c = u'a'; c <= u'z'; ++c) { |
| +        result = trie.next(c); |
| +        trieFirstLetterStates[c - u'a'] = |
| +            result == USTRINGTRIE_NO_VALUE ? trie.getState64() : 0; |
| +        trie.reset(); |
| +    } |
| +} |
| + |
| +// Releases what the constructor took over from XLikelySubtagsData: |
| +// the LSR array, the string storage, and the resource bundle. |
| +XLikelySubtags::~XLikelySubtags() { |
| +    delete[] lsrs; |
| +    delete strings; |
| +    ures_close(langInfoBundle); |
| +} |
| + |
| +// Builds the maximized LSR for a Locale. |
| +// A locale whose name starts with "@x=" (private use language tag x-subtag-subtag...) |
| +// is returned as is: the whole name becomes the language, with empty script and region. |
| +LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const { |
| +    const char *name = locale.getName(); |
| +    const bool isPrivateUse = uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '='; |
| +    if (!isPrivateUse) { |
| +        return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(), |
| +                                locale.getVariant(), errorCode); |
| +    } |
| +    return LSR(name, "", ""); |
| +} |
| + |
| +namespace { |
| + |
| +// Returns the canonical replacement for alias from the map, |
| +// or alias itself if the map has no entry for it. |
| +const char *getCanonical(const CharStringMap &aliases, const char *alias) { |
| +    const char *replacement = aliases.get(alias); |
| +    if (replacement != nullptr) { |
| +        return replacement; |
| +    } |
| +    return alias; |
| +} |
| + |
| +} // namespace |
| + |
| +// Maximizes the given subtags. |
| +// |
| +// Pseudolocales like en-XA, ar-XB, fr-PSCRACK are handled first: they should match |
| +// only themselves, not other locales with what looks like the same language and script. |
| +// For those, an LSR with prefixed language and script is returned without a trie lookup. |
| +// Otherwise language and region aliases are replaced with their canonical forms |
| +// (there are no script mappings) and the result is looked up via maximize(). |
| +LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, const char *region, |
| +                                     const char *variant, UErrorCode &errorCode) const { |
| +    // Pseudolocale by region: exactly two characters, the first one 'X'. |
| +    const bool isTwoCharXRegion = region[0] == 'X' && region[1] != 0 && region[2] == 0; |
| +    if (isTwoCharXRegion) { |
| +        const char second = region[1]; |
| +        if (second == 'A') { |
| +            return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region, errorCode); |
| +        } |
| +        if (second == 'B') { |
| +            return LSR(PSEUDO_BIDI_PREFIX, language, script, region, errorCode); |
| +        } |
| +        if (second == 'C') { |
| +            return LSR(PSEUDO_CRACKED_PREFIX, language, script, region, errorCode); |
| +        } |
| +        // Any other X? region is a normal locale. |
| +    } |
| + |
| +    // Pseudolocale by variant; an empty region is replaced with the matching X? region. |
| +    if (variant[0] == 'P' && variant[1] == 'S') { |
| +        char prefix = 0; |
| +        const char *impliedRegion = nullptr; |
| +        if (uprv_strcmp(variant, "PSACCENT") == 0) { |
| +            prefix = PSEUDO_ACCENTS_PREFIX; |
| +            impliedRegion = "XA"; |
| +        } else if (uprv_strcmp(variant, "PSBIDI") == 0) { |
| +            prefix = PSEUDO_BIDI_PREFIX; |
| +            impliedRegion = "XB"; |
| +        } else if (uprv_strcmp(variant, "PSCRACK") == 0) { |
| +            prefix = PSEUDO_CRACKED_PREFIX; |
| +            impliedRegion = "XC"; |
| +        } |
| +        if (impliedRegion != nullptr) { |
| +            return LSR(prefix, language, script, |
| +                       *region == 0 ? impliedRegion : region, errorCode); |
| +        } |
| +        // Any other PS... variant is a normal locale. |
| +    } |
| + |
| +    const char *canonicalLanguage = getCanonical(languageAliases, language); |
| +    const char *canonicalRegion = getCanonical(regionAliases, region); |
| +    return maximize(canonicalLanguage, script, canonicalRegion); |
| +} |
| + |
| +// Adds likely subtags: walks the trie with language, then script, then region, |
| +// falling back to the '*' (unknown) branch where a subtag has no match, |
| +// and combines the LSR found in lsrs[] with the subtags the caller supplied. |
| +// |
| +// "und", "Zzzz" and "ZZ" are treated like empty subtags. |
| +// If all three subtags are non-empty, they are returned unchanged without a lookup. |
| +// |
| +// retainOldMask records which input subtags are kept in the result |
| +// instead of being replaced by the lookup result: 4=language, 2=script, 1=region. |
| +// trieNext() returns -1 for no match, 0 for a match without a value, and a positive value otherwise. |
| +LSR XLikelySubtags::maximize(const char *language, const char *script, const char *region) const { |
| + if (uprv_strcmp(language, "und") == 0) { |
| + language = ""; |
| + } |
| + if (uprv_strcmp(script, "Zzzz") == 0) { |
| + script = ""; |
| + } |
| + if (uprv_strcmp(region, "ZZ") == 0) { |
| + region = ""; |
| + } |
| + if (*script != 0 && *region != 0 && *language != 0) { |
| + return LSR(language, script, region); // already maximized |
| + } |
| + |
| + uint32_t retainOldMask = 0; |
| + BytesTrie iter(trie); |
| + uint64_t state; |
| + int32_t value; |
| + // Small optimization: Array lookup for first language letter. |
| + int32_t c0; |
| + if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 && |
| + language[1] != 0 && // language.length() >= 2 |
| + (state = trieFirstLetterStates[c0]) != 0) { |
| + value = trieNext(iter.resetToState64(state), language, 1); |
| + } else { |
| + value = trieNext(iter, language, 0); |
| + } |
| + if (value >= 0) { |
| + if (*language != 0) { |
| + retainOldMask |= 4; |
| + } |
| + state = iter.getState64(); |
| + } else { |
| + // Language not in the trie: keep it and continue from the "und" branch. |
| + // state == 0 marks that we are on the all-unknown path. |
| + retainOldMask |= 4; |
| + iter.resetToState64(trieUndState); // "und" ("*") |
| + state = 0; |
| + } |
| + |
| + if (value > 0) { |
| + // Intermediate or final value from just language. |
| + if (value == SKIP_SCRIPT) { |
| + value = 0; |
| + } |
| + if (*script != 0) { |
| + retainOldMask |= 2; |
| + } |
| + } else { |
| + value = trieNext(iter, script, 0); |
| + if (value >= 0) { |
| + if (*script != 0) { |
| + retainOldMask |= 2; |
| + } |
| + state = iter.getState64(); |
| + } else { |
| + // Script not found after this language: keep it and retry with the unknown script. |
| + retainOldMask |= 2; |
| + if (state == 0) { |
| + iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**") |
| + } else { |
| + iter.resetToState64(state); |
| + value = trieNext(iter, "", 0); |
| + U_ASSERT(value >= 0); |
| + state = iter.getState64(); |
| + } |
| + } |
| + } |
| + |
| + if (value > 0) { |
| + // Final value from just language or language+script. |
| + if (*region != 0) { |
| + retainOldMask |= 1; |
| + } |
| + } else { |
| + value = trieNext(iter, region, 0); |
| + if (value >= 0) { |
| + if (*region != 0) { |
| + retainOldMask |= 1; |
| + } |
| + } else { |
| + // Region not found: keep it and use the entry for the unknown region. |
| + retainOldMask |= 1; |
| + if (state == 0) { |
| + value = defaultLsrIndex; |
| + } else { |
| + iter.resetToState64(state); |
| + value = trieNext(iter, "", 0); |
| + U_ASSERT(value > 0); |
| + } |
| + } |
| + } |
| + // value is now an index into lsrs[]. |
| + U_ASSERT(value < lsrsLength); |
| + const LSR &result = lsrs[value]; |
| + |
| + if (*language == 0) { |
| + language = "und"; |
| + } |
| + |
| + if (retainOldMask == 0) { |
| + // Quickly return a copy of the lookup-result LSR |
| + // without new allocation of the subtags. |
| + return LSR(result.language, result.script, result.region); |
| + } |
| + // Replace only the subtags whose retain bit is not set. |
| + if ((retainOldMask & 4) == 0) { |
| + language = result.language; |
| + } |
| + if ((retainOldMask & 2) == 0) { |
| + script = result.script; |
| + } |
| + if ((retainOldMask & 1) == 0) { |
| + region = result.region; |
| + } |
| + return LSR(language, script, region); |
| +} |
| + |
| +// Advances iter by one subtag: the characters of s starting at index i. |
| +// An empty remainder is looked up as '*'. Otherwise each character is converted |
| +// to ASCII, and the last character of the subtag is fed with bit 7 set. |
| +// |
| +// Returns -1 if there is no match, 0 if the subtag matched without a value, |
| +// SKIP_SCRIPT for an intermediate value, or the final trie value. |
| +int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) { |
| +    UStringTrieResult result; |
| +    uint8_t c = s[i]; |
| +    if (c == 0) { |
| +        result = iter.next(u'*'); |
| +    } else { |
| +        bool isLast; |
| +        do { |
| +            c = uprv_invCharToAscii(c); |
| +            // EBCDIC: If s[i] is not an invariant character, |
| +            // then c is now 0 and will simply not match anything, which is harmless. |
| +            const uint8_t following = s[++i]; |
| +            isLast = following == 0; |
| +            if (isLast) { |
| +                // last character of this subtag |
| +                result = iter.next(c | 0x80); |
| +            } else { |
| +                if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) { |
| +                    return -1; |
| +                } |
| +                c = following; |
| +            } |
| +        } while (!isLast); |
| +    } |
| +    switch (result) { |
| +    case USTRINGTRIE_NO_VALUE: |
| +        return 0; |
| +    case USTRINGTRIE_INTERMEDIATE_VALUE: |
| +        U_ASSERT(iter.getValue() == SKIP_SCRIPT); |
| +        return SKIP_SCRIPT; |
| +    case USTRINGTRIE_FINAL_VALUE: |
| +        return iter.getValue(); |
| +    case USTRINGTRIE_NO_MATCH: |
| +    default: |
| +        return -1; |
| +    } |
| +} |
| + |
| +// TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code |
| +// in loclikely.cpp to this new code, including activating this |
| +// minimizeSubtags() function. The LocaleMatcher does not minimize. |
| +// |
| +// NOTE: The body below is excluded from compilation by #if 0 and is not valid C++ yet: |
| +// it still uses Java-style constructs ("new BytesTrie(trie)" assigned to a value, |
| +// "boolean", ".equals()", "return new LSR(...)", "ULocale.Minimize"). |
| +// It needs to be ported before the #if 0 can be removed. |
| +#if 0 |
| +LSR XLikelySubtags::minimizeSubtags(const char *languageIn, const char *scriptIn, |
| + const char *regionIn, ULocale.Minimize fieldToFavor, |
| + UErrorCode &errorCode) const { |
| + LSR result = maximize(languageIn, scriptIn, regionIn); |
| + |
| + // We could try just a series of checks, like: |
| + // LSR result2 = addLikelySubtags(languageIn, "", ""); |
| + // if result.equals(result2) return result2; |
| + // However, we can optimize 2 of the cases: |
| + // (languageIn, "", "") |
| + // (languageIn, "", regionIn) |
| + |
| + // value00 = lookup(result.language, "", "") |
| + BytesTrie iter = new BytesTrie(trie); |
| + int value = trieNext(iter, result.language, 0); |
| + U_ASSERT(value >= 0); |
| + if (value == 0) { |
| + value = trieNext(iter, "", 0); |
| + U_ASSERT(value >= 0); |
| + if (value == 0) { |
| + value = trieNext(iter, "", 0); |
| + } |
| + } |
| + U_ASSERT(value > 0); |
| + LSR value00 = lsrs[value]; |
| + boolean favorRegionOk = false; |
| + if (result.script.equals(value00.script)) { //script is default |
| + if (result.region.equals(value00.region)) { |
| + return new LSR(result.language, "", ""); |
| + } else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) { |
| + return new LSR(result.language, "", result.region); |
| + } else { |
| + favorRegionOk = true; |
| + } |
| + } |
| + |
| + // The last case is not as easy to optimize. |
| + // Maybe do later, but for now use the straightforward code. |
| + LSR result2 = maximize(languageIn, scriptIn, ""); |
| + if (result2.equals(result)) { |
| + return new LSR(result.language, result.script, ""); |
| + } else if (favorRegionOk) { |
| + return new LSR(result.language, "", result.region); |
| + } |
| + return result; |
| +} |
| +#endif |
| + |
| +U_NAMESPACE_END |
| diff --git a/source/common/loclikelysubtags.h b/source/common/loclikelysubtags.h |
| new file mode 100644 |
| index 00000000..8c8a08ac |
| --- /dev/null |
| +++ b/source/common/loclikelysubtags.h |
| @@ -0,0 +1,143 @@ |
| +// © 2019 and later: Unicode, Inc. and others. |
| +// License & terms of use: http://www.unicode.org/copyright.html#License |
| + |
| +// loclikelysubtags.h |
| +// created: 2019may08 Markus W. Scherer |
| + |
| +#ifndef __LOCLIKELYSUBTAGS_H__ |
| +#define __LOCLIKELYSUBTAGS_H__ |
| + |
| +#include <utility> |
| +#include "unicode/utypes.h" |
| +#include "unicode/bytestrie.h" |
| +#include "unicode/locid.h" |
| +#include "unicode/uobject.h" |
| +#include "unicode/ures.h" |
| +#include "lsr.h" |
| +#include "uhash.h" |
| + |
| +U_NAMESPACE_BEGIN |
| + |
| +struct XLikelySubtagsData; |
| + |
| +/** |
| + * Map of const char * keys & values. |
| + * Stores pointers as is: Does not own/copy/adopt/release strings. |
| + * Owns only the underlying hash table, which is closed by the destructor. |
| + * Movable but not copyable. |
| + */ |
| +class CharStringMap final : public UMemory { |
| +public: |
| +    /** Constructs an unusable non-map. */ |
| +    CharStringMap() : map(nullptr) {} |
| +    /** |
| +     * Constructs a map with the given initial size. |
| +     * The result of uhash_openSize() is stored as is; check errorCode before use. |
| +     */ |
| +    CharStringMap(int32_t size, UErrorCode &errorCode) { |
| +        map = uhash_openSize(uhash_hashChars, uhash_compareChars, uhash_compareChars, |
| +                             size, &errorCode); |
| +    } |
| +    /** Takes over the hash table of other, which becomes an unusable non-map. */ |
| +    CharStringMap(CharStringMap &&other) U_NOEXCEPT : map(other.map) { |
| +        other.map = nullptr; |
| +    } |
| +    CharStringMap(const CharStringMap &other) = delete; |
| +    ~CharStringMap() { |
| +        uhash_close(map); |
| +    } |
| + |
| +    /** |
| +     * Takes over the hash table of other, which becomes an unusable non-map. |
| +     * The hash table previously held by this object is closed first, |
| +     * so that it is not leaked. Self-assignment is a no-op. |
| +     */ |
| +    CharStringMap &operator=(CharStringMap &&other) U_NOEXCEPT { |
| +        if (this != &other) { |
| +            // Same call as in the destructor, which also runs on a nullptr map. |
| +            uhash_close(map); |
| +            map = other.map; |
| +            other.map = nullptr; |
| +        } |
| +        return *this; |
| +    } |
| +    CharStringMap &operator=(const CharStringMap &other) = delete; |
| + |
| +    /** Returns the value stored for key, as returned by uhash_get(). */ |
| +    const char *get(const char *key) const { return static_cast<const char *>(uhash_get(map, key)); } |
| +    /** Stores the key and value pointers (not copies of the strings). */ |
| +    void put(const char *key, const char *value, UErrorCode &errorCode) { |
| +        uhash_put(map, const_cast<char *>(key), const_cast<char *>(value), &errorCode); |
| +    } |
| + |
| +private: |
| +    UHashtable *map; |
| +}; |
| + |
| +// Data for locale distance computation, loaded in XLikelySubtagsData::load(). |
| +// Movable (via the move constructor) but not assignable. |
| +struct LocaleDistanceData { |
| + LocaleDistanceData() = default; |
| + LocaleDistanceData(LocaleDistanceData &&data); |
| + ~LocaleDistanceData(); |
| + |
| + // These two are not released by the destructor. |
| + const uint8_t *distanceTrieBytes = nullptr; |
| + const uint8_t *regionToPartitions = nullptr; |
| + // Array released by the destructor with uprv_free(). |
| + const char **partitions = nullptr; |
| + // Array released by the destructor with delete[]. |
| + const LSR *paradigms = nullptr; |
| + int32_t paradigmsLength = 0; |
| + // Not released by the destructor. |
| + const int32_t *distances = nullptr; |
| + |
| +private: |
| + LocaleDistanceData &operator=(const LocaleDistanceData &) = delete; |
| +}; |
| + |
| +// TODO(ICU-20777): Rename to just LikelySubtags. |
| +// Likely-subtags lookup based on a BytesTrie and an array of LSRs. |
| +// Instances are obtained via getSingleton(); the constructor is private and copying is deleted. |
| +class XLikelySubtags final : public UMemory { |
| +public: |
| + ~XLikelySubtags(); |
| + |
| + // Trie value that trieNext() returns for an intermediate value. |
| + static constexpr int32_t SKIP_SCRIPT = 1; |
| + |
| + // VisibleForTesting |
| + static const XLikelySubtags *getSingleton(UErrorCode &errorCode); |
| + |
| + // VisibleForTesting |
| + LSR makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const; |
| + |
| + // TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code |
| + // in loclikely.cpp to this new code, including activating this |
| + // minimizeSubtags() function. The LocaleMatcher does not minimize. |
| +#if 0 |
| + LSR minimizeSubtags(const char *languageIn, const char *scriptIn, const char *regionIn, |
| + ULocale.Minimize fieldToFavor, UErrorCode &errorCode) const; |
| +#endif |
| + |
| + // visible for LocaleDistance |
| + const LocaleDistanceData &getDistanceData() const { return distanceData; } |
| + |
| +private: |
| + // Takes over the contents of data; see the constructor definition. |
| + XLikelySubtags(XLikelySubtagsData &data); |
| + XLikelySubtags(const XLikelySubtags &other) = delete; |
| + XLikelySubtags &operator=(const XLikelySubtags &other) = delete; |
| + |
| + static void initLikelySubtags(UErrorCode &errorCode); |
| + |
| + LSR makeMaximizedLsr(const char *language, const char *script, const char *region, |
| + const char *variant, UErrorCode &errorCode) const; |
| + |
| + /** |
| + * Raw access to addLikelySubtags. Input must be in canonical format, eg "en", not "eng" or "EN". |
| + */ |
| + LSR maximize(const char *language, const char *script, const char *region) const; |
| + |
| + // Feeds the subtag s (starting at index i) into iter. |
| + // Returns -1 for no match, 0 for a match without a value, otherwise the trie value. |
| + static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i); |
| + |
| + // Closed by the destructor. |
| + UResourceBundle *langInfoBundle; |
| + // We could store the strings by value, except that if there were few enough strings, |
| + // moving the contents could copy it to a different array, |
| + // invalidating the pointers stored in the maps. |
| + CharString *strings; |
| + CharStringMap languageAliases; |
| + CharStringMap regionAliases; |
| + |
| + // The trie maps each lang+script+region (encoded in ASCII) to an index into lsrs. |
| + // There is also a trie value for each intermediate lang and lang+script. |
| + // '*' is used instead of "und", "Zzzz"/"" and "ZZ"/"". |
| + BytesTrie trie; |
| + // Cached trie states after "*" and after "**", and the value found for "***". |
| + uint64_t trieUndState; |
| + uint64_t trieUndZzzzState; |
| + int32_t defaultLsrIndex; |
| + // Cached trie state after each first letter 'a'..'z'; maximize() treats 0 as "not cached". |
| + uint64_t trieFirstLetterStates[26]; |
| + // Array deleted by the destructor. |
| + const LSR *lsrs; |
| +#if U_DEBUG |
| + int32_t lsrsLength; |
| +#endif |
| + |
| + // distance/matcher data: see comment in XLikelySubtagsData::load() |
| + LocaleDistanceData distanceData; |
| +}; |
| + |
| +U_NAMESPACE_END |
| + |
| +#endif // __LOCLIKELYSUBTAGS_H__ |
| diff --git a/source/common/lsr.cpp b/source/common/lsr.cpp |
| new file mode 100644 |
| index 00000000..0c28eeda |
| --- /dev/null |
| +++ b/source/common/lsr.cpp |
| @@ -0,0 +1,101 @@ |
| +// © 2019 and later: Unicode, Inc. and others. |
| +// License & terms of use: http://www.unicode.org/copyright.html#License |
| + |
| +// lsr.cpp |
| +// created: 2019may08 Markus W. Scherer |
| + |
| +#include "unicode/utypes.h" |
| +#include "charstr.h" |
| +#include "cmemory.h" |
| +#include "cstring.h" |
| +#include "lsr.h" |
| +#include "uinvchar.h" |
| +#include "ustr_imp.h" |
| + |
| +U_NAMESPACE_BEGIN |
| + |
| +// Constructor which prepends the prefix to the language and to the script, |
| +// stores both in one owned buffer ("<prefix><lang>\0<prefix><scr>"), and aliases the region. |
| +// If errorCode indicates a failure on entry or afterwards, |
| +// language and script remain nullptr. |
| +LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCode &errorCode) : |
| +        language(nullptr), script(nullptr), region(r), |
| +        regionIndex(indexForRegion(region)) { |
| +    if (U_FAILURE(errorCode)) { return; } |
| +    CharString buffer; |
| +    buffer.append(prefix, errorCode).append(lang, errorCode).append('\0', errorCode); |
| +    // The script starts right after the NUL that terminates the language. |
| +    const int32_t scriptStart = buffer.length(); |
| +    buffer.append(prefix, errorCode).append(scr, errorCode); |
| +    owned = buffer.cloneData(errorCode); |
| +    if (U_FAILURE(errorCode)) { return; } |
| +    language = owned; |
| +    script = owned + scriptStart; |
| +} |
| + |
| +// Move constructor: copies all fields. |
| +// If the source owned its language/script buffer, ownership moves to this object |
| +// and the source is reset to empty language/script strings and no hash code. |
| +LSR::LSR(LSR &&other) U_NOEXCEPT : |
| +        language(other.language), |
| +        script(other.script), |
| +        region(other.region), |
| +        owned(other.owned), |
| +        regionIndex(other.regionIndex), |
| +        hashCode(other.hashCode) { |
| +    if (owned == nullptr) { return; }  // only aliases: nothing to take away from other |
| +    other.language = other.script = ""; |
| +    other.owned = nullptr; |
| +    other.hashCode = 0; |
| +} |
| + |
| +// Frees the owned buffer. Kept out of line; the inline destructor calls it |
| +// only when owned is not nullptr. |
| +void LSR::deleteOwned() { |
| + uprv_free(owned); |
| +} |
| + |
| +// Move assignment: releases the buffer this object owns (if any), then copies all fields. |
| +// If the source owned its language/script buffer, ownership moves to this object |
| +// and the source is reset to empty language/script strings and no hash code. |
| +// Self-assignment is a no-op; without the check the owned buffer would be freed |
| +// while language, script and owned still point to it. |
| +LSR &LSR::operator=(LSR &&other) U_NOEXCEPT { |
| +    if (this == &other) { |
| +        return *this; |
| +    } |
| +    // Same cleanup as in the destructor, without ending this object's lifetime. |
| +    if (owned != nullptr) { |
| +        deleteOwned(); |
| +    } |
| +    language = other.language; |
| +    script = other.script; |
| +    region = other.region; |
| +    regionIndex = other.regionIndex; |
| +    owned = other.owned; |
| +    hashCode = other.hashCode; |
| +    if (owned != nullptr) { |
| +        other.language = other.script = ""; |
| +        other.owned = nullptr; |
| +        other.hashCode = 0; |
| +    } |
| +    return *this; |
| +} |
| + |
| +// Two LSRs are equal if their language and script strings are equal |
| +// and their regions are the same. Well-formed regions (index > 0) are compared |
| +// by index alone; the region strings are compared only if both are ill-formed (index 0). |
| +UBool LSR::operator==(const LSR &other) const { |
| +    if (uprv_strcmp(language, other.language) != 0) { return false; } |
| +    if (uprv_strcmp(script, other.script) != 0) { return false; } |
| +    if (regionIndex != other.regionIndex) { return false; } |
| +    return regionIndex > 0 || uprv_strcmp(region, other.region) == 0; |
| +} |
| + |
| +// Maps a region code to its index: |
| +// three digits "ddd" -> 1 + numeric value (1..1000), |
| +// two letters "LL" (either case, via uprv_upperOrdinal()) -> 1001 + 26 * first + second. |
| +// Anything else is ill-formed and maps to 0. |
| +int32_t LSR::indexForRegion(const char *region) { |
| +    const int32_t first = region[0]; |
| +    const int32_t digit0 = first - '0'; |
| +    if (0 <= digit0 && digit0 <= 9) {  // digits: "419" |
| +        const int32_t digit1 = region[1] - '0'; |
| +        if (digit1 < 0 || 9 < digit1) { return 0; } |
| +        const int32_t digit2 = region[2] - '0'; |
| +        if (digit2 < 0 || 9 < digit2 || region[3] != 0) { return 0; } |
| +        return (10 * digit0 + digit1) * 10 + digit2 + 1; |
| +    } |
| +    // letters: "DE" |
| +    const int32_t letter0 = uprv_upperOrdinal(first); |
| +    if (letter0 < 0 || 25 < letter0) { return 0; } |
| +    const int32_t letter1 = uprv_upperOrdinal(region[1]); |
| +    if (letter1 < 0 || 25 < letter1 || region[2] != 0) { return 0; } |
| +    return 26 * letter0 + letter1 + 1001; |
| +} |
| + |
| +// Computes and stores the hash code from language, script and regionIndex, |
| +// unless a non-zero hash code is already stored. Returns *this. |
| +LSR &LSR::setHashCode() { |
| +    if (hashCode != 0) { |
| +        return *this; |
| +    } |
| +    const auto languageHash = |
| +        ustr_hashCharsN(language, static_cast<int32_t>(uprv_strlen(language))); |
| +    const auto scriptHash = |
| +        ustr_hashCharsN(script, static_cast<int32_t>(uprv_strlen(script))); |
| +    hashCode = (languageHash * 37 + scriptHash) * 37 + regionIndex; |
| +    return *this; |
| +} |
| + |
| +U_NAMESPACE_END |
| diff --git a/source/common/lsr.h b/source/common/lsr.h |
| new file mode 100644 |
| index 00000000..db6cf938 |
| --- /dev/null |
| +++ b/source/common/lsr.h |
| @@ -0,0 +1,72 @@ |
| +// © 2019 and later: Unicode, Inc. and others. |
| +// License & terms of use: http://www.unicode.org/copyright.html#License |
| + |
| +// lsr.h |
| +// created: 2019may08 Markus W. Scherer |
| + |
| +#ifndef __LSR_H__ |
| +#define __LSR_H__ |
| + |
| +#include "unicode/utypes.h" |
| +#include "unicode/uobject.h" |
| +#include "cstring.h" |
| + |
| +U_NAMESPACE_BEGIN |
| + |
| +struct LSR final : public UMemory { |
| + static constexpr int32_t REGION_INDEX_LIMIT = 1001 + 26 * 26;  // Above every indexForRegion() result. |
| + |
| + const char *language;  // Aliased, or pointing into owned (see the constructors). |
| + const char *script;  // Aliased, or pointing into owned (see the constructors). |
| + const char *region;  // Aliased by the constructors declared here. |
| + char *owned = nullptr;  // Buffer released via deleteOwned(); nullptr if nothing is owned. |
| + /** Index for region, 0 if ill-formed. @see indexForRegion */ |
| + int32_t regionIndex = 0; |
| + /** Only set for LSRs that will be used in a hash table. 0 until setHashCode() computes it. */ |
| + int32_t hashCode = 0; |
| + |
| + LSR() : language("und"), script(""), region("") {}  // regionIndex and hashCode stay 0. |
| + |
| + /** Constructor which aliases all subtag pointers; nothing is copied or owned. */ |
| + LSR(const char *lang, const char *scr, const char *r) : |
| + language(lang), script(scr), region(r), |
| + regionIndex(indexForRegion(region)) {} |
| + /** |
| + * Constructor which prepends the prefix to the language and script, |
| + * copies those into owned memory, and aliases the region. |
| + */ |
| + LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCode &errorCode); |
| + LSR(LSR &&other) U_NOEXCEPT; |
| + LSR(const LSR &other) = delete; |
| + inline ~LSR() { |
| + // Pure inline code for almost all instances: only an LSR with owned memory calls out of line. |
| + if (owned != nullptr) { |
| + deleteOwned(); |
| + } |
| + } |
| + |
| + LSR &operator=(LSR &&other) U_NOEXCEPT; |
| + LSR &operator=(const LSR &other) = delete; |
| + |
| + /** |
| + * Returns a positive index (>0) for a well-formed region code. |
| + * Do not rely on a particular region->index mapping; it may change. |
| + * Returns 0 for ill-formed strings. |
| + */ |
| + static int32_t indexForRegion(const char *region); |
| + |
| + UBool operator==(const LSR &other) const; |
| + |
| + inline UBool operator!=(const LSR &other) const { |
| + return !operator==(other); |
| + } |
| + |
| + LSR &setHashCode();  // Computes hashCode if it is still 0; returns *this. |
| + |
| +private: |
| + void deleteOwned();  // Frees owned with uprv_free(). |
| +}; |
| + |
| +U_NAMESPACE_END |
| + |
| +#endif // __LSR_H__ |
| diff --git a/source/common/resource.h b/source/common/resource.h |
| index ee93d41a..5199b858 100644 |
| --- a/source/common/resource.h |
| +++ b/source/common/resource.h |
| @@ -94,13 +94,20 @@ public: |
| */ |
| int32_t getSize() const { return length; } |
| /** |
| - * @param i Array item index. |
| + * @param i Table item index. |
| * @param key Output-only, receives the key of the i'th item. |
| * @param value Output-only, receives the value of the i'th item. |
| * @return TRUE if i is non-negative and less than getSize(). |
| */ |
| UBool getKeyAndValue(int32_t i, const char *&key, ResourceValue &value) const; |
| |
| + /** |
| + * @param key Key string to find in the table. |
| + * @param value Output-only, receives the value of the item with that key. |
| + * @return TRUE if the table contains the key. |
| + */ |
| + UBool findValue(const char *key, ResourceValue &value) const; |
| + |
| private: |
| const uint16_t *keys16; |
| const int32_t *keys32; |
| diff --git a/source/common/ucln_cmn.h b/source/common/ucln_cmn.h |
| index 0ca911b4..c4b22ca4 100644 |
| --- a/source/common/ucln_cmn.h |
| +++ b/source/common/ucln_cmn.h |
| @@ -41,6 +41,8 @@ typedef enum ECleanupCommonType { |
| UCLN_COMMON_LOCALE_KEY_TYPE, |
| UCLN_COMMON_LOCALE, |
| UCLN_COMMON_LOCALE_AVAILABLE, |
| + UCLN_COMMON_LIKELY_SUBTAGS, |
| + UCLN_COMMON_LOCALE_DISTANCE, |
| UCLN_COMMON_ULOC, |
| UCLN_COMMON_CURRENCY, |
| UCLN_COMMON_LOADED_NORMALIZER2, |
| diff --git a/source/common/uinvchar.cpp b/source/common/uinvchar.cpp |
| index 8ce2350d..6e5fb48c 100644 |
| --- a/source/common/uinvchar.cpp |
| +++ b/source/common/uinvchar.cpp |
| @@ -445,6 +445,13 @@ uprv_copyEbcdic(const UDataSwapper *ds, |
| return length; |
| } |
| |
| +U_CFUNC UBool |
| +uprv_isEbcdicAtSign(char c) { |
| +    // Byte values accepted as a possible '@' sign; 0 is never one of them. |
| +    static const uint8_t candidates[] = { |
| +        0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF }; |
| +    const uint8_t byte = static_cast<uint8_t>(c); |
| +    for (uint8_t candidate : candidates) { |
| +        if (candidate == byte) { return true; } |
| +    } |
| +    return false; |
| +} |
| + |
| /* compare invariant strings; variant characters compare less than others and unlike each other */ |
| U_CFUNC int32_t |
| uprv_compareInvAscii(const UDataSwapper *ds, |
| @@ -561,6 +568,11 @@ uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2) { |
| } |
| } |
| |
| +U_CAPI char U_EXPORT2 |
| +uprv_ebcdicToAscii(char c) { |
| +    // Index with the unsigned byte value so that bytes >= 0x80 |
| +    // do not turn into negative offsets where char is signed. |
| +    const uint8_t index = static_cast<uint8_t>(c); |
| +    return static_cast<char>(asciiFromEbcdic[index]); |
| +} |
| + |
| U_CAPI char U_EXPORT2 |
| uprv_ebcdicToLowercaseAscii(char c) { |
| return (char)lowercaseAsciiFromEbcdic[(uint8_t)c]; |
| diff --git a/source/common/uinvchar.h b/source/common/uinvchar.h |
| index 56dddfa8..a43cfcd9 100644 |
| --- a/source/common/uinvchar.h |
| +++ b/source/common/uinvchar.h |
| @@ -68,6 +68,75 @@ uprv_isInvariantUString(const UChar *s, int32_t length); |
| # error Unknown charset family! |
| #endif |
| |
| +#ifdef __cplusplus |
| + |
| +U_NAMESPACE_BEGIN |
| + |
| +/** |
| + * Like U_UPPER_ORDINAL(x) but with validation. |
| + * Returns 0..25 for A..Z else a value outside 0..25 (negative or greater than 25). |
| + */ |
| +inline int32_t uprv_upperOrdinal(int32_t c) { |
| +#if U_CHARSET_FAMILY==U_ASCII_FAMILY |
| + return c - 'A'; // negative below 'A', greater than 25 above 'Z' |
| +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY |
| + // EBCDIC: A-Z (26 letters) is split into three ranges A-I (9 letters), J-R (9), S-Z (8). |
| + // https://en.wikipedia.org/wiki/EBCDIC_037#Codepage_layout |
| + if (c <= 'I') { return c - 'A'; } // A-I --> 0..8; negative if c is below 'A' |
| + if (c < 'J') { return -1; } // gap between 'I' and 'J' |
| + if (c <= 'R') { return c - 'J' + 9; } // J-R --> 9..17 |
| + if (c < 'S') { return -1; } // gap between 'R' and 'S' |
| + return c - 'S' + 18; // S-Z --> 18..25; greater than 25 if c is above 'Z' |
| +#else |
| +# error Unknown charset family! |
| +#endif |
| +} |
| + |
| +// Like U_UPPER_ORDINAL(x) but for lowercase letters and with validation. |
| +// Returns 0..25 for a..z else a value outside 0..25 (negative or greater than 25). |
| +inline int32_t uprv_lowerOrdinal(int32_t c) { |
| +#if U_CHARSET_FAMILY==U_ASCII_FAMILY |
| + return c - 'a'; // negative below 'a', greater than 25 above 'z' |
| +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY |
| + // EBCDIC: a-z (26 letters) is split into three ranges a-i (9 letters), j-r (9), s-z (8). |
| + // https://en.wikipedia.org/wiki/EBCDIC_037#Codepage_layout |
| + if (c <= 'i') { return c - 'a'; } // a-i --> 0..8; negative if c is below 'a' |
| + if (c < 'j') { return -1; } // gap between 'i' and 'j' |
| + if (c <= 'r') { return c - 'j' + 9; } // j-r --> 9..17 |
| + if (c < 's') { return -1; } // gap between 'r' and 's' |
| + return c - 's' + 18; // s-z --> 18..25; greater than 25 if c is above 'z' |
| +#else |
| +# error Unknown charset family! |
| +#endif |
| +} |
| + |
| +U_NAMESPACE_END |
| + |
| +#endif |
| + |
| +/** |
| + * Returns true if c == '@' is possible. |
| + * The @ sign is variant, and the @ sign used on one |
| + * EBCDIC machine won't be compiled the same way on other EBCDIC based machines. |
| + * @internal |
| + */ |
| +U_CFUNC UBool |
| +uprv_isEbcdicAtSign(char c); |
| + |
| +/** |
| + * \def uprv_isAtSign |
| + * Returns true if c == '@' is possible. |
| + * For ASCII, checks for exactly '@'. For EBCDIC, calls uprv_isEbcdicAtSign(). |
| + * @internal |
| + */ |
| +#if U_CHARSET_FAMILY==U_ASCII_FAMILY |
| +# define uprv_isAtSign(c) ((c)=='@') |
| +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY |
| +# define uprv_isAtSign(c) uprv_isEbcdicAtSign(c) |
| +#else |
| +# error Unknown charset family! |
| +#endif |
| + |
| /** |
| * Compare two EBCDIC invariant-character strings in ASCII order. |
| * @internal |
| @@ -88,6 +157,26 @@ uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2); |
| # error Unknown charset family! |
| #endif |
| |
| +/** |
| + * Converts an EBCDIC invariant character to ASCII. |
| + * @internal |
| + */ |
| +U_INTERNAL char U_EXPORT2 |
| +uprv_ebcdicToAscii(char c); |
| + |
| +/** |
| + * \def uprv_invCharToAscii |
| + * Converts an invariant character to ASCII. |
| + * @internal |
| + */ |
| +#if U_CHARSET_FAMILY==U_ASCII_FAMILY |
| +# define uprv_invCharToAscii(c) (c) |
| +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY |
| +# define uprv_invCharToAscii(c) uprv_ebcdicToAscii(c) |
| +#else |
| +# error Unknown charset family! |
| +#endif |
| + |
| /** |
| * Converts an EBCDIC invariant character to lowercase ASCII. |
| * @internal |
| diff --git a/source/common/unicode/localebuilder.h b/source/common/unicode/localebuilder.h |
| index 960e5980..a91a0b51 100644 |
| --- a/source/common/unicode/localebuilder.h |
| +++ b/source/common/unicode/localebuilder.h |
| @@ -4,6 +4,7 @@ |
| #define __LOCALEBUILDER_H__ |
| |
| #include "unicode/locid.h" |
| +#include "unicode/localematcher.h" |
| #include "unicode/stringpiece.h" |
| #include "unicode/uobject.h" |
| #include "unicode/utypes.h" |
| @@ -277,6 +278,10 @@ public: |
| Locale build(UErrorCode& status); |
| |
| private: |
| + friend class LocaleMatcher::Result; |
| + |
| + void copyExtensionsFrom(const Locale& src, UErrorCode& errorCode); |
| + |
| UErrorCode status_; |
| char language_[9]; |
| char script_[5]; |
| diff --git a/source/common/unicode/localematcher.h b/source/common/unicode/localematcher.h |
| new file mode 100644 |
| index 00000000..701123f7 |
| --- /dev/null |
| +++ b/source/common/unicode/localematcher.h |
| @@ -0,0 +1,605 @@ |
| +// © 2019 and later: Unicode, Inc. and others. |
| +// License & terms of use: http://www.unicode.org/copyright.html#License |
| + |
| +// localematcher.h |
| +// created: 2019may08 Markus W. Scherer |
| + |
| +#ifndef __LOCALEMATCHER_H__ |
| +#define __LOCALEMATCHER_H__ |
| + |
| +#include "unicode/utypes.h" |
| + |
| +#if U_SHOW_CPLUSPLUS_API |
| + |
| +#include "unicode/locid.h" |
| +#include "unicode/stringpiece.h" |
| +#include "unicode/uobject.h" |
| + |
| +/** |
| + * \file |
| + * \brief C++ API: Locale matcher: User's desired locales vs. application's supported locales. |
| + */ |
| + |
| +#ifndef U_HIDE_DRAFT_API |
| + |
| +/** |
| + * Builder option for whether the language subtag or the script subtag is most important. |
| + * |
| + * @see LocaleMatcher::Builder::setFavorSubtag(ULocMatchFavorSubtag) |
| + * @draft ICU 65 |
| + */ |
| +enum ULocMatchFavorSubtag { |
| + /** |
| + * Language differences are most important, then script differences, then region differences. |
| + * (This is the default behavior.) |
| + * |
| + * @draft ICU 65 |
| + */ |
| + ULOCMATCH_FAVOR_LANGUAGE, |
| + /** |
| + * Makes script differences matter relatively more than language differences. |
| + * |
| + * @draft ICU 65 |
| + */ |
| + ULOCMATCH_FAVOR_SCRIPT |
| +}; |
| +#ifndef U_IN_DOXYGEN |
| +typedef enum ULocMatchFavorSubtag ULocMatchFavorSubtag; |
| +#endif |
| + |
| +/** |
| + * Builder option for whether all desired locales are treated equally or |
| + * earlier ones are preferred. |
| + * |
| + * @see LocaleMatcher::Builder::setDemotionPerDesiredLocale(ULocMatchDemotion) |
| + * @draft ICU 65 |
| + */ |
| +enum ULocMatchDemotion { |
| + /** |
| + * All desired locales are treated equally. |
| + * |
| + * @draft ICU 65 |
| + */ |
| + ULOCMATCH_DEMOTION_NONE, |
| + /** |
| + * Earlier desired locales are preferred. |
| + * |
| + * <p>From each desired locale to the next, |
| + * the distance to any supported locale is increased by an additional amount |
| + * which is at least as large as most region mismatches. |
| + * A later desired locale has to have a better match with some supported locale |
| + * due to more than merely having the same region subtag. |
| + * |
| + * <p>For example: <code>Supported={en, sv} desired=[en-GB, sv]</code> |
| + * yields <code>Result(en-GB, en)</code> because |
| + * with the demotion of sv its perfect match is no better than |
| + * the region distance between the earlier desired locale en-GB and en=en-US. |
| + * |
| + * <p>Notes: |
| + * <ul> |
| + * <li>In some cases, language and/or script differences can be as small as |
| + * the typical region difference. (Example: sr-Latn vs. sr-Cyrl) |
| + * <li>It is possible for certain region differences to be larger than usual, |
| + * and larger than the demotion. |
| + * (As of CLDR 35 there is no such case, but |
| + * this is possible in future versions of the data.) |
| + * </ul> |
| + * |
| + * @draft ICU 65 |
| + */ |
| + ULOCMATCH_DEMOTION_REGION |
| +}; |
| +#ifndef U_IN_DOXYGEN |
| +typedef enum ULocMatchDemotion ULocMatchDemotion; |
| +#endif |
| + |
| +struct UHashtable; |
| + |
| +U_NAMESPACE_BEGIN |
| + |
| +struct LSR; |
| + |
| +class LocaleDistance; |
| +class LocaleLsrIterator; |
| +class UVector; |
| +class XLikelySubtags; |
| + |
| +/** |
| + * Immutable class that picks the best match between a user's desired locales and |
| + * an application's supported locales. |
| + * Movable but not copyable. |
| + * |
| + * <p>Example: |
| + * <pre> |
| + * UErrorCode errorCode = U_ZERO_ERROR; |
| + * LocaleMatcher matcher = LocaleMatcher::Builder().setSupportedLocalesFromListString("fr, en-GB, en").build(errorCode); |
| + * const Locale *bestSupported = matcher.getBestMatch(Locale::getUS(), errorCode); // "en" |
| + * </pre> |
| + * |
| + * <p>A matcher takes into account when languages are close to one another, |
| + * such as Danish and Norwegian, |
| + * and when regional variants are close, like en-GB and en-AU as opposed to en-US. |
| + * |
| + * <p>If there are multiple supported locales with the same (language, script, region) |
| + * likely subtags, then the current implementation returns the first of those locales. |
| + * It ignores variant subtags (except for pseudolocale variants) and extensions. |
| + * This may change in future versions. |
| + * |
| + * <p>For example, the current implementation does not distinguish between |
| + * de, de-DE, de-Latn, de-1901, de-u-co-phonebk. |
| + * |
| + * <p>If you prefer one equivalent locale over another, then provide only the preferred one, |
| + * or place it earlier in the list of supported locales. |
| + * |
| + * <p>Otherwise, the order of supported locales may have no effect on the best-match results. |
| + * The current implementation compares each desired locale with supported locales |
| + * in the following order: |
| + * 1. Default locale, if supported; |
| + * 2. CLDR "paradigm locales" like en-GB and es-419; |
| + * 3. other supported locales. |
| + * This may change in future versions. |
| + * |
| + * <p>Often a product will just need one matcher instance, built with the languages |
| + * that it supports. However, it may want multiple instances with different |
| + * default languages based on additional information, such as the domain. |
| + * |
| + * <p>This class is not intended for public subclassing. |
| + * |
| + * @draft ICU 65 |
| + */ |
| +class U_COMMON_API LocaleMatcher : public UMemory { |
| +public: |
| + /** |
| + * Data for the best-matching pair of a desired and a supported locale. |
| + * Movable but not copyable. |
| + * |
| + * @draft ICU 65 |
| + */ |
| + class U_COMMON_API Result : public UMemory { |
| + public: |
| + /** |
| + * Move constructor; might modify the source. |
| + * This object will have the same contents that the source object had. |
| + * |
| + * @param src Result to move contents from. |
| + * @draft ICU 65 |
| + */ |
| + Result(Result &&src) U_NOEXCEPT; |
| + |
| + /** |
| + * Destructor. |
| + * |
| + * @draft ICU 65 |
| + */ |
| + ~Result(); |
| + |
| + /** |
| + * Move assignment; might modify the source. |
| + * This object will have the same contents that the source object had. |
| + * |
| + * @param src Result to move contents from. |
| + * @draft ICU 65 |
| + */ |
| + Result &operator=(Result &&src) U_NOEXCEPT; |
| + |
| + /** |
| + * Returns the best-matching desired locale. |
| + * nullptr if the list of desired locales is empty or if none matched well enough. |
| + * |
| + * @return the best-matching desired locale, or nullptr. |
| + * @draft ICU 65 |
| + */ |
| + inline const Locale *getDesiredLocale() const { return desiredLocale; } |
| + |
| + /** |
| + * Returns the best-matching supported locale. |
| + * If none matched well enough, this is the default locale. |
| + * The default locale is nullptr if the list of supported locales is empty and |
| + * no explicit default locale is set. |
| + * |
| + * @return the best-matching supported locale, or nullptr. |
| + * @draft ICU 65 |
| + */ |
| + inline const Locale *getSupportedLocale() const { return supportedLocale; } |
| + |
| + /** |
| + * Returns the index of the best-matching desired locale in the input Iterable order. |
| + * -1 if the list of desired locales is empty or if none matched well enough. |
| + * |
| + * @return the index of the best-matching desired locale, or -1. |
| + * @draft ICU 65 |
| + */ |
| + inline int32_t getDesiredIndex() const { return desiredIndex; } |
| + |
| + /** |
| + * Returns the index of the best-matching supported locale in the |
| + * constructor’s or builder’s input order (“set” Collection plus “added” locales). |
| + * If the matcher was built from a locale list string, then the iteration order is that |
| + * of a LocalePriorityList built from the same string. |
| + * -1 if the list of supported locales is empty or if none matched well enough. |
| + * |
| + * @return the index of the best-matching supported locale, or -1. |
| + * @draft ICU 65 |
| + */ |
| + inline int32_t getSupportedIndex() const { return supportedIndex; } |
| + |
| + /** |
| + * Takes the best-matching supported locale and adds relevant fields of the |
| + * best-matching desired locale, such as the -t- and -u- extensions. |
| + * May replace some fields of the supported locale. |
| + * The result is the locale that should be used for date and number formatting, collation, etc. |
| + * Returns the root locale if getSupportedLocale() returns nullptr. |
| + * |
| + * <p>Example: desired=ar-SA-u-nu-latn, supported=ar-EG, resolved locale=ar-SA-u-nu-latn |
| + * |
| + * @return a locale combining the best-matching desired and supported locales. |
| + * @draft ICU 65 |
| + */ |
| + Locale makeResolvedLocale(UErrorCode &errorCode) const; |
| + |
| + private: |
| + Result(const Locale *desired, const Locale *supported, |
| + int32_t desIndex, int32_t suppIndex, UBool owned) : |
| + desiredLocale(desired), supportedLocale(supported), |
| + desiredIndex(desIndex), supportedIndex(suppIndex), |
| + desiredIsOwned(owned) {} |
| + |
| + Result(const Result &other) = delete; |
| + Result &operator=(const Result &other) = delete; |
| + |
| + const Locale *desiredLocale; |
| + const Locale *supportedLocale; |
| + int32_t desiredIndex; |
| + int32_t supportedIndex; |
| + UBool desiredIsOwned; |
| + |
| + friend class LocaleMatcher; |
| + }; |
| + |
| + /** |
| + * LocaleMatcher builder. |
| + * Movable but not copyable. |
| + * |
| + * @see LocaleMatcher#builder() |
| + * @draft ICU 65 |
| + */ |
| + class U_COMMON_API Builder : public UMemory { |
| + public: |
| + /** |
| + * Constructs a builder used in chaining parameters for building a LocaleMatcher. |
| + * |
| + * @return a new Builder object |
| + * @draft ICU 65 |
| + */ |
| + Builder() {} |
| + |
| + /** |
| + * Move constructor; might modify the source. |
| + * This builder will have the same contents that the source builder had. |
| + * |
| + * @param src Builder to move contents from. |
| + * @draft ICU 65 |
| + */ |
| + Builder(Builder &&src) U_NOEXCEPT; |
| + |
| + /** |
| + * Destructor. |
| + * |
| + * @draft ICU 65 |
| + */ |
| + ~Builder(); |
| + |
| + /** |
| + * Move assignment; might modify the source. |
| + * This builder will have the same contents that the source builder had. |
| + * |
| + * @param src Builder to move contents from. |
| + * @draft ICU 65 |
| + */ |
| + Builder &operator=(Builder &&src) U_NOEXCEPT; |
| + |
| + /** |
| + * Parses an Accept-Language string |
| + * (<a href="https://tools.ietf.org/html/rfc2616#section-14.4">RFC 2616 Section 14.4</a>), |
| + * such as "af, en, fr;q=0.9", and sets the supported locales accordingly. |
| + * Allows whitespace in more places but does not allow "*". |
| + * Clears any previously set/added supported locales first. |
| + * |
| + * @param locales the Accept-Language string of locales to set |
| + * @return this Builder object |
| + * @draft ICU 65 |
| + */ |
| + Builder &setSupportedLocalesFromListString(StringPiece locales); |
| + |
| + /** |
| + * Copies the supported locales, preserving iteration order. |
| + * Clears any previously set/added supported locales first. |
| + * Duplicates are allowed, and are not removed. |
| + * |
| + * @param locales the list of locales |
| + * @return this Builder object |
| + * @draft ICU 65 |
| + */ |
| + Builder &setSupportedLocales(Locale::Iterator &locales); |
| + |
| + /** |
| + * Copies the supported locales from the begin/end range, preserving iteration order. |
| + * Clears any previously set/added supported locales first. |
| + * Duplicates are allowed, and are not removed. |
| + * |
| + * Each of the iterator parameter values must be an |
| + * input iterator whose value is convertible to const Locale &. |
| + * |
| + * @param begin Start of range. |
| + * @param end Exclusive end of range. |
| + * @return this Builder object |
| + * @draft ICU 65 |
| + */ |
| + template<typename Iter> |
| + Builder &setSupportedLocales(Iter begin, Iter end) { |
| +     // An error recorded by an earlier builder call turns this call into a no-op. |
| +     if (U_SUCCESS(errorCode_)) { |
| +         clearSupportedLocales(); |
| +         for (; begin != end; ++begin) { |
| +             addSupportedLocale(*begin); |
| +         } |
| +     } |
| +     return *this; |
| + } |
| + |
| + /** |
| + * Copies the supported locales from the begin/end range, preserving iteration order. |
| + * Calls the converter to convert each *begin to a Locale or const Locale &. |
| + * Clears any previously set/added supported locales first. |
| + * Duplicates are allowed, and are not removed. |
| + * |
| + * Each of the iterator parameter values must be an |
| + * input iterator whose value the converter accepts as its argument. |
| + * |
| + * @param begin Start of range. |
| + * @param end Exclusive end of range. |
| + * @param converter Converter from *begin to const Locale & or compatible. |
| + * @return this Builder object |
| + * @draft ICU 65 |
| + */ |
| + template<typename Iter, typename Conv> |
| + Builder &setSupportedLocalesViaConverter(Iter begin, Iter end, Conv converter) { |
| +     // An error recorded by an earlier builder call turns this call into a no-op. |
| +     if (U_SUCCESS(errorCode_)) { |
| +         clearSupportedLocales(); |
| +         for (; begin != end; ++begin) { |
| +             addSupportedLocale(converter(*begin)); |
| +         } |
| +     } |
| +     return *this; |
| + } |
| + |
| + /** |
| + * Adds another supported locale. |
| + * Duplicates are allowed, and are not removed. |
| + * |
| + * @param locale another locale |
| + * @return this Builder object |
| + * @draft ICU 65 |
| + */ |
| + Builder &addSupportedLocale(const Locale &locale); |
| + |
| + /** |
| + * Sets the default locale; if nullptr, or if it is not set explicitly, |
| + * then the first supported locale is used as the default locale. |
| + * |
| + * @param defaultLocale the default locale (will be copied) |
| + * @return this Builder object |
| + * @draft ICU 65 |
| + */ |
| + Builder &setDefaultLocale(const Locale *defaultLocale); |
| + |
| + /** |
| + * If ULOCMATCH_FAVOR_SCRIPT, then the language differences are smaller than script |
| + * differences. |
| + * This is used in situations (such as maps) where |
| + * it is better to fall back to the same script than a similar language. |
| + * |
| + * @param subtag the subtag to favor |
| + * @return this Builder object |
| + * @draft ICU 65 |
| + */ |
| + Builder &setFavorSubtag(ULocMatchFavorSubtag subtag); |
| + |
| + /** |
| + * Option for whether all desired locales are treated equally or |
| + * earlier ones are preferred (this is the default). |
| + * |
| + * @param demotion the demotion per desired locale to set. |
| + * @return this Builder object |
| + * @draft ICU 65 |
| + */ |
| + Builder &setDemotionPerDesiredLocale(ULocMatchDemotion demotion); |
| + |
| + /** |
| + * Sets the UErrorCode if an error occurred while setting parameters. |
| + * Preserves older error codes in the outErrorCode. |
| + * |
| + * @param outErrorCode Set to an error code if it does not contain one already |
| + * and an error occurred while setting parameters. |
| + * Otherwise unchanged. |
| + * @return TRUE if U_FAILURE(outErrorCode) |
| + * @draft ICU 65 |
| + */ |
| + UBool copyErrorTo(UErrorCode &outErrorCode) const; |
| + |
| + /** |
| + * Builds and returns a new locale matcher. |
| + * This builder can continue to be used. |
| + * |
| + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
| + * or else the function returns immediately. Check for U_FAILURE() |
| + * on output or use with function chaining. (See User Guide for details.) |
| + * @return new LocaleMatcher. |
| + * @draft ICU 65 |
| + */ |
| + LocaleMatcher build(UErrorCode &errorCode) const; |
| + |
| + private: |
| + friend class LocaleMatcher; |
| + |
| + Builder(const Builder &other) = delete; |
| + Builder &operator=(const Builder &other) = delete; |
| + |
| + void clearSupportedLocales(); |
| + bool ensureSupportedLocaleVector(); |
| + |
| + UErrorCode errorCode_ = U_ZERO_ERROR; |
| + UVector *supportedLocales_ = nullptr; |
| + int32_t thresholdDistance_ = -1; |
| + ULocMatchDemotion demotion_ = ULOCMATCH_DEMOTION_REGION; |
| + Locale *defaultLocale_ = nullptr; |
| + ULocMatchFavorSubtag favor_ = ULOCMATCH_FAVOR_LANGUAGE; |
| + }; |
| + |
| + // FYI No public LocaleMatcher constructors in C++; use the Builder. |
| + |
| + /** |
| + * Move constructor; might modify the source. |
| + * This matcher will have the same settings that the source matcher had. |
| + * @param src source matcher |
| + * @draft ICU 65 |
| + */ |
| + LocaleMatcher(LocaleMatcher &&src) U_NOEXCEPT; |
| + |
| + /** |
| + * Destructor. |
| + * @draft ICU 65 |
| + */ |
| + ~LocaleMatcher(); |
| + |
| + /** |
| + * Move assignment operator; might modify the source. |
| + * This matcher will have the same settings that the source matcher had. |
| + * The behavior is undefined if *this and src are the same object. |
| + * @param src source matcher |
| + * @return *this |
| + * @draft ICU 65 |
| + */ |
| + LocaleMatcher &operator=(LocaleMatcher &&src) U_NOEXCEPT; |
| + |
| + /** |
| + * Returns the supported locale which best matches the desired locale. |
| + * |
| + * @param desiredLocale Typically a user's language. |
| + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
| + * or else the function returns immediately. Check for U_FAILURE() |
| + * on output or use with function chaining. (See User Guide for details.) |
| + * @return the best-matching supported locale. |
| + * @draft ICU 65 |
| + */ |
| + const Locale *getBestMatch(const Locale &desiredLocale, UErrorCode &errorCode) const; |
| + |
| + /** |
| + * Returns the supported locale which best matches one of the desired locales. |
| + * |
| + * @param desiredLocales Typically a user's languages, in order of preference (descending). |
| + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
| + * or else the function returns immediately. Check for U_FAILURE() |
| + * on output or use with function chaining. (See User Guide for details.) |
| + * @return the best-matching supported locale. |
| + * @draft ICU 65 |
| + */ |
| + const Locale *getBestMatch(Locale::Iterator &desiredLocales, UErrorCode &errorCode) const; |
| + |
| + /** |
| + * Parses an Accept-Language string |
| + * (<a href="https://tools.ietf.org/html/rfc2616#section-14.4">RFC 2616 Section 14.4</a>), |
| + * such as "af, en, fr;q=0.9", |
| + * and returns the supported locale which best matches one of the desired locales. |
| + * Allows whitespace in more places but does not allow "*". |
| + * |
| + * @param desiredLocaleList Typically a user's languages, as an Accept-Language string. |
| + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
| + * or else the function returns immediately. Check for U_FAILURE() |
| + * on output or use with function chaining. (See User Guide for details.) |
| + * @return the best-matching supported locale. |
| + * @draft ICU 65 |
| + */ |
| + const Locale *getBestMatchForListString(StringPiece desiredLocaleList, UErrorCode &errorCode) const; |
| + |
| + /** |
| + * Returns the best match between the desired locale and the supported locales. |
| + * If the result's desired locale is not nullptr, then it is the address of the input locale. |
| + * It has not been cloned. |
| + * |
| + * @param desiredLocale Typically a user's language. |
| + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
| + * or else the function returns immediately. Check for U_FAILURE() |
| + * on output or use with function chaining. (See User Guide for details.) |
| + * @return the best-matching pair of the desired and a supported locale. |
| + * @draft ICU 65 |
| + */ |
| + Result getBestMatchResult(const Locale &desiredLocale, UErrorCode &errorCode) const; |
| + |
| + /** |
| + * Returns the best match between the desired and supported locales. |
| + * If the result's desired locale is not nullptr, then it is a clone of |
| + * the best-matching desired locale. The Result object owns the clone. |
| + * |
| + * @param desiredLocales Typically a user's languages, in order of preference (descending). |
| + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
| + * or else the function returns immediately. Check for U_FAILURE() |
| + * on output or use with function chaining. (See User Guide for details.) |
| + * @return the best-matching pair of a desired and a supported locale. |
| + * @draft ICU 65 |
| + */ |
| + Result getBestMatchResult(Locale::Iterator &desiredLocales, UErrorCode &errorCode) const; |
| + |
| +#ifndef U_HIDE_INTERNAL_API |
| + /** |
| + * Returns a fraction between 0 and 1, where 1 means that the languages are a |
| + * perfect match, and 0 means that they are completely different. |
| + * |
| + * <p>This is mostly an implementation detail, and the precise values may change over time. |
| + * The implementation may use either the maximized forms or the other ones, or both. |
| + * The implementation may or may not rely on the forms to be consistent with each other. |
| + * |
| + * <p>Callers should construct and use a matcher rather than match pairs of locales directly. |
| + * |
| + * @param desired Desired locale. |
| + * @param supported Supported locale. |
| + * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
| + * or else the function returns immediately. Check for U_FAILURE() |
| + * on output or use with function chaining. (See User Guide for details.) |
| + * @return value between 0 and 1, inclusive. |
| + * @internal (has a known user) |
| + */ |
| + double internalMatch(const Locale &desired, const Locale &supported, UErrorCode &errorCode) const; |
| +#endif // U_HIDE_INTERNAL_API |
| + |
| +private: |
| + LocaleMatcher(const Builder &builder, UErrorCode &errorCode); |
| + LocaleMatcher(const LocaleMatcher &other) = delete; |
| + LocaleMatcher &operator=(const LocaleMatcher &other) = delete; |
| + |
| + int32_t getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remainingIter, UErrorCode &errorCode) const; |
| + |
| + const XLikelySubtags &likelySubtags; |
| + const LocaleDistance &localeDistance; |
| + int32_t thresholdDistance; |
| + int32_t demotionPerDesiredLocale; |
| + ULocMatchFavorSubtag favorSubtag; |
| + |
| + // These are in input order. |
| + const Locale ** supportedLocales; |
| + LSR *lsrs; |
| + int32_t supportedLocalesLength; |
| + // These are in preference order: 1. Default locale 2. paradigm locales 3. others. |
| + UHashtable *supportedLsrToIndex; // Map<LSR, Integer> stores index+1 because 0 is "not found" |
| + // Array versions of the supportedLsrToIndex keys and values. |
| + // The distance lookup loops over the supportedLSRs and returns the index of the best match. |
| + const LSR **supportedLSRs; |
| + int32_t *supportedIndexes; |
| + int32_t supportedLSRsLength; |
| + Locale *ownedDefaultLocale; |
| + const Locale *defaultLocale; |
| + int32_t defaultLocaleIndex; |
| +}; |
| + |
| +U_NAMESPACE_END |
| + |
| +#endif // U_HIDE_DRAFT_API |
| +#endif // U_SHOW_CPLUSPLUS_API |
| +#endif // __LOCALEMATCHER_H__ |
| diff --git a/source/common/unicode/locid.h b/source/common/unicode/locid.h |
| index 7e410e53..0c9aecb6 100644 |
| --- a/source/common/unicode/locid.h |
| +++ b/source/common/unicode/locid.h |
| @@ -1008,6 +1008,104 @@ public: |
| */ |
| virtual UClassID getDynamicClassID() const; |
| |
| +#ifndef U_HIDE_DRAFT_API |
| + /** |
| + * A Locale iterator interface similar to a Java Iterator<Locale>. |
| + * @draft ICU 65 |
| + */ |
| + class U_COMMON_API Iterator /* not : public UObject because this is an interface/mixin class */ { |
| + public: |
| + /** Virtual destructor; only declared here, defined out of line. @draft ICU 65 */ |
| + virtual ~Iterator(); |
| + |
| + /** |
| + * @return TRUE if next() can be called again. |
| + * @draft ICU 65 |
| + */ |
| + virtual UBool hasNext() const = 0; |
| + |
| + /** |
| + * @return the next locale. Callers are expected to check hasNext() before each call. |
| + * @draft ICU 65 |
| + */ |
| + virtual const Locale &next() = 0; |
| + }; |
| + |
| + /** |
| + * A generic Locale iterator implementation over Locale input iterators. |
| + * @draft ICU 65 |
| + */ |
| + template<typename Iter> |
| + class RangeIterator : public Iterator, public UMemory { |
| + public: |
| + /** |
| + * Constructs an iterator from a begin/end range; stores copies of both iterators. |
| + * Each of the iterator parameter values must be an |
| + * input iterator whose value is convertible to const Locale &. |
| + * |
| + * @param begin Start of range. |
| + * @param end Exclusive end of range. |
| + * @draft ICU 65 |
| + */ |
| + RangeIterator(Iter begin, Iter end) : it_(begin), end_(end) {} |
| + |
| + /** |
| + * @return TRUE if next() can be called again, that is, if the end of the range is not reached. |
| + * @draft ICU 65 |
| + */ |
| + UBool hasNext() const override { return it_ != end_; } |
| + |
| + /** |
| + * @return the next locale. There is no end-of-range check: call only if hasNext() is TRUE. |
| + * @draft ICU 65 |
| + */ |
| + const Locale &next() override { return *it_++; } |
| + |
| + private: |
| + Iter it_;  // current position; advanced by next() |
| + const Iter end_;  // exclusive end of the range |
| + }; |
| + |
| + /** |
| + * A generic Locale iterator implementation over Locale input iterators. |
| + * Calls the converter to convert each *begin to a const Locale &. |
| + * @draft ICU 65 |
| + */ |
| + template<typename Iter, typename Conv> |
| + class ConvertingIterator : public Iterator, public UMemory { |
| + public: |
| + /** |
| + * Constructs an iterator from a begin/end range; stores copies of all three arguments. |
| + * Each of the iterator parameter values must be an |
| + * input iterator whose value the converter converts to const Locale &. |
| + * |
| + * @param begin Start of range. |
| + * @param end Exclusive end of range. |
| + * @param converter Converter from *begin to const Locale & or compatible; next() returns its result by reference. |
| + * @draft ICU 65 |
| + */ |
| + ConvertingIterator(Iter begin, Iter end, Conv converter) : |
| + it_(begin), end_(end), converter_(converter) {} |
| + |
| + /** |
| + * @return TRUE if next() can be called again, that is, if the end of the range is not reached. |
| + * @draft ICU 65 |
| + */ |
| + UBool hasNext() const override { return it_ != end_; } |
| + |
| + /** |
| + * @return the next locale, as converted. There is no end-of-range check: call only if hasNext() is TRUE. |
| + * @draft ICU 65 |
| + */ |
| + const Locale &next() override { return converter_(*it_++); } |
| + |
| + private: |
| + Iter it_;  // current position; advanced by next() |
| + const Iter end_;  // exclusive end of the range |
| + Conv converter_;  // applied to each element in next() |
| + }; |
| +#endif // U_HIDE_DRAFT_API |
| + |
| protected: /* only protected for testing purposes. DO NOT USE. */ |
| #ifndef U_HIDE_INTERNAL_API |
| /** |
| diff --git a/source/common/uresbund.cpp b/source/common/uresbund.cpp |
| index 585c0e5f..3224fb37 100644 |
| --- a/source/common/uresbund.cpp |
| +++ b/source/common/uresbund.cpp |
| @@ -38,6 +38,7 @@ |
| #include "umutex.h" |
| #include "putilimp.h" |
| #include "uassert.h" |
| +#include "uresdata.h" |
| |
| using namespace icu; |
| |
| @@ -1963,7 +1964,7 @@ void getAllItemsWithFallback( |
| // When the sink sees the no-fallback/no-inheritance marker, |
| // then it would remove the parent's item. |
| // We would deserialize parent values even though they are overridden in a child bundle. |
| - value.pResData = &bundle->fResData; |
| + value.setData(&bundle->fResData); |
| UResourceDataEntry *parentEntry = bundle->fData->fParent; |
| UBool hasParent = parentEntry != NULL && U_SUCCESS(parentEntry->fBogus); |
| value.setResource(bundle->fRes, ResourceTracer(bundle)); |
| @@ -2011,31 +2012,60 @@ void getAllItemsWithFallback( |
| |
| } // namespace |
| |
| +// Looks up path in bundle via ures_getByKeyWithFallback() and points value at the result. |
| +// An empty path selects bundle itself; a null path sets U_ILLEGAL_ARGUMENT_ERROR. |
| +// |
| +// Requires a ResourceDataValue fill-in, so that we need not cast from a ResourceValue. |
| +// Unfortunately, the caller must know which subclass to make and pass in. |
| +// Alternatively, we could make it as polymorphic as in Java by returning a ResourceValue |
| +// pointer (possibly wrapped into a LocalPointer) that the caller then owns. |
| +// Also requires a UResourceBundle fill-in, so that the value's ResourceTracer can point |
| +// to a non-local bundle. Without tracing, the child bundle could be a function-local object. |
| +U_CAPI void U_EXPORT2 |
| +ures_getValueWithFallback(const UResourceBundle *bundle, const char *path, |
| + UResourceBundle *tempFillIn, |
| + ResourceDataValue &value, UErrorCode &errorCode) { |
| + if (U_FAILURE(errorCode)) { return; } |
| + if (path == nullptr) { |
| + errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| + return; |
| + } |
| + const UResourceBundle *rb; |
| + if (*path == 0) { |
| + // empty path |
| + rb = bundle; |
| + } else { |
| + rb = ures_getByKeyWithFallback(bundle, path, tempFillIn, &errorCode); |
| + if (U_FAILURE(errorCode)) { |
| + return;  // value is left unchanged when the lookup fails |
| + } |
| + } |
| + value.setData(&rb->fResData); |
| + value.setResource(rb->fRes, ResourceTracer(rb));  // the tracer refers to rb, hence the non-local fill-in |
| +} |
| + |
| U_CAPI void U_EXPORT2 |
| ures_getAllItemsWithFallback(const UResourceBundle *bundle, const char *path, |
| icu::ResourceSink &sink, UErrorCode &errorCode) { |
| if (U_FAILURE(errorCode)) { return; } |
| - if (path == NULL) { |
| + if (path == nullptr) { |
| errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| - UResourceBundle stackBundle; |
| - ures_initStackObject(&stackBundle); |
| + StackUResourceBundle stackBundle; |
| const UResourceBundle *rb; |
| if (*path == 0) { |
| // empty path |
| rb = bundle; |
| } else { |
| - rb = ures_getByKeyWithFallback(bundle, path, &stackBundle, &errorCode); |
| + rb = ures_getByKeyWithFallback(bundle, path, stackBundle.getAlias(), &errorCode); |
| if (U_FAILURE(errorCode)) { |
| - ures_close(&stackBundle); |
| return; |
| } |
| } |
| // Get all table items with fallback. |
| ResourceDataValue value; |
| getAllItemsWithFallback(rb, value, sink, errorCode); |
| - ures_close(&stackBundle); |
| } |
| |
| U_CAPI UResourceBundle* U_EXPORT2 ures_getByKey(const UResourceBundle *resB, const char* inKey, UResourceBundle *fillIn, UErrorCode *status) { |
| diff --git a/source/common/uresdata.cpp b/source/common/uresdata.cpp |
| index ce04142f..b3c2e2e2 100644 |
| --- a/source/common/uresdata.cpp |
| +++ b/source/common/uresdata.cpp |
| @@ -509,7 +509,7 @@ const UChar *ResourceDataValue::getString(int32_t &length, UErrorCode &errorCode |
| if(U_FAILURE(errorCode)) { |
| return NULL; |
| } |
| - const UChar *s = res_getString(fTraceInfo, pResData, res, &length); |
| + const UChar *s = res_getString(fTraceInfo, &getData(), res, &length); |
| if(s == NULL) { |
| errorCode = U_RESOURCE_TYPE_MISMATCH; |
| } |
| @@ -520,7 +520,7 @@ const UChar *ResourceDataValue::getAliasString(int32_t &length, UErrorCode &erro |
| if(U_FAILURE(errorCode)) { |
| return NULL; |
| } |
| - const UChar *s = res_getAlias(pResData, res, &length); |
| + const UChar *s = res_getAlias(&getData(), res, &length); |
| if(s == NULL) { |
| errorCode = U_RESOURCE_TYPE_MISMATCH; |
| } |
| @@ -551,7 +551,7 @@ const int32_t *ResourceDataValue::getIntVector(int32_t &length, UErrorCode &erro |
| if(U_FAILURE(errorCode)) { |
| return NULL; |
| } |
| - const int32_t *iv = res_getIntVector(fTraceInfo, pResData, res, &length); |
| + const int32_t *iv = res_getIntVector(fTraceInfo, &getData(), res, &length); |
| if(iv == NULL) { |
| errorCode = U_RESOURCE_TYPE_MISMATCH; |
| } |
| @@ -562,7 +562,7 @@ const uint8_t *ResourceDataValue::getBinary(int32_t &length, UErrorCode &errorCo |
| if(U_FAILURE(errorCode)) { |
| return NULL; |
| } |
| - const uint8_t *b = res_getBinary(fTraceInfo, pResData, res, &length); |
| + const uint8_t *b = res_getBinary(fTraceInfo, &getData(), res, &length); |
| if(b == NULL) { |
| errorCode = U_RESOURCE_TYPE_MISMATCH; |
| } |
| @@ -580,12 +580,12 @@ ResourceArray ResourceDataValue::getArray(UErrorCode &errorCode) const { |
| switch(RES_GET_TYPE(res)) { |
| case URES_ARRAY: |
| if (offset!=0) { // empty if offset==0 |
| - items32 = (const Resource *)pResData->pRoot+offset; |
| + items32 = (const Resource *)getData().pRoot+offset; |
| length = *items32++; |
| } |
| break; |
| case URES_ARRAY16: |
| - items16 = pResData->p16BitUnits+offset; |
| + items16 = getData().p16BitUnits+offset; |
| length = *items16++; |
| break; |
| default: |
| @@ -608,19 +608,19 @@ ResourceTable ResourceDataValue::getTable(UErrorCode &errorCode) const { |
| switch(RES_GET_TYPE(res)) { |
| case URES_TABLE: |
| if (offset != 0) { // empty if offset==0 |
| - keys16 = (const uint16_t *)(pResData->pRoot+offset); |
| + keys16 = (const uint16_t *)(getData().pRoot+offset); |
| length = *keys16++; |
| items32 = (const Resource *)(keys16+length+(~length&1)); |
| } |
| break; |
| case URES_TABLE16: |
| - keys16 = pResData->p16BitUnits+offset; |
| + keys16 = getData().p16BitUnits+offset; |
| length = *keys16++; |
| items16 = keys16 + length; |
| break; |
| case URES_TABLE32: |
| if (offset != 0) { // empty if offset==0 |
| - keys32 = pResData->pRoot+offset; |
| + keys32 = getData().pRoot+offset; |
| length = *keys32++; |
| items32 = (const Resource *)keys32 + length; |
| } |
| @@ -633,18 +633,18 @@ ResourceTable ResourceDataValue::getTable(UErrorCode &errorCode) const { |
| } |
| |
| UBool ResourceDataValue::isNoInheritanceMarker() const { |
| - return ::isNoInheritanceMarker(pResData, res); |
| + return ::isNoInheritanceMarker(&getData(), res); |
| } |
| |
| int32_t ResourceDataValue::getStringArray(UnicodeString *dest, int32_t capacity, |
| UErrorCode &errorCode) const { |
| - return ::getStringArray(pResData, getArray(errorCode), dest, capacity, errorCode); |
| + return ::getStringArray(&getData(), getArray(errorCode), dest, capacity, errorCode); |
| } |
| |
| int32_t ResourceDataValue::getStringArrayOrStringAsArray(UnicodeString *dest, int32_t capacity, |
| UErrorCode &errorCode) const { |
| if(URES_IS_ARRAY(res)) { |
| - return ::getStringArray(pResData, getArray(errorCode), dest, capacity, errorCode); |
| + return ::getStringArray(&getData(), getArray(errorCode), dest, capacity, errorCode); |
| } |
| if(U_FAILURE(errorCode)) { |
| return 0; |
| @@ -658,7 +658,7 @@ int32_t ResourceDataValue::getStringArrayOrStringAsArray(UnicodeString *dest, in |
| return 1; |
| } |
| int32_t sLength; |
| - const UChar *s = res_getString(fTraceInfo, pResData, res, &sLength); |
| + const UChar *s = res_getString(fTraceInfo, &getData(), res, &sLength); |
| if(s != NULL) { |
| dest[0].setTo(TRUE, s, sLength); |
| return 1; |
| @@ -673,7 +673,7 @@ UnicodeString ResourceDataValue::getStringOrFirstOfArray(UErrorCode &errorCode) |
| return us; |
| } |
| int32_t sLength; |
| - const UChar *s = res_getString(fTraceInfo, pResData, res, &sLength); |
| + const UChar *s = res_getString(fTraceInfo, &getData(), res, &sLength); |
| if(s != NULL) { |
| us.setTo(TRUE, s, sLength); |
| return us; |
| @@ -684,7 +684,7 @@ UnicodeString ResourceDataValue::getStringOrFirstOfArray(UErrorCode &errorCode) |
| } |
| if(array.getSize() > 0) { |
| // Tracing is already performed above (unimportant for trace that this is an array) |
| - s = res_getStringNoTrace(pResData, array.internalGetResource(pResData, 0), &sLength); |
| + s = res_getStringNoTrace(&getData(), array.internalGetResource(&getData(), 0), &sLength); |
| if(s != NULL) { |
| us.setTo(TRUE, s, sLength); |
| return us; |
| @@ -821,14 +821,14 @@ UBool icu::ResourceTable::getKeyAndValue(int32_t i, |
| const char *&key, icu::ResourceValue &value) const { |
| if(0 <= i && i < length) { |
| icu::ResourceDataValue &rdValue = static_cast<icu::ResourceDataValue &>(value); |
| - if (keys16 != NULL) { |
| - key = RES_GET_KEY16(rdValue.pResData, keys16[i]); |
| + if (keys16 != nullptr) { |
| + key = RES_GET_KEY16(&rdValue.getData(), keys16[i]); |
| } else { |
| - key = RES_GET_KEY32(rdValue.pResData, keys32[i]); |
| + key = RES_GET_KEY32(&rdValue.getData(), keys32[i]); |
| } |
| Resource res; |
| - if (items16 != NULL) { |
| - res = makeResourceFrom16(rdValue.pResData, items16[i]); |
| + if (items16 != nullptr) { |
| + res = makeResourceFrom16(&rdValue.getData(), items16[i]); |
| } else { |
| res = items32[i]; |
| } |
| @@ -842,6 +842,29 @@ UBool icu::ResourceTable::getKeyAndValue(int32_t i, |
| return FALSE; |
| } |
| |
| +UBool icu::ResourceTable::findValue(const char *key, ResourceValue &value) const {  // TRUE and value set if key is found, else FALSE |
| + icu::ResourceDataValue &rdValue = static_cast<icu::ResourceDataValue &>(value);  // value must really be a ResourceDataValue |
| + const char *realKey = nullptr;  // passed to the lookup helpers as an out-parameter; not read afterwards |
| + int32_t i; |
| + if (keys16 != nullptr) { |
| + i = _res_findTableItem(&rdValue.getData(), keys16, length, key, &realKey); |
| + } else { |
| + i = _res_findTable32Item(&rdValue.getData(), keys32, length, key, &realKey); |
| + } |
| + if (i >= 0) {  // a non-negative index is treated as "found" |
| + Resource res; |
| + if (items16 != nullptr) { |
| + res = makeResourceFrom16(&rdValue.getData(), items16[i]); |
| + } else { |
| + res = items32[i]; |
| + } |
| + // Same note about lifetime as in getKeyAndValue(). |
| + rdValue.setResource(res, ResourceTracer(fTraceInfo, key)); |
| + return TRUE; |
| + } |
| + return FALSE;  // value is left unchanged when the key is not found |
| +} |
| + |
| U_CAPI Resource U_EXPORT2 |
| res_getArrayItem(const ResourceData *pResData, Resource array, int32_t indexR) { |
| uint32_t offset=RES_GET_OFFSET(array); |
| @@ -887,7 +910,7 @@ UBool icu::ResourceArray::getValue(int32_t i, icu::ResourceValue &value) const { |
| // alive for the duration that fields are being read from it |
| // (including nested fields). |
| rdValue.setResource( |
| - internalGetResource(rdValue.pResData, i), |
| + internalGetResource(&rdValue.getData(), i), |
| ResourceTracer(fTraceInfo, i)); |
| return TRUE; |
| } |
| diff --git a/source/common/uresdata.h b/source/common/uresdata.h |
| index 51647409..d1b67bab 100644 |
| --- a/source/common/uresdata.h |
| +++ b/source/common/uresdata.h |
| @@ -511,13 +511,12 @@ inline uint32_t res_getUInt(const ResourceTracer& traceInfo, Resource res) { |
| class ResourceDataValue : public ResourceValue { |
| public: |
| ResourceDataValue() : |
| - pResData(NULL), |
| res(static_cast<Resource>(URES_NONE)), |
| fTraceInfo() {} |
| virtual ~ResourceDataValue(); |
| |
| void setData(const ResourceData *data) { |
| - pResData = data; |
| + resData = *data; |
| } |
| |
| void setResource(Resource r, ResourceTracer&& traceInfo) { |
| @@ -525,6 +524,7 @@ public: |
| fTraceInfo = traceInfo; |
| } |
| |
| + const ResourceData &getData() const { return resData; } |
| virtual UResType getType() const; |
| virtual const UChar *getString(int32_t &length, UErrorCode &errorCode) const; |
| virtual const UChar *getAliasString(int32_t &length, UErrorCode &errorCode) const; |
| @@ -541,9 +541,10 @@ public: |
| UErrorCode &errorCode) const; |
| virtual UnicodeString getStringOrFirstOfArray(UErrorCode &errorCode) const; |
| |
| - const ResourceData *pResData; |
| - |
| private: |
| + // TODO(ICU-20769): If UResourceBundle.fResData becomes a pointer, |
| + // then remove this value field again and just store a pResData pointer. |
| + ResourceData resData; |
| Resource res; |
| ResourceTracer fTraceInfo; |
| }; |
| diff --git a/source/common/uresimp.h b/source/common/uresimp.h |
| index 51db6c52..f453ddc0 100644 |
| --- a/source/common/uresimp.h |
| +++ b/source/common/uresimp.h |
| @@ -67,6 +67,9 @@ struct UResourceBundle { |
| char *fVersion; |
| UResourceDataEntry *fTopLevelData; /* for getting the valid locale */ |
| char *fResPath; /* full path to the resource: "zh_TW/CollationElements/Sequence" */ |
| + // TODO(ICU-20769): Try to change the by-value fResData into a pointer, |
| + // with the struct in only one place for each bundle. |
| + // Also replace class ResourceDataValue.resData with a pResData pointer again. |
| ResourceData fResData; |
| char fResBuf[RES_BUFSIZE]; |
| int32_t fResPathLen; |
| @@ -281,6 +284,11 @@ ures_getStringByKeyWithFallback(const UResourceBundle *resB, |
| |
| #ifdef __cplusplus |
| |
| +U_CAPI void U_EXPORT2 |
| +ures_getValueWithFallback(const UResourceBundle *bundle, const char *path, |
| + UResourceBundle *tempFillIn, |
| + icu::ResourceDataValue &value, UErrorCode &errorCode); |
| + |
| U_CAPI void U_EXPORT2 |
| ures_getAllItemsWithFallback(const UResourceBundle *bundle, const char *path, |
| icu::ResourceSink &sink, UErrorCode &errorCode); |