1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
5 // created: 2019may08 Markus W. Scherer
7 #include "unicode/utypes.h"
8 #include "unicode/bytestrie.h"
9 #include "unicode/localematcher.h"
10 #include "unicode/locid.h"
11 #include "unicode/uobject.h"
12 #include "unicode/ures.h"
14 #include "locdistance.h"
15 #include "loclikelysubtags.h"
26 * Bit flag used on the last character of a subtag in the trie.
27 * Must be set consistently by the builder and the lookup code.
29 constexpr int32_t END_OF_SUBTAG
= 0x80;
30 /** Distance value bit flag, set by the builder. */
31 constexpr int32_t DISTANCE_SKIP_SCRIPT
= 0x80;
32 /** Distance value bit flag, set by trieNext(). */
33 constexpr int32_t DISTANCE_IS_FINAL
= 0x100;
34 constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT
= DISTANCE_IS_FINAL
| DISTANCE_SKIP_SCRIPT
;
36 constexpr int32_t ABOVE_THRESHOLD
= 100;
38 // Indexes into array of distances.
41 IX_DEF_SCRIPT_DISTANCE
,
42 IX_DEF_REGION_DISTANCE
,
43 IX_MIN_REGION_DISTANCE
,
47 LocaleDistance
*gLocaleDistance
= nullptr;
48 UInitOnce gInitOnce
= U_INITONCE_INITIALIZER
;
50 UBool U_CALLCONV
cleanup() {
51 delete gLocaleDistance
;
52 gLocaleDistance
= nullptr;
59 void U_CALLCONV
LocaleDistance::initLocaleDistance(UErrorCode
&errorCode
) {
60 // This function is invoked only via umtx_initOnce().
61 U_ASSERT(gLocaleDistance
== nullptr);
62 const XLikelySubtags
&likely
= *XLikelySubtags::getSingleton(errorCode
);
63 if (U_FAILURE(errorCode
)) { return; }
64 const LocaleDistanceData
&data
= likely
.getDistanceData();
65 if (data
.distanceTrieBytes
== nullptr ||
66 data
.regionToPartitions
== nullptr || data
.partitions
== nullptr ||
68 data
.distances
== nullptr) {
69 errorCode
= U_MISSING_RESOURCE_ERROR
;
72 gLocaleDistance
= new LocaleDistance(data
);
73 if (gLocaleDistance
== nullptr) {
74 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
77 ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE
, cleanup
);
80 const LocaleDistance
*LocaleDistance::getSingleton(UErrorCode
&errorCode
) {
81 if (U_FAILURE(errorCode
)) { return nullptr; }
82 umtx_initOnce(gInitOnce
, &LocaleDistance::initLocaleDistance
, errorCode
);
83 return gLocaleDistance
;
86 LocaleDistance::LocaleDistance(const LocaleDistanceData
&data
) :
87 trie(data
.distanceTrieBytes
),
88 regionToPartitionsIndex(data
.regionToPartitions
), partitionArrays(data
.partitions
),
89 paradigmLSRs(data
.paradigms
), paradigmLSRsLength(data
.paradigmsLength
),
90 defaultLanguageDistance(data
.distances
[IX_DEF_LANG_DISTANCE
]),
91 defaultScriptDistance(data
.distances
[IX_DEF_SCRIPT_DISTANCE
]),
92 defaultRegionDistance(data
.distances
[IX_DEF_REGION_DISTANCE
]),
93 minRegionDistance(data
.distances
[IX_MIN_REGION_DISTANCE
]) {
94 // For the default demotion value, use the
95 // default region distance between unrelated Englishes.
96 // Thus, unless demotion is turned off,
97 // a mere region difference for one desired locale
98 // is as good as a perfect match for the next following desired locale.
99 // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
100 LSR
en("en", "Latn", "US");
101 LSR
enGB("en", "Latn", "GB");
102 const LSR
*p_enGB
= &enGB
;
103 defaultDemotionPerDesiredLocale
= getBestIndexAndDistance(en
, &p_enGB
, 1,
104 50, ULOCMATCH_FAVOR_LANGUAGE
) & 0xff;
107 int32_t LocaleDistance::getBestIndexAndDistance(
109 const LSR
**supportedLSRs
, int32_t supportedLSRsLength
,
110 int32_t threshold
, ULocMatchFavorSubtag favorSubtag
) const {
111 BytesTrie
iter(trie
);
112 // Look up the desired language only once for all supported LSRs.
113 // Its "distance" is either a match point value of 0, or a non-match negative value.
114 // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
115 int32_t desLangDistance
= trieNext(iter
, desired
.language
, false);
116 uint64_t desLangState
= desLangDistance
>= 0 && supportedLSRsLength
> 1 ? iter
.getState64() : 0;
117 // Index of the supported LSR with the lowest distance.
118 int32_t bestIndex
= -1;
119 for (int32_t slIndex
= 0; slIndex
< supportedLSRsLength
; ++slIndex
) {
120 const LSR
&supported
= *supportedLSRs
[slIndex
];
122 int32_t distance
= desLangDistance
;
124 U_ASSERT((distance
& DISTANCE_IS_FINAL
) == 0);
126 iter
.resetToState64(desLangState
);
128 distance
= trieNext(iter
, supported
.language
, true);
130 // Note: The data builder verifies that there are no rules with "any" (*) language and
131 // real (non *) script or region subtags.
132 // This means that if the lookup for either language fails we can use
133 // the default distances without further lookups.
136 flags
= distance
& DISTANCE_IS_FINAL_OR_SKIP_SCRIPT
;
137 distance
&= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT
;
139 if (uprv_strcmp(desired
.language
, supported
.language
) == 0) {
142 distance
= defaultLanguageDistance
;
147 U_ASSERT(0 <= distance
&& distance
<= 100);
148 // We implement "favor subtag" by reducing the language subtag distance
149 // (unscientifically reducing it to a quarter of the normal value),
150 // so that the script distance is relatively more important.
151 // For example, given a default language distance of 80, we reduce it to 20,
152 // which is below the default threshold of 50, which is the default script distance.
153 if (favorSubtag
== ULOCMATCH_FAVOR_SCRIPT
) {
156 if (distance
>= threshold
) {
160 int32_t scriptDistance
;
161 if (star
|| flags
!= 0) {
162 if (uprv_strcmp(desired
.script
, supported
.script
) == 0) {
165 scriptDistance
= defaultScriptDistance
;
168 scriptDistance
= getDesSuppScriptDistance(iter
, iter
.getState64(),
169 desired
.script
, supported
.script
);
170 flags
= scriptDistance
& DISTANCE_IS_FINAL
;
171 scriptDistance
&= ~DISTANCE_IS_FINAL
;
173 distance
+= scriptDistance
;
174 if (distance
>= threshold
) {
178 if (uprv_strcmp(desired
.region
, supported
.region
) == 0) {
179 // regionDistance = 0
180 } else if (star
|| (flags
& DISTANCE_IS_FINAL
) != 0) {
181 distance
+= defaultRegionDistance
;
183 int32_t remainingThreshold
= threshold
- distance
;
184 if (minRegionDistance
>= remainingThreshold
) {
188 // From here on we know the regions are not equal.
189 // Map each region to zero or more partitions. (zero = one non-matching string)
190 // (Each array of single-character partition strings is encoded as one string.)
191 // If either side has more than one, then we find the maximum distance.
192 // This could be optimized by adding some more structure, but probably not worth it.
193 distance
+= getRegionPartitionsDistance(
194 iter
, iter
.getState64(),
195 partitionsForRegion(desired
),
196 partitionsForRegion(supported
),
199 if (distance
< threshold
) {
204 threshold
= distance
;
207 return bestIndex
>= 0 ? (bestIndex
<< 8) | threshold
: 0xffffff00 | ABOVE_THRESHOLD
;
210 int32_t LocaleDistance::getDesSuppScriptDistance(
211 BytesTrie
&iter
, uint64_t startState
, const char *desired
, const char *supported
) {
212 // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
213 int32_t distance
= trieNext(iter
, desired
, false);
215 distance
= trieNext(iter
, supported
, true);
218 UStringTrieResult result
= iter
.resetToState64(startState
).next(u
'*'); // <*, *>
219 U_ASSERT(USTRINGTRIE_HAS_VALUE(result
));
220 if (uprv_strcmp(desired
, supported
) == 0) {
221 distance
= 0; // same script
223 distance
= iter
.getValue();
224 U_ASSERT(distance
>= 0);
226 if (result
== USTRINGTRIE_FINAL_VALUE
) {
227 distance
|= DISTANCE_IS_FINAL
;
233 int32_t LocaleDistance::getRegionPartitionsDistance(
234 BytesTrie
&iter
, uint64_t startState
,
235 const char *desiredPartitions
, const char *supportedPartitions
, int32_t threshold
) {
236 char desired
= *desiredPartitions
++;
237 char supported
= *supportedPartitions
++;
238 U_ASSERT(desired
!= 0 && supported
!= 0);
239 // See if we have single desired/supported partitions, from NUL-terminated
240 // partition strings without explicit length.
241 bool suppLengthGt1
= *supportedPartitions
!= 0; // gt1: more than 1 character
242 // equivalent to: if (desLength == 1 && suppLength == 1)
243 if (*desiredPartitions
== 0 && !suppLengthGt1
) {
244 // Fastpath for single desired/supported partitions.
245 UStringTrieResult result
= iter
.next(uprv_invCharToAscii(desired
) | END_OF_SUBTAG
);
246 if (USTRINGTRIE_HAS_NEXT(result
)) {
247 result
= iter
.next(uprv_invCharToAscii(supported
) | END_OF_SUBTAG
);
248 if (USTRINGTRIE_HAS_VALUE(result
)) {
249 return iter
.getValue();
252 return getFallbackRegionDistance(iter
, startState
);
255 const char *supportedStart
= supportedPartitions
- 1; // for restart of inner loop
256 int32_t regionDistance
= 0;
257 // Fall back to * only once, not for each pair of partition strings.
260 // Look up each desired-partition string only once,
261 // not for each (desired, supported) pair.
262 UStringTrieResult result
= iter
.next(uprv_invCharToAscii(desired
) | END_OF_SUBTAG
);
263 if (USTRINGTRIE_HAS_NEXT(result
)) {
264 uint64_t desState
= suppLengthGt1
? iter
.getState64() : 0;
266 result
= iter
.next(uprv_invCharToAscii(supported
) | END_OF_SUBTAG
);
268 if (USTRINGTRIE_HAS_VALUE(result
)) {
273 d
= getFallbackRegionDistance(iter
, startState
);
276 if (d
>= threshold
) {
278 } else if (regionDistance
< d
) {
281 if ((supported
= *supportedPartitions
++) != 0) {
282 iter
.resetToState64(desState
);
288 int32_t d
= getFallbackRegionDistance(iter
, startState
);
289 if (d
>= threshold
) {
291 } else if (regionDistance
< d
) {
296 if ((desired
= *desiredPartitions
++) != 0) {
297 iter
.resetToState64(startState
);
298 supportedPartitions
= supportedStart
;
299 supported
= *supportedPartitions
++;
304 return regionDistance
;
307 int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie
&iter
, uint64_t startState
) {
309 UStringTrieResult result
=
311 iter
.resetToState64(startState
).next(u
'*'); // <*, *>
312 U_ASSERT(USTRINGTRIE_HAS_VALUE(result
));
313 int32_t distance
= iter
.getValue();
314 U_ASSERT(distance
>= 0);
318 int32_t LocaleDistance::trieNext(BytesTrie
&iter
, const char *s
, bool wantValue
) {
321 return -1; // no empty subtags in the distance data
324 c
= uprv_invCharToAscii(c
);
325 // EBCDIC: If *s is not an invariant character,
326 // then c is now 0 and will simply not match anything, which is harmless.
329 if (!USTRINGTRIE_HAS_NEXT(iter
.next(c
))) {
333 // last character of this subtag
334 UStringTrieResult result
= iter
.next(c
| END_OF_SUBTAG
);
336 if (USTRINGTRIE_HAS_VALUE(result
)) {
337 int32_t value
= iter
.getValue();
338 if (result
== USTRINGTRIE_FINAL_VALUE
) {
339 value
|= DISTANCE_IS_FINAL
;
344 if (USTRINGTRIE_HAS_NEXT(result
)) {
354 UBool
LocaleDistance::isParadigmLSR(const LSR
&lsr
) const {
355 // Linear search for a very short list (length 6 as of 2019).
356 // If there are many paradigm LSRs we should use a hash set.
357 U_ASSERT(paradigmLSRsLength
<= 15);
358 for (int32_t i
= 0; i
< paradigmLSRsLength
; ++i
) {
359 if (lsr
== paradigmLSRs
[i
]) { return true; }