]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ***************************************************************************************** | |
3d1f044b | 3 | * Copyright (C) 2014-2019 Apple Inc. All Rights Reserved. |
57a6839d A |
4 | ***************************************************************************************** |
5 | */ | |
6 | ||
2ca993e8 A |
7 | #define DEBUG_UALOC 0 |
8 | #if DEBUG_UALOC | |
9 | #include <stdio.h> | |
10 | #endif | |
340931cb | 11 | #include <stdlib.h> |
2ca993e8 | 12 | #include <string.h> |
9f1b1155 | 13 | #include <ctype.h> |
57a6839d A |
14 | #include "unicode/utypes.h" |
15 | #include "unicode/ualoc.h" | |
16 | #include "unicode/uloc.h" | |
17 | #include "unicode/ures.h" | |
18 | #include "unicode/putil.h" | |
f3c0d7a5 | 19 | #include "unicode/ustring.h" |
57a6839d A |
20 | #include "cstring.h" |
21 | #include "cmemory.h" | |
b331163b A |
22 | #include "uhash.h" |
23 | #include "umutex.h" | |
24 | #include "ucln_cmn.h" | |
57a6839d A |
25 | // the following has replacements for some math.h funcs etc |
26 | #include "putilimp.h" | |
340931cb A |
27 | // For <rdar://problem/63880069> |
28 | #include "uresimp.h" | |
57a6839d A |
29 | |
30 | // The numeric values in territoryInfo are in "IntF" format from LDML2ICUConverter. | |
31 | // From its docs (adapted): [IntF is] a special integer that represents the number in | |
32 | // normalized scientific notation. | |
33 | // Resultant integers are in the form -?xxyyyyyy, where xx is the exponent | |
34 | // offset by 50 and yyyyyy is the coefficient to 5 decimal places (range 1.0 to 9.99999), e.g. | |
35 | // 14660000000000 -> 1.46600E13 -> 63146600 | |
36 | // 0.0001 -> 1.00000E-4 -> 46100000 | |
37 | // -123.456 -> -1.23456E-2 -> -48123456 | |
38 | // | |
39 | // Here to avoid an extra division we have the max coefficient as 999999 (instead of | |
40 | // 9.99999) and instead offset the exponent by -55. | |
41 | // | |
42 | static double doubleFromIntF(int32_t intF) { | |
43 | double coefficient = (double)(intF % 1000000); | |
44 | int32_t exponent = (intF / 1000000) - 55; | |
45 | return coefficient * uprv_pow10(exponent); | |
46 | } | |
47 | ||
48 | static int compareLangEntries(const void * entry1, const void * entry2) { | |
49 | double fraction1 = ((const UALanguageEntry *)entry1)->userFraction; | |
50 | double fraction2 = ((const UALanguageEntry *)entry2)->userFraction; | |
51 | // want descending order | |
52 | if (fraction1 > fraction2) return -1; | |
53 | if (fraction1 < fraction2) return 1; | |
54 | // userFractions the same, sort by languageCode | |
55 | return uprv_strcmp(((const UALanguageEntry *)entry1)->languageCode,((const UALanguageEntry *)entry2)->languageCode); | |
56 | } | |
57 | ||
f3c0d7a5 A |
// Maps a bare language code to the same language with its default script.
// Laid out as flat (key, value) pairs terminated by a single NULL key.
// Keys must stay sorted in strcmp order: the lookup stops at the first key
// that compares greater than the code being searched for.
static const char * langToDefaultScript[] = {
    "az",   "az_Latn",
    "bm",   "bm_Latn",   // <rdar://problem/47494729> added
    "bs",   "bs_Latn",
    "byn",  "byn_Ethi",  // <rdar://problem/47494729> added
    "cu",   "cu_Cyrl",   // <rdar://problem/47494729> added
    "ff",   "ff_Latn",   // <rdar://problem/47494729> added
    "ha",   "ha_Latn",   // <rdar://problem/47494729> added
    "iu",   "iu_Cans",
    "kk",   "kk_Cyrl",   // <rdar://problem/47494729> changed from _Arab
    "ks",   "ks_Arab",   // unnecessary?
    "ku",   "ku_Latn",
    "ky",   "ky_Cyrl",
    "mn",   "mn_Cyrl",
    "ms",   "ms_Latn",
    "pa",   "pa_Guru",
    "rif",  "rif_Tfng",  // unnecessary? no locale support anyway
    "sd",   "sd_Arab",   // <rdar://problem/47494729> added
    "shi",  "shi_Tfng",
    "sr",   "sr_Cyrl",
    "tg",   "tg_Cyrl",
    "tk",   "tk_Latn",   // unnecessary?
    "ug",   "ug_Arab",
    "uz",   "uz_Latn",
    "vai",  "vai_Vaii",
    "yue",  "yue_Hant",  // to match CLDR data, not Apple default
    "zh",   "zh_Hans",
    NULL
};
89 | ||
90 | static const char * langCodeWithScriptIfAmbig(const char * langCode) { | |
91 | const char ** langToDefScriptPtr = langToDefaultScript; | |
92 | const char * testCurLoc; | |
93 | while ( (testCurLoc = *langToDefScriptPtr++) != NULL ) { | |
94 | int cmp = uprv_strcmp(langCode, testCurLoc); | |
95 | if (cmp <= 0) { | |
96 | if (cmp == 0) { | |
97 | return *langToDefScriptPtr; | |
98 | } | |
99 | break; | |
100 | } | |
101 | langToDefScriptPtr++; | |
102 | } | |
103 | return langCode; | |
104 | } | |
105 | ||
57a6839d A |
106 | static const UChar ustrLangStatusDefacto[] = {0x64,0x65,0x5F,0x66,0x61,0x63,0x74,0x6F,0x5F,0x6F,0x66,0x66,0x69,0x63,0x69,0x61,0x6C,0}; //"de_facto_official" |
107 | static const UChar ustrLangStatusOfficial[] = {0x6F,0x66,0x66,0x69,0x63,0x69,0x61,0x6C,0}; //"official" | |
108 | static const UChar ustrLangStatusRegional[] = {0x6F,0x66,0x66,0x69,0x63,0x69,0x61,0x6C,0x5F,0x72,0x65,0x67,0x69,0x6F,0x6E,0x61,0x6C,0}; //"official_regional" | |
109 | ||
110 | enum { | |
111 | kLocalLangEntriesMax = 26, // enough for most regions to minimumFraction 0.001 except India | |
112 | kLangEntriesFactor = 3 // if we have to allocate, multiply existing size by this | |
113 | }; | |
114 | ||
115 | U_CAPI int32_t U_EXPORT2 | |
116 | ualoc_getLanguagesForRegion(const char *regionID, double minimumFraction, | |
117 | UALanguageEntry *entries, int32_t entriesCapacity, | |
118 | UErrorCode *err) | |
119 | { | |
120 | if (U_FAILURE(*err)) { | |
121 | return 0; | |
122 | } | |
123 | if ( regionID == NULL || minimumFraction < 0.0 || minimumFraction > 1.0 || | |
124 | ((entries==NULL)? entriesCapacity!=0: entriesCapacity<0) ) { | |
125 | *err = U_ILLEGAL_ARGUMENT_ERROR; | |
126 | return 0; | |
127 | } | |
128 | UResourceBundle *rb = ures_openDirect(NULL, "supplementalData", err); | |
129 | rb = ures_getByKey(rb, "territoryInfo", rb, err); | |
130 | rb = ures_getByKey(rb, regionID, rb, err); | |
131 | if (U_FAILURE(*err)) { | |
132 | ures_close(rb); | |
133 | return 0; | |
134 | } | |
135 | ||
136 | int32_t entryCount = 0; | |
137 | UResourceBundle *langBund = NULL; | |
138 | int32_t lbIdx, lbCount = ures_getSize(rb); | |
139 | UALanguageEntry localLangEntries[kLocalLangEntriesMax]; | |
140 | UALanguageEntry * langEntries = localLangEntries; | |
141 | int32_t langEntriesMax = kLocalLangEntriesMax; | |
142 | ||
143 | for (lbIdx = 0; lbIdx < lbCount; lbIdx++) { | |
144 | langBund = ures_getByIndex(rb, lbIdx, langBund, err); | |
145 | if (U_FAILURE(*err)) { | |
146 | break; | |
147 | } | |
148 | const char * langCode = ures_getKey(langBund); | |
149 | if (uprv_strcmp(langCode,"territoryF") == 0) { | |
150 | continue; | |
151 | } | |
152 | if (strnlen(langCode, UALANGDATA_CODELEN+1) > UALANGDATA_CODELEN) { // no uprv_strnlen | |
153 | continue; // a code we cannot handle | |
154 | } | |
155 | ||
156 | UErrorCode localErr = U_ZERO_ERROR; | |
157 | double userFraction = 0.0; | |
158 | UResourceBundle *itemBund = ures_getByKey(langBund, "populationShareF", NULL, &localErr); | |
159 | if (U_SUCCESS(localErr)) { | |
160 | int32_t intF = ures_getInt(itemBund, &localErr); | |
161 | if (U_SUCCESS(localErr)) { | |
162 | userFraction = doubleFromIntF(intF); | |
163 | } | |
164 | ures_close(itemBund); | |
165 | } | |
166 | if (userFraction < minimumFraction) { | |
167 | continue; | |
168 | } | |
169 | if (entries != NULL) { | |
170 | localErr = U_ZERO_ERROR; | |
171 | UALanguageStatus langStatus = UALANGSTATUS_UNSPECIFIED; | |
172 | int32_t ulen; | |
173 | const UChar * ustrLangStatus = ures_getStringByKey(langBund, "officialStatus", &ulen, &localErr); | |
174 | if (U_SUCCESS(localErr)) { | |
175 | int32_t cmp = u_strcmp(ustrLangStatus, ustrLangStatusOfficial); | |
176 | if (cmp == 0) { | |
177 | langStatus = UALANGSTATUS_OFFICIAL; | |
178 | } else if (cmp < 0 && u_strcmp(ustrLangStatus, ustrLangStatusDefacto) == 0) { | |
179 | langStatus = UALANGSTATUS_DEFACTO_OFFICIAL; | |
180 | } else if (u_strcmp(ustrLangStatus, ustrLangStatusRegional) == 0) { | |
181 | langStatus = UALANGSTATUS_REGIONAL_OFFICIAL; | |
182 | } | |
183 | } | |
184 | // Now we have all of the info for our next entry | |
185 | if (entryCount >= langEntriesMax) { | |
186 | int32_t newMax = langEntriesMax * kLangEntriesFactor; | |
187 | if (langEntries == localLangEntries) { | |
188 | // first allocation, copy from local buf | |
189 | langEntries = (UALanguageEntry*)uprv_malloc(newMax*sizeof(UALanguageEntry)); | |
190 | if (langEntries == NULL) { | |
191 | *err = U_MEMORY_ALLOCATION_ERROR; | |
192 | break; | |
193 | } | |
194 | uprv_memcpy(langEntries, localLangEntries, entryCount*sizeof(UALanguageEntry)); | |
195 | } else { | |
196 | langEntries = (UALanguageEntry*)uprv_realloc(langEntries, newMax*sizeof(UALanguageEntry)); | |
197 | if (langEntries == NULL) { | |
198 | *err = U_MEMORY_ALLOCATION_ERROR; | |
199 | break; | |
200 | } | |
201 | } | |
202 | langEntriesMax = newMax; | |
203 | } | |
f3c0d7a5 | 204 | uprv_strcpy(langEntries[entryCount].languageCode, langCodeWithScriptIfAmbig(langCode)); |
57a6839d A |
205 | langEntries[entryCount].userFraction = userFraction; |
206 | langEntries[entryCount].status = langStatus; | |
207 | } | |
208 | entryCount++; | |
209 | } | |
210 | ures_close(langBund); | |
211 | ures_close(rb); | |
212 | if (U_FAILURE(*err)) { | |
213 | if (langEntries != localLangEntries) { | |
214 | free(langEntries); | |
215 | } | |
216 | return 0; | |
217 | } | |
218 | if (entries != NULL) { | |
219 | // sort langEntries, copy entries that fit to provided array | |
220 | qsort(langEntries, entryCount, sizeof(UALanguageEntry), compareLangEntries); | |
221 | if (entryCount > entriesCapacity) { | |
222 | entryCount = entriesCapacity; | |
223 | } | |
224 | uprv_memcpy(entries, langEntries, entryCount*sizeof(UALanguageEntry)); | |
225 | if (langEntries != localLangEntries) { | |
226 | free(langEntries); | |
227 | } | |
228 | } | |
229 | return entryCount; | |
230 | } | |
231 | ||
// Parent overrides consulted by ualoc_getAppleParent; not used by ualoc_localizationsToUse.
// Flat (child, parent) pairs terminated by a single NULL child. Children must stay sorted
// in strcmp order because the lookup stops at the first child that compares greater.
// NOTE(review): ualoc_getAppleParent only searches this table when the base name begins
// with "en" or "zh", so the yue entries look unreachable from there -- confirm intent.
static const char * forceParent[] = {
    "en_150",   "en_GB",   // en for Europe
    "en_AU",    "en_GB",
    "en_BD",    "en_GB",   // en for Bangladesh
    "en_BE",    "en_150",  // en for Belgium goes to en for Europe
    "en_DG",    "en_GB",
    "en_FK",    "en_GB",
    "en_GG",    "en_GB",
    "en_GI",    "en_GB",
    "en_HK",    "en_GB",   // en for Hong Kong
    "en_IE",    "en_GB",
    "en_IM",    "en_GB",
    "en_IN",    "en_GB",
    "en_IO",    "en_GB",
    "en_JE",    "en_GB",
    "en_JM",    "en_GB",
    "en_LK",    "en_GB",
    "en_MO",    "en_GB",
    "en_MT",    "en_GB",
    "en_MV",    "en_GB",   // for Maldives
    "en_MY",    "en_GB",   // en for Malaysia
    "en_NZ",    "en_AU",
    "en_PK",    "en_GB",   // en for Pakistan
    "en_SG",    "en_GB",
    "en_SH",    "en_GB",
    "en_VG",    "en_GB",
    "yue",      "yue_CN",  // yue_CN has 71M users (5.2% of 1.37G), yue_HK has 6.5M (90% of 7.17M)
    "yue_CN",   "root",    // should this change to e.g. "zh_Hans_CN" for <rdar://problem/30671866>?
    "yue_HK",   "root",    // should this change to e.g. "zh_Hant_HK" for <rdar://problem/30671866>?
    "yue_Hans", "yue_CN",
    "yue_Hant", "yue_HK",
    "zh",       "zh_CN",
    "zh_CN",    "root",
    "zh_Hant",  "zh_TW",
    "zh_TW",    "root",
    NULL
};
269 | ||
2ca993e8 A |
270 | enum { kLocBaseNameMax = 16 }; |
271 | ||
57a6839d A |
272 | U_CAPI int32_t U_EXPORT2 |
273 | ualoc_getAppleParent(const char* localeID, | |
274 | char * parent, | |
275 | int32_t parentCapacity, | |
276 | UErrorCode* err) | |
277 | { | |
278 | UResourceBundle *rb; | |
279 | int32_t len; | |
280 | UErrorCode tempStatus; | |
281 | char locbuf[ULOC_FULLNAME_CAPACITY+1]; | |
08b89b0a | 282 | char * foundDoubleUnderscore; |
57a6839d A |
283 | |
284 | if (U_FAILURE(*err)) { | |
285 | return 0; | |
286 | } | |
287 | if ( (parent==NULL)? parentCapacity!=0: parentCapacity<0 ) { | |
288 | *err = U_ILLEGAL_ARGUMENT_ERROR; | |
289 | return 0; | |
290 | } | |
08b89b0a | 291 | len = uloc_getBaseName(localeID, locbuf, ULOC_FULLNAME_CAPACITY, err); /* canonicalize and strip keywords */ |
57a6839d A |
292 | if (U_FAILURE(*err)) { |
293 | return 0; | |
294 | } | |
295 | if (*err == U_STRING_NOT_TERMINATED_WARNING) { | |
296 | locbuf[ULOC_FULLNAME_CAPACITY] = 0; | |
297 | *err = U_ZERO_ERROR; | |
298 | } | |
08b89b0a A |
299 | foundDoubleUnderscore = uprv_strstr(locbuf, "__"); /* __ comes from bad/missing subtag or variant */ |
300 | if (foundDoubleUnderscore != NULL) { | |
301 | *foundDoubleUnderscore = 0; /* terminate at the __ */ | |
302 | len = uprv_strlen(locbuf); | |
303 | } | |
b331163b | 304 | if (len >= 2 && (uprv_strncmp(locbuf, "en", 2) == 0 || uprv_strncmp(locbuf, "zh", 2) == 0)) { |
57a6839d A |
305 | const char ** forceParentPtr = forceParent; |
306 | const char * testCurLoc; | |
307 | while ( (testCurLoc = *forceParentPtr++) != NULL ) { | |
308 | int cmp = uprv_strcmp(locbuf, testCurLoc); | |
309 | if (cmp <= 0) { | |
310 | if (cmp == 0) { | |
311 | len = uprv_strlen(*forceParentPtr); | |
312 | if (len < parentCapacity) { | |
313 | uprv_strcpy(parent, *forceParentPtr); | |
314 | } else { | |
315 | *err = U_BUFFER_OVERFLOW_ERROR; | |
316 | } | |
317 | return len; | |
318 | } | |
319 | break; | |
320 | } | |
321 | forceParentPtr++; | |
322 | } | |
323 | } | |
324 | tempStatus = U_ZERO_ERROR; | |
325 | rb = ures_openDirect(NULL, locbuf, &tempStatus); | |
326 | if (U_SUCCESS(tempStatus)) { | |
327 | const char * actualLocale = ures_getLocaleByType(rb, ULOC_ACTUAL_LOCALE, &tempStatus); | |
2ca993e8 | 328 | ures_close(rb); |
57a6839d A |
329 | if (U_SUCCESS(tempStatus) && uprv_strcmp(locbuf, actualLocale) != 0) { |
330 | // we have followed an alias | |
331 | len = uprv_strlen(actualLocale); | |
332 | if (len < parentCapacity) { | |
333 | uprv_strcpy(parent, actualLocale); | |
334 | } else { | |
335 | *err = U_BUFFER_OVERFLOW_ERROR; | |
336 | } | |
57a6839d A |
337 | return len; |
338 | } | |
2ca993e8 A |
339 | } |
340 | tempStatus = U_ZERO_ERROR; | |
341 | rb = ures_openDirect(NULL, "supplementalData", &tempStatus); | |
342 | rb = ures_getByKey(rb, "parentLocales", rb, &tempStatus); | |
343 | if (U_SUCCESS(tempStatus)) { | |
344 | UResourceBundle * parentMapBundle = NULL; | |
345 | int32_t childLen = 0; | |
346 | while (childLen == 0) { | |
347 | tempStatus = U_ZERO_ERROR; | |
348 | parentMapBundle = ures_getNextResource(rb, parentMapBundle, &tempStatus); | |
349 | if (U_FAILURE(tempStatus)) { | |
350 | break; // no more parent bundles, normal exit | |
351 | } | |
352 | char childName[kLocBaseNameMax + 1]; | |
353 | childName[kLocBaseNameMax] = 0; | |
354 | const char * childPtr = NULL; | |
355 | if (ures_getType(parentMapBundle) == URES_STRING) { | |
356 | childLen = kLocBaseNameMax; | |
357 | childPtr = ures_getUTF8String(parentMapBundle, childName, &childLen, FALSE, &tempStatus); | |
358 | if (U_FAILURE(tempStatus) || uprv_strncmp(locbuf, childPtr, kLocBaseNameMax) != 0) { | |
359 | childLen = 0; | |
360 | } | |
361 | } else { // should be URES_ARRAY | |
362 | int32_t childCur, childCount = ures_getSize(parentMapBundle); | |
363 | for (childCur = 0; childCur < childCount && childLen == 0; childCur++) { | |
364 | tempStatus = U_ZERO_ERROR; | |
365 | childLen = kLocBaseNameMax; | |
366 | childPtr = ures_getUTF8StringByIndex(parentMapBundle, childCur, childName, &childLen, FALSE, &tempStatus); | |
367 | if (U_FAILURE(tempStatus) || uprv_strncmp(locbuf, childPtr, kLocBaseNameMax) != 0) { | |
368 | childLen = 0; | |
369 | } | |
370 | } | |
371 | } | |
372 | } | |
373 | ures_close(rb); | |
374 | if (childLen > 0) { | |
375 | // parentMapBundle key is the parent we are looking for | |
376 | const char * keyStr = ures_getKey(parentMapBundle); | |
377 | len = uprv_strlen(keyStr); | |
57a6839d | 378 | if (len < parentCapacity) { |
2ca993e8 | 379 | uprv_strcpy(parent, keyStr); |
57a6839d A |
380 | } else { |
381 | *err = U_BUFFER_OVERFLOW_ERROR; | |
382 | } | |
2ca993e8 | 383 | ures_close(parentMapBundle); |
57a6839d A |
384 | return len; |
385 | } | |
2ca993e8 | 386 | ures_close(parentMapBundle); |
57a6839d | 387 | } |
2ca993e8 | 388 | |
57a6839d A |
389 | len = uloc_getParent(locbuf, parent, parentCapacity, err); |
390 | if (U_SUCCESS(*err) && len == 0) { | |
391 | len = 4; | |
392 | if (len < parentCapacity) { | |
393 | uprv_strcpy(parent, "root"); | |
394 | } else { | |
395 | *err = U_BUFFER_OVERFLOW_ERROR; | |
396 | } | |
397 | } | |
398 | return len; | |
399 | } | |
400 | ||
b331163b A |
401 | // ================= |
402 | // Data and related functions for ualoc_localizationsToUse | |
403 | // ================= | |
404 | ||
405 | static const char * appleAliasMap[][2] = { | |
406 | // names are lowercase here because they are looked up after being processed by uloc_getBaseName | |
407 | { "arabic", "ar" }, // T2 | |
408 | { "chinese", "zh_Hans" }, // T0 | |
409 | { "danish", "da" }, // T2 | |
410 | { "dutch", "nl" }, // T1, still in use | |
411 | { "english", "en" }, // T0, still in use | |
412 | { "finnish", "fi" }, // T2 | |
413 | { "french", "fr" }, // T0, still in use | |
414 | { "german", "de" }, // T0, still in use | |
415 | { "italian", "it" }, // T1, still in use | |
416 | { "japanese", "ja" }, // T0, still in use | |
417 | { "korean", "ko" }, // T1 | |
a961784b | 418 | { "no_NO", "nb_NO" }, // special |
b331163b A |
419 | { "norwegian", "nb" }, // T2 |
420 | { "polish", "pl" }, // T2 | |
421 | { "portuguese", "pt" }, // T2 | |
422 | { "russian", "ru" }, // T2 | |
423 | { "spanish", "es" }, // T1, still in use | |
424 | { "swedish", "sv" }, // T2 | |
425 | { "thai", "th" }, // T2 | |
426 | { "turkish", "tr" }, // T2 | |
b331163b | 427 | }; |
2ca993e8 | 428 | enum { kAppleAliasMapCount = UPRV_LENGTHOF(appleAliasMap) }; |
b331163b | 429 | |
340931cb A |
430 | // Most of the entries in the following are cases in which |
431 | // localization bundle inheritance is different from | |
432 | // ICU resource inheritance, and thus are not in parentLocales data. | |
433 | // <rdar://problem/63880069> However, since this is now checked before | |
434 | // the hashmap of parentLocales data, we add a few important entries | |
435 | // from parentLocales data for lookup efficiency. | |
b331163b | 436 | static const char * appleParentMap[][2] = { |
340931cb | 437 | { "ars", "ar" }, // rdar://64497611 |
b331163b | 438 | { "en_150", "en_GB" }, // Apple custom parent |
f3c0d7a5 A |
439 | { "en_AG", "en_GB" }, // Antigua & Barbuda |
440 | { "en_AI", "en_GB" }, // Anguilla | |
b331163b | 441 | { "en_AU", "en_GB" }, // Apple custom parent |
f3c0d7a5 | 442 | { "en_BB", "en_GB" }, // Barbados |
b331163b | 443 | { "en_BD", "en_GB" }, // Apple custom parent |
f3c0d7a5 | 444 | { "en_BM", "en_GB" }, // Bermuda |
340931cb | 445 | { "en_BN", "en_GB" }, // Brunei |
f3c0d7a5 A |
446 | { "en_BS", "en_GB" }, // Bahamas |
447 | { "en_BW", "en_GB" }, // Botswana | |
448 | { "en_BZ", "en_GB" }, // Belize | |
449 | { "en_CC", "en_AU" }, // Cocos (Keeling) Islands | |
f3c0d7a5 A |
450 | { "en_CK", "en_AU" }, // Cook Islands (maybe to en_NZ instead?) |
451 | { "en_CX", "en_AU" }, // Christmas Island | |
b331163b | 452 | { "en_CY", "en_150" }, // Apple locale addition |
a961784b | 453 | { "en_DG", "en_GB" }, |
f3c0d7a5 | 454 | { "en_DM", "en_GB" }, // Dominica |
f3c0d7a5 | 455 | { "en_FJ", "en_GB" }, // Fiji |
a961784b | 456 | { "en_FK", "en_GB" }, |
340931cb | 457 | { "en_GB", "en_001" }, // from parentLocales, added here for efficiency |
f3c0d7a5 | 458 | { "en_GD", "en_GB" }, // Grenada |
a961784b | 459 | { "en_GG", "en_GB" }, |
f3c0d7a5 | 460 | { "en_GH", "en_GB" }, // Ghana |
a961784b | 461 | { "en_GI", "en_GB" }, |
f3c0d7a5 | 462 | { "en_GM", "en_GB" }, // Gambia |
f3c0d7a5 | 463 | { "en_GY", "en_GB" }, // Guyana |
b331163b | 464 | { "en_HK", "en_GB" }, // Apple custom parent |
a961784b | 465 | { "en_IE", "en_GB" }, |
a961784b | 466 | { "en_IM", "en_GB" }, |
b331163b | 467 | { "en_IN", "en_GB" }, // Apple custom parent |
a961784b | 468 | { "en_IO", "en_GB" }, |
a961784b | 469 | { "en_JE", "en_GB" }, |
a62d09fc | 470 | { "en_JM", "en_GB" }, |
f3c0d7a5 A |
471 | { "en_KE", "en_GB" }, // Kenya |
472 | { "en_KI", "en_GB" }, // Kiribati | |
473 | { "en_KN", "en_GB" }, // St. Kitts & Nevis | |
474 | { "en_KY", "en_GB" }, // Cayman Islands | |
475 | { "en_LC", "en_GB" }, // St. Lucia | |
3d1f044b | 476 | { "en_LK", "en_GB" }, // Apple custom parent |
f3c0d7a5 | 477 | { "en_LS", "en_GB" }, // Lesotho |
a961784b | 478 | { "en_MO", "en_GB" }, |
f3c0d7a5 | 479 | { "en_MS", "en_GB" }, // Montserrat |
a961784b | 480 | { "en_MT", "en_GB" }, |
f3c0d7a5 | 481 | { "en_MU", "en_GB" }, // Mauritius |
2ca993e8 | 482 | { "en_MV", "en_GB" }, |
f3c0d7a5 | 483 | { "en_MW", "en_GB" }, // Malawi |
b331163b | 484 | { "en_MY", "en_GB" }, // Apple custom parent |
f3c0d7a5 A |
485 | { "en_NA", "en_GB" }, // Namibia |
486 | { "en_NF", "en_AU" }, // Norfolk Island | |
487 | { "en_NG", "en_GB" }, // Nigeria | |
f3c0d7a5 A |
488 | { "en_NR", "en_AU" }, // Nauru |
489 | { "en_NU", "en_AU" }, // Niue (maybe to en_NZ instead?) | |
2ca993e8 | 490 | { "en_NZ", "en_AU" }, |
f3c0d7a5 | 491 | { "en_PG", "en_AU" }, // Papua New Guinea |
b331163b | 492 | { "en_PK", "en_GB" }, // Apple custom parent |
f3c0d7a5 | 493 | { "en_PN", "en_GB" }, // Pitcairn Islands |
f3c0d7a5 A |
494 | { "en_SB", "en_GB" }, // Solomon Islands |
495 | { "en_SC", "en_GB" }, // Seychelles | |
496 | { "en_SD", "en_GB" }, // Sudan | |
a961784b A |
497 | { "en_SG", "en_GB" }, |
498 | { "en_SH", "en_GB" }, | |
f3c0d7a5 A |
499 | { "en_SL", "en_GB" }, // Sierra Leone |
500 | { "en_SS", "en_GB" }, // South Sudan | |
501 | { "en_SZ", "en_GB" }, // Swaziland | |
502 | { "en_TC", "en_GB" }, // Tristan da Cunha | |
503 | { "en_TO", "en_GB" }, // Tonga | |
504 | { "en_TT", "en_GB" }, // Trinidad & Tobago | |
505 | { "en_TV", "en_GB" }, // Tuvalu | |
506 | { "en_TZ", "en_GB" }, // Tanzania | |
507 | { "en_UG", "en_GB" }, // Uganda | |
508 | { "en_VC", "en_GB" }, // St. Vincent & Grenadines | |
a961784b | 509 | { "en_VG", "en_GB" }, |
f3c0d7a5 A |
510 | { "en_VU", "en_GB" }, // Vanuatu |
511 | { "en_WS", "en_AU" }, // Samoa (maybe to en_NZ instead?) | |
512 | { "en_ZA", "en_GB" }, // South Africa | |
513 | { "en_ZM", "en_GB" }, // Zambia | |
514 | { "en_ZW", "en_GB" }, // Zimbabwe | |
340931cb A |
515 | { "es_MX", "es_419" }, // from parentLocales, added here for efficiency |
516 | { "wuu", "wuu_Hans"}, // rdar://64497611 | |
517 | { "wuu_Hans", "zh_Hans" }, // rdar://64497611 | |
518 | { "wuu_Hant", "zh_Hant" }, // rdar://64497611 | |
519 | { "yue", "yue_Hant"}, | |
520 | { "yue_Hans", "zh_Hans" }, // <rdar://problem/30671866> | |
521 | { "yue_Hant", "zh_Hant" }, // <rdar://problem/30671866> | |
522 | { "zh_Hant", "root" }, // from parentLocales, added here for efficiency | |
b331163b | 523 | }; |
2ca993e8 A |
524 | enum { kAppleParentMapCount = UPRV_LENGTHOF(appleParentMap) }; |
525 | ||
b331163b A |
526 | U_CDECL_BEGIN |
527 | static UBool U_CALLCONV ualocale_cleanup(void); | |
528 | U_CDECL_END | |
529 | ||
530 | U_NAMESPACE_BEGIN | |
531 | ||
532 | static UInitOnce gUALocaleCacheInitOnce = U_INITONCE_INITIALIZER; | |
533 | ||
534 | static int gMapDataState = 0; // 0 = not initialized, 1 = initialized, -1 = failure | |
340931cb | 535 | static UResourceBundle* gLanguageAliasesBundle = NULL; |
b331163b A |
536 | |
537 | U_NAMESPACE_END | |
538 | ||
539 | U_CDECL_BEGIN | |
540 | ||
541 | static UBool U_CALLCONV ualocale_cleanup(void) | |
542 | { | |
543 | U_NAMESPACE_USE | |
544 | ||
b331163b | 545 | if (gMapDataState > 0) { |
340931cb A |
546 | ures_close(gLanguageAliasesBundle); |
547 | gLanguageAliasesBundle = NULL; | |
b331163b A |
548 | } |
549 | gMapDataState = 0; | |
340931cb | 550 | gUALocaleCacheInitOnce.reset(); |
b331163b A |
551 | return TRUE; |
552 | } | |
553 | ||
554 | static void initializeMapData() { | |
555 | U_NAMESPACE_USE | |
556 | ||
b331163b A |
557 | ucln_common_registerCleanup(UCLN_COMMON_LOCALE, ualocale_cleanup); |
558 | ||
340931cb A |
559 | UResourceBundle * curBundle; |
560 | UErrorCode status = U_ZERO_ERROR; | |
561 | curBundle = ures_openDirect(NULL, "metadata", &status); | |
562 | curBundle = ures_getByKey(curBundle, "alias", curBundle, &status); | |
563 | curBundle = ures_getByKey(curBundle, "language", curBundle, &status); | |
564 | if (U_FAILURE(status)) { | |
b331163b A |
565 | gMapDataState = -1; // failure |
566 | return; | |
567 | } | |
340931cb | 568 | gLanguageAliasesBundle = curBundle; // URES_TABLE resource, 420 entries in ICU-6600n |
2ca993e8 | 569 | #if DEBUG_UALOC |
340931cb | 570 | printf("# metadata/alias/language size %d\n", ures_getSize(curBundle)); |
2ca993e8 | 571 | #endif |
340931cb | 572 | |
b331163b A |
573 | gMapDataState = 1; |
574 | } | |
575 | ||
576 | U_CDECL_END | |
577 | ||
340931cb A |
578 | // comparator for binary search of appleAliasMap |
579 | static int compareAppleMapElements(const void *key, const void *entry) { | |
580 | return uprv_strcmp((const char *)key, ((const char **)entry)[0]); | |
581 | } | |
582 | ||
b331163b A |
583 | // The following maps aliases, etc. Ensures 0-termination if no error. |
584 | static void ualoc_normalize(const char *locale, char *normalized, int32_t normalizedCapacity, UErrorCode *status) | |
585 | { | |
586 | if (U_FAILURE(*status)) { | |
587 | return; | |
588 | } | |
589 | // uloc_minimizeSubtags(locale, normalized, normalizedCapacity, status); | |
590 | ||
340931cb A |
591 | const char *replacement = locale; // fallback to no replacement |
592 | int32_t len; | |
593 | // first check in appleAliasMap using binary search | |
594 | const char** entry = (const char**)bsearch(locale, appleAliasMap, kAppleAliasMapCount, sizeof(appleAliasMap[0]), compareAppleMapElements); | |
595 | if (entry != NULL) { | |
596 | replacement = entry[1]; | |
597 | } else if (icu::gMapDataState > 0) { | |
598 | // check in gLanguageAliasesBundle | |
599 | UErrorCode localStatus = U_ZERO_ERROR; | |
600 | UResourceBundle * aliasMapBundle = ures_getByKey(icu::gLanguageAliasesBundle, locale, NULL, &localStatus); | |
601 | if (U_SUCCESS(localStatus) && aliasMapBundle != NULL) { | |
602 | len = normalizedCapacity; | |
603 | ures_getUTF8StringByKey(aliasMapBundle, "replacement", normalized, &len, TRUE, status); | |
604 | if (U_SUCCESS(*status) && len >= normalizedCapacity) { | |
605 | *status = U_BUFFER_OVERFLOW_ERROR; // treat unterminated as error | |
606 | } | |
607 | ures_close(aliasMapBundle); | |
608 | return; | |
609 | } | |
610 | } | |
611 | ||
612 | len = strnlen(replacement, normalizedCapacity); | |
b331163b A |
613 | if (len < normalizedCapacity) { // allow for 0 termination |
614 | uprv_strcpy(normalized, replacement); | |
615 | } else { | |
616 | *status = U_BUFFER_OVERFLOW_ERROR; | |
617 | } | |
618 | } | |
619 | ||
620 | static void ualoc_getParent(const char *locale, char *parent, int32_t parentCapacity, UErrorCode *status) | |
621 | { | |
622 | if (U_FAILURE(*status)) { | |
623 | return; | |
624 | } | |
340931cb A |
625 | // first check in appleParentMap using binary search |
626 | int32_t len; | |
627 | const char** entry = (const char**)bsearch(locale, appleParentMap, kAppleParentMapCount, sizeof(appleParentMap[0]), compareAppleMapElements); | |
628 | if (entry != NULL) { | |
629 | const char* replacement = entry[1]; | |
630 | len = uprv_strlen(replacement); | |
631 | if (len < parentCapacity) { // allow for 0 termination | |
632 | uprv_strcpy(parent, replacement); | |
633 | } else { | |
634 | *status = U_BUFFER_OVERFLOW_ERROR; | |
b331163b | 635 | } |
340931cb A |
636 | return; |
637 | } | |
638 | len = ures_getLocParent(locale, parent, parentCapacity - 1, status); | |
639 | if (len > 0 || U_FAILURE(*status)) { | |
640 | parent[parentCapacity - 1] = 0; // ensure 0 termination in case of U_STRING_NOT_TERMINATED_WARNING | |
641 | return; | |
b331163b A |
642 | } |
643 | uloc_getParent(locale, parent, parentCapacity - 1, status); | |
644 | parent[parentCapacity - 1] = 0; // ensure 0 termination in case of U_STRING_NOT_TERMINATED_WARNING | |
645 | } | |
646 | ||
340931cb A |
647 | enum { kLangScriptRegMaxLen = ULOC_LANG_CAPACITY + ULOC_SCRIPT_CAPACITY + ULOC_COUNTRY_CAPACITY }; // currently 22 |
648 | ||
const int32_t kMaxLocaleIDLength = 58;  // ULOC_FULLNAME_CAPACITY - ULOC_KEYWORD_AND_VALUES_CAPACITY: locales without variants should never be more than 24 chars, the excess is just to cover variant codes (+1 for null termination)
const int32_t kMaxParentChainLength = 7;
const int32_t kCharStorageBlockSize = 650;  // very few of the unit tests used more than 650 bytes of character storage

/**
 * Bump allocator for locale ID strings: a chain of fixed-size character blocks.
 * Usage is nextPtr() -> write at most kMaxLocaleIDLength chars there -> advance(charsWritten).
 * Only the last block in the chain is written to; earlier blocks are full.
 * Deleting a block deletes every block chained after it.
 */
struct LocIDCharStorage {
    char chars[kCharStorageBlockSize];  // the storage itself
    char* curTop;                       // next unused position in chars
    char* limit;                        // one past the end of chars
    LocIDCharStorage* nextBlock;        // overflow block, or NULL while this is the last block

    LocIDCharStorage() : chars(), curTop(chars), limit(curTop + kCharStorageBlockSize), nextBlock(NULL) {}
    ~LocIDCharStorage() { delete nextBlock; }

    /**
     * Returns a position with room for a maximum-length locale ID plus its terminator,
     * allocating a new block when the current one is too full.
     */
    char* nextPtr() {
        if (nextBlock == NULL) {
            if (limit - curTop > kMaxLocaleIDLength) {
                // return the top of the current block only if there's enough room for a maximum-length locale ID--
                // this keeps us from having to preflight or repeat any of the actual uloc calls and wastes
                // relatively little space
                return curTop;
            } else {
                // if we DON'T have enough space for a max-length locale ID, allocate a new block...
                nextBlock = new LocIDCharStorage();
                // ...and fall through to the line below to return its top pointer
            }
        }
        return nextBlock->nextPtr();
    }

    /**
     * Marks charsUsed characters at the position last returned by nextPtr() as consumed
     * and 0-terminates them.
     * @param charsUsed Length reported by the call that filled the buffer. It is clamped to
     *                  [0, kMaxLocaleIDLength]: a uloc call that overflows reports the full
     *                  length it would have needed, which must not move curTop past the
     *                  space nextPtr() guaranteed.
     */
    void advance(int32_t charsUsed) {
        if (nextBlock != NULL) {
            nextBlock->advance(charsUsed);
            return;
        }
        if (charsUsed < 0) {
            charsUsed = 0;
        } else if (charsUsed > kMaxLocaleIDLength) {
            charsUsed = kMaxLocaleIDLength;
        }
        curTop += charsUsed;
        *curTop++ = '\0'; // in rare cases, the ICU call might not have null-terminated the result, so force it here
    }
};
687 | ||
688 | /** | |
689 | * Data structure used by ualoc_localizationsToUse() below to cache the various transformed versions of a single locale ID. | |
690 | * All char* members are pointers into storage managed separately by the caller-- usually pointers into a separate array of char intended to | |
691 | * hold all of the strings in bulk. | |
692 | */ | |
693 | struct LocaleIDInfo { | |
694 | const char* original; //< Pointer to the original locale ID | |
695 | const char* base; //< The result of uloc_getBaseName() on the original locale ID | |
696 | const char* normalized; //< The result of ualoc_normalize() on the value of `base` | |
697 | const char* language; //< The language code from `normalized` | |
698 | const char* languageGroup; //< Same as `language`, except for certain languages that fall back to other languages | |
699 | const char* parentChain[kMaxParentChainLength]; //< Array of the results of calling ualoc_getParent() repeatedly on `normalized` | |
700 | ||
701 | LocaleIDInfo(); | |
702 | void initBaseNames(const char* originalID, LocIDCharStorage& charStorage, UErrorCode* err); | |
703 | void calcParentChain(LocIDCharStorage& charStorage, UBool penalizeNonDefaultCountry, UErrorCode* err); | |
704 | UBool specifiesCountry(); | |
705 | #if DEBUG_UALOC | |
706 | void dump(const char *originalID, LocIDCharStorage& charStorage, UBool penalizeNonDefaultCountry, UErrorCode *err); | |
707 | #endif | |
708 | }; | |
709 | ||
710 | LocaleIDInfo::LocaleIDInfo() { | |
711 | // these are the only two fields that HAVE to be initialized to NULL | |
712 | original = NULL; | |
713 | parentChain[0] = NULL; | |
b331163b A |
714 | } |
715 | ||
340931cb A |
716 | /** |
717 | * Caches the `originalID` in `original` and fills in `base`, `normalized`, and `language. If these fields have already been filled in by an earlier call, this | |
718 | * function won't fill them in again. | |
719 | * @param originalID The locale ID to base the other values on. | |
720 | * @param textPtr A pointer to a `char*` variable that points into an array of character storage maintained by the caller. The actual characters in this | |
721 | * object's strings are written to this storage and `textPtr` is advanced to point to the first memory position after the last string written to the storage. | |
722 | * @param textPtrLimit A pointer to the position immediately beyond the end of the separate character storage. This function won't write beyond | |
723 | * this point and will return U_BUFFER_OVERFLOW if the storage is filled (which shouldn't happen). | |
724 | * @param err Pointer to a variable holding the ICU error code. | |
725 | */ | |
726 | void LocaleIDInfo::initBaseNames(const char *originalID, LocIDCharStorage& charStorage, UErrorCode *err) { | |
727 | // don't fill in the fields if they're already filled in | |
728 | if (original == NULL) { | |
729 | original = originalID; | |
730 | ||
731 | base = charStorage.nextPtr(); | |
732 | int32_t length = uloc_getBaseName(original, const_cast<char*>(base), kMaxLocaleIDLength, err); | |
733 | charStorage.advance(length); | |
734 | ||
735 | normalized = charStorage.nextPtr(); | |
736 | ualoc_normalize(base, const_cast<char*>(normalized), kMaxLocaleIDLength, err); | |
737 | charStorage.advance(uprv_strlen(normalized)); | |
738 | ||
739 | language = charStorage.nextPtr(); | |
740 | length = uloc_getLanguage(normalized, const_cast<char*>(language), kMaxLocaleIDLength, err); | |
741 | charStorage.advance(length); | |
742 | languageGroup = language; | |
743 | ||
744 | // The `languageGroup` field is used for performance optimization; we don't need to walk the parent chain if the | |
745 | // languages of the two locales being compared are different. This code accounts for the few cases of different | |
746 | // language codes that need to be considered equivalent for comparison purposes. | |
747 | static const char* likeLanguages[] = { | |
748 | "ars", "ar", | |
749 | "no", "nb", | |
750 | "wuu", "zh", | |
751 | "yue", "zh" | |
752 | }; | |
753 | for (int32_t i = 0; i < UPRV_LENGTHOF(likeLanguages); i += 2) { | |
754 | if (uprv_strcmp(language, likeLanguages[i]) == 0) { | |
755 | languageGroup = likeLanguages[i + 1]; | |
756 | break; | |
757 | } | |
b331163b A |
758 | } |
759 | } | |
b331163b A |
760 | } |
761 | ||
340931cb A |
762 | /** |
763 | * Calculates the parent chain for the locale ID in `original` by calling `ualoc_getParent()` repeatedly until it returns the empty string or "root". If this object's | |
764 | * parent chain has previously been calculated, this won't do it again. The parent chain in the LocaleIDInfo object is terminated by a NULL entry. | |
765 | * @param textPtr A pointer to a `char*` variable that points into an array of character storage maintained by the caller. The actual characters in this | |
766 | * object's strings are written to this storage and `textPtr` is advanced to point to the first memory position after the last string written to the storage. | |
767 | * @param textPtrLimit A pointer to the position immediately beyond the end of the separate character storage. This function won't write beyond | |
768 | * this point and will return U_BUFFER_OVERFLOW if the storage is filled (which shouldn't happen). | |
769 | * @param penalizeNonDefaultCountry If TRUE, an extra entry is added to the parent chain if the original locale specifies a country other than | |
770 | * the default country for the locale's language. | |
771 | * @param err Pointer to a variable holding the ICU error code. | |
772 | */ | |
773 | void LocaleIDInfo::calcParentChain(LocIDCharStorage& charStorage, UBool penalizeNonDefaultCountry, UErrorCode *err) { | |
774 | // don't calculate the parent chain if it's already been calculated | |
775 | if (parentChain[0] != NULL) { | |
776 | return; | |
777 | } | |
778 | ||
779 | int32_t index = 0; | |
780 | ||
781 | // Entry 0 in the parent chain is always the same as `normalized`-- this simplifies distance calculations. | |
782 | parentChain[index] = normalized; | |
783 | ||
784 | // If the caller asks to penalize the non-default country (which it does for entries in `availableLocalizations` | |
785 | // but not for entries in `preferredLanguages`), check to see if the original locale ID specifies a country code | |
786 | // for a country other than the default country for the specified language (as determined by uloc_minimizeSubtags() ). | |
787 | // If the country is NOT the default for the language, artifically lengthen the parent chain by also putting | |
788 | // `normalized` into entry 1 in the parent chain. We do this to bias our similarity scores toward the default country. | |
789 | // (e.g., if `preferredLanguages` is { it } and `availableLocalizations` is { it_CH, it_IT }, this causes us to return | |
790 | // `it_IT` even though it comes second in the list because it's the default country for the language.) | |
791 | if (penalizeNonDefaultCountry) { | |
792 | UErrorCode dummyErr = U_ZERO_ERROR; | |
793 | if (uloc_getCountry(normalized, NULL, 0, &dummyErr) > 0) { | |
794 | if (uprv_strcmp(normalized, "es_MX") != 0 && uprv_strcmp(normalized, "zh_Hant_TW") != 0) { | |
795 | dummyErr = U_ZERO_ERROR; | |
796 | char minimizedLocale[kLocBaseNameMax]; | |
797 | uloc_minimizeSubtags(normalized, minimizedLocale, kLocBaseNameMax, &dummyErr); | |
798 | if (uloc_getCountry(minimizedLocale, NULL, 0, &dummyErr)) { | |
799 | parentChain[++index] = normalized; | |
800 | } | |
801 | } | |
802 | } | |
803 | } | |
804 | ||
805 | // Walk the locale ID's parent chain using ualoc_getParent(). That function will return "" or "root" when it | |
806 | // gets to the end of the chain, but internall we use NULL to mark the end of the chain. | |
807 | while (index < kMaxParentChainLength && parentChain[index] != NULL) { | |
808 | char* textPtr = charStorage.nextPtr(); | |
809 | ualoc_getParent(parentChain[index], textPtr, kMaxLocaleIDLength, err); | |
810 | ++index; | |
811 | if (textPtr[0] == '\0' || uprv_strcmp(textPtr, "root") == 0) { | |
812 | parentChain[index] = NULL; | |
813 | } else { | |
814 | parentChain[index] = textPtr; | |
815 | charStorage.advance(uprv_strlen(textPtr)); | |
816 | } | |
817 | } | |
818 | } | |
819 | ||
820 | UBool LocaleIDInfo::specifiesCountry() { | |
821 | UErrorCode err = U_ZERO_ERROR; | |
822 | int32_t countryLength = uloc_getCountry(normalized, NULL, 0, &err); | |
823 | return countryLength != 0; | |
824 | } | |
825 | ||
#if DEBUG_UALOC
/**
 * Debugging aid: makes sure this object is fully populated (base names and parent chain) and then prints it to stdout on
 * one line, as "[ original -> base -> normalized ]" followed by every parent-chain entry after entry 0.
 * Parameters have the same meaning as for initBaseNames() and calcParentChain().
 */
void LocaleIDInfo::dump(const char *originalID, LocIDCharStorage& charStorage, UBool penalizeNonDefaultCountry, UErrorCode *err) {
    initBaseNames(originalID, charStorage, err);
    calcParentChain(charStorage, penalizeNonDefaultCountry, err);

    printf("[ %s -> %s -> %s ]", original, base, normalized);

    // entry 0 is `normalized`, which was printed above, so start from entry 1
    const char* const *ancestor = &parentChain[1];
    while (*ancestor != NULL) {
        printf(" -> %s", *ancestor);
        ++ancestor;
    }
    printf("\n");
}
#endif  // DEBUG_UALOC
b331163b A |
841 | |
842 | int32_t | |
843 | ualoc_localizationsToUse( const char* const *preferredLanguages, | |
844 | int32_t preferredLanguagesCount, | |
845 | const char* const *availableLocalizations, | |
846 | int32_t availableLocalizationsCount, | |
847 | const char* *localizationsToUse, | |
848 | int32_t localizationsToUseCapacity, | |
849 | UErrorCode *status ) | |
850 | { | |
851 | if (U_FAILURE(*status)) { | |
852 | return -1; | |
853 | } | |
854 | if (preferredLanguages == NULL || availableLocalizations == NULL || localizationsToUse == NULL) { | |
855 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
856 | return -1; | |
857 | } | |
858 | // get resource data, need to protect with mutex | |
0f5d89e8 A |
859 | if (icu::gMapDataState == 0) { |
860 | umtx_initOnce(icu::gUALocaleCacheInitOnce, initializeMapData); | |
b331163b | 861 | } |
340931cb | 862 | |
2ca993e8 | 863 | #if DEBUG_UALOC |
340931cb A |
864 | printf("--------------------------------------------------------------------------------\n"); |
865 | printf("Preferred languages: "); | |
866 | for (int32_t i = 0; i < preferredLanguagesCount; i++) { | |
867 | printf("%s ", preferredLanguages[i]); | |
2ca993e8 | 868 | } |
340931cb A |
869 | printf("\nAvailable localizations: "); |
870 | for (int32_t i = 0; i < availableLocalizationsCount; i++) { | |
871 | printf("%s ", availableLocalizations[i]); | |
872 | } | |
873 | printf("\n\n"); | |
874 | #endif // DEBUG_UALOC | |
875 | ||
876 | LocaleIDInfo prefLangInfos[preferredLanguagesCount]; | |
877 | LocaleIDInfo availLocInfos[availableLocalizationsCount]; | |
878 | LocIDCharStorage charStorage; | |
879 | LocaleIDInfo* result = NULL; | |
880 | LocaleIDInfo* portugueseResult = NULL; | |
881 | int32_t resultScore = 999; | |
882 | ||
2ca993e8 | 883 | #if DEBUG_UALOC |
340931cb A |
884 | for (int32_t i = 0; i < preferredLanguagesCount; i++) { |
885 | prefLangInfos[i].dump(preferredLanguages[i], charStorage, FALSE, status); | |
886 | } | |
887 | printf("\n"); | |
888 | for (int32_t i = 0; i < availableLocalizationsCount; i++) { | |
889 | availLocInfos[i].dump(availableLocalizations[i], charStorage, TRUE, status); | |
890 | } | |
891 | printf("\n"); | |
892 | #endif // DEBUG_UALOC | |
893 | ||
894 | // Loop over the entries in `preferredLanguages` matching them against `availableLocalizations`. The first preferred | |
895 | // language that has a matching available localization is the only one that contributes to the result (except in the | |
896 | // case of Portuguese, about which more below). | |
897 | for (int32_t prefLangIndex = 0; result == NULL && prefLangIndex < preferredLanguagesCount; ++prefLangIndex) { | |
898 | LocaleIDInfo* prefLangInfo = &prefLangInfos[prefLangIndex]; | |
899 | prefLangInfo->initBaseNames(preferredLanguages[prefLangIndex], charStorage, status); | |
900 | ||
901 | // Loop over the entries in `availableLocalizations`, looking for the best match to the current entry | |
902 | // from `preferredLanguages`. | |
903 | for (int32_t availLocIndex = 0; availLocIndex < availableLocalizationsCount; ++availLocIndex) { | |
904 | LocaleIDInfo* availLocInfo = &availLocInfos[availLocIndex]; | |
905 | availLocInfo->initBaseNames(availableLocalizations[availLocIndex], charStorage, status); | |
906 | ||
907 | // Give the highest preference (a score of -1) to locales whose base names are an exact match. | |
908 | if (resultScore > -1 && uprv_strcmp(prefLangInfo->base, availLocInfo->base) == 0) { | |
909 | result = availLocInfo; | |
910 | resultScore = -1; | |
911 | // Give the second-highest preference (a score of 0) to locales whose normalized names are an exact match. | |
912 | } else if (resultScore > 0 && uprv_strcmp(prefLangInfo->normalized, availLocInfo->normalized) == 0) { | |
913 | result = availLocInfo; | |
914 | resultScore = 0; | |
915 | } else if (resultScore > 0 && uprv_strcmp(prefLangInfo->languageGroup, availLocInfo->languageGroup) == 0) { | |
916 | // If we haven't yet found an exact match, look to see if the two locales have an exact match further | |
917 | // down in their parent chains. We can skip checking the parent chains if the locales' languages are | |
918 | // different since (with a couple of important exceptions) the parent chain will never change language. | |
919 | prefLangInfo->calcParentChain(charStorage, FALSE, status); | |
920 | availLocInfo->calcParentChain(charStorage, TRUE, status); | |
921 | ||
922 | if (U_SUCCESS(*status)) { | |
923 | // Compare each pair of entries in the two locales' parent chains. If we find an exact match, | |
924 | // assign it a score based on how deep into the two parent chains it is (preference is given | |
925 | // to matches higher in the two locales' parent chains). The locale with the lowest score | |
926 | // will be our result. | |
927 | for (int32_t prefLangParentIndex = 0; prefLangInfo->parentChain[prefLangParentIndex] != NULL; ++prefLangParentIndex) { | |
928 | for (int32_t availLocParentIndex = 0; availLocInfo->parentChain[availLocParentIndex] != NULL; ++availLocParentIndex) { | |
929 | if (uprv_strcmp(prefLangInfo->parentChain[prefLangParentIndex], availLocInfo->parentChain[availLocParentIndex]) == 0) { | |
930 | if (uprv_strcmp(prefLangInfo->normalized, "pt_PT") == 0 && uprv_strcmp(availLocInfo->normalized, "pt_BR") == 0) { | |
931 | // We don't want to match pt_BR with pt_PT unless there are no better matches anywhere-- | |
932 | // if we see this match, store it "off to the side", but continue as though we didn't find | |
933 | // a match at all. We only return it if we _don't_ find any other matches. | |
934 | portugueseResult = availLocInfo; | |
935 | } else { | |
936 | int32_t score = prefLangParentIndex + availLocParentIndex; | |
937 | if (uprv_strcmp(prefLangInfo->language, availLocInfo->language) != 0) { | |
938 | // Add a one-point penalty to the score if the two locales have different languages | |
939 | ++score; | |
940 | } | |
941 | if (score < resultScore) { | |
942 | resultScore = score; | |
943 | result = availLocInfo; | |
2ca993e8 A |
944 | } |
945 | } | |
946 | } | |
947 | } | |
948 | } | |
b331163b | 949 | } |
b331163b A |
950 | } |
951 | } | |
2ca993e8 | 952 | } |
340931cb A |
953 | |
954 | // If our result isn't an exact match and does specify a country, check to see if there are any entries further | |
955 | // down in the preferred language list that have the same language as the current result but ARE an exact match with | |
956 | // something in the available-localizations list. That is, if the preferred languages list is [ fr-CH, fr-CA ] and | |
957 | // the available localizations list is [ fr-FR, fr-CA ], we want to return fr-CA, but we only want to do that with | |
958 | // variations of the language we originally matched. (We do go with the match if it doesn't specify a country-- | |
959 | // we want "en" to match "en-US" and to be preferred over matches later in the preferred-languages list.) | |
960 | // [NOTE: This logic was causing side effects with Chinese, which is more complicated, so for now we have logic | |
961 | // to skip it when the original result is Chinese.] | |
962 | if (result != NULL && resultScore > 0 && result->specifiesCountry() && uprv_strcmp(result->language, "zh") != 0) { | |
963 | for (int32_t prefLangIndex = 0; prefLangIndex < preferredLanguagesCount; ++prefLangIndex) { | |
964 | LocaleIDInfo* prefLangInfo = &prefLangInfos[prefLangIndex]; | |
965 | prefLangInfo->initBaseNames(preferredLanguages[prefLangIndex], charStorage, status); | |
966 | if (uprv_strcmp(prefLangInfo->language, result->language) == 0) { | |
967 | for (int32_t availLocIndex = 0; availLocIndex < availableLocalizationsCount; ++availLocIndex) { | |
968 | LocaleIDInfo* availLocInfo = &availLocInfos[availLocIndex]; | |
969 | if (uprv_strcmp(prefLangInfo->base, availLocInfo->base) == 0 || uprv_strcmp(prefLangInfo->normalized, availLocInfo->normalized) == 0) { | |
970 | result = &availLocInfos[availLocIndex]; | |
971 | break; | |
2ca993e8 | 972 | } |
340931cb | 973 | } |
2ca993e8 | 974 | } |
b331163b A |
975 | } |
976 | } | |
340931cb A |
977 | |
978 | // Write out our results. | |
979 | int32_t locsToUseCount = 0; | |
980 | ||
981 | // If the only match we found above is matching pt_PT to pt_BR, we can use it as our result. | |
982 | if (result == NULL && portugueseResult != NULL) { | |
983 | result = portugueseResult; | |
984 | } | |
985 | ||
986 | // If we found a match above, walk its parent chain and search `availableLocales` for any entries that occur in the | |
987 | // main result's parent chain. If we find any, we want to return those too. (The extra wrinkles below are to keep | |
988 | // us from putting the same locale into the result list more than once.) | |
989 | if (result != NULL) { | |
990 | localizationsToUse[locsToUseCount++] = result->original; | |
991 | ||
992 | result->calcParentChain(charStorage, TRUE, status); | |
993 | for (int32_t parentChainIndex = 0; result->parentChain[parentChainIndex] != NULL; ++parentChainIndex) { | |
994 | if (parentChainIndex > 0 && result->parentChain[parentChainIndex - 1] == result->parentChain[parentChainIndex]) { | |
995 | continue; | |
996 | } | |
997 | for (int32_t availLocIndex = 0; availLocIndex < availableLocalizationsCount; ++availLocIndex) { | |
998 | LocaleIDInfo* availLocInfo = &availLocInfos[availLocIndex]; | |
999 | if (result->original == availLocInfo->original) { | |
1000 | continue; | |
1001 | } else if (locsToUseCount < localizationsToUseCapacity && uprv_strcmp(result->parentChain[parentChainIndex], "zh_Hant_HK") == 0 && uprv_strcmp(availLocInfo->normalized, "zh_Hant_TW") == 0) { | |
1002 | // HACK for Chinese: If we find "zh_Hant_HK" while walking the result's parent chain and the available localizations list includes "zh_Hant_TW", include "zh_Hant_TW" in the results list too | |
1003 | localizationsToUse[locsToUseCount++] = availLocInfo->original; | |
1004 | } else if (locsToUseCount < localizationsToUseCapacity && uprv_strcmp(result->parentChain[parentChainIndex], availLocInfo->normalized) == 0) { | |
1005 | localizationsToUse[locsToUseCount++] = availLocInfo->original; | |
b331163b A |
1006 | } |
1007 | } | |
1008 | } | |
340931cb A |
1009 | } |
1010 | ||
1011 | // if our result array is empty, check to see if the availableLocalizations list contains the special sentinel | |
1012 | // value "zxx" (which means "no linguistic content"). If it does, return that instead of the empty list | |
1013 | if (locsToUseCount == 0) { | |
1014 | int32_t zxxPos = -1; | |
1015 | for (int32_t i = 0; i < availableLocalizationsCount; i++) { | |
1016 | if (uprv_strcmp(availableLocalizations[i], "zxx") == 0) { | |
1017 | zxxPos = i; | |
1018 | break; | |
b331163b A |
1019 | } |
1020 | } | |
340931cb A |
1021 | if (zxxPos >= 0) { |
1022 | localizationsToUse[locsToUseCount++] = availableLocalizations[zxxPos]; | |
1023 | } | |
b331163b | 1024 | } |
340931cb A |
1025 | |
1026 | #if DEBUG_UALOC | |
1027 | printf("Localizations to use: "); | |
1028 | for (int32_t i = 0; i < locsToUseCount; i++) { | |
1029 | printf("%s ", localizationsToUse[i]); | |
1030 | } | |
1031 | printf("\n\n"); | |
1032 | #endif // DEBUG_UALOC | |
b331163b A |
1033 | return locsToUseCount; |
1034 | } |