]> git.saurik.com Git - apple/cf.git/blob - CFLocaleIdentifier.c
CF-476.13.tar.gz
[apple/cf.git] / CFLocaleIdentifier.c
1 /*
2 * Copyright (c) 2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /*
24 CFLocaleIdentifier.c
25 Copyright (c) 2002-2007, Apple Inc. All rights reserved.
26 Responsibility: Christopher Kane
27
28 CFLocaleIdentifier.c defines
29 - enum value kLocaleIdentifierCStringMax
30 - structs KeyStringToResultString, SpecialCaseUpdates
31 and provides the following data for the functions
32 CFLocaleCreateCanonicalLocaleIdentifierFromScriptManagerCodes,
33 CFLocaleCreateCanonicalLocaleIdentifierFromString
34 CFLocaleCreateCanonicalLanguageIdentifierFromString
35
36 1. static const char * regionCodeToLocaleString[]; enum kNumRegionCodeToLocaleString;
37 map RegionCode 0..kNumRegionCodeToLocaleString-1 to canonical locale string
38
39 2. static const char * langCodeToLocaleString[]; enum kNumLangCodeToLocaleString;
40 map LangCode 0..kNumLangCodeToLocaleString-1 to canonical locale string
41
42 3. static const KeyStringToResultString oldAppleLocaleToCanonical[]; enum kNumOldAppleLocaleToCanonical;
43 map old Apple string oldAppleLocaleToCanonical[n].key
44 to canonical locale string oldAppleLocaleToCanonical[n].result
45 for n = 0..kNumOldAppleLocaleToCanonical-1
46
47 4. static const KeyStringToResultString localeStringPrefixToCanonical[]; enum kNumLocaleStringPrefixToCanonical;
48 map non-canonical language prefix (3-letter, obsolete) localeStringPrefixToCanonical[n].key
49 to updated replacement localeStringPrefixToCanonical[n].result
50 for n = 0..kNumLocaleStringPrefixToCanonical-1
51
52 5. static const SpecialCaseUpdates specialCases[];
53 various special cases for updating region codes, or for updating language codes based on region codes
54
55 6. static const KeyStringToResultString localeStringRegionToDefaults[]; enum kNumLocaleStringRegionToDefaults;
56 map locale string region tag localeStringRegionToDefaults[n].key
57 to default substrings to delete localeStringRegionToDefaults[n].result
58 for n = 0..kNumLocaleStringRegionToDefaults-1
59
60 7. static const KeyStringToResultString localeStringPrefixToDefaults[]; enum kNumLocaleStringPrefixToDefaults;
61 map locale string initial part localeStringPrefixToDefaults[n].key
62 to default substrings to delete localeStringPrefixToDefaults[n].result
63 for n = 0..kNumLocaleStringPrefixToDefaults-1
64
65 8. static const KeyStringToResultString appleLocaleToLanguageString[]; enum kNumAppleLocaleToLanguageString;
66 map Apple locale string appleLocaleToLanguageString[n].key
67 to equivalent language string appleLocaleToLanguageString[n].result
68 for n = 0..kNumAppleLocaleToLanguageString-1
69
70 */
71
72 #include <CoreFoundation/CFString.h>
73 #include <ctype.h>
74 #include <string.h>
75 #include <stdlib.h>
76 #include <unicode/uloc.h>
77
78
79 // Max byte length of locale identifier (ASCII) as C string, including terminating null byte
80 enum {
81 kLocaleIdentifierCStringMax = ULOC_FULLNAME_CAPACITY + ULOC_KEYWORD_AND_VALUES_CAPACITY // currently 56 + 100
82 };
83
// Row type for the string-to-string data tables that serve
// CFLocaleCreateCanonicalLocaleIdentifierFromString:
// `key` is the string looked up, `result` is the string it maps to.
typedef struct KeyStringToResultString {
    const char *key;
    const char *result;
} KeyStringToResultString;
90
// Row type for the special-case data table that serves
// CFLocaleCreateCanonicalLocaleIdentifierFromString.
// Each row names a language and two (region, update) pairs.
// NOTE(review): how the consuming code applies reg1/update1 versus
// reg2/update2 is not visible here - confirm against the lookup routine.
typedef struct SpecialCaseUpdates {
    const char *lang;
    const char *reg1;
    const char *update1;
    const char *reg2;
    const char *update2;
} SpecialCaseUpdates;
100
101
static const char * const regionCodeToLocaleString[] = {
    // Canonical locale string for each RegionCode; the RegionCode is the array index.
    //
    // Row layout:  /* RegionCode */ canonical string,  // region constant; LangCode language constant; remarks
    //   [one-way]        row carried the "* one-way mapping" mark
    //   {bundle: "xx"}   __CFBundleLocaleAbbreviationsArray string, where it differs from the canonical one
    //   NULL             no locale string for that code
    /*   0 */ "en_US",       // verUS; 0 langEnglish
    /*   1 */ "fr_FR",       // verFrance; 1 langFrench
    /*   2 */ "en_GB",       // verBritain; 0 langEnglish
    /*   3 */ "de_DE",       // verGermany; 2 langGerman
    /*   4 */ "it_IT",       // verItaly; 3 langItalian
    /*   5 */ "nl_NL",       // verNetherlands; 4 langDutch
    /*   6 */ "nl_BE",       // verFlemish; 34 langFlemish (redundant, =Dutch)
    /*   7 */ "sv_SE",       // verSweden; 5 langSwedish
    /*   8 */ "es_ES",       // verSpain; 6 langSpanish
    /*   9 */ "da_DK",       // verDenmark; 7 langDanish
    /*  10 */ "pt_PT",       // verPortugal; 8 langPortuguese
    /*  11 */ "fr_CA",       // verFrCanada; 1 langFrench
    /*  12 */ "nb_NO",       // verNorway; 9 langNorwegian (Bokmal) {bundle: "no_NO"}
    /*  13 */ "he_IL",       // verIsrael; 10 langHebrew
    /*  14 */ "ja_JP",       // verJapan; 11 langJapanese
    /*  15 */ "en_AU",       // verAustralia; 0 langEnglish
    /*  16 */ "ar",          // verArabic; 12 langArabic
    /*  17 */ "fi_FI",       // verFinland; 13 langFinnish
    /*  18 */ "fr_CH",       // verFrSwiss; 1 langFrench
    /*  19 */ "de_CH",       // verGrSwiss; 2 langGerman
    /*  20 */ "el_GR",       // verGreece; 14 langGreek (modern)-Grek-mono
    /*  21 */ "is_IS",       // verIceland; 15 langIcelandic
    /*  22 */ "mt_MT",       // verMalta; 16 langMaltese
    /*  23 */ "el_CY",       // verCyprus; 14 langGreek?; el or tr? guess el {bundle: ""}
    /*  24 */ "tr_TR",       // verTurkey; 17 langTurkish
    /*  25 */ "hr_HR",       // verYugoCroatian; 18 langCroatian [one-way -> verCroatia]
    /*  26 */ "nl_NL",       // KCHR, Netherlands; 4 langDutch [one-way]
    /*  27 */ "nl_BE",       // KCHR, verFlemish; 34 langFlemish [one-way]
    /*  28 */ "_CA",         // KCHR, Canada-en/fr?; -1 none [one-way] {bundle: "en_CA"}
    /*  29 */ "_CA",         // KCHR, Canada-en/fr?; -1 none [one-way] {bundle: "en_CA"}
    /*  30 */ "pt_PT",       // KCHR, Portugal; 8 langPortuguese [one-way]
    /*  31 */ "nb_NO",       // KCHR, Norway; 9 langNorwegian (Bokmal) [one-way] {bundle: "no_NO"}
    /*  32 */ "da_DK",       // KCHR, Denmark; 7 langDanish [one-way]
    /*  33 */ "hi_IN",       // verIndiaHindi; 21 langHindi
    /*  34 */ "ur_PK",       // verPakistanUrdu; 20 langUrdu
    /*  35 */ "tr_TR",       // verTurkishModified; 17 langTurkish [one-way]
    /*  36 */ "it_CH",       // verItalianSwiss; 3 langItalian
    /*  37 */ "en_001",      // verInternational; 0 langEnglish; ASCII only {bundle: "en"}
    /*  38 */ NULL,          // *unassigned; -1 none [one-way] {bundle: ""}
    /*  39 */ "ro_RO",       // verRomania; 37 langRomanian
    /*  40 */ "grc",         // verGreekAncient; 148 langGreekAncient -Grek-poly {bundle: "el_GR"}
    /*  41 */ "lt_LT",       // verLithuania; 24 langLithuanian
    /*  42 */ "pl_PL",       // verPoland; 25 langPolish
    /*  43 */ "hu_HU",       // verHungary; 26 langHungarian
    /*  44 */ "et_EE",       // verEstonia; 27 langEstonian
    /*  45 */ "lv_LV",       // verLatvia; 28 langLatvian
    /*  46 */ "se",          // verSami; 29 langSami
    /*  47 */ "fo_FO",       // verFaroeIsl; 30 langFaroese
    /*  48 */ "fa_IR",       // verIran; 31 langFarsi/Persian
    /*  49 */ "ru_RU",       // verRussia; 32 langRussian
    /*  50 */ "ga_IE",       // verIreland; 35 langIrishGaelic (no dots)
    /*  51 */ "ko_KR",       // verKorea; 23 langKorean
    /*  52 */ "zh_CN",       // verChina; 33 langSimpChinese
    /*  53 */ "zh_TW",       // verTaiwan; 19 langTradChinese
    /*  54 */ "th_TH",       // verThailand; 22 langThai
    /*  55 */ "und",         // verScriptGeneric; -1 none {bundle: ""} <1.9>
    /*  56 */ "cs_CZ",       // verCzech; 38 langCzech
    /*  57 */ "sk_SK",       // verSlovak; 39 langSlovak
    /*  58 */ "und",         // verEastAsiaGeneric; -1 none [one-way] {bundle: ""} <1.9>
    /*  59 */ "hu_HU",       // verMagyar; 26 langHungarian [one-way -> verHungary]
    /*  60 */ "bn",          // verBengali; 67 langBengali; _IN or _BD? guess generic
    /*  61 */ "be_BY",       // verBelarus; 46 langBelorussian
    /*  62 */ "uk_UA",       // verUkraine; 45 langUkrainian
    /*  63 */ NULL,          // *unused; -1 none [one-way] {bundle: ""}
    /*  64 */ "el_GR",       // verGreeceAlt; 14 langGreek (modern)-Grek-mono [one-way]
    /*  65 */ "sr_CS",       // verSerbian; 42 langSerbian -Cyrl <1.18>
    /*  66 */ "sl_SI",       // verSlovenian; 40 langSlovenian
    /*  67 */ "mk_MK",       // verMacedonian; 43 langMacedonian
    /*  68 */ "hr_HR",       // verCroatia; 18 langCroatian
    /*  69 */ NULL,          // *unused; -1 none [one-way] {bundle: ""}
    /*  70 */ "de-1996",     // verGermanReformed; 2 langGerman; 1996 orthogr. {bundle: "de_DE"}
    /*  71 */ "pt_BR",       // verBrazil; 8 langPortuguese
    /*  72 */ "bg_BG",       // verBulgaria; 44 langBulgarian
    /*  73 */ "ca_ES",       // verCatalonia; 130 langCatalan
    /*  74 */ "mul",         // verMultilingual; -1 none {bundle: ""}
    /*  75 */ "gd",          // verScottishGaelic; 144 langScottishGaelic
    /*  76 */ "gv",          // verManxGaelic; 145 langManxGaelic
    /*  77 */ "br",          // verBreton; 142 langBreton
    /*  78 */ "iu_CA",       // verNunavut; 143 langInuktitut -Cans
    /*  79 */ "cy",          // verWelsh; 128 langWelsh
    /*  80 */ "_CA",         // KCHR, Canada-en/fr?; -1 none [one-way] {bundle: "en_CA"}
    /*  81 */ "ga-Latg_IE",  // verIrishGaelicScrip; 146 langIrishGaelicScript -dots {bundle: "ga_IE"} <xx>
    /*  82 */ "en_CA",       // verEngCanada; 0 langEnglish
    /*  83 */ "dz_BT",       // verBhutan; 137 langDzongkha
    /*  84 */ "hy_AM",       // verArmenian; 51 langArmenian
    /*  85 */ "ka_GE",       // verGeorgian; 52 langGeorgian
    /*  86 */ "es_419",      // verSpLatinAmerica; 6 langSpanish {bundle: "es"}
    /*  87 */ "es_ES",       // KCHR, Spain; 6 langSpanish [one-way]
    /*  88 */ "to_TO",       // verTonga; 147 langTongan
    /*  89 */ "pl_PL",       // KCHR, Poland; 25 langPolish [one-way]
    /*  90 */ "ca_ES",       // KCHR, Catalonia; 130 langCatalan [one-way]
    /*  91 */ "fr_001",      // verFrenchUniversal; 1 langFrench
    /*  92 */ "de_AT",       // verAustria; 2 langGerman
    /*  93 */ "es_419",      // > verSpLatinAmerica; 6 langSpanish [one-way] {bundle: "es"}
    /*  94 */ "gu_IN",       // verGujarati; 69 langGujarati
    /*  95 */ "pa",          // verPunjabi; 70 langPunjabi; _IN or _PK? guess generic
    /*  96 */ "ur_IN",       // verIndiaUrdu; 20 langUrdu
    /*  97 */ "vi_VN",       // verVietnam; 80 langVietnamese
    /*  98 */ "fr_BE",       // verFrBelgium; 1 langFrench
    /*  99 */ "uz_UZ",       // verUzbek; 47 langUzbek
    /* 100 */ "en_SG",       // verSingapore; 0 langEnglish?; en, zh, or ms? guess en {bundle: ""}
    /* 101 */ "nn_NO",       // verNynorsk; 151 langNynorsk {bundle: ""}
    /* 102 */ "af_ZA",       // verAfrikaans; 141 langAfrikaans
    /* 103 */ "eo",          // verEsperanto; 94 langEsperanto
    /* 104 */ "mr_IN",       // verMarathi; 66 langMarathi
    /* 105 */ "bo",          // verTibetan; 63 langTibetan
    /* 106 */ "ne_NP",       // verNepal; 64 langNepali
    /* 107 */ "kl",          // verGreenland; 149 langGreenlandic
    /* 108 */ "en_IE",       // verIrelandEnglish; 0 langEnglish {bundle: (no entry)}
};

// Number of slots in regionCodeToLocaleString; valid RegionCode indexes are 0..count-1.
enum {
    kNumRegionCodeToLocaleString = sizeof regionCodeToLocaleString / sizeof regionCodeToLocaleString[0]
};
220
static const char * const langCodeToLocaleString[] = {
    // Canonical locale string for each LangCode; the LangCode is the array index.
    //
    // Row layout:  /* LangCode */ canonical string,  // language constant; script and remarks
    //   {bundle: "xx"}   __CFBundleLanguageAbbreviationsArray string, where it differs from the canonical one
    //
    // LangCodes 95..127 are a gap with no assigned language. They are not
    // spelled out as NULL rows; instead the upper range is anchored with a
    // designated initializer at index 128, which leaves 95..127 as NULL and
    // lets the compiler, not a hand count, keep the indexes aligned.
    /*   0 */ "en",          // langEnglish
    /*   1 */ "fr",          // langFrench
    /*   2 */ "de",          // langGerman
    /*   3 */ "it",          // langItalian
    /*   4 */ "nl",          // langDutch
    /*   5 */ "sv",          // langSwedish
    /*   6 */ "es",          // langSpanish
    /*   7 */ "da",          // langDanish
    /*   8 */ "pt",          // langPortuguese
    /*   9 */ "nb",          // langNorwegian (Bokmal) {bundle: "no"}
    /*  10 */ "he",          // langHebrew -Hebr
    /*  11 */ "ja",          // langJapanese -Jpan
    /*  12 */ "ar",          // langArabic -Arab
    /*  13 */ "fi",          // langFinnish
    /*  14 */ "el",          // langGreek (modern)-Grek-mono
    /*  15 */ "is",          // langIcelandic
    /*  16 */ "mt",          // langMaltese -Latn
    /*  17 */ "tr",          // langTurkish -Latn
    /*  18 */ "hr",          // langCroatian
    /*  19 */ "zh-Hant",     // langTradChinese {bundle: "zh"}
    /*  20 */ "ur",          // langUrdu -Arab
    /*  21 */ "hi",          // langHindi -Deva
    /*  22 */ "th",          // langThai -Thai
    /*  23 */ "ko",          // langKorean -Hang
    /*  24 */ "lt",          // langLithuanian
    /*  25 */ "pl",          // langPolish
    /*  26 */ "hu",          // langHungarian
    /*  27 */ "et",          // langEstonian
    /*  28 */ "lv",          // langLatvian
    /*  29 */ "se",          // langSami
    /*  30 */ "fo",          // langFaroese
    /*  31 */ "fa",          // langFarsi/Persian -Arab
    /*  32 */ "ru",          // langRussian -Cyrl
    /*  33 */ "zh-Hans",     // langSimpChinese {bundle: "zh"}
    /*  34 */ "nl-BE",       // langFlemish (redundant, =Dutch) {bundle: "nl"}
    /*  35 */ "ga",          // langIrishGaelic (no dots)
    /*  36 */ "sq",          // langAlbanian; no region codes
    /*  37 */ "ro",          // langRomanian
    /*  38 */ "cs",          // langCzech
    /*  39 */ "sk",          // langSlovak
    /*  40 */ "sl",          // langSlovenian
    /*  41 */ "yi",          // langYiddish -Hebr; no region codes
    /*  42 */ "sr",          // langSerbian -Cyrl
    /*  43 */ "mk",          // langMacedonian -Cyrl
    /*  44 */ "bg",          // langBulgarian -Cyrl
    /*  45 */ "uk",          // langUkrainian -Cyrl
    /*  46 */ "be",          // langBelorussian -Cyrl
    /*  47 */ "uz-Cyrl",     // langUzbek -Cyrl; also -Latn, -Arab
    /*  48 */ "kk",          // langKazakh -Cyrl; no region codes; also -Latn, -Arab
    /*  49 */ "az-Cyrl",     // langAzerbaijani -Cyrl; no region codes {bundle: "az"}
    /*  50 */ "az-Arab",     // langAzerbaijanAr -Arab; no region codes {bundle: "az"}
    /*  51 */ "hy",          // langArmenian -Armn
    /*  52 */ "ka",          // langGeorgian -Geor
    /*  53 */ "mo",          // langMoldavian -Cyrl; no region codes
    /*  54 */ "ky",          // langKirghiz -Cyrl; no region codes; also -Latn, -Arab
    /*  55 */ "tg-Cyrl",     // langTajiki -Cyrl; no region codes; also -Latn, -Arab
    /*  56 */ "tk-Cyrl",     // langTurkmen -Cyrl; no region codes; also -Latn, -Arab
    /*  57 */ "mn-Mong",     // langMongolian -Mong; no region codes {bundle: "mn"}
    /*  58 */ "mn-Cyrl",     // langMongolianCyr -Cyrl; no region codes {bundle: "mn"}
    /*  59 */ "ps",          // langPashto -Arab; no region codes
    /*  60 */ "ku",          // langKurdish -Arab; no region codes
    /*  61 */ "ks",          // langKashmiri -Arab; no region codes
    /*  62 */ "sd",          // langSindhi -Arab; no region codes
    /*  63 */ "bo",          // langTibetan -Tibt
    /*  64 */ "ne",          // langNepali -Deva
    /*  65 */ "sa",          // langSanskrit -Deva; no region codes
    /*  66 */ "mr",          // langMarathi -Deva
    /*  67 */ "bn",          // langBengali -Beng
    /*  68 */ "as",          // langAssamese -Beng; no region codes
    /*  69 */ "gu",          // langGujarati -Gujr
    /*  70 */ "pa",          // langPunjabi -Guru
    /*  71 */ "or",          // langOriya -Orya; no region codes
    /*  72 */ "ml",          // langMalayalam -Mlym; no region codes
    /*  73 */ "kn",          // langKannada -Knda; no region codes
    /*  74 */ "ta",          // langTamil -Taml; no region codes
    /*  75 */ "te",          // langTelugu -Telu; no region codes
    /*  76 */ "si",          // langSinhalese -Sinh; no region codes
    /*  77 */ "my",          // langBurmese -Mymr; no region codes
    /*  78 */ "km",          // langKhmer -Khmr; no region codes
    /*  79 */ "lo",          // langLao -Laoo; no region codes
    /*  80 */ "vi",          // langVietnamese -Latn
    /*  81 */ "id",          // langIndonesian -Latn; no region codes
    /*  82 */ "tl",          // langTagalog -Latn; no region codes
    /*  83 */ "ms",          // langMalayRoman -Latn; no region codes {bundle: "ms"}
    /*  84 */ "ms-Arab",     // langMalayArabic -Arab; no region codes {bundle: "ms"}
    /*  85 */ "am",          // langAmharic -Ethi; no region codes
    /*  86 */ "ti",          // langTigrinya -Ethi; no region codes
    /*  87 */ "om",          // langOromo -Ethi; no region codes
    /*  88 */ "so",          // langSomali -Latn; no region codes
    /*  89 */ "sw",          // langSwahili -Latn; no region codes
    /*  90 */ "rw",          // langKinyarwanda -Latn; no region codes
    /*  91 */ "rn",          // langRundi -Latn; no region codes
    /*  92 */ "ny",          // langNyanja/Chewa -Latn; no region codes {bundle: ""}
    /*  93 */ "mg",          // langMalagasy -Latn; no region codes
    /*  94 */ "eo",          // langEsperanto -Latn

    // 95 to 127 (gap): implicitly NULL because of the designator on the next row.

    [128]   = "cy",          // langWelsh -Latn
    /* 129 */ "eu",          // langBasque -Latn; no region codes
    /* 130 */ "ca",          // langCatalan -Latn
    /* 131 */ "la",          // langLatin -Latn; no region codes
    /* 132 */ "qu",          // langQuechua -Latn; no region codes
    /* 133 */ "gn",          // langGuarani -Latn; no region codes
    /* 134 */ "ay",          // langAymara -Latn; no region codes
    /* 135 */ "tt-Cyrl",     // langTatar -Cyrl; no region codes
    /* 136 */ "ug",          // langUighur -Arab; no region codes
    /* 137 */ "dz",          // langDzongkha -Tibt
    /* 138 */ "jv",          // langJavaneseRom -Latn; no region codes
    /* 139 */ "su",          // langSundaneseRom -Latn; no region codes
    /* 140 */ "gl",          // langGalician -Latn; no region codes
    /* 141 */ "af",          // langAfrikaans -Latn
    /* 142 */ "br",          // langBreton -Latn
    /* 143 */ "iu",          // langInuktitut -Cans
    /* 144 */ "gd",          // langScottishGaelic
    /* 145 */ "gv",          // langManxGaelic -Latn
    /* 146 */ "ga-Latg",     // langIrishGaelicScript -Latn-dots {bundle: "ga"} <xx>
    /* 147 */ "to",          // langTongan -Latn
    /* 148 */ "grc",         // langGreekAncient -Grek-poly {bundle: "el"}
    /* 149 */ "kl",          // langGreenlandic -Latn
    /* 150 */ "az-Latn",     // langAzerbaijanRoman -Latn; no region codes {bundle: "az"}
    /* 151 */ "nn",          // langNynorsk -Latn {bundle: (no entry)}
};

// Number of slots in langCodeToLocaleString (gap slots included);
// valid LangCode indexes are 0..count-1.
enum {
    kNumLangCodeToLocaleString = sizeof langCodeToLocaleString / sizeof langCodeToLocaleString[0]
};
352
353 static const KeyStringToResultString oldAppleLocaleToCanonical[] = {
354 // Map obsolete/old-style Apple strings to canonical
355 // Must be sorted according to how strcmp compares the strings in the first column
356 //
357 // non-canonical canonical [ comment ] # source/reason for non-canonical string
358 // string string
359 // ------------- ---------
360 { "Afrikaans", "af" }, // # __CFBundleLanguageNamesArray
361 { "Albanian", "sq" }, // # __CFBundleLanguageNamesArray
362 { "Amharic", "am" }, // # __CFBundleLanguageNamesArray
363 { "Arabic", "ar" }, // # __CFBundleLanguageNamesArray
364 { "Armenian", "hy" }, // # __CFBundleLanguageNamesArray
365 { "Assamese", "as" }, // # __CFBundleLanguageNamesArray
366 { "Aymara", "ay" }, // # __CFBundleLanguageNamesArray
367 { "Azerbaijani", "az" }, // -Arab,-Cyrl,-Latn? # __CFBundleLanguageNamesArray (had 3 entries "Azerbaijani" for "az-Arab", "az-Cyrl", "az-Latn")
368 { "Basque", "eu" }, // # __CFBundleLanguageNamesArray
369 { "Belarusian", "be" }, // # handle other names
370 { "Belorussian", "be" }, // # handle other names
371 { "Bengali", "bn" }, // # __CFBundleLanguageNamesArray
372 { "Brazilian Portugese", "pt-BR" }, // # from Installer.app Info.plist IFLanguages key, misspelled
373 { "Brazilian Portuguese", "pt-BR" }, // # correct spelling for above
374 { "Breton", "br" }, // # __CFBundleLanguageNamesArray
375 { "Bulgarian", "bg" }, // # __CFBundleLanguageNamesArray
376 { "Burmese", "my" }, // # __CFBundleLanguageNamesArray
377 { "Byelorussian", "be" }, // # __CFBundleLanguageNamesArray
378 { "Catalan", "ca" }, // # __CFBundleLanguageNamesArray
379 { "Chewa", "ny" }, // # handle other names
380 { "Chichewa", "ny" }, // # handle other names
381 { "Chinese", "zh" }, // -Hans,-Hant? # __CFBundleLanguageNamesArray (had 2 entries "Chinese" for "zh-Hant", "zh-Hans")
382 { "Chinese, Simplified", "zh-Hans" }, // # from Installer.app Info.plist IFLanguages key
383 { "Chinese, Traditional", "zh-Hant" }, // # correct spelling for below
384 { "Chinese, Tradtional", "zh-Hant" }, // # from Installer.app Info.plist IFLanguages key, misspelled
385 { "Croatian", "hr" }, // # __CFBundleLanguageNamesArray
386 { "Czech", "cs" }, // # __CFBundleLanguageNamesArray
387 { "Danish", "da" }, // # __CFBundleLanguageNamesArray
388 { "Dutch", "nl" }, // # __CFBundleLanguageNamesArray (had 2 entries "Dutch" for "nl", "nl-BE")
389 { "Dzongkha", "dz" }, // # __CFBundleLanguageNamesArray
390 { "English", "en" }, // # __CFBundleLanguageNamesArray
391 { "Esperanto", "eo" }, // # __CFBundleLanguageNamesArray
392 { "Estonian", "et" }, // # __CFBundleLanguageNamesArray
393 { "Faroese", "fo" }, // # __CFBundleLanguageNamesArray
394 { "Farsi", "fa" }, // # __CFBundleLanguageNamesArray
395 { "Finnish", "fi" }, // # __CFBundleLanguageNamesArray
396 { "Flemish", "nl-BE" }, // # handle other names
397 { "French", "fr" }, // # __CFBundleLanguageNamesArray
398 { "Galician", "gl" }, // # __CFBundleLanguageNamesArray
399 { "Gallegan", "gl" }, // # handle other names
400 { "Georgian", "ka" }, // # __CFBundleLanguageNamesArray
401 { "German", "de" }, // # __CFBundleLanguageNamesArray
402 { "Greek", "el" }, // # __CFBundleLanguageNamesArray (had 2 entries "Greek" for "el", "grc")
403 { "Greenlandic", "kl" }, // # __CFBundleLanguageNamesArray
404 { "Guarani", "gn" }, // # __CFBundleLanguageNamesArray
405 { "Gujarati", "gu" }, // # __CFBundleLanguageNamesArray
406 { "Hawaiian", "haw" }, // # handle new languages
407 { "Hebrew", "he" }, // # __CFBundleLanguageNamesArray
408 { "Hindi", "hi" }, // # __CFBundleLanguageNamesArray
409 { "Hungarian", "hu" }, // # __CFBundleLanguageNamesArray
410 { "Icelandic", "is" }, // # __CFBundleLanguageNamesArray
411 { "Indonesian", "id" }, // # __CFBundleLanguageNamesArray
412 { "Inuktitut", "iu" }, // # __CFBundleLanguageNamesArray
413 { "Irish", "ga" }, // # __CFBundleLanguageNamesArray (had 2 entries "Irish" for "ga", "ga-dots")
414 { "Italian", "it" }, // # __CFBundleLanguageNamesArray
415 { "Japanese", "ja" }, // # __CFBundleLanguageNamesArray
416 { "Javanese", "jv" }, // # __CFBundleLanguageNamesArray
417 { "Kalaallisut", "kl" }, // # handle other names
418 { "Kannada", "kn" }, // # __CFBundleLanguageNamesArray
419 { "Kashmiri", "ks" }, // # __CFBundleLanguageNamesArray
420 { "Kazakh", "kk" }, // # __CFBundleLanguageNamesArray
421 { "Khmer", "km" }, // # __CFBundleLanguageNamesArray
422 { "Kinyarwanda", "rw" }, // # __CFBundleLanguageNamesArray
423 { "Kirghiz", "ky" }, // # __CFBundleLanguageNamesArray
424 { "Korean", "ko" }, // # __CFBundleLanguageNamesArray
425 { "Kurdish", "ku" }, // # __CFBundleLanguageNamesArray
426 { "Lao", "lo" }, // # __CFBundleLanguageNamesArray
427 { "Latin", "la" }, // # __CFBundleLanguageNamesArray
428 { "Latvian", "lv" }, // # __CFBundleLanguageNamesArray
429 { "Lithuanian", "lt" }, // # __CFBundleLanguageNamesArray
430 { "Macedonian", "mk" }, // # __CFBundleLanguageNamesArray
431 { "Malagasy", "mg" }, // # __CFBundleLanguageNamesArray
432 { "Malay", "ms" }, // -Latn,-Arab? # __CFBundleLanguageNamesArray (had 2 entries "Malay" for "ms-Latn", "ms-Arab")
433 { "Malayalam", "ml" }, // # __CFBundleLanguageNamesArray
434 { "Maltese", "mt" }, // # __CFBundleLanguageNamesArray
435 { "Manx", "gv" }, // # __CFBundleLanguageNamesArray
436 { "Marathi", "mr" }, // # __CFBundleLanguageNamesArray
437 { "Moldavian", "mo" }, // # __CFBundleLanguageNamesArray
438 { "Mongolian", "mn" }, // -Mong,-Cyrl? # __CFBundleLanguageNamesArray (had 2 entries "Mongolian" for "mn-Mong", "mn-Cyrl")
439 { "Nepali", "ne" }, // # __CFBundleLanguageNamesArray
440 { "Norwegian", "nb" }, // # __CFBundleLanguageNamesArray (had "Norwegian" mapping to "no")
441 { "Nyanja", "ny" }, // # __CFBundleLanguageNamesArray
442 { "Nynorsk", "nn" }, // # handle other names (no entry in __CFBundleLanguageNamesArray)
443 { "Oriya", "or" }, // # __CFBundleLanguageNamesArray
444 { "Oromo", "om" }, // # __CFBundleLanguageNamesArray
445 { "Panjabi", "pa" }, // # handle other names
446 { "Pashto", "ps" }, // # __CFBundleLanguageNamesArray
447 { "Persian", "fa" }, // # handle other names
448 { "Polish", "pl" }, // # __CFBundleLanguageNamesArray
449 { "Portuguese", "pt" }, // # __CFBundleLanguageNamesArray
450 { "Portuguese, Brazilian", "pt-BR" }, // # handle other names
451 { "Punjabi", "pa" }, // # __CFBundleLanguageNamesArray
452 { "Pushto", "ps" }, // # handle other names
453 { "Quechua", "qu" }, // # __CFBundleLanguageNamesArray
454 { "Romanian", "ro" }, // # __CFBundleLanguageNamesArray
455 { "Ruanda", "rw" }, // # handle other names
456 { "Rundi", "rn" }, // # __CFBundleLanguageNamesArray
457 { "Russian", "ru" }, // # __CFBundleLanguageNamesArray
458 { "Sami", "se" }, // # __CFBundleLanguageNamesArray
459 { "Sanskrit", "sa" }, // # __CFBundleLanguageNamesArray
460 { "Scottish", "gd" }, // # __CFBundleLanguageNamesArray
461 { "Serbian", "sr" }, // # __CFBundleLanguageNamesArray
462 { "Simplified Chinese", "zh-Hans" }, // # handle other names
463 { "Sindhi", "sd" }, // # __CFBundleLanguageNamesArray
464 { "Sinhalese", "si" }, // # __CFBundleLanguageNamesArray
465 { "Slovak", "sk" }, // # __CFBundleLanguageNamesArray
466 { "Slovenian", "sl" }, // # __CFBundleLanguageNamesArray
467 { "Somali", "so" }, // # __CFBundleLanguageNamesArray
468 { "Spanish", "es" }, // # __CFBundleLanguageNamesArray
469 { "Sundanese", "su" }, // # __CFBundleLanguageNamesArray
470 { "Swahili", "sw" }, // # __CFBundleLanguageNamesArray
471 { "Swedish", "sv" }, // # __CFBundleLanguageNamesArray
472 { "Tagalog", "tl" }, // # __CFBundleLanguageNamesArray
473 { "Tajik", "tg" }, // # handle other names
474 { "Tajiki", "tg" }, // # __CFBundleLanguageNamesArray
475 { "Tamil", "ta" }, // # __CFBundleLanguageNamesArray
476 { "Tatar", "tt" }, // # __CFBundleLanguageNamesArray
477 { "Telugu", "te" }, // # __CFBundleLanguageNamesArray
478 { "Thai", "th" }, // # __CFBundleLanguageNamesArray
479 { "Tibetan", "bo" }, // # __CFBundleLanguageNamesArray
480 { "Tigrinya", "ti" }, // # __CFBundleLanguageNamesArray
481 { "Tongan", "to" }, // # __CFBundleLanguageNamesArray
482 { "Traditional Chinese", "zh-Hant" }, // # handle other names
483 { "Turkish", "tr" }, // # __CFBundleLanguageNamesArray
484 { "Turkmen", "tk" }, // # __CFBundleLanguageNamesArray
485 { "Uighur", "ug" }, // # __CFBundleLanguageNamesArray
486 { "Ukrainian", "uk" }, // # __CFBundleLanguageNamesArray
487 { "Urdu", "ur" }, // # __CFBundleLanguageNamesArray
488 { "Uzbek", "uz" }, // # __CFBundleLanguageNamesArray
489 { "Vietnamese", "vi" }, // # __CFBundleLanguageNamesArray
490 { "Welsh", "cy" }, // # __CFBundleLanguageNamesArray
491 { "Yiddish", "yi" }, // # __CFBundleLanguageNamesArray
492 { "ar_??", "ar" }, // # from old MapScriptInfoAndISOCodes
493 { "az.Ar", "az-Arab" }, // # from old LocaleRefGetPartString
494 { "az.Cy", "az-Cyrl" }, // # from old LocaleRefGetPartString
495 { "az.La", "az-Latn" }, // # from old LocaleRefGetPartString
496 { "be_??", "be_BY" }, // # from old MapScriptInfoAndISOCodes
497 { "bn_??", "bn" }, // # from old LocaleRefGetPartString
498 { "bo_??", "bo" }, // # from old MapScriptInfoAndISOCodes
499 { "br_??", "br" }, // # from old MapScriptInfoAndISOCodes
500 { "cy_??", "cy" }, // # from old MapScriptInfoAndISOCodes
501 { "de-96", "de-1996" }, // # from old MapScriptInfoAndISOCodes // <1.9>
502 { "de_96", "de-1996" }, // # from old MapScriptInfoAndISOCodes // <1.9>
503 { "de_??", "de-1996" }, // # from old MapScriptInfoAndISOCodes
504 { "el.El-P", "grc" }, // # from old LocaleRefGetPartString
505 { "en-ascii", "en_001" }, // # from earlier version of tables in this file!
506 { "en_??", "en_001" }, // # from old MapScriptInfoAndISOCodes
507 { "eo_??", "eo" }, // # from old MapScriptInfoAndISOCodes
508 { "es_??", "es_419" }, // # from old MapScriptInfoAndISOCodes
509 { "es_XL", "es_419" }, // # from earlier version of tables in this file!
510 { "fr_??", "fr_001" }, // # from old MapScriptInfoAndISOCodes
511 { "ga-dots", "ga-Latg" }, // # from earlier version of tables in this file! // <1.8>
512 { "ga-dots_IE", "ga-Latg_IE" }, // # from earlier version of tables in this file! // <1.8>
513 { "ga.Lg", "ga-Latg" }, // # from old LocaleRefGetPartString // <1.8>
514 { "ga.Lg_IE", "ga-Latg_IE" }, // # from old LocaleRefGetPartString // <1.8>
515 { "gd_??", "gd" }, // # from old MapScriptInfoAndISOCodes
516 { "gv_??", "gv" }, // # from old MapScriptInfoAndISOCodes
517 { "jv.La", "jv" }, // # logical extension // <1.9>
518 { "jw.La", "jv" }, // # from old LocaleRefGetPartString
519 { "kk.Cy", "kk" }, // # from old LocaleRefGetPartString
520 { "kl.La", "kl" }, // # from old LocaleRefGetPartString
521 { "kl.La_GL", "kl_GL" }, // # from old LocaleRefGetPartString // <1.9>
522 { "lp_??", "se" }, // # from old MapScriptInfoAndISOCodes
523 { "mk_??", "mk_MK" }, // # from old MapScriptInfoAndISOCodes
524 { "mn.Cy", "mn-Cyrl" }, // # from old LocaleRefGetPartString
525 { "mn.Mn", "mn-Mong" }, // # from old LocaleRefGetPartString
526 { "ms.Ar", "ms-Arab" }, // # from old LocaleRefGetPartString
527 { "ms.La", "ms" }, // # from old LocaleRefGetPartString
528 { "nl-be", "nl-BE" }, // # from old LocaleRefGetPartString
529 { "nl-be_BE", "nl_BE" }, // # from old LocaleRefGetPartString
530 // { "no-bok_NO", "nb_NO" }, // # from old LocaleRefGetPartString - handled by localeStringPrefixToCanonical
531 // { "no-nyn_NO", "nn_NO" }, // # from old LocaleRefGetPartString - handled by localeStringPrefixToCanonical
532 // { "nya", "ny" }, // # from old LocaleRefGetPartString - handled by localeStringPrefixToCanonical
533 { "pa_??", "pa" }, // # from old LocaleRefGetPartString
534 { "sa.Dv", "sa" }, // # from old LocaleRefGetPartString
535 { "sl_??", "sl_SI" }, // # from old MapScriptInfoAndISOCodes
536 { "sr_??", "sr_CS" }, // # from old MapScriptInfoAndISOCodes // <1.18>
537 { "su.La", "su" }, // # from old LocaleRefGetPartString
538 { "yi.He", "yi" }, // # from old LocaleRefGetPartString
539 { "zh-simp", "zh-Hans" }, // # from earlier version of tables in this file!
540 { "zh-trad", "zh-Hant" }, // # from earlier version of tables in this file!
541 { "zh.Ha-S", "zh-Hans" }, // # from old LocaleRefGetPartString
542 { "zh.Ha-S_CN", "zh_CN" }, // # from old LocaleRefGetPartString
543 { "zh.Ha-T", "zh-Hant" }, // # from old LocaleRefGetPartString
544 { "zh.Ha-T_TW", "zh_TW" }, // # from old LocaleRefGetPartString
545 };
546 enum {
547 kNumOldAppleLocaleToCanonical = sizeof(oldAppleLocaleToCanonical)/sizeof(KeyStringToResultString)
548 };
549
550 static const KeyStringToResultString localeStringPrefixToCanonical[] = {
551 // Map 3-letter & obsolete ISO 639 codes, plus obsolete RFC 3066 codes, to 2-letter ISO 639 code.
552 // (special cases for 'sh' handled separately)
553 // First column must be all lowercase; must be sorted according to how strcmp compares the strings in the first column.
554 //
555 // non-canonical canonical [ comment ] # source/reason for non-canonical string
556 // prefix prefix
557 // ------------- ---------
558
559 { "afr", "af" }, // Afrikaans
560 { "alb", "sq" }, // Albanian
561 { "amh", "am" }, // Amharic
562 { "ara", "ar" }, // Arabic
563 { "arm", "hy" }, // Armenian
564 { "asm", "as" }, // Assamese
565 { "aym", "ay" }, // Aymara
566 { "aze", "az" }, // Azerbaijani
567 { "baq", "eu" }, // Basque
568 { "bel", "be" }, // Belarusian
569 { "ben", "bn" }, // Bengali
570 { "bih", "bh" }, // Bihari
571 { "bod", "bo" }, // Tibetan
572 { "bos", "bs" }, // Bosnian
573 { "bre", "br" }, // Breton
574 { "bul", "bg" }, // Bulgarian
575 { "bur", "my" }, // Burmese
576 { "cat", "ca" }, // Catalan
577 { "ces", "cs" }, // Czech
578 { "che", "ce" }, // Chechen
579 { "chi", "zh" }, // Chinese
580 { "cor", "kw" }, // Cornish
581 { "cos", "co" }, // Corsican
582 { "cym", "cy" }, // Welsh
583 { "cze", "cs" }, // Czech
584 { "dan", "da" }, // Danish
585 { "deu", "de" }, // German
586 { "dut", "nl" }, // Dutch
587 { "dzo", "dz" }, // Dzongkha
588 { "ell", "el" }, // Greek, Modern (1453-)
589 { "eng", "en" }, // English
590 { "epo", "eo" }, // Esperanto
591 { "est", "et" }, // Estonian
592 { "eus", "eu" }, // Basque
593 { "fao", "fo" }, // Faroese
594 { "fas", "fa" }, // Persian
595 { "fin", "fi" }, // Finnish
596 { "fra", "fr" }, // French
597 { "fre", "fr" }, // French
598 { "geo", "ka" }, // Georgian
599 { "ger", "de" }, // German
600 { "gla", "gd" }, // Gaelic,Scottish
601 { "gle", "ga" }, // Irish
602 { "glg", "gl" }, // Gallegan
603 { "glv", "gv" }, // Manx
604 { "gre", "el" }, // Greek, Modern (1453-)
605 { "grn", "gn" }, // Guarani
606 { "guj", "gu" }, // Gujarati
607 { "heb", "he" }, // Hebrew
608 { "hin", "hi" }, // Hindi
609 { "hrv", "hr" }, // Croatian
610 { "hun", "hu" }, // Hungarian
611 { "hye", "hy" }, // Armenian
612 { "i-hak", "zh-hakka" }, // Hakka # deprecated RFC 3066
613 { "i-lux", "lb" }, // Luxembourgish # deprecated RFC 3066
614 { "i-navajo", "nv" }, // Navajo # deprecated RFC 3066
615 { "ice", "is" }, // Icelandic
616 { "iku", "iu" }, // Inuktitut
617 { "ile", "ie" }, // Interlingue
618 { "in", "id" }, // Indonesian # deprecated 639 code in -> id (1989)
619 { "ina", "ia" }, // Interlingua
620 { "ind", "id" }, // Indonesian
621 { "isl", "is" }, // Icelandic
622 { "ita", "it" }, // Italian
623 { "iw", "he" }, // Hebrew # deprecated 639 code iw -> he (1989)
624 { "jav", "jv" }, // Javanese
625 { "jaw", "jv" }, // Javanese # deprecated 639 code jaw -> jv (2001)
626 { "ji", "yi" }, // Yiddish # deprecated 639 code ji -> yi (1989)
627 { "jpn", "ja" }, // Japanese
628 { "kal", "kl" }, // Kalaallisut
629 { "kan", "kn" }, // Kannada
630 { "kas", "ks" }, // Kashmiri
631 { "kat", "ka" }, // Georgian
632 { "kaz", "kk" }, // Kazakh
633 { "khm", "km" }, // Khmer
634 { "kin", "rw" }, // Kinyarwanda
635 { "kir", "ky" }, // Kirghiz
636 { "kor", "ko" }, // Korean
637 { "kur", "ku" }, // Kurdish
638 { "lao", "lo" }, // Lao
639 { "lat", "la" }, // Latin
640 { "lav", "lv" }, // Latvian
641 { "lit", "lt" }, // Lithuanian
642 { "ltz", "lb" }, // Letzeburgesch
643 { "mac", "mk" }, // Macedonian
644 { "mal", "ml" }, // Malayalam
645 { "mar", "mr" }, // Marathi
646 { "may", "ms" }, // Malay
647 { "mkd", "mk" }, // Macedonian
648 { "mlg", "mg" }, // Malagasy
649 { "mlt", "mt" }, // Maltese
650 { "mol", "mo" }, // Moldavian
651 { "mon", "mn" }, // Mongolian
652 { "msa", "ms" }, // Malay
653 { "mya", "my" }, // Burmese
654 { "nep", "ne" }, // Nepali
655 { "nld", "nl" }, // Dutch
656 { "nno", "nn" }, // Norwegian Nynorsk
657 { "no", "nb" }, // Norwegian generic # ambiguous 639 code no -> nb
658 { "no-bok", "nb" }, // Norwegian Bokmal # deprecated RFC 3066 tag - used in old LocaleRefGetPartString
659 { "no-nyn", "nn" }, // Norwegian Nynorsk # deprecated RFC 3066 tag - used in old LocaleRefGetPartString
660 { "nob", "nb" }, // Norwegian Bokmal
661 { "nor", "nb" }, // Norwegian generic # ambiguous 639 code nor -> nb
662 { "nya", "ny" }, // Nyanja/Chewa/Chichewa # 3-letter code used in old LocaleRefGetPartString
663 { "oci", "oc" }, // Occitan/Provencal
664 { "ori", "or" }, // Oriya
665 { "orm", "om" }, // Oromo,Galla
666 { "pan", "pa" }, // Panjabi
667 { "per", "fa" }, // Persian
668 { "pol", "pl" }, // Polish
669 { "por", "pt" }, // Portuguese
670 { "pus", "ps" }, // Pushto
671 { "que", "qu" }, // Quechua
672 { "roh", "rm" }, // Raeto-Romance
673 { "ron", "ro" }, // Romanian
674 { "rum", "ro" }, // Romanian
675 { "run", "rn" }, // Rundi
676 { "rus", "ru" }, // Russian
677 { "san", "sa" }, // Sanskrit
678 { "scc", "sr" }, // Serbian
679 { "scr", "hr" }, // Croatian
680 { "sin", "si" }, // Sinhalese
681 { "slk", "sk" }, // Slovak
682 { "slo", "sk" }, // Slovak
683 { "slv", "sl" }, // Slovenian
684 { "sme", "se" }, // Sami,Northern
685 { "snd", "sd" }, // Sindhi
686 { "som", "so" }, // Somali
687 { "spa", "es" }, // Spanish
688 { "sqi", "sq" }, // Albanian
689 { "srp", "sr" }, // Serbian
690 { "sun", "su" }, // Sundanese
691 { "swa", "sw" }, // Swahili
692 { "swe", "sv" }, // Swedish
693 { "tam", "ta" }, // Tamil
694 { "tat", "tt" }, // Tatar
695 { "tel", "te" }, // Telugu
696 { "tgk", "tg" }, // Tajik
697 { "tgl", "tl" }, // Tagalog
698 { "tha", "th" }, // Thai
699 { "tib", "bo" }, // Tibetan
700 { "tir", "ti" }, // Tigrinya
701 { "ton", "to" }, // Tongan
702 { "tuk", "tk" }, // Turkmen
703 { "tur", "tr" }, // Turkish
704 { "uig", "ug" }, // Uighur
705 { "ukr", "uk" }, // Ukrainian
706 { "urd", "ur" }, // Urdu
707 { "uzb", "uz" }, // Uzbek
708 { "vie", "vi" }, // Vietnamese
709 { "wel", "cy" }, // Welsh
710 { "yid", "yi" }, // Yiddish
711 { "zho", "zh" }, // Chinese
712 };
713 enum {
714 kNumLocaleStringPrefixToCanonical = sizeof(localeStringPrefixToCanonical)/sizeof(KeyStringToResultString)
715 };
716
717
718 static const SpecialCaseUpdates specialCases[] = {
719 // Data for special cases. Entries are order-dependent (see the notes on the YU and sh rows below).
720 // a) The 3166 code CS was used for Czechoslovakia until 1993, when that country split and the code was
721 // replaced by CZ and SK. Then in 2003-07, the code YU (formerly designating all of Yugoslavia, then after
722 // the 1990s breakup just designating what is now Serbia and Montenegro) was changed to CS! So now CS is
723 // ambiguous. The rows below handle it as follows: If we see CS with a language of cs or sk, we change CS
724 // to CZ or SK; after that, YU is always changed to CS (an older version of this note said CS -> YU).
725 // b) The 639 code sh for Serbo-Croatian was also replaced in the 1990s by separate codes hr and sr, and
726 // deprecated in 2000. We guess which one to map it to as follows: If there is a region tag of HR we use
727 // hr; if there is a region tag of CS (formerly YU) we use sr; else we do not change it (not enough info).
728 // c) There are other codes that have been updated without these issues (eg. TP to TL), plus among the
729 // "exceptionally reserved" codes some are just alternates for standard codes (eg. UK for GB).
730 { NULL, "-UK", "GB", NULL, NULL }, // always change UK to GB (UK is "exceptionally reserved" to mean GB)
731 { NULL, "-TP", "TL", NULL, NULL }, // always change TP to TL (East Timor, code changed 2002-05)
732 { "cs", "-CS", "CZ", NULL, NULL }, // if language is cs, change CS (pre-1993 Czechoslovakia) to CZ (Czech Republic)
733 { "sk", "-CS", "SK", NULL, NULL }, // if language is sk, change CS (pre-1993 Czechoslovakia) to SK (Slovakia)
734 { NULL, "-YU", "CS", NULL, NULL }, // then always change YU to CS (map old Yugoslavia code to new 2003-07 ISO code
735 // for Serbia & Montenegro per RFC3066bis & ICU) // <1.18>
736 // Note: do this after fixing CS for cs/sk as above.
737 { "sh", "-HR", "hr", "-CS", "sr" }, // if language is old 'sh' (SerboCroatian), change it to 'hr' (Croatian) if we find
738 // HR (Croatia) or to 'sr' (Serbian) if we find CS (Serbia & Montenegro, Yugoslavia). // <1.18>
739 // Note: Do this after changing YU to CS as above.
740 { NULL, NULL, NULL, NULL, NULL } // terminator
741 };
742
743
744 static const KeyStringToResultString localeStringRegionToDefaults[] = {
745 // For some region-code suffixes, there are default substrings to strip off for canonical string
746 // (e.g. for the suffix "_CN" the default writing system tag is "-Hans", simplified).
747 // Must be sorted according to how strcmp compares the strings in the first column.
748 //
749 // region suffix | default writing system tags, strip | comment
750 // ------------- ---------------------------------- -------
751 { "_CN", "-Hans" }, // mainland China, default is simplified
752 { "_HK", "-Hant" }, // Hong Kong, default is traditional
753 { "_MO", "-Hant" }, // Macao, default is traditional
754 { "_SG", "-Hans" }, // Singapore, default is simplified
755 { "_TW", "-Hant" }, // Taiwan, default is traditional
756 };
757 enum {
758 kNumLocaleStringRegionToDefaults = sizeof(localeStringRegionToDefaults)/sizeof(KeyStringToResultString)
759 };
760
761 static const KeyStringToResultString localeStringPrefixToDefaults[] = {
762 // For some initial portions of language tag, there are default substrings to strip off for canonical string.
763 // Must be sorted according to how strcmp compares the strings in the first column
764 // (note that e.g. "ko-" sorts before "kok-" because '-' compares less than any letter).
765 // NOTE(review): the "de-" entry lists two tags separated by a space; presumably each one is stripped - confirm.
766 // language tag prefix | default writing system tags, strip | comment
767 // ------------------- ---------------------------------- -------
768 { "ab-", "-Cyrl" }, // Abkhazian
769 { "af-", "-Latn" }, // Afrikaans
770 { "am-", "-Ethi" }, // Amharic
771 { "ar-", "-Arab" }, // Arabic
772 { "as-", "-Beng" }, // Assamese
773 { "ay-", "-Latn" }, // Aymara
774 { "be-", "-Cyrl" }, // Belarusian
775 { "bg-", "-Cyrl" }, // Bulgarian
776 { "bn-", "-Beng" }, // Bengali
777 { "bo-", "-Tibt" }, // Tibetan (? not Suppress-Script)
778 { "br-", "-Latn" }, // Breton (? not Suppress-Script)
779 { "bs-", "-Latn" }, // Bosnian
780 { "ca-", "-Latn" }, // Catalan
781 { "cs-", "-Latn" }, // Czech
782 { "cy-", "-Latn" }, // Welsh
783 { "da-", "-Latn" }, // Danish
784 { "de-", "-Latn -1901" }, // German, traditional orthography
785 { "dv-", "-Thaa" }, // Divehi/Maldivian
786 { "dz-", "-Tibt" }, // Dzongkha
787 { "el-", "-Grek" }, // Greek (modern, monotonic)
788 { "en-", "-Latn" }, // English
789 { "eo-", "-Latn" }, // Esperanto
790 { "es-", "-Latn" }, // Spanish
791 { "et-", "-Latn" }, // Estonian
792 { "eu-", "-Latn" }, // Basque
793 { "fa-", "-Arab" }, // Farsi
794 { "fi-", "-Latn" }, // Finnish
795 { "fo-", "-Latn" }, // Faroese
796 { "fr-", "-Latn" }, // French
797 { "ga-", "-Latn" }, // Irish
798 { "gd-", "-Latn" }, // Scottish Gaelic (? not Suppress-Script)
799 { "gl-", "-Latn" }, // Galician
800 { "gn-", "-Latn" }, // Guarani
801 { "gu-", "-Gujr" }, // Gujarati
802 { "gv-", "-Latn" }, // Manx
803 { "haw-", "-Latn" }, // Hawaiian (? not Suppress-Script)
804 { "he-", "-Hebr" }, // Hebrew
805 { "hi-", "-Deva" }, // Hindi
806 { "hr-", "-Latn" }, // Croatian
807 { "hu-", "-Latn" }, // Hungarian
808 { "hy-", "-Armn" }, // Armenian
809 { "id-", "-Latn" }, // Indonesian
810 { "is-", "-Latn" }, // Icelandic
811 { "it-", "-Latn" }, // Italian
812 { "ja-", "-Jpan" }, // Japanese
813 { "ka-", "-Geor" }, // Georgian
814 { "kk-", "-Cyrl" }, // Kazakh
815 { "kl-", "-Latn" }, // Kalaallisut/Greenlandic
816 { "km-", "-Khmr" }, // Central Khmer
817 { "kn-", "-Knda" }, // Kannada
818 { "ko-", "-Hang" }, // Korean (? not Suppress-Script)
819 { "kok-", "-Deva" }, // Konkani
820 { "la-", "-Latn" }, // Latin
821 { "lb-", "-Latn" }, // Luxembourgish
822 { "lo-", "-Laoo" }, // Lao
823 { "lt-", "-Latn" }, // Lithuanian
824 { "lv-", "-Latn" }, // Latvian
825 { "mg-", "-Latn" }, // Malagasy
826 { "mk-", "-Cyrl" }, // Macedonian
827 { "ml-", "-Mlym" }, // Malayalam
828 { "mo-", "-Latn" }, // Moldavian
829 { "mr-", "-Deva" }, // Marathi
830 { "ms-", "-Latn" }, // Malay
831 { "mt-", "-Latn" }, // Maltese
832 { "my-", "-Mymr" }, // Burmese/Myanmar
833 { "nb-", "-Latn" }, // Norwegian Bokmal
834 { "ne-", "-Deva" }, // Nepali
835 { "nl-", "-Latn" }, // Dutch
836 { "nn-", "-Latn" }, // Norwegian Nynorsk
837 { "ny-", "-Latn" }, // Chichewa/Nyanja
838 { "om-", "-Latn" }, // Oromo
839 { "or-", "-Orya" }, // Oriya
840 { "pa-", "-Guru" }, // Punjabi
841 { "pl-", "-Latn" }, // Polish
842 { "ps-", "-Arab" }, // Pushto
843 { "pt-", "-Latn" }, // Portuguese
844 { "qu-", "-Latn" }, // Quechua
845 { "rn-", "-Latn" }, // Rundi
846 { "ro-", "-Latn" }, // Romanian
847 { "ru-", "-Cyrl" }, // Russian
848 { "rw-", "-Latn" }, // Kinyarwanda
849 { "sa-", "-Deva" }, // Sanskrit (? not Suppress-Script)
850 { "se-", "-Latn" }, // Sami (? not Suppress-Script)
851 { "si-", "-Sinh" }, // Sinhala
852 { "sk-", "-Latn" }, // Slovak
853 { "sl-", "-Latn" }, // Slovenian
854 { "so-", "-Latn" }, // Somali
855 { "sq-", "-Latn" }, // Albanian
856 { "sv-", "-Latn" }, // Swedish
857 { "sw-", "-Latn" }, // Swahili
858 { "ta-", "-Taml" }, // Tamil
859 { "te-", "-Telu" }, // Telugu
860 { "th-", "-Thai" }, // Thai
861 { "ti-", "-Ethi" }, // Tigrinya
862 { "tl-", "-Latn" }, // Tagalog
863 { "tn-", "-Latn" }, // Tswana
864 { "to-", "-Latn" }, // Tonga of Tonga Islands
865 { "tr-", "-Latn" }, // Turkish
866 { "uk-", "-Cyrl" }, // Ukrainian
867 { "ur-", "-Arab" }, // Urdu
868 { "vi-", "-Latn" }, // Vietnamese
869 { "wo-", "-Latn" }, // Wolof
870 { "xh-", "-Latn" }, // Xhosa
871 { "yi-", "-Hebr" }, // Yiddish
872 { "zh-", "-Hani" }, // Chinese (? not Suppress-Script)
873 { "zu-", "-Latn" }, // Zulu
874 };
875 enum {
876 kNumLocaleStringPrefixToDefaults = sizeof(localeStringPrefixToDefaults)/sizeof(KeyStringToResultString)
877 };
878
879 static const KeyStringToResultString appleLocaleToLanguageString[] = {
880 // Map locale strings that Apple uses as language IDs to real language strings.
881 // Must be sorted according to how strcmp compares the strings in the first column.
882 // Note: All transforms of the form ll_RR -> ll-RR have been removed from this table;
883 // they are now handled in the code. <1.19>
884 //
885 // locale string | lang string | [ comment ]
886 // (first column) (second column)
887 // ------------- ----------- -----------
888 { "en_US_POSIX", "en-US-POSIX" }, // POSIX locale, need as language string // <1.17> [3840752]
889 { "zh_CN", "zh-Hans" }, // mainland China => simplified
890 { "zh_HK", "zh-Hant" }, // Hong Kong => traditional, not currently used
891 { "zh_MO", "zh-Hant" }, // Macao => traditional, not currently used
892 { "zh_SG", "zh-Hans" }, // Singapore => simplified, not currently used
893 { "zh_TW", "zh-Hant" }, // Taiwan => traditional
894 };
895 enum {
896 kNumAppleLocaleToLanguageString = sizeof(appleLocaleToLanguageString)/sizeof(KeyStringToResultString)
897 };
898
899 static const KeyStringToResultString appleLocaleToLanguageStringForCFBundle[] = {
900 // Map locale strings that Apple uses as language IDs to real language strings.
901 // Must be sorted according to how strcmp compares the strings in the first column.
902 // Unlike appleLocaleToLanguageString, this table lists the plain ll_RR -> ll-RR transforms explicitly.
903 // locale string | lang string | [ comment ]
904 // (first column) (second column)
905 // ------------- ----------- -----------
906 { "de_AT", "de-AT" }, // Austrian German
907 { "de_CH", "de-CH" }, // Swiss German
908 // { "de_DE", "de-DE" }, // German for Germany (default), not currently used
909 { "en_AU", "en-AU" }, // Australian English
910 { "en_CA", "en-CA" }, // Canadian English
911 { "en_GB", "en-GB" }, // British English
912 // { "en_IE", "en-IE" }, // Irish English, not currently used
913 { "en_US", "en-US" }, // U.S. English
914 { "en_US_POSIX", "en-US-POSIX" }, // POSIX locale, need as language string // <1.17> [3840752]
915 // { "fr_BE", "fr-BE" }, // Belgian French, not currently used
916 { "fr_CA", "fr-CA" }, // Canadian French
917 { "fr_CH", "fr-CH" }, // Swiss French
918 // { "fr_FR", "fr-FR" }, // French for France (default), not currently used
919 { "nl_BE", "nl-BE" }, // Flemish = Vlaams, Dutch for Belgium
920 // { "nl_NL", "nl-NL" }, // Dutch for Netherlands (default), not currently used
921 { "pt_BR", "pt-BR" }, // Brazilian Portuguese
922 { "pt_PT", "pt-PT" }, // Portuguese for Portugal
923 { "zh_CN", "zh-Hans" }, // mainland China => simplified
924 { "zh_HK", "zh-Hant" }, // Hong Kong => traditional, not currently used
925 { "zh_MO", "zh-Hant" }, // Macao => traditional, not currently used
926 { "zh_SG", "zh-Hans" }, // Singapore => simplified, not currently used
927 { "zh_TW", "zh-Hant" }, // Taiwan => traditional
928 };
929 enum {
930 kNumAppleLocaleToLanguageStringForCFBundle = sizeof(appleLocaleToLanguageStringForCFBundle)/sizeof(KeyStringToResultString)
931 };
932
933
934 struct LocaleToLegacyCodes {
935 const char * locale; // reduced to language plus one other component (script, region, variant), separators normalized to'_'
936 RegionCode regCode;
937 LangCode langCode;
938 CFStringEncoding encoding;
939 };
940 typedef struct LocaleToLegacyCodes LocaleToLegacyCodes;
941
942 static const LocaleToLegacyCodes localeToLegacyCodes[] = {
// Maps a reduced locale string to a RegionCode, a LangCode and a CFStringEncoding value.
// The inline /*...*/ comments name the constant each number stands for; the trailing
// comments give the script or explain which default was assumed for a bare language code.
// NOTE(review): -1 appears where no region or language code is listed; presumably it means
// "no corresponding legacy code" - confirm against the lookup code.
// NOTE(review): rows appear to be in strcmp order of the locale column; presumably the lookup
// relies on that - confirm before adding or reordering rows.
943 // locale RegionCode LangCode CFStringEncoding
944 { "af"/*ZA*/, 102/*verAfrikaans*/, 141/*langAfrikaans*/, 0/*Roman*/ }, // Latn
945 { "am", -1, 85/*langAmharic*/, 28/*Ethiopic*/ }, // Ethi
946 { "ar", 16/*verArabic*/, 12/*langArabic*/, 4/*Arabic*/ }, // Arab;
947 { "as", -1, 68/*langAssamese*/, 13/*Bengali*/ }, // Beng;
948 { "ay", -1, 134/*langAymara*/, 0/*Roman*/ }, // Latn;
949 { "az", -1, 49/*langAzerbaijani*/, 7/*Cyrillic*/ }, // assume "az" defaults to -Cyrl
950 { "az_Arab", -1, 50/*langAzerbaijanAr*/, 4/*Arabic*/ }, // Arab;
951 { "az_Cyrl", -1, 49/*langAzerbaijani*/, 7/*Cyrillic*/ }, // Cyrl;
952 { "az_Latn", -1, 150/*langAzerbaijanRoman*/, 0/*Roman*/ }, // Latn;
953 { "be"/*BY*/, 61/*verBelarus*/, 46/*langBelorussian*/, 7/*Cyrillic*/ }, // Cyrl;
954 { "bg"/*BG*/, 72/*verBulgaria*/, 44/*langBulgarian*/, 7/*Cyrillic*/ }, // Cyrl;
955 { "bn", 60/*verBengali*/, 67/*langBengali*/, 13/*Bengali*/ }, // Beng;
956 { "bo", 105/*verTibetan*/, 63/*langTibetan*/, 26/*Tibetan*/ }, // Tibt;
957 { "br", 77/*verBreton*/, 142/*langBreton*/, 39/*Celtic*/ }, // Latn;
958 { "ca"/*ES*/, 73/*verCatalonia*/, 130/*langCatalan*/, 0/*Roman*/ }, // Latn;
959 { "cs"/*CZ*/, 56/*verCzech*/, 38/*langCzech*/, 29/*CentralEurRoman*/ }, // Latn;
960 { "cy", 79/*verWelsh*/, 128/*langWelsh*/, 39/*Celtic*/ }, // Latn;
961 { "da"/*DK*/, 9/*verDenmark*/, 7/*langDanish*/, 0/*Roman*/ }, // Latn;
962 { "de", 3/*verGermany*/, 2/*langGerman*/, 0/*Roman*/ }, // assume "de" defaults to verGermany
963 { "de_1996", 70/*verGermanReformed*/, 2/*langGerman*/, 0/*Roman*/ },
964 { "de_AT", 92/*verAustria*/, 2/*langGerman*/, 0/*Roman*/ },
965 { "de_CH", 19/*verGrSwiss*/, 2/*langGerman*/, 0/*Roman*/ },
966 { "de_DE", 3/*verGermany*/, 2/*langGerman*/, 0/*Roman*/ },
967 { "dz"/*BT*/, 83/*verBhutan*/, 137/*langDzongkha*/, 26/*Tibetan*/ }, // Tibt;
968 { "el", 20/*verGreece*/, 14/*langGreek*/, 6/*Greek*/ }, // assume "el" defaults to verGreece
969 { "el_CY", 23/*verCyprus*/, 14/*langGreek*/, 6/*Greek*/ },
970 { "el_GR", 20/*verGreece*/, 14/*langGreek*/, 6/*Greek*/ }, // modern monotonic
971 { "en", 0/*verUS*/, 0/*langEnglish*/, 0/*Roman*/ }, // "en" defaults to verUS (per Chris Hansten)
972 { "en_001", 37/*verInternational*/, 0/*langEnglish*/, 0/*Roman*/ },
973 { "en_AU", 15/*verAustralia*/, 0/*langEnglish*/, 0/*Roman*/ },
974 { "en_CA", 82/*verEngCanada*/, 0/*langEnglish*/, 0/*Roman*/ },
975 { "en_GB", 2/*verBritain*/, 0/*langEnglish*/, 0/*Roman*/ },
976 { "en_IE", 108/*verIrelandEnglish*/, 0/*langEnglish*/, 0/*Roman*/ },
977 { "en_SG", 100/*verSingapore*/, 0/*langEnglish*/, 0/*Roman*/ },
978 { "en_US", 0/*verUS*/, 0/*langEnglish*/, 0/*Roman*/ },
979 { "eo", 103/*verEsperanto*/, 94/*langEsperanto*/, 0/*Roman*/ }, // Latn;
980 { "es", 8/*verSpain*/, 6/*langSpanish*/, 0/*Roman*/ }, // "es" defaults to verSpain (per Chris Hansten)
981 { "es_419", 86/*verSpLatinAmerica*/, 6/*langSpanish*/, 0/*Roman*/ }, // new BCP 47 tag
982 { "es_ES", 8/*verSpain*/, 6/*langSpanish*/, 0/*Roman*/ },
983 { "es_MX", 86/*verSpLatinAmerica*/, 6/*langSpanish*/, 0/*Roman*/ },
984 { "es_US", 86/*verSpLatinAmerica*/, 6/*langSpanish*/, 0/*Roman*/ },
985 { "et"/*EE*/, 44/*verEstonia*/, 27/*langEstonian*/, 29/*CentralEurRoman*/ },
986 { "eu", -1, 129/*langBasque*/, 0/*Roman*/ }, // Latn;
987 { "fa"/*IR*/, 48/*verIran*/, 31/*langFarsi/Persian*/, 0x8C/*Farsi*/ }, // Arab;
988 { "fi"/*FI*/, 17/*verFinland*/, 13/*langFinnish*/, 0/*Roman*/ },
989 { "fo"/*FO*/, 47/*verFaroeIsl*/, 30/*langFaroese*/, 37/*Icelandic*/ },
990 { "fr", 1/*verFrance*/, 1/*langFrench*/, 0/*Roman*/ }, // "fr" defaults to verFrance (per Chris Hansten)
991 { "fr_001", 91/*verFrenchUniversal*/, 1/*langFrench*/, 0/*Roman*/ },
992 { "fr_BE", 98/*verFrBelgium*/, 1/*langFrench*/, 0/*Roman*/ },
993 { "fr_CA", 11/*verFrCanada*/, 1/*langFrench*/, 0/*Roman*/ },
994 { "fr_CH", 18/*verFrSwiss*/, 1/*langFrench*/, 0/*Roman*/ },
995 { "fr_FR", 1/*verFrance*/, 1/*langFrench*/, 0/*Roman*/ },
996 { "ga"/*IE*/, 50/*verIreland*/, 35/*langIrishGaelic*/, 0/*Roman*/ }, // no dots (h after)
997 { "ga_Latg"/*IE*/, 81/*verIrishGaelicScrip*/, 146/*langIrishGaelicScript*/, 40/*Gaelic*/ }, // using dots
998 { "gd", 75/*verScottishGaelic*/, 144/*langScottishGaelic*/, 39/*Celtic*/ },
999 { "gl", -1, 140/*langGalician*/, 0/*Roman*/ }, // Latn;
1000 { "gn", -1, 133/*langGuarani*/, 0/*Roman*/ }, // Latn;
1001 { "grc", 40/*verGreekAncient*/, 148/*langGreekAncient*/, 6/*Greek*/ }, // polytonic (MacGreek doesn't actually support it)
1002 { "gu"/*IN*/, 94/*verGujarati*/, 69/*langGujarati*/, 11/*Gujarati*/ }, // Gujr;
1003 { "gv", 76/*verManxGaelic*/, 145/*langManxGaelic*/, 39/*Celtic*/ }, // Latn;
1004 { "he"/*IL*/, 13/*verIsrael*/, 10/*langHebrew*/, 5/*Hebrew*/ }, // Hebr;
1005 { "hi"/*IN*/, 33/*verIndiaHindi*/, 21/*langHindi*/, 9/*Devanagari*/ }, // Deva;
1006 { "hr"/*HR*/, 68/*verCroatia*/, 18/*langCroatian*/, 36/*Croatian*/ },
1007 { "hu"/*HU*/, 43/*verHungary*/, 26/*langHungarian*/, 29/*CentralEurRoman*/ },
1008 { "hy"/*AM*/, 84/*verArmenian*/, 51/*langArmenian*/, 24/*Armenian*/ }, // Armn;
1009 { "id", -1, 81/*langIndonesian*/, 0/*Roman*/ }, // Latn;
1010 { "is"/*IS*/, 21/*verIceland*/, 15/*langIcelandic*/, 37/*Icelandic*/ },
1011 { "it", 4/*verItaly*/, 3/*langItalian*/, 0/*Roman*/ }, // "it" defaults to verItaly
1012 { "it_CH", 36/*verItalianSwiss*/, 3/*langItalian*/, 0/*Roman*/ },
1013 { "it_IT", 4/*verItaly*/, 3/*langItalian*/, 0/*Roman*/ },
1014 { "iu"/*CA*/, 78/*verNunavut*/, 143/*langInuktitut*/, 0xEC/*Inuit*/ }, // Cans;
1015 { "ja"/*JP*/, 14/*verJapan*/, 11/*langJapanese*/, 1/*Japanese*/ }, // Jpan;
1016 { "jv", -1, 138/*langJavaneseRom*/, 0/*Roman*/ }, // Latn;
1017 { "ka"/*GE*/, 85/*verGeorgian*/, 52/*langGeorgian*/, 23/*Georgian*/ }, // Geor;
1018 { "kk", -1, 48/*langKazakh*/, 7/*Cyrillic*/ }, // "kk" defaults to -Cyrl; also have -Latn, -Arab
1019 { "kl", 107/*verGreenland*/, 149/*langGreenlandic*/, 0/*Roman*/ }, // Latn;
1020 { "km", -1, 78/*langKhmer*/, 20/*Khmer*/ }, // Khmr;
1021 { "kn", -1, 73/*langKannada*/, 16/*Kannada*/ }, // Knda;
1022 { "ko"/*KR*/, 51/*verKorea*/, 23/*langKorean*/, 3/*Korean*/ }, // Hang;
1023 { "ks", -1, 61/*langKashmiri*/, 4/*Arabic*/ }, // Arab;
1024 { "ku", -1, 60/*langKurdish*/, 4/*Arabic*/ }, // Arab;
1025 { "ky", -1, 54/*langKirghiz*/, 7/*Cyrillic*/ }, // Cyrl; also -Latn, -Arab
1026 { "la", -1, 131/*langLatin*/, 0/*Roman*/ }, // Latn;
1027 { "lo", -1, 79/*langLao*/, 22/*Laotian*/ }, // Laoo;
1028 { "lt"/*LT*/, 41/*verLithuania*/, 24/*langLithuanian*/, 29/*CentralEurRoman*/ },
1029 { "lv"/*LV*/, 45/*verLatvia*/, 28/*langLatvian*/, 29/*CentralEurRoman*/ },
1030 { "mg", -1, 93/*langMalagasy*/, 0/*Roman*/ }, // Latn;
1031 { "mk"/*MK*/, 67/*verMacedonian*/, 43/*langMacedonian*/, 7/*Cyrillic*/ }, // Cyrl;
1032 { "ml", -1, 72/*langMalayalam*/, 17/*Malayalam*/ }, // Mlym;
1033 { "mn", -1, 57/*langMongolian*/, 27/*Mongolian*/ }, // "mn" defaults to -Mong
1034 { "mn_Cyrl", -1, 58/*langMongolianCyr*/, 7/*Cyrillic*/ }, // Cyrl;
1035 { "mn_Mong", -1, 57/*langMongolian*/, 27/*Mongolian*/ }, // Mong;
1036 { "mo", -1, 53/*langMoldavian*/, 7/*Cyrillic*/ }, // Cyrl;
1037 { "mr"/*IN*/, 104/*verMarathi*/, 66/*langMarathi*/, 9/*Devanagari*/ }, // Deva;
1038 { "ms", -1, 83/*langMalayRoman*/, 0/*Roman*/ }, // "ms" defaults to -Latn;
1039 { "ms_Arab", -1, 84/*langMalayArabic*/, 4/*Arabic*/ }, // Arab;
1040 { "mt"/*MT*/, 22/*verMalta*/, 16/*langMaltese*/, 0/*Roman*/ }, // Latn;
1041 { "mul", 74/*verMultilingual*/, -1, 0 },
1042 { "my", -1, 77/*langBurmese*/, 19/*Burmese*/ }, // Mymr;
1043 { "nb"/*NO*/, 12/*verNorway*/, 9/*langNorwegian*/, 0/*Roman*/ },
1044 { "ne"/*NP*/, 106/*verNepal*/, 64/*langNepali*/, 9/*Devanagari*/ }, // Deva;
1045 { "nl", 5/*verNetherlands*/, 4/*langDutch*/, 0/*Roman*/ }, // "nl" defaults to verNetherlands
1046 { "nl_BE", 6/*verFlemish*/, 34/*langFlemish*/, 0/*Roman*/ },
1047 { "nl_NL", 5/*verNetherlands*/, 4/*langDutch*/, 0/*Roman*/ },
1048 { "nn"/*NO*/, 101/*verNynorsk*/, 151/*langNynorsk*/, 0/*Roman*/ },
1049 { "ny", -1, 92/*langNyanja/Chewa*/, 0/*Roman*/ }, // Latn;
1050 { "om", -1, 87/*langOromo*/, 28/*Ethiopic*/ }, // Ethi;
1051 { "or", -1, 71/*langOriya*/, 12/*Oriya*/ }, // Orya;
1052 { "pa", 95/*verPunjabi*/, 70/*langPunjabi*/, 10/*Gurmukhi*/ }, // Guru;
1053 { "pl"/*PL*/, 42/*verPoland*/, 25/*langPolish*/, 29/*CentralEurRoman*/ },
1054 { "ps", -1, 59/*langPashto*/, 0x8C/*Farsi*/ }, // Arab;
1055 { "pt", 71/*verBrazil*/, 8/*langPortuguese*/, 0/*Roman*/ }, // "pt" defaults to verBrazil (per Chris Hansten)
1056 { "pt_BR", 71/*verBrazil*/, 8/*langPortuguese*/, 0/*Roman*/ },
1057 { "pt_PT", 10/*verPortugal*/, 8/*langPortuguese*/, 0/*Roman*/ },
1058 { "qu", -1, 132/*langQuechua*/, 0/*Roman*/ }, // Latn;
1059 { "rn", -1, 91/*langRundi*/, 0/*Roman*/ }, // Latn;
1060 { "ro"/*RO*/, 39/*verRomania*/, 37/*langRomanian*/, 38/*Romanian*/ },
1061 { "ru"/*RU*/, 49/*verRussia*/, 32/*langRussian*/, 7/*Cyrillic*/ }, // Cyrl;
1062 { "rw", -1, 90/*langKinyarwanda*/, 0/*Roman*/ }, // Latn;
1063 { "sa", -1, 65/*langSanskrit*/, 9/*Devanagari*/ }, // Deva;
1064 { "sd", -1, 62/*langSindhi*/, 0x8C/*Farsi*/ }, // Arab;
1065 { "se", 46/*verSami*/, 29/*langSami*/, 0/*Roman*/ },
1066 { "si", -1, 76/*langSinhalese*/, 18/*Sinhalese*/ }, // Sinh;
1067 { "sk"/*SK*/, 57/*verSlovak*/, 39/*langSlovak*/, 29/*CentralEurRoman*/ },
1068 { "sl"/*SI*/, 66/*verSlovenian*/, 40/*langSlovenian*/, 36/*Croatian*/ },
1069 { "so", -1, 88/*langSomali*/, 0/*Roman*/ }, // Latn;
1070 { "sq", -1, 36/*langAlbanian*/, 0/*Roman*/ },
1071 { "sr"/*CS,RS*/, 65/*verSerbian*/, 42/*langSerbian*/, 7/*Cyrillic*/ }, // Cyrl;
1072 { "su", -1, 139/*langSundaneseRom*/, 0/*Roman*/ }, // Latn;
1073 { "sv"/*SE*/, 7/*verSweden*/, 5/*langSwedish*/, 0/*Roman*/ },
1074 { "sw", -1, 89/*langSwahili*/, 0/*Roman*/ }, // Latn;
1075 { "ta", -1, 74/*langTamil*/, 14/*Tamil*/ }, // Taml;
1076 { "te", -1, 75/*langTelugu*/, 15/*Telugu*/ }, // Telu
1077 { "tg", -1, 55/*langTajiki*/, 7/*Cyrillic*/ }, // "tg" defaults to "Cyrl"
1078 { "tg_Cyrl", -1, 55/*langTajiki*/, 7/*Cyrillic*/ }, // Cyrl; also -Latn, -Arab
1079 { "th"/*TH*/, 54/*verThailand*/, 22/*langThai*/, 21/*Thai*/ }, // Thai;
1080 { "ti", -1, 86/*langTigrinya*/, 28/*Ethiopic*/ }, // Ethi;
1081 { "tk", -1, 56/*langTurkmen*/, 7/*Cyrillic*/ }, // "tk" defaults to Cyrl
1082 { "tk_Cyrl", -1, 56/*langTurkmen*/, 7/*Cyrillic*/ }, // Cyrl; also -Latn, -Arab
1083 { "tl", -1, 82/*langTagalog*/, 0/*Roman*/ }, // Latn;
1084 { "to"/*TO*/, 88/*verTonga*/, 147/*langTongan*/, 0/*Roman*/ }, // Latn;
1085 { "tr"/*TR*/, 24/*verTurkey*/, 17/*langTurkish*/, 35/*Turkish*/ }, // Latn;
1086 { "tt", -1, 135/*langTatar*/, 7/*Cyrillic*/ }, // Cyrl;
1087 { "tt_Cyrl", -1, 135/*langTatar*/, 7/*Cyrillic*/ }, // Cyrl;
1088 { "ug", -1, 136/*langUighur*/, 4/*Arabic*/ }, // Arab;
1089 { "uk"/*UA*/, 62/*verUkraine*/, 45/*langUkrainian*/, 7/*Cyrillic*/ }, // Cyrl;
1090 { "und", 55/*verScriptGeneric*/, -1, 0 },
1091 { "ur", 34/*verPakistanUrdu*/, 20/*langUrdu*/, 0x8C/*Farsi*/ }, // "ur" defaults to verPakistanUrdu
1092 { "ur_IN", 96/*verIndiaUrdu*/, 20/*langUrdu*/, 0x8C/*Farsi*/ }, // Arab
1093 { "ur_PK", 34/*verPakistanUrdu*/, 20/*langUrdu*/, 0x8C/*Farsi*/ }, // Arab
1094 { "uz"/*UZ*/, 99/*verUzbek*/, 47/*langUzbek*/, 7/*Cyrillic*/ }, // Cyrl; also -Latn, -Arab
1095 { "uz_Cyrl", 99/*verUzbek*/, 47/*langUzbek*/, 7/*Cyrillic*/ },
1096 { "vi"/*VN*/, 97/*verVietnam*/, 80/*langVietnamese*/, 30/*Vietnamese*/ }, // Latn
1097 { "yi", -1, 41/*langYiddish*/, 5/*Hebrew*/ }, // Hebr;
1098 { "zh", 52/*verChina*/, 33/*langSimpChinese*/, 25/*ChineseSimp*/ }, // "zh" defaults to verChina, langSimpChinese
1099 { "zh_CN", 52/*verChina*/, 33/*langSimpChinese*/, 25/*ChineseSimp*/ },
1100 { "zh_HK", 53/*verTaiwan*/, 19/*langTradChinese*/, 2/*ChineseTrad*/ },
1101 { "zh_Hans", 52/*verChina*/, 33/*langSimpChinese*/, 25/*ChineseSimp*/ },
1102 { "zh_Hant", 53/*verTaiwan*/, 19/*langTradChinese*/, 2/*ChineseTrad*/ },
1103 { "zh_MO", 53/*verTaiwan*/, 19/*langTradChinese*/, 2/*ChineseTrad*/ },
1104 { "zh_SG", 52/*verChina*/, 33/*langSimpChinese*/, 25/*ChineseSimp*/ },
1105 { "zh_TW", 53/*verTaiwan*/, 19/*langTradChinese*/, 2/*ChineseTrad*/ },
1106 };
1107 enum {
1108 kNumLocaleToLegacyCodes = sizeof(localeToLegacyCodes)/sizeof(localeToLegacyCodes[0])
1109 };
1110
1111 /*
1112 For reference here is a list of ICU locales with variants and how some
1113 of them are canonicalized with the ICU function uloc_canonicalize:
1114
1115 ICU 3.0 has:
1116 en_US_POSIX x no change
1117 hy_AM_REVISED x no change
1118 ja_JP_TRADITIONAL -> ja_JP@calendar=japanese
1119 th_TH_TRADITIONAL -> th_TH@calendar=buddhist
1120
1121 ICU 2.8 also had the following (now obsolete):
1122 ca_ES_PREEURO
1123 de__PHONEBOOK -> de@collation=phonebook
1124 de_AT_PREEURO
1125 de_DE_PREEURO
1126 de_LU_PREEURO
1127 el_GR_PREEURO
1128 en_BE_PREEURO
1129 en_GB_EURO -> en_GB@currency=EUR
1130 en_IE_PREEURO -> en_IE@currency=IEP
1131 es__TRADITIONAL -> es@collation=traditional
1132 es_ES_PREEURO
1133 eu_ES_PREEURO
1134 fi_FI_PREEURO
1135 fr_BE_PREEURO
1136 fr_FR_PREEURO -> fr_FR@currency=FRF
1137 fr_LU_PREEURO
1138 ga_IE_PREEURO
1139 gl_ES_PREEURO
1140 hi__DIRECT -> hi@collation=direct
1141 it_IT_PREEURO
1142 nl_BE_PREEURO
1143 nl_NL_PREEURO
1144 pt_PT_PREEURO
1145 zh__PINYIN -> zh@collation=pinyin
1146 zh_TW_STROKE -> zh_TW@collation=stroke
1147
1148 */
1149
1150 // _CompareTestEntryToTableEntryKey
1151 // (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
1152 // comparison function for bsearch
1153 static int _CompareTestEntryToTableEntryKey(const void *testEntryPtr, const void *tableEntryKeyPtr) {
1154 return strcmp( ((const KeyStringToResultString *)testEntryPtr)->key, ((const KeyStringToResultString *)tableEntryKeyPtr)->key );
1155 }
1156
1157 // _CompareTestEntryPrefixToTableEntryKey
1158 // (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
1159 // Comparison function for bsearch. Assumes prefix IS terminated with '-' or '_'.
1160 // Do the following instead of strlen & strncmp so we don't walk tableEntry key twice.
1161 static int _CompareTestEntryPrefixToTableEntryKey(const void *testEntryPtr, const void *tableEntryKeyPtr) {
1162 const char * testPtr = ((const KeyStringToResultString *)testEntryPtr)->key;
1163 const char * tablePtr = ((const KeyStringToResultString *)tableEntryKeyPtr)->key;
1164
1165 while ( *testPtr == *tablePtr && *tablePtr != 0 ) {
1166 testPtr++; tablePtr++;
1167 }
1168 if ( *tablePtr != 0 ) {
1169 // strings are different, and the string in the table has not run out;
1170 // i.e. the table entry is not a prefix of the text string.
1171 return ( *testPtr < *tablePtr )? -1: 1;
1172 }
1173 return 0;
1174 }
1175
1176 // _CompareLowerTestEntryPrefixToTableEntryKey
1177 // (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
1178 // Comparison function for bsearch. Assumes prefix NOT terminated with '-' or '_'.
1179 // Lowercases the test string before comparison (the table should already have lowercased entries).
1180 static int _CompareLowerTestEntryPrefixToTableEntryKey(const void *testEntryPtr, const void *tableEntryKeyPtr) {
1181 const char * testPtr = ((const KeyStringToResultString *)testEntryPtr)->key;
1182 const char * tablePtr = ((const KeyStringToResultString *)tableEntryKeyPtr)->key;
1183 char lowerTestChar;
1184
1185 while ( (lowerTestChar = tolower(*testPtr)) == *tablePtr && *tablePtr != 0 && lowerTestChar != '_' ) { // <1.9>
1186 testPtr++; tablePtr++;
1187 }
1188 if ( *tablePtr != 0 ) {
1189 // strings are different, and the string in the table has not run out;
1190 // i.e. the table entry is not a prefix of the text string.
1191 if (lowerTestChar == '_') // <1.9>
1192 return -1; // <1.9>
1193 return ( lowerTestChar < *tablePtr )? -1: 1;
1194 }
1195 // The string in the table has run out. If the test string char is not alnum,
1196 // then the string matches, else the test string sorts after.
1197 return ( !isalnum(lowerTestChar) )? 0: 1;
1198 }
1199
// _DeleteCharsAtPointer
// (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
// Remove the first _length_ characters of the string at _stringPtr_ by sliding the
// rest of the string (terminating NUL included) down over them.
// The caller guarantees that the string has at least _length_ characters in it.
static void _DeleteCharsAtPointer(char *stringPtr, int length) {
    char *dest = stringPtr;
    const char *src = stringPtr + length;

    for (;;) {
        char moved = *src++;

        *dest++ = moved;
        if (moved == 0) {
            break;  // the terminator has been copied, so we are done
        }
    }
}
1209
// _CopyReplacementAtPointer
// (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
// Overwrite the characters at stringPtr with the replacement string. The replacement's
// terminating NUL byte is *not* copied, so whatever follows in the destination is kept.
static void _CopyReplacementAtPointer(char *stringPtr, const char *replacementPtr) {
    char *dest = stringPtr;
    const char *src;

    for (src = replacementPtr; *src != 0; src++) {
        *dest = *src;
        dest++;
    }
}
1218
1219 // _CheckForTag
1220 // (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
1221 static Boolean _CheckForTag(const char *localeStringPtr, const char *tagPtr, int tagLen) {
1222 return ( strncmp(localeStringPtr, tagPtr, tagLen) == 0 && !isalnum(localeStringPtr[tagLen]) );
1223 }
1224
// _ReplacePrefix
// Replace the first oldPrefixLen characters of locString with newPrefix, shifting the rest
// of the string (including its terminating NUL) left or right as needed.
//   locString        NUL-terminated string, modified in place
//   locStringMaxLen  size of the locString buffer in bytes, including room for the NUL
//   oldPrefixLen     number of leading characters of locString to replace
//   newPrefix        NUL-terminated replacement text (must not overlap locString)
// locString is left unchanged if the replacement is longer and the result would not fit in
// locStringMaxLen bytes, or if oldPrefixLen is negative or exceeds the string length.
// (Moved from _UpdateFullLocaleString into a separate function) // <1.10>
static void _ReplacePrefix(char locString[], int locStringMaxLen, int oldPrefixLen, const char *newPrefix) {
    int newPrefixLen = (int)strlen(newPrefix);
    int stringLen = (int)strlen(locString);
    int lengthDelta = newPrefixLen - oldPrefixLen;

    if (oldPrefixLen < 0 || oldPrefixLen > stringLen) {
        // nothing sensible to replace
        return;
    }
    if (lengthDelta > 0 && stringLen + lengthDelta >= locStringMaxLen) {
        // replacement is longer and there is no room, can't do substitution
        return;
    }
    if (lengthDelta != 0) {
        // Move the tail (everything after the old prefix, plus the NUL) to where it belongs
        // after the new prefix. memmove copes with the overlap in either direction and,
        // unlike a hand-written backward loop, never steps a pointer before locString.
        memmove(locString + newPrefixLen, locString + oldPrefixLen, (size_t)(stringLen - oldPrefixLen) + 1);
    }
    // do the substitution (the NUL of newPrefix is not copied)
    memcpy(locString, newPrefix, (size_t)newPrefixLen);
}
1258
1259 // _UpdateFullLocaleString
1260 // Given a locale string that uses standard codes (not a special old-style Apple string),
1261 // update all the language codes and region codes to latest versions, map 3-letter
1262 // language codes to 2-letter codes if possible, and normalize casing. If requested, return
1263 // pointers to a language-region variant subtag (if present) and a region tag (if present).
1264 // (add locStringMaxLen parameter) // <1.10>
1265 static void _UpdateFullLocaleString(char inLocaleString[], int locStringMaxLen,
1266 char **langRegSubtagRef, char **regionTagRef,
1267 char varKeyValueString[]) // <1.17>
1268 {
1269 KeyStringToResultString testEntry;
1270 KeyStringToResultString * foundEntry;
1271 const SpecialCaseUpdates * specialCasePtr;
1272 char * inLocalePtr;
1273 char * subtagPtr;
1274 char * langRegSubtag = NULL;
1275 char * regionTag = NULL;
1276 char * variantTag = NULL;
1277 Boolean subtagHasDigits, pastPrimarySubtag, hadRegion;
1278
1279 // 1. First replace any non-canonical prefix (case insensitive) with canonical
1280 // (change 3-letter ISO 639 code to 2-letter, update obsolete ISO 639 codes & RFC 3066 tags, etc.)
1281
1282 testEntry.key = inLocaleString;
1283 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, localeStringPrefixToCanonical, kNumLocaleStringPrefixToCanonical,
1284 sizeof(KeyStringToResultString), _CompareLowerTestEntryPrefixToTableEntryKey );
1285 if (foundEntry) {
1286 // replace key (at beginning of string) with result
1287 _ReplacePrefix(inLocaleString, locStringMaxLen, strlen(foundEntry->key), foundEntry->result); // <1.10>
1288 }
1289
1290 // 2. Walk through input string, normalizing case & marking use of ISO 3166 codes
1291
1292 inLocalePtr = inLocaleString;
1293 subtagPtr = inLocaleString;
1294 subtagHasDigits = false;
1295 pastPrimarySubtag = false;
1296 hadRegion = false;
1297
1298 while ( true ) {
1299 if ( isalpha(*inLocalePtr) ) {
1300 // if not past a region tag, then lowercase, else uppercase
1301 *inLocalePtr = (!hadRegion)? tolower(*inLocalePtr): toupper(*inLocalePtr);
1302 } else if ( isdigit(*inLocalePtr) ) {
1303 subtagHasDigits = true;
1304 } else {
1305
1306 if (!pastPrimarySubtag) {
1307 // may have a NULL primary subtag
1308 if (subtagHasDigits) {
1309 break;
1310 }
1311 pastPrimarySubtag = true;
1312 } else if (!hadRegion) {
1313 // We are after any primary language subtag, but not past any region tag.
1314 // This subtag is preceded by '-' or '_'.
1315 int subtagLength = inLocalePtr - subtagPtr; // includes leading '-' or '_'
1316
1317 if (subtagLength == 3 && !subtagHasDigits) {
1318 // potential ISO 3166 code for region or language variant; if so, needs uppercasing
1319 if (*subtagPtr == '_') {
1320 regionTag = subtagPtr;
1321 hadRegion = true;
1322 subtagPtr[1] = toupper(subtagPtr[1]);
1323 subtagPtr[2] = toupper(subtagPtr[2]);
1324 } else if (langRegSubtag == NULL) {
1325 langRegSubtag = subtagPtr;
1326 subtagPtr[1] = toupper(subtagPtr[1]);
1327 subtagPtr[2] = toupper(subtagPtr[2]);
1328 }
1329 } else if (subtagLength == 4 && subtagHasDigits) {
1330 // potential UN M.49 region code
1331 if (*subtagPtr == '_') {
1332 regionTag = subtagPtr;
1333 hadRegion = true;
1334 } else if (langRegSubtag == NULL) {
1335 langRegSubtag = subtagPtr;
1336 }
1337 } else if (subtagLength == 5 && !subtagHasDigits) {
1338 // ISO 15924 script code, uppercase just the first letter
1339 subtagPtr[1] = toupper(subtagPtr[1]);
1340 } else if (subtagLength == 1 && *subtagPtr == '_') { // <1.17>
1341 hadRegion = true;
1342 }
1343
1344 if (!hadRegion) {
1345 // convert improper '_' to '-'
1346 *subtagPtr = '-';
1347 }
1348 } else {
1349 variantTag = subtagPtr; // <1.17>
1350 }
1351
1352 if (*inLocalePtr == '-' || *inLocalePtr == '_') {
1353 subtagPtr = inLocalePtr;
1354 subtagHasDigits = false;
1355 } else {
1356 break;
1357 }
1358 }
1359
1360 inLocalePtr++;
1361 }
1362
1363 // 3 If there is a variant tag, see if ICU canonicalizes it to keywords. // <1.17> [3577669]
1364 // If so, copy the keywords to varKeyValueString and delete the variant tag
1365 // from the original string (but don't otherwise use the ICU canonicalization).
1366 varKeyValueString[0] = 0;
1367 if (variantTag) {
1368 UErrorCode icuStatus;
1369 int icuCanonStringLen;
1370 char * varKeyValueStringPtr = varKeyValueString;
1371
1372 icuStatus = U_ZERO_ERROR;
1373 icuCanonStringLen = uloc_canonicalize( inLocaleString, varKeyValueString, locStringMaxLen, &icuStatus );
1374 if ( U_SUCCESS(icuStatus) ) {
1375 char * icuCanonStringPtr = varKeyValueString;
1376
1377 if (icuCanonStringLen >= locStringMaxLen)
1378 icuCanonStringLen = locStringMaxLen - 1;
1379 varKeyValueString[icuCanonStringLen] = 0;
1380 while (*icuCanonStringPtr != 0 && *icuCanonStringPtr != ULOC_KEYWORD_SEPARATOR)
1381 ++icuCanonStringPtr;
1382 if (*icuCanonStringPtr != 0) {
1383 // the canonicalized string has keywords
1384 // delete the variant tag in the original string (and other trailing '_' or '-')
1385 *variantTag-- = 0;
1386 while (*variantTag == '_')
1387 *variantTag-- = 0;
1388 // delete all of the canonicalized string except the keywords
1389 while (*icuCanonStringPtr != 0)
1390 *varKeyValueStringPtr++ = *icuCanonStringPtr++;
1391 }
1392 *varKeyValueStringPtr = 0;
1393 }
1394 }
1395
1396 // 4. Handle special cases of updating region codes, or updating language codes based on
1397 // region code.
1398 for (specialCasePtr = specialCases; specialCasePtr->reg1 != NULL; specialCasePtr++) {
1399 if ( specialCasePtr->lang == NULL || _CheckForTag(inLocaleString, specialCasePtr->lang, 2) ) {
1400 // OK, we matched any language specified. Now what needs updating?
1401 char * foundTag;
1402
1403 if ( isupper(specialCasePtr->update1[0]) ) {
1404 // updating a region code
1405 if ( ( foundTag = strstr(inLocaleString, specialCasePtr->reg1) ) && !isalnum(foundTag[3]) ) {
1406 _CopyReplacementAtPointer(foundTag+1, specialCasePtr->update1);
1407 }
1408 if ( regionTag && _CheckForTag(regionTag+1, specialCasePtr->reg1 + 1, 2) ) {
1409 _CopyReplacementAtPointer(regionTag+1, specialCasePtr->update1);
1410 }
1411
1412 } else {
1413 // updating the language, there will be two choices based on region
1414 if ( ( regionTag && _CheckForTag(regionTag+1, specialCasePtr->reg1 + 1, 2) ) ||
1415 ( ( foundTag = strstr(inLocaleString, specialCasePtr->reg1) ) && !isalnum(foundTag[3]) ) ) {
1416 _CopyReplacementAtPointer(inLocaleString, specialCasePtr->update1);
1417 } else if ( ( regionTag && _CheckForTag(regionTag+1, specialCasePtr->reg2 + 1, 2) ) ||
1418 ( ( foundTag = strstr(inLocaleString, specialCasePtr->reg2) ) && !isalnum(foundTag[3]) ) ) {
1419 _CopyReplacementAtPointer(inLocaleString, specialCasePtr->update2);
1420 }
1421 }
1422 }
1423 }
1424
1425 // 5. return pointers if requested.
1426 if (langRegSubtagRef != NULL) {
1427 *langRegSubtagRef = langRegSubtag;
1428 }
1429 if (regionTagRef != NULL) {
1430 *regionTagRef = regionTag;
1431 }
1432 }
1433
1434
// _RemoveSubstringsIfPresent
// (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
// substringList is a list of whitespace-separated substrings. For each one, the first
// occurrence in localeString (if any) is deleted in place; later occurrences are kept.
//
// Each substring is matched directly inside substringList, so there is no limit on the
// length of a substring. Bytes in substringList that are neither printable (isgraph)
// nor whitespace (isspace) are skipped.
static void _RemoveSubstringsIfPresent(char *localeString, const char *substringList) {
    const char * listPtr = substringList;

    while (*listPtr != 0) {
        const char *    tokenStart = listPtr;
        size_t          tokenLength;
        char *          scanPtr;

        // measure the current substring
        while ( isgraph((unsigned char)*listPtr) ) {
            listPtr++;
        }
        tokenLength = (size_t)(listPtr - tokenStart);

        // move to next substring
        while ( isspace((unsigned char)*listPtr) ) {
            listPtr++;
        }

        if (tokenLength == 0) {
            if (listPtr == tokenStart) {
                // neither a substring character nor whitespace: step over it so the
                // outer loop always makes progress
                listPtr++;
            }
            continue;
        }

        // search for the first occurrence of the current substring in the locale string;
        // strncmp stops at the locale string's NUL, so a too-short remainder never matches
        for (scanPtr = localeString; *scanPtr != 0; scanPtr++) {
            if ( strncmp(scanPtr, tokenStart, tokenLength) == 0 ) {
                // found it: close the gap by sliding the tail (and its NUL) down
                memmove(scanPtr, scanPtr + tokenLength, strlen(scanPtr + tokenLength) + 1);
                break;
            }
        }
    }
}
1465
1466
1467 // _GetKeyValueString // <1.10>
1468 // Removes any key-value string from inLocaleString, puts canonized version in keyValueString
1469
1470 static void _GetKeyValueString(char inLocaleString[], char keyValueString[]) {
1471 char * inLocalePtr = inLocaleString;
1472
1473 while (*inLocalePtr != 0 && *inLocalePtr != ULOC_KEYWORD_SEPARATOR) {
1474 inLocalePtr++;
1475 }
1476 if (*inLocalePtr != 0) { // we found a key-value section
1477 char * keyValuePtr = keyValueString;
1478
1479 *keyValuePtr = *inLocalePtr;
1480 *inLocalePtr = 0;
1481 do {
1482 if ( *(++inLocalePtr) != ' ' ) {
1483 *(++keyValuePtr) = *inLocalePtr; // remove "tolower() for *inLocalePtr" // <1.11>
1484 }
1485 } while (*inLocalePtr != 0);
1486 } else {
1487 keyValueString[0] = 0;
1488 }
1489 }
1490
1491 static void _AppendKeyValueString(char inLocaleString[], int locStringMaxLen, char keyValueString[]) {
1492 if (keyValueString[0] != 0) {
1493 UErrorCode uerr = U_ZERO_ERROR;
1494 UEnumeration * uenum = uloc_openKeywords(keyValueString, &uerr);
1495 if ( uenum != NULL ) {
1496 const char * keyword;
1497 int32_t length;
1498 char value[ULOC_KEYWORDS_CAPACITY]; // use as max for keyword value
1499 while ( U_SUCCESS(uerr) ) {
1500 keyword = uenum_next(uenum, &length, &uerr);
1501 if ( keyword == NULL ) {
1502 break;
1503 }
1504 length = uloc_getKeywordValue( keyValueString, keyword, value, sizeof(value), &uerr );
1505 length = uloc_setKeywordValue( keyword, value, inLocaleString, locStringMaxLen, &uerr );
1506 }
1507 uenum_close(uenum);
1508 }
1509 }
1510 }
1511
1512 __private_extern__ CFStringRef _CFLocaleCreateCanonicalLanguageIdentifierForCFBundle(CFAllocatorRef allocator, CFStringRef localeIdentifier) {
1513 char inLocaleString[kLocaleIdentifierCStringMax];
1514 CFStringRef outStringRef = NULL;
1515
1516 if ( localeIdentifier && CFStringGetCString(localeIdentifier, inLocaleString, sizeof(inLocaleString), kCFStringEncodingASCII) ) {
1517 KeyStringToResultString testEntry;
1518 KeyStringToResultString * foundEntry;
1519 char keyValueString[sizeof(inLocaleString)]; // <1.10>
1520 char varKeyValueString[sizeof(inLocaleString)]; // <1.17>
1521
1522 _GetKeyValueString(inLocaleString, keyValueString); // <1.10>
1523 testEntry.result = NULL;
1524
1525 // A. First check if input string matches an old-style string that has a replacement
1526 // (do this before case normalization)
1527 testEntry.key = inLocaleString;
1528 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, oldAppleLocaleToCanonical, kNumOldAppleLocaleToCanonical,
1529 sizeof(KeyStringToResultString), _CompareTestEntryToTableEntryKey );
1530 if (foundEntry) {
1531 // It does match, so replace old string with new
1532 strlcpy(inLocaleString, foundEntry->result, sizeof(inLocaleString));
1533 varKeyValueString[0] = 0;
1534 } else {
1535 // B. No match with an old-style string, use input string but update codes, normalize case, etc.
1536 _UpdateFullLocaleString(inLocaleString, sizeof(inLocaleString), NULL, NULL, varKeyValueString); // <1.10><1.17>
1537 }
1538
1539 // C. Now we have an up-to-date locale string, but we need to strip defaults and turn it into a language string
1540
1541 // 1. Strip defaults in input string based on initial part of locale string
1542 // (mainly to strip default script tag for a language)
1543 testEntry.key = inLocaleString;
1544 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, localeStringPrefixToDefaults, kNumLocaleStringPrefixToDefaults,
1545 sizeof(KeyStringToResultString), _CompareTestEntryPrefixToTableEntryKey );
1546 if (foundEntry) {
1547 // The input string begins with a character sequence for which
1548 // there are default substrings which should be stripped if present
1549 _RemoveSubstringsIfPresent(inLocaleString, foundEntry->result);
1550 }
1551
1552 // 2. If the string matches a locale string used by Apple as a language string, turn it into a language string
1553 testEntry.key = inLocaleString;
1554 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, appleLocaleToLanguageStringForCFBundle, kNumAppleLocaleToLanguageStringForCFBundle,
1555 sizeof(KeyStringToResultString), _CompareTestEntryToTableEntryKey );
1556 if (foundEntry) {
1557 // it does match
1558 strlcpy(inLocaleString, foundEntry->result, sizeof(inLocaleString));
1559 } else {
1560 // just delete the region tag and anything after
1561 char * inLocalePtr = inLocaleString;
1562 while (*inLocalePtr != 0 && *inLocalePtr != '_') {
1563 inLocalePtr++;
1564 }
1565 *inLocalePtr = 0;
1566 }
1567
1568 // D. Re-append any key-value strings, now canonical // <1.10><1.17>
1569 _AppendKeyValueString( inLocaleString, sizeof(inLocaleString), varKeyValueString );
1570 _AppendKeyValueString( inLocaleString, sizeof(inLocaleString), keyValueString );
1571
1572 // All done, return what we came up with.
1573 outStringRef = CFStringCreateWithCString(allocator, inLocaleString, kCFStringEncodingASCII);
1574 }
1575
1576 return outStringRef;
1577 }
1578
1579 CFStringRef CFLocaleCreateCanonicalLanguageIdentifierFromString(CFAllocatorRef allocator, CFStringRef localeIdentifier) {
1580 char inLocaleString[kLocaleIdentifierCStringMax];
1581 CFStringRef outStringRef = NULL;
1582
1583 if ( localeIdentifier && CFStringGetCString(localeIdentifier, inLocaleString, sizeof(inLocaleString), kCFStringEncodingASCII) ) {
1584 KeyStringToResultString testEntry;
1585 KeyStringToResultString * foundEntry;
1586 char keyValueString[sizeof(inLocaleString)]; // <1.10>
1587 char varKeyValueString[sizeof(inLocaleString)]; // <1.17>
1588
1589 _GetKeyValueString(inLocaleString, keyValueString); // <1.10>
1590 testEntry.result = NULL;
1591
1592 // A. First check if input string matches an old-style string that has a replacement
1593 // (do this before case normalization)
1594 testEntry.key = inLocaleString;
1595 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, oldAppleLocaleToCanonical, kNumOldAppleLocaleToCanonical,
1596 sizeof(KeyStringToResultString), _CompareTestEntryToTableEntryKey );
1597 if (foundEntry) {
1598 // It does match, so replace old string with new
1599 strlcpy(inLocaleString, foundEntry->result, sizeof(inLocaleString));
1600 varKeyValueString[0] = 0;
1601 } else {
1602 char * langRegSubtag = NULL;
1603 char * regionTag = NULL;
1604
1605 // B. No match with an old-style string, use input string but update codes, normalize case, etc.
1606 _UpdateFullLocaleString(inLocaleString, sizeof(inLocaleString), &langRegSubtag, &regionTag, varKeyValueString); // <1.10><1.17><1.19>
1607
1608 // if the language part already includes a regional variant, then delete any region tag. <1.19>
1609 if (langRegSubtag && regionTag)
1610 *regionTag = 0;
1611 }
1612
1613 // C. Now we have an up-to-date locale string, but we need to strip defaults and turn it into a language string
1614
1615 // 1. Strip defaults in input string based on initial part of locale string
1616 // (mainly to strip default script tag for a language)
1617 testEntry.key = inLocaleString;
1618 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, localeStringPrefixToDefaults, kNumLocaleStringPrefixToDefaults,
1619 sizeof(KeyStringToResultString), _CompareTestEntryPrefixToTableEntryKey );
1620 if (foundEntry) {
1621 // The input string begins with a character sequence for which
1622 // there are default substrings which should be stripped if present
1623 _RemoveSubstringsIfPresent(inLocaleString, foundEntry->result);
1624 }
1625
1626 // 2. If the string matches a locale string used by Apple as a language string, turn it into a language string
1627 testEntry.key = inLocaleString;
1628 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, appleLocaleToLanguageString, kNumAppleLocaleToLanguageString,
1629 sizeof(KeyStringToResultString), _CompareTestEntryToTableEntryKey );
1630 if (foundEntry) {
1631 // it does match
1632 strlcpy(inLocaleString, foundEntry->result, sizeof(inLocaleString));
1633 } else {
1634 // skip to any region tag or java-type variant
1635 char * inLocalePtr = inLocaleString;
1636 while (*inLocalePtr != 0 && *inLocalePtr != '_') {
1637 inLocalePtr++;
1638 }
1639 // if there is still a region tag, turn it into a language variant <1.19>
1640 if (*inLocalePtr == '_') {
1641 // handle 3-digit regions in addition to 2-letter ones
1642 char * regionTag = inLocalePtr++;
1643 long expectedLength = 0;
1644 if ( isalpha(*inLocalePtr) ) {
1645 while ( isalpha(*(++inLocalePtr)) )
1646 ;
1647 expectedLength = 3;
1648 } else if ( isdigit(*inLocalePtr) ) {
1649 while ( isdigit(*(++inLocalePtr)) )
1650 ;
1651 expectedLength = 4;
1652 }
1653 *regionTag = (inLocalePtr - regionTag == expectedLength)? '-': 0;
1654 }
1655 // anything else at/after '_' just gets deleted
1656 *inLocalePtr = 0;
1657 }
1658
1659 // D. Re-append any key-value strings, now canonical // <1.10><1.17>
1660 _AppendKeyValueString( inLocaleString, sizeof(inLocaleString), varKeyValueString );
1661 _AppendKeyValueString( inLocaleString, sizeof(inLocaleString), keyValueString );
1662
1663 // All done, return what we came up with.
1664 outStringRef = CFStringCreateWithCString(allocator, inLocaleString, kCFStringEncodingASCII);
1665 }
1666
1667 return outStringRef;
1668 }
1669
1670
1671 CFStringRef CFLocaleCreateCanonicalLocaleIdentifierFromString(CFAllocatorRef allocator, CFStringRef localeIdentifier) {
1672 char inLocaleString[kLocaleIdentifierCStringMax];
1673 CFStringRef outStringRef = NULL;
1674
1675 if ( localeIdentifier && CFStringGetCString(localeIdentifier, inLocaleString, sizeof(inLocaleString), kCFStringEncodingASCII) ) {
1676 KeyStringToResultString testEntry;
1677 KeyStringToResultString * foundEntry;
1678 char keyValueString[sizeof(inLocaleString)]; // <1.10>
1679 char varKeyValueString[sizeof(inLocaleString)]; // <1.17>
1680
1681 _GetKeyValueString(inLocaleString, keyValueString); // <1.10>
1682 testEntry.result = NULL;
1683
1684 // A. First check if input string matches an old-style Apple string that has a replacement
1685 // (do this before case normalization)
1686 testEntry.key = inLocaleString;
1687 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, oldAppleLocaleToCanonical, kNumOldAppleLocaleToCanonical,
1688 sizeof(KeyStringToResultString), _CompareTestEntryToTableEntryKey );
1689 if (foundEntry) {
1690 // It does match, so replace old string with new // <1.10>
1691 strlcpy(inLocaleString, foundEntry->result, sizeof(inLocaleString));
1692 varKeyValueString[0] = 0;
1693 } else {
1694 char * langRegSubtag = NULL;
1695 char * regionTag = NULL;
1696
1697 // B. No match with an old-style string, use input string but update codes, normalize case, etc.
1698 _UpdateFullLocaleString(inLocaleString, sizeof(inLocaleString), &langRegSubtag, &regionTag, varKeyValueString); // <1.10><1.17>
1699
1700
1701 // C. Now strip defaults that are implied by other fields.
1702
1703 // 1. If an ISO 3166 region tag matches an ISO 3166 regional language variant subtag, strip the latter.
1704 if ( langRegSubtag && regionTag && strncmp(langRegSubtag+1, regionTag+1, 2) == 0 ) {
1705 _DeleteCharsAtPointer(langRegSubtag, 3);
1706 }
1707
1708 // 2. Strip defaults in input string based on final region tag in locale string
1709 // (mainly for Chinese, to strip -Hans for _CN/_SG, -Hant for _TW/_HK/_MO)
1710 if ( regionTag ) {
1711 testEntry.key = regionTag;
1712 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, localeStringRegionToDefaults, kNumLocaleStringRegionToDefaults,
1713 sizeof(KeyStringToResultString), _CompareTestEntryToTableEntryKey );
1714 if (foundEntry) {
1715 _RemoveSubstringsIfPresent(inLocaleString, foundEntry->result);
1716 }
1717 }
1718
1719 // 3. Strip defaults in input string based on initial part of locale string
1720 // (mainly to strip default script tag for a language)
1721 testEntry.key = inLocaleString;
1722 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, localeStringPrefixToDefaults, kNumLocaleStringPrefixToDefaults,
1723 sizeof(KeyStringToResultString), _CompareTestEntryPrefixToTableEntryKey );
1724 if (foundEntry) {
1725 // The input string begins with a character sequence for which
1726 // there are default substrings which should be stripped if present
1727 _RemoveSubstringsIfPresent(inLocaleString, foundEntry->result);
1728 }
1729 }
1730
1731 // D. Re-append any key-value strings, now canonical // <1.10><1.17>
1732 _AppendKeyValueString( inLocaleString, sizeof(inLocaleString), varKeyValueString );
1733 _AppendKeyValueString( inLocaleString, sizeof(inLocaleString), keyValueString );
1734
1735 // Now create the CFString (even if empty!)
1736 outStringRef = CFStringCreateWithCString(allocator, inLocaleString, kCFStringEncodingASCII);
1737 }
1738
1739 return outStringRef;
1740 }
1741
1742 // CFLocaleCreateCanonicalLocaleIdentifierFromScriptManagerCodes, based on
1743 // the first part of the SPI CFBundleCopyLocalizationForLocalizationInfo in CFBundle_Resources.c
1744 CFStringRef CFLocaleCreateCanonicalLocaleIdentifierFromScriptManagerCodes(CFAllocatorRef allocator, LangCode lcode, RegionCode rcode) {
1745 CFStringRef result = NULL;
1746 if (0 <= rcode && rcode < kNumRegionCodeToLocaleString) {
1747 const char *localeString = regionCodeToLocaleString[rcode];
1748 if (localeString != NULL && *localeString != '\0') {
1749 result = CFStringCreateWithCStringNoCopy(allocator, localeString, kCFStringEncodingASCII, kCFAllocatorNull);
1750 }
1751 }
1752 if (result) return result;
1753 if (0 <= lcode && lcode < kNumLangCodeToLocaleString) {
1754 const char *localeString = langCodeToLocaleString[lcode];
1755 if (localeString != NULL && *localeString != '\0') {
1756 result = CFStringCreateWithCStringNoCopy(allocator, localeString, kCFStringEncodingASCII, kCFAllocatorNull);
1757 }
1758 }
1759 return result;
1760 }
1761
1762
1763 CFDictionaryRef CFLocaleCreateComponentsFromLocaleIdentifier(CFAllocatorRef allocator, CFStringRef localeID) {
1764 char cLocaleID[ULOC_FULLNAME_CAPACITY+ULOC_KEYWORD_AND_VALUES_CAPACITY];
1765 char buffer[ULOC_FULLNAME_CAPACITY+ULOC_KEYWORD_AND_VALUES_CAPACITY];
1766 CFMutableDictionaryRef working = CFDictionaryCreateMutable(allocator, 10, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
1767
1768 UErrorCode icuStatus = U_ZERO_ERROR;
1769 int32_t length = 0;
1770
1771 // Extract the C string locale ID, for ICU
1772 CFIndex outBytes = 0;
1773 CFStringGetBytes(localeID, CFRangeMake(0, CFStringGetLength(localeID)), kCFStringEncodingASCII, (UInt8) '?', true, (unsigned char *)cLocaleID, sizeof(cLocaleID)/sizeof(char) - 1, &outBytes);
1774 cLocaleID[outBytes] = '\0';
1775
1776 // Get the components
1777 length = uloc_getLanguage(cLocaleID, buffer, sizeof(buffer)/sizeof(char), &icuStatus);
1778 if (U_SUCCESS(icuStatus) && length > 0)
1779 {
1780 CFStringRef string = CFStringCreateWithBytes(allocator, (UInt8 *)buffer, length, kCFStringEncodingASCII, true);
1781 CFDictionaryAddValue(working, kCFLocaleLanguageCode, string);
1782 CFRelease(string);
1783 }
1784 icuStatus = U_ZERO_ERROR;
1785
1786 length = uloc_getScript(cLocaleID, buffer, sizeof(buffer)/sizeof(char), &icuStatus);
1787 if (U_SUCCESS(icuStatus) && length > 0)
1788 {
1789 CFStringRef string = CFStringCreateWithBytes(allocator, (UInt8 *)buffer, length, kCFStringEncodingASCII, true);
1790 CFDictionaryAddValue(working, kCFLocaleScriptCode, string);
1791 CFRelease(string);
1792 }
1793 icuStatus = U_ZERO_ERROR;
1794
1795 length = uloc_getCountry(cLocaleID, buffer, sizeof(buffer)/sizeof(char), &icuStatus);
1796 if (U_SUCCESS(icuStatus) && length > 0)
1797 {
1798 CFStringRef string = CFStringCreateWithBytes(allocator, (UInt8 *)buffer, length, kCFStringEncodingASCII, true);
1799 CFDictionaryAddValue(working, kCFLocaleCountryCode, string);
1800 CFRelease(string);
1801 }
1802 icuStatus = U_ZERO_ERROR;
1803
1804 length = uloc_getVariant(cLocaleID, buffer, sizeof(buffer)/sizeof(char), &icuStatus);
1805 if (U_SUCCESS(icuStatus) && length > 0)
1806 {
1807 CFStringRef string = CFStringCreateWithBytes(allocator, (UInt8 *)buffer, length, kCFStringEncodingASCII, true);
1808 CFDictionaryAddValue(working, kCFLocaleVariantCode, string);
1809 CFRelease(string);
1810 }
1811 icuStatus = U_ZERO_ERROR;
1812
1813 // Now get the keywords; open an enumerator on them
1814 UEnumeration *iter = uloc_openKeywords(cLocaleID, &icuStatus);
1815 const char *locKey = NULL;
1816 int32_t locKeyLen = 0;
1817 while ((locKey = uenum_next(iter, &locKeyLen, &icuStatus)) && U_SUCCESS(icuStatus))
1818 {
1819 char locValue[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1820
1821 // Get the value for this keyword
1822 if (uloc_getKeywordValue(cLocaleID, locKey, locValue, sizeof(locValue)/sizeof(char), &icuStatus) > 0
1823 && U_SUCCESS(icuStatus))
1824 {
1825 CFStringRef key = CFStringCreateWithBytes(allocator, (UInt8 *)locKey, strlen(locKey), kCFStringEncodingASCII, true);
1826 CFStringRef value = CFStringCreateWithBytes(allocator, (UInt8 *)locValue, strlen(locValue), kCFStringEncodingASCII, true);
1827 if (key && value)
1828 CFDictionaryAddValue(working, key, value);
1829 if (key)
1830 CFRelease(key);
1831 if (value)
1832 CFRelease(value);
1833 }
1834 }
1835 uenum_close(iter);
1836
1837 // Convert to an immutable dictionary and return
1838 CFDictionaryRef result = CFDictionaryCreateCopy(allocator, working);
1839 CFRelease(working);
1840 return result;
1841 }
1842
1843 typedef struct __AppendContext
1844 {
1845 char separator;
1846 CFMutableStringRef working;
1847 } __AppendContext;
1848
1849 static void __AppendKeywords(const void *k, const void *v, void *c)
1850 {
1851 __AppendContext *context = (__AppendContext *) c;
1852 CFStringRef key = (CFStringRef) k;
1853 CFStringRef value = (CFStringRef) v;
1854 if (CFEqual(key, kCFLocaleLanguageCode) || CFEqual(key, kCFLocaleScriptCode) || CFEqual(key, kCFLocaleCountryCode) || CFEqual(key, kCFLocaleVariantCode))
1855 return;
1856 CFStringAppendFormat(context->working, NULL, CFSTR("%c%@%c%@"), context->separator, key, ULOC_KEYWORD_ASSIGN, value);
1857 context->separator = ULOC_KEYWORD_ITEM_SEPARATOR;
1858 }
1859
1860 CFStringRef CFLocaleCreateLocaleIdentifierFromComponents(CFAllocatorRef allocator, CFDictionaryRef dictionary) {
1861 CFMutableStringRef working = CFStringCreateMutable(allocator, 0);
1862 CFStringRef value = NULL;
1863 bool country = false;
1864 __AppendContext context = {ULOC_KEYWORD_SEPARATOR, working};
1865
1866 if ((value = (CFStringRef) CFDictionaryGetValue(dictionary, kCFLocaleLanguageCode)))
1867 {
1868 CFStringAppend(working, value);
1869 }
1870
1871 if ((value = (CFStringRef) CFDictionaryGetValue(dictionary, kCFLocaleScriptCode)))
1872 {
1873 CFStringAppendFormat(working, NULL, CFSTR("_%@"), value);
1874 }
1875
1876 if ((value = (CFStringRef) CFDictionaryGetValue(dictionary, kCFLocaleCountryCode)))
1877 {
1878 CFStringAppendFormat(working, NULL, CFSTR("_%@"), value);
1879 country = true;
1880 }
1881
1882 if ((value = (CFStringRef) CFDictionaryGetValue(dictionary, kCFLocaleVariantCode)))
1883 {
1884 if (!country)
1885 CFStringAppend(working, CFSTR("_"));
1886 CFStringAppendFormat(working, NULL, CFSTR("_%@"), value);
1887 }
1888
1889 // Now iterate through any remaining entries and append as keywords
1890 CFDictionaryApplyFunction(dictionary, __AppendKeywords, &context);
1891
1892 // Convert to immutable string and return
1893 CFStringRef result = (CFStringRef)CFStringCreateCopy(allocator, working);
1894 CFRelease(working);
1895 return result;
1896 }
1897