]> git.saurik.com Git - apple/cf.git/blob - CFLocaleIdentifier.c
CF-550.43.tar.gz
[apple/cf.git] / CFLocaleIdentifier.c
1 /*
2 * Copyright (c) 2010 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 /*
25 CFLocaleIdentifier.c
26 Copyright (c) 2002-2009, Apple Inc. All rights reserved.
27 Responsibility: Christopher Kane
28
29 CFLocaleIdentifier.c defines
30 - enum value kLocaleIdentifierCStringMax
31 - structs KeyStringToResultString, SpecialCaseUpdates
32 and provides the following data for the functions
33 CFLocaleCreateCanonicalLocaleIdentifierFromScriptManagerCodes,
34 CFLocaleCreateCanonicalLocaleIdentifierFromString
35 CFLocaleCreateCanonicalLanguageIdentifierFromString
36
37 1. static const char * regionCodeToLocaleString[]; enum kNumRegionCodeToLocaleString;
38 map RegionCode 0..kNumRegionCodeToLocaleString-1 to canonical locale string
39
40 2. static const char * langCodeToLocaleString[]; enum kNumLangCodeToLocaleString;
41 map LangCode 0..kNumLangCodeToLocaleString-1 to canonical locale string
42
43 3. static const KeyStringToResultString oldAppleLocaleToCanonical[]; enum kNumOldAppleLocaleToCanonical;
44 map old Apple string oldAppleLocaleToCanonical[n].key
45 to canonical locale string oldAppleLocaleToCanonical[n].result
46 for n = 0..kNumOldAppleLocaleToCanonical-1
47
48 4. static const KeyStringToResultString localeStringPrefixToCanonical[]; enum kNumLocaleStringPrefixToCanonical;
49 map non-canonical language prefix (3-letter, obsolete) localeStringPrefixToCanonical[].key
50 to updated replacement localeStringPrefixToCanonical[].result
51 for n = 0..kNumLocaleStringPrefixToCanonical-1
52
53 5. static const SpecialCaseUpdates specialCases[];
54 various special cases for updating region codes, or for updating language codes based on region codes
55
56 6. static const KeyStringToResultString localeStringRegionToDefaults[]; enum kNumLocaleStringRegionToDefaults;
57 map locale string region tag localeStringRegionToDefaults[n].key
58 to default substrings to delete localeStringRegionToDefaults[n].result
59 for n = 0..kNumLocaleStringRegionToDefaults-1
60
61 7. static const KeyStringToResultString localeStringPrefixToDefaults[]; enum kNumLocaleStringPrefixToDefaults;
62 map locale string initial part localeStringPrefixToDefaults[n].key
63 to default substrings to delete localeStringPrefixToDefaults[n].result
64 for n = 0..kNumLocaleStringPrefixToDefaults-1
65
66 8. static const KeyStringToResultString appleLocaleToLanguageString[]; enum kNumAppleLocaleToLanguageString;
67 map Apple locale string appleLocaleToLanguageString[].key
68 to equivalent language string appleLocaleToLanguageString[].result
69 for n = 0..kNumAppleLocaleToLanguageString-1
70
71 */
72
73 #include <CoreFoundation/CFString.h>
74 #include <ctype.h>
75 #include <string.h>
76 #include <stdlib.h>
77 #include <unicode/uloc.h>
78 #include "CFInternal.h"
79 #include "CFLocaleInternal.h"
80
81 // Max byte length of locale identifier (ASCII) as C string, including terminating null byte
82 enum {
83 kLocaleIdentifierCStringMax = ULOC_FULLNAME_CAPACITY + ULOC_KEYWORD_AND_VALUES_CAPACITY // currently 56 + 100
84 };
85
// One row of a string-to-string lookup table, used by the data tables for
// CFLocaleCreateCanonicalLocaleIdentifierFromString.
// Rows are written positionally as { key, result }, so the field order must not change.
typedef struct KeyStringToResultString {
    const char *key;     // string to match against
    const char *result;  // string the key maps to
} KeyStringToResultString;
92
// One row of the special-case table, used by the data tables for
// CFLocaleCreateCanonicalLocaleIdentifierFromString.
// Holds a language string plus two (region, update) string pairs.
// NOTE(review): how the pairs are matched and applied is decided by code that
// is not visible here - confirm against the specialCases[] consumer.
// Rows are written positionally, so the field order must not change.
typedef struct SpecialCaseUpdates {
    const char *lang;     // language string for this row
    const char *reg1;     // first region string
    const char *update1;  // update paired with reg1
    const char *reg2;     // second region string
    const char *update2;  // update paired with reg2
} SpecialCaseUpdates;
102
103
// Lookup table: RegionCode (used directly as the array index) -> canonical locale string.
// A NULL slot marks a region code that is unassigned or unused.
//
// Per-row comment layout:
//   <region code> <region constant>; <language code> <language constant>; [notes]
//   [ # string in __CFBundleLocaleAbbreviationsArray, when it differs ]
static const char * const regionCodeToLocaleString[] = {
    "en_US",        //   0 verUS;                 0 langEnglish;
    "fr_FR",        //   1 verFrance;             1 langFrench;
    "en_GB",        //   2 verBritain;            0 langEnglish;
    "de_DE",        //   3 verGermany;            2 langGerman;
    "it_IT",        //   4 verItaly;              3 langItalian;
    "nl_NL",        //   5 verNetherlands;        4 langDutch;
    "nl_BE",        //   6 verFlemish;           34 langFlemish (redundant, =Dutch);
    "sv_SE",        //   7 verSweden;             5 langSwedish;
    "es_ES",        //   8 verSpain;              6 langSpanish;
    "da_DK",        //   9 verDenmark;            7 langDanish;
    "pt_PT",        //  10 verPortugal;           8 langPortuguese;
    "fr_CA",        //  11 verFrCanada;           1 langFrench;
    "nb_NO",        //  12 verNorway;             9 langNorwegian (Bokmal);           # "no_NO"
    "he_IL",        //  13 verIsrael;            10 langHebrew;
    "ja_JP",        //  14 verJapan;             11 langJapanese;
    "en_AU",        //  15 verAustralia;          0 langEnglish;
    "ar",           //  16 verArabic;            12 langArabic;
    "fi_FI",        //  17 verFinland;           13 langFinnish;
    "fr_CH",        //  18 verFrSwiss;            1 langFrench;
    "de_CH",        //  19 verGrSwiss;            2 langGerman;
    "el_GR",        //  20 verGreece;            14 langGreek (modern)-Grek-mono;
    "is_IS",        //  21 verIceland;           15 langIcelandic;
    "mt_MT",        //  22 verMalta;             16 langMaltese;
    "el_CY",        //  23 verCyprus;            14 langGreek?; el or tr? guess el    # ""
    "tr_TR",        //  24 verTurkey;            17 langTurkish;
    "hr_HR",        //  25 verYugoCroatian;      18 langCroatian;  * one-way mapping -> verCroatia
    "nl_NL",        //  26 KCHR, Netherlands;     4 langDutch;     * one-way mapping
    "nl_BE",        //  27 KCHR, verFlemish;     34 langFlemish;   * one-way mapping
    "_CA",          //  28 KCHR, Canada-en/fr?;  -1 none;          * one-way mapping  # "en_CA"
    "_CA",          //  29 KCHR, Canada-en/fr?;  -1 none;          * one-way mapping  # "en_CA"
    "pt_PT",        //  30 KCHR, Portugal;        8 langPortuguese; * one-way mapping
    "nb_NO",        //  31 KCHR, Norway;          9 langNorwegian (Bokmal); * one-way mapping # "no_NO"
    "da_DK",        //  32 KCHR, Denmark;         7 langDanish;    * one-way mapping
    "hi_IN",        //  33 verIndiaHindi;        21 langHindi;
    "ur_PK",        //  34 verPakistanUrdu;      20 langUrdu;
    "tr_TR",        //  35 verTurkishModified;   17 langTurkish;   * one-way mapping
    "it_CH",        //  36 verItalianSwiss;       3 langItalian;
    "en_001",       //  37 verInternational;      0 langEnglish; ASCII only           # "en"
    NULL,           //  38 *unassigned;          -1 none;          * one-way mapping  # ""
    "ro_RO",        //  39 verRomania;           37 langRomanian;
    "grc",          //  40 verGreekAncient;     148 langGreekAncient -Grek-poly;      # "el_GR"
    "lt_LT",        //  41 verLithuania;         24 langLithuanian;
    "pl_PL",        //  42 verPoland;            25 langPolish;
    "hu_HU",        //  43 verHungary;           26 langHungarian;
    "et_EE",        //  44 verEstonia;           27 langEstonian;
    "lv_LV",        //  45 verLatvia;            28 langLatvian;
    "se",           //  46 verSami;              29 langSami;
    "fo_FO",        //  47 verFaroeIsl;          30 langFaroese;
    "fa_IR",        //  48 verIran;              31 langFarsi/Persian;
    "ru_RU",        //  49 verRussia;            32 langRussian;
    "ga_IE",        //  50 verIreland;           35 langIrishGaelic (no dots);
    "ko_KR",        //  51 verKorea;             23 langKorean;
    "zh_CN",        //  52 verChina;             33 langSimpChinese;
    "zh_TW",        //  53 verTaiwan;            19 langTradChinese;
    "th_TH",        //  54 verThailand;          22 langThai;
    "und",          //  55 verScriptGeneric;     -1 none;                             # ""  // <1.9>
    "cs_CZ",        //  56 verCzech;             38 langCzech;
    "sk_SK",        //  57 verSlovak;            39 langSlovak;
    "und",          //  58 verEastAsiaGeneric;   -1 none;          * one-way mapping  # ""  // <1.9>
    "hu_HU",        //  59 verMagyar;            26 langHungarian; * one-way mapping -> verHungary
    "bn",           //  60 verBengali;           67 langBengali; _IN or _BD? guess generic
    "be_BY",        //  61 verBelarus;           46 langBelorussian;
    "uk_UA",        //  62 verUkraine;           45 langUkrainian;
    NULL,           //  63 *unused;              -1 none;          * one-way mapping  # ""
    "el_GR",        //  64 verGreeceAlt;         14 langGreek (modern)-Grek-mono; * one-way mapping
    "sr_RS",        //  65 verSerbian;           42 langSerbian -Cyrl;                // <1.18>
    "sl_SI",        //  66 verSlovenian;         40 langSlovenian;
    "mk_MK",        //  67 verMacedonian;        43 langMacedonian;
    "hr_HR",        //  68 verCroatia;           18 langCroatian;
    NULL,           //  69 *unused;              -1 none;          * one-way mapping  # ""
    "de-1996",      //  70 verGermanReformed;     2 langGerman; 1996 orthogr.         # "de_DE"
    "pt_BR",        //  71 verBrazil;             8 langPortuguese;
    "bg_BG",        //  72 verBulgaria;          44 langBulgarian;
    "ca_ES",        //  73 verCatalonia;        130 langCatalan;
    "mul",          //  74 verMultilingual;      -1 none;                             # ""
    "gd",           //  75 verScottishGaelic;   144 langScottishGaelic;
    "gv",           //  76 verManxGaelic;       145 langManxGaelic;
    "br",           //  77 verBreton;           142 langBreton;
    "iu_CA",        //  78 verNunavut;          143 langInuktitut -Cans;
    "cy",           //  79 verWelsh;            128 langWelsh;
    "_CA",          //  80 KCHR, Canada-en/fr?;  -1 none;          * one-way mapping  # "en_CA"
    "ga-Latg_IE",   //  81 verIrishGaelicScrip; 146 langIrishGaelicScript -dots;      # "ga_IE" // <xx>
    "en_CA",        //  82 verEngCanada;          0 langEnglish;
    "dz_BT",        //  83 verBhutan;           137 langDzongkha;
    "hy_AM",        //  84 verArmenian;          51 langArmenian;
    "ka_GE",        //  85 verGeorgian;          52 langGeorgian;
    "es_419",       //  86 verSpLatinAmerica;     6 langSpanish;                      # "es"
    "es_ES",        //  87 KCHR, Spain;           6 langSpanish;   * one-way mapping
    "to_TO",        //  88 verTonga;            147 langTongan;
    "pl_PL",        //  89 KCHR, Poland;         25 langPolish;    * one-way mapping
    "ca_ES",        //  90 KCHR, Catalonia;     130 langCatalan;   * one-way mapping
    "fr_001",       //  91 verFrenchUniversal;    1 langFrench;
    "de_AT",        //  92 verAustria;            2 langGerman;
    "es_419",       //  93 > verSpLatinAmerica;   6 langSpanish;   * one-way mapping  # "es"
    "gu_IN",        //  94 verGujarati;          69 langGujarati;
    "pa",           //  95 verPunjabi;           70 langPunjabi; _IN or _PK? guess generic
    "ur_IN",        //  96 verIndiaUrdu;         20 langUrdu;
    "vi_VN",        //  97 verVietnam;           80 langVietnamese;
    "fr_BE",        //  98 verFrBelgium;          1 langFrench;
    "uz_UZ",        //  99 verUzbek;             47 langUzbek;
    "en_SG",        // 100 verSingapore;          0 langEnglish?; en, zh, or ms? guess en # ""
    "nn_NO",        // 101 verNynorsk;          151 langNynorsk;                      # ""
    "af_ZA",        // 102 verAfrikaans;        141 langAfrikaans;
    "eo",           // 103 verEsperanto;         94 langEsperanto;
    "mr_IN",        // 104 verMarathi;           66 langMarathi;
    "bo",           // 105 verTibetan;           63 langTibetan;
    "ne_NP",        // 106 verNepal;             64 langNepali;
    "kl",           // 107 verGreenland;        149 langGreenlandic;
    "en_IE",        // 108 verIrelandEnglish;     0 langEnglish;                      # (no entry)
};

// Number of slots in regionCodeToLocaleString; valid RegionCode indices are
// 0 .. kNumRegionCodeToLocaleString-1.
enum {
    kNumRegionCodeToLocaleString = sizeof(regionCodeToLocaleString) / sizeof(regionCodeToLocaleString[0])
};
222
// Lookup table: LangCode (used directly as the array index) -> canonical locale string.
// Codes 95..127 have no entry; those slots are padded with NULL so that the
// entries from 128 upward land on their own index.
//
// Per-row comment layout:
//   <language code> <language constant> [-script]; [notes]
//   [ # string in __CFBundleLanguageAbbreviationsArray, when it differs ]
static const char * const langCodeToLocaleString[] = {
    "en",           //   0 langEnglish;
    "fr",           //   1 langFrench;
    "de",           //   2 langGerman;
    "it",           //   3 langItalian;
    "nl",           //   4 langDutch;
    "sv",           //   5 langSwedish;
    "es",           //   6 langSpanish;
    "da",           //   7 langDanish;
    "pt",           //   8 langPortuguese;
    "nb",           //   9 langNorwegian (Bokmal);                  # "no"
    "he",           //  10 langHebrew -Hebr;
    "ja",           //  11 langJapanese -Jpan;
    "ar",           //  12 langArabic -Arab;
    "fi",           //  13 langFinnish;
    "el",           //  14 langGreek (modern)-Grek-mono;
    "is",           //  15 langIcelandic;
    "mt",           //  16 langMaltese -Latn;
    "tr",           //  17 langTurkish -Latn;
    "hr",           //  18 langCroatian;
    "zh-Hant",      //  19 langTradChinese;                         # "zh"
    "ur",           //  20 langUrdu -Arab;
    "hi",           //  21 langHindi -Deva;
    "th",           //  22 langThai -Thai;
    "ko",           //  23 langKorean -Hang;
    "lt",           //  24 langLithuanian;
    "pl",           //  25 langPolish;
    "hu",           //  26 langHungarian;
    "et",           //  27 langEstonian;
    "lv",           //  28 langLatvian;
    "se",           //  29 langSami;
    "fo",           //  30 langFaroese;
    "fa",           //  31 langFarsi/Persian -Arab;
    "ru",           //  32 langRussian -Cyrl;
    "zh-Hans",      //  33 langSimpChinese;                         # "zh"
    "nl-BE",        //  34 langFlemish (redundant, =Dutch);         # "nl"
    "ga",           //  35 langIrishGaelic (no dots);
    "sq",           //  36 langAlbanian; no region codes
    "ro",           //  37 langRomanian;
    "cs",           //  38 langCzech;
    "sk",           //  39 langSlovak;
    "sl",           //  40 langSlovenian;
    "yi",           //  41 langYiddish -Hebr; no region codes
    "sr",           //  42 langSerbian -Cyrl;
    "mk",           //  43 langMacedonian -Cyrl;
    "bg",           //  44 langBulgarian -Cyrl;
    "uk",           //  45 langUkrainian -Cyrl;
    "be",           //  46 langBelorussian -Cyrl;
    "uz-Cyrl",      //  47 langUzbek -Cyrl; also -Latn, -Arab
    "kk",           //  48 langKazakh -Cyrl; no region codes; also -Latn, -Arab
    "az-Cyrl",      //  49 langAzerbaijani -Cyrl; no region codes   # "az"
    "az-Arab",      //  50 langAzerbaijanAr -Arab; no region codes  # "az"
    "hy",           //  51 langArmenian -Armn;
    "ka",           //  52 langGeorgian -Geor;
    "mo",           //  53 langMoldavian -Cyrl; no region codes
    "ky",           //  54 langKirghiz -Cyrl; no region codes; also -Latn, -Arab
    "tg-Cyrl",      //  55 langTajiki -Cyrl; no region codes; also -Latn, -Arab
    "tk-Cyrl",      //  56 langTurkmen -Cyrl; no region codes; also -Latn, -Arab
    "mn-Mong",      //  57 langMongolian -Mong; no region codes     # "mn"
    "mn-Cyrl",      //  58 langMongolianCyr -Cyrl; no region codes  # "mn"
    "ps",           //  59 langPashto -Arab; no region codes
    "ku",           //  60 langKurdish -Arab; no region codes
    "ks",           //  61 langKashmiri -Arab; no region codes
    "sd",           //  62 langSindhi -Arab; no region codes
    "bo",           //  63 langTibetan -Tibt;
    "ne",           //  64 langNepali -Deva;
    "sa",           //  65 langSanskrit -Deva; no region codes
    "mr",           //  66 langMarathi -Deva;
    "bn",           //  67 langBengali -Beng;
    "as",           //  68 langAssamese -Beng; no region codes
    "gu",           //  69 langGujarati -Gujr;
    "pa",           //  70 langPunjabi -Guru;
    "or",           //  71 langOriya -Orya; no region codes
    "ml",           //  72 langMalayalam -Mlym; no region codes
    "kn",           //  73 langKannada -Knda; no region codes
    "ta",           //  74 langTamil -Taml; no region codes
    "te",           //  75 langTelugu -Telu; no region codes
    "si",           //  76 langSinhalese -Sinh; no region codes
    "my",           //  77 langBurmese -Mymr; no region codes
    "km",           //  78 langKhmer -Khmr; no region codes
    "lo",           //  79 langLao -Laoo; no region codes
    "vi",           //  80 langVietnamese -Latn;
    "id",           //  81 langIndonesian -Latn; no region codes
    "tl",           //  82 langTagalog -Latn; no region codes
    "ms",           //  83 langMalayRoman -Latn; no region codes    # "ms"
    "ms-Arab",      //  84 langMalayArabic -Arab; no region codes   # "ms"
    "am",           //  85 langAmharic -Ethi; no region codes
    "ti",           //  86 langTigrinya -Ethi; no region codes
    "om",           //  87 langOromo -Ethi; no region codes
    "so",           //  88 langSomali -Latn; no region codes
    "sw",           //  89 langSwahili -Latn; no region codes
    "rw",           //  90 langKinyarwanda -Latn; no region codes
    "rn",           //  91 langRundi -Latn; no region codes
    "ny",           //  92 langNyanja/Chewa -Latn; no region codes  # ""
    "mg",           //  93 langMalagasy -Latn; no region codes
    "eo",           //  94 langEsperanto -Latn;
    // Padding for the unassigned codes: three rows of 11 NULLs = 33 slots (95..127).
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,   //  95 to 105 (gap)
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,   // 106 to 116 (gap)
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,   // 117 to 127 (gap)
    "cy",           // 128 langWelsh -Latn;
    "eu",           // 129 langBasque -Latn; no region codes
    "ca",           // 130 langCatalan -Latn;
    "la",           // 131 langLatin -Latn; no region codes
    "qu",           // 132 langQuechua -Latn; no region codes
    "gn",           // 133 langGuarani -Latn; no region codes
    "ay",           // 134 langAymara -Latn; no region codes
    "tt-Cyrl",      // 135 langTatar -Cyrl; no region codes
    "ug",           // 136 langUighur -Arab; no region codes
    "dz",           // 137 langDzongkha -Tibt;
    "jv",           // 138 langJavaneseRom -Latn; no region codes
    "su",           // 139 langSundaneseRom -Latn; no region codes
    "gl",           // 140 langGalician -Latn; no region codes
    "af",           // 141 langAfrikaans -Latn;
    "br",           // 142 langBreton -Latn;
    "iu",           // 143 langInuktitut -Cans;
    "gd",           // 144 langScottishGaelic;
    "gv",           // 145 langManxGaelic -Latn;
    "ga-Latg",      // 146 langIrishGaelicScript -Latn-dots;        # "ga" // <xx>
    "to",           // 147 langTongan -Latn;
    "grc",          // 148 langGreekAncient -Grek-poly;             # "el"
    "kl",           // 149 langGreenlandic -Latn;
    "az-Latn",      // 150 langAzerbaijanRoman -Latn; no region codes # "az"
    "nn",           // 151 langNynorsk -Latn;                       # (no entry)
};

// Number of slots in langCodeToLocaleString; valid LangCode indices are
// 0 .. kNumLangCodeToLocaleString-1.
enum {
    kNumLangCodeToLocaleString = sizeof(langCodeToLocaleString) / sizeof(langCodeToLocaleString[0])
};
354
355 static const KeyStringToResultString oldAppleLocaleToCanonical[] = {
356 // Map obsolete/old-style Apple strings to canonical
357 // Must be sorted according to how strcmp compares the strings in the first column
358 //
359 // non-canonical canonical [ comment ] # source/reason for non-canonical string
360 // string string
361 // ------------- ---------
362 { "Afrikaans", "af" }, // # __CFBundleLanguageNamesArray
363 { "Albanian", "sq" }, // # __CFBundleLanguageNamesArray
364 { "Amharic", "am" }, // # __CFBundleLanguageNamesArray
365 { "Arabic", "ar" }, // # __CFBundleLanguageNamesArray
366 { "Armenian", "hy" }, // # __CFBundleLanguageNamesArray
367 { "Assamese", "as" }, // # __CFBundleLanguageNamesArray
368 { "Aymara", "ay" }, // # __CFBundleLanguageNamesArray
369 { "Azerbaijani", "az" }, // -Arab,-Cyrl,-Latn? # __CFBundleLanguageNamesArray (had 3 entries "Azerbaijani" for "az-Arab", "az-Cyrl", "az-Latn")
370 { "Basque", "eu" }, // # __CFBundleLanguageNamesArray
371 { "Belarusian", "be" }, // # handle other names
372 { "Belorussian", "be" }, // # handle other names
373 { "Bengali", "bn" }, // # __CFBundleLanguageNamesArray
374 { "Brazilian Portugese", "pt-BR" }, // # from Installer.app Info.plist IFLanguages key, misspelled
375 { "Brazilian Portuguese", "pt-BR" }, // # correct spelling for above
376 { "Breton", "br" }, // # __CFBundleLanguageNamesArray
377 { "Bulgarian", "bg" }, // # __CFBundleLanguageNamesArray
378 { "Burmese", "my" }, // # __CFBundleLanguageNamesArray
379 { "Byelorussian", "be" }, // # __CFBundleLanguageNamesArray
380 { "Catalan", "ca" }, // # __CFBundleLanguageNamesArray
381 { "Chewa", "ny" }, // # handle other names
382 { "Chichewa", "ny" }, // # handle other names
383 { "Chinese", "zh" }, // -Hans,-Hant? # __CFBundleLanguageNamesArray (had 2 entries "Chinese" for "zh-Hant", "zh-Hans")
384 { "Chinese, Simplified", "zh-Hans" }, // # from Installer.app Info.plist IFLanguages key
385 { "Chinese, Traditional", "zh-Hant" }, // # correct spelling for below
386 { "Chinese, Tradtional", "zh-Hant" }, // # from Installer.app Info.plist IFLanguages key, misspelled
387 { "Croatian", "hr" }, // # __CFBundleLanguageNamesArray
388 { "Czech", "cs" }, // # __CFBundleLanguageNamesArray
389 { "Danish", "da" }, // # __CFBundleLanguageNamesArray
390 { "Dutch", "nl" }, // # __CFBundleLanguageNamesArray (had 2 entries "Dutch" for "nl", "nl-BE")
391 { "Dzongkha", "dz" }, // # __CFBundleLanguageNamesArray
392 { "English", "en" }, // # __CFBundleLanguageNamesArray
393 { "Esperanto", "eo" }, // # __CFBundleLanguageNamesArray
394 { "Estonian", "et" }, // # __CFBundleLanguageNamesArray
395 { "Faroese", "fo" }, // # __CFBundleLanguageNamesArray
396 { "Farsi", "fa" }, // # __CFBundleLanguageNamesArray
397 { "Finnish", "fi" }, // # __CFBundleLanguageNamesArray
398 { "Flemish", "nl-BE" }, // # handle other names
399 { "French", "fr" }, // # __CFBundleLanguageNamesArray
400 { "Galician", "gl" }, // # __CFBundleLanguageNamesArray
401 { "Gallegan", "gl" }, // # handle other names
402 { "Georgian", "ka" }, // # __CFBundleLanguageNamesArray
403 { "German", "de" }, // # __CFBundleLanguageNamesArray
404 { "Greek", "el" }, // # __CFBundleLanguageNamesArray (had 2 entries "Greek" for "el", "grc")
405 { "Greenlandic", "kl" }, // # __CFBundleLanguageNamesArray
406 { "Guarani", "gn" }, // # __CFBundleLanguageNamesArray
407 { "Gujarati", "gu" }, // # __CFBundleLanguageNamesArray
408 { "Hawaiian", "haw" }, // # handle new languages
409 { "Hebrew", "he" }, // # __CFBundleLanguageNamesArray
410 { "Hindi", "hi" }, // # __CFBundleLanguageNamesArray
411 { "Hungarian", "hu" }, // # __CFBundleLanguageNamesArray
412 { "Icelandic", "is" }, // # __CFBundleLanguageNamesArray
413 { "Indonesian", "id" }, // # __CFBundleLanguageNamesArray
414 { "Inuktitut", "iu" }, // # __CFBundleLanguageNamesArray
415 { "Irish", "ga" }, // # __CFBundleLanguageNamesArray (had 2 entries "Irish" for "ga", "ga-dots")
416 { "Italian", "it" }, // # __CFBundleLanguageNamesArray
417 { "Japanese", "ja" }, // # __CFBundleLanguageNamesArray
418 { "Javanese", "jv" }, // # __CFBundleLanguageNamesArray
419 { "Kalaallisut", "kl" }, // # handle other names
420 { "Kannada", "kn" }, // # __CFBundleLanguageNamesArray
421 { "Kashmiri", "ks" }, // # __CFBundleLanguageNamesArray
422 { "Kazakh", "kk" }, // # __CFBundleLanguageNamesArray
423 { "Khmer", "km" }, // # __CFBundleLanguageNamesArray
424 { "Kinyarwanda", "rw" }, // # __CFBundleLanguageNamesArray
425 { "Kirghiz", "ky" }, // # __CFBundleLanguageNamesArray
426 { "Korean", "ko" }, // # __CFBundleLanguageNamesArray
427 { "Kurdish", "ku" }, // # __CFBundleLanguageNamesArray
428 { "Lao", "lo" }, // # __CFBundleLanguageNamesArray
429 { "Latin", "la" }, // # __CFBundleLanguageNamesArray
430 { "Latvian", "lv" }, // # __CFBundleLanguageNamesArray
431 { "Lithuanian", "lt" }, // # __CFBundleLanguageNamesArray
432 { "Macedonian", "mk" }, // # __CFBundleLanguageNamesArray
433 { "Malagasy", "mg" }, // # __CFBundleLanguageNamesArray
434 { "Malay", "ms" }, // -Latn,-Arab? # __CFBundleLanguageNamesArray (had 2 entries "Malay" for "ms-Latn", "ms-Arab")
435 { "Malayalam", "ml" }, // # __CFBundleLanguageNamesArray
436 { "Maltese", "mt" }, // # __CFBundleLanguageNamesArray
437 { "Manx", "gv" }, // # __CFBundleLanguageNamesArray
438 { "Marathi", "mr" }, // # __CFBundleLanguageNamesArray
439 { "Moldavian", "mo" }, // # __CFBundleLanguageNamesArray
440 { "Mongolian", "mn" }, // -Mong,-Cyrl? # __CFBundleLanguageNamesArray (had 2 entries "Mongolian" for "mn-Mong", "mn-Cyrl")
441 { "Nepali", "ne" }, // # __CFBundleLanguageNamesArray
442 { "Norwegian", "nb" }, // # __CFBundleLanguageNamesArray (had "Norwegian" mapping to "no")
443 { "Nyanja", "ny" }, // # __CFBundleLanguageNamesArray
444 { "Nynorsk", "nn" }, // # handle other names (no entry in __CFBundleLanguageNamesArray)
445 { "Oriya", "or" }, // # __CFBundleLanguageNamesArray
446 { "Oromo", "om" }, // # __CFBundleLanguageNamesArray
447 { "Panjabi", "pa" }, // # handle other names
448 { "Pashto", "ps" }, // # __CFBundleLanguageNamesArray
449 { "Persian", "fa" }, // # handle other names
450 { "Polish", "pl" }, // # __CFBundleLanguageNamesArray
451 { "Portuguese", "pt" }, // # __CFBundleLanguageNamesArray
452 { "Portuguese, Brazilian", "pt-BR" }, // # handle other names
453 { "Punjabi", "pa" }, // # __CFBundleLanguageNamesArray
454 { "Pushto", "ps" }, // # handle other names
455 { "Quechua", "qu" }, // # __CFBundleLanguageNamesArray
456 { "Romanian", "ro" }, // # __CFBundleLanguageNamesArray
457 { "Ruanda", "rw" }, // # handle other names
458 { "Rundi", "rn" }, // # __CFBundleLanguageNamesArray
459 { "Russian", "ru" }, // # __CFBundleLanguageNamesArray
460 { "Sami", "se" }, // # __CFBundleLanguageNamesArray
461 { "Sanskrit", "sa" }, // # __CFBundleLanguageNamesArray
462 { "Scottish", "gd" }, // # __CFBundleLanguageNamesArray
463 { "Serbian", "sr" }, // # __CFBundleLanguageNamesArray
464 { "Simplified Chinese", "zh-Hans" }, // # handle other names
465 { "Sindhi", "sd" }, // # __CFBundleLanguageNamesArray
466 { "Sinhalese", "si" }, // # __CFBundleLanguageNamesArray
467 { "Slovak", "sk" }, // # __CFBundleLanguageNamesArray
468 { "Slovenian", "sl" }, // # __CFBundleLanguageNamesArray
469 { "Somali", "so" }, // # __CFBundleLanguageNamesArray
470 { "Spanish", "es" }, // # __CFBundleLanguageNamesArray
471 { "Sundanese", "su" }, // # __CFBundleLanguageNamesArray
472 { "Swahili", "sw" }, // # __CFBundleLanguageNamesArray
473 { "Swedish", "sv" }, // # __CFBundleLanguageNamesArray
474 { "Tagalog", "tl" }, // # __CFBundleLanguageNamesArray
475 { "Tajik", "tg" }, // # handle other names
476 { "Tajiki", "tg" }, // # __CFBundleLanguageNamesArray
477 { "Tamil", "ta" }, // # __CFBundleLanguageNamesArray
478 { "Tatar", "tt" }, // # __CFBundleLanguageNamesArray
479 { "Telugu", "te" }, // # __CFBundleLanguageNamesArray
480 { "Thai", "th" }, // # __CFBundleLanguageNamesArray
481 { "Tibetan", "bo" }, // # __CFBundleLanguageNamesArray
482 { "Tigrinya", "ti" }, // # __CFBundleLanguageNamesArray
483 { "Tongan", "to" }, // # __CFBundleLanguageNamesArray
484 { "Traditional Chinese", "zh-Hant" }, // # handle other names
485 { "Turkish", "tr" }, // # __CFBundleLanguageNamesArray
486 { "Turkmen", "tk" }, // # __CFBundleLanguageNamesArray
487 { "Uighur", "ug" }, // # __CFBundleLanguageNamesArray
488 { "Ukrainian", "uk" }, // # __CFBundleLanguageNamesArray
489 { "Urdu", "ur" }, // # __CFBundleLanguageNamesArray
490 { "Uzbek", "uz" }, // # __CFBundleLanguageNamesArray
491 { "Vietnamese", "vi" }, // # __CFBundleLanguageNamesArray
492 { "Welsh", "cy" }, // # __CFBundleLanguageNamesArray
493 { "Yiddish", "yi" }, // # __CFBundleLanguageNamesArray
494 { "ar_??", "ar" }, // # from old MapScriptInfoAndISOCodes
495 { "az.Ar", "az-Arab" }, // # from old LocaleRefGetPartString
496 { "az.Cy", "az-Cyrl" }, // # from old LocaleRefGetPartString
497 { "az.La", "az-Latn" }, // # from old LocaleRefGetPartString
498 { "be_??", "be_BY" }, // # from old MapScriptInfoAndISOCodes
499 { "bn_??", "bn" }, // # from old LocaleRefGetPartString
500 { "bo_??", "bo" }, // # from old MapScriptInfoAndISOCodes
501 { "br_??", "br" }, // # from old MapScriptInfoAndISOCodes
502 { "cy_??", "cy" }, // # from old MapScriptInfoAndISOCodes
503 { "de-96", "de-1996" }, // # from old MapScriptInfoAndISOCodes // <1.9>
504 { "de_96", "de-1996" }, // # from old MapScriptInfoAndISOCodes // <1.9>
505 { "de_??", "de-1996" }, // # from old MapScriptInfoAndISOCodes
506 { "el.El-P", "grc" }, // # from old LocaleRefGetPartString
507 { "en-ascii", "en_001" }, // # from earlier version of tables in this file!
508 { "en_??", "en_001" }, // # from old MapScriptInfoAndISOCodes
509 { "eo_??", "eo" }, // # from old MapScriptInfoAndISOCodes
510 { "es_??", "es_419" }, // # from old MapScriptInfoAndISOCodes
511 { "es_XL", "es_419" }, // # from earlier version of tables in this file!
512 { "fr_??", "fr_001" }, // # from old MapScriptInfoAndISOCodes
513 { "ga-dots", "ga-Latg" }, // # from earlier version of tables in this file! // <1.8>
514 { "ga-dots_IE", "ga-Latg_IE" }, // # from earlier version of tables in this file! // <1.8>
515 { "ga.Lg", "ga-Latg" }, // # from old LocaleRefGetPartString // <1.8>
516 { "ga.Lg_IE", "ga-Latg_IE" }, // # from old LocaleRefGetPartString // <1.8>
517 { "gd_??", "gd" }, // # from old MapScriptInfoAndISOCodes
518 { "gv_??", "gv" }, // # from old MapScriptInfoAndISOCodes
519 { "jv.La", "jv" }, // # logical extension // <1.9>
520 { "jw.La", "jv" }, // # from old LocaleRefGetPartString
521 { "kk.Cy", "kk" }, // # from old LocaleRefGetPartString
522 { "kl.La", "kl" }, // # from old LocaleRefGetPartString
523 { "kl.La_GL", "kl_GL" }, // # from old LocaleRefGetPartString // <1.9>
524 { "lp_??", "se" }, // # from old MapScriptInfoAndISOCodes
525 { "mk_??", "mk_MK" }, // # from old MapScriptInfoAndISOCodes
526 { "mn.Cy", "mn-Cyrl" }, // # from old LocaleRefGetPartString
527 { "mn.Mn", "mn-Mong" }, // # from old LocaleRefGetPartString
528 { "ms.Ar", "ms-Arab" }, // # from old LocaleRefGetPartString
529 { "ms.La", "ms" }, // # from old LocaleRefGetPartString
530 { "nl-be", "nl-BE" }, // # from old LocaleRefGetPartString
531 { "nl-be_BE", "nl_BE" }, // # from old LocaleRefGetPartString
532 { "no-NO", "nb-NO" }, // # not handled by localeStringPrefixToCanonical
533 { "no-NO_NO", "nb-NO_NO" }, // # not handled by localeStringPrefixToCanonical
534 // { "no-bok_NO", "nb_NO" }, // # from old LocaleRefGetPartString - handled by localeStringPrefixToCanonical
535 // { "no-nyn_NO", "nn_NO" }, // # from old LocaleRefGetPartString - handled by localeStringPrefixToCanonical
536 // { "nya", "ny" }, // # from old LocaleRefGetPartString - handled by localeStringPrefixToCanonical
537 { "pa_??", "pa" }, // # from old LocaleRefGetPartString
538 { "sa.Dv", "sa" }, // # from old LocaleRefGetPartString
539 { "sl_??", "sl_SI" }, // # from old MapScriptInfoAndISOCodes
540 { "sr_??", "sr_RS" }, // # from old MapScriptInfoAndISOCodes // <1.18>
541 { "su.La", "su" }, // # from old LocaleRefGetPartString
542 { "yi.He", "yi" }, // # from old LocaleRefGetPartString
543 { "zh-simp", "zh-Hans" }, // # from earlier version of tables in this file!
544 { "zh-trad", "zh-Hant" }, // # from earlier version of tables in this file!
545 { "zh.Ha-S", "zh-Hans" }, // # from old LocaleRefGetPartString
546 { "zh.Ha-S_CN", "zh_CN" }, // # from old LocaleRefGetPartString
547 { "zh.Ha-T", "zh-Hant" }, // # from old LocaleRefGetPartString
548 { "zh.Ha-T_TW", "zh_TW" }, // # from old LocaleRefGetPartString
549 };
550 enum {
551 kNumOldAppleLocaleToCanonical = sizeof(oldAppleLocaleToCanonical)/sizeof(KeyStringToResultString)
552 };
553
554 static const KeyStringToResultString localeStringPrefixToCanonical[] = {
555 // Map 3-letter & obsolete ISO 639 codes, plus obsolete RFC 3066 codes, to 2-letter ISO 639 code.
556 // (special cases for 'sh' handled separately)
557 // First column must be all lowercase; must be sorted according to how strcmp compares the strings in the first column.
558 //
559 // non-canonical canonical [ comment ] # source/reason for non-canonical string
560 // prefix prefix
561 // ------------- ---------
562
563 { "afr", "af" }, // Afrikaans
564 { "alb", "sq" }, // Albanian
565 { "amh", "am" }, // Amharic
566 { "ara", "ar" }, // Arabic
567 { "arm", "hy" }, // Armenian
568 { "asm", "as" }, // Assamese
569 { "aym", "ay" }, // Aymara
570 { "aze", "az" }, // Azerbaijani
571 { "baq", "eu" }, // Basque
572 { "bel", "be" }, // Belarusian
573 { "ben", "bn" }, // Bengali
574 { "bih", "bh" }, // Bihari
575 { "bod", "bo" }, // Tibetan
576 { "bos", "bs" }, // Bosnian
577 { "bre", "br" }, // Breton
578 { "bul", "bg" }, // Bulgarian
579 { "bur", "my" }, // Burmese
580 { "cat", "ca" }, // Catalan
581 { "ces", "cs" }, // Czech
582 { "che", "ce" }, // Chechen
583 { "chi", "zh" }, // Chinese
584 { "cor", "kw" }, // Cornish
585 { "cos", "co" }, // Corsican
586 { "cym", "cy" }, // Welsh
587 { "cze", "cs" }, // Czech
588 { "dan", "da" }, // Danish
589 { "deu", "de" }, // German
590 { "dut", "nl" }, // Dutch
591 { "dzo", "dz" }, // Dzongkha
592 { "ell", "el" }, // Greek, Modern (1453-)
593 { "eng", "en" }, // English
594 { "epo", "eo" }, // Esperanto
595 { "est", "et" }, // Estonian
596 { "eus", "eu" }, // Basque
597 { "fao", "fo" }, // Faroese
598 { "fas", "fa" }, // Persian
599 { "fin", "fi" }, // Finnish
600 { "fra", "fr" }, // French
601 { "fre", "fr" }, // French
602 { "geo", "ka" }, // Georgian
603 { "ger", "de" }, // German
604 { "gla", "gd" }, // Gaelic,Scottish
605 { "gle", "ga" }, // Irish
606 { "glg", "gl" }, // Gallegan
607 { "glv", "gv" }, // Manx
608 { "gre", "el" }, // Greek, Modern (1453-)
609 { "grn", "gn" }, // Guarani
610 { "guj", "gu" }, // Gujarati
611 { "heb", "he" }, // Hebrew
612 { "hin", "hi" }, // Hindi
613 { "hrv", "hr" }, // Croatian
614 { "hun", "hu" }, // Hungarian
615 { "hye", "hy" }, // Armenian
616 { "i-hak", "zh-hakka" }, // Hakka # deprecated RFC 3066
617 { "i-lux", "lb" }, // Luxembourgish # deprecated RFC 3066
618 { "i-navajo", "nv" }, // Navajo # deprecated RFC 3066
619 { "ice", "is" }, // Icelandic
620 { "iku", "iu" }, // Inuktitut
621 { "ile", "ie" }, // Interlingue
622 { "in", "id" }, // Indonesian # deprecated 639 code in -> id (1989)
623 { "ina", "ia" }, // Interlingua
624 { "ind", "id" }, // Indonesian
625 { "isl", "is" }, // Icelandic
626 { "ita", "it" }, // Italian
627 { "iw", "he" }, // Hebrew # deprecated 639 code iw -> he (1989)
628 { "jav", "jv" }, // Javanese
629 { "jaw", "jv" }, // Javanese # deprecated 639 code jaw -> jv (2001)
630 { "ji", "yi" }, // Yiddish # deprecated 639 code ji -> yi (1989)
631 { "jpn", "ja" }, // Japanese
632 { "kal", "kl" }, // Kalaallisut
633 { "kan", "kn" }, // Kannada
634 { "kas", "ks" }, // Kashmiri
635 { "kat", "ka" }, // Georgian
636 { "kaz", "kk" }, // Kazakh
637 { "khm", "km" }, // Khmer
638 { "kin", "rw" }, // Kinyarwanda
639 { "kir", "ky" }, // Kirghiz
640 { "kor", "ko" }, // Korean
641 { "kur", "ku" }, // Kurdish
642 { "lao", "lo" }, // Lao
643 { "lat", "la" }, // Latin
644 { "lav", "lv" }, // Latvian
645 { "lit", "lt" }, // Lithuanian
646 { "ltz", "lb" }, // Letzeburgesch
647 { "mac", "mk" }, // Macedonian
648 { "mal", "ml" }, // Malayalam
649 { "mar", "mr" }, // Marathi
650 { "may", "ms" }, // Malay
651 { "mkd", "mk" }, // Macedonian
652 { "mlg", "mg" }, // Malagasy
653 { "mlt", "mt" }, // Maltese
654 { "mol", "mo" }, // Moldavian
655 { "mon", "mn" }, // Mongolian
656 { "msa", "ms" }, // Malay
657 { "mya", "my" }, // Burmese
658 { "nep", "ne" }, // Nepali
659 { "nld", "nl" }, // Dutch
660 { "nno", "nn" }, // Norwegian Nynorsk
661 { "no", "nb" }, // Norwegian generic # ambiguous 639 code no -> nb
662 { "no-bok", "nb" }, // Norwegian Bokmal # deprecated RFC 3066 tag - used in old LocaleRefGetPartString
663 { "no-nyn", "nn" }, // Norwegian Nynorsk # deprecated RFC 3066 tag - used in old LocaleRefGetPartString
664 { "nob", "nb" }, // Norwegian Bokmal
665 { "nor", "nb" }, // Norwegian generic # ambiguous 639 code nor -> nb
666 { "nya", "ny" }, // Nyanja/Chewa/Chichewa # 3-letter code used in old LocaleRefGetPartString
667 { "oci", "oc" }, // Occitan/Provencal
668 { "ori", "or" }, // Oriya
669 { "orm", "om" }, // Oromo,Galla
670 { "pan", "pa" }, // Panjabi
671 { "per", "fa" }, // Persian
672 { "pol", "pl" }, // Polish
673 { "por", "pt" }, // Portuguese
674 { "pus", "ps" }, // Pushto
675 { "que", "qu" }, // Quechua
676 { "roh", "rm" }, // Raeto-Romance
677 { "ron", "ro" }, // Romanian
678 { "rum", "ro" }, // Romanian
679 { "run", "rn" }, // Rundi
680 { "rus", "ru" }, // Russian
681 { "san", "sa" }, // Sanskrit
682 { "scc", "sr" }, // Serbian
683 { "scr", "hr" }, // Croatian
684 { "sin", "si" }, // Sinhalese
685 { "slk", "sk" }, // Slovak
686 { "slo", "sk" }, // Slovak
687 { "slv", "sl" }, // Slovenian
688 { "sme", "se" }, // Sami,Northern
689 { "snd", "sd" }, // Sindhi
690 { "som", "so" }, // Somali
691 { "spa", "es" }, // Spanish
692 { "sqi", "sq" }, // Albanian
693 { "srp", "sr" }, // Serbian
694 { "sun", "su" }, // Sundanese
695 { "swa", "sw" }, // Swahili
696 { "swe", "sv" }, // Swedish
697 { "tam", "ta" }, // Tamil
698 { "tat", "tt" }, // Tatar
699 { "tel", "te" }, // Telugu
700 { "tgk", "tg" }, // Tajik
701 { "tgl", "tl" }, // Tagalog
702 { "tha", "th" }, // Thai
703 { "tib", "bo" }, // Tibetan
704 { "tir", "ti" }, // Tigrinya
705 { "ton", "to" }, // Tongan
706 { "tuk", "tk" }, // Turkmen
707 { "tur", "tr" }, // Turkish
708 { "uig", "ug" }, // Uighur
709 { "ukr", "uk" }, // Ukrainian
710 { "urd", "ur" }, // Urdu
711 { "uzb", "uz" }, // Uzbek
712 { "vie", "vi" }, // Vietnamese
713 { "wel", "cy" }, // Welsh
714 { "yid", "yi" }, // Yiddish
715 { "zho", "zh" }, // Chinese
716 };
717 enum {
718 kNumLocaleStringPrefixToCanonical = sizeof(localeStringPrefixToCanonical)/sizeof(KeyStringToResultString)
719 };
720
721
722 static const SpecialCaseUpdates specialCases[] = {
723 // Data for special cases
724 // a) The 3166 code CS was used for Czechoslovakia until 1993, when that country split and the code was
725 // replaced by CZ and SK. Then in 2003-07, the code YU (formerly designating all of Yugoslavia, then after
726 // the 1990s breakup just designating what is now Serbia and Montenegro) was changed to CS! Then after
727 // Serbia and Montenegro split, the code CS was replaced in 2006-09 with separate codes RS and ME. If we
728 // see CS but a language of cs or sk, we change CS to CZ or SK. Otherwise, we change CS (and old YU) to RS.
729 // b) The 639 code sh for Serbo-Croatian was also replaced in the 1990s by separate codes hr and sr, and
730 // deprecated in 2000. We guess which one to map it to as follows: If there is a region tag of HR we use
731 // hr; if there is a region tag of (now) RS we use sr; else we do not change it (not enough info).
732 // c) There are other codes that have been updated without these issues (eg. TP to TL), plus among the
733 // "exceptionally reserved" codes some are just alternates for standard codes (eg. UK for GB).
734 { NULL, "-UK", "GB", NULL, NULL }, // always change UK to GB (UK is "exceptionally reserved" to mean GB)
735 { NULL, "-TP", "TL", NULL, NULL }, // always change TP to TL (East Timor, code changed 2002-05)
736 { "cs", "-CS", "CZ", NULL, NULL }, // if language is cs, change CS (pre-1993 Czechoslovakia) to CZ (Czech Republic)
737 { "sk", "-CS", "SK", NULL, NULL }, // if language is sk, change CS (pre-1993 Czechoslovakia) to SK (Slovakia)
738 { NULL, "-CS", "RS", NULL, NULL }, // otherwise map CS (assume Serbia+Montenegro) to RS (Serbia)
739 { NULL, "-YU", "RS", NULL, NULL }, // also map old YU (assume Serbia+Montenegro) to RS (Serbia)
740 { "sh", "-HR", "hr", "-RS", "sr" }, // then if language is old 'sh' (SerboCroatian), change it to 'hr' (Croatian)
741 // if we find HR (Croatia) or to 'sr' (Serbian) if we find RS (Serbia).
742 // Note: Do this after changing YU/CS to RS as above.
743 { NULL, NULL, NULL, NULL, NULL } // terminator
744 };
745
746
747 static const KeyStringToResultString localeStringRegionToDefaults[] = {
748 // For some region-code suffixes, there are default substrings that are stripped off to form the canonical string.
749 // Must be sorted according to how strcmp compares the strings in the first column
750 //
751 // region default writing
752 // suffix system tags, strip comment
753 // -------- ------------- ---------
754 { "_CN", "-Hans" }, // mainland China, default is simplified
755 { "_HK", "-Hant" }, // Hong Kong, default is traditional
756 { "_MO", "-Hant" }, // Macao, default is traditional
757 { "_SG", "-Hans" }, // Singapore, default is simplified
758 { "_TW", "-Hant" }, // Taiwan, default is traditional
759 };
760 enum {
761 kNumLocaleStringRegionToDefaults = sizeof(localeStringRegionToDefaults)/sizeof(KeyStringToResultString)
762 };
763
764 static const KeyStringToResultString localeStringPrefixToDefaults[] = {
765 // For some initial portions of a language tag, there are default substrings that are stripped off to form the canonical string.
766 // Must be sorted according to how strcmp compares the strings in the first column
767 //
768 // language default writing
769 // tag prefix system tags, strip comment
770 // -------- ------------- ---------
771 { "ab-", "-Cyrl" }, // Abkhazian
772 { "af-", "-Latn" }, // Afrikaans
773 { "am-", "-Ethi" }, // Amharic
774 { "ar-", "-Arab" }, // Arabic
775 { "as-", "-Beng" }, // Assamese
776 { "ay-", "-Latn" }, // Aymara
777 { "be-", "-Cyrl" }, // Belarusian
778 { "bg-", "-Cyrl" }, // Bulgarian
779 { "bn-", "-Beng" }, // Bengali
780 { "bo-", "-Tibt" }, // Tibetan (? not Suppress-Script)
781 { "br-", "-Latn" }, // Breton (? not Suppress-Script)
782 { "bs-", "-Latn" }, // Bosnian
783 { "ca-", "-Latn" }, // Catalan
784 { "cs-", "-Latn" }, // Czech
785 { "cy-", "-Latn" }, // Welsh
786 { "da-", "-Latn" }, // Danish
787 { "de-", "-Latn -1901" }, // German, traditional orthography
788 { "dv-", "-Thaa" }, // Divehi/Maldivian
789 { "dz-", "-Tibt" }, // Dzongkha
790 { "el-", "-Grek" }, // Greek (modern, monotonic)
791 { "en-", "-Latn" }, // English
792 { "eo-", "-Latn" }, // Esperanto
793 { "es-", "-Latn" }, // Spanish
794 { "et-", "-Latn" }, // Estonian
795 { "eu-", "-Latn" }, // Basque
796 { "fa-", "-Arab" }, // Farsi
797 { "fi-", "-Latn" }, // Finnish
798 { "fo-", "-Latn" }, // Faroese
799 { "fr-", "-Latn" }, // French
800 { "ga-", "-Latn" }, // Irish
801 { "gd-", "-Latn" }, // Scottish Gaelic (? not Suppress-Script)
802 { "gl-", "-Latn" }, // Galician
803 { "gn-", "-Latn" }, // Guarani
804 { "gu-", "-Gujr" }, // Gujarati
805 { "gv-", "-Latn" }, // Manx
806 { "haw-", "-Latn" }, // Hawaiian (? not Suppress-Script)
807 { "he-", "-Hebr" }, // Hebrew
808 { "hi-", "-Deva" }, // Hindi
809 { "hr-", "-Latn" }, // Croatian
810 { "hu-", "-Latn" }, // Hungarian
811 { "hy-", "-Armn" }, // Armenian
812 { "id-", "-Latn" }, // Indonesian
813 { "is-", "-Latn" }, // Icelandic
814 { "it-", "-Latn" }, // Italian
815 { "ja-", "-Jpan" }, // Japanese
816 { "ka-", "-Geor" }, // Georgian
817 { "kk-", "-Cyrl" }, // Kazakh
818 { "kl-", "-Latn" }, // Kalaallisut/Greenlandic
819 { "km-", "-Khmr" }, // Central Khmer
820 { "kn-", "-Knda" }, // Kannada
821 { "ko-", "-Hang" }, // Korean (? not Suppress-Script)
822 { "kok-", "-Deva" }, // Konkani
823 { "la-", "-Latn" }, // Latin
824 { "lb-", "-Latn" }, // Luxembourgish
825 { "lo-", "-Laoo" }, // Lao
826 { "lt-", "-Latn" }, // Lithuanian
827 { "lv-", "-Latn" }, // Latvian
828 { "mg-", "-Latn" }, // Malagasy
829 { "mk-", "-Cyrl" }, // Macedonian
830 { "ml-", "-Mlym" }, // Malayalam
831 { "mo-", "-Latn" }, // Moldavian
832 { "mr-", "-Deva" }, // Marathi
833 { "ms-", "-Latn" }, // Malay
834 { "mt-", "-Latn" }, // Maltese
835 { "my-", "-Mymr" }, // Burmese/Myanmar
836 { "nb-", "-Latn" }, // Norwegian Bokmal
837 { "ne-", "-Deva" }, // Nepali
838 { "nl-", "-Latn" }, // Dutch
839 { "nn-", "-Latn" }, // Norwegian Nynorsk
840 { "ny-", "-Latn" }, // Chichewa/Nyanja
841 { "om-", "-Latn" }, // Oromo
842 { "or-", "-Orya" }, // Oriya
843 { "pa-", "-Guru" }, // Punjabi
844 { "pl-", "-Latn" }, // Polish
845 { "ps-", "-Arab" }, // Pushto
846 { "pt-", "-Latn" }, // Portuguese
847 { "qu-", "-Latn" }, // Quechua
848 { "rn-", "-Latn" }, // Rundi
849 { "ro-", "-Latn" }, // Romanian
850 { "ru-", "-Cyrl" }, // Russian
851 { "rw-", "-Latn" }, // Kinyarwanda
852 { "sa-", "-Deva" }, // Sanskrit (? not Suppress-Script)
853 { "se-", "-Latn" }, // Sami (? not Suppress-Script)
854 { "si-", "-Sinh" }, // Sinhala
855 { "sk-", "-Latn" }, // Slovak
856 { "sl-", "-Latn" }, // Slovenian
857 { "so-", "-Latn" }, // Somali
858 { "sq-", "-Latn" }, // Albanian
859 { "sv-", "-Latn" }, // Swedish
860 { "sw-", "-Latn" }, // Swahili
861 { "ta-", "-Taml" }, // Tamil
862 { "te-", "-Telu" }, // Telugu
863 { "th-", "-Thai" }, // Thai
864 { "ti-", "-Ethi" }, // Tigrinya
865 { "tl-", "-Latn" }, // Tagalog
866 { "tn-", "-Latn" }, // Tswana
867 { "to-", "-Latn" }, // Tonga of Tonga Islands
868 { "tr-", "-Latn" }, // Turkish
869 { "uk-", "-Cyrl" }, // Ukrainian
870 { "ur-", "-Arab" }, // Urdu
871 { "vi-", "-Latn" }, // Vietnamese
872 { "wo-", "-Latn" }, // Wolof
873 { "xh-", "-Latn" }, // Xhosa
874 { "yi-", "-Hebr" }, // Yiddish
875 { "zh-", "-Hani" }, // Chinese (? not Suppress-Script)
876 { "zu-", "-Latn" }, // Zulu
877 };
878 enum {
879 kNumLocaleStringPrefixToDefaults = sizeof(localeStringPrefixToDefaults)/sizeof(KeyStringToResultString)
880 };
881
882 static const KeyStringToResultString appleLocaleToLanguageString[] = {
883 // Map locale strings that Apple uses as language IDs to real language strings.
884 // Must be sorted according to how strcmp compares the strings in the first column.
885 // Note: Now we remove all transforms of the form ll_RR -> ll-RR, they are now
886 // handled in the code. <1.19>
887 //
888 // locale lang [ comment ]
889 // string string
890 // ------- -------
891 { "en_US_POSIX", "en-US-POSIX" }, // POSIX locale, need as language string // <1.17> [3840752]
892 { "zh_CN", "zh-Hans" }, // mainland China => simplified
893 { "zh_HK", "zh-Hant" }, // Hong Kong => traditional, not currently used
894 { "zh_MO", "zh-Hant" }, // Macao => traditional, not currently used
895 { "zh_SG", "zh-Hans" }, // Singapore => simplified, not currently used
896 { "zh_TW", "zh-Hant" }, // Taiwan => traditional
897 };
898 enum {
899 kNumAppleLocaleToLanguageString = sizeof(appleLocaleToLanguageString)/sizeof(KeyStringToResultString)
900 };
901
902 static const KeyStringToResultString appleLocaleToLanguageStringForCFBundle[] = {
903 // Map locale strings that Apple uses as language IDs to real language strings.
904 // Must be sorted according to how strcmp compares the strings in the first column.
905 //
906 // locale lang [ comment ]
907 // string string
908 // ------- -------
909 { "de_AT", "de-AT" }, // Austrian German
910 { "de_CH", "de-CH" }, // Swiss German
911 // { "de_DE", "de-DE" }, // German for Germany (default), not currently used
912 { "en_AU", "en-AU" }, // Australian English
913 { "en_CA", "en-CA" }, // Canadian English
914 { "en_GB", "en-GB" }, // British English
915 // { "en_IE", "en-IE" }, // Irish English, not currently used
916 { "en_US", "en-US" }, // U.S. English
917 { "en_US_POSIX", "en-US-POSIX" }, // POSIX locale, need as language string // <1.17> [3840752]
918 // { "fr_BE", "fr-BE" }, // Belgian French, not currently used
919 { "fr_CA", "fr-CA" }, // Canadian French
920 { "fr_CH", "fr-CH" }, // Swiss French
921 // { "fr_FR", "fr-FR" }, // French for France (default), not currently used
922 { "nl_BE", "nl-BE" }, // Flemish = Vlaams, Dutch for Belgium
923 // { "nl_NL", "nl-NL" }, // Dutch for Netherlands (default), not currently used
924 { "pt_BR", "pt-BR" }, // Brazilian Portuguese
925 { "pt_PT", "pt-PT" }, // Portuguese for Portugal
926 { "zh_CN", "zh-Hans" }, // mainland China => simplified
927 { "zh_HK", "zh-Hant" }, // Hong Kong => traditional, not currently used
928 { "zh_MO", "zh-Hant" }, // Macao => traditional, not currently used
929 { "zh_SG", "zh-Hans" }, // Singapore => simplified, not currently used
930 { "zh_TW", "zh-Hant" }, // Taiwan => traditional
931 };
932 enum {
933 kNumAppleLocaleToLanguageStringForCFBundle = sizeof(appleLocaleToLanguageStringForCFBundle)/sizeof(KeyStringToResultString)
934 };
935
936
937 struct LocaleToLegacyCodes {
938 const char * locale; // reduced to language plus one other component (script, region, variant), separators normalized to'_'
939 RegionCode regCode;
940 LangCode langCode;
941 CFStringEncoding encoding;
942 };
943 typedef struct LocaleToLegacyCodes LocaleToLegacyCodes;
944
945 static const LocaleToLegacyCodes localeToLegacyCodes[] = {
946 // locale RegionCode LangCode CFStringEncoding
947 { "af"/*ZA*/, 102/*verAfrikaans*/, 141/*langAfrikaans*/, 0/*Roman*/ }, // Latn
948 { "am", -1, 85/*langAmharic*/, 28/*Ethiopic*/ }, // Ethi
949 { "ar", 16/*verArabic*/, 12/*langArabic*/, 4/*Arabic*/ }, // Arab;
950 { "as", -1, 68/*langAssamese*/, 13/*Bengali*/ }, // Beng;
951 { "ay", -1, 134/*langAymara*/, 0/*Roman*/ }, // Latn;
952 { "az", -1, 49/*langAzerbaijani*/, 7/*Cyrillic*/ }, // assume "az" defaults to -Cyrl
953 { "az_Arab", -1, 50/*langAzerbaijanAr*/, 4/*Arabic*/ }, // Arab;
954 { "az_Cyrl", -1, 49/*langAzerbaijani*/, 7/*Cyrillic*/ }, // Cyrl;
955 { "az_Latn", -1, 150/*langAzerbaijanRoman*/, 0/*Roman*/ }, // Latn;
956 { "be"/*BY*/, 61/*verBelarus*/, 46/*langBelorussian*/, 7/*Cyrillic*/ }, // Cyrl;
957 { "bg"/*BG*/, 72/*verBulgaria*/, 44/*langBulgarian*/, 7/*Cyrillic*/ }, // Cyrl;
958 { "bn", 60/*verBengali*/, 67/*langBengali*/, 13/*Bengali*/ }, // Beng;
959 { "bo", 105/*verTibetan*/, 63/*langTibetan*/, 26/*Tibetan*/ }, // Tibt;
960 { "br", 77/*verBreton*/, 142/*langBreton*/, 39/*Celtic*/ }, // Latn;
961 { "ca"/*ES*/, 73/*verCatalonia*/, 130/*langCatalan*/, 0/*Roman*/ }, // Latn;
962 { "cs"/*CZ*/, 56/*verCzech*/, 38/*langCzech*/, 29/*CentralEurRoman*/ }, // Latn;
963 { "cy", 79/*verWelsh*/, 128/*langWelsh*/, 39/*Celtic*/ }, // Latn;
964 { "da"/*DK*/, 9/*verDenmark*/, 7/*langDanish*/, 0/*Roman*/ }, // Latn;
965 { "de", 3/*verGermany*/, 2/*langGerman*/, 0/*Roman*/ }, // assume "de" defaults to verGermany
966 { "de_1996", 70/*verGermanReformed*/, 2/*langGerman*/, 0/*Roman*/ },
967 { "de_AT", 92/*verAustria*/, 2/*langGerman*/, 0/*Roman*/ },
968 { "de_CH", 19/*verGrSwiss*/, 2/*langGerman*/, 0/*Roman*/ },
969 { "de_DE", 3/*verGermany*/, 2/*langGerman*/, 0/*Roman*/ },
970 { "dz"/*BT*/, 83/*verBhutan*/, 137/*langDzongkha*/, 26/*Tibetan*/ }, // Tibt;
971 { "el", 20/*verGreece*/, 14/*langGreek*/, 6/*Greek*/ }, // assume "el" defaults to verGreece
972 { "el_CY", 23/*verCyprus*/, 14/*langGreek*/, 6/*Greek*/ },
973 { "el_GR", 20/*verGreece*/, 14/*langGreek*/, 6/*Greek*/ }, // modern monotonic
974 { "en", 0/*verUS*/, 0/*langEnglish*/, 0/*Roman*/ }, // "en" defaults to verUS (per Chris Hansten)
975 { "en_001", 37/*verInternational*/, 0/*langEnglish*/, 0/*Roman*/ },
976 { "en_AU", 15/*verAustralia*/, 0/*langEnglish*/, 0/*Roman*/ },
977 { "en_CA", 82/*verEngCanada*/, 0/*langEnglish*/, 0/*Roman*/ },
978 { "en_GB", 2/*verBritain*/, 0/*langEnglish*/, 0/*Roman*/ },
979 { "en_IE", 108/*verIrelandEnglish*/, 0/*langEnglish*/, 0/*Roman*/ },
980 { "en_SG", 100/*verSingapore*/, 0/*langEnglish*/, 0/*Roman*/ },
981 { "en_US", 0/*verUS*/, 0/*langEnglish*/, 0/*Roman*/ },
982 { "eo", 103/*verEsperanto*/, 94/*langEsperanto*/, 0/*Roman*/ }, // Latn;
983 { "es", 8/*verSpain*/, 6/*langSpanish*/, 0/*Roman*/ }, // "es" defaults to verSpain (per Chris Hansten)
984 { "es_419", 86/*verSpLatinAmerica*/, 6/*langSpanish*/, 0/*Roman*/ }, // new BCP 47 tag
985 { "es_ES", 8/*verSpain*/, 6/*langSpanish*/, 0/*Roman*/ },
986 { "es_MX", 86/*verSpLatinAmerica*/, 6/*langSpanish*/, 0/*Roman*/ },
987 { "es_US", 86/*verSpLatinAmerica*/, 6/*langSpanish*/, 0/*Roman*/ },
988 { "et"/*EE*/, 44/*verEstonia*/, 27/*langEstonian*/, 29/*CentralEurRoman*/ },
989 { "eu", -1, 129/*langBasque*/, 0/*Roman*/ }, // Latn;
990 { "fa"/*IR*/, 48/*verIran*/, 31/*langFarsi/Persian*/, 0x8C/*Farsi*/ }, // Arab;
991 { "fi"/*FI*/, 17/*verFinland*/, 13/*langFinnish*/, 0/*Roman*/ },
992 { "fo"/*FO*/, 47/*verFaroeIsl*/, 30/*langFaroese*/, 37/*Icelandic*/ },
993 { "fr", 1/*verFrance*/, 1/*langFrench*/, 0/*Roman*/ }, // "fr" defaults to verFrance (per Chris Hansten)
994 { "fr_001", 91/*verFrenchUniversal*/, 1/*langFrench*/, 0/*Roman*/ },
995 { "fr_BE", 98/*verFrBelgium*/, 1/*langFrench*/, 0/*Roman*/ },
996 { "fr_CA", 11/*verFrCanada*/, 1/*langFrench*/, 0/*Roman*/ },
997 { "fr_CH", 18/*verFrSwiss*/, 1/*langFrench*/, 0/*Roman*/ },
998 { "fr_FR", 1/*verFrance*/, 1/*langFrench*/, 0/*Roman*/ },
999 { "ga"/*IE*/, 50/*verIreland*/, 35/*langIrishGaelic*/, 0/*Roman*/ }, // no dots (h after)
1000 { "ga_Latg"/*IE*/, 81/*verIrishGaelicScrip*/, 146/*langIrishGaelicScript*/, 40/*Gaelic*/ }, // using dots
1001 { "gd", 75/*verScottishGaelic*/, 144/*langScottishGaelic*/, 39/*Celtic*/ },
1002 { "gl", -1, 140/*langGalician*/, 0/*Roman*/ }, // Latn;
1003 { "gn", -1, 133/*langGuarani*/, 0/*Roman*/ }, // Latn;
1004 { "grc", 40/*verGreekAncient*/, 148/*langGreekAncient*/, 6/*Greek*/ }, // polytonic (MacGreek doesn't actually support it)
1005 { "gu"/*IN*/, 94/*verGujarati*/, 69/*langGujarati*/, 11/*Gujarati*/ }, // Gujr;
1006 { "gv", 76/*verManxGaelic*/, 145/*langManxGaelic*/, 39/*Celtic*/ }, // Latn;
1007 { "he"/*IL*/, 13/*verIsrael*/, 10/*langHebrew*/, 5/*Hebrew*/ }, // Hebr;
1008 { "hi"/*IN*/, 33/*verIndiaHindi*/, 21/*langHindi*/, 9/*Devanagari*/ }, // Deva;
1009 { "hr"/*HR*/, 68/*verCroatia*/, 18/*langCroatian*/, 36/*Croatian*/ },
1010 { "hu"/*HU*/, 43/*verHungary*/, 26/*langHungarian*/, 29/*CentralEurRoman*/ },
1011 { "hy"/*AM*/, 84/*verArmenian*/, 51/*langArmenian*/, 24/*Armenian*/ }, // Armn;
1012 { "id", -1, 81/*langIndonesian*/, 0/*Roman*/ }, // Latn;
1013 { "is"/*IS*/, 21/*verIceland*/, 15/*langIcelandic*/, 37/*Icelandic*/ },
1014 { "it", 4/*verItaly*/, 3/*langItalian*/, 0/*Roman*/ }, // "it" defaults to verItaly
1015 { "it_CH", 36/*verItalianSwiss*/, 3/*langItalian*/, 0/*Roman*/ },
1016 { "it_IT", 4/*verItaly*/, 3/*langItalian*/, 0/*Roman*/ },
1017 { "iu"/*CA*/, 78/*verNunavut*/, 143/*langInuktitut*/, 0xEC/*Inuit*/ }, // Cans;
1018 { "ja"/*JP*/, 14/*verJapan*/, 11/*langJapanese*/, 1/*Japanese*/ }, // Jpan;
1019 { "jv", -1, 138/*langJavaneseRom*/, 0/*Roman*/ }, // Latn;
1020 { "ka"/*GE*/, 85/*verGeorgian*/, 52/*langGeorgian*/, 23/*Georgian*/ }, // Geor;
1021 { "kk", -1, 48/*langKazakh*/, 7/*Cyrillic*/ }, // "kk" defaults to -Cyrl; also have -Latn, -Arab
1022 { "kl", 107/*verGreenland*/, 149/*langGreenlandic*/, 0/*Roman*/ }, // Latn;
1023 { "km", -1, 78/*langKhmer*/, 20/*Khmer*/ }, // Khmr;
1024 { "kn", -1, 73/*langKannada*/, 16/*Kannada*/ }, // Knda;
1025 { "ko"/*KR*/, 51/*verKorea*/, 23/*langKorean*/, 3/*Korean*/ }, // Hang;
1026 { "ks", -1, 61/*langKashmiri*/, 4/*Arabic*/ }, // Arab;
1027 { "ku", -1, 60/*langKurdish*/, 4/*Arabic*/ }, // Arab;
1028 { "ky", -1, 54/*langKirghiz*/, 7/*Cyrillic*/ }, // Cyrl; also -Latn, -Arab
1029 { "la", -1, 131/*langLatin*/, 0/*Roman*/ }, // Latn;
1030 { "lo", -1, 79/*langLao*/, 22/*Laotian*/ }, // Laoo;
1031 { "lt"/*LT*/, 41/*verLithuania*/, 24/*langLithuanian*/, 29/*CentralEurRoman*/ },
1032 { "lv"/*LV*/, 45/*verLatvia*/, 28/*langLatvian*/, 29/*CentralEurRoman*/ },
1033 { "mg", -1, 93/*langMalagasy*/, 0/*Roman*/ }, // Latn;
1034 { "mk"/*MK*/, 67/*verMacedonian*/, 43/*langMacedonian*/, 7/*Cyrillic*/ }, // Cyrl;
1035 { "ml", -1, 72/*langMalayalam*/, 17/*Malayalam*/ }, // Mlym;
1036 { "mn", -1, 57/*langMongolian*/, 27/*Mongolian*/ }, // "mn" defaults to -Mong
1037 { "mn_Cyrl", -1, 58/*langMongolianCyr*/, 7/*Cyrillic*/ }, // Cyrl;
1038 { "mn_Mong", -1, 57/*langMongolian*/, 27/*Mongolian*/ }, // Mong;
1039 { "mo", -1, 53/*langMoldavian*/, 7/*Cyrillic*/ }, // Cyrl;
1040 { "mr"/*IN*/, 104/*verMarathi*/, 66/*langMarathi*/, 9/*Devanagari*/ }, // Deva;
1041 { "ms", -1, 83/*langMalayRoman*/, 0/*Roman*/ }, // "ms" defaults to -Latn;
1042 { "ms_Arab", -1, 84/*langMalayArabic*/, 4/*Arabic*/ }, // Arab;
1043 { "mt"/*MT*/, 22/*verMalta*/, 16/*langMaltese*/, 0/*Roman*/ }, // Latn;
1044 { "mul", 74/*verMultilingual*/, -1, 0 },
1045 { "my", -1, 77/*langBurmese*/, 19/*Burmese*/ }, // Mymr;
1046 { "nb"/*NO*/, 12/*verNorway*/, 9/*langNorwegian*/, 0/*Roman*/ },
1047 { "ne"/*NP*/, 106/*verNepal*/, 64/*langNepali*/, 9/*Devanagari*/ }, // Deva;
1048 { "nl", 5/*verNetherlands*/, 4/*langDutch*/, 0/*Roman*/ }, // "nl" defaults to verNetherlands
1049 { "nl_BE", 6/*verFlemish*/, 34/*langFlemish*/, 0/*Roman*/ },
1050 { "nl_NL", 5/*verNetherlands*/, 4/*langDutch*/, 0/*Roman*/ },
1051 { "nn"/*NO*/, 101/*verNynorsk*/, 151/*langNynorsk*/, 0/*Roman*/ },
1052 { "ny", -1, 92/*langNyanja/Chewa*/, 0/*Roman*/ }, // Latn;
1053 { "om", -1, 87/*langOromo*/, 28/*Ethiopic*/ }, // Ethi;
1054 { "or", -1, 71/*langOriya*/, 12/*Oriya*/ }, // Orya;
1055 { "pa", 95/*verPunjabi*/, 70/*langPunjabi*/, 10/*Gurmukhi*/ }, // Guru;
1056 { "pl"/*PL*/, 42/*verPoland*/, 25/*langPolish*/, 29/*CentralEurRoman*/ },
1057 { "ps", -1, 59/*langPashto*/, 0x8C/*Farsi*/ }, // Arab;
1058 { "pt", 71/*verBrazil*/, 8/*langPortuguese*/, 0/*Roman*/ }, // "pt" defaults to verBrazil (per Chris Hansten)
1059 { "pt_BR", 71/*verBrazil*/, 8/*langPortuguese*/, 0/*Roman*/ },
1060 { "pt_PT", 10/*verPortugal*/, 8/*langPortuguese*/, 0/*Roman*/ },
1061 { "qu", -1, 132/*langQuechua*/, 0/*Roman*/ }, // Latn;
1062 { "rn", -1, 91/*langRundi*/, 0/*Roman*/ }, // Latn;
1063 { "ro"/*RO*/, 39/*verRomania*/, 37/*langRomanian*/, 38/*Romanian*/ },
1064 { "ru"/*RU*/, 49/*verRussia*/, 32/*langRussian*/, 7/*Cyrillic*/ }, // Cyrl;
1065 { "rw", -1, 90/*langKinyarwanda*/, 0/*Roman*/ }, // Latn;
1066 { "sa", -1, 65/*langSanskrit*/, 9/*Devanagari*/ }, // Deva;
1067 { "sd", -1, 62/*langSindhi*/, 0x8C/*Farsi*/ }, // Arab;
1068 { "se", 46/*verSami*/, 29/*langSami*/, 0/*Roman*/ },
1069 { "si", -1, 76/*langSinhalese*/, 18/*Sinhalese*/ }, // Sinh;
1070 { "sk"/*SK*/, 57/*verSlovak*/, 39/*langSlovak*/, 29/*CentralEurRoman*/ },
1071 { "sl"/*SI*/, 66/*verSlovenian*/, 40/*langSlovenian*/, 36/*Croatian*/ },
1072 { "so", -1, 88/*langSomali*/, 0/*Roman*/ }, // Latn;
1073 { "sq", -1, 36/*langAlbanian*/, 0/*Roman*/ },
1074 { "sr"/*CS,RS*/, 65/*verSerbian*/, 42/*langSerbian*/, 7/*Cyrillic*/ }, // Cyrl;
1075 { "su", -1, 139/*langSundaneseRom*/, 0/*Roman*/ }, // Latn;
1076 { "sv"/*SE*/, 7/*verSweden*/, 5/*langSwedish*/, 0/*Roman*/ },
1077 { "sw", -1, 89/*langSwahili*/, 0/*Roman*/ }, // Latn;
1078 { "ta", -1, 74/*langTamil*/, 14/*Tamil*/ }, // Taml;
1079 { "te", -1, 75/*langTelugu*/, 15/*Telugu*/ }, // Telu
1080 { "tg", -1, 55/*langTajiki*/, 7/*Cyrillic*/ }, // "tg" defaults to "Cyrl"
1081 { "tg_Cyrl", -1, 55/*langTajiki*/, 7/*Cyrillic*/ }, // Cyrl; also -Latn, -Arab
1082 { "th"/*TH*/, 54/*verThailand*/, 22/*langThai*/, 21/*Thai*/ }, // Thai;
1083 { "ti", -1, 86/*langTigrinya*/, 28/*Ethiopic*/ }, // Ethi;
1084 { "tk", -1, 56/*langTurkmen*/, 7/*Cyrillic*/ }, // "tk" defaults to Cyrl
1085 { "tk_Cyrl", -1, 56/*langTurkmen*/, 7/*Cyrillic*/ }, // Cyrl; also -Latn, -Arab
1086 { "tl", -1, 82/*langTagalog*/, 0/*Roman*/ }, // Latn;
1087 { "to"/*TO*/, 88/*verTonga*/, 147/*langTongan*/, 0/*Roman*/ }, // Latn;
1088 { "tr"/*TR*/, 24/*verTurkey*/, 17/*langTurkish*/, 35/*Turkish*/ }, // Latn;
1089 { "tt", -1, 135/*langTatar*/, 7/*Cyrillic*/ }, // Cyrl;
1090 { "tt_Cyrl", -1, 135/*langTatar*/, 7/*Cyrillic*/ }, // Cyrl;
1091 { "ug", -1, 136/*langUighur*/, 4/*Arabic*/ }, // Arab;
1092 { "uk"/*UA*/, 62/*verUkraine*/, 45/*langUkrainian*/, 7/*Cyrillic*/ }, // Cyrl;
1093 { "und", 55/*verScriptGeneric*/, -1, 0 },
1094 { "ur", 34/*verPakistanUrdu*/, 20/*langUrdu*/, 0x8C/*Farsi*/ }, // "ur" defaults to verPakistanUrdu
1095 { "ur_IN", 96/*verIndiaUrdu*/, 20/*langUrdu*/, 0x8C/*Farsi*/ }, // Arab
1096 { "ur_PK", 34/*verPakistanUrdu*/, 20/*langUrdu*/, 0x8C/*Farsi*/ }, // Arab
1097 { "uz"/*UZ*/, 99/*verUzbek*/, 47/*langUzbek*/, 7/*Cyrillic*/ }, // Cyrl; also -Latn, -Arab
1098 { "uz_Cyrl", 99/*verUzbek*/, 47/*langUzbek*/, 7/*Cyrillic*/ },
1099 { "vi"/*VN*/, 97/*verVietnam*/, 80/*langVietnamese*/, 30/*Vietnamese*/ }, // Latn
1100 { "yi", -1, 41/*langYiddish*/, 5/*Hebrew*/ }, // Hebr;
1101 { "zh", 52/*verChina*/, 33/*langSimpChinese*/, 25/*ChineseSimp*/ }, // "zh" defaults to verChina, langSimpChinese
1102 { "zh_CN", 52/*verChina*/, 33/*langSimpChinese*/, 25/*ChineseSimp*/ },
1103 { "zh_HK", 53/*verTaiwan*/, 19/*langTradChinese*/, 2/*ChineseTrad*/ },
1104 { "zh_Hans", 52/*verChina*/, 33/*langSimpChinese*/, 25/*ChineseSimp*/ },
1105 { "zh_Hant", 53/*verTaiwan*/, 19/*langTradChinese*/, 2/*ChineseTrad*/ },
1106 { "zh_MO", 53/*verTaiwan*/, 19/*langTradChinese*/, 2/*ChineseTrad*/ },
1107 { "zh_SG", 52/*verChina*/, 33/*langSimpChinese*/, 25/*ChineseSimp*/ },
1108 { "zh_TW", 53/*verTaiwan*/, 19/*langTradChinese*/, 2/*ChineseTrad*/ },
1109 };
1110 enum {
1111 kNumLocaleToLegacyCodes = sizeof(localeToLegacyCodes)/sizeof(localeToLegacyCodes[0])
1112 };
1113
1114 /*
1115 For reference here is a list of ICU locales with variants and how some
1116 of them are canonicalized with the ICU function uloc_canonicalize:
1117
1118 ICU 3.0 has:
1119 en_US_POSIX x no change
1120 hy_AM_REVISED x no change
1121 ja_JP_TRADITIONAL -> ja_JP@calendar=japanese
1122 th_TH_TRADITIONAL -> th_TH@calendar=buddhist
1123
1124 ICU 2.8 also had the following (now obsolete):
1125 ca_ES_PREEURO
1126 de__PHONEBOOK -> de@collation=phonebook
1127 de_AT_PREEURO
1128 de_DE_PREEURO
1129 de_LU_PREEURO
1130 el_GR_PREEURO
1131 en_BE_PREEURO
1132 en_GB_EURO -> en_GB@currency=EUR
1133 en_IE_PREEURO -> en_IE@currency=IEP
1134 es__TRADITIONAL -> es@collation=traditional
1135 es_ES_PREEURO
1136 eu_ES_PREEURO
1137 fi_FI_PREEURO
1138 fr_BE_PREEURO
1139 fr_FR_PREEURO -> fr_FR@currency=FRF
1140 fr_LU_PREEURO
1141 ga_IE_PREEURO
1142 gl_ES_PREEURO
1143 hi__DIRECT -> hi@collation=direct
1144 it_IT_PREEURO
1145 nl_BE_PREEURO
1146 nl_NL_PREEURO
1147 pt_PT_PREEURO
1148 zh__PINYIN -> zh@collation=pinyin
1149 zh_TW_STROKE -> zh_TW@collation=stroke
1150
1151 */
1152
1153 // _CompareTestEntryToTableEntryKey
1154 // (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
1155 // comparison function for bsearch
1156 static int _CompareTestEntryToTableEntryKey(const void *testEntryPtr, const void *tableEntryKeyPtr) {
1157 return strcmp( ((const KeyStringToResultString *)testEntryPtr)->key, ((const KeyStringToResultString *)tableEntryKeyPtr)->key );
1158 }
1159
1160 // _CompareTestEntryPrefixToTableEntryKey
1161 // (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
1162 // Comparison function for bsearch. Assumes prefix IS terminated with '-' or '_'.
1163 // Do the following instead of strlen & strncmp so we don't walk tableEntry key twice.
1164 static int _CompareTestEntryPrefixToTableEntryKey(const void *testEntryPtr, const void *tableEntryKeyPtr) {
1165 const char * testPtr = ((const KeyStringToResultString *)testEntryPtr)->key;
1166 const char * tablePtr = ((const KeyStringToResultString *)tableEntryKeyPtr)->key;
1167
1168 while ( *testPtr == *tablePtr && *tablePtr != 0 ) {
1169 testPtr++; tablePtr++;
1170 }
1171 if ( *tablePtr != 0 ) {
1172 // strings are different, and the string in the table has not run out;
1173 // i.e. the table entry is not a prefix of the text string.
1174 return ( *testPtr < *tablePtr )? -1: 1;
1175 }
1176 return 0;
1177 }
1178
1179 // _CompareLowerTestEntryPrefixToTableEntryKey
1180 // (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
1181 // Comparison function for bsearch. Assumes prefix NOT terminated with '-' or '_'.
1182 // Lowercases the test string before comparison (the table should already have lowercased entries).
1183 static int _CompareLowerTestEntryPrefixToTableEntryKey(const void *testEntryPtr, const void *tableEntryKeyPtr) {
1184 const char * testPtr = ((const KeyStringToResultString *)testEntryPtr)->key;
1185 const char * tablePtr = ((const KeyStringToResultString *)tableEntryKeyPtr)->key;
1186 char lowerTestChar;
1187
1188 while ( (lowerTestChar = tolower(*testPtr)) == *tablePtr && *tablePtr != 0 && lowerTestChar != '_' ) { // <1.9>
1189 testPtr++; tablePtr++;
1190 }
1191 if ( *tablePtr != 0 ) {
1192 // strings are different, and the string in the table has not run out;
1193 // i.e. the table entry is not a prefix of the text string.
1194 if (lowerTestChar == '_') // <1.9>
1195 return -1; // <1.9>
1196 return ( lowerTestChar < *tablePtr )? -1: 1;
1197 }
1198 // The string in the table has run out. If the test string char is not alnum,
1199 // then the string matches, else the test string sorts after.
1200 return ( !isalnum(lowerTestChar) )? 0: 1;
1201 }
1202
// _DeleteCharsAtPointer
// (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
// Delete the first _length_ characters of the string at _stringPtr_ by sliding
// the rest of the string, terminator included, down over them.
// (the caller guarantees the string holds at least _length_ characters)
static void _DeleteCharsAtPointer(char *stringPtr, int length) {
    char * dst = stringPtr;

    for (;;) {
        char moved = dst[length];

        *dst++ = moved;
        if (moved == 0) {
            break;      // terminator has been copied, string is complete
        }
    }
}
1212
// _CopyReplacementAtPointer
// (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
// Overwrite the characters at _stringPtr_ with the contents of _replacementPtr_.
// The replacement's terminating NUL byte is *not* copied, so whatever follows
// the overwritten span in the destination is left as it was.
static void _CopyReplacementAtPointer(char *stringPtr, const char *replacementPtr) {
    char * dst = stringPtr;
    const char * src;

    for (src = replacementPtr; *src != 0; src++) {
        *dst++ = *src;
    }
}
1221
1222 // _CheckForTag
1223 // (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
1224 static Boolean _CheckForTag(const char *localeStringPtr, const char *tagPtr, int tagLen) {
1225 return ( strncmp(localeStringPtr, tagPtr, tagLen) == 0 && !isalnum(localeStringPtr[tagLen]) );
1226 }
1227
// _ReplacePrefix
// Replace the first oldPrefixLen characters of the NUL-terminated string held in
// locString with newPrefix, editing the buffer in place. The remainder of the
// string (terminator included) is shifted left or right to close or open the gap.
//   locString        buffer containing the string to edit
//   locStringMaxLen  capacity of locString in chars, terminator included
//   oldPrefixLen     number of leading characters to replace
//   newPrefix        NUL-terminated replacement text (its terminator is not copied)
// The string is left untouched when the longer result would not fit in
// locStringMaxLen chars, or when oldPrefixLen is negative or exceeds the
// current string length.
static void _ReplacePrefix(char locString[], int locStringMaxLen, int oldPrefixLen, const char *newPrefix) {
    size_t stringLen = strlen(locString);
    size_t newPrefixLen = strlen(newPrefix);
    size_t oldLen;

    if (oldPrefixLen < 0 || (size_t)oldPrefixLen > stringLen) {
        // not a prefix of the current contents; nothing sensible to replace
        return;
    }
    oldLen = (size_t)oldPrefixLen;

    if (newPrefixLen > oldLen) {
        // replacement is longer: the grown string plus its terminator must fit
        if (locStringMaxLen <= 0 || stringLen + (newPrefixLen - oldLen) >= (size_t)locStringMaxLen) {
            // no room, can't do substitution
            return;
        }
    }

    if (newPrefixLen != oldLen) {
        // Shift the tail (including the terminator) to its new position. memmove
        // handles the overlap in either direction and never forms a pointer
        // before the start of the buffer, even when oldPrefixLen is 0.
        memmove(locString + newPrefixLen, locString + oldLen, stringLen - oldLen + 1);
    }

    // do the substitution
    memcpy(locString, newPrefix, newPrefixLen);
}
1261
1262 // _UpdateFullLocaleString
1263 // Given a locale string that uses standard codes (not a special old-style Apple string),
1264 // update all the language codes and region codes to latest versions, map 3-letter
1265 // language codes to 2-letter codes if possible, and normalize casing. If requested, return
1266 // pointers to a language-region variant subtag (if present) and a region tag (if present).
1267 // (add locStringMaxLen parameter) // <1.10>
1268 static void _UpdateFullLocaleString(char inLocaleString[], int locStringMaxLen,
1269 char **langRegSubtagRef, char **regionTagRef,
1270 char varKeyValueString[]) // <1.17>
1271 {
1272 KeyStringToResultString testEntry;
1273 KeyStringToResultString * foundEntry;
1274 const SpecialCaseUpdates * specialCasePtr;
1275 char * inLocalePtr;
1276 char * subtagPtr;
1277 char * langRegSubtag = NULL;
1278 char * regionTag = NULL;
1279 char * variantTag = NULL;
1280 Boolean subtagHasDigits, pastPrimarySubtag, hadRegion;
1281
1282 // 1. First replace any non-canonical prefix (case insensitive) with canonical
1283 // (change 3-letter ISO 639 code to 2-letter, update obsolete ISO 639 codes & RFC 3066 tags, etc.)
1284
1285 testEntry.key = inLocaleString;
1286 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, localeStringPrefixToCanonical, kNumLocaleStringPrefixToCanonical,
1287 sizeof(KeyStringToResultString), _CompareLowerTestEntryPrefixToTableEntryKey );
1288 if (foundEntry) {
1289 // replace key (at beginning of string) with result
1290 _ReplacePrefix(inLocaleString, locStringMaxLen, strlen(foundEntry->key), foundEntry->result); // <1.10>
1291 }
1292
1293 // 2. Walk through input string, normalizing case & marking use of ISO 3166 codes
1294
1295 inLocalePtr = inLocaleString;
1296 subtagPtr = inLocaleString;
1297 subtagHasDigits = false;
1298 pastPrimarySubtag = false;
1299 hadRegion = false;
1300
1301 while ( true ) {
1302 if ( isalpha(*inLocalePtr) ) {
1303 // if not past a region tag, then lowercase, else uppercase
1304 *inLocalePtr = (!hadRegion)? tolower(*inLocalePtr): toupper(*inLocalePtr);
1305 } else if ( isdigit(*inLocalePtr) ) {
1306 subtagHasDigits = true;
1307 } else {
1308
1309 if (!pastPrimarySubtag) {
1310 // may have a NULL primary subtag
1311 if (subtagHasDigits) {
1312 break;
1313 }
1314 pastPrimarySubtag = true;
1315 } else if (!hadRegion) {
1316 // We are after any primary language subtag, but not past any region tag.
1317 // This subtag is preceded by '-' or '_'.
1318 int subtagLength = inLocalePtr - subtagPtr; // includes leading '-' or '_'
1319
1320 if (subtagLength == 3 && !subtagHasDigits) {
1321 // potential ISO 3166 code for region or language variant; if so, needs uppercasing
1322 if (*subtagPtr == '_') {
1323 regionTag = subtagPtr;
1324 hadRegion = true;
1325 subtagPtr[1] = toupper(subtagPtr[1]);
1326 subtagPtr[2] = toupper(subtagPtr[2]);
1327 } else if (langRegSubtag == NULL) {
1328 langRegSubtag = subtagPtr;
1329 subtagPtr[1] = toupper(subtagPtr[1]);
1330 subtagPtr[2] = toupper(subtagPtr[2]);
1331 }
1332 } else if (subtagLength == 4 && subtagHasDigits) {
1333 // potential UN M.49 region code
1334 if (*subtagPtr == '_') {
1335 regionTag = subtagPtr;
1336 hadRegion = true;
1337 } else if (langRegSubtag == NULL) {
1338 langRegSubtag = subtagPtr;
1339 }
1340 } else if (subtagLength == 5 && !subtagHasDigits) {
1341 // ISO 15924 script code, uppercase just the first letter
1342 subtagPtr[1] = toupper(subtagPtr[1]);
1343 } else if (subtagLength == 1 && *subtagPtr == '_') { // <1.17>
1344 hadRegion = true;
1345 }
1346
1347 if (!hadRegion) {
1348 // convert improper '_' to '-'
1349 *subtagPtr = '-';
1350 }
1351 } else {
1352 variantTag = subtagPtr; // <1.17>
1353 }
1354
1355 if (*inLocalePtr == '-' || *inLocalePtr == '_') {
1356 subtagPtr = inLocalePtr;
1357 subtagHasDigits = false;
1358 } else {
1359 break;
1360 }
1361 }
1362
1363 inLocalePtr++;
1364 }
1365
1366 // 3 If there is a variant tag, see if ICU canonicalizes it to keywords. // <1.17> [3577669]
1367 // If so, copy the keywords to varKeyValueString and delete the variant tag
1368 // from the original string (but don't otherwise use the ICU canonicalization).
1369 varKeyValueString[0] = 0;
1370 if (variantTag) {
1371 UErrorCode icuStatus;
1372 int icuCanonStringLen;
1373 char * varKeyValueStringPtr = varKeyValueString;
1374
1375 icuStatus = U_ZERO_ERROR;
1376 icuCanonStringLen = uloc_canonicalize( inLocaleString, varKeyValueString, locStringMaxLen, &icuStatus );
1377 if ( U_SUCCESS(icuStatus) ) {
1378 char * icuCanonStringPtr = varKeyValueString;
1379
1380 if (icuCanonStringLen >= locStringMaxLen)
1381 icuCanonStringLen = locStringMaxLen - 1;
1382 varKeyValueString[icuCanonStringLen] = 0;
1383 while (*icuCanonStringPtr != 0 && *icuCanonStringPtr != ULOC_KEYWORD_SEPARATOR)
1384 ++icuCanonStringPtr;
1385 if (*icuCanonStringPtr != 0) {
1386 // the canonicalized string has keywords
1387 // delete the variant tag in the original string (and other trailing '_' or '-')
1388 *variantTag-- = 0;
1389 while (*variantTag == '_')
1390 *variantTag-- = 0;
1391 // delete all of the canonicalized string except the keywords
1392 while (*icuCanonStringPtr != 0)
1393 *varKeyValueStringPtr++ = *icuCanonStringPtr++;
1394 }
1395 *varKeyValueStringPtr = 0;
1396 }
1397 }
1398
1399 // 4. Handle special cases of updating region codes, or updating language codes based on
1400 // region code.
1401 for (specialCasePtr = specialCases; specialCasePtr->reg1 != NULL; specialCasePtr++) {
1402 if ( specialCasePtr->lang == NULL || _CheckForTag(inLocaleString, specialCasePtr->lang, 2) ) {
1403 // OK, we matched any language specified. Now what needs updating?
1404 char * foundTag;
1405
1406 if ( isupper(specialCasePtr->update1[0]) ) {
1407 // updating a region code
1408 if ( ( foundTag = strstr(inLocaleString, specialCasePtr->reg1) ) && !isalnum(foundTag[3]) ) {
1409 _CopyReplacementAtPointer(foundTag+1, specialCasePtr->update1);
1410 }
1411 if ( regionTag && _CheckForTag(regionTag+1, specialCasePtr->reg1 + 1, 2) ) {
1412 _CopyReplacementAtPointer(regionTag+1, specialCasePtr->update1);
1413 }
1414
1415 } else {
1416 // updating the language, there will be two choices based on region
1417 if ( ( regionTag && _CheckForTag(regionTag+1, specialCasePtr->reg1 + 1, 2) ) ||
1418 ( ( foundTag = strstr(inLocaleString, specialCasePtr->reg1) ) && !isalnum(foundTag[3]) ) ) {
1419 _CopyReplacementAtPointer(inLocaleString, specialCasePtr->update1);
1420 } else if ( ( regionTag && _CheckForTag(regionTag+1, specialCasePtr->reg2 + 1, 2) ) ||
1421 ( ( foundTag = strstr(inLocaleString, specialCasePtr->reg2) ) && !isalnum(foundTag[3]) ) ) {
1422 _CopyReplacementAtPointer(inLocaleString, specialCasePtr->update2);
1423 }
1424 }
1425 }
1426 }
1427
1428 // 5. return pointers if requested.
1429 if (langRegSubtagRef != NULL) {
1430 *langRegSubtagRef = langRegSubtag;
1431 }
1432 if (regionTagRef != NULL) {
1433 *regionTagRef = regionTag;
1434 }
1435 }
1436
1437
1438 // _RemoveSubstringsIfPresent
1439 // (Local function for CFLocaleCreateCanonicalLocaleIdentifierFromString)
1440 // substringList is a list of space-separated substrings to strip if found in localeString
1441 static void _RemoveSubstringsIfPresent(char *localeString, const char *substringList) {
1442 while (*substringList != 0) {
1443 char currentSubstring[kLocaleIdentifierCStringMax];
1444 int substringLength = 0;
1445 char * foundSubstring;
1446
1447 // copy current substring & get its length
1448 while ( isgraph(*substringList) ) {
1449 currentSubstring[substringLength++] = *substringList++;
1450 }
1451 // move to next substring
1452 while ( isspace(*substringList) ) {
1453 substringList++;
1454 }
1455
1456 // search for current substring in locale string
1457 if (substringLength == 0)
1458 continue;
1459 currentSubstring[substringLength] = 0;
1460 foundSubstring = strstr(localeString, currentSubstring);
1461
1462 // if substring is found, delete it
1463 if (foundSubstring) {
1464 _DeleteCharsAtPointer(foundSubstring, substringLength);
1465 }
1466 }
1467 }
1468
1469
1470 // _GetKeyValueString // <1.10>
1471 // Removes any key-value string from inLocaleString, puts canonized version in keyValueString
1472
1473 static void _GetKeyValueString(char inLocaleString[], char keyValueString[]) {
1474 char * inLocalePtr = inLocaleString;
1475
1476 while (*inLocalePtr != 0 && *inLocalePtr != ULOC_KEYWORD_SEPARATOR) {
1477 inLocalePtr++;
1478 }
1479 if (*inLocalePtr != 0) { // we found a key-value section
1480 char * keyValuePtr = keyValueString;
1481
1482 *keyValuePtr = *inLocalePtr;
1483 *inLocalePtr = 0;
1484 do {
1485 if ( *(++inLocalePtr) != ' ' ) {
1486 *(++keyValuePtr) = *inLocalePtr; // remove "tolower() for *inLocalePtr" // <1.11>
1487 }
1488 } while (*inLocalePtr != 0);
1489 } else {
1490 keyValueString[0] = 0;
1491 }
1492 }
1493
1494 static void _AppendKeyValueString(char inLocaleString[], int locStringMaxLen, char keyValueString[]) {
1495 if (keyValueString[0] != 0) {
1496 UErrorCode uerr = U_ZERO_ERROR;
1497 UEnumeration * uenum = uloc_openKeywords(keyValueString, &uerr);
1498 if ( uenum != NULL ) {
1499 const char * keyword;
1500 int32_t length;
1501 char value[ULOC_KEYWORDS_CAPACITY]; // use as max for keyword value
1502 while ( U_SUCCESS(uerr) ) {
1503 keyword = uenum_next(uenum, &length, &uerr);
1504 if ( keyword == NULL ) {
1505 break;
1506 }
1507 length = uloc_getKeywordValue( keyValueString, keyword, value, sizeof(value), &uerr );
1508 length = uloc_setKeywordValue( keyword, value, inLocaleString, locStringMaxLen, &uerr );
1509 }
1510 uenum_close(uenum);
1511 }
1512 }
1513 }
1514
1515 // __private_extern__ CFStringRef _CFLocaleCreateCanonicalLanguageIdentifierForCFBundle(CFAllocatorRef allocator, CFStringRef localeIdentifier) {}
1516
1517 CFStringRef CFLocaleCreateCanonicalLanguageIdentifierFromString(CFAllocatorRef allocator, CFStringRef localeIdentifier) {
1518 char inLocaleString[kLocaleIdentifierCStringMax];
1519 CFStringRef outStringRef = NULL;
1520
1521 if ( localeIdentifier && CFStringGetCString(localeIdentifier, inLocaleString, sizeof(inLocaleString), kCFStringEncodingASCII) ) {
1522 KeyStringToResultString testEntry;
1523 KeyStringToResultString * foundEntry;
1524 char keyValueString[sizeof(inLocaleString)]; // <1.10>
1525 char varKeyValueString[sizeof(inLocaleString)]; // <1.17>
1526
1527 _GetKeyValueString(inLocaleString, keyValueString); // <1.10>
1528 testEntry.result = NULL;
1529
1530 // A. First check if input string matches an old-style string that has a replacement
1531 // (do this before case normalization)
1532 testEntry.key = inLocaleString;
1533 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, oldAppleLocaleToCanonical, kNumOldAppleLocaleToCanonical,
1534 sizeof(KeyStringToResultString), _CompareTestEntryToTableEntryKey );
1535 if (foundEntry) {
1536 // It does match, so replace old string with new
1537 strlcpy(inLocaleString, foundEntry->result, sizeof(inLocaleString));
1538 varKeyValueString[0] = 0;
1539 } else {
1540 char * langRegSubtag = NULL;
1541 char * regionTag = NULL;
1542
1543 // B. No match with an old-style string, use input string but update codes, normalize case, etc.
1544 _UpdateFullLocaleString(inLocaleString, sizeof(inLocaleString), &langRegSubtag, &regionTag, varKeyValueString); // <1.10><1.17><1.19>
1545
1546 // if the language part already includes a regional variant, then delete any region tag. <1.19>
1547 if (langRegSubtag && regionTag)
1548 *regionTag = 0;
1549 }
1550
1551 // C. Now we have an up-to-date locale string, but we need to strip defaults and turn it into a language string
1552
1553 // 1. Strip defaults in input string based on initial part of locale string
1554 // (mainly to strip default script tag for a language)
1555 testEntry.key = inLocaleString;
1556 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, localeStringPrefixToDefaults, kNumLocaleStringPrefixToDefaults,
1557 sizeof(KeyStringToResultString), _CompareTestEntryPrefixToTableEntryKey );
1558 if (foundEntry) {
1559 // The input string begins with a character sequence for which
1560 // there are default substrings which should be stripped if present
1561 _RemoveSubstringsIfPresent(inLocaleString, foundEntry->result);
1562 }
1563
1564 // 2. If the string matches a locale string used by Apple as a language string, turn it into a language string
1565 testEntry.key = inLocaleString;
1566 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, appleLocaleToLanguageString, kNumAppleLocaleToLanguageString,
1567 sizeof(KeyStringToResultString), _CompareTestEntryToTableEntryKey );
1568 if (foundEntry) {
1569 // it does match
1570 strlcpy(inLocaleString, foundEntry->result, sizeof(inLocaleString));
1571 } else {
1572 // skip to any region tag or java-type variant
1573 char * inLocalePtr = inLocaleString;
1574 while (*inLocalePtr != 0 && *inLocalePtr != '_') {
1575 inLocalePtr++;
1576 }
1577 // if there is still a region tag, turn it into a language variant <1.19>
1578 if (*inLocalePtr == '_') {
1579 // handle 3-digit regions in addition to 2-letter ones
1580 char * regionTag = inLocalePtr++;
1581 long expectedLength = 0;
1582 if ( isalpha(*inLocalePtr) ) {
1583 while ( isalpha(*(++inLocalePtr)) )
1584 ;
1585 expectedLength = 3;
1586 } else if ( isdigit(*inLocalePtr) ) {
1587 while ( isdigit(*(++inLocalePtr)) )
1588 ;
1589 expectedLength = 4;
1590 }
1591 *regionTag = (inLocalePtr - regionTag == expectedLength)? '-': 0;
1592 }
1593 // anything else at/after '_' just gets deleted
1594 *inLocalePtr = 0;
1595 }
1596
1597 // D. Re-append any key-value strings, now canonical // <1.10><1.17>
1598 _AppendKeyValueString( inLocaleString, sizeof(inLocaleString), varKeyValueString );
1599 _AppendKeyValueString( inLocaleString, sizeof(inLocaleString), keyValueString );
1600
1601 // All done, return what we came up with.
1602 outStringRef = CFStringCreateWithCString(allocator, inLocaleString, kCFStringEncodingASCII);
1603 }
1604
1605 return outStringRef;
1606 }
1607
1608
1609 CFStringRef CFLocaleCreateCanonicalLocaleIdentifierFromString(CFAllocatorRef allocator, CFStringRef localeIdentifier) {
1610 char inLocaleString[kLocaleIdentifierCStringMax];
1611 CFStringRef outStringRef = NULL;
1612
1613 if ( localeIdentifier && CFStringGetCString(localeIdentifier, inLocaleString, sizeof(inLocaleString), kCFStringEncodingASCII) ) {
1614 KeyStringToResultString testEntry;
1615 KeyStringToResultString * foundEntry;
1616 char keyValueString[sizeof(inLocaleString)]; // <1.10>
1617 char varKeyValueString[sizeof(inLocaleString)]; // <1.17>
1618
1619 _GetKeyValueString(inLocaleString, keyValueString); // <1.10>
1620 testEntry.result = NULL;
1621
1622 // A. First check if input string matches an old-style Apple string that has a replacement
1623 // (do this before case normalization)
1624 testEntry.key = inLocaleString;
1625 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, oldAppleLocaleToCanonical, kNumOldAppleLocaleToCanonical,
1626 sizeof(KeyStringToResultString), _CompareTestEntryToTableEntryKey );
1627 if (foundEntry) {
1628 // It does match, so replace old string with new // <1.10>
1629 strlcpy(inLocaleString, foundEntry->result, sizeof(inLocaleString));
1630 varKeyValueString[0] = 0;
1631 } else {
1632 char * langRegSubtag = NULL;
1633 char * regionTag = NULL;
1634
1635 // B. No match with an old-style string, use input string but update codes, normalize case, etc.
1636 _UpdateFullLocaleString(inLocaleString, sizeof(inLocaleString), &langRegSubtag, &regionTag, varKeyValueString); // <1.10><1.17>
1637
1638
1639 // C. Now strip defaults that are implied by other fields.
1640
1641 // 1. If an ISO 3166 region tag matches an ISO 3166 regional language variant subtag, strip the latter.
1642 if ( langRegSubtag && regionTag && strncmp(langRegSubtag+1, regionTag+1, 2) == 0 ) {
1643 _DeleteCharsAtPointer(langRegSubtag, 3);
1644 }
1645
1646 // 2. Strip defaults in input string based on final region tag in locale string
1647 // (mainly for Chinese, to strip -Hans for _CN/_SG, -Hant for _TW/_HK/_MO)
1648 if ( regionTag ) {
1649 testEntry.key = regionTag;
1650 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, localeStringRegionToDefaults, kNumLocaleStringRegionToDefaults,
1651 sizeof(KeyStringToResultString), _CompareTestEntryToTableEntryKey );
1652 if (foundEntry) {
1653 _RemoveSubstringsIfPresent(inLocaleString, foundEntry->result);
1654 }
1655 }
1656
1657 // 3. Strip defaults in input string based on initial part of locale string
1658 // (mainly to strip default script tag for a language)
1659 testEntry.key = inLocaleString;
1660 foundEntry = (KeyStringToResultString *)bsearch( &testEntry, localeStringPrefixToDefaults, kNumLocaleStringPrefixToDefaults,
1661 sizeof(KeyStringToResultString), _CompareTestEntryPrefixToTableEntryKey );
1662 if (foundEntry) {
1663 // The input string begins with a character sequence for which
1664 // there are default substrings which should be stripped if present
1665 _RemoveSubstringsIfPresent(inLocaleString, foundEntry->result);
1666 }
1667 }
1668
1669 // D. Re-append any key-value strings, now canonical // <1.10><1.17>
1670 _AppendKeyValueString( inLocaleString, sizeof(inLocaleString), varKeyValueString );
1671 _AppendKeyValueString( inLocaleString, sizeof(inLocaleString), keyValueString );
1672
1673 // Now create the CFString (even if empty!)
1674 outStringRef = CFStringCreateWithCString(allocator, inLocaleString, kCFStringEncodingASCII);
1675 }
1676
1677 return outStringRef;
1678 }
1679
1680 // CFLocaleCreateCanonicalLocaleIdentifierFromScriptManagerCodes, based on
1681 // the first part of the SPI CFBundleCopyLocalizationForLocalizationInfo in CFBundle_Resources.c
1682 CFStringRef CFLocaleCreateCanonicalLocaleIdentifierFromScriptManagerCodes(CFAllocatorRef allocator, LangCode lcode, RegionCode rcode) {
1683 CFStringRef result = NULL;
1684 if (0 <= rcode && rcode < kNumRegionCodeToLocaleString) {
1685 const char *localeString = regionCodeToLocaleString[rcode];
1686 if (localeString != NULL && *localeString != '\0') {
1687 result = CFStringCreateWithCStringNoCopy(allocator, localeString, kCFStringEncodingASCII, kCFAllocatorNull);
1688 }
1689 }
1690 if (result) return result;
1691 if (0 <= lcode && lcode < kNumLangCodeToLocaleString) {
1692 const char *localeString = langCodeToLocaleString[lcode];
1693 if (localeString != NULL && *localeString != '\0') {
1694 result = CFStringCreateWithCStringNoCopy(allocator, localeString, kCFStringEncodingASCII, kCFAllocatorNull);
1695 }
1696 }
1697 return result;
1698 }
1699
1700
1701 /*
1702 SPI: CFLocaleGetLanguageRegionEncodingForLocaleIdentifier gets the appropriate language and region codes,
1703 and the default legacy script code and encoding, for the specified locale (or language) string.
1704 Returns false if CFLocale has no information about the given locale (in which case none of the by-reference return values are set);
1705 otherwise may set *langCode and/or *regCode to -1 if there is no appropriate legacy value for the locale.
1706 This is a replacement for the CFBundle SPI CFBundleGetLocalizationInfoForLocalization (which was intended to be temporary and transitional);
1707 this function is more up-to-date in its handling of locale strings, and is in CFLocale where this functionality should belong. Compared
1708 to CFBundleGetLocalizationInfoForLocalization, this function does not spcially interpret a NULL localeIdentifier to mean use the single most
1709 preferred localization in the current context (this function returns NO for a NULL localeIdentifier); and in this function
1710 langCode, regCode, and scriptCode are all SInt16* (not SInt32* like the equivalent parameters in CFBundleGetLocalizationInfoForLocalization).
1711 */
1712 static int CompareLocaleToLegacyCodesEntries( const void *entry1, const void *entry2 );
1713
1714 Boolean CFLocaleGetLanguageRegionEncodingForLocaleIdentifier(CFStringRef localeIdentifier, LangCode *langCode, RegionCode *regCode, ScriptCode *scriptCode, CFStringEncoding *stringEncoding) {
1715 Boolean returnValue = false;
1716 CFStringRef canonicalIdentifier = CFLocaleCreateCanonicalLocaleIdentifierFromString(NULL, localeIdentifier);
1717 if (canonicalIdentifier) {
1718 char localeCString[kLocaleIdentifierCStringMax];
1719 if ( CFStringGetCString(canonicalIdentifier, localeCString, sizeof(localeCString), kCFStringEncodingASCII) ) {
1720 UErrorCode icuStatus = U_ZERO_ERROR;
1721 int32_t languagelength;
1722 char searchString[ULOC_LANG_CAPACITY + ULOC_FULLNAME_CAPACITY];
1723
1724 languagelength = uloc_getLanguage( localeCString, searchString, ULOC_LANG_CAPACITY, &icuStatus );
1725 if ( U_SUCCESS(icuStatus) && languagelength > 0 ) {
1726 // OK, here we have at least a language code, check for other components in order
1727 LocaleToLegacyCodes searchEntry = { (const char *)searchString, 0, 0, 0 };
1728 const LocaleToLegacyCodes * foundEntryPtr;
1729 int32_t componentLength;
1730 char componentString[ULOC_FULLNAME_CAPACITY];
1731
1732 languagelength = strlen(searchString); // in case it got truncated
1733 icuStatus = U_ZERO_ERROR;
1734 componentLength = uloc_getScript( localeCString, componentString, sizeof(componentString), &icuStatus );
1735 if ( U_FAILURE(icuStatus) || componentLength == 0 ) {
1736 icuStatus = U_ZERO_ERROR;
1737 componentLength = uloc_getCountry( localeCString, componentString, sizeof(componentString), &icuStatus );
1738 if ( U_FAILURE(icuStatus) || componentLength == 0 ) {
1739 icuStatus = U_ZERO_ERROR;
1740 componentLength = uloc_getVariant( localeCString, componentString, sizeof(componentString), &icuStatus );
1741 if ( U_FAILURE(icuStatus) ) {
1742 componentLength = 0;
1743 }
1744 }
1745 }
1746
1747 // Append whichever other component we first found
1748 if (componentLength > 0) {
1749 strlcat(searchString, "_", sizeof(searchString));
1750 strlcat(searchString, componentString, sizeof(searchString));
1751 }
1752
1753 // Search
1754 foundEntryPtr = (const LocaleToLegacyCodes *)bsearch( &searchEntry, localeToLegacyCodes, kNumLocaleToLegacyCodes, sizeof(LocaleToLegacyCodes), CompareLocaleToLegacyCodesEntries );
1755 if (foundEntryPtr == NULL && (int32_t) strlen(searchString) > languagelength) {
1756 // truncate to language al;one and try again
1757 searchString[languagelength] = 0;
1758 foundEntryPtr = (const LocaleToLegacyCodes *)bsearch( &searchEntry, localeToLegacyCodes, kNumLocaleToLegacyCodes, sizeof(LocaleToLegacyCodes), CompareLocaleToLegacyCodesEntries );
1759 }
1760
1761 // If found a matching entry, return requested values
1762 if (foundEntryPtr) {
1763 returnValue = true;
1764 if (langCode) *langCode = foundEntryPtr->langCode;
1765 if (regCode) *regCode = foundEntryPtr->regCode;
1766 if (stringEncoding) *stringEncoding = foundEntryPtr->encoding;
1767 if (scriptCode) {
1768 // map CFStringEncoding to ScriptCode
1769 if (foundEntryPtr->encoding < 33/*kCFStringEncodingMacSymbol*/) {
1770 *scriptCode = foundEntryPtr->encoding;
1771 } else {
1772 switch (foundEntryPtr->encoding) {
1773 case 0x8C/*kCFStringEncodingMacFarsi*/: *scriptCode = 4/*smArabic*/; break;
1774 case 0x98/*kCFStringEncodingMacUkrainian*/: *scriptCode = 7/*smCyrillic*/; break;
1775 case 0xEC/*kCFStringEncodingMacInuit*/: *scriptCode = 28/*smEthiopic*/; break;
1776 case 0xFC/*kCFStringEncodingMacVT100*/: *scriptCode = 32/*smUninterp*/; break;
1777 default: *scriptCode = 0/*smRoman*/; break;
1778 }
1779 }
1780 }
1781 }
1782 }
1783 }
1784 CFRelease(canonicalIdentifier);
1785 }
1786 return returnValue;
1787 }
1788
1789 static int CompareLocaleToLegacyCodesEntries( const void *entry1, const void *entry2 ) {
1790 const char * localeString1 = ((const LocaleToLegacyCodes *)entry1)->locale;
1791 const char * localeString2 = ((const LocaleToLegacyCodes *)entry2)->locale;
1792 return strcmp(localeString1, localeString2);
1793 }
1794
1795
1796 CFDictionaryRef CFLocaleCreateComponentsFromLocaleIdentifier(CFAllocatorRef allocator, CFStringRef localeID) {
1797 char cLocaleID[ULOC_FULLNAME_CAPACITY+ULOC_KEYWORD_AND_VALUES_CAPACITY];
1798 char buffer[ULOC_FULLNAME_CAPACITY+ULOC_KEYWORD_AND_VALUES_CAPACITY];
1799 CFMutableDictionaryRef working = CFDictionaryCreateMutable(allocator, 10, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
1800
1801 UErrorCode icuStatus = U_ZERO_ERROR;
1802 int32_t length = 0;
1803
1804 // Extract the C string locale ID, for ICU
1805 CFIndex outBytes = 0;
1806 CFStringGetBytes(localeID, CFRangeMake(0, CFStringGetLength(localeID)), kCFStringEncodingASCII, (UInt8) '?', true, (unsigned char *)cLocaleID, sizeof(cLocaleID)/sizeof(char) - 1, &outBytes);
1807 cLocaleID[outBytes] = '\0';
1808
1809 // Get the components
1810 length = uloc_getLanguage(cLocaleID, buffer, sizeof(buffer)/sizeof(char), &icuStatus);
1811 if (U_SUCCESS(icuStatus) && length > 0)
1812 {
1813 CFStringRef string = CFStringCreateWithBytes(allocator, (UInt8 *)buffer, length, kCFStringEncodingASCII, true);
1814 CFDictionaryAddValue(working, kCFLocaleLanguageCodeKey, string);
1815 CFRelease(string);
1816 }
1817 icuStatus = U_ZERO_ERROR;
1818
1819 length = uloc_getScript(cLocaleID, buffer, sizeof(buffer)/sizeof(char), &icuStatus);
1820 if (U_SUCCESS(icuStatus) && length > 0)
1821 {
1822 CFStringRef string = CFStringCreateWithBytes(allocator, (UInt8 *)buffer, length, kCFStringEncodingASCII, true);
1823 CFDictionaryAddValue(working, kCFLocaleScriptCodeKey, string);
1824 CFRelease(string);
1825 }
1826 icuStatus = U_ZERO_ERROR;
1827
1828 length = uloc_getCountry(cLocaleID, buffer, sizeof(buffer)/sizeof(char), &icuStatus);
1829 if (U_SUCCESS(icuStatus) && length > 0)
1830 {
1831 CFStringRef string = CFStringCreateWithBytes(allocator, (UInt8 *)buffer, length, kCFStringEncodingASCII, true);
1832 CFDictionaryAddValue(working, kCFLocaleCountryCodeKey, string);
1833 CFRelease(string);
1834 }
1835 icuStatus = U_ZERO_ERROR;
1836
1837 length = uloc_getVariant(cLocaleID, buffer, sizeof(buffer)/sizeof(char), &icuStatus);
1838 if (U_SUCCESS(icuStatus) && length > 0)
1839 {
1840 CFStringRef string = CFStringCreateWithBytes(allocator, (UInt8 *)buffer, length, kCFStringEncodingASCII, true);
1841 CFDictionaryAddValue(working, kCFLocaleVariantCodeKey, string);
1842 CFRelease(string);
1843 }
1844 icuStatus = U_ZERO_ERROR;
1845
1846 // Now get the keywords; open an enumerator on them
1847 UEnumeration *iter = uloc_openKeywords(cLocaleID, &icuStatus);
1848 const char *locKey = NULL;
1849 int32_t locKeyLen = 0;
1850 while ((locKey = uenum_next(iter, &locKeyLen, &icuStatus)) && U_SUCCESS(icuStatus))
1851 {
1852 char locValue[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1853
1854 // Get the value for this keyword
1855 if (uloc_getKeywordValue(cLocaleID, locKey, locValue, sizeof(locValue)/sizeof(char), &icuStatus) > 0
1856 && U_SUCCESS(icuStatus))
1857 {
1858 CFStringRef key = CFStringCreateWithBytes(allocator, (UInt8 *)locKey, strlen(locKey), kCFStringEncodingASCII, true);
1859 CFStringRef value = CFStringCreateWithBytes(allocator, (UInt8 *)locValue, strlen(locValue), kCFStringEncodingASCII, true);
1860 if (key && value)
1861 CFDictionaryAddValue(working, key, value);
1862 if (key)
1863 CFRelease(key);
1864 if (value)
1865 CFRelease(value);
1866 }
1867 }
1868 uenum_close(iter);
1869
1870 // Convert to an immutable dictionary and return
1871 CFDictionaryRef result = CFDictionaryCreateCopy(allocator, working);
1872 CFRelease(working);
1873 return result;
1874 }
1875
1876 static char *__CStringFromString(CFStringRef str) {
1877 if (!str) return NULL;
1878 CFRange rg = CFRangeMake(0, CFStringGetLength(str));
1879 CFIndex neededLength = 0;
1880 CFStringGetBytes(str, rg, kCFStringEncodingASCII, (UInt8)'?', false, NULL, 0, &neededLength);
1881 char *buf = (char *)malloc(neededLength + 1);
1882 CFStringGetBytes(str, rg, kCFStringEncodingASCII, (UInt8)'?', false, (uint8_t *)buf, neededLength, &neededLength);
1883 buf[neededLength] = '\0';
1884 return buf;
1885 }
1886
1887 CFStringRef CFLocaleCreateLocaleIdentifierFromComponents(CFAllocatorRef allocator, CFDictionaryRef dictionary) {
1888 CFIndex cnt = CFDictionaryGetCount(dictionary);
1889 STACK_BUFFER_DECL(CFStringRef, values, cnt);
1890 STACK_BUFFER_DECL(CFStringRef, keys, cnt);
1891 CFDictionaryGetKeysAndValues(dictionary, (const void **)keys, (const void **)values);
1892
1893 char *language = NULL, *script = NULL, *country = NULL, *variant = NULL;
1894 for (CFIndex idx = 0; idx < cnt; idx++) {
1895 if (CFEqual(kCFLocaleLanguageCodeKey, keys[idx])) {
1896 language = __CStringFromString(values[idx]);
1897 keys[idx] = NULL;
1898 } else if (CFEqual(kCFLocaleScriptCodeKey, keys[idx])) {
1899 script = __CStringFromString(values[idx]);
1900 keys[idx] = NULL;
1901 } else if (CFEqual(kCFLocaleCountryCodeKey, keys[idx])) {
1902 country = __CStringFromString(values[idx]);
1903 keys[idx] = NULL;
1904 } else if (CFEqual(kCFLocaleVariantCodeKey, keys[idx])) {
1905 variant = __CStringFromString(values[idx]);
1906 keys[idx] = NULL;
1907 }
1908 }
1909
1910 char *buf1 = NULL; // (|L)(|_S)(|_C|_C_V|__V)
1911 asprintf(&buf1, "%s%s%s%s%s%s%s", language ? language : "", script ? "_" : "", script ? script : "", (country || variant ? "_" : ""), country ? country : "", variant ? "_" : "", variant ? variant : "");
1912
1913 char cLocaleID[2 * ULOC_FULLNAME_CAPACITY + 2 * ULOC_KEYWORD_AND_VALUES_CAPACITY];
1914 strlcpy(cLocaleID, buf1, sizeof(cLocaleID));
1915 free(language);
1916 free(script);
1917 free(country);
1918 free(variant);
1919 free(buf1);
1920
1921 for (CFIndex idx = 0; idx < cnt; idx++) {
1922 if (keys[idx]) {
1923 char *key = __CStringFromString(keys[idx]);
1924 char *value = __CStringFromString(values[idx]);
1925 UErrorCode status = U_ZERO_ERROR;
1926 uloc_setKeywordValue(key, value, cLocaleID, sizeof(cLocaleID), &status);
1927 free(key);
1928 free(value);
1929 }
1930 }
1931
1932 return CFStringCreateWithCString(allocator, cLocaleID, kCFStringEncodingASCII);
1933 }
1934