1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 1997-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
11 * Modification History:
13 * Date Name Description
14 * 04/01/97 aliu Creation.
15 * 08/21/98 stephen JDK 1.2 sync
16 * 12/08/98 rtg New Locale implementation and C API
17 * 03/15/99 damiba overhaul.
18 * 04/06/99 stephen changed setDefault() to realloc and copy
19 * 06/14/99 stephen Changed calls to ures_open for new params
20 * 07/21/99 stephen Modified setDefault() to propagate to C++
21 * 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
22 * brought canonicalization code into line with spec
23 *****************************************************************************/
26 POSIX's locale format, from putil.c: [no spaces]
28 ll [ _CC ] [ . MM ] [ @ VV]
30 l = lang, C = ctry, M = charmap, V = variant
33 #include "unicode/utypes.h"
34 #include "unicode/ustring.h"
35 #include "unicode/uloc.h"
49 #include <stdio.h> /* for sprintf */
53 /* ### Declarations **************************************************/
55 /* Locale stuff from locid.cpp */
56 U_CFUNC
void locale_set_default(const char *id
);
57 U_CFUNC
const char *locale_get_default(void);
59 locale_getKeywords(const char *localeID
,
61 char *keywords
, int32_t keywordCapacity
,
62 char *values
, int32_t valuesCapacity
, int32_t *valLen
,
66 /* ### Data tables **************************************************/
69 * Table of language codes, both 2- and 3-letter, with preference
70 * given to 2-letter codes where possible. Includes 3-letter codes
71 * that lack a 2-letter equivalent.
73 * This list must be in sorted order. This list is returned directly
74 * to the user by some API.
 * This list must be kept in sync with LANGUAGES_3, with corresponding
 * entries matched (LANGUAGES[i] and LANGUAGES_3[i] name the same language).
79 * This table should be terminated with a NULL entry, followed by a
80 * second list, and another NULL entry. The first list is visible to
81 * user code when this array is returned by API. The second list
82 * contains codes we support, but do not expose through user API.
86 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
87 * include the revisions up to 2001/7/27 *CWB*
89 * The 3 character codes are the terminology codes like RFC 3066. This
90 * is compatible with prior ICU codes
92 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
93 * table but now at the end of the table because 3 character codes are
 * duplicates. This avoids bad searches going from 3 to 2 character codes.
97 * The range qaa-qtz is reserved for local use
99 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
100 /* ISO639 table version is 20150505 */
101 /* Subsequent hand addition of selected languages */
102 static const char * const LANGUAGES
[] = {
103 "aa", "ab", "ace", "ach", "ada", "ady", "ae", "aeb",
104 "af", "afh", "agq", "ain", "ak", "akk", "akz", "ale",
105 "aln", "alt", "am", "an", "ang", "anp", "ar", "arc",
106 "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
107 "asa", "ase", "ast", "av", "avk", "awa", "ay", "az",
108 "ba", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
109 "be", "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
110 "bgn", "bho", "bi", "bik", "bin", "bjn", "bkm", "bla",
111 "bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh",
112 "brx", "bs", "bss", "bua", "bug", "bum", "byn", "byv",
113 "ca", "cad", "car", "cay", "cch", "ccp", "ce", "ceb", "cgg",
114 "ch", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
115 "chr", "chy", "ckb", "co", "cop", "cps", "cr", "crh",
116 "cs", "csb", "cu", "cv", "cy",
117 "da", "dak", "dar", "dav", "de", "del", "den", "dgr",
118 "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
119 "dyo", "dyu", "dz", "dzg",
120 "ebu", "ee", "efi", "egl", "egy", "eka", "el", "elx",
121 "en", "enm", "eo", "es", "esu", "et", "eu", "ewo",
123 "fa", "fan", "fat", "ff", "fi", "fil", "fit", "fj",
124 "fo", "fon", "fr", "frc", "frm", "fro", "frp", "frr",
126 "ga", "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
127 "gez", "gil", "gl", "glk", "gmh", "gn", "goh", "gom",
128 "gon", "gor", "got", "grb", "grc", "gsw", "gu", "guc",
129 "gur", "guz", "gv", "gwi",
130 "ha", "hai", "hak", "haw", "he", "hi", "hif", "hil",
131 "hit", "hmn", "ho", "hr", "hsb", "hsn", "ht", "hu",
133 "ia", "iba", "ibb", "id", "ie", "ig", "ii", "ik",
134 "ilo", "inh", "io", "is", "it", "iu", "izh",
135 "ja", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
137 "ka", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
138 "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg", "kgp",
139 "kha", "kho", "khq", "khw", "ki", "kiu", "kj", "kk",
140 "kkj", "kl", "kln", "km", "kmb", "kn", "ko", "koi",
141 "kok", "kos", "kpe", "kr", "krc", "kri", "krj", "krl",
142 "kru", "ks", "ksb", "ksf", "ksh", "ku", "kum", "kut",
144 "la", "lad", "lag", "lah", "lam", "lb", "lez", "lfn",
145 "lg", "li", "lij", "liv", "lkt", "lmo", "ln", "lo",
146 "lol", "loz", "lrc", "lt", "ltg", "lu", "lua", "lui",
147 "lun", "luo", "lus", "luy", "lv", "lzh", "lzz",
148 "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
149 "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg", "mga",
150 "mgh", "mgo", "mh", "mi", "mic", "min", "mis", "mk",
151 "ml", "mn", "mnc", "mni", "moh", "mos", "mr", "mrj",
152 "ms", "mt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
153 "my", "mye", "myv", "mzn",
154 "na", "nan", "nap", "naq", "nb", "nd", "nds", "ne",
155 "new", "ng", "nia", "niu", "njo", "nl", "nmg", "nn",
156 "nnh", "no", "nog", "non", "nov", "nqo", "nr", "nso",
157 "nus", "nv", "nwc", "ny", "nym", "nyn", "nyo", "nzi",
158 "oc", "oj", "om", "or", "os", "osa", "ota",
159 "pa", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
160 "pdt", "peo", "pfl", "phn", "pi", "pl", "pms", "pnt",
161 "pon", "prg", "pro", "ps", "pt",
163 "raj", "rap", "rar", "rgn", "rif", "rm", "rn", "ro",
164 "rof", "rom", "rtm", "ru", "rue", "rug", "rup",
166 "sa", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
167 "sba", "sbp", "sc", "scn", "sco", "sd", "sdc", "sdh",
168 "se", "see", "seh", "sei", "sel", "ses", "sg", "sga",
169 "sgs", "shi", "shn", "shu", "si", "sid", "sk",
170 "sl", "sli", "sly", "sm", "sma", "smj", "smn", "sms",
171 "sn", "snk", "so", "sog", "sq", "sr", "srn", "srr",
172 "ss", "ssy", "st", "stq", "su", "suk", "sus", "sux",
173 "sv", "sw", "swb", "swc", "syc", "syr", "szl",
174 "ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg",
175 "th", "ti", "tig", "tiv", "tk", "tkl", "tkr", "tl",
176 "tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tpi",
177 "tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt",
178 "tum", "tvl", "tw", "twq", "ty", "tyv", "tzm",
179 "udm", "ug", "uga", "uk", "umb", "und", "ur", "uz",
180 "vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vo",
182 "wa", "wae", "wal", "war", "was", "wbp", "wo", "wuu",
183 "xal", "xh", "xmf", "xog",
184 "yao", "yap", "yav", "ybb", "yi", "yo", "yrl", "yue",
185 "za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu",
188 "in", "iw", "ji", "jw", "sh", /* obsolete language codes */
192 static const char* const DEPRECATED_LANGUAGES
[]={
193 "in", "iw", "ji", "jw", NULL
, NULL
195 static const char* const REPLACEMENT_LANGUAGES
[]={
196 "id", "he", "yi", "jv", NULL
, NULL
200 * Table of 3-letter language codes.
202 * This is a lookup table used to convert 3-letter language codes to
203 * their 2-letter equivalent, where possible. It must be kept in sync
204 * with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the
205 * same language as LANGUAGES_3[i]. The commented-out lines are
206 * copied from LANGUAGES to make eyeballing this baby easier.
208 * Where a 3-letter language code has no 2-letter equivalent, the
209 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
211 * This table should be terminated with a NULL entry, followed by a
212 * second list, and another NULL entry. The two lists correspond to
213 * the two lists in LANGUAGES.
215 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
216 /* ISO639 table version is 20150505 */
217 /* Subsequent hand addition of selected languages */
218 static const char * const LANGUAGES_3
[] = {
219 "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
220 "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
221 "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
222 "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
223 "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
224 "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
225 "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
226 "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
227 "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
228 "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
229 "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
230 "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
231 "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
232 "ces", "csb", "chu", "chv", "cym",
233 "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
234 "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
235 "dyo", "dyu", "dzo", "dzg",
236 "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
237 "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
239 "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
240 "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
242 "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
243 "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
244 "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
245 "gur", "guz", "glv", "gwi",
246 "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
247 "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
249 "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
250 "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
251 "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
253 "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
254 "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
255 "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
256 "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
257 "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
258 "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
260 "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
261 "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
262 "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
263 "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
264 "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
265 "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
266 "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
267 "mal", "mon", "mnc", "mni", "moh", "mos", "mar", "mrj",
268 "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
269 "mya", "mye", "myv", "mzn",
270 "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
271 "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
272 "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
273 "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
274 "oci", "oji", "orm", "ori", "oss", "osa", "ota",
275 "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
276 "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
277 "pon", "prg", "pro", "pus", "por",
279 "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
280 "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
282 "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
283 "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
284 "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
285 "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
286 "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
287 "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
288 "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
289 "swe", "swa", "swb", "swc", "syc", "syr", "szl",
290 "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
291 "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
292 "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
293 "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
294 "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
295 "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
296 "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
298 "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
299 "xal", "xho", "xmf", "xog",
300 "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
301 "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
304 /* "in", "iw", "ji", "jw", "sh", */
305 "ind", "heb", "yid", "jaw", "srp",
310 * Table of 2-letter country codes.
312 * This list must be in sorted order. This list is returned directly
313 * to the user by some API.
 * This list must be kept in sync with COUNTRIES_3, with corresponding
 * entries matched (COUNTRIES[i] and COUNTRIES_3[i] name the same country).
318 * This table should be terminated with a NULL entry, followed by a
319 * second list, and another NULL entry. The first list is visible to
320 * user code when this array is returned by API. The second list
321 * contains codes we support, but do not expose through user API.
325 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
326 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
327 * new codes keeping the old ones for compatibility updated to include
328 * 1999/12/03 revisions *CWB*
330 * RO(ROM) is now RO(ROU) according to
331 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
333 static const char * const COUNTRIES
[] = {
334 "AC", "AD", "AE", "AF", "AG", "AI", "AL", "AM",
335 "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ",
336 "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI",
337 "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV",
338 "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG",
339 "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CP", "CR",
340 "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DG", "DJ", "DK",
341 "DM", "DO", "DZ", "EA", "EC", "EE", "EG", "EH", "ER",
342 "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR",
343 "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL",
344 "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU",
345 "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU",
346 "IC", "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS",
347 "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI",
348 "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA",
349 "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU",
350 "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK",
351 "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS",
352 "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA",
353 "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP",
354 "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG",
355 "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT",
356 "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA",
357 "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ",
358 "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV",
359 "SX", "SY", "SZ", "TA", "TC", "TD", "TF", "TG", "TH", "TJ",
360 "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV",
361 "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ",
362 "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF",
363 "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW",
365 "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR", /* obsolete country codes */
369 static const char* const DEPRECATED_COUNTRIES
[] = {
370 "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL
, NULL
/* deprecated country list */
372 static const char* const REPLACEMENT_COUNTRIES
[] = {
373 /* "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
374 "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL
, NULL
/* replacement country codes */
378 * Table of 3-letter country codes.
380 * This is a lookup table used to convert 3-letter country codes to
381 * their 2-letter equivalent. It must be kept in sync with COUNTRIES.
382 * For all valid i, COUNTRIES[i] must refer to the same country as
383 * COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
384 * to make eyeballing this baby easier.
386 * This table should be terminated with a NULL entry, followed by a
387 * second list, and another NULL entry. The two lists correspond to
388 * the two lists in COUNTRIES.
390 static const char * const COUNTRIES_3
[] = {
391 /* "AC", "AD", "AE", "AF", "AG", "AI", "AL", "AM", */
392 "ASC", "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
393 /* "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ", */
394 "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
395 /* "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", */
396 "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
397 /* "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV", */
398 "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
399 /* "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG", */
400 "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
401 /* "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CP", "CR", */
402 "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CPT", "CRI",
403 /* "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DG", "DJ", "DK", */
404 "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
405 /* "DM", "DO", "DZ", "EA", "EC", "EE", "EG", "EH", "ER", */
406 "DMA", "DOM", "DZA", "EA ", "ECU", "EST", "EGY", "ESH", "ERI", /* no valid 3-letter code for EA */
407 /* "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR", */
408 "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
409 /* "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL", */
410 "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
411 /* "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU", */
412 "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
413 /* "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU", */
414 "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
415 /* "IC", "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS" */
416 "IC ", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL", /* no valid 3-letter code for IC */
417 /* "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI", */
418 "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
419 /* "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA", */
420 "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
421 /* "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", */
422 "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
423 /* "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK", */
424 "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
425 /* "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS", */
426 "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
427 /* "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA", */
428 "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
429 /* "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP", */
430 "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
431 /* "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG", */
432 "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
433 /* "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT", */
434 "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
435 /* "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA", */
436 "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
437 /* "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ", */
438 "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
439 /* "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV", */
440 "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
441 /* "SX", "SY", "SZ", "TA", "TC", "TD", "TF", "TG", "TH", "TJ", */
442 "SXM", "SYR", "SWZ", "TAA", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
443 /* "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV", */
444 "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
445 /* "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ", */
446 "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
447 /* "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", */
448 "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
449 /* "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW", */
450 "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
452 /* "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR" */
453 "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
/*
 * One row of the locale-ID canonicalization table: pairs an input locale
 * ID with the ID it is rewritten to.
 */
struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
};
typedef struct CanonicalizationMap CanonicalizationMap;
463 * A map to canonicalize locale IDs. This handles a variety of
464 * different semantic kinds of transformations.
466 static const CanonicalizationMap CANONICALIZE_MAP
[] = {
467 { "", "en_US_POSIX" }, /* .NET name */ // open ICU 64 deleted, we restore
468 { "c", "en_US_POSIX" }, /* POSIX name */ // open ICU 64 deleted, we restore
469 { "posix", "en_US_POSIX" }, /* POSIX name (alias of C) */ // open ICU 64 deleted, we restore
470 { "art_LOJBAN", "jbo" }, /* registered name */
471 { "hy__AREVELA", "hy" }, /* Registered IANA variant */
472 { "hy__AREVMDA", "hyw" }, /* Registered IANA variant */
473 { "zh_GAN", "gan" }, /* registered name */
474 { "zh_GUOYU", "zh" }, /* registered name */
475 { "zh_HAKKA", "hak" }, /* registered name */
476 { "zh_MIN_NAN", "nan" }, /* registered name */
477 { "zh_WUU", "wuu" }, /* registered name */
478 { "zh_XIANG", "hsn" }, /* registered name */
479 { "zh_YUE", "yue" }, /* registered name */
482 /* ### BCP47 Conversion *******************************************/
/*
 * Test whether a locale ID looks like a BCP47 tag carrying an extension:
 * the ID is non-NULL, contains no '@' (so it has no legacy keyword
 * section), and its shortest subtag is exactly one character long.
 *
 * The macro operates purely on its own argument. It previously passed a
 * variable literally named `localeID` to getShortestSubtagLength(), which
 * only worked when the caller happened to have such a variable in scope.
 * Note that `id` is evaluated more than once, so pass a side-effect-free
 * expression.
 */
#define _hasBCP47Extension(id) \
    ((id) != NULL && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
485 /* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
486 #define _ConvertBCP47(finalID, id, buffer, length,err) \
487 if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 || \
488 U_FAILURE(*err) || *err == U_STRING_NOT_TERMINATED_WARNING) { \
490 if (*err == U_STRING_NOT_TERMINATED_WARNING) { *err = U_BUFFER_OVERFLOW_ERROR; } \
494 /* Gets the size of the shortest subtag in the given localeID. */
495 static int32_t getShortestSubtagLength(const char *localeID
) {
496 int32_t localeIDLength
= static_cast<int32_t>(uprv_strlen(localeID
));
497 int32_t length
= localeIDLength
;
498 int32_t tmpLength
= 0;
502 for (i
= 0; i
< localeIDLength
; i
++) {
503 if (localeID
[i
] != '_' && localeID
[i
] != '-') {
510 if (tmpLength
!= 0 && tmpLength
< length
) {
520 /* ### Keywords **************************************************/
/* Non-zero when c is one of the ASCII decimal digits '0'..'9'. */
#define UPRV_ISDIGIT(c) ((c) >= '0' && (c) <= '9')

/* Non-zero when c is an ASCII letter (per uprv_isASCIILetter) or a digit. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c))

/* Punctuation/symbols allowed in legacy key values: '_', '-', '+' and '/'. */
#define UPRV_OK_VALUE_PUNCTUATION(c) \
    ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size of the char buffers that hold one keyword name, terminator included. */
#define ULOC_KEYWORD_BUFFER_LEN 25

/* Number of slots in the keyword list built while parsing a locale ID. */
#define ULOC_MAX_NO_KEYWORDS 25
529 U_CAPI
const char * U_EXPORT2
530 locale_getKeywordsStart(const char *localeID
) {
531 const char *result
= NULL
;
532 if((result
= uprv_strchr(localeID
, '@')) != NULL
) {
535 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
537 /* We do this because the @ sign is variant, and the @ sign used on one
538 EBCDIC machine won't be compiled the same way on other EBCDIC based
540 static const uint8_t ebcdicSigns
[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
541 const uint8_t *charToFind
= ebcdicSigns
;
543 if((result
= uprv_strchr(localeID
, *charToFind
)) != NULL
) {
554 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
555 * @param keywordName incoming name to be canonicalized
556 * @param status return status (keyword too long)
557 * @return length of the keyword name
559 static int32_t locale_canonKeywordName(char *buf
, const char *keywordName
, UErrorCode
*status
)
561 int32_t keywordNameLen
= 0;
563 for (; *keywordName
!= 0; keywordName
++) {
564 if (!UPRV_ISALPHANUM(*keywordName
)) {
565 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* malformed keyword name */
568 if (keywordNameLen
< ULOC_KEYWORD_BUFFER_LEN
- 1) {
569 buf
[keywordNameLen
++] = uprv_tolower(*keywordName
);
571 /* keyword name too long for internal buffer */
572 *status
= U_INTERNAL_PROGRAM_ERROR
;
576 if (keywordNameLen
== 0) {
577 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* empty keyword name */
580 buf
[keywordNameLen
] = 0; /* terminate */
582 return keywordNameLen
;
586 char keyword
[ULOC_KEYWORD_BUFFER_LEN
];
588 const char *valueStart
;
592 static int32_t U_CALLCONV
593 compareKeywordStructs(const void * /*context*/, const void *left
, const void *right
) {
594 const char* leftString
= ((const KeywordStruct
*)left
)->keyword
;
595 const char* rightString
= ((const KeywordStruct
*)right
)->keyword
;
596 return uprv_strcmp(leftString
, rightString
);
600 _getKeywords(const char *localeID
,
602 char *keywords
, int32_t keywordCapacity
,
603 char *values
, int32_t valuesCapacity
, int32_t *valLen
,
607 KeywordStruct keywordList
[ULOC_MAX_NO_KEYWORDS
];
609 int32_t maxKeywords
= ULOC_MAX_NO_KEYWORDS
;
610 int32_t numKeywords
= 0;
611 const char* pos
= localeID
;
612 const char* equalSign
= NULL
;
613 const char* semicolon
= NULL
;
615 int32_t keywordsLen
= 0;
616 int32_t valuesLen
= 0;
618 if(prev
== '@') { /* start of keyword definition */
619 /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
621 UBool duplicate
= FALSE
;
622 /* skip leading spaces */
626 if (!*pos
) { /* handle trailing "; " */
629 if(numKeywords
== maxKeywords
) {
630 *status
= U_INTERNAL_PROGRAM_ERROR
;
633 equalSign
= uprv_strchr(pos
, '=');
634 semicolon
= uprv_strchr(pos
, ';');
635 /* lack of '=' [foo@currency] is illegal */
636 /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
637 if(!equalSign
|| (semicolon
&& semicolon
<equalSign
)) {
638 *status
= U_INVALID_FORMAT_ERROR
;
641 /* need to normalize both keyword and keyword name */
642 if(equalSign
- pos
>= ULOC_KEYWORD_BUFFER_LEN
) {
643 /* keyword name too long for internal buffer */
644 *status
= U_INTERNAL_PROGRAM_ERROR
;
647 for(i
= 0, n
= 0; i
< equalSign
- pos
; ++i
) {
649 keywordList
[numKeywords
].keyword
[n
++] = uprv_tolower(pos
[i
]);
653 /* zero-length keyword is an error. */
655 *status
= U_INVALID_FORMAT_ERROR
;
659 keywordList
[numKeywords
].keyword
[n
] = 0;
660 keywordList
[numKeywords
].keywordLen
= n
;
661 /* now grab the value part. First we skip the '=' */
/* then we skip leading spaces */
664 while(*equalSign
== ' ') {
668 /* Premature end or zero-length value */
669 if (!*equalSign
|| equalSign
== semicolon
) {
670 *status
= U_INVALID_FORMAT_ERROR
;
674 keywordList
[numKeywords
].valueStart
= equalSign
;
679 while(*(pos
- i
- 1) == ' ') {
682 keywordList
[numKeywords
].valueLen
= (int32_t)(pos
- equalSign
- i
);
685 i
= (int32_t)uprv_strlen(equalSign
);
686 while(i
&& equalSign
[i
-1] == ' ') {
689 keywordList
[numKeywords
].valueLen
= i
;
691 /* If this is a duplicate keyword, then ignore it */
692 for (j
=0; j
<numKeywords
; ++j
) {
693 if (uprv_strcmp(keywordList
[j
].keyword
, keywordList
[numKeywords
].keyword
) == 0) {
703 /* now we have a list of keywords */
704 /* we need to sort it */
705 uprv_sortArray(keywordList
, numKeywords
, sizeof(KeywordStruct
), compareKeywordStructs
, NULL
, FALSE
, status
);
707 /* Now construct the keyword part */
708 for(i
= 0; i
< numKeywords
; i
++) {
709 if(keywordsLen
+ keywordList
[i
].keywordLen
+ 1< keywordCapacity
) {
710 uprv_strcpy(keywords
+keywordsLen
, keywordList
[i
].keyword
);
712 keywords
[keywordsLen
+ keywordList
[i
].keywordLen
] = '=';
714 keywords
[keywordsLen
+ keywordList
[i
].keywordLen
] = 0;
717 keywordsLen
+= keywordList
[i
].keywordLen
+ 1;
719 if(keywordsLen
+ keywordList
[i
].valueLen
<= keywordCapacity
) {
720 uprv_strncpy(keywords
+keywordsLen
, keywordList
[i
].valueStart
, keywordList
[i
].valueLen
);
722 keywordsLen
+= keywordList
[i
].valueLen
;
724 if(i
< numKeywords
- 1) {
725 if(keywordsLen
< keywordCapacity
) {
726 keywords
[keywordsLen
] = ';';
732 if(valuesLen
+ keywordList
[i
].valueLen
+ 1< valuesCapacity
) {
733 uprv_strcpy(values
+valuesLen
, keywordList
[i
].valueStart
);
734 values
[valuesLen
+ keywordList
[i
].valueLen
] = 0;
736 valuesLen
+= keywordList
[i
].valueLen
+ 1;
740 values
[valuesLen
] = 0;
745 return u_terminateChars(keywords
, keywordCapacity
, keywordsLen
, status
);
752 locale_getKeywords(const char *localeID
,
754 char *keywords
, int32_t keywordCapacity
,
755 char *values
, int32_t valuesCapacity
, int32_t *valLen
,
757 UErrorCode
*status
) {
758 return _getKeywords(localeID
, prev
, keywords
, keywordCapacity
,
759 values
, valuesCapacity
, valLen
, valuesToo
,
763 U_CAPI
int32_t U_EXPORT2
764 uloc_getKeywordValue(const char* localeID
,
765 const char* keywordName
,
766 char* buffer
, int32_t bufferCapacity
,
769 const char* startSearchHere
= NULL
;
770 const char* nextSeparator
= NULL
;
771 char keywordNameBuffer
[ULOC_KEYWORD_BUFFER_LEN
];
772 char localeKeywordNameBuffer
[ULOC_KEYWORD_BUFFER_LEN
];
775 if(status
&& U_SUCCESS(*status
) && localeID
) {
776 char tempBuffer
[ULOC_FULLNAME_CAPACITY
];
777 const char* tmpLocaleID
;
779 if (keywordName
== NULL
|| keywordName
[0] == 0) {
780 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
784 locale_canonKeywordName(keywordNameBuffer
, keywordName
, status
);
785 if(U_FAILURE(*status
)) {
789 if (_hasBCP47Extension(localeID
)) {
790 _ConvertBCP47(tmpLocaleID
, localeID
, tempBuffer
, sizeof(tempBuffer
), status
);
792 tmpLocaleID
=localeID
;
795 startSearchHere
= locale_getKeywordsStart(tmpLocaleID
);
796 if(startSearchHere
== NULL
) {
797 /* no keywords, return at once */
801 /* find the first keyword */
802 while(startSearchHere
) {
803 const char* keyValueTail
;
806 startSearchHere
++; /* skip @ or ; */
807 nextSeparator
= uprv_strchr(startSearchHere
, '=');
809 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* key must have =value */
812 /* strip leading & trailing spaces (TC decided to tolerate these) */
813 while(*startSearchHere
== ' ') {
816 keyValueTail
= nextSeparator
;
817 while (keyValueTail
> startSearchHere
&& *(keyValueTail
-1) == ' ') {
820 /* now keyValueTail points to first char after the keyName */
821 /* copy & normalize keyName from locale */
822 if (startSearchHere
== keyValueTail
) {
823 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* empty keyword name in passed-in locale */
827 while (startSearchHere
< keyValueTail
) {
828 if (!UPRV_ISALPHANUM(*startSearchHere
)) {
829 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* malformed keyword name */
832 if (keyValueLen
< ULOC_KEYWORD_BUFFER_LEN
- 1) {
833 localeKeywordNameBuffer
[keyValueLen
++] = uprv_tolower(*startSearchHere
++);
835 /* keyword name too long for internal buffer */
836 *status
= U_INTERNAL_PROGRAM_ERROR
;
840 localeKeywordNameBuffer
[keyValueLen
] = 0; /* terminate */
842 startSearchHere
= uprv_strchr(nextSeparator
, ';');
844 if(uprv_strcmp(keywordNameBuffer
, localeKeywordNameBuffer
) == 0) {
845 /* current entry matches the keyword. */
846 nextSeparator
++; /* skip '=' */
847 /* First strip leading & trailing spaces (TC decided to tolerate these) */
848 while(*nextSeparator
== ' ') {
851 keyValueTail
= (startSearchHere
)? startSearchHere
: nextSeparator
+ uprv_strlen(nextSeparator
);
852 while(keyValueTail
> nextSeparator
&& *(keyValueTail
-1) == ' ') {
855 /* Now copy the value, but check well-formedness */
856 if (nextSeparator
== keyValueTail
) {
857 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* empty key value name in passed-in locale */
861 while (nextSeparator
< keyValueTail
) {
862 if (!UPRV_ISALPHANUM(*nextSeparator
) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator
)) {
863 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* malformed key value */
866 if (keyValueLen
< bufferCapacity
) {
867 /* Should we lowercase value to return here? Tests expect as-is. */
868 buffer
[keyValueLen
++] = *nextSeparator
++;
869 } else { /* keep advancing so we return correct length in case of overflow */
874 result
= u_terminateChars(buffer
, bufferCapacity
, keyValueLen
, status
);
882 U_CAPI
int32_t U_EXPORT2
883 uloc_setKeywordValue(const char* keywordName
,
884 const char* keywordValue
,
885 char* buffer
, int32_t bufferCapacity
,
888 /* TODO: sorting. removal. */
889 int32_t keywordNameLen
;
890 int32_t keywordValueLen
;
893 char keywordNameBuffer
[ULOC_KEYWORD_BUFFER_LEN
];
894 char keywordValueBuffer
[ULOC_KEYWORDS_CAPACITY
+1];
895 char localeKeywordNameBuffer
[ULOC_KEYWORD_BUFFER_LEN
];
897 char* nextSeparator
= NULL
;
898 char* nextEqualsign
= NULL
;
899 char* startSearchHere
= NULL
;
900 char* keywordStart
= NULL
;
901 CharString updatedKeysAndValues
;
902 int32_t updatedKeysAndValuesLen
;
903 UBool handledInputKeyAndValue
= FALSE
;
904 char keyValuePrefix
= '@';
906 if(U_FAILURE(*status
)) {
909 if (keywordName
== NULL
|| keywordName
[0] == 0 || bufferCapacity
<= 1) {
910 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
913 bufLen
= (int32_t)uprv_strlen(buffer
);
914 if(bufferCapacity
<bufLen
) {
915 /* The capacity is less than the length?! Is this NULL terminated? */
916 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
919 keywordNameLen
= locale_canonKeywordName(keywordNameBuffer
, keywordName
, status
);
920 if(U_FAILURE(*status
)) {
926 while (*keywordValue
!= 0) {
927 if (!UPRV_ISALPHANUM(*keywordValue
) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue
)) {
928 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* malformed key value */
931 if (keywordValueLen
< ULOC_KEYWORDS_CAPACITY
) {
932 /* Should we force lowercase in value to set? */
933 keywordValueBuffer
[keywordValueLen
++] = *keywordValue
++;
935 /* keywordValue too long for internal buffer */
936 *status
= U_INTERNAL_PROGRAM_ERROR
;
941 keywordValueBuffer
[keywordValueLen
] = 0; /* terminate */
943 startSearchHere
= (char*)locale_getKeywordsStart(buffer
);
944 if(startSearchHere
== NULL
|| (startSearchHere
[1]==0)) {
945 if(keywordValueLen
== 0) { /* no keywords = nothing to remove */
949 needLen
= bufLen
+1+keywordNameLen
+1+keywordValueLen
;
950 if(startSearchHere
) { /* had a single @ */
951 needLen
--; /* already had the @ */
952 /* startSearchHere points at the @ */
954 startSearchHere
=buffer
+bufLen
;
956 if(needLen
>= bufferCapacity
) {
957 *status
= U_BUFFER_OVERFLOW_ERROR
;
958 return needLen
; /* no change */
960 *startSearchHere
++ = '@';
961 uprv_strcpy(startSearchHere
, keywordNameBuffer
);
962 startSearchHere
+= keywordNameLen
;
963 *startSearchHere
++ = '=';
964 uprv_strcpy(startSearchHere
, keywordValueBuffer
);
966 } /* end shortcut - no @ */
968 keywordStart
= startSearchHere
;
969 /* search for keyword */
970 while(keywordStart
) {
971 const char* keyValueTail
;
974 keywordStart
++; /* skip @ or ; */
975 nextEqualsign
= uprv_strchr(keywordStart
, '=');
976 if (!nextEqualsign
) {
977 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* key must have =value */
980 /* strip leading & trailing spaces (TC decided to tolerate these) */
981 while(*keywordStart
== ' ') {
984 keyValueTail
= nextEqualsign
;
985 while (keyValueTail
> keywordStart
&& *(keyValueTail
-1) == ' ') {
988 /* now keyValueTail points to first char after the keyName */
989 /* copy & normalize keyName from locale */
990 if (keywordStart
== keyValueTail
) {
991 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* empty keyword name in passed-in locale */
995 while (keywordStart
< keyValueTail
) {
996 if (!UPRV_ISALPHANUM(*keywordStart
)) {
997 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* malformed keyword name */
1000 if (keyValueLen
< ULOC_KEYWORD_BUFFER_LEN
- 1) {
1001 localeKeywordNameBuffer
[keyValueLen
++] = uprv_tolower(*keywordStart
++);
1003 /* keyword name too long for internal buffer */
1004 *status
= U_INTERNAL_PROGRAM_ERROR
;
1008 localeKeywordNameBuffer
[keyValueLen
] = 0; /* terminate */
1010 nextSeparator
= uprv_strchr(nextEqualsign
, ';');
1012 /* start processing the value part */
1013 nextEqualsign
++; /* skip '=' */
1014 /* First strip leading & trailing spaces (TC decided to tolerate these) */
1015 while(*nextEqualsign
== ' ') {
1018 keyValueTail
= (nextSeparator
)? nextSeparator
: nextEqualsign
+ uprv_strlen(nextEqualsign
);
1019 while(keyValueTail
> nextEqualsign
&& *(keyValueTail
-1) == ' ') {
1022 if (nextEqualsign
== keyValueTail
) {
1023 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* empty key value in passed-in locale */
1027 rc
= uprv_strcmp(keywordNameBuffer
, localeKeywordNameBuffer
);
1029 /* Current entry matches the input keyword. Update the entry */
1030 if(keywordValueLen
> 0) { /* updating a value */
1031 updatedKeysAndValues
.append(keyValuePrefix
, *status
);
1032 keyValuePrefix
= ';'; /* for any subsequent key-value pair */
1033 updatedKeysAndValues
.append(keywordNameBuffer
, keywordNameLen
, *status
);
1034 updatedKeysAndValues
.append('=', *status
);
1035 updatedKeysAndValues
.append(keywordValueBuffer
, keywordValueLen
, *status
);
1036 } /* else removing this entry, don't emit anything */
1037 handledInputKeyAndValue
= TRUE
;
1039 /* input keyword sorts earlier than current entry, add before current entry */
1040 if (rc
< 0 && keywordValueLen
> 0 && !handledInputKeyAndValue
) {
1041 /* insert new entry at this location */
1042 updatedKeysAndValues
.append(keyValuePrefix
, *status
);
1043 keyValuePrefix
= ';'; /* for any subsequent key-value pair */
1044 updatedKeysAndValues
.append(keywordNameBuffer
, keywordNameLen
, *status
);
1045 updatedKeysAndValues
.append('=', *status
);
1046 updatedKeysAndValues
.append(keywordValueBuffer
, keywordValueLen
, *status
);
1047 handledInputKeyAndValue
= TRUE
;
1049 /* copy the current entry */
1050 updatedKeysAndValues
.append(keyValuePrefix
, *status
);
1051 keyValuePrefix
= ';'; /* for any subsequent key-value pair */
1052 updatedKeysAndValues
.append(localeKeywordNameBuffer
, keyValueLen
, *status
);
1053 updatedKeysAndValues
.append('=', *status
);
1054 updatedKeysAndValues
.append(nextEqualsign
, static_cast<int32_t>(keyValueTail
-nextEqualsign
), *status
);
1056 if (!nextSeparator
&& keywordValueLen
> 0 && !handledInputKeyAndValue
) {
1057 /* append new entry at the end, it sorts later than existing entries */
1058 updatedKeysAndValues
.append(keyValuePrefix
, *status
);
1059 /* skip keyValuePrefix update, no subsequent key-value pair */
1060 updatedKeysAndValues
.append(keywordNameBuffer
, keywordNameLen
, *status
);
1061 updatedKeysAndValues
.append('=', *status
);
1062 updatedKeysAndValues
.append(keywordValueBuffer
, keywordValueLen
, *status
);
1063 handledInputKeyAndValue
= TRUE
;
1065 keywordStart
= nextSeparator
;
1066 } /* end loop searching */
1068 /* Any error from updatedKeysAndValues.append above would be internal and not due to
1069 * problems with the passed-in locale. So if we did encounter problems with the
1070 * passed-in locale above, those errors took precedence and overrode any error
1071 * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1072 * are errors here they are from updatedKeysAndValues.append; they do cause an
1073 * error return but the passed-in locale is unmodified and the original bufLen is
1076 if (!handledInputKeyAndValue
|| U_FAILURE(*status
)) {
1077 /* if input key/value specified removal of a keyword not present in locale, or
1078 * there was an error in CharString.append, leave original locale alone. */
1082 updatedKeysAndValuesLen
= updatedKeysAndValues
.length();
1083 /* needLen = length of the part before '@' + length of updated key-value part including '@' */
1084 needLen
= (int32_t)(startSearchHere
- buffer
) + updatedKeysAndValuesLen
;
1085 if(needLen
>= bufferCapacity
) {
1086 *status
= U_BUFFER_OVERFLOW_ERROR
;
1087 return needLen
; /* no change */
1089 if (updatedKeysAndValuesLen
> 0) {
1090 uprv_strncpy(startSearchHere
, updatedKeysAndValues
.data(), updatedKeysAndValuesLen
);
1096 /* ### ID parsing implementation **************************************************/
/* TRUE if 'a' is one of the letters that can begin a special one-letter
 * prefix: 'x', 'X', 'i' or 'I'. The argument is parenthesized so that any
 * expression may be passed; note that it is evaluated more than once. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/* returns TRUE if one of the special prefixes is here (s=string):
 * a prefix letter immediately followed by an ID separator. */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a) (((a)==0)||((a)=='.')||((a)=='@'))
1110 * Lookup 'key' in the array 'list'. The array 'list' should contain
1111 * a NULL entry, followed by more entries, and a second NULL entry.
1113 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1116 static int16_t _findIndex(const char* const* list
, const char* key
)
1118 const char* const* anchor
= list
;
1121 /* Make two passes through two NULL-terminated arrays at 'list' */
1122 while (pass
++ < 2) {
1124 if (uprv_strcmp(key
, *list
) == 0) {
1125 return (int16_t)(list
- anchor
);
1129 ++list
; /* skip final NULL *CWB*/
1134 /* count the length of src while copying it to dest; return strlen(src) */
1135 static inline int32_t
1136 _copyCount(char *dest
, int32_t destCapacity
, const char *src
) {
1143 return (int32_t)(src
-anchor
);
1145 if(destCapacity
<=0) {
1146 return (int32_t)((src
-anchor
)+uprv_strlen(src
));
1155 uloc_getCurrentCountryID(const char* oldID
){
1156 int32_t offset
= _findIndex(DEPRECATED_COUNTRIES
, oldID
);
1158 return REPLACEMENT_COUNTRIES
[offset
];
1163 uloc_getCurrentLanguageID(const char* oldID
){
1164 int32_t offset
= _findIndex(DEPRECATED_LANGUAGES
, oldID
);
1166 return REPLACEMENT_LANGUAGES
[offset
];
1171 * the internal functions ulocimp_getLanguage(), ulocimp_getScript(), ulocimp_getCountry(), _getVariant()
1172 * avoid duplicating code to handle the earlier locale ID pieces
1173 * in the functions for the later ones by
1174 * setting the *pEnd pointer to where they stopped parsing
1176 * TODO try to use this in Locale
1179 ulocimp_getLanguage(const char *localeID
,
1180 char *language
, int32_t languageCapacity
,
1181 const char **pEnd
) {
1184 char lang
[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1186 /* if it starts with i- or x- then copy that prefix */
1187 if(_isIDPrefix(localeID
)) {
1188 if(i
<languageCapacity
) {
1189 language
[i
]=(char)uprv_tolower(*localeID
);
1191 if(i
<languageCapacity
) {
1198 /* copy the language as far as possible and count its length */
1199 while(!_isTerminator(*localeID
) && !_isIDSeparator(*localeID
)) {
1200 if(i
<languageCapacity
) {
1201 language
[i
]=(char)uprv_tolower(*localeID
);
1205 lang
[i
]=(char)uprv_tolower(*localeID
);
1212 /* convert 3 character code to 2 character code if possible *CWB*/
1213 offset
=_findIndex(LANGUAGES_3
, lang
);
1215 i
=_copyCount(language
, languageCapacity
, LANGUAGES
[offset
]);
1226 ulocimp_getScript(const char *localeID
,
1227 char *script
, int32_t scriptCapacity
,
1236 /* copy the second item as far as possible and count its length */
1237 while(!_isTerminator(localeID
[idLen
]) && !_isIDSeparator(localeID
[idLen
])
1238 && uprv_isASCIILetter(localeID
[idLen
])) {
1242 /* If it's exactly 4 characters long, then it's a script and not a country. */
1246 *pEnd
= localeID
+idLen
;
1248 if(idLen
> scriptCapacity
) {
1249 idLen
= scriptCapacity
;
1252 script
[0]=(char)uprv_toupper(*(localeID
++));
1254 for (i
= 1; i
< idLen
; i
++) {
1255 script
[i
]=(char)uprv_tolower(*(localeID
++));
1265 ulocimp_getCountry(const char *localeID
,
1266 char *country
, int32_t countryCapacity
,
1270 char cnty
[ULOC_COUNTRY_CAPACITY
]={ 0, 0, 0, 0 };
1273 /* copy the country as far as possible and count its length */
1274 while(!_isTerminator(localeID
[idLen
]) && !_isIDSeparator(localeID
[idLen
])) {
1275 if(idLen
<(ULOC_COUNTRY_CAPACITY
-1)) { /*CWB*/
1276 cnty
[idLen
]=(char)uprv_toupper(localeID
[idLen
]);
1281 /* the country should be either length 2 or 3 */
1282 if (idLen
== 2 || idLen
== 3) {
1283 UBool gotCountry
= FALSE
;
1284 /* convert 3 character code to 2 character code if possible *CWB*/
1286 offset
=_findIndex(COUNTRIES_3
, cnty
);
1288 idLen
=_copyCount(country
, countryCapacity
, COUNTRIES
[offset
]);
1294 for (i
= 0; i
< idLen
; i
++) {
1295 if (i
< countryCapacity
) {
1296 country
[i
]=(char)uprv_toupper(localeID
[i
]);
1313 * @param needSeparator if true, then add leading '_' if any variants
1314 * are added to 'variant'
1317 _getVariantEx(const char *localeID
,
1319 char *variant
, int32_t variantCapacity
,
1320 UBool needSeparator
) {
1323 /* get one or more variant tags and separate them with '_' */
1324 if(_isIDSeparator(prev
)) {
1325 /* get a variant string after a '-' or '_' */
1326 while(!_isTerminator(*localeID
)) {
1327 if (needSeparator
) {
1328 if (i
<variantCapacity
) {
1332 needSeparator
= FALSE
;
1334 if(i
<variantCapacity
) {
1335 variant
[i
]=(char)uprv_toupper(*localeID
);
1336 if(variant
[i
]=='-') {
1345 /* if there is no variant tag after a '-' or '_' then look for '@' */
1349 } else if((localeID
=locale_getKeywordsStart(localeID
))!=NULL
) {
1350 ++localeID
; /* point after the '@' */
1354 while(!_isTerminator(*localeID
)) {
1355 if (needSeparator
) {
1356 if (i
<variantCapacity
) {
1360 needSeparator
= FALSE
;
1362 if(i
<variantCapacity
) {
1363 variant
[i
]=(char)uprv_toupper(*localeID
);
1364 if(variant
[i
]=='-' || variant
[i
]==',') {
1377 _getVariant(const char *localeID
,
1379 char *variant
, int32_t variantCapacity
) {
1380 return _getVariantEx(localeID
, prev
, variant
, variantCapacity
, FALSE
);
1383 /* Keyword enumeration */
1385 typedef struct UKeywordsContext
{
1392 static void U_CALLCONV
1393 uloc_kw_closeKeywords(UEnumeration
*enumerator
) {
1394 uprv_free(((UKeywordsContext
*)enumerator
->context
)->keywords
);
1395 uprv_free(enumerator
->context
);
1396 uprv_free(enumerator
);
1399 static int32_t U_CALLCONV
1400 uloc_kw_countKeywords(UEnumeration
*en
, UErrorCode
* /*status*/) {
1401 char *kw
= ((UKeywordsContext
*)en
->context
)->keywords
;
1405 kw
+= uprv_strlen(kw
)+1;
1410 static const char * U_CALLCONV
1411 uloc_kw_nextKeyword(UEnumeration
* en
,
1412 int32_t* resultLength
,
1413 UErrorCode
* /*status*/) {
1414 const char* result
= ((UKeywordsContext
*)en
->context
)->current
;
1417 len
= (int32_t)uprv_strlen(((UKeywordsContext
*)en
->context
)->current
);
1418 ((UKeywordsContext
*)en
->context
)->current
+= len
+1;
1423 *resultLength
= len
;
1428 static void U_CALLCONV
1429 uloc_kw_resetKeywords(UEnumeration
* en
,
1430 UErrorCode
* /*status*/) {
1431 ((UKeywordsContext
*)en
->context
)->current
= ((UKeywordsContext
*)en
->context
)->keywords
;
1437 static const UEnumeration gKeywordsEnum
= {
1440 uloc_kw_closeKeywords
,
1441 uloc_kw_countKeywords
,
1443 uloc_kw_nextKeyword
,
1444 uloc_kw_resetKeywords
1447 U_CAPI UEnumeration
* U_EXPORT2
1448 uloc_openKeywordList(const char *keywordList
, int32_t keywordListSize
, UErrorCode
* status
)
1450 UKeywordsContext
*myContext
= NULL
;
1451 UEnumeration
*result
= NULL
;
1453 if(U_FAILURE(*status
)) {
1456 result
= (UEnumeration
*)uprv_malloc(sizeof(UEnumeration
));
1457 /* Null pointer test */
1458 if (result
== NULL
) {
1459 *status
= U_MEMORY_ALLOCATION_ERROR
;
1462 uprv_memcpy(result
, &gKeywordsEnum
, sizeof(UEnumeration
));
1463 myContext
= static_cast<UKeywordsContext
*>(uprv_malloc(sizeof(UKeywordsContext
)));
1464 if (myContext
== NULL
) {
1465 *status
= U_MEMORY_ALLOCATION_ERROR
;
1469 myContext
->keywords
= (char *)uprv_malloc(keywordListSize
+1);
1470 uprv_memcpy(myContext
->keywords
, keywordList
, keywordListSize
);
1471 myContext
->keywords
[keywordListSize
] = 0;
1472 myContext
->current
= myContext
->keywords
;
1473 result
->context
= myContext
;
1477 U_CAPI UEnumeration
* U_EXPORT2
1478 uloc_openKeywords(const char* localeID
,
1483 int32_t keywordsCapacity
= 256;
1484 char tempBuffer
[ULOC_FULLNAME_CAPACITY
];
1485 const char* tmpLocaleID
;
1487 if(status
==NULL
|| U_FAILURE(*status
)) {
1491 if (_hasBCP47Extension(localeID
)) {
1492 _ConvertBCP47(tmpLocaleID
, localeID
, tempBuffer
, sizeof(tempBuffer
), status
);
1494 if (localeID
==NULL
) {
1495 localeID
=uloc_getDefault();
1497 tmpLocaleID
=localeID
;
1500 /* Skip the language */
1501 ulocimp_getLanguage(tmpLocaleID
, NULL
, 0, &tmpLocaleID
);
1502 if(_isIDSeparator(*tmpLocaleID
)) {
1503 const char *scriptID
;
1504 /* Skip the script if available */
1505 ulocimp_getScript(tmpLocaleID
+1, NULL
, 0, &scriptID
);
1506 if(scriptID
!= tmpLocaleID
+1) {
1507 /* Found optional script */
1508 tmpLocaleID
= scriptID
;
1510 /* Skip the Country */
1511 if (_isIDSeparator(*tmpLocaleID
)) {
1512 ulocimp_getCountry(tmpLocaleID
+1, NULL
, 0, &tmpLocaleID
);
1513 if(_isIDSeparator(*tmpLocaleID
)) {
1514 _getVariant(tmpLocaleID
+1, *tmpLocaleID
, NULL
, 0);
1519 /* keywords are located after '@' */
1520 if((tmpLocaleID
= locale_getKeywordsStart(tmpLocaleID
)) != NULL
) {
1521 i
=locale_getKeywords(tmpLocaleID
+1, '@', keywords
, keywordsCapacity
, NULL
, 0, NULL
, FALSE
, status
);
1525 return uloc_openKeywordList(keywords
, i
, status
);
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE 0x1

/* TRUE if 'options' has at least one of the bits in 'mask' set. Both
 * arguments are parenthesized so that compound expressions such as
 * (A | B) keep their meaning after expansion. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The special ID "i-default", stored without a terminating NUL; always use
 * it together with I_DEFAULT_LENGTH. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1542 * Canonicalize the given localeID, to level 1 or to level 2,
1543 * depending on the options. To specify level 1, pass in options=0.
1544 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1546 * This is the code underlying uloc_getName and uloc_canonicalize.
1549 _canonicalize(const char* localeID
,
1551 int32_t resultCapacity
,
1554 int32_t j
, len
, fieldCount
=0, scriptSize
=0, variantSize
=0, nameCapacity
;
1555 char localeBuffer
[ULOC_FULLNAME_CAPACITY
];
1556 char tempBuffer
[ULOC_FULLNAME_CAPACITY
];
1557 const char* origLocaleID
;
1558 const char* tmpLocaleID
;
1559 const char* keywordAssign
= NULL
;
1560 const char* separatorIndicator
= NULL
;
1562 char* variant
= NULL
; /* pointer into name, or NULL */
1564 if (U_FAILURE(*err
)) {
1568 if (_hasBCP47Extension(localeID
)) {
1569 _ConvertBCP47(tmpLocaleID
, localeID
, tempBuffer
, sizeof(tempBuffer
), err
);
1571 if (localeID
==NULL
) {
1572 localeID
=uloc_getDefault();
1574 tmpLocaleID
=localeID
;
1577 origLocaleID
=tmpLocaleID
;
1579 /* if we are doing a full canonicalization, then put results in
1580 localeBuffer, if necessary; otherwise send them to result. */
1581 if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1582 (result
== NULL
|| resultCapacity
< (int32_t)sizeof(localeBuffer
))) {
1583 name
= localeBuffer
;
1584 nameCapacity
= (int32_t)sizeof(localeBuffer
);
1587 nameCapacity
= resultCapacity
;
1590 /* get all pieces, one after another, and separate with '_' */
1591 len
=ulocimp_getLanguage(tmpLocaleID
, name
, nameCapacity
, &tmpLocaleID
);
1593 if(len
== I_DEFAULT_LENGTH
&& uprv_strncmp(origLocaleID
, i_default
, len
) == 0) {
1594 const char *d
= uloc_getDefault();
1596 len
= (int32_t)uprv_strlen(d
);
1599 uprv_memcpy(name
, d
, len
);
1601 } else if(_isIDSeparator(*tmpLocaleID
)) {
1602 const char *scriptID
;
1605 if(len
<nameCapacity
) {
1610 scriptSize
=ulocimp_getScript(tmpLocaleID
+1,
1611 (len
<nameCapacity
? name
+len
: NULL
), nameCapacity
-len
, &scriptID
);
1612 if(scriptSize
> 0) {
1613 /* Found optional script */
1614 tmpLocaleID
= scriptID
;
1617 if (_isIDSeparator(*tmpLocaleID
)) {
1618 /* If there is something else, then we add the _ */
1619 if(len
<nameCapacity
) {
1626 if (_isIDSeparator(*tmpLocaleID
)) {
1627 const char *cntryID
;
1628 int32_t cntrySize
= ulocimp_getCountry(tmpLocaleID
+1,
1629 (len
<nameCapacity
? name
+len
: NULL
), nameCapacity
-len
, &cntryID
);
1630 if (cntrySize
> 0) {
1631 /* Found optional country */
1632 tmpLocaleID
= cntryID
;
1635 if(_isIDSeparator(*tmpLocaleID
)) {
1636 /* If there is something else, then we add the _ if we found country before. */
1637 if (cntrySize
>= 0 && ! _isIDSeparator(*(tmpLocaleID
+1)) ) {
1639 if(len
<nameCapacity
) {
1645 variantSize
= _getVariant(tmpLocaleID
+1, *tmpLocaleID
,
1646 (len
<nameCapacity
? name
+len
: NULL
), nameCapacity
-len
);
1647 if (variantSize
> 0) {
1648 variant
= len
<nameCapacity
? name
+len
: NULL
;
1650 tmpLocaleID
+= variantSize
+ 1; /* skip '_' and variant */
1656 /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1657 if (!OPTION_SET(options
, _ULOC_CANONICALIZE
) && *tmpLocaleID
== '.') {
1660 char c
= *tmpLocaleID
;
1667 if (len
<nameCapacity
) {
1677 /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1678 After this, tmpLocaleID either points to '@' or is NULL */
1679 if ((tmpLocaleID
=locale_getKeywordsStart(tmpLocaleID
))!=NULL
) {
1680 keywordAssign
= uprv_strchr(tmpLocaleID
, '=');
1681 separatorIndicator
= uprv_strchr(tmpLocaleID
, ';');
1684 /* Copy POSIX-style variant, if any [mr@FOO] */
1685 if (!OPTION_SET(options
, _ULOC_CANONICALIZE
) &&
1686 tmpLocaleID
!= NULL
&& keywordAssign
== NULL
) {
1688 char c
= *tmpLocaleID
;
1692 if (len
<nameCapacity
) {
1700 if (OPTION_SET(options
, _ULOC_CANONICALIZE
)) {
1701 /* Handle @FOO variant if @ is present and not followed by = */
1702 if (tmpLocaleID
!=NULL
&& keywordAssign
==NULL
) {
1703 int32_t posixVariantSize
;
1704 /* Add missing '_' if needed */
1705 if (fieldCount
< 2 || (fieldCount
< 3 && scriptSize
> 0)) {
1707 if(len
<nameCapacity
) {
1712 } while(fieldCount
<2);
1714 posixVariantSize
= _getVariantEx(tmpLocaleID
+1, '@', name
+len
, nameCapacity
-len
,
1715 (UBool
)(variantSize
> 0));
1716 if (posixVariantSize
> 0) {
1717 if (variant
== NULL
) {
1720 len
+= posixVariantSize
;
1721 variantSize
+= posixVariantSize
;
1725 /* Look up the ID in the canonicalization map */
1726 for (j
=0; j
<UPRV_LENGTHOF(CANONICALIZE_MAP
); j
++) {
1727 const char* id
= CANONICALIZE_MAP
[j
].id
;
1728 int32_t n
= (int32_t)uprv_strlen(id
);
1729 if (len
== n
&& uprv_strncmp(name
, id
, n
) == 0) {
1730 if (n
== 0 && tmpLocaleID
!= NULL
) {
1731 break; /* Don't remap "" if keywords present */
1733 len
= _copyCount(name
, nameCapacity
, CANONICALIZE_MAP
[j
].canonicalID
);
1739 if (!OPTION_SET(options
, _ULOC_STRIP_KEYWORDS
)) {
1740 if (tmpLocaleID
!=NULL
&& keywordAssign
!=NULL
&&
1741 (!separatorIndicator
|| separatorIndicator
> keywordAssign
)) {
1742 if(len
<nameCapacity
) {
1747 len
+= _getKeywords(tmpLocaleID
+1, '@', (len
<nameCapacity
? name
+len
: NULL
), nameCapacity
-len
,
1748 NULL
, 0, NULL
, TRUE
, err
);
1752 if (U_SUCCESS(*err
) && result
!= NULL
&& name
== localeBuffer
) {
1753 uprv_strncpy(result
, localeBuffer
, (len
> resultCapacity
) ? resultCapacity
: len
);
1756 return u_terminateChars(result
, resultCapacity
, len
, err
);
1759 /* ### ID parsing API **************************************************/
1761 U_CAPI
int32_t U_EXPORT2
1762 uloc_getParent(const char* localeID
,
1764 int32_t parentCapacity
,
1767 const char *lastUnderscore
;
1770 if (U_FAILURE(*err
))
1773 if (localeID
== NULL
)
1774 localeID
= uloc_getDefault();
1776 lastUnderscore
=uprv_strrchr(localeID
, '_');
1777 if(lastUnderscore
!=NULL
) {
1778 i
=(int32_t)(lastUnderscore
-localeID
);
1783 if(i
>0 && parent
!= localeID
) {
1784 uprv_memcpy(parent
, localeID
, uprv_min(i
, parentCapacity
));
1787 return u_terminateChars(parent
, parentCapacity
, i
, err
);
1790 U_CAPI
int32_t U_EXPORT2
1791 uloc_getLanguage(const char* localeID
,
1793 int32_t languageCapacity
,
1796 /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1799 if (err
==NULL
|| U_FAILURE(*err
)) {
1803 if(localeID
==NULL
) {
1804 localeID
=uloc_getDefault();
1807 i
=ulocimp_getLanguage(localeID
, language
, languageCapacity
, NULL
);
1808 return u_terminateChars(language
, languageCapacity
, i
, err
);
1811 U_CAPI
int32_t U_EXPORT2
1812 uloc_getScript(const char* localeID
,
1814 int32_t scriptCapacity
,
1819 if(err
==NULL
|| U_FAILURE(*err
)) {
1823 if(localeID
==NULL
) {
1824 localeID
=uloc_getDefault();
1827 /* skip the language */
1828 ulocimp_getLanguage(localeID
, NULL
, 0, &localeID
);
1829 if(_isIDSeparator(*localeID
)) {
1830 i
=ulocimp_getScript(localeID
+1, script
, scriptCapacity
, NULL
);
1832 return u_terminateChars(script
, scriptCapacity
, i
, err
);
1835 U_CAPI
int32_t U_EXPORT2
1836 uloc_getCountry(const char* localeID
,
1838 int32_t countryCapacity
,
1843 if(err
==NULL
|| U_FAILURE(*err
)) {
1847 if(localeID
==NULL
) {
1848 localeID
=uloc_getDefault();
1851 /* Skip the language */
1852 ulocimp_getLanguage(localeID
, NULL
, 0, &localeID
);
1853 if(_isIDSeparator(*localeID
)) {
1854 const char *scriptID
;
1855 /* Skip the script if available */
1856 ulocimp_getScript(localeID
+1, NULL
, 0, &scriptID
);
1857 if(scriptID
!= localeID
+1) {
1858 /* Found optional script */
1859 localeID
= scriptID
;
1861 if(_isIDSeparator(*localeID
)) {
1862 i
=ulocimp_getCountry(localeID
+1, country
, countryCapacity
, NULL
);
1865 return u_terminateChars(country
, countryCapacity
, i
, err
);
1868 U_CAPI
int32_t U_EXPORT2
1869 uloc_getVariant(const char* localeID
,
1871 int32_t variantCapacity
,
1874 char tempBuffer
[ULOC_FULLNAME_CAPACITY
];
1875 const char* tmpLocaleID
;
1878 if(err
==NULL
|| U_FAILURE(*err
)) {
1882 if (_hasBCP47Extension(localeID
)) {
1883 _ConvertBCP47(tmpLocaleID
, localeID
, tempBuffer
, sizeof(tempBuffer
), err
);
1885 if (localeID
==NULL
) {
1886 localeID
=uloc_getDefault();
1888 tmpLocaleID
=localeID
;
1891 /* Skip the language */
1892 ulocimp_getLanguage(tmpLocaleID
, NULL
, 0, &tmpLocaleID
);
1893 if(_isIDSeparator(*tmpLocaleID
)) {
1894 const char *scriptID
;
1895 /* Skip the script if available */
1896 ulocimp_getScript(tmpLocaleID
+1, NULL
, 0, &scriptID
);
1897 if(scriptID
!= tmpLocaleID
+1) {
1898 /* Found optional script */
1899 tmpLocaleID
= scriptID
;
1901 /* Skip the Country */
1902 if (_isIDSeparator(*tmpLocaleID
)) {
1903 const char *cntryID
;
1904 ulocimp_getCountry(tmpLocaleID
+1, NULL
, 0, &cntryID
);
1905 if (cntryID
!= tmpLocaleID
+1) {
1906 /* Found optional country */
1907 tmpLocaleID
= cntryID
;
1909 if(_isIDSeparator(*tmpLocaleID
)) {
1910 /* If there was no country ID, skip a possible extra IDSeparator */
1911 if (tmpLocaleID
!= cntryID
&& _isIDSeparator(tmpLocaleID
[1])) {
1914 i
=_getVariant(tmpLocaleID
+1, *tmpLocaleID
, variant
, variantCapacity
);
1919 /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
1920 /* if we do not have a variant tag yet then try a POSIX variant after '@' */
1922 if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
1923 i=_getVariant(localeID+1, '@', variant, variantCapacity);
1926 return u_terminateChars(variant
, variantCapacity
, i
, err
);
1929 U_CAPI
int32_t U_EXPORT2
1930 uloc_getName(const char* localeID
,
1932 int32_t nameCapacity
,
1935 return _canonicalize(localeID
, name
, nameCapacity
, 0, err
);
1938 U_CAPI
int32_t U_EXPORT2
1939 uloc_getBaseName(const char* localeID
,
1941 int32_t nameCapacity
,
1944 return _canonicalize(localeID
, name
, nameCapacity
, _ULOC_STRIP_KEYWORDS
, err
);
1947 U_CAPI
int32_t U_EXPORT2
1948 uloc_canonicalize(const char* localeID
,
1950 int32_t nameCapacity
,
1953 return _canonicalize(localeID
, name
, nameCapacity
, _ULOC_CANONICALIZE
, err
);
1956 U_CAPI
const char* U_EXPORT2
1957 uloc_getISO3Language(const char* localeID
)
1960 char lang
[ULOC_LANG_CAPACITY
];
1961 UErrorCode err
= U_ZERO_ERROR
;
1963 if (localeID
== NULL
)
1965 localeID
= uloc_getDefault();
1967 uloc_getLanguage(localeID
, lang
, ULOC_LANG_CAPACITY
, &err
);
1970 offset
= _findIndex(LANGUAGES
, lang
);
1973 return LANGUAGES_3
[offset
];
1976 U_CAPI
const char* U_EXPORT2
1977 uloc_getISO3Country(const char* localeID
)
1980 char cntry
[ULOC_LANG_CAPACITY
];
1981 UErrorCode err
= U_ZERO_ERROR
;
1983 if (localeID
== NULL
)
1985 localeID
= uloc_getDefault();
1987 uloc_getCountry(localeID
, cntry
, ULOC_LANG_CAPACITY
, &err
);
1990 offset
= _findIndex(COUNTRIES
, cntry
);
1994 return COUNTRIES_3
[offset
];
1997 U_CAPI
uint32_t U_EXPORT2
1998 uloc_getLCID(const char* localeID
)
2000 UErrorCode status
= U_ZERO_ERROR
;
2001 char langID
[ULOC_FULLNAME_CAPACITY
];
2004 /* Check for incomplete id. */
2005 if (!localeID
|| uprv_strlen(localeID
) < 2) {
2009 // First, attempt Windows platform lookup if available, but fall
2010 // through to catch any special cases (ICU vs Windows name differences).
2011 lcid
= uprv_convertToLCIDPlatform(localeID
, &status
);
2012 if (U_FAILURE(status
)) {
2016 // Windows found an LCID, return that
2020 uloc_getLanguage(localeID
, langID
, sizeof(langID
), &status
);
2021 if (U_FAILURE(status
) || status
== U_STRING_NOT_TERMINATED_WARNING
) {
2025 if (uprv_strchr(localeID
, '@')) {
2026 // uprv_convertToLCID does not support keywords other than collation.
2027 // Remove all keywords except collation.
2029 char collVal
[ULOC_KEYWORDS_CAPACITY
];
2030 char tmpLocaleID
[ULOC_FULLNAME_CAPACITY
];
2032 len
= uloc_getKeywordValue(localeID
, "collation", collVal
,
2033 UPRV_LENGTHOF(collVal
) - 1, &status
);
2035 if (U_SUCCESS(status
) && len
> 0) {
2038 len
= uloc_getBaseName(localeID
, tmpLocaleID
,
2039 UPRV_LENGTHOF(tmpLocaleID
) - 1, &status
);
2041 if (U_SUCCESS(status
) && len
> 0) {
2042 tmpLocaleID
[len
] = 0;
2044 len
= uloc_setKeywordValue("collation", collVal
, tmpLocaleID
,
2045 UPRV_LENGTHOF(tmpLocaleID
) - len
- 1, &status
);
2047 if (U_SUCCESS(status
) && len
> 0) {
2048 tmpLocaleID
[len
] = 0;
2049 return uprv_convertToLCID(langID
, tmpLocaleID
, &status
);
2054 // fall through - all keywords are simply ignored
2055 status
= U_ZERO_ERROR
;
2058 return uprv_convertToLCID(langID
, localeID
, &status
);
2061 U_CAPI
int32_t U_EXPORT2
2062 uloc_getLocaleForLCID(uint32_t hostid
, char *locale
, int32_t localeCapacity
,
2065 return uprv_convertToPosix(hostid
, locale
, localeCapacity
, status
);
2068 /* ### Default locale **************************************************/
2070 U_CAPI
const char* U_EXPORT2
2073 return locale_get_default();
2076 U_CAPI
void U_EXPORT2
2077 uloc_setDefault(const char* newDefaultLocale
,
2080 if (U_FAILURE(*err
))
2082 /* the error code isn't currently used for anything by this function*/
2084 /* propagate change to C++ */
2085 locale_set_default(newDefaultLocale
);
2089 * Returns a list of all 2-letter language codes defined in ISO 639. This is a pointer
2090 * to an array of pointers to arrays of char. All of these pointers are owned
2091 * by ICU-- do not delete them, and do not write through them. The array is
2092 * terminated with a null pointer.
2094 U_CAPI
const char* const* U_EXPORT2
2095 uloc_getISOLanguages()
2101 * Returns a list of all 2-letter country codes defined in ISO 3166. This is a
2102 * pointer to an array of pointers to arrays of char. All of these pointers are
2103 * owned by ICU-- do not delete them, and do not write through them. The array is
2104 * terminated with a null pointer.
2106 U_CAPI
const char* const* U_EXPORT2
2107 uloc_getISOCountries()
2113 /* this function to be moved into cstring.c later */
2114 static char gDecimal
= 0;
2119 _uloc_strtod(const char *start
, char **end
) {
2126 /* For machines that decide to change the decimal on you,
2127 and try to be too smart with localization.
2128 This normally should be just a '.'. */
2129 sprintf(rep
, "%+1.1f", 1.0);
2133 if(gDecimal
== '.') {
2134 return uprv_strtod(start
, end
); /* fall through to OS */
2136 uprv_strncpy(buf
, start
, 29);
2138 decimal
= uprv_strchr(buf
, '.');
2140 *decimal
= gDecimal
;
2142 return uprv_strtod(start
, end
); /* no decimal point */
2144 rv
= uprv_strtod(buf
, &myEnd
);
2146 *end
= (char*)(start
+(myEnd
-buf
)); /* cast away const (to follow uprv_strtod API.) */
2154 int32_t dummy
; /* to avoid uninitialized memory copy from qsort */
2155 char locale
[ULOC_FULLNAME_CAPACITY
+1];
2158 static int32_t U_CALLCONV
2159 uloc_acceptLanguageCompare(const void * /*context*/, const void *a
, const void *b
)
2161 const _acceptLangItem
*aa
= (const _acceptLangItem
*)a
;
2162 const _acceptLangItem
*bb
= (const _acceptLangItem
*)b
;
2166 rc
= -1; /* A > B */
2167 } else if(bb
->q
> aa
->q
) {
2174 rc
= uprv_stricmp(aa
->locale
, bb
->locale
);
2177 #if defined(ULOC_DEBUG)
2178 /* fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2188 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2191 U_CAPI
int32_t U_EXPORT2
2192 uloc_acceptLanguageFromHTTP(char *result
, int32_t resultAvailable
, UAcceptResult
*outResult
,
2193 const char *httpAcceptLanguage
,
2194 UEnumeration
* availableLocales
,
2197 MaybeStackArray
<_acceptLangItem
, 4> items
; // Struct for collecting items.
2198 char tmp
[ULOC_FULLNAME_CAPACITY
+1];
2200 const char *itemEnd
;
2201 const char *paramEnd
;
2206 int32_t l
= (int32_t)uprv_strlen(httpAcceptLanguage
);
2208 if(U_FAILURE(*status
)) {
2212 for(s
=httpAcceptLanguage
;s
&&*s
;) {
2213 while(isspace(*s
)) /* eat space at the beginning */
2215 itemEnd
=uprv_strchr(s
,',');
2216 paramEnd
=uprv_strchr(s
,';');
2218 itemEnd
= httpAcceptLanguage
+l
; /* end of string */
2220 if(paramEnd
&& paramEnd
<itemEnd
) {
2221 /* semicolon (;) is closer than end (,) */
2226 while(isspace(*t
)) {
2232 while(isspace(*t
)) {
2235 items
[n
].q
= (float)_uloc_strtod(t
,NULL
);
2237 /* no semicolon - it's 1.0 */
2242 /* eat spaces prior to semi */
2243 for(t
=(paramEnd
-1);(paramEnd
>s
)&&isspace(*t
);t
--)
2245 int32_t slen
= static_cast<int32_t>(((t
+1)-s
));
2246 if(slen
> ULOC_FULLNAME_CAPACITY
) {
2247 *status
= U_BUFFER_OVERFLOW_ERROR
;
2248 return -1; // too big
2250 uprv_strncpy(items
[n
].locale
, s
, slen
);
2251 items
[n
].locale
[slen
]=0; // terminate
2252 int32_t clen
= uloc_canonicalize(items
[n
].locale
, tmp
, UPRV_LENGTHOF(tmp
)-1, status
);
2253 if(U_FAILURE(*status
)) return -1;
2254 if((clen
!=slen
) || (uprv_strncmp(items
[n
].locale
, tmp
, slen
))) {
2255 // canonicalization had an effect- copy back
2256 uprv_strncpy(items
[n
].locale
, tmp
, clen
);
2257 items
[n
].locale
[clen
] = 0; // terminate
2259 #if defined(ULOC_DEBUG)
2260 /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2264 while(*s
==',') { /* eat duplicate commas */
2267 if(n
>=items
.getCapacity()) { // If we need more items
2268 if(NULL
== items
.resize(items
.getCapacity()*2, items
.getCapacity())) {
2269 *status
= U_MEMORY_ALLOCATION_ERROR
;
2272 #if defined(ULOC_DEBUG)
2273 fprintf(stderr
,"malloced at size %d\n", items
.getCapacity());
2277 uprv_sortArray(items
.getAlias(), n
, sizeof(items
[0]), uloc_acceptLanguageCompare
, NULL
, TRUE
, status
);
2278 if (U_FAILURE(*status
)) {
2281 LocalMemory
<const char*> strs(NULL
);
2282 if (strs
.allocateInsteadAndReset(n
) == NULL
) {
2283 *status
= U_MEMORY_ALLOCATION_ERROR
;
2287 #if defined(ULOC_DEBUG)
2288 /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2290 strs
[i
]=items
[i
].locale
;
2292 res
= uloc_acceptLanguage(result
, resultAvailable
, outResult
,
2293 strs
.getAlias(), n
, availableLocales
, status
);
2298 U_CAPI
int32_t U_EXPORT2
2299 uloc_acceptLanguage(char *result
, int32_t resultAvailable
,
2300 UAcceptResult
*outResult
, const char **acceptList
,
2301 int32_t acceptListCount
,
2302 UEnumeration
* availableLocales
,
2308 char tmp
[ULOC_FULLNAME_CAPACITY
+1];
2310 char **fallbackList
;
2311 if(U_FAILURE(*status
)) {
2314 fallbackList
= static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList
[0])*acceptListCount
)));
2315 if(fallbackList
==NULL
) {
2316 *status
= U_MEMORY_ALLOCATION_ERROR
;
2319 for(i
=0;i
<acceptListCount
;i
++) {
2320 #if defined(ULOC_DEBUG)
2321 fprintf(stderr
,"%02d: %s\n", i
, acceptList
[i
]);
2323 while((l
=uenum_next(availableLocales
, NULL
, status
)) != NULL
) {
2324 #if defined(ULOC_DEBUG)
2325 fprintf(stderr
," %s\n", l
);
2327 len
= (int32_t)uprv_strlen(l
);
2328 if(!uprv_strcmp(acceptList
[i
], l
)) {
2330 *outResult
= ULOC_ACCEPT_VALID
;
2332 #if defined(ULOC_DEBUG)
2333 fprintf(stderr
, "MATCH! %s\n", l
);
2336 uprv_strncpy(result
, l
, uprv_min(len
, resultAvailable
));
2339 uprv_free(fallbackList
[j
]);
2341 uprv_free(fallbackList
);
2342 return u_terminateChars(result
, resultAvailable
, len
, status
);
2348 uenum_reset(availableLocales
, status
);
2349 /* save off parent info */
2350 if(uloc_getParent(acceptList
[i
], tmp
, UPRV_LENGTHOF(tmp
), status
)!=0) {
2351 fallbackList
[i
] = uprv_strdup(tmp
);
2357 for(maxLen
--;maxLen
>0;maxLen
--) {
2358 for(i
=0;i
<acceptListCount
;i
++) {
2359 if(fallbackList
[i
] && ((int32_t)uprv_strlen(fallbackList
[i
])==maxLen
)) {
2360 #if defined(ULOC_DEBUG)
2361 fprintf(stderr
,"Try: [%s]", fallbackList
[i
]);
2363 while((l
=uenum_next(availableLocales
, NULL
, status
)) != NULL
) {
2364 #if defined(ULOC_DEBUG)
2365 fprintf(stderr
," %s\n", l
);
2367 len
= (int32_t)uprv_strlen(l
);
2368 if(!uprv_strcmp(fallbackList
[i
], l
)) {
2370 *outResult
= ULOC_ACCEPT_FALLBACK
;
2372 #if defined(ULOC_DEBUG)
2373 fprintf(stderr
, "fallback MATCH! %s\n", l
);
2376 uprv_strncpy(result
, l
, uprv_min(len
, resultAvailable
));
2378 for(j
=0;j
<acceptListCount
;j
++) {
2379 uprv_free(fallbackList
[j
]);
2381 uprv_free(fallbackList
);
2382 return u_terminateChars(result
, resultAvailable
, len
, status
);
2385 uenum_reset(availableLocales
, status
);
2387 if(uloc_getParent(fallbackList
[i
], tmp
, UPRV_LENGTHOF(tmp
), status
)!=0) {
2388 uprv_free(fallbackList
[i
]);
2389 fallbackList
[i
] = uprv_strdup(tmp
);
2391 uprv_free(fallbackList
[i
]);
2397 *outResult
= ULOC_ACCEPT_FAILED
;
2400 for(i
=0;i
<acceptListCount
;i
++) {
2401 uprv_free(fallbackList
[i
]);
2403 uprv_free(fallbackList
);
2407 U_CAPI
const char* U_EXPORT2
2408 uloc_toUnicodeLocaleKey(const char* keyword
)
2410 const char* bcpKey
= ulocimp_toBcpKey(keyword
);
2411 if (bcpKey
== NULL
&& ultag_isUnicodeLocaleKey(keyword
, -1)) {
2412 // unknown keyword, but syntax is fine..
2418 U_CAPI
const char* U_EXPORT2
2419 uloc_toUnicodeLocaleType(const char* keyword
, const char* value
)
2421 const char* bcpType
= ulocimp_toBcpType(keyword
, value
, NULL
, NULL
);
2422 if (bcpType
== NULL
&& ultag_isUnicodeLocaleType(value
, -1)) {
2423 // unknown keyword, but syntax is fine..
2430 isWellFormedLegacyKey(const char* legacyKey
)
2432 const char* p
= legacyKey
;
2434 if (!UPRV_ISALPHANUM(*p
)) {
2443 isWellFormedLegacyType(const char* legacyType
)
2445 const char* p
= legacyType
;
2446 int32_t alphaNumLen
= 0;
2448 if (*p
== '_' || *p
== '/' || *p
== '-') {
2449 if (alphaNumLen
== 0) {
2453 } else if (UPRV_ISALPHANUM(*p
)) {
2460 return (alphaNumLen
!= 0);
2463 U_CAPI
const char* U_EXPORT2
2464 uloc_toLegacyKey(const char* keyword
)
2466 const char* legacyKey
= ulocimp_toLegacyKey(keyword
);
2467 if (legacyKey
== NULL
) {
2468 // Checks if the specified locale key is well-formed with the legacy locale syntax.
2471 // LDML/CLDR provides some definition of keyword syntax in
2472 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2473 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2474 // Keys can only consist of [0-9a-zA-Z].
2475 if (isWellFormedLegacyKey(keyword
)) {
2482 U_CAPI
const char* U_EXPORT2
2483 uloc_toLegacyType(const char* keyword
, const char* value
)
2485 const char* legacyType
= ulocimp_toLegacyType(keyword
, value
, NULL
, NULL
);
2486 if (legacyType
== NULL
) {
2487 // Checks if the specified locale type is well-formed with the legacy locale syntax.
2490 // LDML/CLDR provides some definition of keyword syntax in
2491 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2492 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2493 // Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2494 // we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2495 if (isWellFormedLegacyType(value
)) {