1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 1997-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
11 * Modification History:
13 * Date Name Description
14 * 04/01/97 aliu Creation.
15 * 08/21/98 stephen JDK 1.2 sync
16 * 12/08/98 rtg New Locale implementation and C API
17 * 03/15/99 damiba overhaul.
18 * 04/06/99 stephen changed setDefault() to realloc and copy
19 * 06/14/99 stephen Changed calls to ures_open for new params
20 * 07/21/99 stephen Modified setDefault() to propagate to C++
21 * 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
22 * brought canonicalization code into line with spec
23 *****************************************************************************/
26 POSIX's locale format, from putil.c: [no spaces]
28 ll [ _CC ] [ . MM ] [ @ VV]
30 l = lang, C = ctry, M = charmap, V = variant
33 #include "unicode/utypes.h"
34 #include "unicode/ustring.h"
35 #include "unicode/uloc.h"
49 #include <stdio.h> /* for sprintf */
53 /* ### Declarations **************************************************/
55 /* Locale stuff from locid.cpp */
56 U_CFUNC
void locale_set_default(const char *id
);
57 U_CFUNC
const char *locale_get_default(void);
59 locale_getKeywords(const char *localeID
,
61 char *keywords
, int32_t keywordCapacity
,
62 char *values
, int32_t valuesCapacity
, int32_t *valLen
,
66 /* ### Data tables **************************************************/
69 * Table of language codes, both 2- and 3-letter, with preference
70 * given to 2-letter codes where possible. Includes 3-letter codes
71 * that lack a 2-letter equivalent.
73 * This list must be in sorted order. This list is returned directly
74 * to the user by some API.
76 * This list must be kept in sync with LANGUAGES_3, with corresponding
79 * This table should be terminated with a NULL entry, followed by a
80 * second list, and another NULL entry. The first list is visible to
81 * user code when this array is returned by API. The second list
82 * contains codes we support, but do not expose through user API.
86 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
87 * include the revisions up to 2001/7/27 *CWB*
89 * The 3 character codes are the terminology codes like RFC 3066. This
90 * is compatible with prior ICU codes
92 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
93 * table but now at the end of the table because 3 character codes are
94 * duplicates. This avoids bad searches going from 3 to 2 character
97 * The range qaa-qtz is reserved for local use
99 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
100 /* ISO639 table version is 20150505 */
101 /* Subsequent hand addition of selected languages */
102 static const char * const LANGUAGES
[] = {
103 "aa", "ab", "ace", "ach", "ada", "ady", "ae", "aeb",
104 "af", "afh", "agq", "ain", "ak", "akk", "akz", "ale",
105 "aln", "alt", "am", "an", "ang", "anp", "ar", "arc",
106 "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
107 "asa", "ase", "ast", "av", "avk", "awa", "ay", "az",
108 "ba", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
109 "be", "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
110 "bgn", "bho", "bi", "bik", "bin", "bjn", "bkm", "bla",
111 "bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh",
112 "brx", "bs", "bss", "bua", "bug", "bum", "byn", "byv",
113 "ca", "cad", "car", "cay", "cch", "ccp", "ce", "ceb", "cgg",
114 "ch", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
115 "chr", "chy", "ckb", "co", "cop", "cps", "cr", "crh",
116 "cs", "csb", "cu", "cv", "cy",
117 "da", "dak", "dar", "dav", "de", "del", "den", "dgr",
118 "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
119 "dyo", "dyu", "dz", "dzg",
120 "ebu", "ee", "efi", "egl", "egy", "eka", "el", "elx",
121 "en", "enm", "eo", "es", "esu", "et", "eu", "ewo",
123 "fa", "fan", "fat", "ff", "fi", "fil", "fit", "fj",
124 "fo", "fon", "fr", "frc", "frm", "fro", "frp", "frr",
126 "ga", "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
127 "gez", "gil", "gl", "glk", "gmh", "gn", "goh", "gom",
128 "gon", "gor", "got", "grb", "grc", "gsw", "gu", "guc",
129 "gur", "guz", "gv", "gwi",
130 "ha", "hai", "hak", "haw", "he", "hi", "hif", "hil",
131 "hit", "hmn", "ho", "hr", "hsb", "hsn", "ht", "hu",
133 "ia", "iba", "ibb", "id", "ie", "ig", "ii", "ik",
134 "ilo", "inh", "io", "is", "it", "iu", "izh",
135 "ja", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
137 "ka", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
138 "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg", "kgp",
139 "kha", "kho", "khq", "khw", "ki", "kiu", "kj", "kk",
140 "kkj", "kl", "kln", "km", "kmb", "kn", "ko", "koi",
141 "kok", "kos", "kpe", "kr", "krc", "kri", "krj", "krl",
142 "kru", "ks", "ksb", "ksf", "ksh", "ku", "kum", "kut",
144 "la", "lad", "lag", "lah", "lam", "lb", "lez", "lfn",
145 "lg", "li", "lij", "liv", "lkt", "lmo", "ln", "lo",
146 "lol", "loz", "lrc", "lt", "ltg", "lu", "lua", "lui",
147 "lun", "luo", "lus", "luy", "lv", "lzh", "lzz",
148 "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
149 "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg", "mga",
150 "mgh", "mgo", "mh", "mi", "mic", "min", "mis", "mk",
151 "ml", "mn", "mnc", "mni", "mo",
152 "moh", "mos", "mr", "mrj",
153 "ms", "mt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
154 "my", "mye", "myv", "mzn",
155 "na", "nan", "nap", "naq", "nb", "nd", "nds", "ne",
156 "new", "ng", "nia", "niu", "njo", "nl", "nmg", "nn",
157 "nnh", "no", "nog", "non", "nov", "nqo", "nr", "nso",
158 "nus", "nv", "nwc", "ny", "nym", "nyn", "nyo", "nzi",
159 "oc", "oj", "om", "or", "os", "osa", "ota",
160 "pa", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
161 "pdt", "peo", "pfl", "phn", "pi", "pl", "pms", "pnt",
162 "pon", "prg", "pro", "ps", "pt",
164 "raj", "rap", "rar", "rgn", "rif", "rm", "rn", "ro",
165 "rof", "rom", "rtm", "ru", "rue", "rug", "rup",
167 "sa", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
168 "sba", "sbp", "sc", "scn", "sco", "sd", "sdc", "sdh",
169 "se", "see", "seh", "sei", "sel", "ses", "sg", "sga",
170 "sgs", "shi", "shn", "shu", "si", "sid", "sk",
171 "sl", "sli", "sly", "sm", "sma", "smj", "smn", "sms",
172 "sn", "snk", "so", "sog", "sq", "sr", "srn", "srr",
173 "ss", "ssy", "st", "stq", "su", "suk", "sus", "sux",
174 "sv", "sw", "swb", "swc", "syc", "syr", "szl",
175 "ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg",
176 "th", "ti", "tig", "tiv", "tk", "tkl", "tkr", "tl",
177 "tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tpi",
178 "tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt",
179 "tum", "tvl", "tw", "twq", "ty", "tyv", "tzm",
180 "udm", "ug", "uga", "uk", "umb", "und", "ur", "uz",
181 "vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vo",
183 "wa", "wae", "wal", "war", "was", "wbp", "wo", "wuu",
184 "xal", "xh", "xmf", "xog",
185 "yao", "yap", "yav", "ybb", "yi", "yo", "yrl", "yue",
186 "za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu",
189 "in", "iw", "ji", "jw", "sh", /* obsolete language codes */
193 static const char* const DEPRECATED_LANGUAGES
[]={
194 "in", "iw", "ji", "jw", NULL
, NULL
196 static const char* const REPLACEMENT_LANGUAGES
[]={
197 "id", "he", "yi", "jv", NULL
, NULL
201 * Table of 3-letter language codes.
203 * This is a lookup table used to convert 3-letter language codes to
204 * their 2-letter equivalent, where possible. It must be kept in sync
205 * with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the
206 * same language as LANGUAGES_3[i]. The commented-out lines are
207 * copied from LANGUAGES to make eyeballing this baby easier.
209 * Where a 3-letter language code has no 2-letter equivalent, the
210 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
212 * This table should be terminated with a NULL entry, followed by a
213 * second list, and another NULL entry. The two lists correspond to
214 * the two lists in LANGUAGES.
216 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
217 /* ISO639 table version is 20150505 */
218 /* Subsequent hand addition of selected languages */
219 static const char * const LANGUAGES_3
[] = {
220 "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
221 "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
222 "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
223 "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
224 "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
225 "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
226 "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
227 "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
228 "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
229 "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
230 "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
231 "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
232 "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
233 "ces", "csb", "chu", "chv", "cym",
234 "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
235 "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
236 "dyo", "dyu", "dzo", "dzg",
237 "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
238 "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
240 "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
241 "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
243 "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
244 "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
245 "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
246 "gur", "guz", "glv", "gwi",
247 "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
248 "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
250 "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
251 "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
252 "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
254 "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
255 "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
256 "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
257 "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
258 "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
259 "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
261 "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
262 "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
263 "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
264 "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
265 "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
266 "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
267 "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
268 "mal", "mon", "mnc", "mni", "mol",
269 "moh", "mos", "mar", "mrj",
270 "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
271 "mya", "mye", "myv", "mzn",
272 "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
273 "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
274 "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
275 "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
276 "oci", "oji", "orm", "ori", "oss", "osa", "ota",
277 "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
278 "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
279 "pon", "prg", "pro", "pus", "por",
281 "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
282 "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
284 "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
285 "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
286 "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
287 "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
288 "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
289 "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
290 "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
291 "swe", "swa", "swb", "swc", "syc", "syr", "szl",
292 "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
293 "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
294 "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
295 "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
296 "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
297 "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
298 "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
300 "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
301 "xal", "xho", "xmf", "xog",
302 "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
303 "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
306 /* "in", "iw", "ji", "jw", "sh", */
307 "ind", "heb", "yid", "jaw", "srp",
312 * Table of 2-letter country codes.
314 * This list must be in sorted order. This list is returned directly
315 * to the user by some API.
317 * This list must be kept in sync with COUNTRIES_3, with corresponding
320 * This table should be terminated with a NULL entry, followed by a
321 * second list, and another NULL entry. The first list is visible to
322 * user code when this array is returned by API. The second list
323 * contains codes we support, but do not expose through user API.
327 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
328 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
329 * new codes keeping the old ones for compatibility updated to include
330 * 1999/12/03 revisions *CWB*
332 * RO(ROM) is now RO(ROU) according to
333 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
335 static const char * const COUNTRIES
[] = {
336 "AC", "AD", "AE", "AF", "AG", "AI", "AL", "AM",
337 "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ",
338 "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI",
339 "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV",
340 "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG",
341 "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CP", "CR",
342 "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DG", "DJ", "DK",
343 "DM", "DO", "DZ", "EA", "EC", "EE", "EG", "EH", "ER",
344 "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR",
345 "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL",
346 "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU",
347 "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU",
348 "IC", "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS",
349 "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI",
350 "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA",
351 "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU",
352 "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK",
353 "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS",
354 "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA",
355 "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP",
356 "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG",
357 "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT",
358 "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA",
359 "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ",
360 "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV",
361 "SX", "SY", "SZ", "TA", "TC", "TD", "TF", "TG", "TH", "TJ",
362 "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV",
363 "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ",
364 "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF",
365 "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW",
367 "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR", /* obsolete country codes */
371 static const char* const DEPRECATED_COUNTRIES
[] = {
372 "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL
, NULL
/* deprecated country list */
374 static const char* const REPLACEMENT_COUNTRIES
[] = {
375 /* "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
376 "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL
, NULL
/* replacement country codes */
380 * Table of 3-letter country codes.
382 * This is a lookup table used to convert 3-letter country codes to
383 * their 2-letter equivalent. It must be kept in sync with COUNTRIES.
384 * For all valid i, COUNTRIES[i] must refer to the same country as
385 * COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
386 * to make eyeballing this baby easier.
388 * This table should be terminated with a NULL entry, followed by a
389 * second list, and another NULL entry. The two lists correspond to
390 * the two lists in COUNTRIES.
392 static const char * const COUNTRIES_3
[] = {
393 /* "AC", "AD", "AE", "AF", "AG", "AI", "AL", "AM", */
394 "ASC", "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
395 /* "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ", */
396 "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
397 /* "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", */
398 "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
399 /* "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV", */
400 "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
401 /* "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG", */
402 "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
403 /* "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CP", "CR", */
404 "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CPT", "CRI",
405 /* "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DG", "DJ", "DK", */
406 "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
407 /* "DM", "DO", "DZ", "EA", "EC", "EE", "EG", "EH", "ER", */
408 "DMA", "DOM", "DZA", "EA ", "ECU", "EST", "EGY", "ESH", "ERI", /* no valid 3-letter code for EA */
409 /* "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR", */
410 "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
411 /* "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL", */
412 "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
413 /* "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU", */
414 "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
415 /* "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU", */
416 "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
417 /* "IC", "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS" */
418 "IC ", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL", /* no valid 3-letter code for IC */
419 /* "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI", */
420 "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
421 /* "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA", */
422 "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
423 /* "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", */
424 "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
425 /* "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK", */
426 "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
427 /* "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS", */
428 "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
429 /* "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA", */
430 "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
431 /* "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP", */
432 "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
433 /* "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG", */
434 "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
435 /* "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT", */
436 "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
437 /* "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA", */
438 "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
439 /* "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ", */
440 "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
441 /* "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV", */
442 "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
443 /* "SX", "SY", "SZ", "TA", "TC", "TD", "TF", "TG", "TH", "TJ", */
444 "SXM", "SYR", "SWZ", "TAA", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
445 /* "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV", */
446 "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
447 /* "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ", */
448 "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
449 /* "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", */
450 "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
451 /* "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW", */
452 "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
454 /* "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR" */
455 "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
/*
 * One row of a locale-ID canonicalization table: a source ID paired with
 * the ID it should be rewritten to.
 */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
} CanonicalizationMap;
465 * A map to canonicalize locale IDs. This handles a variety of
466 * different semantic kinds of transformations.
468 static const CanonicalizationMap CANONICALIZE_MAP
[] = {
469 { "", "en_US_POSIX" }, /* .NET name */ // open ICU 64 deleted, we restore
470 { "c", "en_US_POSIX" }, /* POSIX name */ // open ICU 64 deleted, we restore
471 { "posix", "en_US_POSIX" }, /* POSIX name (alias of C) */ // open ICU 64 deleted, we restore
472 { "art_LOJBAN", "jbo" }, /* registered name */
473 { "hy__AREVELA", "hy" }, /* Registered IANA variant */
474 { "hy__AREVMDA", "hyw" }, /* Registered IANA variant */
475 { "zh_GAN", "gan" }, /* registered name */
476 { "zh_GUOYU", "zh" }, /* registered name */
477 { "zh_HAKKA", "hak" }, /* registered name */
478 { "zh_MIN_NAN", "nan" }, /* registered name */
479 { "zh_WUU", "wuu" }, /* registered name */
480 { "zh_XIANG", "hsn" }, /* registered name */
481 { "zh_YUE", "yue" }, /* registered name */
484 /* ### BCP47 Conversion *******************************************/
/*
 * Test whether a locale id looks like a BCP47 tag carrying an extension:
 * the id is non-NULL, contains no '@' (i.e. no ICU-style keyword section),
 * and its shortest subtag, as reported by getShortestSubtagLength(), is a
 * single character.
 *
 * The argument is expanded more than once, so it must not have side
 * effects. The macro uses only its own parameter; it no longer depends on
 * the caller having a variable named `localeID` in scope.
 */
#define _hasBCP47Extension(id) \
    ((id) != NULL && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
487 /* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
488 #define _ConvertBCP47(finalID, id, buffer, length,err) UPRV_BLOCK_MACRO_BEGIN { \
489 if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 || \
490 U_FAILURE(*err) || *err == U_STRING_NOT_TERMINATED_WARNING) { \
492 if (*err == U_STRING_NOT_TERMINATED_WARNING) { *err = U_BUFFER_OVERFLOW_ERROR; } \
496 } UPRV_BLOCK_MACRO_END
497 /* Gets the size of the shortest subtag in the given localeID. */
498 static int32_t getShortestSubtagLength(const char *localeID
) {
499 int32_t localeIDLength
= static_cast<int32_t>(uprv_strlen(localeID
));
500 int32_t length
= localeIDLength
;
501 int32_t tmpLength
= 0;
505 for (i
= 0; i
< localeIDLength
; i
++) {
506 if (localeID
[i
] != '_' && localeID
[i
] != '-') {
513 if (tmpLength
!= 0 && tmpLength
< length
) {
523 /* ### Keywords **************************************************/
/* Non-zero when c is an ASCII decimal digit. The argument is expanded twice. */
#define UPRV_ISDIGIT(c) ('0' <= (c) && (c) <= '9')

/* Non-zero when c is an ASCII letter (per uprv_isASCIILetter) or an ASCII digit. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c))

/* Punctuation/symbols allowed in legacy key values: '_', '-', '+' and '/'. */
#define UPRV_OK_VALUE_PUNCTUATION(c) \
    ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size, in chars, of the fixed buffers that hold one keyword name. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Number of slots in the fixed keyword list built while parsing a locale ID. */
#define ULOC_MAX_NO_KEYWORDS 25
532 U_CAPI
const char * U_EXPORT2
533 locale_getKeywordsStart(const char *localeID
) {
534 const char *result
= NULL
;
535 if((result
= uprv_strchr(localeID
, '@')) != NULL
) {
538 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
540 /* We do this because the @ sign is variant, and the @ sign used on one
541 EBCDIC machine won't be compiled the same way on other EBCDIC based
543 static const uint8_t ebcdicSigns
[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
544 const uint8_t *charToFind
= ebcdicSigns
;
546 if((result
= uprv_strchr(localeID
, *charToFind
)) != NULL
) {
557 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
558 * @param keywordName incoming name to be canonicalized
559 * @param status return status (keyword too long)
560 * @return length of the keyword name
562 static int32_t locale_canonKeywordName(char *buf
, const char *keywordName
, UErrorCode
*status
)
564 int32_t keywordNameLen
= 0;
566 for (; *keywordName
!= 0; keywordName
++) {
567 if (!UPRV_ISALPHANUM(*keywordName
)) {
568 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* malformed keyword name */
571 if (keywordNameLen
< ULOC_KEYWORD_BUFFER_LEN
- 1) {
572 buf
[keywordNameLen
++] = uprv_tolower(*keywordName
);
574 /* keyword name too long for internal buffer */
575 *status
= U_INTERNAL_PROGRAM_ERROR
;
579 if (keywordNameLen
== 0) {
580 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* empty keyword name */
583 buf
[keywordNameLen
] = 0; /* terminate */
585 return keywordNameLen
;
589 char keyword
[ULOC_KEYWORD_BUFFER_LEN
];
591 const char *valueStart
;
595 static int32_t U_CALLCONV
596 compareKeywordStructs(const void * /*context*/, const void *left
, const void *right
) {
597 const char* leftString
= ((const KeywordStruct
*)left
)->keyword
;
598 const char* rightString
= ((const KeywordStruct
*)right
)->keyword
;
599 return uprv_strcmp(leftString
, rightString
);
603 _getKeywords(const char *localeID
,
605 char *keywords
, int32_t keywordCapacity
,
606 char *values
, int32_t valuesCapacity
, int32_t *valLen
,
610 KeywordStruct keywordList
[ULOC_MAX_NO_KEYWORDS
];
612 int32_t maxKeywords
= ULOC_MAX_NO_KEYWORDS
;
613 int32_t numKeywords
= 0;
614 const char* pos
= localeID
;
615 const char* equalSign
= NULL
;
616 const char* semicolon
= NULL
;
618 int32_t keywordsLen
= 0;
619 int32_t valuesLen
= 0;
621 if(prev
== '@') { /* start of keyword definition */
622 /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
624 UBool duplicate
= FALSE
;
625 /* skip leading spaces */
629 if (!*pos
) { /* handle trailing "; " */
632 if(numKeywords
== maxKeywords
) {
633 *status
= U_INTERNAL_PROGRAM_ERROR
;
636 equalSign
= uprv_strchr(pos
, '=');
637 semicolon
= uprv_strchr(pos
, ';');
638 /* lack of '=' [foo@currency] is illegal */
639 /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
640 if(!equalSign
|| (semicolon
&& semicolon
<equalSign
)) {
641 *status
= U_INVALID_FORMAT_ERROR
;
644 /* need to normalize both keyword and keyword name */
645 if(equalSign
- pos
>= ULOC_KEYWORD_BUFFER_LEN
) {
646 /* keyword name too long for internal buffer */
647 *status
= U_INTERNAL_PROGRAM_ERROR
;
650 for(i
= 0, n
= 0; i
< equalSign
- pos
; ++i
) {
652 keywordList
[numKeywords
].keyword
[n
++] = uprv_tolower(pos
[i
]);
656 /* zero-length keyword is an error. */
658 *status
= U_INVALID_FORMAT_ERROR
;
662 keywordList
[numKeywords
].keyword
[n
] = 0;
663 keywordList
[numKeywords
].keywordLen
= n
;
664 /* now grab the value part. First we skip the '=' */
666 /* then we skip leading spaces */
667 while(*equalSign
== ' ') {
671 /* Premature end or zero-length value */
672 if (!*equalSign
|| equalSign
== semicolon
) {
673 *status
= U_INVALID_FORMAT_ERROR
;
677 keywordList
[numKeywords
].valueStart
= equalSign
;
682 while(*(pos
- i
- 1) == ' ') {
685 keywordList
[numKeywords
].valueLen
= (int32_t)(pos
- equalSign
- i
);
688 i
= (int32_t)uprv_strlen(equalSign
);
689 while(i
&& equalSign
[i
-1] == ' ') {
692 keywordList
[numKeywords
].valueLen
= i
;
694 /* If this is a duplicate keyword, then ignore it */
695 for (j
=0; j
<numKeywords
; ++j
) {
696 if (uprv_strcmp(keywordList
[j
].keyword
, keywordList
[numKeywords
].keyword
) == 0) {
706 /* now we have a list of keywords */
707 /* we need to sort it */
708 uprv_sortArray(keywordList
, numKeywords
, sizeof(KeywordStruct
), compareKeywordStructs
, NULL
, FALSE
, status
);
710 /* Now construct the keyword part */
711 for(i
= 0; i
< numKeywords
; i
++) {
712 if(keywordsLen
+ keywordList
[i
].keywordLen
+ 1< keywordCapacity
) {
713 uprv_strcpy(keywords
+keywordsLen
, keywordList
[i
].keyword
);
715 keywords
[keywordsLen
+ keywordList
[i
].keywordLen
] = '=';
717 keywords
[keywordsLen
+ keywordList
[i
].keywordLen
] = 0;
720 keywordsLen
+= keywordList
[i
].keywordLen
+ 1;
722 if(keywordsLen
+ keywordList
[i
].valueLen
<= keywordCapacity
) {
723 uprv_strncpy(keywords
+keywordsLen
, keywordList
[i
].valueStart
, keywordList
[i
].valueLen
);
725 keywordsLen
+= keywordList
[i
].valueLen
;
727 if(i
< numKeywords
- 1) {
728 if(keywordsLen
< keywordCapacity
) {
729 keywords
[keywordsLen
] = ';';
735 if(valuesLen
+ keywordList
[i
].valueLen
+ 1< valuesCapacity
) {
736 uprv_strcpy(values
+valuesLen
, keywordList
[i
].valueStart
);
737 values
[valuesLen
+ keywordList
[i
].valueLen
] = 0;
739 valuesLen
+= keywordList
[i
].valueLen
+ 1;
743 values
[valuesLen
] = 0;
748 return u_terminateChars(keywords
, keywordCapacity
, keywordsLen
, status
);
755 locale_getKeywords(const char *localeID
,
757 char *keywords
, int32_t keywordCapacity
,
758 char *values
, int32_t valuesCapacity
, int32_t *valLen
,
760 UErrorCode
*status
) {
761 return _getKeywords(localeID
, prev
, keywords
, keywordCapacity
,
762 values
, valuesCapacity
, valLen
, valuesToo
,
766 U_CAPI
int32_t U_EXPORT2
767 uloc_getKeywordValue(const char* localeID
,
768 const char* keywordName
,
769 char* buffer
, int32_t bufferCapacity
,
772 if (buffer
!= nullptr) {
775 const char* startSearchHere
= NULL
;
776 const char* nextSeparator
= NULL
;
777 char keywordNameBuffer
[ULOC_KEYWORD_BUFFER_LEN
];
778 char localeKeywordNameBuffer
[ULOC_KEYWORD_BUFFER_LEN
];
781 if(status
&& U_SUCCESS(*status
) && localeID
) {
782 char tempBuffer
[ULOC_FULLNAME_CAPACITY
];
783 const char* tmpLocaleID
;
785 if (keywordName
== NULL
|| keywordName
[0] == 0) {
786 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
790 locale_canonKeywordName(keywordNameBuffer
, keywordName
, status
);
791 if(U_FAILURE(*status
)) {
795 if (_hasBCP47Extension(localeID
)) {
796 _ConvertBCP47(tmpLocaleID
, localeID
, tempBuffer
, sizeof(tempBuffer
), status
);
798 tmpLocaleID
=localeID
;
801 startSearchHere
= locale_getKeywordsStart(tmpLocaleID
);
802 if(startSearchHere
== NULL
) {
803 /* no keywords, return at once */
807 /* find the first keyword */
808 while(startSearchHere
) {
809 const char* keyValueTail
;
812 startSearchHere
++; /* skip @ or ; */
813 nextSeparator
= uprv_strchr(startSearchHere
, '=');
815 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* key must have =value */
818 /* strip leading & trailing spaces (TC decided to tolerate these) */
819 while(*startSearchHere
== ' ') {
822 keyValueTail
= nextSeparator
;
823 while (keyValueTail
> startSearchHere
&& *(keyValueTail
-1) == ' ') {
826 /* now keyValueTail points to first char after the keyName */
827 /* copy & normalize keyName from locale */
828 if (startSearchHere
== keyValueTail
) {
829 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* empty keyword name in passed-in locale */
833 while (startSearchHere
< keyValueTail
) {
834 if (!UPRV_ISALPHANUM(*startSearchHere
)) {
835 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* malformed keyword name */
838 if (keyValueLen
< ULOC_KEYWORD_BUFFER_LEN
- 1) {
839 localeKeywordNameBuffer
[keyValueLen
++] = uprv_tolower(*startSearchHere
++);
841 /* keyword name too long for internal buffer */
842 *status
= U_INTERNAL_PROGRAM_ERROR
;
846 localeKeywordNameBuffer
[keyValueLen
] = 0; /* terminate */
848 startSearchHere
= uprv_strchr(nextSeparator
, ';');
850 if(uprv_strcmp(keywordNameBuffer
, localeKeywordNameBuffer
) == 0) {
851 /* current entry matches the keyword. */
852 nextSeparator
++; /* skip '=' */
853 /* First strip leading & trailing spaces (TC decided to tolerate these) */
854 while(*nextSeparator
== ' ') {
857 keyValueTail
= (startSearchHere
)? startSearchHere
: nextSeparator
+ uprv_strlen(nextSeparator
);
858 while(keyValueTail
> nextSeparator
&& *(keyValueTail
-1) == ' ') {
861 /* Now copy the value, but check well-formedness */
862 if (nextSeparator
== keyValueTail
) {
863 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* empty key value name in passed-in locale */
867 while (nextSeparator
< keyValueTail
) {
868 if (!UPRV_ISALPHANUM(*nextSeparator
) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator
)) {
869 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* malformed key value */
872 if (keyValueLen
< bufferCapacity
) {
873 /* Should we lowercase value to return here? Tests expect as-is. */
874 buffer
[keyValueLen
++] = *nextSeparator
++;
875 } else { /* keep advancing so we return correct length in case of overflow */
880 result
= u_terminateChars(buffer
, bufferCapacity
, keyValueLen
, status
);
888 U_CAPI
int32_t U_EXPORT2
889 uloc_setKeywordValue(const char* keywordName
,
890 const char* keywordValue
,
891 char* buffer
, int32_t bufferCapacity
,
894 /* TODO: sorting. removal. */
895 int32_t keywordNameLen
;
896 int32_t keywordValueLen
;
899 char keywordNameBuffer
[ULOC_KEYWORD_BUFFER_LEN
];
900 char keywordValueBuffer
[ULOC_KEYWORDS_CAPACITY
+1];
901 char localeKeywordNameBuffer
[ULOC_KEYWORD_BUFFER_LEN
];
903 char* nextSeparator
= NULL
;
904 char* nextEqualsign
= NULL
;
905 char* startSearchHere
= NULL
;
906 char* keywordStart
= NULL
;
907 CharString updatedKeysAndValues
;
908 int32_t updatedKeysAndValuesLen
;
909 UBool handledInputKeyAndValue
= FALSE
;
910 char keyValuePrefix
= '@';
912 if(U_FAILURE(*status
)) {
915 if (keywordName
== NULL
|| keywordName
[0] == 0 || bufferCapacity
<= 1) {
916 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
919 bufLen
= (int32_t)uprv_strlen(buffer
);
920 if(bufferCapacity
<bufLen
) {
921 /* The capacity is less than the length?! Is this NULL terminated? */
922 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
925 keywordNameLen
= locale_canonKeywordName(keywordNameBuffer
, keywordName
, status
);
926 if(U_FAILURE(*status
)) {
932 while (*keywordValue
!= 0) {
933 if (!UPRV_ISALPHANUM(*keywordValue
) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue
)) {
934 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* malformed key value */
937 if (keywordValueLen
< ULOC_KEYWORDS_CAPACITY
) {
938 /* Should we force lowercase in value to set? */
939 keywordValueBuffer
[keywordValueLen
++] = *keywordValue
++;
941 /* keywordValue too long for internal buffer */
942 *status
= U_INTERNAL_PROGRAM_ERROR
;
947 keywordValueBuffer
[keywordValueLen
] = 0; /* terminate */
949 startSearchHere
= (char*)locale_getKeywordsStart(buffer
);
950 if(startSearchHere
== NULL
|| (startSearchHere
[1]==0)) {
951 if(keywordValueLen
== 0) { /* no keywords = nothing to remove */
955 needLen
= bufLen
+1+keywordNameLen
+1+keywordValueLen
;
956 if(startSearchHere
) { /* had a single @ */
957 needLen
--; /* already had the @ */
958 /* startSearchHere points at the @ */
960 startSearchHere
=buffer
+bufLen
;
962 if(needLen
>= bufferCapacity
) {
963 *status
= U_BUFFER_OVERFLOW_ERROR
;
964 return needLen
; /* no change */
966 *startSearchHere
++ = '@';
967 uprv_strcpy(startSearchHere
, keywordNameBuffer
);
968 startSearchHere
+= keywordNameLen
;
969 *startSearchHere
++ = '=';
970 uprv_strcpy(startSearchHere
, keywordValueBuffer
);
972 } /* end shortcut - no @ */
974 keywordStart
= startSearchHere
;
975 /* search for keyword */
976 while(keywordStart
) {
977 const char* keyValueTail
;
980 keywordStart
++; /* skip @ or ; */
981 nextEqualsign
= uprv_strchr(keywordStart
, '=');
982 if (!nextEqualsign
) {
983 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* key must have =value */
986 /* strip leading & trailing spaces (TC decided to tolerate these) */
987 while(*keywordStart
== ' ') {
990 keyValueTail
= nextEqualsign
;
991 while (keyValueTail
> keywordStart
&& *(keyValueTail
-1) == ' ') {
994 /* now keyValueTail points to first char after the keyName */
995 /* copy & normalize keyName from locale */
996 if (keywordStart
== keyValueTail
) {
997 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* empty keyword name in passed-in locale */
1001 while (keywordStart
< keyValueTail
) {
1002 if (!UPRV_ISALPHANUM(*keywordStart
)) {
1003 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* malformed keyword name */
1006 if (keyValueLen
< ULOC_KEYWORD_BUFFER_LEN
- 1) {
1007 localeKeywordNameBuffer
[keyValueLen
++] = uprv_tolower(*keywordStart
++);
1009 /* keyword name too long for internal buffer */
1010 *status
= U_INTERNAL_PROGRAM_ERROR
;
1014 localeKeywordNameBuffer
[keyValueLen
] = 0; /* terminate */
1016 nextSeparator
= uprv_strchr(nextEqualsign
, ';');
1018 /* start processing the value part */
1019 nextEqualsign
++; /* skip '=' */
1020 /* First strip leading & trailing spaces (TC decided to tolerate these) */
1021 while(*nextEqualsign
== ' ') {
1024 keyValueTail
= (nextSeparator
)? nextSeparator
: nextEqualsign
+ uprv_strlen(nextEqualsign
);
1025 while(keyValueTail
> nextEqualsign
&& *(keyValueTail
-1) == ' ') {
1028 if (nextEqualsign
== keyValueTail
) {
1029 *status
= U_ILLEGAL_ARGUMENT_ERROR
; /* empty key value in passed-in locale */
1033 rc
= uprv_strcmp(keywordNameBuffer
, localeKeywordNameBuffer
);
1035 /* Current entry matches the input keyword. Update the entry */
1036 if(keywordValueLen
> 0) { /* updating a value */
1037 updatedKeysAndValues
.append(keyValuePrefix
, *status
);
1038 keyValuePrefix
= ';'; /* for any subsequent key-value pair */
1039 updatedKeysAndValues
.append(keywordNameBuffer
, keywordNameLen
, *status
);
1040 updatedKeysAndValues
.append('=', *status
);
1041 updatedKeysAndValues
.append(keywordValueBuffer
, keywordValueLen
, *status
);
1042 } /* else removing this entry, don't emit anything */
1043 handledInputKeyAndValue
= TRUE
;
1045 /* input keyword sorts earlier than current entry, add before current entry */
1046 if (rc
< 0 && keywordValueLen
> 0 && !handledInputKeyAndValue
) {
1047 /* insert new entry at this location */
1048 updatedKeysAndValues
.append(keyValuePrefix
, *status
);
1049 keyValuePrefix
= ';'; /* for any subsequent key-value pair */
1050 updatedKeysAndValues
.append(keywordNameBuffer
, keywordNameLen
, *status
);
1051 updatedKeysAndValues
.append('=', *status
);
1052 updatedKeysAndValues
.append(keywordValueBuffer
, keywordValueLen
, *status
);
1053 handledInputKeyAndValue
= TRUE
;
1055 /* copy the current entry */
1056 updatedKeysAndValues
.append(keyValuePrefix
, *status
);
1057 keyValuePrefix
= ';'; /* for any subsequent key-value pair */
1058 updatedKeysAndValues
.append(localeKeywordNameBuffer
, keyValueLen
, *status
);
1059 updatedKeysAndValues
.append('=', *status
);
1060 updatedKeysAndValues
.append(nextEqualsign
, static_cast<int32_t>(keyValueTail
-nextEqualsign
), *status
);
1062 if (!nextSeparator
&& keywordValueLen
> 0 && !handledInputKeyAndValue
) {
1063 /* append new entry at the end, it sorts later than existing entries */
1064 updatedKeysAndValues
.append(keyValuePrefix
, *status
);
1065 /* skip keyValuePrefix update, no subsequent key-value pair */
1066 updatedKeysAndValues
.append(keywordNameBuffer
, keywordNameLen
, *status
);
1067 updatedKeysAndValues
.append('=', *status
);
1068 updatedKeysAndValues
.append(keywordValueBuffer
, keywordValueLen
, *status
);
1069 handledInputKeyAndValue
= TRUE
;
1071 keywordStart
= nextSeparator
;
1072 } /* end loop searching */
1074 /* Any error from updatedKeysAndValues.append above would be internal and not due to
1075 * problems with the passed-in locale. So if we did encounter problems with the
1076 * passed-in locale above, those errors took precedence and overrode any error
1077 * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1078 * are errors here they are from updatedKeysAndValues.append; they do cause an
1079 * error return but the passed-in locale is unmodified and the original bufLen is
1082 if (!handledInputKeyAndValue
|| U_FAILURE(*status
)) {
1083 /* if input key/value specified removal of a keyword not present in locale, or
1084 * there was an error in CharString.append, leave original locale alone. */
1088 updatedKeysAndValuesLen
= updatedKeysAndValues
.length();
1089 /* needLen = length of the part before '@' + length of updated key-value part including '@' */
1090 needLen
= (int32_t)(startSearchHere
- buffer
) + updatedKeysAndValuesLen
;
1091 if(needLen
>= bufferCapacity
) {
1092 *status
= U_BUFFER_OVERFLOW_ERROR
;
1093 return needLen
; /* no change */
1095 if (updatedKeysAndValuesLen
> 0) {
1096 uprv_strncpy(startSearchHere
, updatedKeysAndValues
.data(), updatedKeysAndValuesLen
);
1102 /* ### ID parsing implementation **************************************************/
/* True when character 'a' is one of the letters that can start a special
 * one-letter prefix: 'x', 'X', 'i' or 'I'.
 * The argument is parenthesized so that an expression argument is compared
 * as a whole. It is still evaluated up to four times, so do not pass an
 * argument with side effects. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))
/* Returns TRUE if one of the special prefixes is here (s=string):
 * s[0] is a prefix letter (see _isPrefixLetter) and s[1] is an ID separator.
 * 's' is parenthesized so pointer expressions such as p+1 index correctly;
 * it is evaluated more than once, so do not pass an argument with side effects. */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))
/* True when 'a' is NUL, '.' or '@'.
 * Dot terminates it because of POSIX form where dot precedes the codepage
 * except for variant.
 * The argument is parenthesized so that an expression argument is compared
 * as a whole. It is evaluated up to three times, so do not pass an argument
 * with side effects. */
#define _isTerminator(a) (((a)==0)||((a)=='.')||((a)=='@'))
1116 * Lookup 'key' in the array 'list'. The array 'list' should contain
1117 * a NULL entry, followed by more entries, and a second NULL entry.
1119 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1122 static int16_t _findIndex(const char* const* list
, const char* key
)
1124 const char* const* anchor
= list
;
1127 /* Make two passes through two NULL-terminated arrays at 'list' */
1128 while (pass
++ < 2) {
1130 if (uprv_strcmp(key
, *list
) == 0) {
1131 return (int16_t)(list
- anchor
);
1135 ++list
; /* skip final NULL *CWB*/
1140 /* count the length of src while copying it to dest; return strlen(src) */
1141 static inline int32_t
1142 _copyCount(char *dest
, int32_t destCapacity
, const char *src
) {
1149 return (int32_t)(src
-anchor
);
1151 if(destCapacity
<=0) {
1152 return (int32_t)((src
-anchor
)+uprv_strlen(src
));
1161 uloc_getCurrentCountryID(const char* oldID
){
1162 int32_t offset
= _findIndex(DEPRECATED_COUNTRIES
, oldID
);
1164 return REPLACEMENT_COUNTRIES
[offset
];
1169 uloc_getCurrentLanguageID(const char* oldID
){
1170 int32_t offset
= _findIndex(DEPRECATED_LANGUAGES
, oldID
);
1172 return REPLACEMENT_LANGUAGES
[offset
];
1177 * the internal functions _getLanguage(), _getCountry(), _getVariant()
1178 * avoid duplicating code to handle the earlier locale ID pieces
1179 * in the functions for the later ones by
1180 * setting the *pEnd pointer to where they stopped parsing
1182 * TODO try to use this in Locale
1185 ulocimp_getLanguage(const char *localeID
,
1186 char *language
, int32_t languageCapacity
,
1187 const char **pEnd
) {
1190 char lang
[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1192 /* if it starts with i- or x- then copy that prefix */
1193 if(_isIDPrefix(localeID
)) {
1194 if(i
<languageCapacity
) {
1195 language
[i
]=(char)uprv_tolower(*localeID
);
1197 if(i
<languageCapacity
) {
1204 /* copy the language as far as possible and count its length */
1205 while(!_isTerminator(*localeID
) && !_isIDSeparator(*localeID
)) {
1206 if(i
<languageCapacity
) {
1207 language
[i
]=(char)uprv_tolower(*localeID
);
1211 lang
[i
]=(char)uprv_tolower(*localeID
);
1218 /* convert 3 character code to 2 character code if possible *CWB*/
1219 offset
=_findIndex(LANGUAGES_3
, lang
);
1221 i
=_copyCount(language
, languageCapacity
, LANGUAGES
[offset
]);
1232 ulocimp_getScript(const char *localeID
,
1233 char *script
, int32_t scriptCapacity
,
1242 /* copy the second item as far as possible and count its length */
1243 while(!_isTerminator(localeID
[idLen
]) && !_isIDSeparator(localeID
[idLen
])
1244 && uprv_isASCIILetter(localeID
[idLen
])) {
1248 /* If it's exactly 4 characters long, then it's a script and not a country. */
1252 *pEnd
= localeID
+idLen
;
1254 if(idLen
> scriptCapacity
) {
1255 idLen
= scriptCapacity
;
1258 script
[0]=(char)uprv_toupper(*(localeID
++));
1260 for (i
= 1; i
< idLen
; i
++) {
1261 script
[i
]=(char)uprv_tolower(*(localeID
++));
1271 ulocimp_getCountry(const char *localeID
,
1272 char *country
, int32_t countryCapacity
,
1276 char cnty
[ULOC_COUNTRY_CAPACITY
]={ 0, 0, 0, 0 };
1279 /* copy the country as far as possible and count its length */
1280 while(!_isTerminator(localeID
[idLen
]) && !_isIDSeparator(localeID
[idLen
])) {
1281 if(idLen
<(ULOC_COUNTRY_CAPACITY
-1)) { /*CWB*/
1282 cnty
[idLen
]=(char)uprv_toupper(localeID
[idLen
]);
1287 /* the country should be either length 2 or 3 */
1288 if (idLen
== 2 || idLen
== 3) {
1289 UBool gotCountry
= FALSE
;
1290 /* convert 3 character code to 2 character code if possible *CWB*/
1292 offset
=_findIndex(COUNTRIES_3
, cnty
);
1294 idLen
=_copyCount(country
, countryCapacity
, COUNTRIES
[offset
]);
1300 for (i
= 0; i
< idLen
; i
++) {
1301 if (i
< countryCapacity
) {
1302 country
[i
]=(char)uprv_toupper(localeID
[i
]);
1319 * @param needSeparator if true, then add leading '_' if any variants
1320 * are added to 'variant'
1323 _getVariantEx(const char *localeID
,
1325 char *variant
, int32_t variantCapacity
,
1326 UBool needSeparator
) {
1329 /* get one or more variant tags and separate them with '_' */
1330 if(_isIDSeparator(prev
)) {
1331 /* get a variant string after a '-' or '_' */
1332 while(!_isTerminator(*localeID
)) {
1333 if (needSeparator
) {
1334 if (i
<variantCapacity
) {
1338 needSeparator
= FALSE
;
1340 if(i
<variantCapacity
) {
1341 variant
[i
]=(char)uprv_toupper(*localeID
);
1342 if(variant
[i
]=='-') {
1351 /* if there is no variant tag after a '-' or '_' then look for '@' */
1355 } else if((localeID
=locale_getKeywordsStart(localeID
))!=NULL
) {
1356 ++localeID
; /* point after the '@' */
1360 while(!_isTerminator(*localeID
)) {
1361 if (needSeparator
) {
1362 if (i
<variantCapacity
) {
1366 needSeparator
= FALSE
;
1368 if(i
<variantCapacity
) {
1369 variant
[i
]=(char)uprv_toupper(*localeID
);
1370 if(variant
[i
]=='-' || variant
[i
]==',') {
1383 _getVariant(const char *localeID
,
1385 char *variant
, int32_t variantCapacity
) {
1386 return _getVariantEx(localeID
, prev
, variant
, variantCapacity
, FALSE
);
1389 /* Keyword enumeration */
1391 typedef struct UKeywordsContext
{
1398 static void U_CALLCONV
1399 uloc_kw_closeKeywords(UEnumeration
*enumerator
) {
1400 uprv_free(((UKeywordsContext
*)enumerator
->context
)->keywords
);
1401 uprv_free(enumerator
->context
);
1402 uprv_free(enumerator
);
1405 static int32_t U_CALLCONV
1406 uloc_kw_countKeywords(UEnumeration
*en
, UErrorCode
* /*status*/) {
1407 char *kw
= ((UKeywordsContext
*)en
->context
)->keywords
;
1411 kw
+= uprv_strlen(kw
)+1;
1416 static const char * U_CALLCONV
1417 uloc_kw_nextKeyword(UEnumeration
* en
,
1418 int32_t* resultLength
,
1419 UErrorCode
* /*status*/) {
1420 const char* result
= ((UKeywordsContext
*)en
->context
)->current
;
1423 len
= (int32_t)uprv_strlen(((UKeywordsContext
*)en
->context
)->current
);
1424 ((UKeywordsContext
*)en
->context
)->current
+= len
+1;
1429 *resultLength
= len
;
1434 static void U_CALLCONV
1435 uloc_kw_resetKeywords(UEnumeration
* en
,
1436 UErrorCode
* /*status*/) {
1437 ((UKeywordsContext
*)en
->context
)->current
= ((UKeywordsContext
*)en
->context
)->keywords
;
1443 static const UEnumeration gKeywordsEnum
= {
1446 uloc_kw_closeKeywords
,
1447 uloc_kw_countKeywords
,
1449 uloc_kw_nextKeyword
,
1450 uloc_kw_resetKeywords
1453 U_CAPI UEnumeration
* U_EXPORT2
1454 uloc_openKeywordList(const char *keywordList
, int32_t keywordListSize
, UErrorCode
* status
)
1456 LocalMemory
<UKeywordsContext
> myContext
;
1457 LocalMemory
<UEnumeration
> result
;
1459 if (U_FAILURE(*status
)) {
1462 myContext
.adoptInstead(static_cast<UKeywordsContext
*>(uprv_malloc(sizeof(UKeywordsContext
))));
1463 result
.adoptInstead(static_cast<UEnumeration
*>(uprv_malloc(sizeof(UEnumeration
))));
1464 if (myContext
.isNull() || result
.isNull()) {
1465 *status
= U_MEMORY_ALLOCATION_ERROR
;
1468 uprv_memcpy(result
.getAlias(), &gKeywordsEnum
, sizeof(UEnumeration
));
1469 myContext
->keywords
= static_cast<char *>(uprv_malloc(keywordListSize
+1));
1470 if (myContext
->keywords
== nullptr) {
1471 *status
= U_MEMORY_ALLOCATION_ERROR
;
1474 uprv_memcpy(myContext
->keywords
, keywordList
, keywordListSize
);
1475 myContext
->keywords
[keywordListSize
] = 0;
1476 myContext
->current
= myContext
->keywords
;
1477 result
->context
= myContext
.orphan();
1478 return result
.orphan();
1481 U_CAPI UEnumeration
* U_EXPORT2
1482 uloc_openKeywords(const char* localeID
,
1487 int32_t keywordsCapacity
= 256;
1488 char tempBuffer
[ULOC_FULLNAME_CAPACITY
];
1489 const char* tmpLocaleID
;
1491 if(status
==NULL
|| U_FAILURE(*status
)) {
1495 if (_hasBCP47Extension(localeID
)) {
1496 _ConvertBCP47(tmpLocaleID
, localeID
, tempBuffer
, sizeof(tempBuffer
), status
);
1498 if (localeID
==NULL
) {
1499 localeID
=uloc_getDefault();
1501 tmpLocaleID
=localeID
;
1504 /* Skip the language */
1505 ulocimp_getLanguage(tmpLocaleID
, NULL
, 0, &tmpLocaleID
);
1506 if(_isIDSeparator(*tmpLocaleID
)) {
1507 const char *scriptID
;
1508 /* Skip the script if available */
1509 ulocimp_getScript(tmpLocaleID
+1, NULL
, 0, &scriptID
);
1510 if(scriptID
!= tmpLocaleID
+1) {
1511 /* Found optional script */
1512 tmpLocaleID
= scriptID
;
1514 /* Skip the Country */
1515 if (_isIDSeparator(*tmpLocaleID
)) {
1516 ulocimp_getCountry(tmpLocaleID
+1, NULL
, 0, &tmpLocaleID
);
1517 if(_isIDSeparator(*tmpLocaleID
)) {
1518 _getVariant(tmpLocaleID
+1, *tmpLocaleID
, NULL
, 0);
1523 /* keywords are located after '@' */
1524 if((tmpLocaleID
= locale_getKeywordsStart(tmpLocaleID
)) != NULL
) {
1525 i
=locale_getKeywords(tmpLocaleID
+1, '@', keywords
, keywordsCapacity
, NULL
, 0, NULL
, FALSE
, status
);
1529 return uloc_openKeywordList(keywords
, i
, status
);
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2 /* when set, _canonicalize does not append the keyword list */
#define _ULOC_CANONICALIZE 0x1   /* requests level 2 canonicalization */

/* True when at least one bit of 'mask' is set in 'options'.
 * Both arguments are parenthesized: '&' binds tighter than '|', so without
 * the parentheses OPTION_SET(a|b, m) would be parsed as a | (b & m). */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)
/* The nine characters of "i-default" stored as a plain char array. The
 * initializer lists no terminating NUL, so this is not a C string: compare
 * against it with a length-bounded routine only. */
1542 static const char i_default
[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
/* Length used when comparing against i_default; presumably the element
 * count of the array (9) -- confirm against the UPRV_LENGTHOF definition. */
1543 #define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1546 * Canonicalize the given localeID, to level 1 or to level 2,
1547 * depending on the options. To specify level 1, pass in options=0.
1548 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1550 * This is the code underlying uloc_getName and uloc_canonicalize.
1553 _canonicalize(const char* localeID
,
1555 int32_t resultCapacity
,
1558 int32_t j
, len
, fieldCount
=0, scriptSize
=0, variantSize
=0, nameCapacity
;
1559 char localeBuffer
[ULOC_FULLNAME_CAPACITY
];
1560 char tempBuffer
[ULOC_FULLNAME_CAPACITY
];
1561 const char* origLocaleID
;
1562 const char* tmpLocaleID
;
1563 const char* keywordAssign
= NULL
;
1564 const char* separatorIndicator
= NULL
;
1566 char* variant
= NULL
; /* pointer into name, or NULL */
1568 if (U_FAILURE(*err
)) {
1572 if (_hasBCP47Extension(localeID
)) {
1573 _ConvertBCP47(tmpLocaleID
, localeID
, tempBuffer
, sizeof(tempBuffer
), err
);
1575 if (localeID
==NULL
) {
1576 localeID
=uloc_getDefault();
1578 tmpLocaleID
=localeID
;
1581 origLocaleID
=tmpLocaleID
;
1583 /* if we are doing a full canonicalization, then put results in
1584 localeBuffer, if necessary; otherwise send them to result. */
1585 if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1586 (result
== NULL
|| resultCapacity
< (int32_t)sizeof(localeBuffer
))) {
1587 name
= localeBuffer
;
1588 nameCapacity
= (int32_t)sizeof(localeBuffer
);
1591 nameCapacity
= resultCapacity
;
1594 /* get all pieces, one after another, and separate with '_' */
1595 len
=ulocimp_getLanguage(tmpLocaleID
, name
, nameCapacity
, &tmpLocaleID
);
1597 if(len
== I_DEFAULT_LENGTH
&& uprv_strncmp(origLocaleID
, i_default
, len
) == 0) {
1598 const char *d
= uloc_getDefault();
1600 len
= (int32_t)uprv_strlen(d
);
1603 uprv_memcpy(name
, d
, len
);
1605 } else if(_isIDSeparator(*tmpLocaleID
)) {
1606 const char *scriptID
;
1609 if(len
<nameCapacity
) {
1614 scriptSize
=ulocimp_getScript(tmpLocaleID
+1,
1615 (len
<nameCapacity
? name
+len
: NULL
), nameCapacity
-len
, &scriptID
);
1616 if(scriptSize
> 0) {
1617 /* Found optional script */
1618 tmpLocaleID
= scriptID
;
1621 if (_isIDSeparator(*tmpLocaleID
)) {
1622 /* If there is something else, then we add the _ */
1623 if(len
<nameCapacity
) {
1630 if (_isIDSeparator(*tmpLocaleID
)) {
1631 const char *cntryID
;
1632 int32_t cntrySize
= ulocimp_getCountry(tmpLocaleID
+1,
1633 (len
<nameCapacity
? name
+len
: NULL
), nameCapacity
-len
, &cntryID
);
1634 if (cntrySize
> 0) {
1635 /* Found optional country */
1636 tmpLocaleID
= cntryID
;
1639 if(_isIDSeparator(*tmpLocaleID
)) {
1640 /* If there is something else, then we add the _ if we found country before. */
1641 if (cntrySize
>= 0 && ! _isIDSeparator(*(tmpLocaleID
+1)) ) {
1643 if(len
<nameCapacity
) {
1649 variantSize
= _getVariant(tmpLocaleID
+1, *tmpLocaleID
,
1650 (len
<nameCapacity
? name
+len
: NULL
), nameCapacity
-len
);
1651 if (variantSize
> 0) {
1652 variant
= len
<nameCapacity
? name
+len
: NULL
;
1654 tmpLocaleID
+= variantSize
+ 1; /* skip '_' and variant */
1660 /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1661 if (!OPTION_SET(options
, _ULOC_CANONICALIZE
) && *tmpLocaleID
== '.') {
1664 char c
= *tmpLocaleID
;
1671 if (len
<nameCapacity
) {
1681 /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1682 After this, tmpLocaleID either points to '@' or is NULL */
1683 if ((tmpLocaleID
=locale_getKeywordsStart(tmpLocaleID
))!=NULL
) {
1684 keywordAssign
= uprv_strchr(tmpLocaleID
, '=');
1685 separatorIndicator
= uprv_strchr(tmpLocaleID
, ';');
1688 /* Copy POSIX-style variant, if any [mr@FOO] */
1689 if (!OPTION_SET(options
, _ULOC_CANONICALIZE
) &&
1690 tmpLocaleID
!= NULL
&& keywordAssign
== NULL
) {
1692 char c
= *tmpLocaleID
;
1696 if (len
<nameCapacity
) {
1704 if (OPTION_SET(options
, _ULOC_CANONICALIZE
)) {
1705 /* Handle @FOO variant if @ is present and not followed by = */
1706 if (tmpLocaleID
!=NULL
&& keywordAssign
==NULL
) {
1707 int32_t posixVariantSize
;
1708 /* Add missing '_' if needed */
1709 if (fieldCount
< 2 || (fieldCount
< 3 && scriptSize
> 0)) {
1711 if(len
<nameCapacity
) {
1716 } while(fieldCount
<2);
1718 posixVariantSize
= _getVariantEx(tmpLocaleID
+1, '@', name
+len
, nameCapacity
-len
,
1719 (UBool
)(variantSize
> 0));
1720 if (posixVariantSize
> 0) {
1721 if (variant
== NULL
) {
1724 len
+= posixVariantSize
;
1725 variantSize
+= posixVariantSize
;
1729 /* Look up the ID in the canonicalization map */
1730 for (j
=0; j
<UPRV_LENGTHOF(CANONICALIZE_MAP
); j
++) {
1731 const char* id
= CANONICALIZE_MAP
[j
].id
;
1732 int32_t n
= (int32_t)uprv_strlen(id
);
1733 if (len
== n
&& uprv_strncmp(name
, id
, n
) == 0) {
1734 if (n
== 0 && tmpLocaleID
!= NULL
) {
1735 break; /* Don't remap "" if keywords present */
1737 len
= _copyCount(name
, nameCapacity
, CANONICALIZE_MAP
[j
].canonicalID
);
1743 if (!OPTION_SET(options
, _ULOC_STRIP_KEYWORDS
)) {
1744 if (tmpLocaleID
!=NULL
&& keywordAssign
!=NULL
&&
1745 (!separatorIndicator
|| separatorIndicator
> keywordAssign
)) {
1746 if(len
<nameCapacity
) {
1751 len
+= _getKeywords(tmpLocaleID
+1, '@', (len
<nameCapacity
? name
+len
: NULL
), nameCapacity
-len
,
1752 NULL
, 0, NULL
, TRUE
, err
);
1756 if (U_SUCCESS(*err
) && result
!= NULL
&& name
== localeBuffer
) {
1757 uprv_strncpy(result
, localeBuffer
, (len
> resultCapacity
) ? resultCapacity
: len
);
1760 return u_terminateChars(result
, resultCapacity
, len
, err
);
1763 /* ### ID parsing API **************************************************/
1765 U_CAPI
int32_t U_EXPORT2
1766 uloc_getParent(const char* localeID
,
1768 int32_t parentCapacity
,
1771 const char *lastUnderscore
;
1774 if (U_FAILURE(*err
))
1777 if (localeID
== NULL
)
1778 localeID
= uloc_getDefault();
1780 lastUnderscore
=uprv_strrchr(localeID
, '_');
1781 if(lastUnderscore
!=NULL
) {
1782 i
=(int32_t)(lastUnderscore
-localeID
);
1787 if(i
>0 && parent
!= localeID
) {
1788 uprv_memcpy(parent
, localeID
, uprv_min(i
, parentCapacity
));
1791 return u_terminateChars(parent
, parentCapacity
, i
, err
);
1794 U_CAPI
int32_t U_EXPORT2
1795 uloc_getLanguage(const char* localeID
,
1797 int32_t languageCapacity
,
1800 /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1803 if (err
==NULL
|| U_FAILURE(*err
)) {
1807 if(localeID
==NULL
) {
1808 localeID
=uloc_getDefault();
1811 i
=ulocimp_getLanguage(localeID
, language
, languageCapacity
, NULL
);
1812 return u_terminateChars(language
, languageCapacity
, i
, err
);
1815 U_CAPI
int32_t U_EXPORT2
1816 uloc_getScript(const char* localeID
,
1818 int32_t scriptCapacity
,
1823 if(err
==NULL
|| U_FAILURE(*err
)) {
1827 if(localeID
==NULL
) {
1828 localeID
=uloc_getDefault();
1831 /* skip the language */
1832 ulocimp_getLanguage(localeID
, NULL
, 0, &localeID
);
1833 if(_isIDSeparator(*localeID
)) {
1834 i
=ulocimp_getScript(localeID
+1, script
, scriptCapacity
, NULL
);
1836 return u_terminateChars(script
, scriptCapacity
, i
, err
);
1839 U_CAPI
int32_t U_EXPORT2
1840 uloc_getCountry(const char* localeID
,
1842 int32_t countryCapacity
,
1847 if(err
==NULL
|| U_FAILURE(*err
)) {
1851 if(localeID
==NULL
) {
1852 localeID
=uloc_getDefault();
1855 /* Skip the language */
1856 ulocimp_getLanguage(localeID
, NULL
, 0, &localeID
);
1857 if(_isIDSeparator(*localeID
)) {
1858 const char *scriptID
;
1859 /* Skip the script if available */
1860 ulocimp_getScript(localeID
+1, NULL
, 0, &scriptID
);
1861 if(scriptID
!= localeID
+1) {
1862 /* Found optional script */
1863 localeID
= scriptID
;
1865 if(_isIDSeparator(*localeID
)) {
1866 i
=ulocimp_getCountry(localeID
+1, country
, countryCapacity
, NULL
);
1869 return u_terminateChars(country
, countryCapacity
, i
, err
);
1872 U_CAPI
int32_t U_EXPORT2
1873 uloc_getVariant(const char* localeID
,
1875 int32_t variantCapacity
,
1878 char tempBuffer
[ULOC_FULLNAME_CAPACITY
];
1879 const char* tmpLocaleID
;
1882 if(err
==NULL
|| U_FAILURE(*err
)) {
1886 if (_hasBCP47Extension(localeID
)) {
1887 _ConvertBCP47(tmpLocaleID
, localeID
, tempBuffer
, sizeof(tempBuffer
), err
);
1889 if (localeID
==NULL
) {
1890 localeID
=uloc_getDefault();
1892 tmpLocaleID
=localeID
;
1895 /* Skip the language */
1896 ulocimp_getLanguage(tmpLocaleID
, NULL
, 0, &tmpLocaleID
);
1897 if(_isIDSeparator(*tmpLocaleID
)) {
1898 const char *scriptID
;
1899 /* Skip the script if available */
1900 ulocimp_getScript(tmpLocaleID
+1, NULL
, 0, &scriptID
);
1901 if(scriptID
!= tmpLocaleID
+1) {
1902 /* Found optional script */
1903 tmpLocaleID
= scriptID
;
1905 /* Skip the Country */
1906 if (_isIDSeparator(*tmpLocaleID
)) {
1907 const char *cntryID
;
1908 ulocimp_getCountry(tmpLocaleID
+1, NULL
, 0, &cntryID
);
1909 if (cntryID
!= tmpLocaleID
+1) {
1910 /* Found optional country */
1911 tmpLocaleID
= cntryID
;
1913 if(_isIDSeparator(*tmpLocaleID
)) {
1914 /* If there was no country ID, skip a possible extra IDSeparator */
1915 if (tmpLocaleID
!= cntryID
&& _isIDSeparator(tmpLocaleID
[1])) {
1918 i
=_getVariant(tmpLocaleID
+1, *tmpLocaleID
, variant
, variantCapacity
);
1923 /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
1924 /* if we do not have a variant tag yet then try a POSIX variant after '@' */
1926 if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
1927 i=_getVariant(localeID+1, '@', variant, variantCapacity);
1930 return u_terminateChars(variant
, variantCapacity
, i
, err
);
1933 U_CAPI
int32_t U_EXPORT2
1934 uloc_getName(const char* localeID
,
1936 int32_t nameCapacity
,
1939 return _canonicalize(localeID
, name
, nameCapacity
, 0, err
);
1942 U_CAPI
int32_t U_EXPORT2
1943 uloc_getBaseName(const char* localeID
,
1945 int32_t nameCapacity
,
1948 return _canonicalize(localeID
, name
, nameCapacity
, _ULOC_STRIP_KEYWORDS
, err
);
1951 U_CAPI
int32_t U_EXPORT2
1952 uloc_canonicalize(const char* localeID
,
1954 int32_t nameCapacity
,
1957 return _canonicalize(localeID
, name
, nameCapacity
, _ULOC_CANONICALIZE
, err
);
1960 U_CAPI
const char* U_EXPORT2
1961 uloc_getISO3Language(const char* localeID
)
1964 char lang
[ULOC_LANG_CAPACITY
];
1965 UErrorCode err
= U_ZERO_ERROR
;
1967 if (localeID
== NULL
)
1969 localeID
= uloc_getDefault();
1971 uloc_getLanguage(localeID
, lang
, ULOC_LANG_CAPACITY
, &err
);
1974 offset
= _findIndex(LANGUAGES
, lang
);
1977 return LANGUAGES_3
[offset
];
1980 U_CAPI
const char* U_EXPORT2
1981 uloc_getISO3Country(const char* localeID
)
1984 char cntry
[ULOC_LANG_CAPACITY
];
1985 UErrorCode err
= U_ZERO_ERROR
;
1987 if (localeID
== NULL
)
1989 localeID
= uloc_getDefault();
1991 uloc_getCountry(localeID
, cntry
, ULOC_LANG_CAPACITY
, &err
);
1994 offset
= _findIndex(COUNTRIES
, cntry
);
1998 return COUNTRIES_3
[offset
];
2001 U_CAPI
uint32_t U_EXPORT2
2002 uloc_getLCID(const char* localeID
)
2004 UErrorCode status
= U_ZERO_ERROR
;
2005 char langID
[ULOC_FULLNAME_CAPACITY
];
2008 /* Check for incomplete id. */
2009 if (!localeID
|| uprv_strlen(localeID
) < 2) {
2013 // First, attempt Windows platform lookup if available, but fall
2014 // through to catch any special cases (ICU vs Windows name differences).
2015 lcid
= uprv_convertToLCIDPlatform(localeID
, &status
);
2016 if (U_FAILURE(status
)) {
2020 // Windows found an LCID, return that
2024 uloc_getLanguage(localeID
, langID
, sizeof(langID
), &status
);
2025 if (U_FAILURE(status
) || status
== U_STRING_NOT_TERMINATED_WARNING
) {
2029 if (uprv_strchr(localeID
, '@')) {
2030 // uprv_convertToLCID does not support keywords other than collation.
2031 // Remove all keywords except collation.
2033 char collVal
[ULOC_KEYWORDS_CAPACITY
];
2034 char tmpLocaleID
[ULOC_FULLNAME_CAPACITY
];
2036 len
= uloc_getKeywordValue(localeID
, "collation", collVal
,
2037 UPRV_LENGTHOF(collVal
) - 1, &status
);
2039 if (U_SUCCESS(status
) && len
> 0) {
2042 len
= uloc_getBaseName(localeID
, tmpLocaleID
,
2043 UPRV_LENGTHOF(tmpLocaleID
) - 1, &status
);
2045 if (U_SUCCESS(status
) && len
> 0) {
2046 tmpLocaleID
[len
] = 0;
2048 len
= uloc_setKeywordValue("collation", collVal
, tmpLocaleID
,
2049 UPRV_LENGTHOF(tmpLocaleID
) - len
- 1, &status
);
2051 if (U_SUCCESS(status
) && len
> 0) {
2052 tmpLocaleID
[len
] = 0;
2053 return uprv_convertToLCID(langID
, tmpLocaleID
, &status
);
2058 // fall through - all keywords are simply ignored
2059 status
= U_ZERO_ERROR
;
2062 return uprv_convertToLCID(langID
, localeID
, &status
);
2065 U_CAPI
int32_t U_EXPORT2
2066 uloc_getLocaleForLCID(uint32_t hostid
, char *locale
, int32_t localeCapacity
,
2069 return uprv_convertToPosix(hostid
, locale
, localeCapacity
, status
);
2072 /* ### Default locale **************************************************/
2074 U_CAPI
const char* U_EXPORT2
2077 return locale_get_default();
2080 U_CAPI
void U_EXPORT2
2081 uloc_setDefault(const char* newDefaultLocale
,
2084 if (U_FAILURE(*err
))
2086 /* the error code isn't currently used for anything by this function*/
2088 /* propagate change to C++ */
2089 locale_set_default(newDefaultLocale
);
2093 * Returns a list of all 2-letter language codes defined in ISO 639. This is a pointer
2094 * to an array of pointers to arrays of char. All of these pointers are owned
2095 * by ICU-- do not delete them, and do not write through them. The array is
2096 * terminated with a null pointer.
2098 U_CAPI
const char* const* U_EXPORT2
2099 uloc_getISOLanguages()
2105 * Returns a list of all 2-letter country codes defined in ISO 3166. This is a
2106 * pointer to an array of pointers to arrays of char. All of these pointers are
2107 * owned by ICU-- do not delete them, and do not write through them. The array is
2108 * terminated with a null pointer.
2110 U_CAPI
const char* const* U_EXPORT2
2111 uloc_getISOCountries()
2117 /* this function to be moved into cstring.c later */
2118 static char gDecimal
= 0;
2123 _uloc_strtod(const char *start
, char **end
) {
2130 /* For machines that decide to change the decimal on you,
2131 and try to be too smart with localization.
2132 This normally should be just a '.'. */
2133 sprintf(rep
, "%+1.1f", 1.0);
2137 if(gDecimal
== '.') {
2138 return uprv_strtod(start
, end
); /* fall through to OS */
2140 uprv_strncpy(buf
, start
, 29);
2142 decimal
= uprv_strchr(buf
, '.');
2144 *decimal
= gDecimal
;
2146 return uprv_strtod(start
, end
); /* no decimal point */
2148 rv
= uprv_strtod(buf
, &myEnd
);
2150 *end
= (char*)(start
+(myEnd
-buf
)); /* cast away const (to follow uprv_strtod API.) */
2158 int32_t dummy
; /* to avoid uninitialized memory copy from qsort */
2159 char locale
[ULOC_FULLNAME_CAPACITY
+1];
2162 static int32_t U_CALLCONV
2163 uloc_acceptLanguageCompare(const void * /*context*/, const void *a
, const void *b
)
2165 const _acceptLangItem
*aa
= (const _acceptLangItem
*)a
;
2166 const _acceptLangItem
*bb
= (const _acceptLangItem
*)b
;
2170 rc
= -1; /* A > B */
2171 } else if(bb
->q
> aa
->q
) {
2178 rc
= uprv_stricmp(aa
->locale
, bb
->locale
);
2181 #if defined(ULOC_DEBUG)
2182 /* fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2192 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2195 U_CAPI
int32_t U_EXPORT2
2196 uloc_acceptLanguageFromHTTP(char *result
, int32_t resultAvailable
, UAcceptResult
*outResult
,
2197 const char *httpAcceptLanguage
,
2198 UEnumeration
* availableLocales
,
2201 MaybeStackArray
<_acceptLangItem
, 4> items
; // Struct for collecting items.
2202 char tmp
[ULOC_FULLNAME_CAPACITY
+1];
2204 const char *itemEnd
;
2205 const char *paramEnd
;
2210 int32_t l
= (int32_t)uprv_strlen(httpAcceptLanguage
);
2212 if(U_FAILURE(*status
)) {
2216 for(s
=httpAcceptLanguage
;s
&&*s
;) {
2217 while(isspace(*s
)) /* eat space at the beginning */
2219 itemEnd
=uprv_strchr(s
,',');
2220 paramEnd
=uprv_strchr(s
,';');
2222 itemEnd
= httpAcceptLanguage
+l
; /* end of string */
2224 if(paramEnd
&& paramEnd
<itemEnd
) {
2225 /* semicolon (;) is closer than end (,) */
2230 while(isspace(*t
)) {
2236 while(isspace(*t
)) {
2239 items
[n
].q
= (float)_uloc_strtod(t
,NULL
);
2241 /* no semicolon - it's 1.0 */
2246 /* eat spaces prior to semi */
2247 for(t
=(paramEnd
-1);(paramEnd
>s
)&&isspace(*t
);t
--)
2249 int32_t slen
= static_cast<int32_t>(((t
+1)-s
));
2250 if(slen
> ULOC_FULLNAME_CAPACITY
) {
2251 *status
= U_BUFFER_OVERFLOW_ERROR
;
2252 return -1; // too big
2254 uprv_strncpy(items
[n
].locale
, s
, slen
);
2255 items
[n
].locale
[slen
]=0; // terminate
2256 int32_t clen
= uloc_canonicalize(items
[n
].locale
, tmp
, UPRV_LENGTHOF(tmp
)-1, status
);
2257 if(U_FAILURE(*status
)) return -1;
2258 if((clen
!=slen
) || (uprv_strncmp(items
[n
].locale
, tmp
, slen
))) {
2259 // canonicalization had an effect- copy back
2260 uprv_strncpy(items
[n
].locale
, tmp
, clen
);
2261 items
[n
].locale
[clen
] = 0; // terminate
2263 #if defined(ULOC_DEBUG)
2264 /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2268 while(*s
==',') { /* eat duplicate commas */
2271 if(n
>=items
.getCapacity()) { // If we need more items
2272 if(NULL
== items
.resize(items
.getCapacity()*2, items
.getCapacity())) {
2273 *status
= U_MEMORY_ALLOCATION_ERROR
;
2276 #if defined(ULOC_DEBUG)
2277 fprintf(stderr
,"malloced at size %d\n", items
.getCapacity());
2281 uprv_sortArray(items
.getAlias(), n
, sizeof(items
[0]), uloc_acceptLanguageCompare
, NULL
, TRUE
, status
);
2282 if (U_FAILURE(*status
)) {
2285 LocalMemory
<const char*> strs(NULL
);
2286 if (strs
.allocateInsteadAndReset(n
) == NULL
) {
2287 *status
= U_MEMORY_ALLOCATION_ERROR
;
2291 #if defined(ULOC_DEBUG)
2292 /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2294 strs
[i
]=items
[i
].locale
;
2296 res
= uloc_acceptLanguage(result
, resultAvailable
, outResult
,
2297 strs
.getAlias(), n
, availableLocales
, status
);
2302 U_CAPI
int32_t U_EXPORT2
2303 uloc_acceptLanguage(char *result
, int32_t resultAvailable
,
2304 UAcceptResult
*outResult
, const char **acceptList
,
2305 int32_t acceptListCount
,
2306 UEnumeration
* availableLocales
,
2312 char tmp
[ULOC_FULLNAME_CAPACITY
+1];
2314 char **fallbackList
;
2315 if(U_FAILURE(*status
)) {
2318 fallbackList
= static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList
[0])*acceptListCount
)));
2319 if(fallbackList
==NULL
) {
2320 *status
= U_MEMORY_ALLOCATION_ERROR
;
2323 for(i
=0;i
<acceptListCount
;i
++) {
2324 #if defined(ULOC_DEBUG)
2325 fprintf(stderr
,"%02d: %s\n", i
, acceptList
[i
]);
2327 while((l
=uenum_next(availableLocales
, NULL
, status
)) != NULL
) {
2328 #if defined(ULOC_DEBUG)
2329 fprintf(stderr
," %s\n", l
);
2331 len
= (int32_t)uprv_strlen(l
);
2332 if(!uprv_strcmp(acceptList
[i
], l
)) {
2334 *outResult
= ULOC_ACCEPT_VALID
;
2336 #if defined(ULOC_DEBUG)
2337 fprintf(stderr
, "MATCH! %s\n", l
);
2340 uprv_strncpy(result
, l
, uprv_min(len
, resultAvailable
));
2343 uprv_free(fallbackList
[j
]);
2345 uprv_free(fallbackList
);
2346 return u_terminateChars(result
, resultAvailable
, len
, status
);
2352 uenum_reset(availableLocales
, status
);
2353 /* save off parent info */
2354 if(uloc_getParent(acceptList
[i
], tmp
, UPRV_LENGTHOF(tmp
), status
)!=0) {
2355 fallbackList
[i
] = uprv_strdup(tmp
);
2361 for(maxLen
--;maxLen
>0;maxLen
--) {
2362 for(i
=0;i
<acceptListCount
;i
++) {
2363 if(fallbackList
[i
] && ((int32_t)uprv_strlen(fallbackList
[i
])==maxLen
)) {
2364 #if defined(ULOC_DEBUG)
2365 fprintf(stderr
,"Try: [%s]", fallbackList
[i
]);
2367 while((l
=uenum_next(availableLocales
, NULL
, status
)) != NULL
) {
2368 #if defined(ULOC_DEBUG)
2369 fprintf(stderr
," %s\n", l
);
2371 len
= (int32_t)uprv_strlen(l
);
2372 if(!uprv_strcmp(fallbackList
[i
], l
)) {
2374 *outResult
= ULOC_ACCEPT_FALLBACK
;
2376 #if defined(ULOC_DEBUG)
2377 fprintf(stderr
, "fallback MATCH! %s\n", l
);
2380 uprv_strncpy(result
, l
, uprv_min(len
, resultAvailable
));
2382 for(j
=0;j
<acceptListCount
;j
++) {
2383 uprv_free(fallbackList
[j
]);
2385 uprv_free(fallbackList
);
2386 return u_terminateChars(result
, resultAvailable
, len
, status
);
2389 uenum_reset(availableLocales
, status
);
2391 if(uloc_getParent(fallbackList
[i
], tmp
, UPRV_LENGTHOF(tmp
), status
)!=0) {
2392 uprv_free(fallbackList
[i
]);
2393 fallbackList
[i
] = uprv_strdup(tmp
);
2395 uprv_free(fallbackList
[i
]);
2401 *outResult
= ULOC_ACCEPT_FAILED
;
2404 for(i
=0;i
<acceptListCount
;i
++) {
2405 uprv_free(fallbackList
[i
]);
2407 uprv_free(fallbackList
);
2411 U_CAPI
const char* U_EXPORT2
2412 uloc_toUnicodeLocaleKey(const char* keyword
)
2414 const char* bcpKey
= ulocimp_toBcpKey(keyword
);
2415 if (bcpKey
== NULL
&& ultag_isUnicodeLocaleKey(keyword
, -1)) {
2416 // unknown keyword, but syntax is fine..
2422 U_CAPI
const char* U_EXPORT2
2423 uloc_toUnicodeLocaleType(const char* keyword
, const char* value
)
2425 const char* bcpType
= ulocimp_toBcpType(keyword
, value
, NULL
, NULL
);
2426 if (bcpType
== NULL
&& ultag_isUnicodeLocaleType(value
, -1)) {
2427 // unknown keyword, but syntax is fine..
2434 isWellFormedLegacyKey(const char* legacyKey
)
2436 const char* p
= legacyKey
;
2438 if (!UPRV_ISALPHANUM(*p
)) {
2447 isWellFormedLegacyType(const char* legacyType
)
2449 const char* p
= legacyType
;
2450 int32_t alphaNumLen
= 0;
2452 if (*p
== '_' || *p
== '/' || *p
== '-') {
2453 if (alphaNumLen
== 0) {
2457 } else if (UPRV_ISALPHANUM(*p
)) {
2464 return (alphaNumLen
!= 0);
2467 U_CAPI
const char* U_EXPORT2
2468 uloc_toLegacyKey(const char* keyword
)
2470 const char* legacyKey
= ulocimp_toLegacyKey(keyword
);
2471 if (legacyKey
== NULL
) {
2472 // Checks if the specified locale key is well-formed with the legacy locale syntax.
2475 // LDML/CLDR provides some definition of keyword syntax in
2476 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2477 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2478 // Keys can only consist of [0-9a-zA-Z].
2479 if (isWellFormedLegacyKey(keyword
)) {
2486 U_CAPI
const char* U_EXPORT2
2487 uloc_toLegacyType(const char* keyword
, const char* value
)
2489 const char* legacyType
= ulocimp_toLegacyType(keyword
, value
, NULL
, NULL
);
2490 if (legacyType
== NULL
) {
2491 // Checks if the specified locale type is well-formed with the legacy locale syntax.
2494 // LDML/CLDR provides some definition of keyword syntax in
2495 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2496 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2497 // Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2498 // we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2499 if (isWellFormedLegacyType(value
)) {