]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/uloc.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / common / uloc.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4**********************************************************************
2ca993e8 5* Copyright (C) 1997-2016, International Business Machines
b75a7d8f
A
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8*
9* File ULOC.CPP
10*
11* Modification History:
12*
13* Date Name Description
14* 04/01/97 aliu Creation.
15* 08/21/98 stephen JDK 1.2 sync
16* 12/08/98 rtg New Locale implementation and C API
17* 03/15/99 damiba overhaul.
18* 04/06/99 stephen changed setDefault() to realloc and copy
19* 06/14/99 stephen Changed calls to ures_open for new params
20* 07/21/99 stephen Modified setDefault() to propagate to C++
374ca955
A
21* 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs,
22* brought canonicalization code into line with spec
b75a7d8f
A
23*****************************************************************************/
24
25/*
26 POSIX's locale format, from putil.c: [no spaces]
27
28 ll [ _CC ] [ . MM ] [ @ VV]
29
30 l = lang, C = ctry, M = charmap, V = variant
31*/
32
b75a7d8f
A
33#include "unicode/utypes.h"
34#include "unicode/ustring.h"
35#include "unicode/uloc.h"
36
374ca955 37#include "putilimp.h"
b75a7d8f 38#include "ustr_imp.h"
374ca955 39#include "ulocimp.h"
b75a7d8f
A
40#include "umutex.h"
41#include "cstring.h"
42#include "cmemory.h"
374ca955
A
43#include "locmap.h"
44#include "uarrsort.h"
45#include "uenumimp.h"
46#include "uassert.h"
f3c0d7a5 47#include "charstr.h"
b75a7d8f 48
374ca955
A
49#include <stdio.h> /* for sprintf */
50
f3c0d7a5
A
51U_NAMESPACE_USE
52
374ca955 53/* ### Declarations **************************************************/
b75a7d8f
A
54
55/* Locale stuff from locid.cpp */
56U_CFUNC void locale_set_default(const char *id);
57U_CFUNC const char *locale_get_default(void);
374ca955
A
58U_CFUNC int32_t
59locale_getKeywords(const char *localeID,
60 char prev,
61 char *keywords, int32_t keywordCapacity,
62 char *values, int32_t valuesCapacity, int32_t *valLen,
63 UBool valuesToo,
64 UErrorCode *status);
65
374ca955
A
66/* ### Data tables **************************************************/
67
68/**
69 * Table of language codes, both 2- and 3-letter, with preference
70 * given to 2-letter codes where possible. Includes 3-letter codes
71 * that lack a 2-letter equivalent.
72 *
73 * This list must be in sorted order. This list is returned directly
74 * to the user by some API.
75 *
76 * This list must be kept in sync with LANGUAGES_3, with corresponding
77 * entries matched.
78 *
79 * This table should be terminated with a NULL entry, followed by a
80 * second list, and another NULL entry. The first list is visible to
81 * user code when this array is returned by API. The second list
82 * contains codes we support, but do not expose through user API.
83 *
84 * Notes
85 *
86 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
87 * include the revisions up to 2001/7/27 *CWB*
88 *
89 * The 3 character codes are the terminology codes like RFC 3066. This
90 * is compatible with prior ICU codes
91 *
92 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
93 * table but now at the end of the table because 3 character codes are
94 * duplicates. This avoids bad searches going from 3 to 2 character
95 * codes.
96 *
97 * The range qaa-qtz is reserved for local use
98 */
51004dcb 99/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
2ca993e8 100/* ISO639 table version is 20150505 */
0f5d89e8 101/* Subsequent hand addition of selected languages */
/*
 * Layout: the primary list, a NULL terminator, the obsolete codes
 * ("in", "iw", "ji", "jw", "sh"), and a second NULL terminator.
 * Entry i here corresponds to entry i of LANGUAGES_3.
 */
static const char * const LANGUAGES[] = {
    "aa", "ab", "ace", "ach", "ada", "ady", "ae", "aeb",
    "af", "afh", "agq", "ain", "ak", "akk", "akz", "ale",
    "aln", "alt", "am", "an", "ang", "anp", "ar", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av", "avk", "awa", "ay", "az",
    "ba", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be", "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgn", "bho", "bi", "bik", "bin", "bjn", "bkm", "bla",
    "bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh",
    "brx", "bs", "bss", "bua", "bug", "bum", "byn", "byv",
    "ca", "cad", "car", "cay", "cch", "ccp", "ce", "ceb", "cgg",
    "ch", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co", "cop", "cps", "cr", "crh",
    "cs", "csb", "cu", "cv", "cy",
    "da", "dak", "dar", "dav", "de", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz", "dzg",
    "ebu", "ee", "efi", "egl", "egy", "eka", "el", "elx",
    "en", "enm", "eo", "es", "esu", "et", "eu", "ewo",
    "ext",
    "fa", "fan", "fat", "ff", "fi", "fil", "fit", "fj",
    "fo", "fon", "fr", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga", "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl", "glk", "gmh", "gn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu", "guc",
    "gur", "guz", "gv", "gwi",
    "ha", "hai", "hak", "haw", "he", "hi", "hif", "hil",
    "hit", "hmn", "ho", "hr", "hsb", "hsn", "ht", "hu",
    "hup", "hy", "hz",
    "ia", "iba", "ibb", "id", "ie", "ig", "ii", "ik",
    "ilo", "inh", "io", "is", "it", "iu", "izh",
    "ja", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg", "kgp",
    "kha", "kho", "khq", "khw", "ki", "kiu", "kj", "kk",
    "kkj", "kl", "kln", "km", "kmb", "kn", "ko", "koi",
    "kok", "kos", "kpe", "kr", "krc", "kri", "krj", "krl",
    "kru", "ks", "ksb", "ksf", "ksh", "ku", "kum", "kut",
    "kv", "kw", "ky",
    "la", "lad", "lag", "lah", "lam", "lb", "lez", "lfn",
    "lg", "li", "lij", "liv", "lkt", "lmo", "ln", "lo",
    "lol", "loz", "lrc", "lt", "ltg", "lu", "lua", "lui",
    "lun", "luo", "lus", "luy", "lv", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg", "mga",
    "mgh", "mgo", "mh", "mi", "mic", "min", "mis", "mk",
    "ml", "mn", "mnc", "mni", "mo",
    "moh", "mos", "mr", "mrj",
    "ms", "mt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my", "mye", "myv", "mzn",
    "na", "nan", "nap", "naq", "nb", "nd", "nds", "ne",
    "new", "ng", "nia", "niu", "njo", "nl", "nmg", "nn",
    "nnh", "no", "nog", "non", "nov", "nqo", "nr", "nso",
    "nus", "nv", "nwc", "ny", "nym", "nyn", "nyo", "nzi",
    "oc", "oj", "om", "or", "os", "osa", "ota",
    "pa", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pi", "pl", "pms", "pnt",
    "pon", "prg", "pro", "ps", "pt",
    "qu", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm", "rn", "ro",
    "rof", "rom", "rtm", "ru", "rue", "rug", "rup",
    "rw", "rwk",
    "sa", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc", "scn", "sco", "sd", "sdc", "sdh",
    "se", "see", "seh", "sei", "sel", "ses", "sg", "sga",
    "sgs", "shi", "shn", "shu", "si", "sid", "sk",
    "sl", "sli", "sly", "sm", "sma", "smj", "smn", "sms",
    "sn", "snk", "so", "sog", "sq", "sr", "srn", "srr",
    "ss", "ssy", "st", "stq", "su", "suk", "sus", "sux",
    "sv", "sw", "swb", "swc", "syc", "syr", "szl",
    "ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg",
    "th", "ti", "tig", "tiv", "tk", "tkl", "tkr", "tl",
    "tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tpi",
    "tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt",
    "tum", "tvl", "tw", "twq", "ty", "tyv", "tzm",
    "udm", "ug", "uga", "uk", "umb", "und", "ur", "uz",
    "vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vo",
    "vot", "vro", "vun",
    "wa", "wae", "wal", "war", "was", "wbp", "wo", "wuu",
    "xal", "xh", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yi", "yo", "yrl", "yue",
    "za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu",
    "zun", "zxx", "zza",
NULL,
    "in", "iw", "ji", "jw", "sh", /* obsolete language codes */
NULL
};
51004dcb 192
73c04bcf
A
/*
 * Parallel arrays: DEPRECATED_LANGUAGES[i] is paired with
 * REPLACEMENT_LANGUAGES[i]. Both end with two NULL entries.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", NULL, NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", NULL, NULL
};
b75a7d8f 199
374ca955
A
200/**
201 * Table of 3-letter language codes.
202 *
203 * This is a lookup table used to convert 3-letter language codes to
204 * their 2-letter equivalent, where possible. It must be kept in sync
205 * with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the
206 * same language as LANGUAGES_3[i]. The commented-out lines are
207 * copied from LANGUAGES to make eyeballing this baby easier.
208 *
209 * Where a 3-letter language code has no 2-letter equivalent, the
210 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
211 *
212 * This table should be terminated with a NULL entry, followed by a
213 * second list, and another NULL entry. The two lists correspond to
214 * the two lists in LANGUAGES.
215 */
51004dcb 216/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
2ca993e8 217/* ISO639 table version is 20150505 */
0f5d89e8 218/* Subsequent hand addition of selected languages */
/*
 * Layout mirrors LANGUAGES: the primary list, a NULL terminator, the
 * 3-letter codes for the obsolete entries, and a second NULL terminator.
 */
static const char * const LANGUAGES_3[] = {
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni", "mol",
    "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "swc", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
NULL,
/*  "in",  "iw",  "ji",  "jw",  "sh", */
    "ind", "heb", "yid", "jaw", "srp",
NULL
};
310
374ca955
A
311/**
312 * Table of 2-letter country codes.
313 *
314 * This list must be in sorted order. This list is returned directly
315 * to the user by some API.
316 *
317 * This list must be kept in sync with COUNTRIES_3, with corresponding
318 * entries matched.
319 *
320 * This table should be terminated with a NULL entry, followed by a
321 * second list, and another NULL entry. The first list is visible to
322 * user code when this array is returned by API. The second list
323 * contains codes we support, but do not expose through user API.
324 *
325 * Notes:
326 *
327 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
328 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
329 * new codes keeping the old ones for compatibility updated to include
330 * 1999/12/03 revisions *CWB*
331 *
332 * RO(ROM) is now RO(ROU) according to
333 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
334 */
/*
 * Layout: the primary list, a NULL terminator, the obsolete country codes,
 * and a second NULL terminator. Entry i corresponds to COUNTRIES_3[i].
 */
static const char * const COUNTRIES[] = {
    "AC", "AD", "AE", "AF", "AG", "AI", "AL", "AM",
    "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ",
    "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI",
    "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV",
    "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG",
    "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CP", "CR",
    "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DG", "DJ", "DK",
    "DM", "DO", "DZ", "EA", "EC", "EE", "EG", "EH", "ER",
    "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR",
    "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL",
    "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU",
    "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU",
    "IC", "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS",
    "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI",
    "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA",
    "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU",
    "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK",
    "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS",
    "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA",
    "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP",
    "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG",
    "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT",
    "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA",
    "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ",
    "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV",
    "SX", "SY", "SZ", "TA", "TC", "TD", "TF", "TG", "TH", "TJ",
    "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV",
    "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ",
    "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF",
    "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW",
NULL,
    "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR", /* obsolete country codes */
NULL
};
370
51004dcb
A
/*
 * Parallel arrays: DEPRECATED_COUNTRIES[i] is paired with
 * REPLACEMENT_COUNTRIES[i]. Both end with two NULL entries.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
};
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL /* replacement country codes */
};
f3c0d7a5 378
374ca955
A
379/**
380 * Table of 3-letter country codes.
381 *
382 * This is a lookup table used to convert 3-letter country codes to
383 * their 2-letter equivalent. It must be kept in sync with COUNTRIES.
384 * For all valid i, COUNTRIES[i] must refer to the same country as
385 * COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES
386 * to make eyeballing this baby easier.
387 *
388 * This table should be terminated with a NULL entry, followed by a
389 * second list, and another NULL entry. The two lists correspond to
390 * the two lists in COUNTRIES.
391 */
/*
 * Layout mirrors COUNTRIES: the primary list, a NULL terminator, the 3-letter
 * codes for the obsolete entries, and a second NULL terminator. Each
 * commented-out row repeats the matching COUNTRIES row for comparison.
 */
static const char * const COUNTRIES_3[] = {
/*  "AC",  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "ASC", "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CP",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CPT", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "EA ", "ECU", "EST", "EGY", "ESH", "ERI", /* no valid 3-letter code for EA */
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "IC ", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL", /* no valid 3-letter code for IC */
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TA",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TAA", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL,
/*  "AN",  "BU",  "CS",  "FX",  "RO",  "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
NULL
};
458
374ca955
A
/* One row of the locale-ID canonicalization table: maps `id` to `canonicalID`. */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
} CanonicalizationMap;
463
464/**
465 * A map to canonicalize locale IDs. This handles a variety of
466 * different semantic kinds of transformations.
467 */
468static const CanonicalizationMap CANONICALIZE_MAP[] = {
3d1f044b
A
469 { "", "en_US_POSIX" }, /* .NET name */ // open ICU 64 deleted, we restore
470 { "c", "en_US_POSIX" }, /* POSIX name */ // open ICU 64 deleted, we restore
471 { "posix", "en_US_POSIX" }, /* POSIX name (alias of C) */ // open ICU 64 deleted, we restore
472 { "art_LOJBAN", "jbo" }, /* registered name */
473 { "hy__AREVELA", "hy" }, /* Registered IANA variant */
474 { "hy__AREVMDA", "hyw" }, /* Registered IANA variant */
475 { "zh_GAN", "gan" }, /* registered name */
476 { "zh_GUOYU", "zh" }, /* registered name */
477 { "zh_HAKKA", "hak" }, /* registered name */
478 { "zh_MIN_NAN", "nan" }, /* registered name */
479 { "zh_WUU", "wuu" }, /* registered name */
480 { "zh_XIANG", "hsn" }, /* registered name */
481 { "zh_YUE", "yue" }, /* registered name */
374ca955
A
482};
483
729e4ab9
A
484/* ### BCP47 Conversion *******************************************/
/*
 * Test whether the locale id looks like it carries a BCP47 extension: it is
 * non-NULL, contains no '@', and getShortestSubtagLength() reports a
 * one-character subtag.
 * The macro now inspects its own argument; it previously passed a
 * caller-scope variable named `localeID` to getShortestSubtagLength(), which
 * only worked when the caller's variable happened to have that name.
 * Note: `id` is evaluated more than once, so pass an expression without
 * side effects.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to a Unicode locale id via uloc_forLanguageTag().
 * On success finalID points at `buffer`; if the conversion yields nothing,
 * fails, or leaves the buffer unterminated, finalID is set to the original
 * `id`. An unterminated-string warning is promoted to
 * U_BUFFER_OVERFLOW_ERROR.
 */
#define _ConvertBCP47(finalID, id, buffer, length,err) UPRV_BLOCK_MACRO_BEGIN { \
    if (uloc_forLanguageTag((id), (buffer), (length), NULL, (err)) <= 0 ||  \
            U_FAILURE(*(err)) || *(err) == U_STRING_NOT_TERMINATED_WARNING) { \
        finalID=(id); \
        if (*(err) == U_STRING_NOT_TERMINATED_WARNING) { *(err) = U_BUFFER_OVERFLOW_ERROR; } \
    } else { \
        finalID=(buffer); \
    } \
} UPRV_BLOCK_MACRO_END
729e4ab9
A
497/* Gets the size of the shortest subtag in the given localeID. */
498static int32_t getShortestSubtagLength(const char *localeID) {
0f5d89e8 499 int32_t localeIDLength = static_cast<int32_t>(uprv_strlen(localeID));
729e4ab9
A
500 int32_t length = localeIDLength;
501 int32_t tmpLength = 0;
502 int32_t i;
503 UBool reset = TRUE;
504
505 for (i = 0; i < localeIDLength; i++) {
506 if (localeID[i] != '_' && localeID[i] != '-') {
507 if (reset) {
508 tmpLength = 0;
509 reset = FALSE;
510 }
511 tmpLength++;
512 } else {
513 if (tmpLength != 0 && tmpLength < length) {
514 length = tmpLength;
515 }
516 reset = TRUE;
517 }
518 }
519
520 return length;
521}
522
374ca955 523/* ### Keywords **************************************************/
f3c0d7a5
A
/* True for the ASCII decimal digits '0'..'9'. */
#define UPRV_ISDIGIT(c) ((c) >= '0' && (c) <= '9')
/* True for ASCII letters (per uprv_isASCIILetter) and ASCII digits. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c))
/* Punctuation/symbols allowed in legacy key values */
#define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size of the internal buffer that holds one keyword name, terminator included. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Maximum number of keywords collected from a single locale ID. */
#define ULOC_MAX_NO_KEYWORDS 25
531
729e4ab9 532U_CAPI const char * U_EXPORT2
374ca955 533locale_getKeywordsStart(const char *localeID) {
374ca955 534 const char *result = NULL;
374ca955
A
535 if((result = uprv_strchr(localeID, '@')) != NULL) {
536 return result;
73c04bcf
A
537 }
538#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
539 else {
540 /* We do this because the @ sign is variant, and the @ sign used on one
541 EBCDIC machine won't be compiled the same way on other EBCDIC based
542 machines. */
543 static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
374ca955
A
544 const uint8_t *charToFind = ebcdicSigns;
545 while(*charToFind) {
546 if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
547 return result;
548 }
549 charToFind++;
550 }
551 }
73c04bcf 552#endif
374ca955
A
553 return NULL;
554}
555
556/**
557 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
558 * @param keywordName incoming name to be canonicalized
559 * @param status return status (keyword too long)
560 * @return length of the keyword name
561 */
562static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
563{
f3c0d7a5
A
564 int32_t keywordNameLen = 0;
565
566 for (; *keywordName != 0; keywordName++) {
567 if (!UPRV_ISALPHANUM(*keywordName)) {
568 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
569 return 0;
570 }
571 if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
572 buf[keywordNameLen++] = uprv_tolower(*keywordName);
573 } else {
574 /* keyword name too long for internal buffer */
575 *status = U_INTERNAL_PROGRAM_ERROR;
576 return 0;
577 }
374ca955 578 }
f3c0d7a5
A
579 if (keywordNameLen == 0) {
580 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
581 return 0;
374ca955 582 }
f3c0d7a5
A
583 buf[keywordNameLen] = 0; /* terminate */
584
374ca955
A
585 return keywordNameLen;
586}
587
588typedef struct {
589 char keyword[ULOC_KEYWORD_BUFFER_LEN];
590 int32_t keywordLen;
591 const char *valueStart;
592 int32_t valueLen;
593} KeywordStruct;
594
595static int32_t U_CALLCONV
4388f060 596compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
374ca955
A
597 const char* leftString = ((const KeywordStruct *)left)->keyword;
598 const char* rightString = ((const KeywordStruct *)right)->keyword;
599 return uprv_strcmp(leftString, rightString);
600}
601
374ca955
A
602static int32_t
603_getKeywords(const char *localeID,
604 char prev,
605 char *keywords, int32_t keywordCapacity,
606 char *values, int32_t valuesCapacity, int32_t *valLen,
607 UBool valuesToo,
374ca955
A
608 UErrorCode *status)
609{
610 KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
f3c0d7a5 611
374ca955
A
612 int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
613 int32_t numKeywords = 0;
614 const char* pos = localeID;
615 const char* equalSign = NULL;
616 const char* semicolon = NULL;
617 int32_t i = 0, j, n;
618 int32_t keywordsLen = 0;
619 int32_t valuesLen = 0;
620
621 if(prev == '@') { /* start of keyword definition */
622 /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
623 do {
624 UBool duplicate = FALSE;
625 /* skip leading spaces */
626 while(*pos == ' ') {
627 pos++;
628 }
629 if (!*pos) { /* handle trailing "; " */
630 break;
631 }
632 if(numKeywords == maxKeywords) {
633 *status = U_INTERNAL_PROGRAM_ERROR;
634 return 0;
635 }
636 equalSign = uprv_strchr(pos, '=');
637 semicolon = uprv_strchr(pos, ';');
638 /* lack of '=' [foo@currency] is illegal */
639 /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
640 if(!equalSign || (semicolon && semicolon<equalSign)) {
641 *status = U_INVALID_FORMAT_ERROR;
642 return 0;
643 }
644 /* need to normalize both keyword and keyword name */
645 if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
646 /* keyword name too long for internal buffer */
647 *status = U_INTERNAL_PROGRAM_ERROR;
648 return 0;
649 }
650 for(i = 0, n = 0; i < equalSign - pos; ++i) {
651 if (pos[i] != ' ') {
652 keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
653 }
654 }
57a6839d
A
655
656 /* zero-length keyword is an error. */
657 if (n == 0) {
658 *status = U_INVALID_FORMAT_ERROR;
659 return 0;
660 }
661
374ca955
A
662 keywordList[numKeywords].keyword[n] = 0;
663 keywordList[numKeywords].keywordLen = n;
664 /* now grab the value part. First we skip the '=' */
665 equalSign++;
666 /* then we leading spaces */
667 while(*equalSign == ' ') {
668 equalSign++;
669 }
57a6839d
A
670
671 /* Premature end or zero-length value */
2ca993e8 672 if (!*equalSign || equalSign == semicolon) {
57a6839d
A
673 *status = U_INVALID_FORMAT_ERROR;
674 return 0;
675 }
676
374ca955 677 keywordList[numKeywords].valueStart = equalSign;
57a6839d 678
374ca955
A
679 pos = semicolon;
680 i = 0;
681 if(pos) {
682 while(*(pos - i - 1) == ' ') {
683 i++;
684 }
73c04bcf 685 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
374ca955
A
686 pos++;
687 } else {
73c04bcf 688 i = (int32_t)uprv_strlen(equalSign);
4388f060 689 while(i && equalSign[i-1] == ' ') {
374ca955
A
690 i--;
691 }
692 keywordList[numKeywords].valueLen = i;
693 }
694 /* If this is a duplicate keyword, then ignore it */
695 for (j=0; j<numKeywords; ++j) {
696 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
697 duplicate = TRUE;
698 break;
699 }
700 }
701 if (!duplicate) {
702 ++numKeywords;
703 }
704 } while(pos);
705
374ca955
A
706 /* now we have a list of keywords */
707 /* we need to sort it */
708 uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
f3c0d7a5 709
374ca955
A
710 /* Now construct the keyword part */
711 for(i = 0; i < numKeywords; i++) {
712 if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
713 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
714 if(valuesToo) {
715 keywords[keywordsLen + keywordList[i].keywordLen] = '=';
716 } else {
717 keywords[keywordsLen + keywordList[i].keywordLen] = 0;
718 }
719 }
720 keywordsLen += keywordList[i].keywordLen + 1;
721 if(valuesToo) {
3d1f044b 722 if(keywordsLen + keywordList[i].valueLen <= keywordCapacity) {
374ca955
A
723 uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
724 }
725 keywordsLen += keywordList[i].valueLen;
f3c0d7a5 726
374ca955 727 if(i < numKeywords - 1) {
f3c0d7a5 728 if(keywordsLen < keywordCapacity) {
374ca955
A
729 keywords[keywordsLen] = ';';
730 }
731 keywordsLen++;
732 }
733 }
734 if(values) {
735 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
736 uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
737 values[valuesLen + keywordList[i].valueLen] = 0;
738 }
739 valuesLen += keywordList[i].valueLen + 1;
740 }
741 }
742 if(values) {
743 values[valuesLen] = 0;
744 if(valLen) {
745 *valLen = valuesLen;
746 }
747 }
f3c0d7a5 748 return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
374ca955
A
749 } else {
750 return 0;
751 }
752}
753
754U_CFUNC int32_t
755locale_getKeywords(const char *localeID,
756 char prev,
757 char *keywords, int32_t keywordCapacity,
758 char *values, int32_t valuesCapacity, int32_t *valLen,
759 UBool valuesToo,
760 UErrorCode *status) {
761 return _getKeywords(localeID, prev, keywords, keywordCapacity,
762 values, valuesCapacity, valLen, valuesToo,
3d1f044b 763 status);
374ca955
A
764}
765
766U_CAPI int32_t U_EXPORT2
767uloc_getKeywordValue(const char* localeID,
768 const char* keywordName,
769 char* buffer, int32_t bufferCapacity,
770 UErrorCode* status)
f3c0d7a5 771{
340931cb
A
772 if (buffer != nullptr) {
773 buffer[0] = '\0';
774 }
729e4ab9 775 const char* startSearchHere = NULL;
374ca955 776 const char* nextSeparator = NULL;
374ca955
A
777 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
778 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
374ca955
A
779 int32_t result = 0;
780
781 if(status && U_SUCCESS(*status) && localeID) {
729e4ab9
A
782 char tempBuffer[ULOC_FULLNAME_CAPACITY];
783 const char* tmpLocaleID;
784
f3c0d7a5
A
785 if (keywordName == NULL || keywordName[0] == 0) {
786 *status = U_ILLEGAL_ARGUMENT_ERROR;
787 return 0;
788 }
789
790 locale_canonKeywordName(keywordNameBuffer, keywordName, status);
791 if(U_FAILURE(*status)) {
792 return 0;
793 }
794
729e4ab9
A
795 if (_hasBCP47Extension(localeID)) {
796 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
797 } else {
798 tmpLocaleID=localeID;
799 }
f3c0d7a5
A
800
801 startSearchHere = locale_getKeywordsStart(tmpLocaleID);
374ca955
A
802 if(startSearchHere == NULL) {
803 /* no keywords, return at once */
804 return 0;
805 }
806
374ca955
A
807 /* find the first keyword */
808 while(startSearchHere) {
f3c0d7a5
A
809 const char* keyValueTail;
810 int32_t keyValueLen;
811
812 startSearchHere++; /* skip @ or ; */
813 nextSeparator = uprv_strchr(startSearchHere, '=');
814 if(!nextSeparator) {
815 *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
816 return 0;
817 }
818 /* strip leading & trailing spaces (TC decided to tolerate these) */
374ca955
A
819 while(*startSearchHere == ' ') {
820 startSearchHere++;
821 }
f3c0d7a5
A
822 keyValueTail = nextSeparator;
823 while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
824 keyValueTail--;
825 }
826 /* now keyValueTail points to first char after the keyName */
827 /* copy & normalize keyName from locale */
828 if (startSearchHere == keyValueTail) {
829 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
830 return 0;
374ca955 831 }
f3c0d7a5
A
832 keyValueLen = 0;
833 while (startSearchHere < keyValueTail) {
834 if (!UPRV_ISALPHANUM(*startSearchHere)) {
835 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
836 return 0;
837 }
838 if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
839 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
840 } else {
374ca955
A
841 /* keyword name too long for internal buffer */
842 *status = U_INTERNAL_PROGRAM_ERROR;
843 return 0;
f3c0d7a5 844 }
374ca955 845 }
f3c0d7a5
A
846 localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
847
374ca955 848 startSearchHere = uprv_strchr(nextSeparator, ';');
f3c0d7a5 849
374ca955 850 if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
f3c0d7a5
A
851 /* current entry matches the keyword. */
852 nextSeparator++; /* skip '=' */
853 /* First strip leading & trailing spaces (TC decided to tolerate these) */
374ca955 854 while(*nextSeparator == ' ') {
f3c0d7a5 855 nextSeparator++;
374ca955 856 }
f3c0d7a5
A
857 keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
858 while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
859 keyValueTail--;
860 }
861 /* Now copy the value, but check well-formedness */
862 if (nextSeparator == keyValueTail) {
863 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
864 return 0;
374ca955 865 }
f3c0d7a5
A
866 keyValueLen = 0;
867 while (nextSeparator < keyValueTail) {
868 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
869 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
870 return 0;
871 }
872 if (keyValueLen < bufferCapacity) {
873 /* Should we lowercase value to return here? Tests expect as-is. */
874 buffer[keyValueLen++] = *nextSeparator++;
875 } else { /* keep advancing so we return correct length in case of overflow */
876 keyValueLen++;
877 nextSeparator++;
878 }
879 }
880 result = u_terminateChars(buffer, bufferCapacity, keyValueLen, status);
374ca955
A
881 return result;
882 }
883 }
884 }
885 return 0;
886}
887
888U_CAPI int32_t U_EXPORT2
889uloc_setKeywordValue(const char* keywordName,
890 const char* keywordValue,
891 char* buffer, int32_t bufferCapacity,
892 UErrorCode* status)
893{
894 /* TODO: sorting. removal. */
895 int32_t keywordNameLen;
896 int32_t keywordValueLen;
897 int32_t bufLen;
898 int32_t needLen = 0;
374ca955 899 char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
f3c0d7a5 900 char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
374ca955 901 char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
374ca955
A
902 int32_t rc;
903 char* nextSeparator = NULL;
904 char* nextEqualsign = NULL;
905 char* startSearchHere = NULL;
906 char* keywordStart = NULL;
f3c0d7a5
A
907 CharString updatedKeysAndValues;
908 int32_t updatedKeysAndValuesLen;
909 UBool handledInputKeyAndValue = FALSE;
910 char keyValuePrefix = '@';
911
912 if(U_FAILURE(*status)) {
913 return -1;
374ca955 914 }
f3c0d7a5 915 if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
73c04bcf
A
916 *status = U_ILLEGAL_ARGUMENT_ERROR;
917 return 0;
918 }
f3c0d7a5 919 bufLen = (int32_t)uprv_strlen(buffer);
73c04bcf
A
920 if(bufferCapacity<bufLen) {
921 /* The capacity is less than the length?! Is this NULL terminated? */
922 *status = U_ILLEGAL_ARGUMENT_ERROR;
923 return 0;
924 }
374ca955
A
925 keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
926 if(U_FAILURE(*status)) {
927 return 0;
928 }
f3c0d7a5
A
929
930 keywordValueLen = 0;
931 if(keywordValue) {
932 while (*keywordValue != 0) {
933 if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
934 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
935 return 0;
936 }
937 if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
938 /* Should we force lowercase in value to set? */
939 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
940 } else {
941 /* keywordValue too long for internal buffer */
942 *status = U_INTERNAL_PROGRAM_ERROR;
943 return 0;
944 }
945 }
946 }
947 keywordValueBuffer[keywordValueLen] = 0; /* terminate */
948
374ca955 949 startSearchHere = (char*)locale_getKeywordsStart(buffer);
374ca955 950 if(startSearchHere == NULL || (startSearchHere[1]==0)) {
f3c0d7a5
A
951 if(keywordValueLen == 0) { /* no keywords = nothing to remove */
952 return bufLen;
374ca955
A
953 }
954
955 needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
f3c0d7a5 956 if(startSearchHere) { /* had a single @ */
374ca955
A
957 needLen--; /* already had the @ */
958 /* startSearchHere points at the @ */
959 } else {
960 startSearchHere=buffer+bufLen;
961 }
962 if(needLen >= bufferCapacity) {
963 *status = U_BUFFER_OVERFLOW_ERROR;
964 return needLen; /* no change */
965 }
f3c0d7a5 966 *startSearchHere++ = '@';
374ca955
A
967 uprv_strcpy(startSearchHere, keywordNameBuffer);
968 startSearchHere += keywordNameLen;
f3c0d7a5
A
969 *startSearchHere++ = '=';
970 uprv_strcpy(startSearchHere, keywordValueBuffer);
374ca955
A
971 return needLen;
972 } /* end shortcut - no @ */
f3c0d7a5 973
374ca955
A
974 keywordStart = startSearchHere;
975 /* search for keyword */
976 while(keywordStart) {
f3c0d7a5
A
977 const char* keyValueTail;
978 int32_t keyValueLen;
979
980 keywordStart++; /* skip @ or ; */
981 nextEqualsign = uprv_strchr(keywordStart, '=');
982 if (!nextEqualsign) {
983 *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
984 return 0;
985 }
986 /* strip leading & trailing spaces (TC decided to tolerate these) */
374ca955
A
987 while(*keywordStart == ' ') {
988 keywordStart++;
989 }
f3c0d7a5
A
990 keyValueTail = nextEqualsign;
991 while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
992 keyValueTail--;
374ca955 993 }
f3c0d7a5
A
994 /* now keyValueTail points to first char after the keyName */
995 /* copy & normalize keyName from locale */
996 if (keywordStart == keyValueTail) {
997 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
374ca955
A
998 return 0;
999 }
f3c0d7a5
A
1000 keyValueLen = 0;
1001 while (keywordStart < keyValueTail) {
1002 if (!UPRV_ISALPHANUM(*keywordStart)) {
1003 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
1004 return 0;
1005 }
1006 if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
1007 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
1008 } else {
1009 /* keyword name too long for internal buffer */
1010 *status = U_INTERNAL_PROGRAM_ERROR;
1011 return 0;
1012 }
374ca955 1013 }
f3c0d7a5 1014 localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
374ca955
A
1015
1016 nextSeparator = uprv_strchr(nextEqualsign, ';');
f3c0d7a5
A
1017
1018 /* start processing the value part */
1019 nextEqualsign++; /* skip '=' */
1020 /* First strip leading & trailing spaces (TC decided to tolerate these) */
1021 while(*nextEqualsign == ' ') {
1022 nextEqualsign++;
1023 }
1024 keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1025 while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
1026 keyValueTail--;
1027 }
1028 if (nextEqualsign == keyValueTail) {
1029 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
1030 return 0;
1031 }
1032
374ca955
A
1033 rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1034 if(rc == 0) {
f3c0d7a5
A
1035 /* Current entry matches the input keyword. Update the entry */
1036 if(keywordValueLen > 0) { /* updating a value */
1037 updatedKeysAndValues.append(keyValuePrefix, *status);
1038 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1039 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1040 updatedKeysAndValues.append('=', *status);
1041 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1042 } /* else removing this entry, don't emit anything */
1043 handledInputKeyAndValue = TRUE;
1044 } else {
1045 /* input keyword sorts earlier than current entry, add before current entry */
1046 if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
1047 /* insert new entry at this location */
1048 updatedKeysAndValues.append(keyValuePrefix, *status);
1049 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1050 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1051 updatedKeysAndValues.append('=', *status);
1052 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1053 handledInputKeyAndValue = TRUE;
374ca955 1054 }
f3c0d7a5
A
1055 /* copy the current entry */
1056 updatedKeysAndValues.append(keyValuePrefix, *status);
1057 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1058 updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1059 updatedKeysAndValues.append('=', *status);
3d1f044b 1060 updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
f3c0d7a5
A
1061 }
1062 if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
1063 /* append new entry at the end, it sorts later than existing entries */
1064 updatedKeysAndValues.append(keyValuePrefix, *status);
1065 /* skip keyValuePrefix update, no subsequent key-value pair */
1066 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1067 updatedKeysAndValues.append('=', *status);
1068 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1069 handledInputKeyAndValue = TRUE;
374ca955
A
1070 }
1071 keywordStart = nextSeparator;
1072 } /* end loop searching */
374ca955 1073
f3c0d7a5
A
1074 /* Any error from updatedKeysAndValues.append above would be internal and not due to
1075 * problems with the passed-in locale. So if we did encounter problems with the
1076 * passed-in locale above, those errors took precedence and overrode any error
1077 * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1078 * are errors here they are from updatedKeysAndValues.append; they do cause an
1079 * error return but the passed-in locale is unmodified and the original bufLen is
1080 * returned.
1081 */
1082 if (!handledInputKeyAndValue || U_FAILURE(*status)) {
1083 /* if input key/value specified removal of a keyword not present in locale, or
1084 * there was an error in CharString.append, leave original locale alone. */
1085 return bufLen;
1086 }
1087
1088 updatedKeysAndValuesLen = updatedKeysAndValues.length();
1089 /* needLen = length of the part before '@' + length of updated key-value part including '@' */
1090 needLen = (int32_t)(startSearchHere - buffer) + updatedKeysAndValuesLen;
374ca955
A
1091 if(needLen >= bufferCapacity) {
1092 *status = U_BUFFER_OVERFLOW_ERROR;
1093 return needLen; /* no change */
1094 }
f3c0d7a5
A
1095 if (updatedKeysAndValuesLen > 0) {
1096 uprv_strncpy(startSearchHere, updatedKeysAndValues.data(), updatedKeysAndValuesLen);
374ca955
A
1097 }
1098 buffer[needLen]=0;
1099 return needLen;
1100}
b75a7d8f 1101
374ca955 1102/* ### ID parsing implementation **************************************************/
b75a7d8f 1103
/* TRUE if 'a' is one of the letters that can start a special prefix: x, X, i, I.
 * The argument is parenthesized so expression arguments compare as a whole;
 * note that it is still evaluated more than once. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1114
374ca955
A
1115/**
1116 * Lookup 'key' in the array 'list'. The array 'list' should contain
1117 * a NULL entry, followed by more entries, and a second NULL entry.
1118 *
1119 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1120 * COUNTRIES_3.
1121 */
b75a7d8f
A
1122static int16_t _findIndex(const char* const* list, const char* key)
1123{
1124 const char* const* anchor = list;
374ca955
A
1125 int32_t pass = 0;
1126
1127 /* Make two passes through two NULL-terminated arrays at 'list' */
1128 while (pass++ < 2) {
1129 while (*list) {
1130 if (uprv_strcmp(key, *list) == 0) {
1131 return (int16_t)(list - anchor);
1132 }
1133 list++;
b75a7d8f 1134 }
374ca955 1135 ++list; /* skip final NULL *CWB*/
b75a7d8f
A
1136 }
1137 return -1;
1138}
1139
/*
 * Copies as much of 'src' as fits into 'dest' (at most destCapacity chars,
 * no NUL terminator is written) and returns the full length of 'src'.
 * Nothing is written when destCapacity is zero or negative.
 */
static inline int32_t
_copyCount(char *dest, int32_t destCapacity, const char *src) {
    int32_t length = 0;

    while (src[length] != 0) {
        if (length < destCapacity) {
            dest[length] = src[length];
        }
        ++length;
    }
    return length;
}
1159
f3c0d7a5 1160U_CFUNC const char*
73c04bcf
A
1161uloc_getCurrentCountryID(const char* oldID){
1162 int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1163 if (offset >= 0) {
1164 return REPLACEMENT_COUNTRIES[offset];
1165 }
1166 return oldID;
1167}
f3c0d7a5 1168U_CFUNC const char*
73c04bcf
A
1169uloc_getCurrentLanguageID(const char* oldID){
1170 int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1171 if (offset >= 0) {
1172 return REPLACEMENT_LANGUAGES[offset];
1173 }
f3c0d7a5 1174 return oldID;
73c04bcf 1175}
b75a7d8f
A
1176/*
1177 * the internal functions _getLanguage(), _getCountry(), _getVariant()
1178 * avoid duplicating code to handle the earlier locale ID pieces
1179 * in the functions for the later ones by
1180 * setting the *pEnd pointer to where they stopped parsing
1181 *
1182 * TODO try to use this in Locale
1183 */
729e4ab9
A
1184U_CFUNC int32_t
1185ulocimp_getLanguage(const char *localeID,
1186 char *language, int32_t languageCapacity,
1187 const char **pEnd) {
b75a7d8f
A
1188 int32_t i=0;
1189 int32_t offset;
1190 char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1191
1192 /* if it starts with i- or x- then copy that prefix */
1193 if(_isIDPrefix(localeID)) {
1194 if(i<languageCapacity) {
1195 language[i]=(char)uprv_tolower(*localeID);
1196 }
1197 if(i<languageCapacity) {
1198 language[i+1]='-';
1199 }
1200 i+=2;
1201 localeID+=2;
1202 }
f3c0d7a5 1203
b75a7d8f
A
1204 /* copy the language as far as possible and count its length */
1205 while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1206 if(i<languageCapacity) {
1207 language[i]=(char)uprv_tolower(*localeID);
1208 }
1209 if(i<3) {
4388f060 1210 U_ASSERT(i>=0);
b75a7d8f
A
1211 lang[i]=(char)uprv_tolower(*localeID);
1212 }
1213 i++;
1214 localeID++;
1215 }
1216
1217 if(i==3) {
1218 /* convert 3 character code to 2 character code if possible *CWB*/
374ca955 1219 offset=_findIndex(LANGUAGES_3, lang);
b75a7d8f 1220 if(offset>=0) {
374ca955 1221 i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
b75a7d8f
A
1222 }
1223 }
1224
1225 if(pEnd!=NULL) {
1226 *pEnd=localeID;
1227 }
1228 return i;
1229}
1230
729e4ab9
A
1231U_CFUNC int32_t
1232ulocimp_getScript(const char *localeID,
1233 char *script, int32_t scriptCapacity,
1234 const char **pEnd)
b75a7d8f 1235{
374ca955 1236 int32_t idLen = 0;
b75a7d8f 1237
374ca955
A
1238 if (pEnd != NULL) {
1239 *pEnd = localeID;
b75a7d8f 1240 }
374ca955
A
1241
1242 /* copy the second item as far as possible and count its length */
4388f060
A
1243 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1244 && uprv_isASCIILetter(localeID[idLen])) {
374ca955 1245 idLen++;
b75a7d8f
A
1246 }
1247
374ca955
A
1248 /* If it's exactly 4 characters long, then it's a script and not a country. */
1249 if (idLen == 4) {
1250 int32_t i;
1251 if (pEnd != NULL) {
1252 *pEnd = localeID+idLen;
1253 }
1254 if(idLen > scriptCapacity) {
1255 idLen = scriptCapacity;
1256 }
1257 if (idLen >= 1) {
1258 script[0]=(char)uprv_toupper(*(localeID++));
1259 }
1260 for (i = 1; i < idLen; i++) {
1261 script[i]=(char)uprv_tolower(*(localeID++));
1262 }
1263 }
1264 else {
1265 idLen = 0;
1266 }
1267 return idLen;
b75a7d8f
A
1268}
1269
729e4ab9
A
1270U_CFUNC int32_t
1271ulocimp_getCountry(const char *localeID,
1272 char *country, int32_t countryCapacity,
1273 const char **pEnd)
374ca955 1274{
729e4ab9 1275 int32_t idLen=0;
374ca955 1276 char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
b75a7d8f
A
1277 int32_t offset;
1278
1279 /* copy the country as far as possible and count its length */
729e4ab9
A
1280 while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1281 if(idLen<(ULOC_COUNTRY_CAPACITY-1)) { /*CWB*/
1282 cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
b75a7d8f 1283 }
729e4ab9 1284 idLen++;
b75a7d8f
A
1285 }
1286
729e4ab9
A
1287 /* the country should be either length 2 or 3 */
1288 if (idLen == 2 || idLen == 3) {
1289 UBool gotCountry = FALSE;
1290 /* convert 3 character code to 2 character code if possible *CWB*/
1291 if(idLen==3) {
1292 offset=_findIndex(COUNTRIES_3, cnty);
1293 if(offset>=0) {
1294 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1295 gotCountry = TRUE;
1296 }
1297 }
1298 if (!gotCountry) {
1299 int32_t i = 0;
1300 for (i = 0; i < idLen; i++) {
1301 if (i < countryCapacity) {
1302 country[i]=(char)uprv_toupper(localeID[i]);
1303 }
1304 }
b75a7d8f 1305 }
729e4ab9
A
1306 localeID+=idLen;
1307 } else {
1308 idLen = 0;
b75a7d8f
A
1309 }
1310
1311 if(pEnd!=NULL) {
1312 *pEnd=localeID;
1313 }
729e4ab9
A
1314
1315 return idLen;
b75a7d8f
A
1316}
1317
374ca955
A
1318/**
1319 * @param needSeparator if true, then add leading '_' if any variants
1320 * are added to 'variant'
1321 */
1322static int32_t
1323_getVariantEx(const char *localeID,
1324 char prev,
1325 char *variant, int32_t variantCapacity,
1326 UBool needSeparator) {
b75a7d8f
A
1327 int32_t i=0;
1328
1329 /* get one or more variant tags and separate them with '_' */
1330 if(_isIDSeparator(prev)) {
1331 /* get a variant string after a '-' or '_' */
1332 while(!_isTerminator(*localeID)) {
374ca955
A
1333 if (needSeparator) {
1334 if (i<variantCapacity) {
1335 variant[i] = '_';
1336 }
1337 ++i;
1338 needSeparator = FALSE;
1339 }
b75a7d8f
A
1340 if(i<variantCapacity) {
1341 variant[i]=(char)uprv_toupper(*localeID);
1342 if(variant[i]=='-') {
1343 variant[i]='_';
1344 }
1345 }
1346 i++;
1347 localeID++;
1348 }
1349 }
1350
1351 /* if there is no variant tag after a '-' or '_' then look for '@' */
1352 if(i==0) {
1353 if(prev=='@') {
1354 /* keep localeID */
374ca955 1355 } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
b75a7d8f
A
1356 ++localeID; /* point after the '@' */
1357 } else {
1358 return 0;
1359 }
1360 while(!_isTerminator(*localeID)) {
374ca955
A
1361 if (needSeparator) {
1362 if (i<variantCapacity) {
1363 variant[i] = '_';
1364 }
1365 ++i;
1366 needSeparator = FALSE;
1367 }
b75a7d8f
A
1368 if(i<variantCapacity) {
1369 variant[i]=(char)uprv_toupper(*localeID);
1370 if(variant[i]=='-' || variant[i]==',') {
1371 variant[i]='_';
1372 }
1373 }
1374 i++;
1375 localeID++;
1376 }
1377 }
f3c0d7a5 1378
b75a7d8f
A
1379 return i;
1380}
1381
374ca955
A
1382static int32_t
1383_getVariant(const char *localeID,
1384 char prev,
1385 char *variant, int32_t variantCapacity) {
1386 return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1387}
1388
374ca955
A
1389/* Keyword enumeration */
1390
/* State behind a keyword UEnumeration created in this file. */
typedef struct UKeywordsContext {
    char* keywords;  /* start of the keyword list: consecutive NUL-terminated
                        names, ended by an empty string */
    char* current;   /* next name to hand out; always points into 'keywords' */
} UKeywordsContext;
1395
f3c0d7a5
A
1396U_CDECL_BEGIN
1397
374ca955
A
1398static void U_CALLCONV
1399uloc_kw_closeKeywords(UEnumeration *enumerator) {
1400 uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1401 uprv_free(enumerator->context);
1402 uprv_free(enumerator);
1403}
1404
1405static int32_t U_CALLCONV
4388f060 1406uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
374ca955
A
1407 char *kw = ((UKeywordsContext *)en->context)->keywords;
1408 int32_t result = 0;
1409 while(*kw) {
1410 result++;
1411 kw += uprv_strlen(kw)+1;
1412 }
1413 return result;
1414}
1415
f3c0d7a5 1416static const char * U_CALLCONV
374ca955
A
1417uloc_kw_nextKeyword(UEnumeration* en,
1418 int32_t* resultLength,
4388f060 1419 UErrorCode* /*status*/) {
374ca955
A
1420 const char* result = ((UKeywordsContext *)en->context)->current;
1421 int32_t len = 0;
1422 if(*result) {
73c04bcf 1423 len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
374ca955
A
1424 ((UKeywordsContext *)en->context)->current += len+1;
1425 } else {
1426 result = NULL;
1427 }
1428 if (resultLength) {
1429 *resultLength = len;
1430 }
1431 return result;
1432}
1433
f3c0d7a5
A
1434static void U_CALLCONV
1435uloc_kw_resetKeywords(UEnumeration* en,
4388f060 1436 UErrorCode* /*status*/) {
374ca955
A
1437 ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1438}
1439
f3c0d7a5
A
1440U_CDECL_END
1441
1442
374ca955
A
/* Template copied into every keyword enumeration by uloc_openKeywordList(),
 * which then sets the 'context' member of the copy.
 * NOTE(review): the initializer is positional; the meaning of each slot
 * follows the UEnumeration declaration (uenumimp.h) - confirm there. */
static const UEnumeration gKeywordsEnum = {
    NULL,
    NULL,
    uloc_kw_closeKeywords,   /* frees list, context and enumeration */
    uloc_kw_countKeywords,   /* counts the names in the list */
    uenum_unextDefault,
    uloc_kw_nextKeyword,     /* returns the next name */
    uloc_kw_resetKeywords    /* rewinds to the first name */
};
1452
1453U_CAPI UEnumeration* U_EXPORT2
1454uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
b75a7d8f 1455{
340931cb
A
1456 LocalMemory<UKeywordsContext> myContext;
1457 LocalMemory<UEnumeration> result;
b75a7d8f 1458
340931cb
A
1459 if (U_FAILURE(*status)) {
1460 return nullptr;
46f4442e 1461 }
340931cb
A
1462 myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
1463 result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
1464 if (myContext.isNull() || result.isNull()) {
46f4442e 1465 *status = U_MEMORY_ALLOCATION_ERROR;
340931cb 1466 return nullptr;
46f4442e 1467 }
340931cb
A
1468 uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
1469 myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
1470 if (myContext->keywords == nullptr) {
46f4442e 1471 *status = U_MEMORY_ALLOCATION_ERROR;
340931cb 1472 return nullptr;
46f4442e 1473 }
46f4442e
A
1474 uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1475 myContext->keywords[keywordListSize] = 0;
1476 myContext->current = myContext->keywords;
340931cb
A
1477 result->context = myContext.orphan();
1478 return result.orphan();
374ca955
A
1479}
1480
1481U_CAPI UEnumeration* U_EXPORT2
1482uloc_openKeywords(const char* localeID,
f3c0d7a5 1483 UErrorCode* status)
374ca955
A
1484{
1485 int32_t i=0;
1486 char keywords[256];
1487 int32_t keywordsCapacity = 256;
729e4ab9
A
1488 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1489 const char* tmpLocaleID;
1490
374ca955 1491 if(status==NULL || U_FAILURE(*status)) {
b75a7d8f
A
1492 return 0;
1493 }
f3c0d7a5 1494
729e4ab9
A
1495 if (_hasBCP47Extension(localeID)) {
1496 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1497 } else {
1498 if (localeID==NULL) {
1499 localeID=uloc_getDefault();
1500 }
1501 tmpLocaleID=localeID;
b75a7d8f
A
1502 }
1503
374ca955 1504 /* Skip the language */
729e4ab9
A
1505 ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1506 if(_isIDSeparator(*tmpLocaleID)) {
374ca955
A
1507 const char *scriptID;
1508 /* Skip the script if available */
729e4ab9
A
1509 ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1510 if(scriptID != tmpLocaleID+1) {
374ca955 1511 /* Found optional script */
729e4ab9 1512 tmpLocaleID = scriptID;
374ca955
A
1513 }
1514 /* Skip the Country */
729e4ab9
A
1515 if (_isIDSeparator(*tmpLocaleID)) {
1516 ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1517 if(_isIDSeparator(*tmpLocaleID)) {
1518 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
374ca955 1519 }
b75a7d8f
A
1520 }
1521 }
1522
374ca955 1523 /* keywords are located after '@' */
729e4ab9
A
1524 if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1525 i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
374ca955
A
1526 }
1527
1528 if(i) {
1529 return uloc_openKeywordList(keywords, i, status);
1530 } else {
1531 return NULL;
b75a7d8f 1532 }
b75a7d8f
A
1533}
1534
b75a7d8f 1535
374ca955
A
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* TRUE if any bit of 'mask' is set in 'options'. Both arguments are
   parenthesized so that a combined mask such as (A | B) is applied as a whole. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* "i-default" as raw characters, deliberately without a NUL terminator;
   I_DEFAULT_LENGTH is its element count. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
73c04bcf 1544
374ca955
A
1545/**
1546 * Canonicalize the given localeID, to level 1 or to level 2,
1547 * depending on the options. To specify level 1, pass in options=0.
1548 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1549 *
1550 * This is the code underlying uloc_getName and uloc_canonicalize.
1551 */
1552static int32_t
1553_canonicalize(const char* localeID,
1554 char* result,
1555 int32_t resultCapacity,
1556 uint32_t options,
1557 UErrorCode* err) {
1558 int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1559 char localeBuffer[ULOC_FULLNAME_CAPACITY];
729e4ab9 1560 char tempBuffer[ULOC_FULLNAME_CAPACITY];
46f4442e 1561 const char* origLocaleID;
729e4ab9 1562 const char* tmpLocaleID;
374ca955
A
1563 const char* keywordAssign = NULL;
1564 const char* separatorIndicator = NULL;
374ca955
A
1565 char* name;
1566 char* variant = NULL; /* pointer into name, or NULL */
374ca955
A
1567
1568 if (U_FAILURE(*err)) {
b75a7d8f
A
1569 return 0;
1570 }
f3c0d7a5 1571
729e4ab9
A
1572 if (_hasBCP47Extension(localeID)) {
1573 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1574 } else {
1575 if (localeID==NULL) {
1576 localeID=uloc_getDefault();
1577 }
1578 tmpLocaleID=localeID;
b75a7d8f 1579 }
729e4ab9
A
1580
1581 origLocaleID=tmpLocaleID;
b75a7d8f 1582
374ca955
A
1583 /* if we are doing a full canonicalization, then put results in
1584 localeBuffer, if necessary; otherwise send them to result. */
729e4ab9 1585 if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
4388f060 1586 (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
374ca955 1587 name = localeBuffer;
4388f060 1588 nameCapacity = (int32_t)sizeof(localeBuffer);
374ca955
A
1589 } else {
1590 name = result;
1591 nameCapacity = resultCapacity;
1592 }
1593
b75a7d8f 1594 /* get all pieces, one after another, and separate with '_' */
729e4ab9 1595 len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
73c04bcf
A
1596
1597 if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1598 const char *d = uloc_getDefault();
f3c0d7a5 1599
729e4ab9 1600 len = (int32_t)uprv_strlen(d);
73c04bcf
A
1601
1602 if (name != NULL) {
3d1f044b 1603 uprv_memcpy(name, d, len);
73c04bcf 1604 }
729e4ab9 1605 } else if(_isIDSeparator(*tmpLocaleID)) {
374ca955
A
1606 const char *scriptID;
1607
b75a7d8f 1608 ++fieldCount;
374ca955
A
1609 if(len<nameCapacity) {
1610 name[len]='_';
b75a7d8f 1611 }
374ca955
A
1612 ++len;
1613
4388f060
A
1614 scriptSize=ulocimp_getScript(tmpLocaleID+1,
1615 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
374ca955
A
1616 if(scriptSize > 0) {
1617 /* Found optional script */
729e4ab9 1618 tmpLocaleID = scriptID;
b75a7d8f 1619 ++fieldCount;
374ca955 1620 len+=scriptSize;
729e4ab9 1621 if (_isIDSeparator(*tmpLocaleID)) {
374ca955
A
1622 /* If there is something else, then we add the _ */
1623 if(len<nameCapacity) {
1624 name[len]='_';
1625 }
1626 ++len;
1627 }
1628 }
1629
729e4ab9
A
1630 if (_isIDSeparator(*tmpLocaleID)) {
1631 const char *cntryID;
4388f060
A
1632 int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
1633 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
729e4ab9
A
1634 if (cntrySize > 0) {
1635 /* Found optional country */
1636 tmpLocaleID = cntryID;
1637 len+=cntrySize;
1638 }
1639 if(_isIDSeparator(*tmpLocaleID)) {
51004dcb
A
1640 /* If there is something else, then we add the _ if we found country before. */
1641 if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
729e4ab9
A
1642 ++fieldCount;
1643 if(len<nameCapacity) {
1644 name[len]='_';
1645 }
1646 ++len;
374ca955 1647 }
729e4ab9 1648
4388f060
A
1649 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
1650 (len<nameCapacity ? name+len : NULL), nameCapacity-len);
374ca955 1651 if (variantSize > 0) {
4388f060 1652 variant = len<nameCapacity ? name+len : NULL;
374ca955 1653 len += variantSize;
729e4ab9 1654 tmpLocaleID += variantSize + 1; /* skip '_' and variant */
374ca955 1655 }
b75a7d8f 1656 }
b75a7d8f
A
1657 }
1658 }
1659
374ca955 1660 /* Copy POSIX-style charset specifier, if any [mr.utf8] */
729e4ab9 1661 if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
374ca955 1662 UBool done = FALSE;
b75a7d8f 1663 do {
729e4ab9 1664 char c = *tmpLocaleID;
374ca955
A
1665 switch (c) {
1666 case 0:
1667 case '@':
1668 done = TRUE;
1669 break;
1670 default:
1671 if (len<nameCapacity) {
1672 name[len] = c;
1673 }
1674 ++len;
729e4ab9 1675 ++tmpLocaleID;
374ca955
A
1676 break;
1677 }
1678 } while (!done);
1679 }
1680
1681 /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
729e4ab9
A
1682 After this, tmpLocaleID either points to '@' or is NULL */
1683 if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1684 keywordAssign = uprv_strchr(tmpLocaleID, '=');
1685 separatorIndicator = uprv_strchr(tmpLocaleID, ';');
374ca955
A
1686 }
1687
1688 /* Copy POSIX-style variant, if any [mr@FOO] */
1689 if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
729e4ab9 1690 tmpLocaleID != NULL && keywordAssign == NULL) {
374ca955 1691 for (;;) {
729e4ab9 1692 char c = *tmpLocaleID;
374ca955
A
1693 if (c == 0) {
1694 break;
1695 }
1696 if (len<nameCapacity) {
1697 name[len] = c;
1698 }
1699 ++len;
729e4ab9 1700 ++tmpLocaleID;
374ca955
A
1701 }
1702 }
1703
1704 if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1705 /* Handle @FOO variant if @ is present and not followed by = */
729e4ab9 1706 if (tmpLocaleID!=NULL && keywordAssign==NULL) {
374ca955
A
1707 int32_t posixVariantSize;
1708 /* Add missing '_' if needed */
1709 if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1710 do {
1711 if(len<nameCapacity) {
1712 name[len]='_';
1713 }
1714 ++len;
1715 ++fieldCount;
1716 } while(fieldCount<2);
1717 }
729e4ab9 1718 posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
374ca955
A
1719 (UBool)(variantSize > 0));
1720 if (posixVariantSize > 0) {
1721 if (variant == NULL) {
1722 variant = name+len;
1723 }
1724 len += posixVariantSize;
1725 variantSize += posixVariantSize;
b75a7d8f 1726 }
374ca955
A
1727 }
1728
374ca955 1729 /* Look up the ID in the canonicalization map */
2ca993e8 1730 for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
374ca955 1731 const char* id = CANONICALIZE_MAP[j].id;
73c04bcf 1732 int32_t n = (int32_t)uprv_strlen(id);
374ca955 1733 if (len == n && uprv_strncmp(name, id, n) == 0) {
729e4ab9 1734 if (n == 0 && tmpLocaleID != NULL) {
374ca955
A
1735 break; /* Don't remap "" if keywords present */
1736 }
1737 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
374ca955
A
1738 break;
1739 }
1740 }
374ca955
A
1741 }
1742
1743 if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
729e4ab9 1744 if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
374ca955
A
1745 (!separatorIndicator || separatorIndicator > keywordAssign)) {
1746 if(len<nameCapacity) {
1747 name[len]='@';
1748 }
1749 ++len;
b75a7d8f 1750 ++fieldCount;
4388f060 1751 len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
3d1f044b 1752 NULL, 0, NULL, TRUE, err);
374ca955
A
1753 }
1754 }
1755
46f4442e 1756 if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
374ca955
A
1757 uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1758 }
1759
1760 return u_terminateChars(result, resultCapacity, len, err);
1761}
1762
1763/* ### ID parsing API **************************************************/
1764
1765U_CAPI int32_t U_EXPORT2
1766uloc_getParent(const char* localeID,
1767 char* parent,
1768 int32_t parentCapacity,
1769 UErrorCode* err)
1770{
1771 const char *lastUnderscore;
1772 int32_t i;
f3c0d7a5 1773
374ca955
A
1774 if (U_FAILURE(*err))
1775 return 0;
f3c0d7a5 1776
374ca955
A
1777 if (localeID == NULL)
1778 localeID = uloc_getDefault();
1779
1780 lastUnderscore=uprv_strrchr(localeID, '_');
1781 if(lastUnderscore!=NULL) {
1782 i=(int32_t)(lastUnderscore-localeID);
1783 } else {
1784 i=0;
b75a7d8f 1785 }
374ca955 1786
73c04bcf 1787 if(i>0 && parent != localeID) {
374ca955
A
1788 uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1789 }
3d1f044b 1790
374ca955 1791 return u_terminateChars(parent, parentCapacity, i, err);
b75a7d8f 1792}
374ca955
A
1793
1794U_CAPI int32_t U_EXPORT2
1795uloc_getLanguage(const char* localeID,
1796 char* language,
1797 int32_t languageCapacity,
1798 UErrorCode* err)
1799{
1800 /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1801 int32_t i=0;
1802
1803 if (err==NULL || U_FAILURE(*err)) {
1804 return 0;
1805 }
f3c0d7a5 1806
374ca955
A
1807 if(localeID==NULL) {
1808 localeID=uloc_getDefault();
1809 }
1810
729e4ab9 1811 i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
374ca955
A
1812 return u_terminateChars(language, languageCapacity, i, err);
1813}
1814
1815U_CAPI int32_t U_EXPORT2
1816uloc_getScript(const char* localeID,
1817 char* script,
1818 int32_t scriptCapacity,
1819 UErrorCode* err)
1820{
1821 int32_t i=0;
1822
1823 if(err==NULL || U_FAILURE(*err)) {
1824 return 0;
1825 }
1826
1827 if(localeID==NULL) {
1828 localeID=uloc_getDefault();
1829 }
1830
1831 /* skip the language */
729e4ab9 1832 ulocimp_getLanguage(localeID, NULL, 0, &localeID);
374ca955 1833 if(_isIDSeparator(*localeID)) {
729e4ab9 1834 i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
374ca955
A
1835 }
1836 return u_terminateChars(script, scriptCapacity, i, err);
1837}
1838
1839U_CAPI int32_t U_EXPORT2
1840uloc_getCountry(const char* localeID,
1841 char* country,
1842 int32_t countryCapacity,
f3c0d7a5 1843 UErrorCode* err)
374ca955
A
1844{
1845 int32_t i=0;
1846
1847 if(err==NULL || U_FAILURE(*err)) {
1848 return 0;
1849 }
1850
1851 if(localeID==NULL) {
1852 localeID=uloc_getDefault();
1853 }
1854
1855 /* Skip the language */
729e4ab9 1856 ulocimp_getLanguage(localeID, NULL, 0, &localeID);
374ca955
A
1857 if(_isIDSeparator(*localeID)) {
1858 const char *scriptID;
1859 /* Skip the script if available */
729e4ab9 1860 ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
374ca955
A
1861 if(scriptID != localeID+1) {
1862 /* Found optional script */
1863 localeID = scriptID;
1864 }
1865 if(_isIDSeparator(*localeID)) {
729e4ab9 1866 i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
374ca955
A
1867 }
1868 }
1869 return u_terminateChars(country, countryCapacity, i, err);
1870}
1871
1872U_CAPI int32_t U_EXPORT2
1873uloc_getVariant(const char* localeID,
1874 char* variant,
1875 int32_t variantCapacity,
f3c0d7a5 1876 UErrorCode* err)
374ca955 1877{
729e4ab9
A
1878 char tempBuffer[ULOC_FULLNAME_CAPACITY];
1879 const char* tmpLocaleID;
374ca955 1880 int32_t i=0;
f3c0d7a5 1881
374ca955
A
1882 if(err==NULL || U_FAILURE(*err)) {
1883 return 0;
1884 }
f3c0d7a5 1885
729e4ab9
A
1886 if (_hasBCP47Extension(localeID)) {
1887 _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1888 } else {
1889 if (localeID==NULL) {
1890 localeID=uloc_getDefault();
1891 }
1892 tmpLocaleID=localeID;
374ca955 1893 }
f3c0d7a5 1894
374ca955 1895 /* Skip the language */
729e4ab9
A
1896 ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1897 if(_isIDSeparator(*tmpLocaleID)) {
374ca955
A
1898 const char *scriptID;
1899 /* Skip the script if available */
729e4ab9
A
1900 ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1901 if(scriptID != tmpLocaleID+1) {
374ca955 1902 /* Found optional script */
729e4ab9 1903 tmpLocaleID = scriptID;
374ca955
A
1904 }
1905 /* Skip the Country */
729e4ab9
A
1906 if (_isIDSeparator(*tmpLocaleID)) {
1907 const char *cntryID;
1908 ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
1909 if (cntryID != tmpLocaleID+1) {
1910 /* Found optional country */
1911 tmpLocaleID = cntryID;
1912 }
1913 if(_isIDSeparator(*tmpLocaleID)) {
1914 /* If there was no country ID, skip a possible extra IDSeparator */
1915 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
1916 tmpLocaleID++;
1917 }
1918 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
374ca955
A
1919 }
1920 }
1921 }
f3c0d7a5 1922
374ca955
A
1923 /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
1924 /* if we do not have a variant tag yet then try a POSIX variant after '@' */
1925/*
1926 if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
1927 i=_getVariant(localeID+1, '@', variant, variantCapacity);
1928 }
1929*/
1930 return u_terminateChars(variant, variantCapacity, i, err);
1931}
1932
1933U_CAPI int32_t U_EXPORT2
1934uloc_getName(const char* localeID,
1935 char* name,
1936 int32_t nameCapacity,
f3c0d7a5 1937 UErrorCode* err)
374ca955
A
1938{
1939 return _canonicalize(localeID, name, nameCapacity, 0, err);
1940}
1941
1942U_CAPI int32_t U_EXPORT2
1943uloc_getBaseName(const char* localeID,
1944 char* name,
1945 int32_t nameCapacity,
f3c0d7a5 1946 UErrorCode* err)
374ca955
A
1947{
1948 return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
1949}
1950
1951U_CAPI int32_t U_EXPORT2
1952uloc_canonicalize(const char* localeID,
1953 char* name,
1954 int32_t nameCapacity,
f3c0d7a5 1955 UErrorCode* err)
374ca955
A
1956{
1957 return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
1958}
f3c0d7a5 1959
b75a7d8f 1960U_CAPI const char* U_EXPORT2
f3c0d7a5 1961uloc_getISO3Language(const char* localeID)
b75a7d8f 1962{
374ca955
A
1963 int16_t offset;
1964 char lang[ULOC_LANG_CAPACITY];
1965 UErrorCode err = U_ZERO_ERROR;
f3c0d7a5 1966
374ca955
A
1967 if (localeID == NULL)
1968 {
1969 localeID = uloc_getDefault();
1970 }
1971 uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
1972 if (U_FAILURE(err))
1973 return "";
1974 offset = _findIndex(LANGUAGES, lang);
1975 if (offset < 0)
1976 return "";
1977 return LANGUAGES_3[offset];
b75a7d8f
A
1978}
1979
1980U_CAPI const char* U_EXPORT2
f3c0d7a5 1981uloc_getISO3Country(const char* localeID)
b75a7d8f
A
1982{
1983 int16_t offset;
374ca955 1984 char cntry[ULOC_LANG_CAPACITY];
b75a7d8f 1985 UErrorCode err = U_ZERO_ERROR;
f3c0d7a5 1986
b75a7d8f
A
1987 if (localeID == NULL)
1988 {
1989 localeID = uloc_getDefault();
1990 }
374ca955 1991 uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
b75a7d8f
A
1992 if (U_FAILURE(err))
1993 return "";
374ca955 1994 offset = _findIndex(COUNTRIES, cntry);
b75a7d8f
A
1995 if (offset < 0)
1996 return "";
f3c0d7a5 1997
374ca955 1998 return COUNTRIES_3[offset];
b75a7d8f
A
1999}
2000
2001U_CAPI uint32_t U_EXPORT2
f3c0d7a5 2002uloc_getLCID(const char* localeID)
b75a7d8f 2003{
374ca955
A
2004 UErrorCode status = U_ZERO_ERROR;
2005 char langID[ULOC_FULLNAME_CAPACITY];
f3c0d7a5
A
2006 uint32_t lcid = 0;
2007
2008 /* Check for incomplete id. */
2009 if (!localeID || uprv_strlen(localeID) < 2) {
2010 return 0;
2011 }
2012
3d1f044b
A
2013 // First, attempt Windows platform lookup if available, but fall
2014 // through to catch any special cases (ICU vs Windows name differences).
2015 lcid = uprv_convertToLCIDPlatform(localeID, &status);
2016 if (U_FAILURE(status)) {
2017 return 0;
2018 }
2019 if (lcid > 0) {
f3c0d7a5
A
2020 // Windows found an LCID, return that
2021 return lcid;
2022 }
374ca955
A
2023
2024 uloc_getLanguage(localeID, langID, sizeof(langID), &status);
3d1f044b 2025 if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
374ca955 2026 return 0;
b75a7d8f 2027 }
374ca955 2028
57a6839d
A
2029 if (uprv_strchr(localeID, '@')) {
2030 // uprv_convertToLCID does not support keywords other than collation.
2031 // Remove all keywords except collation.
2032 int32_t len;
2033 char collVal[ULOC_KEYWORDS_CAPACITY];
2034 char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2035
2036 len = uloc_getKeywordValue(localeID, "collation", collVal,
2ca993e8 2037 UPRV_LENGTHOF(collVal) - 1, &status);
57a6839d
A
2038
2039 if (U_SUCCESS(status) && len > 0) {
2040 collVal[len] = 0;
2041
2042 len = uloc_getBaseName(localeID, tmpLocaleID,
2ca993e8 2043 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
57a6839d 2044
2ca993e8 2045 if (U_SUCCESS(status) && len > 0) {
57a6839d
A
2046 tmpLocaleID[len] = 0;
2047
2048 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2ca993e8 2049 UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
57a6839d 2050
2ca993e8 2051 if (U_SUCCESS(status) && len > 0) {
57a6839d
A
2052 tmpLocaleID[len] = 0;
2053 return uprv_convertToLCID(langID, tmpLocaleID, &status);
2054 }
2055 }
2056 }
2057
2058 // fall through - all keywords are simply ignored
2059 status = U_ZERO_ERROR;
2060 }
2061
374ca955
A
2062 return uprv_convertToLCID(langID, localeID, &status);
2063}
2064
73c04bcf
A
2065U_CAPI int32_t U_EXPORT2
2066uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2067 UErrorCode *status)
2068{
57a6839d 2069 return uprv_convertToPosix(hostid, locale, localeCapacity, status);
73c04bcf
A
2070}
2071
374ca955
A
2072/* ### Default locale **************************************************/
2073
2074U_CAPI const char* U_EXPORT2
2075uloc_getDefault()
2076{
2077 return locale_get_default();
2078}
2079
2080U_CAPI void U_EXPORT2
2081uloc_setDefault(const char* newDefaultLocale,
f3c0d7a5 2082 UErrorCode* err)
374ca955
A
2083{
2084 if (U_FAILURE(*err))
2085 return;
2086 /* the error code isn't currently used for anything by this function*/
f3c0d7a5 2087
374ca955
A
2088 /* propagate change to C++ */
2089 locale_set_default(newDefaultLocale);
b75a7d8f
A
2090}
2091
729e4ab9 2092/**
51004dcb 2093 * Returns a list of all 2-letter language codes defined in ISO 639. This is a pointer
729e4ab9
A
2094 * to an array of pointers to arrays of char. All of these pointers are owned
2095 * by ICU-- do not delete them, and do not write through them. The array is
2096 * terminated with a null pointer.
2097 */
2098U_CAPI const char* const* U_EXPORT2
f3c0d7a5 2099uloc_getISOLanguages()
729e4ab9
A
2100{
2101 return LANGUAGES;
2102}
374ca955 2103
729e4ab9
A
2104/**
2105 * Returns a list of all 2-letter country codes defined in ISO 639. This is a
2106 * pointer to an array of pointers to arrays of char. All of these pointers are
2107 * owned by ICU-- do not delete them, and do not write through them. The array is
2108 * terminated with a null pointer.
b75a7d8f 2109 */
729e4ab9 2110U_CAPI const char* const* U_EXPORT2
f3c0d7a5 2111uloc_getISOCountries()
b75a7d8f 2112{
729e4ab9
A
2113 return COUNTRIES;
2114}
73c04bcf 2115
b75a7d8f 2116
729e4ab9
A
2117/* this function to be moved into cstring.c later */
2118static char gDecimal = 0;
b75a7d8f 2119
729e4ab9
A
2120static /* U_CAPI */
2121double
2122/* U_EXPORT2 */
2123_uloc_strtod(const char *start, char **end) {
2124 char *decimal;
2125 char *myEnd;
2126 char buf[30];
2127 double rv;
2128 if (!gDecimal) {
2129 char rep[5];
2130 /* For machines that decide to change the decimal on you,
2131 and try to be too smart with localization.
2132 This normally should be just a '.'. */
2133 sprintf(rep, "%+1.1f", 1.0);
2134 gDecimal = rep[2];
b75a7d8f 2135 }
b75a7d8f 2136
729e4ab9
A
2137 if(gDecimal == '.') {
2138 return uprv_strtod(start, end); /* fall through to OS */
b75a7d8f 2139 } else {
729e4ab9
A
2140 uprv_strncpy(buf, start, 29);
2141 buf[29]=0;
2142 decimal = uprv_strchr(buf, '.');
2143 if(decimal) {
2144 *decimal = gDecimal;
46f4442e 2145 } else {
729e4ab9 2146 return uprv_strtod(start, end); /* no decimal point */
46f4442e 2147 }
729e4ab9
A
2148 rv = uprv_strtod(buf, &myEnd);
2149 if(end) {
2150 *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
b75a7d8f 2151 }
729e4ab9 2152 return rv;
374ca955 2153 }
374ca955
A
2154}
2155
f3c0d7a5 2156typedef struct {
729e4ab9
A
2157 float q;
2158 int32_t dummy; /* to avoid uninitialized memory copy from qsort */
f3c0d7a5 2159 char locale[ULOC_FULLNAME_CAPACITY+1];
729e4ab9 2160} _acceptLangItem;
b75a7d8f 2161
729e4ab9 2162static int32_t U_CALLCONV
4388f060 2163uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
729e4ab9
A
2164{
2165 const _acceptLangItem *aa = (const _acceptLangItem*)a;
2166 const _acceptLangItem *bb = (const _acceptLangItem*)b;
b75a7d8f 2167
729e4ab9
A
2168 int32_t rc = 0;
2169 if(bb->q < aa->q) {
2170 rc = -1; /* A > B */
2171 } else if(bb->q > aa->q) {
2172 rc = 1; /* A < B */
2173 } else {
2174 rc = 0; /* A = B */
b75a7d8f
A
2175 }
2176
729e4ab9
A
2177 if(rc==0) {
2178 rc = uprv_stricmp(aa->locale, bb->locale);
b75a7d8f
A
2179 }
2180
729e4ab9 2181#if defined(ULOC_DEBUG)
f3c0d7a5
A
2182 /* fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2183 aa->locale, aa->q,
729e4ab9
A
2184 bb->locale, bb->q,
2185 rc);*/
2186#endif
374ca955 2187
729e4ab9 2188 return rc;
374ca955
A
2189}
2190
f3c0d7a5 2191/*
729e4ab9
A
2192mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2193*/
374ca955 2194
b75a7d8f 2195U_CAPI int32_t U_EXPORT2
729e4ab9
A
2196uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2197 const char *httpAcceptLanguage,
2198 UEnumeration* availableLocales,
2199 UErrorCode *status)
374ca955 2200{
f3c0d7a5 2201 MaybeStackArray<_acceptLangItem, 4> items; // Struct for collecting items.
729e4ab9
A
2202 char tmp[ULOC_FULLNAME_CAPACITY +1];
2203 int32_t n = 0;
2204 const char *itemEnd;
2205 const char *paramEnd;
2206 const char *s;
2207 const char *t;
2208 int32_t res;
2209 int32_t i;
2210 int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
b75a7d8f 2211
729e4ab9
A
2212 if(U_FAILURE(*status)) {
2213 return -1;
b75a7d8f
A
2214 }
2215
729e4ab9
A
2216 for(s=httpAcceptLanguage;s&&*s;) {
2217 while(isspace(*s)) /* eat space at the beginning */
2218 s++;
2219 itemEnd=uprv_strchr(s,',');
2220 paramEnd=uprv_strchr(s,';');
2221 if(!itemEnd) {
2222 itemEnd = httpAcceptLanguage+l; /* end of string */
b75a7d8f 2223 }
f3c0d7a5 2224 if(paramEnd && paramEnd<itemEnd) {
729e4ab9
A
2225 /* semicolon (;) is closer than end (,) */
2226 t = paramEnd+1;
2227 if(*t=='q') {
2228 t++;
2229 }
2230 while(isspace(*t)) {
2231 t++;
2232 }
2233 if(*t=='=') {
2234 t++;
2235 }
2236 while(isspace(*t)) {
2237 t++;
2238 }
f3c0d7a5 2239 items[n].q = (float)_uloc_strtod(t,NULL);
729e4ab9
A
2240 } else {
2241 /* no semicolon - it's 1.0 */
f3c0d7a5 2242 items[n].q = 1.0f;
729e4ab9 2243 paramEnd = itemEnd;
374ca955 2244 }
f3c0d7a5 2245 items[n].dummy=0;
374ca955
A
2246 /* eat spaces prior to semi */
2247 for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2248 ;
3d1f044b 2249 int32_t slen = static_cast<int32_t>(((t+1)-s));
f3c0d7a5
A
2250 if(slen > ULOC_FULLNAME_CAPACITY) {
2251 *status = U_BUFFER_OVERFLOW_ERROR;
2252 return -1; // too big
2253 }
2254 uprv_strncpy(items[n].locale, s, slen);
2255 items[n].locale[slen]=0; // terminate
2256 int32_t clen = uloc_canonicalize(items[n].locale, tmp, UPRV_LENGTHOF(tmp)-1, status);
2257 if(U_FAILURE(*status)) return -1;
2258 if((clen!=slen) || (uprv_strncmp(items[n].locale, tmp, slen))) {
2259 // canonicalization had an effect- copy back
2260 uprv_strncpy(items[n].locale, tmp, clen);
2261 items[n].locale[clen] = 0; // terminate
374ca955
A
2262 }
2263#if defined(ULOC_DEBUG)
2264 /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2265#endif
2266 n++;
2267 s = itemEnd;
2268 while(*s==',') { /* eat duplicate commas */
2269 s++;
2270 }
f3c0d7a5
A
2271 if(n>=items.getCapacity()) { // If we need more items
2272 if(NULL == items.resize(items.getCapacity()*2, items.getCapacity())) {
2273 *status = U_MEMORY_ALLOCATION_ERROR;
2274 return -1;
2275 }
374ca955 2276#if defined(ULOC_DEBUG)
f3c0d7a5 2277 fprintf(stderr,"malloced at size %d\n", items.getCapacity());
374ca955 2278#endif
374ca955
A
2279 }
2280 }
f3c0d7a5
A
2281 uprv_sortArray(items.getAlias(), n, sizeof(items[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2282 if (U_FAILURE(*status)) {
46f4442e 2283 return -1;
374ca955 2284 }
f3c0d7a5
A
2285 LocalMemory<const char*> strs(NULL);
2286 if (strs.allocateInsteadAndReset(n) == NULL) {
46f4442e
A
2287 *status = U_MEMORY_ALLOCATION_ERROR;
2288 return -1;
2289 }
374ca955
A
2290 for(i=0;i<n;i++) {
2291#if defined(ULOC_DEBUG)
2292 /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2293#endif
f3c0d7a5 2294 strs[i]=items[i].locale;
374ca955 2295 }
f3c0d7a5
A
2296 res = uloc_acceptLanguage(result, resultAvailable, outResult,
2297 strs.getAlias(), n, availableLocales, status);
374ca955
A
2298 return res;
2299}
2300
2301
2302U_CAPI int32_t U_EXPORT2
f3c0d7a5 2303uloc_acceptLanguage(char *result, int32_t resultAvailable,
374ca955
A
2304 UAcceptResult *outResult, const char **acceptList,
2305 int32_t acceptListCount,
2306 UEnumeration* availableLocales,
2307 UErrorCode *status)
2308{
2309 int32_t i,j;
2310 int32_t len;
2311 int32_t maxLen=0;
2312 char tmp[ULOC_FULLNAME_CAPACITY+1];
2313 const char *l;
2314 char **fallbackList;
2315 if(U_FAILURE(*status)) {
2316 return -1;
2317 }
51004dcb 2318 fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
374ca955 2319 if(fallbackList==NULL) {
46f4442e
A
2320 *status = U_MEMORY_ALLOCATION_ERROR;
2321 return -1;
374ca955
A
2322 }
2323 for(i=0;i<acceptListCount;i++) {
2324#if defined(ULOC_DEBUG)
2325 fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2326#endif
0f5d89e8 2327 while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
374ca955
A
2328#if defined(ULOC_DEBUG)
2329 fprintf(stderr," %s\n", l);
2330#endif
73c04bcf 2331 len = (int32_t)uprv_strlen(l);
374ca955 2332 if(!uprv_strcmp(acceptList[i], l)) {
f3c0d7a5 2333 if(outResult) {
374ca955
A
2334 *outResult = ULOC_ACCEPT_VALID;
2335 }
2336#if defined(ULOC_DEBUG)
2337 fprintf(stderr, "MATCH! %s\n", l);
2338#endif
2339 if(len>0) {
2340 uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2341 }
2342 for(j=0;j<i;j++) {
2343 uprv_free(fallbackList[j]);
2344 }
2345 uprv_free(fallbackList);
f3c0d7a5 2346 return u_terminateChars(result, resultAvailable, len, status);
374ca955
A
2347 }
2348 if(len>maxLen) {
2349 maxLen = len;
2350 }
2351 }
f3c0d7a5 2352 uenum_reset(availableLocales, status);
374ca955 2353 /* save off parent info */
2ca993e8 2354 if(uloc_getParent(acceptList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
374ca955
A
2355 fallbackList[i] = uprv_strdup(tmp);
2356 } else {
2357 fallbackList[i]=0;
2358 }
2359 }
2360
2361 for(maxLen--;maxLen>0;maxLen--) {
2362 for(i=0;i<acceptListCount;i++) {
2363 if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2364#if defined(ULOC_DEBUG)
2365 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2366#endif
0f5d89e8 2367 while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
374ca955
A
2368#if defined(ULOC_DEBUG)
2369 fprintf(stderr," %s\n", l);
2370#endif
73c04bcf 2371 len = (int32_t)uprv_strlen(l);
374ca955 2372 if(!uprv_strcmp(fallbackList[i], l)) {
f3c0d7a5 2373 if(outResult) {
374ca955
A
2374 *outResult = ULOC_ACCEPT_FALLBACK;
2375 }
2376#if defined(ULOC_DEBUG)
2377 fprintf(stderr, "fallback MATCH! %s\n", l);
2378#endif
2379 if(len>0) {
2380 uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2381 }
73c04bcf
A
2382 for(j=0;j<acceptListCount;j++) {
2383 uprv_free(fallbackList[j]);
374ca955
A
2384 }
2385 uprv_free(fallbackList);
73c04bcf 2386 return u_terminateChars(result, resultAvailable, len, status);
374ca955
A
2387 }
2388 }
f3c0d7a5 2389 uenum_reset(availableLocales, status);
374ca955 2390
2ca993e8 2391 if(uloc_getParent(fallbackList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
374ca955
A
2392 uprv_free(fallbackList[i]);
2393 fallbackList[i] = uprv_strdup(tmp);
2394 } else {
2395 uprv_free(fallbackList[i]);
2396 fallbackList[i]=0;
2397 }
2398 }
2399 }
f3c0d7a5 2400 if(outResult) {
374ca955
A
2401 *outResult = ULOC_ACCEPT_FAILED;
2402 }
2403 }
2404 for(i=0;i<acceptListCount;i++) {
2405 uprv_free(fallbackList[i]);
2406 }
2407 uprv_free(fallbackList);
2408 return -1;
b75a7d8f 2409}
374ca955 2410
b331163b
A
2411U_CAPI const char* U_EXPORT2
2412uloc_toUnicodeLocaleKey(const char* keyword)
2413{
2414 const char* bcpKey = ulocimp_toBcpKey(keyword);
2415 if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2416 // unknown keyword, but syntax is fine..
2417 return keyword;
2418 }
2419 return bcpKey;
2420}
2421
2422U_CAPI const char* U_EXPORT2
2423uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2424{
2425 const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2426 if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2427 // unknown keyword, but syntax is fine..
2428 return value;
2429 }
2430 return bcpType;
2431}
2432
b331163b
A
2433static UBool
2434isWellFormedLegacyKey(const char* legacyKey)
2435{
2436 const char* p = legacyKey;
2437 while (*p) {
2438 if (!UPRV_ISALPHANUM(*p)) {
2439 return FALSE;
2440 }
2441 p++;
2442 }
2443 return TRUE;
2444}
2445
2446static UBool
2447isWellFormedLegacyType(const char* legacyType)
2448{
2449 const char* p = legacyType;
2450 int32_t alphaNumLen = 0;
2451 while (*p) {
2452 if (*p == '_' || *p == '/' || *p == '-') {
2453 if (alphaNumLen == 0) {
2454 return FALSE;
2455 }
2456 alphaNumLen = 0;
2457 } else if (UPRV_ISALPHANUM(*p)) {
2458 alphaNumLen++;
2459 } else {
2460 return FALSE;
2461 }
2462 p++;
2463 }
2464 return (alphaNumLen != 0);
2465}
2466
2467U_CAPI const char* U_EXPORT2
2468uloc_toLegacyKey(const char* keyword)
2469{
2470 const char* legacyKey = ulocimp_toLegacyKey(keyword);
2471 if (legacyKey == NULL) {
2472 // Checks if the specified locale key is well-formed with the legacy locale syntax.
2473 //
2474 // Note:
f3c0d7a5
A
2475 // LDML/CLDR provides some definition of keyword syntax in
2476 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2477 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2478 // Keys can only consist of [0-9a-zA-Z].
b331163b
A
2479 if (isWellFormedLegacyKey(keyword)) {
2480 return keyword;
2481 }
2482 }
2483 return legacyKey;
2484}
2485
2486U_CAPI const char* U_EXPORT2
2487uloc_toLegacyType(const char* keyword, const char* value)
2488{
2489 const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2490 if (legacyType == NULL) {
2491 // Checks if the specified locale type is well-formed with the legacy locale syntax.
2492 //
2493 // Note:
f3c0d7a5
A
2494 // LDML/CLDR provides some definition of keyword syntax in
2495 // * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2496 // * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2497 // Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2498 // we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
b331163b
A
2499 if (isWellFormedLegacyType(value)) {
2500 return value;
2501 }
2502 }
2503 return legacyType;
2504}
2505
374ca955 2506/*eof*/