icuSources/common/uloc.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 1997-2013, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *
   7 * File ULOC.CPP
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   04/01/97    aliu        Creation.
  13 *   08/21/98    stephen     JDK 1.2 sync
  14 *   12/08/98    rtg         New Locale implementation and C API
  15 *   03/15/99    damiba      overhaul.
  16 *   04/06/99    stephen     changed setDefault() to realloc and copy
  17 *   06/14/99    stephen     Changed calls to ures_open for new params
  18 *   07/21/99    stephen     Modified setDefault() to propagate to C++
  19 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
  20 *                           brought canonicalization code into line with spec
  21 *****************************************************************************/
  22
  23 /*
  24    POSIX's locale format, from putil.c: [no spaces]
  25
  26      ll [ _CC ] [ . MM ] [ @ VV]
  27
  28      l = lang, C = ctry, M = charmap, V = variant
  29 */
  30
  31 #include "unicode/utypes.h"
  32 #include "unicode/ustring.h"
  33 #include "unicode/uloc.h"
  34
  35 #include "putilimp.h"
  36 #include "ustr_imp.h"
  37 #include "ulocimp.h"
  38 #include "umutex.h"
  39 #include "cstring.h"
  40 #include "cmemory.h"
  41 #include "ucln_cmn.h"
  42 #include "locmap.h"
  43 #include "uarrsort.h"
  44 #include "uenumimp.h"
  45 #include "uassert.h"
  46
  47 #include <stdio.h> /* for sprintf */
  48
  49 /* ### Declarations **************************************************/
  50
  51 /* Locale stuff from locid.cpp */
   52 U_CFUNC void locale_set_default(const char *id);
   53 U_CFUNC const char *locale_get_default(void);
   54 U_CFUNC int32_t
   55 locale_getKeywords(const char *localeID, /* same parameter list as the static _getKeywords() in this file, minus addKeyword/addValue */
   56             char prev, /* NOTE(review): presumably the character preceding the keyword list; _getKeywords() only acts when it is '@' - confirm against the definition */
   57             char *keywords, int32_t keywordCapacity,
   58             char *values, int32_t valuesCapacity, int32_t *valLen,
   59             UBool valuesToo,
   60             UErrorCode *status);
  61
  62 /* ### Data tables **************************************************/
  63
  64 /**
  65  * Table of language codes, both 2- and 3-letter, with preference
  66  * given to 2-letter codes where possible.  Includes 3-letter codes
  67  * that lack a 2-letter equivalent.
  68  *
  69  * This list must be in sorted order.  This list is returned directly
  70  * to the user by some API.
  71  *
  72  * This list must be kept in sync with LANGUAGES_3, with corresponding
  73  * entries matched.
  74  *
  75  * This table should be terminated with a NULL entry, followed by a
  76  * second list, and another NULL entry.  The first list is visible to
  77  * user code when this array is returned by API.  The second list
  78  * contains codes we support, but do not expose through user API.
  79  *
  80  * Notes
  81  *
  82  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
  83  * include the revisions up to 2001/7/27 *CWB*
  84  *
  85  * The 3 character codes are the terminology codes like RFC 3066.  This
  86  * is compatible with prior ICU codes
  87  *
   88  * "in", "iw", "ji", "jw" and "sh" have been withdrawn.  They are still
   89  * in the table, but now in the second list at the end, because their
   90  * 3-character codes are duplicates.  This avoids bad searches going
   91  * from 3- to 2-character codes.
  92  *
  93  * The range qaa-qtz is reserved for local use
  94  */
  95 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
  96 /* ISO639 table version is 20130531 */
   97 static const char * const LANGUAGES[] = { /* first (public, sorted) list; entries are index-aligned with LANGUAGES_3 */
   98     "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",
   99     "afa", "afh", "agq", "ain", "ak",  "akk", "ale", "alg",
  100     "alt", "am",  "an",  "ang", "anp", "apa", "ar",  "arc",
  101     "arn", "arp", "art", "arw", "as",  "asa", "ast", "ath",
  102     "aus", "av",  "awa", "ay",  "az",
  103     "ba",  "bad", "bai", "bal", "ban", "bas", "bat", "bax",
  104     "bbj", "be",  "bej", "bem", "ber", "bez", "bfd", "bg",
  105     "bh",  "bho", "bi",  "bik", "bin", "bkm", "bla", "bm",
  106     "bn",  "bnt", "bo",  "br",  "bra", "brx", "bs",  "bss",
  107     "btk", "bua", "bug", "bum", "byn", "byv",
  108     "ca",  "cad", "cai", "car", "cau", "cay", "cch", "ce",
  109     "ceb", "cel", "cgg", "ch",  "chb", "chg", "chk", "chm",
  110     "chn", "cho", "chp", "chr", "chy", "ckb", "cmc", "co",
  111     "cop", "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",
  112     "csb", "cu",  "cus", "cv",  "cy",
  113     "da",  "dak", "dar", "dav", "day", "de",  "del", "den",
  114     "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum",
  115     "dv",  "dyo", "dyu", "dz",  "dzg",
  116     "ebu", "ee",  "efi", "egy", "eka", "el",  "elx", "en",
  117     "enm", "eo",  "es",  "et",  "eu",  "ewo",
  118     "fa",  "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",
  119     "fo",  "fon", "fr",  "frm", "fro", "frr", "frs", "fur",
  120     "fy",
  121     "ga",  "gaa", "gay", "gba", "gd",  "gem", "gez", "gil",
  122     "gl",  "gmh", "gn",  "goh", "gon", "gor", "got", "grb",
  123     "grc", "gsw", "gu",  "guz", "gv",  "gwi",
  124     "ha",  "hai", "haw", "he",  "hi",  "hil", "him", "hit",
  125     "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",
  126     "hz",
  127     "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ijo",
  128     "ik",  "ilo", "inc", "ine", "inh", "io",  "ira", "iro",
  129     "is",  "it",  "iu",
  130     "ja",  "jbo", "jgo", "jmc", "jpr", "jrb", "jv",
  131     "ka",  "kaa", "kab", "kac", "kaj", "kam", "kar", "kaw",
  132     "kbd", "kbl", "kcg", "kde", "kea", "kfo", "kg",  "kha",
  133     "khi", "kho", "khq", "ki",  "kj",  "kk",  "kkj", "kl",
  134     "kln", "km",  "kmb", "kn",  "ko",  "kok", "kos", "kpe",
  135     "kr",  "krc", "krl", "kro", "kru", "ks",  "ksb", "ksf",
  136     "ksh", "ku",  "kum", "kut", "kv",  "kw",  "ky",
  137     "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lg",
  138     "li",  "lkt", "ln",  "lo",  "lol", "loz", "lt",  "lu",
  139     "lua", "lui", "lun", "luo", "lus", "luy", "lv",
  140     "mad", "maf", "mag", "mai", "mak", "man", "map", "mas",
  141     "mde", "mdf", "mdr", "men", "mer", "mfe", "mg",  "mga",
  142     "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
  143     "mkh", "ml",  "mn",  "mnc", "mni", "mno", "mo",  "moh",
  144     "mos", "mr",  "ms",  "mt",  "mua", "mul", "mun", "mus",
  145     "mwl", "mwr", "my",  "mye", "myn", "myv",
  146     "na",  "nah", "nai", "nap", "naq", "nb",  "nd",  "nds",
  147     "ne",  "new", "ng",  "nia", "nic", "niu", "nl",  "nmg",
  148     "nn",  "nnh", "no",  "nog", "non", "nqo", "nr",  "nso",
  149     "nub", "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo",
  150     "nzi",
  151     "oc",  "oj",  "om",  "or",  "os",  "osa", "ota", "oto",
  152     "pa",  "paa", "pag", "pal", "pam", "pap", "pau", "peo",
  153     "phi", "phn", "pi",  "pl",  "pon", "pra", "pro", "ps",
  154     "pt",
  155     "qu",
  156     "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rof",
  157     "rom", "ru",  "rup", "rw",  "rwk",
  158     "sa",  "sad", "sah", "sai", "sal", "sam", "saq", "sas",
  159     "sat", "sba", "sbp", "sc",  "scn", "sco", "sd",  "se",
  160     "see", "seh", "sel", "sem", "ses", "sg",  "sga", "sgn",
  161     "shi", "shn", "shu", "si",  "sid", "sio", "sit",
  162     "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",
  163     "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",
  164     "srn", "srr", "ss",  "ssa", "ssy", "st",  "su",  "suk",
  165     "sus", "sux", "sv",  "sw",  "swb", "swc", "syc", "syr",
  166     "ta",  "tai", "te",  "tem", "teo", "ter", "tet", "tg",
  167     "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tl",  "tlh",
  168     "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr",  "trv",
  169     "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",
  170     "twq", "ty",  "tyv", "tzm",
  171     "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
  172     "vai", "ve",  "vi",  "vo",  "vot", "vun",
  173     "wa",  "wae", "wak", "wal", "war", "was", "wen", "wo",
  174     "xal", "xh",  "xog",
  175     "yao", "yap", "yav", "ybb", "yi",  "yo",  "ypk", "yue",
  176     "za",  "zap", "zbl", "zen", "zgh", "zh",  "znd", "zu",
  177     "zun", "zxx", "zza",
  178 NULL, /* terminates the first list */
  179     "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
  180 NULL /* terminates the second list */
  181 };
 182
  183 static const char* const DEPRECATED_LANGUAGES[]={ /* parallel to REPLACEMENT_LANGUAGES; note "sh" from the obsolete list in LANGUAGES has no entry here */
  184     "in", "iw", "ji", "jw", NULL, NULL
  185 };
  186 static const char* const REPLACEMENT_LANGUAGES[]={ /* NOTE(review): presumably entry i replaces DEPRECATED_LANGUAGES[i]; the lookup code is not visible here */
  187     "id", "he", "yi", "jv", NULL, NULL
  188 };
 189
 190 /**
 191  * Table of 3-letter language codes.
 192  *
 193  * This is a lookup table used to convert 3-letter language codes to
 194  * their 2-letter equivalent, where possible.  It must be kept in sync
 195  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
 196  * same language as LANGUAGES_3[i].  The commented-out lines are
 197  * copied from LANGUAGES to make eyeballing this baby easier.
 198  *
 199  * Where a 3-letter language code has no 2-letter equivalent, the
 200  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 201  *
 202  * This table should be terminated with a NULL entry, followed by a
 203  * second list, and another NULL entry.  The two lists correspond to
 204  * the two lists in LANGUAGES.
 205  */
 206 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
 207 /* ISO639 table version is 20130531 */
  208 static const char * const LANGUAGES_3[] = { /* entry i is the 3-letter code for LANGUAGES[i] */
  209     "aar", "abk", "ace", "ach", "ada", "ady", "ave", "afr",
  210     "afa", "afh", "agq", "ain", "aka", "akk", "ale", "alg",
  211     "alt", "amh", "arg", "ang", "anp", "apa", "ara", "arc",
  212     "arn", "arp", "art", "arw", "asm", "asa", "ast", "ath",
  213     "aus", "ava", "awa", "aym", "aze",
  214     "bak", "bad", "bai", "bal", "ban", "bas", "bat", "bax",
  215     "bbj", "bel", "bej", "bem", "ber", "bez", "bfd", "bul",
  216     "bih", "bho", "bis", "bik", "bin", "bkm", "bla", "bam",
  217     "ben", "bnt", "bod", "bre", "bra", "brx", "bos", "bss",
  218     "btk", "bua", "bug", "bum", "byn", "byv",
  219     "cat", "cad", "cai", "car", "cau", "cay", "cch", "che",
  220     "ceb", "cel", "cgg", "cha", "chb", "chg", "chk", "chm",
  221     "chn", "cho", "chp", "chr", "chy", "ckb", "cmc", "cos",
  222     "cop", "cpe", "cpf", "cpp", "cre", "crh", "crp", "ces",
  223     "csb", "chu", "cus", "chv", "cym",
  224     "dan", "dak", "dar", "dav", "day", "deu", "del", "den",
  225     "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum",
  226     "div", "dyo", "dyu", "dzo", "dzg",
  227     "ebu", "ewe", "efi", "egy", "eka", "ell", "elx", "eng",
  228     "enm", "epo", "spa", "est", "eus", "ewo",
  229     "fas", "fan", "fat", "ful", "fin", "fil", "fiu", "fij",
  230     "fao", "fon", "fra", "frm", "fro", "frr", "frs", "fur",
  231     "fry",
  232     "gle", "gaa", "gay", "gba", "gla", "gem", "gez", "gil",
  233     "glg", "gmh", "grn", "goh", "gon", "gor", "got", "grb",
  234     "grc", "gsw", "guj", "guz", "glv", "gwi",
  235     "hau", "hai", "haw", "heb", "hin", "hil", "him", "hit",
  236     "hmn", "hmo", "hrv", "hsb", "hat", "hun", "hup", "hye",
  237     "her",
  238     "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ijo",
  239     "ipk", "ilo", "inc", "ine", "inh", "ido", "ira", "iro",
  240     "isl", "ita", "iku",
  241     "jpn", "jbo", "jgo", "jmc", "jpr", "jrb", "jav",
  242     "kat", "kaa", "kab", "kac", "kaj", "kam", "kar", "kaw",
  243     "kbd", "kbl", "kcg", "kde", "kea", "kfo", "kon", "kha",
  244     "khi", "kho", "khq", "kik", "kua", "kaz", "kkj", "kal",
  245     "kln", "khm", "kmb", "kan", "kor", "kok", "kos", "kpe",
  246     "kau", "krc", "krl", "kro", "kru", "kas", "ksb", "ksf",
  247     "ksh", "kur", "kum", "kut", "kom", "cor", "kir",
  248     "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lug",
  249     "lim", "lkt", "lin", "lao", "lol", "loz", "lit", "lub",
  250     "lua", "lui", "lun", "luo", "lus", "luy", "lav",
  251     "mad", "maf", "mag", "mai", "mak", "man", "map", "mas",
  252     "mde", "mdf", "mdr", "men", "mer", "mfe", "mlg", "mga",
  253     "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
  254     "mkh", "mal", "mon", "mnc", "mni", "mno", "mol", "moh",
  255     "mos", "mar", "msa", "mlt", "mua", "mul", "mun", "mus",
  256     "mwl", "mwr", "mya", "mye", "myn", "myv",
  257     "nau", "nah", "nai", "nap", "naq", "nob", "nde", "nds",
  258     "nep", "new", "ndo", "nia", "nic", "niu", "nld", "nmg",
  259     "nno", "nnh", "nor", "nog", "non", "nqo", "nbl", "nso",
  260     "nub", "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo",
  261     "nzi",
  262     "oci", "oji", "orm", "ori", "oss", "osa", "ota", "oto",
  263     "pan", "paa", "pag", "pal", "pam", "pap", "pau", "peo",
  264     "phi", "phn", "pli", "pol", "pon", "pra", "pro", "pus",
  265     "por",
  266     "que",
  267     "raj", "rap", "rar", "roh", "run", "ron", "roa", "rof",
  268     "rom", "rus", "rup", "kin", "rwk",
  269     "san", "sad", "sah", "sai", "sal", "sam", "saq", "sas",
  270     "sat", "sba", "sbp", "srd", "scn", "sco", "snd", "sme",
  271     "see", "seh", "sel", "sem", "ses", "sag", "sga", "sgn",
  272     "shi", "shn", "shu", "sin", "sid", "sio", "sit",
  273     "slk", "slv", "sla", "smo", "sma", "smi", "smj", "smn",
  274     "sms", "sna", "snk", "som", "sog", "son", "sqi", "srp",
  275     "srn", "srr", "ssw", "ssa", "ssy", "sot", "sun", "suk",
  276     "sus", "sux", "swe", "swa", "swb", "swc", "syc", "syr",
  277     "tam", "tai", "tel", "tem", "teo", "ter", "tet", "tgk",
  278     "tha", "tir", "tig", "tiv", "tuk", "tkl", "tgl", "tlh",
  279     "tli", "tmh", "tsn", "ton", "tog", "tpi", "tur", "trv",
  280     "tso", "tsi", "tat", "tum", "tup", "tut", "tvl", "twi",
  281     "twq", "tah", "tyv", "tzm",
  282     "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
  283     "vai", "ven", "vie", "vol", "vot", "vun",
  284     "wln", "wae", "wak", "wal", "war", "was", "wen", "wol",
  285     "xal", "xho", "xog",
  286     "yao", "yap", "yav", "ybb", "yid", "yor", "ypk", "yue",
  287     "zha", "zap", "zbl", "zen", "zgh", "zho", "znd", "zul",
  288     "zun", "zxx", "zza",
  289 NULL, /* terminates the first list */
  290 /*  "in",  "iw",  "ji",  "jw",  "sh",                          */
  291     "ind", "heb", "yid", "jaw", "srp",
  292 NULL /* terminates the second list */
  293 };
 294
 295 /**
 296  * Table of 2-letter country codes.
 297  *
 298  * This list must be in sorted order.  This list is returned directly
 299  * to the user by some API.
 300  *
 301  * This list must be kept in sync with COUNTRIES_3, with corresponding
 302  * entries matched.
 303  *
 304  * This table should be terminated with a NULL entry, followed by a
 305  * second list, and another NULL entry.  The first list is visible to
 306  * user code when this array is returned by API.  The second list
 307  * contains codes we support, but do not expose through user API.
 308  *
 309  * Notes:
 310  *
  311  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE), as per
  312  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html.  New
  313  * codes were added while keeping the old ones for compatibility.
  314  * Updated to include the 1999/12/03 revisions. *CWB*
 315  *
 316  * RO(ROM) is now RO(ROU) according to
 317  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
 318  */
  319 static const char * const COUNTRIES[] = { /* first (public, sorted) list; entries are index-aligned with COUNTRIES_3 */
  320     "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
  321     "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
  322     "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
  323     "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
  324     "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
  325     "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
  326     "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
  327     "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
  328     "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
  329     "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
  330     "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
  331     "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
  332     "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
  333     "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
  334     "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
  335     "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
  336     "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
  337     "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
  338     "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
  339     "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
  340     "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
  341     "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
  342     "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
  343     "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
  344     "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
  345     "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
  346     "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
  347     "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
  348     "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
  349     "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
  350 NULL, /* terminates the first list */
  351     "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
  352 NULL /* terminates the second list */
  353 };
 354
  355 static const char* const DEPRECATED_COUNTRIES[] = { /* parallel to REPLACEMENT_COUNTRIES: same length, same order */
  356     "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
  357 };
  358 static const char* const REPLACEMENT_COUNTRIES[] = { /* the commented-out row below repeats DEPRECATED_COUNTRIES so each replacement sits under its deprecated code */
  359 /*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
  360     "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
  361 };
 362
 363 /**
 364  * Table of 3-letter country codes.
 365  *
 366  * This is a lookup table used to convert 3-letter country codes to
 367  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
 368  * For all valid i, COUNTRIES[i] must refer to the same country as
 369  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
 370  * to make eyeballing this baby easier.
 371  *
 372  * This table should be terminated with a NULL entry, followed by a
 373  * second list, and another NULL entry.  The two lists correspond to
 374  * the two lists in COUNTRIES.
 375  */
  376 static const char * const COUNTRIES_3[] = { /* entry i is the 3-letter code for COUNTRIES[i] */
  377 /*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
  378     "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
  379 /*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
  380     "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
  381 /*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
  382     "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
  383 /*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
  384     "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
  385 /*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
  386     "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
  387 /*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
  388     "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
  389 /*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
  390     "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
  391 /*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
  392     "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
  393 /*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
  394     "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
  395 /*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
  396     "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
  397 /*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
  398     "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
  399 /*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
  400     "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
  401 /*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
  402     "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
  403 /*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
  404     "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
  405 /*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
  406     "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
  407 /*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
  408     "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
  409 /*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
  410     "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
  411 /*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
  412     "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
  413 /*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
  414     "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
  415 /*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
  416     "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
  417 /*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
  418     "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
  419 /*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
  420     "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
  421 /*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
  422     "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
  423 /*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
  424     "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
  425 /*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
  426     "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
  427 /*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
  428     "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
  429 /*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
  430     "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
  431 /*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
  432     "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
  433 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
  434     "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
  435 /*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
  436     "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
  437 NULL, /* terminates the first list */
  438 /*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
  439     "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
  440 NULL /* terminates the second list */
  441 };
 442
  443 typedef struct CanonicalizationMap { /* one row of CANONICALIZE_MAP: a special-case locale ID and what it maps to */
  444     const char *id;          /* input ID */
  445     const char *canonicalID; /* canonicalized output ID */
  446     const char *keyword;     /* keyword, or NULL if none */
  447     const char *value;       /* keyword value, or NULL if keyword==NULL */
  448 } CanonicalizationMap;
 449
 450 /**
 451  * A map to canonicalize locale IDs.  This handles a variety of
 452  * different semantic kinds of transformations.
 453  */
  454 static const CanonicalizationMap CANONICALIZE_MAP[] = { /* each row: { id, canonicalID, keyword, value } */
  455     { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
  456     { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
  457     { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
  458     { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
  459     { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
  460     { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
  461     { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
  462     { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
  463     { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
  464     { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
  465     { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
  466     { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
  467     { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
  468     { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
  469     { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
  470     { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
  471     { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
  472     { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
  473     { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
  474     { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
  475     { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
  476     { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
  477     { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
  478     { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
  479     { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
  480     { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
  481     { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
  482     { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
  483     { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
  484     { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
  485     { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
  486     { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
  487     { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
  488     { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
  489     { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
  490     { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
  491     { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
  492     { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
  493     { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
  494     { "zh_GAN",         "gan", NULL, NULL }, /* registered name */
  495     { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
  496     { "zh_HAKKA",       "hak", NULL, NULL }, /* registered name */
  497     { "zh_MIN_NAN",     "nan", NULL, NULL }, /* registered name */
  498     { "zh_WUU",         "wuu", NULL, NULL }, /* registered name */
  499     { "zh_XIANG",       "hsn", NULL, NULL }, /* registered name */
  500     { "zh_YUE",         "yue", NULL, NULL }, /* registered name */
  501 };
 502
  503 typedef struct VariantMap { /* one row of VARIANT_MAP: a variant name and the keyword=value pair it stands for */
  504     const char *variant;          /* input variant name */
  505     const char *keyword;     /* keyword, or NULL if none */
  506     const char *value;       /* keyword value, or NULL if keyword==NULL */
  507 } VariantMap;
 508
  509 static const VariantMap VARIANT_MAP[] = { /* each row: { variant, keyword, value } */
  510     { "EURO",   "currency", "EUR" },
  511     { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
  512     { "STROKE", "collation", "stroke" }  /* Solaris variant */
  513 };
 514
 515 /* ### BCP47 Conversion *******************************************/
 516 /* Test if the locale id has BCP47 u extension and does not have '@' */
 517 #define _hasBCP47Extension(id) (id && uprv_strstr(id, "@") == NULL && getShortestSubtagLength(localeID) == 1)
 518 /* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
 519 #define _ConvertBCP47(finalID, id, buffer, length,err) \
 520         if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 || U_FAILURE(*err)) { \
 521             finalID=id; \
 522         } else { \
 523             finalID=buffer; \
 524         }
 525 /* Gets the size of the shortest subtag in the given localeID. */
 526 static int32_t getShortestSubtagLength(const char *localeID) {
 527     int32_t localeIDLength = uprv_strlen(localeID);
 528     int32_t length = localeIDLength;
 529     int32_t tmpLength = 0;
 530     int32_t i;
 531     UBool reset = TRUE;
 532
 533     for (i = 0; i < localeIDLength; i++) {
 534         if (localeID[i] != '_' && localeID[i] != '-') {
 535             if (reset) {
 536                 tmpLength = 0;
 537                 reset = FALSE;
 538             }
 539             tmpLength++;
 540         } else {
 541             if (tmpLength != 0 && tmpLength < length) {
 542                 length = tmpLength;
 543             }
 544             reset = TRUE;
 545         }
 546     }
 547
 548     return length;
 549 }
 550
 551 /* ### Keywords **************************************************/
 552
  553 #define ULOC_KEYWORD_BUFFER_LEN 25 /* size of keyword-name buffers (KeywordStruct.keyword); names must be strictly shorter than this */
  554 #define ULOC_MAX_NO_KEYWORDS 25 /* capacity of the keywordList array in _getKeywords() */
 555
 556 U_CAPI const char * U_EXPORT2
 557 locale_getKeywordsStart(const char *localeID) {
 558     const char *result = NULL;
 559     if((result = uprv_strchr(localeID, '@')) != NULL) {
 560         return result;
 561     }
 562 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
 563     else {
 564         /* We do this because the @ sign is variant, and the @ sign used on one
 565         EBCDIC machine won't be compiled the same way on other EBCDIC based
 566         machines. */
 567         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
 568         const uint8_t *charToFind = ebcdicSigns;
 569         while(*charToFind) {
 570             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
 571                 return result;
 572             }
 573             charToFind++;
 574         }
 575     }
 576 #endif
 577     return NULL;
 578 }
 579
 580 /**
 581  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
 582  * @param keywordName incoming name to be canonicalized
 583  * @param status return status (keyword too long)
 584  * @return length of the keyword name
 585  */
 586 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
 587 {
 588   int32_t i;
 589   int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
 590
 591   if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
 592     /* keyword name too long for internal buffer */
 593     *status = U_INTERNAL_PROGRAM_ERROR;
 594           return 0;
 595   }
 596
 597   /* normalize the keyword name */
 598   for(i = 0; i < keywordNameLen; i++) {
 599     buf[i] = uprv_tolower(keywordName[i]);
 600   }
 601   buf[i] = 0;
 602
 603   return keywordNameLen;
 604 }
 605
  606 typedef struct { /* one parsed keyword entry; _getKeywords() keeps an array of these */
  607     char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* keyword name; _getKeywords() stores it lowercased with spaces removed; this is the sort key in compareKeywordStructs() */
  608     int32_t keywordLen; /* NOTE(review): presumably the length of keyword - the code that sets it is not visible here */
  609     const char *valueStart; /* NOTE(review): presumably points at the value text inside the original locale ID - confirm */
  610     int32_t valueLen; /* NOTE(review): presumably the length of the value at valueStart - confirm */
  611 } KeywordStruct;
 612
 613 static int32_t U_CALLCONV
 614 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
 615     const char* leftString = ((const KeywordStruct *)left)->keyword;
 616     const char* rightString = ((const KeywordStruct *)right)->keyword;
 617     return uprv_strcmp(leftString, rightString);
 618 }
 619
 620 /**
 621  * Both addKeyword and addValue must already be in canonical form.
 622  * Either both addKeyword and addValue are NULL, or neither is NULL.
 623  * If they are not NULL they must be zero terminated.
  624  * If addKeyword is not NULL, it must be short enough to fit in KeywordStruct.keyword.
 625  */
 626 static int32_t
 627 _getKeywords(const char *localeID,
 628              char prev,
 629              char *keywords, int32_t keywordCapacity,
 630              char *values, int32_t valuesCapacity, int32_t *valLen,
 631              UBool valuesToo,
 632              const char* addKeyword,
 633              const char* addValue,
 634              UErrorCode *status)
 635 {
 636     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
 637
 638     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
 639     int32_t numKeywords = 0;
 640     const char* pos = localeID;
 641     const char* equalSign = NULL;
 642     const char* semicolon = NULL;
 643     int32_t i = 0, j, n;
 644     int32_t keywordsLen = 0;
 645     int32_t valuesLen = 0;
 646
 647     if(prev == '@') { /* start of keyword definition */
 648         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
 649         do {
 650             UBool duplicate = FALSE;
 651             /* skip leading spaces */
 652             while(*pos == ' ') {
 653                 pos++;
 654             }
 655             if (!*pos) { /* handle trailing "; " */
 656                 break;
 657             }
 658             if(numKeywords == maxKeywords) {
 659                 *status = U_INTERNAL_PROGRAM_ERROR;
 660                 return 0;
 661             }
 662             equalSign = uprv_strchr(pos, '=');
 663             semicolon = uprv_strchr(pos, ';');
 664             /* lack of '=' [foo@currency] is illegal */
 665             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
 666             if(!equalSign || (semicolon && semicolon<equalSign)) {
 667                 *status = U_INVALID_FORMAT_ERROR;
 668                 return 0;
 669             }
 670             /* need to normalize both keyword and keyword name */
 671             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
 672                 /* keyword name too long for internal buffer */
 673                 *status = U_INTERNAL_PROGRAM_ERROR;
 674                 return 0;
 675             }
 676             for(i = 0, n = 0; i < equalSign - pos; ++i) {
 677                 if (pos[i] != ' ') {
 678                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
 679                 }
 680             }
 681
 682             /* zero-length keyword is an error. */
 683             if (n == 0) {
 684                 *status = U_INVALID_FORMAT_ERROR;
 685                 return 0;
 686             }
 687
 688             keywordList[numKeywords].keyword[n] = 0;
 689             keywordList[numKeywords].keywordLen = n;
 690             /* now grab the value part. First we skip the '=' */
 691             equalSign++;
 692             /* then we leading spaces */
 693             while(*equalSign == ' ') {
 694                 equalSign++;
 695             }
 696
 697             /* Premature end or zero-length value */
 698             if (!equalSign || equalSign == semicolon) {
 699                 *status = U_INVALID_FORMAT_ERROR;
 700                 return 0;
 701             }
 702
 703             keywordList[numKeywords].valueStart = equalSign;
 704
 705             pos = semicolon;
 706             i = 0;
 707             if(pos) {
 708                 while(*(pos - i - 1) == ' ') {
 709                     i++;
 710                 }
 711                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
 712                 pos++;
 713             } else {
 714                 i = (int32_t)uprv_strlen(equalSign);
 715                 while(i && equalSign[i-1] == ' ') {
 716                     i--;
 717                 }
 718                 keywordList[numKeywords].valueLen = i;
 719             }
 720             /* If this is a duplicate keyword, then ignore it */
 721             for (j=0; j<numKeywords; ++j) {
 722                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
 723                     duplicate = TRUE;
 724                     break;
 725                 }
 726             }
 727             if (!duplicate) {
 728                 ++numKeywords;
 729             }
 730         } while(pos);
 731
 732         /* Handle addKeyword/addValue. */
 733         if (addKeyword != NULL) {
 734             UBool duplicate = FALSE;
 735             U_ASSERT(addValue != NULL);
 736             /* Search for duplicate; if found, do nothing. Explicit keyword
 737                overrides addKeyword. */
 738             for (j=0; j<numKeywords; ++j) {
 739                 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
 740                     duplicate = TRUE;
 741                     break;
 742                 }
 743             }
 744             if (!duplicate) {
 745                 if (numKeywords == maxKeywords) {
 746                     *status = U_INTERNAL_PROGRAM_ERROR;
 747                     return 0;
 748                 }
 749                 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
 750                 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
 751                 keywordList[numKeywords].valueStart = addValue;
 752                 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
 753                 ++numKeywords;
 754             }
 755         } else {
 756             U_ASSERT(addValue == NULL);
 757         }
 758
 759         /* now we have a list of keywords */
 760         /* we need to sort it */
 761         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
 762
 763         /* Now construct the keyword part */
 764         for(i = 0; i < numKeywords; i++) {
 765             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
 766                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
 767                 if(valuesToo) {
 768                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
 769                 } else {
 770                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
 771                 }
 772             }
 773             keywordsLen += keywordList[i].keywordLen + 1;
 774             if(valuesToo) {
 775                 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
 776                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
 777                 }
 778                 keywordsLen += keywordList[i].valueLen;
 779
 780                 if(i < numKeywords - 1) {
 781                     if(keywordsLen < keywordCapacity) {
 782                         keywords[keywordsLen] = ';';
 783                     }
 784                     keywordsLen++;
 785                 }
 786             }
 787             if(values) {
 788                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
 789                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
 790                     values[valuesLen + keywordList[i].valueLen] = 0;
 791                 }
 792                 valuesLen += keywordList[i].valueLen + 1;
 793             }
 794         }
 795         if(values) {
 796             values[valuesLen] = 0;
 797             if(valLen) {
 798                 *valLen = valuesLen;
 799             }
 800         }
 801         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
 802     } else {
 803         return 0;
 804     }
 805 }
 806
/**
 * Forwards all arguments unchanged to _getKeywords(), passing NULL for both
 * addKeyword and addValue so that no extra keyword is injected.
 * Returns whatever _getKeywords() returns.
 */
U_CFUNC int32_t
locale_getKeywords(const char *localeID,
                   char prev,
                   char *keywords, int32_t keywordCapacity,
                   char *values, int32_t valuesCapacity, int32_t *valLen,
                   UBool valuesToo,
                   UErrorCode *status) {
    return _getKeywords(localeID, prev, keywords, keywordCapacity,
                        values, valuesCapacity, valLen, valuesToo,
                        NULL, NULL, status);
}
 818
 819 U_CAPI int32_t U_EXPORT2
 820 uloc_getKeywordValue(const char* localeID,
 821                      const char* keywordName,
 822                      char* buffer, int32_t bufferCapacity,
 823                      UErrorCode* status)
 824 {
 825     const char* startSearchHere = NULL;
 826     const char* nextSeparator = NULL;
 827     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 828     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 829     int32_t i = 0;
 830     int32_t result = 0;
 831
 832     if(status && U_SUCCESS(*status) && localeID) {
 833       char tempBuffer[ULOC_FULLNAME_CAPACITY];
 834       const char* tmpLocaleID;
 835
 836       if (_hasBCP47Extension(localeID)) {
 837           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
 838       } else {
 839           tmpLocaleID=localeID;
 840       }
 841
 842       startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
 843       if(startSearchHere == NULL) {
 844           /* no keywords, return at once */
 845           return 0;
 846       }
 847
 848       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 849       if(U_FAILURE(*status)) {
 850         return 0;
 851       }
 852
 853       /* find the first keyword */
 854       while(startSearchHere) {
 855           startSearchHere++;
 856           /* skip leading spaces (allowed?) */
 857           while(*startSearchHere == ' ') {
 858               startSearchHere++;
 859           }
 860           nextSeparator = uprv_strchr(startSearchHere, '=');
 861           /* need to normalize both keyword and keyword name */
 862           if(!nextSeparator) {
 863               break;
 864           }
 865           if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
 866               /* keyword name too long for internal buffer */
 867               *status = U_INTERNAL_PROGRAM_ERROR;
 868               return 0;
 869           }
 870           for(i = 0; i < nextSeparator - startSearchHere; i++) {
 871               localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
 872           }
 873           /* trim trailing spaces */
 874           while(startSearchHere[i-1] == ' ') {
 875               i--;
 876               U_ASSERT(i>=0);
 877           }
 878           localeKeywordNameBuffer[i] = 0;
 879
 880           startSearchHere = uprv_strchr(nextSeparator, ';');
 881
 882           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
 883               nextSeparator++;
 884               while(*nextSeparator == ' ') {
 885                   nextSeparator++;
 886               }
 887               /* we actually found the keyword. Copy the value */
 888               if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
 889                   while(*(startSearchHere-1) == ' ') {
 890                       startSearchHere--;
 891                   }
 892                   uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
 893                   result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
 894               } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
 895                   i = (int32_t)uprv_strlen(nextSeparator);
 896                   while(nextSeparator[i - 1] == ' ') {
 897                       i--;
 898                   }
 899                   uprv_strncpy(buffer, nextSeparator, i);
 900                   result = u_terminateChars(buffer, bufferCapacity, i, status);
 901               } else {
 902                   /* give a bigger buffer, please */
 903                   *status = U_BUFFER_OVERFLOW_ERROR;
 904                   if(startSearchHere) {
 905                       result = (int32_t)(startSearchHere - nextSeparator);
 906                   } else {
 907                       result = (int32_t)uprv_strlen(nextSeparator);
 908                   }
 909               }
 910               return result;
 911           }
 912       }
 913     }
 914     return 0;
 915 }
 916
 917 U_CAPI int32_t U_EXPORT2
 918 uloc_setKeywordValue(const char* keywordName,
 919                      const char* keywordValue,
 920                      char* buffer, int32_t bufferCapacity,
 921                      UErrorCode* status)
 922 {
 923     /* TODO: sorting. removal. */
 924     int32_t keywordNameLen;
 925     int32_t keywordValueLen;
 926     int32_t bufLen;
 927     int32_t needLen = 0;
 928     int32_t foundValueLen;
 929     int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
 930     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 931     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 932     int32_t i = 0;
 933     int32_t rc;
 934     char* nextSeparator = NULL;
 935     char* nextEqualsign = NULL;
 936     char* startSearchHere = NULL;
 937     char* keywordStart = NULL;
 938     char *insertHere = NULL;
 939     if(U_FAILURE(*status)) {
 940         return -1;
 941     }
 942     if(bufferCapacity>1) {
 943         bufLen = (int32_t)uprv_strlen(buffer);
 944     } else {
 945         *status = U_ILLEGAL_ARGUMENT_ERROR;
 946         return 0;
 947     }
 948     if(bufferCapacity<bufLen) {
 949         /* The capacity is less than the length?! Is this NULL terminated? */
 950         *status = U_ILLEGAL_ARGUMENT_ERROR;
 951         return 0;
 952     }
 953     if(keywordValue && !*keywordValue) {
 954         keywordValue = NULL;
 955     }
 956     if(keywordValue) {
 957         keywordValueLen = (int32_t)uprv_strlen(keywordValue);
 958     } else {
 959         keywordValueLen = 0;
 960     }
 961     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 962     if(U_FAILURE(*status)) {
 963         return 0;
 964     }
 965     startSearchHere = (char*)locale_getKeywordsStart(buffer);
 966     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
 967         if(!keywordValue) { /* no keywords = nothing to remove */
 968             return bufLen;
 969         }
 970
 971         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
 972         if(startSearchHere) { /* had a single @ */
 973             needLen--; /* already had the @ */
 974             /* startSearchHere points at the @ */
 975         } else {
 976             startSearchHere=buffer+bufLen;
 977         }
 978         if(needLen >= bufferCapacity) {
 979             *status = U_BUFFER_OVERFLOW_ERROR;
 980             return needLen; /* no change */
 981         }
 982         *startSearchHere = '@';
 983         startSearchHere++;
 984         uprv_strcpy(startSearchHere, keywordNameBuffer);
 985         startSearchHere += keywordNameLen;
 986         *startSearchHere = '=';
 987         startSearchHere++;
 988         uprv_strcpy(startSearchHere, keywordValue);
 989         startSearchHere+=keywordValueLen;
 990         return needLen;
 991     } /* end shortcut - no @ */
 992
 993     keywordStart = startSearchHere;
 994     /* search for keyword */
 995     while(keywordStart) {
 996         keywordStart++;
 997         /* skip leading spaces (allowed?) */
 998         while(*keywordStart == ' ') {
 999             keywordStart++;
1000         }
1001         nextEqualsign = uprv_strchr(keywordStart, '=');
1002         /* need to normalize both keyword and keyword name */
1003         if(!nextEqualsign) {
1004             break;
1005         }
1006         if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1007             /* keyword name too long for internal buffer */
1008             *status = U_INTERNAL_PROGRAM_ERROR;
1009             return 0;
1010         }
1011         for(i = 0; i < nextEqualsign - keywordStart; i++) {
1012             localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1013         }
1014         /* trim trailing spaces */
1015         while(keywordStart[i-1] == ' ') {
1016             i--;
1017         }
1018         U_ASSERT(i>=0 && i<ULOC_KEYWORD_BUFFER_LEN);
1019         localeKeywordNameBuffer[i] = 0;
1020
1021         nextSeparator = uprv_strchr(nextEqualsign, ';');
1022         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1023         if(rc == 0) {
1024             nextEqualsign++;
1025             while(*nextEqualsign == ' ') {
1026                 nextEqualsign++;
1027             }
1028             /* we actually found the keyword. Change the value */
1029             if (nextSeparator) {
1030                 keywordAtEnd = 0;
1031                 foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1032             } else {
1033                 keywordAtEnd = 1;
1034                 foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1035             }
1036             if(keywordValue) { /* adding a value - not removing */
1037               if(foundValueLen == keywordValueLen) {
1038                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1039                 return bufLen; /* no change in size */
1040               } else if(foundValueLen > keywordValueLen) {
1041                 int32_t delta = foundValueLen - keywordValueLen;
1042                 if(nextSeparator) { /* RH side */
1043                   uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1044                 }
1045                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1046                 bufLen -= delta;
1047                 buffer[bufLen]=0;
1048                 return bufLen;
1049               } else { /* FVL < KVL */
1050                 int32_t delta = keywordValueLen - foundValueLen;
1051                 if((bufLen+delta) >= bufferCapacity) {
1052                   *status = U_BUFFER_OVERFLOW_ERROR;
1053                   return bufLen+delta;
1054                 }
1055                 if(nextSeparator) { /* RH side */
1056                   uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1057                 }
1058                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1059                 bufLen += delta;
1060                 buffer[bufLen]=0;
1061                 return bufLen;
1062               }
1063             } else { /* removing a keyword */
1064               if(keywordAtEnd) {
1065                 /* zero out the ';' or '@' just before startSearchhere */
1066                 keywordStart[-1] = 0;
1067                 return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1068               } else {
1069                 uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1070                 keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1071                 return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1072               }
1073             }
1074         } else if(rc<0){ /* end match keyword */
1075           /* could insert at this location. */
1076           insertHere = keywordStart;
1077         }
1078         keywordStart = nextSeparator;
1079     } /* end loop searching */
1080
1081     if(!keywordValue) {
1082       return bufLen; /* removal of non-extant keyword - no change */
1083     }
1084
1085     /* we know there is at least one keyword. */
1086     needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1087     if(needLen >= bufferCapacity) {
1088         *status = U_BUFFER_OVERFLOW_ERROR;
1089         return needLen; /* no change */
1090     }
1091
1092     if(insertHere) {
1093       uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1094       keywordStart = insertHere;
1095     } else {
1096       keywordStart = buffer+bufLen;
1097       *keywordStart = ';';
1098       keywordStart++;
1099     }
1100     uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1101     keywordStart += keywordNameLen;
1102     *keywordStart = '=';
1103     keywordStart++;
1104     uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1105     keywordStart+=keywordValueLen;
1106     if(insertHere) {
1107       *keywordStart = ';';
1108       keywordStart++;
1109     }
1110     buffer[needLen]=0;
1111     return needLen;
1112 }
1113
1114 /* ### ID parsing implementation **************************************************/
1115
/* TRUE if a is one of the letters that can start a special prefix.
   The argument is evaluated more than once: do not pass expressions
   with side effects. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 * The argument is evaluated more than once: do not pass expressions
 * with side effects.
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1126
1127 static char* _strnchr(const char* str, int32_t len, char c) {
1128     U_ASSERT(str != 0 && len >= 0);
1129     while (len-- != 0) {
1130         char d = *str;
1131         if (d == c) {
1132             return (char*) str;
1133         } else if (d == 0) {
1134             break;
1135         }
1136         ++str;
1137     }
1138     return NULL;
1139 }
1140
1141 /**
1142  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1143  * a NULL entry, followed by more entries, and a second NULL entry.
1144  *
1145  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1146  * COUNTRIES_3.
1147  */
1148 static int16_t _findIndex(const char* const* list, const char* key)
1149 {
1150     const char* const* anchor = list;
1151     int32_t pass = 0;
1152
1153     /* Make two passes through two NULL-terminated arrays at 'list' */
1154     while (pass++ < 2) {
1155         while (*list) {
1156             if (uprv_strcmp(key, *list) == 0) {
1157                 return (int16_t)(list - anchor);
1158             }
1159             list++;
1160         }
1161         ++list;     /* skip final NULL *CWB*/
1162     }
1163     return -1;
1164 }
1165
1166 /* count the length of src while copying it to dest; return strlen(src) */
1167 static inline int32_t
1168 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1169     const char *anchor;
1170     char c;
1171
1172     anchor=src;
1173     for(;;) {
1174         if((c=*src)==0) {
1175             return (int32_t)(src-anchor);
1176         }
1177         if(destCapacity<=0) {
1178             return (int32_t)((src-anchor)+uprv_strlen(src));
1179         }
1180         ++src;
1181         *dest++=c;
1182         --destCapacity;
1183     }
1184 }
1185
1186 U_CFUNC const char*
1187 uloc_getCurrentCountryID(const char* oldID){
1188     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1189     if (offset >= 0) {
1190         return REPLACEMENT_COUNTRIES[offset];
1191     }
1192     return oldID;
1193 }
1194 U_CFUNC const char*
1195 uloc_getCurrentLanguageID(const char* oldID){
1196     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1197     if (offset >= 0) {
1198         return REPLACEMENT_LANGUAGES[offset];
1199     }
1200     return oldID;
1201 }
1202 /*
1203  * the internal functions _getLanguage(), _getCountry(), _getVariant()
1204  * avoid duplicating code to handle the earlier locale ID pieces
1205  * in the functions for the later ones by
1206  * setting the *pEnd pointer to where they stopped parsing
1207  *
1208  * TODO try to use this in Locale
1209  */
1210 U_CFUNC int32_t
1211 ulocimp_getLanguage(const char *localeID,
1212                     char *language, int32_t languageCapacity,
1213                     const char **pEnd) {
1214     int32_t i=0;
1215     int32_t offset;
1216     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1217
1218     /* if it starts with i- or x- then copy that prefix */
1219     if(_isIDPrefix(localeID)) {
1220         if(i<languageCapacity) {
1221             language[i]=(char)uprv_tolower(*localeID);
1222         }
1223         if(i<languageCapacity) {
1224             language[i+1]='-';
1225         }
1226         i+=2;
1227         localeID+=2;
1228     }
1229
1230     /* copy the language as far as possible and count its length */
1231     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1232         if(i<languageCapacity) {
1233             language[i]=(char)uprv_tolower(*localeID);
1234         }
1235         if(i<3) {
1236             U_ASSERT(i>=0);
1237             lang[i]=(char)uprv_tolower(*localeID);
1238         }
1239         i++;
1240         localeID++;
1241     }
1242
1243     if(i==3) {
1244         /* convert 3 character code to 2 character code if possible *CWB*/
1245         offset=_findIndex(LANGUAGES_3, lang);
1246         if(offset>=0) {
1247             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1248         }
1249     }
1250
1251     if(pEnd!=NULL) {
1252         *pEnd=localeID;
1253     }
1254     return i;
1255 }
1256
1257 U_CFUNC int32_t
1258 ulocimp_getScript(const char *localeID,
1259                   char *script, int32_t scriptCapacity,
1260                   const char **pEnd)
1261 {
1262     int32_t idLen = 0;
1263
1264     if (pEnd != NULL) {
1265         *pEnd = localeID;
1266     }
1267
1268     /* copy the second item as far as possible and count its length */
1269     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1270             && uprv_isASCIILetter(localeID[idLen])) {
1271         idLen++;
1272     }
1273
1274     /* If it's exactly 4 characters long, then it's a script and not a country. */
1275     if (idLen == 4) {
1276         int32_t i;
1277         if (pEnd != NULL) {
1278             *pEnd = localeID+idLen;
1279         }
1280         if(idLen > scriptCapacity) {
1281             idLen = scriptCapacity;
1282         }
1283         if (idLen >= 1) {
1284             script[0]=(char)uprv_toupper(*(localeID++));
1285         }
1286         for (i = 1; i < idLen; i++) {
1287             script[i]=(char)uprv_tolower(*(localeID++));
1288         }
1289     }
1290     else {
1291         idLen = 0;
1292     }
1293     return idLen;
1294 }
1295
1296 U_CFUNC int32_t
1297 ulocimp_getCountry(const char *localeID,
1298                    char *country, int32_t countryCapacity,
1299                    const char **pEnd)
1300 {
1301     int32_t idLen=0;
1302     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1303     int32_t offset;
1304
1305     /* copy the country as far as possible and count its length */
1306     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1307         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1308             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1309         }
1310         idLen++;
1311     }
1312
1313     /* the country should be either length 2 or 3 */
1314     if (idLen == 2 || idLen == 3) {
1315         UBool gotCountry = FALSE;
1316         /* convert 3 character code to 2 character code if possible *CWB*/
1317         if(idLen==3) {
1318             offset=_findIndex(COUNTRIES_3, cnty);
1319             if(offset>=0) {
1320                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1321                 gotCountry = TRUE;
1322             }
1323         }
1324         if (!gotCountry) {
1325             int32_t i = 0;
1326             for (i = 0; i < idLen; i++) {
1327                 if (i < countryCapacity) {
1328                     country[i]=(char)uprv_toupper(localeID[i]);
1329                 }
1330             }
1331         }
1332         localeID+=idLen;
1333     } else {
1334         idLen = 0;
1335     }
1336
1337     if(pEnd!=NULL) {
1338         *pEnd=localeID;
1339     }
1340
1341     return idLen;
1342 }
1343
1344 /**
1345  * @param needSeparator if true, then add leading '_' if any variants
1346  * are added to 'variant'
1347  */
1348 static int32_t
1349 _getVariantEx(const char *localeID,
1350               char prev,
1351               char *variant, int32_t variantCapacity,
1352               UBool needSeparator) {
1353     int32_t i=0;
1354
1355     /* get one or more variant tags and separate them with '_' */
1356     if(_isIDSeparator(prev)) {
1357         /* get a variant string after a '-' or '_' */
1358         while(!_isTerminator(*localeID)) {
1359             if (needSeparator) {
1360                 if (i<variantCapacity) {
1361                     variant[i] = '_';
1362                 }
1363                 ++i;
1364                 needSeparator = FALSE;
1365             }
1366             if(i<variantCapacity) {
1367                 variant[i]=(char)uprv_toupper(*localeID);
1368                 if(variant[i]=='-') {
1369                     variant[i]='_';
1370                 }
1371             }
1372             i++;
1373             localeID++;
1374         }
1375     }
1376
1377     /* if there is no variant tag after a '-' or '_' then look for '@' */
1378     if(i==0) {
1379         if(prev=='@') {
1380             /* keep localeID */
1381         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1382             ++localeID; /* point after the '@' */
1383         } else {
1384             return 0;
1385         }
1386         while(!_isTerminator(*localeID)) {
1387             if (needSeparator) {
1388                 if (i<variantCapacity) {
1389                     variant[i] = '_';
1390                 }
1391                 ++i;
1392                 needSeparator = FALSE;
1393             }
1394             if(i<variantCapacity) {
1395                 variant[i]=(char)uprv_toupper(*localeID);
1396                 if(variant[i]=='-' || variant[i]==',') {
1397                     variant[i]='_';
1398                 }
1399             }
1400             i++;
1401             localeID++;
1402         }
1403     }
1404
1405     return i;
1406 }
1407
/**
 * Convenience form of _getVariantEx() that never adds a leading '_':
 * forwards its arguments and passes FALSE for needSeparator.
 */
static int32_t
_getVariant(const char *localeID,
            char prev,
            char *variant, int32_t variantCapacity) {
    return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
}
1414
1415 /**
1416  * Delete ALL instances of a variant from the given list of one or
1417  * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1418  * @param variants the source string of one or more variants,
1419  * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1420  * terminated; if it is, trailing zero will NOT be maintained.
1421  * @param variantsLen length of variants
1422  * @param toDelete variant to delete, without separators, e.g.  "EURO"
1423  * or "PREEURO"; not zero terminated
1424  * @param toDeleteLen length of toDelete
1425  * @return number of characters deleted from variants
1426  */
1427 static int32_t
1428 _deleteVariant(char* variants, int32_t variantsLen,
1429                const char* toDelete, int32_t toDeleteLen)
1430 {
1431     int32_t delta = 0; /* number of chars deleted */
1432     for (;;) {
1433         UBool flag = FALSE;
1434         if (variantsLen < toDeleteLen) {
1435             return delta;
1436         }
1437         if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1438             (variantsLen == toDeleteLen ||
1439              (flag=(variants[toDeleteLen] == '_'))))
1440         {
1441             int32_t d = toDeleteLen + (flag?1:0);
1442             variantsLen -= d;
1443             delta += d;
1444             if (variantsLen > 0) {
1445                 uprv_memmove(variants, variants+d, variantsLen);
1446             }
1447         } else {
1448             char* p = _strnchr(variants, variantsLen, '_');
1449             if (p == NULL) {
1450                 return delta;
1451             }
1452             ++p;
1453             variantsLen -= (int32_t)(p - variants);
1454             variants = p;
1455         }
1456     }
1457 }
1458
1459 /* Keyword enumeration */
1460
/**
 * Iteration state behind a keyword UEnumeration.
 */
typedef struct UKeywordsContext {
    char* keywords; /* start of the list: NUL-terminated strings laid end to end, ended by an empty string; released with uprv_free() on close */
    char* current;  /* next string to hand out; advanced by the "next" callback, rewound to keywords by "reset" */
} UKeywordsContext;
1465
1466 static void U_CALLCONV
1467 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1468     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1469     uprv_free(enumerator->context);
1470     uprv_free(enumerator);
1471 }
1472
1473 static int32_t U_CALLCONV
1474 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1475     char *kw = ((UKeywordsContext *)en->context)->keywords;
1476     int32_t result = 0;
1477     while(*kw) {
1478         result++;
1479         kw += uprv_strlen(kw)+1;
1480     }
1481     return result;
1482 }
1483
1484 static const char* U_CALLCONV
1485 uloc_kw_nextKeyword(UEnumeration* en,
1486                     int32_t* resultLength,
1487                     UErrorCode* /*status*/) {
1488     const char* result = ((UKeywordsContext *)en->context)->current;
1489     int32_t len = 0;
1490     if(*result) {
1491         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1492         ((UKeywordsContext *)en->context)->current += len+1;
1493     } else {
1494         result = NULL;
1495     }
1496     if (resultLength) {
1497         *resultLength = len;
1498     }
1499     return result;
1500 }
1501
1502 static void U_CALLCONV
1503 uloc_kw_resetKeywords(UEnumeration* en,
1504                       UErrorCode* /*status*/) {
1505     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1506 }
1507
1508 static const UEnumeration gKeywordsEnum = {
1509     NULL,
1510     NULL,
1511     uloc_kw_closeKeywords,
1512     uloc_kw_countKeywords,
1513     uenum_unextDefault,
1514     uloc_kw_nextKeyword,
1515     uloc_kw_resetKeywords
1516 };
1517
1518 U_CAPI UEnumeration* U_EXPORT2
1519 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1520 {
1521     UKeywordsContext *myContext = NULL;
1522     UEnumeration *result = NULL;
1523
1524     if(U_FAILURE(*status)) {
1525         return NULL;
1526     }
1527     result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1528     /* Null pointer test */
1529     if (result == NULL) {
1530         *status = U_MEMORY_ALLOCATION_ERROR;
1531         return NULL;
1532     }
1533     uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1534     myContext = static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)));
1535     if (myContext == NULL) {
1536         *status = U_MEMORY_ALLOCATION_ERROR;
1537         uprv_free(result);
1538         return NULL;
1539     }
1540     myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1541     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1542     myContext->keywords[keywordListSize] = 0;
1543     myContext->current = myContext->keywords;
1544     result->context = myContext;
1545     return result;
1546 }
1547
1548 U_CAPI UEnumeration* U_EXPORT2
1549 uloc_openKeywords(const char* localeID,
1550                         UErrorCode* status)
1551 {
1552     int32_t i=0;
1553     char keywords[256];
1554     int32_t keywordsCapacity = 256;
1555     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1556     const char* tmpLocaleID;
1557
1558     if(status==NULL || U_FAILURE(*status)) {
1559         return 0;
1560     }
1561
1562     if (_hasBCP47Extension(localeID)) {
1563         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1564     } else {
1565         if (localeID==NULL) {
1566            localeID=uloc_getDefault();
1567         }
1568         tmpLocaleID=localeID;
1569     }
1570
1571     /* Skip the language */
1572     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1573     if(_isIDSeparator(*tmpLocaleID)) {
1574         const char *scriptID;
1575         /* Skip the script if available */
1576         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1577         if(scriptID != tmpLocaleID+1) {
1578             /* Found optional script */
1579             tmpLocaleID = scriptID;
1580         }
1581         /* Skip the Country */
1582         if (_isIDSeparator(*tmpLocaleID)) {
1583             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1584             if(_isIDSeparator(*tmpLocaleID)) {
1585                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1586             }
1587         }
1588     }
1589
1590     /* keywords are located after '@' */
1591     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1592         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1593     }
1594
1595     if(i) {
1596         return uloc_openKeywordList(keywords, i, status);
1597     } else {
1598         return NULL;
1599     }
1600 }
1601
1602
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* True when any bit of `mask` is set in `options`. Both arguments are
   parenthesized so that compound expressions such as `a | b` are tested
   as a whole rather than being split by operator precedence. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The special ID "i-default"; deliberately not NUL-terminated, so always
   pair it with I_DEFAULT_LENGTH. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
/* Cast to int32_t so comparisons against int32_t lengths stay signed. */
#define I_DEFAULT_LENGTH ((int32_t)(sizeof i_default / sizeof i_default[0]))
1611
1612 /**
1613  * Canonicalize the given localeID, to level 1 or to level 2,
1614  * depending on the options.  To specify level 1, pass in options=0.
1615  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1616  *
1617  * This is the code underlying uloc_getName and uloc_canonicalize.
1618  */
1619 static int32_t
1620 _canonicalize(const char* localeID,
1621               char* result,
1622               int32_t resultCapacity,
1623               uint32_t options,
1624               UErrorCode* err) {
1625     int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1626     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1627     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1628     const char* origLocaleID;
1629     const char* tmpLocaleID;
1630     const char* keywordAssign = NULL;
1631     const char* separatorIndicator = NULL;
1632     const char* addKeyword = NULL;
1633     const char* addValue = NULL;
1634     char* name;
1635     char* variant = NULL; /* pointer into name, or NULL */
1636
1637     if (U_FAILURE(*err)) {
1638         return 0;
1639     }
1640
1641     if (_hasBCP47Extension(localeID)) {
1642         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1643     } else {
1644         if (localeID==NULL) {
1645            localeID=uloc_getDefault();
1646         }
1647         tmpLocaleID=localeID;
1648     }
1649
1650     origLocaleID=tmpLocaleID;
1651
1652     /* if we are doing a full canonicalization, then put results in
1653        localeBuffer, if necessary; otherwise send them to result. */
1654     if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1655         (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
1656         name = localeBuffer;
1657         nameCapacity = (int32_t)sizeof(localeBuffer);
1658     } else {
1659         name = result;
1660         nameCapacity = resultCapacity;
1661     }
1662
1663     /* get all pieces, one after another, and separate with '_' */
1664     len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1665
1666     if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1667         const char *d = uloc_getDefault();
1668
1669         len = (int32_t)uprv_strlen(d);
1670
1671         if (name != NULL) {
1672             uprv_strncpy(name, d, len);
1673         }
1674     } else if(_isIDSeparator(*tmpLocaleID)) {
1675         const char *scriptID;
1676
1677         ++fieldCount;
1678         if(len<nameCapacity) {
1679             name[len]='_';
1680         }
1681         ++len;
1682
1683         scriptSize=ulocimp_getScript(tmpLocaleID+1,
1684             (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
1685         if(scriptSize > 0) {
1686             /* Found optional script */
1687             tmpLocaleID = scriptID;
1688             ++fieldCount;
1689             len+=scriptSize;
1690             if (_isIDSeparator(*tmpLocaleID)) {
1691                 /* If there is something else, then we add the _ */
1692                 if(len<nameCapacity) {
1693                     name[len]='_';
1694                 }
1695                 ++len;
1696             }
1697         }
1698
1699         if (_isIDSeparator(*tmpLocaleID)) {
1700             const char *cntryID;
1701             int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
1702                 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
1703             if (cntrySize > 0) {
1704                 /* Found optional country */
1705                 tmpLocaleID = cntryID;
1706                 len+=cntrySize;
1707             }
1708             if(_isIDSeparator(*tmpLocaleID)) {
1709                 /* If there is something else, then we add the _  if we found country before. */
1710                 if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
1711                     ++fieldCount;
1712                     if(len<nameCapacity) {
1713                         name[len]='_';
1714                     }
1715                     ++len;
1716                 }
1717
1718                 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
1719                     (len<nameCapacity ? name+len : NULL), nameCapacity-len);
1720                 if (variantSize > 0) {
1721                     variant = len<nameCapacity ? name+len : NULL;
1722                     len += variantSize;
1723                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1724                 }
1725             }
1726         }
1727     }
1728
1729     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1730     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1731         UBool done = FALSE;
1732         do {
1733             char c = *tmpLocaleID;
1734             switch (c) {
1735             case 0:
1736             case '@':
1737                 done = TRUE;
1738                 break;
1739             default:
1740                 if (len<nameCapacity) {
1741                     name[len] = c;
1742                 }
1743                 ++len;
1744                 ++tmpLocaleID;
1745                 break;
1746             }
1747         } while (!done);
1748     }
1749
1750     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1751        After this, tmpLocaleID either points to '@' or is NULL */
1752     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1753         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1754         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1755     }
1756
1757     /* Copy POSIX-style variant, if any [mr@FOO] */
1758     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1759         tmpLocaleID != NULL && keywordAssign == NULL) {
1760         for (;;) {
1761             char c = *tmpLocaleID;
1762             if (c == 0) {
1763                 break;
1764             }
1765             if (len<nameCapacity) {
1766                 name[len] = c;
1767             }
1768             ++len;
1769             ++tmpLocaleID;
1770         }
1771     }
1772
1773     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1774         /* Handle @FOO variant if @ is present and not followed by = */
1775         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1776             int32_t posixVariantSize;
1777             /* Add missing '_' if needed */
1778             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1779                 do {
1780                     if(len<nameCapacity) {
1781                         name[len]='_';
1782                     }
1783                     ++len;
1784                     ++fieldCount;
1785                 } while(fieldCount<2);
1786             }
1787             posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1788                                              (UBool)(variantSize > 0));
1789             if (posixVariantSize > 0) {
1790                 if (variant == NULL) {
1791                     variant = name+len;
1792                 }
1793                 len += posixVariantSize;
1794                 variantSize += posixVariantSize;
1795             }
1796         }
1797
1798         /* Handle generic variants first */
1799         if (variant) {
1800             for (j=0; j<(int32_t)(sizeof(VARIANT_MAP)/sizeof(VARIANT_MAP[0])); j++) {
1801                 const char* variantToCompare = VARIANT_MAP[j].variant;
1802                 int32_t n = (int32_t)uprv_strlen(variantToCompare);
1803                 int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1804                 len -= variantLen;
1805                 if (variantLen > 0) {
1806                     if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
1807                         --len;
1808                     }
1809                     addKeyword = VARIANT_MAP[j].keyword;
1810                     addValue = VARIANT_MAP[j].value;
1811                     break;
1812                 }
1813             }
1814             if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
1815                 --len;
1816             }
1817         }
1818
1819         /* Look up the ID in the canonicalization map */
1820         for (j=0; j<(int32_t)(sizeof(CANONICALIZE_MAP)/sizeof(CANONICALIZE_MAP[0])); j++) {
1821             const char* id = CANONICALIZE_MAP[j].id;
1822             int32_t n = (int32_t)uprv_strlen(id);
1823             if (len == n && uprv_strncmp(name, id, n) == 0) {
1824                 if (n == 0 && tmpLocaleID != NULL) {
1825                     break; /* Don't remap "" if keywords present */
1826                 }
1827                 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1828                 if (CANONICALIZE_MAP[j].keyword) {
1829                     addKeyword = CANONICALIZE_MAP[j].keyword;
1830                     addValue = CANONICALIZE_MAP[j].value;
1831                 }
1832                 break;
1833             }
1834         }
1835     }
1836
1837     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1838         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1839             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1840             if(len<nameCapacity) {
1841                 name[len]='@';
1842             }
1843             ++len;
1844             ++fieldCount;
1845             len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
1846                                 NULL, 0, NULL, TRUE, addKeyword, addValue, err);
1847         } else if (addKeyword != NULL) {
1848             U_ASSERT(addValue != NULL && len < nameCapacity);
1849             /* inelegant but works -- later make _getKeywords do this? */
1850             len += _copyCount(name+len, nameCapacity-len, "@");
1851             len += _copyCount(name+len, nameCapacity-len, addKeyword);
1852             len += _copyCount(name+len, nameCapacity-len, "=");
1853             len += _copyCount(name+len, nameCapacity-len, addValue);
1854         }
1855     }
1856
1857     if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1858         uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1859     }
1860
1861     return u_terminateChars(result, resultCapacity, len, err);
1862 }
1863
1864 /* ### ID parsing API **************************************************/
1865
1866 U_CAPI int32_t  U_EXPORT2
1867 uloc_getParent(const char*    localeID,
1868                char* parent,
1869                int32_t parentCapacity,
1870                UErrorCode* err)
1871 {
1872     const char *lastUnderscore;
1873     int32_t i;
1874
1875     if (U_FAILURE(*err))
1876         return 0;
1877
1878     if (localeID == NULL)
1879         localeID = uloc_getDefault();
1880
1881     lastUnderscore=uprv_strrchr(localeID, '_');
1882     if(lastUnderscore!=NULL) {
1883         i=(int32_t)(lastUnderscore-localeID);
1884     } else {
1885         i=0;
1886     }
1887
1888     if(i>0 && parent != localeID) {
1889         uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1890     }
1891     return u_terminateChars(parent, parentCapacity, i, err);
1892 }
1893
1894 U_CAPI int32_t U_EXPORT2
1895 uloc_getLanguage(const char*    localeID,
1896          char* language,
1897          int32_t languageCapacity,
1898          UErrorCode* err)
1899 {
1900     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1901     int32_t i=0;
1902
1903     if (err==NULL || U_FAILURE(*err)) {
1904         return 0;
1905     }
1906
1907     if(localeID==NULL) {
1908         localeID=uloc_getDefault();
1909     }
1910
1911     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1912     return u_terminateChars(language, languageCapacity, i, err);
1913 }
1914
1915 U_CAPI int32_t U_EXPORT2
1916 uloc_getScript(const char*    localeID,
1917          char* script,
1918          int32_t scriptCapacity,
1919          UErrorCode* err)
1920 {
1921     int32_t i=0;
1922
1923     if(err==NULL || U_FAILURE(*err)) {
1924         return 0;
1925     }
1926
1927     if(localeID==NULL) {
1928         localeID=uloc_getDefault();
1929     }
1930
1931     /* skip the language */
1932     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1933     if(_isIDSeparator(*localeID)) {
1934         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1935     }
1936     return u_terminateChars(script, scriptCapacity, i, err);
1937 }
1938
1939 U_CAPI int32_t  U_EXPORT2
1940 uloc_getCountry(const char* localeID,
1941             char* country,
1942             int32_t countryCapacity,
1943             UErrorCode* err)
1944 {
1945     int32_t i=0;
1946
1947     if(err==NULL || U_FAILURE(*err)) {
1948         return 0;
1949     }
1950
1951     if(localeID==NULL) {
1952         localeID=uloc_getDefault();
1953     }
1954
1955     /* Skip the language */
1956     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1957     if(_isIDSeparator(*localeID)) {
1958         const char *scriptID;
1959         /* Skip the script if available */
1960         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1961         if(scriptID != localeID+1) {
1962             /* Found optional script */
1963             localeID = scriptID;
1964         }
1965         if(_isIDSeparator(*localeID)) {
1966             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1967         }
1968     }
1969     return u_terminateChars(country, countryCapacity, i, err);
1970 }
1971
1972 U_CAPI int32_t  U_EXPORT2
1973 uloc_getVariant(const char* localeID,
1974                 char* variant,
1975                 int32_t variantCapacity,
1976                 UErrorCode* err)
1977 {
1978     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1979     const char* tmpLocaleID;
1980     int32_t i=0;
1981
1982     if(err==NULL || U_FAILURE(*err)) {
1983         return 0;
1984     }
1985
1986     if (_hasBCP47Extension(localeID)) {
1987         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1988     } else {
1989         if (localeID==NULL) {
1990            localeID=uloc_getDefault();
1991         }
1992         tmpLocaleID=localeID;
1993     }
1994
1995     /* Skip the language */
1996     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1997     if(_isIDSeparator(*tmpLocaleID)) {
1998         const char *scriptID;
1999         /* Skip the script if available */
2000         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2001         if(scriptID != tmpLocaleID+1) {
2002             /* Found optional script */
2003             tmpLocaleID = scriptID;
2004         }
2005         /* Skip the Country */
2006         if (_isIDSeparator(*tmpLocaleID)) {
2007             const char *cntryID;
2008             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2009             if (cntryID != tmpLocaleID+1) {
2010                 /* Found optional country */
2011                 tmpLocaleID = cntryID;
2012             }
2013             if(_isIDSeparator(*tmpLocaleID)) {
2014                 /* If there was no country ID, skip a possible extra IDSeparator */
2015                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2016                     tmpLocaleID++;
2017                 }
2018                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2019             }
2020         }
2021     }
2022
2023     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2024     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2025 /*
2026     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2027         i=_getVariant(localeID+1, '@', variant, variantCapacity);
2028     }
2029 */
2030     return u_terminateChars(variant, variantCapacity, i, err);
2031 }
2032
2033 U_CAPI int32_t  U_EXPORT2
2034 uloc_getName(const char* localeID,
2035              char* name,
2036              int32_t nameCapacity,
2037              UErrorCode* err)
2038 {
2039     return _canonicalize(localeID, name, nameCapacity, 0, err);
2040 }
2041
2042 U_CAPI int32_t  U_EXPORT2
2043 uloc_getBaseName(const char* localeID,
2044                  char* name,
2045                  int32_t nameCapacity,
2046                  UErrorCode* err)
2047 {
2048     return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2049 }
2050
2051 U_CAPI int32_t  U_EXPORT2
2052 uloc_canonicalize(const char* localeID,
2053                   char* name,
2054                   int32_t nameCapacity,
2055                   UErrorCode* err)
2056 {
2057     return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2058 }
2059
2060 U_CAPI const char*  U_EXPORT2
2061 uloc_getISO3Language(const char* localeID)
2062 {
2063     int16_t offset;
2064     char lang[ULOC_LANG_CAPACITY];
2065     UErrorCode err = U_ZERO_ERROR;
2066
2067     if (localeID == NULL)
2068     {
2069         localeID = uloc_getDefault();
2070     }
2071     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2072     if (U_FAILURE(err))
2073         return "";
2074     offset = _findIndex(LANGUAGES, lang);
2075     if (offset < 0)
2076         return "";
2077     return LANGUAGES_3[offset];
2078 }
2079
2080 U_CAPI const char*  U_EXPORT2
2081 uloc_getISO3Country(const char* localeID)
2082 {
2083     int16_t offset;
2084     char cntry[ULOC_LANG_CAPACITY];
2085     UErrorCode err = U_ZERO_ERROR;
2086
2087     if (localeID == NULL)
2088     {
2089         localeID = uloc_getDefault();
2090     }
2091     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2092     if (U_FAILURE(err))
2093         return "";
2094     offset = _findIndex(COUNTRIES, cntry);
2095     if (offset < 0)
2096         return "";
2097
2098     return COUNTRIES_3[offset];
2099 }
2100
2101 U_CAPI uint32_t  U_EXPORT2
2102 uloc_getLCID(const char* localeID)
2103 {
2104     UErrorCode status = U_ZERO_ERROR;
2105     char       langID[ULOC_FULLNAME_CAPACITY];
2106
2107     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2108     if (U_FAILURE(status)) {
2109         return 0;
2110     }
2111
2112     if (uprv_strchr(localeID, '@')) {
2113         // uprv_convertToLCID does not support keywords other than collation.
2114         // Remove all keywords except collation.
2115         int32_t len;
2116         char collVal[ULOC_KEYWORDS_CAPACITY];
2117         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2118
2119         len = uloc_getKeywordValue(localeID, "collation", collVal,
2120             sizeof(collVal)/sizeof(collVal[0]) - 1, &status);
2121
2122         if (U_SUCCESS(status) && len > 0) {
2123             collVal[len] = 0;
2124
2125             len = uloc_getBaseName(localeID, tmpLocaleID,
2126                 sizeof(tmpLocaleID)/sizeof(tmpLocaleID[0]) - 1, &status);
2127
2128             if (U_SUCCESS(status)) {
2129                 tmpLocaleID[len] = 0;
2130
2131                 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2132                     sizeof(tmpLocaleID)/sizeof(tmpLocaleID[0]) - len - 1, &status);
2133
2134                 if (U_SUCCESS(status)) {
2135                     tmpLocaleID[len] = 0;
2136                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
2137                 }
2138             }
2139         }
2140
2141         // fall through - all keywords are simply ignored
2142         status = U_ZERO_ERROR;
2143     }
2144
2145     return uprv_convertToLCID(langID, localeID, &status);
2146 }
2147
2148 U_CAPI int32_t U_EXPORT2
2149 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2150                 UErrorCode *status)
2151 {
2152     return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2153 }
2154
2155 /* ### Default locale **************************************************/
2156
2157 U_CAPI const char*  U_EXPORT2
2158 uloc_getDefault()
2159 {
2160     return locale_get_default();
2161 }
2162
2163 U_CAPI void  U_EXPORT2
2164 uloc_setDefault(const char*   newDefaultLocale,
2165              UErrorCode* err)
2166 {
2167     if (U_FAILURE(*err))
2168         return;
2169     /* the error code isn't currently used for anything by this function*/
2170
2171     /* propagate change to C++ */
2172     locale_set_default(newDefaultLocale);
2173 }
2174
2175 /**
2176  * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2177  * to an array of pointers to arrays of char.  All of these pointers are owned
2178  * by ICU-- do not delete them, and do not write through them.  The array is
2179  * terminated with a null pointer.
2180  */
2181 U_CAPI const char* const*  U_EXPORT2
2182 uloc_getISOLanguages()
2183 {
2184     return LANGUAGES;
2185 }
2186
2187 /**
2188  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2189  * pointer to an array of pointers to arrays of char.  All of these pointers are
2190  * owned by ICU-- do not delete them, and do not write through them.  The array is
2191  * terminated with a null pointer.
2192  */
2193 U_CAPI const char* const*  U_EXPORT2
2194 uloc_getISOCountries()
2195 {
2196     return COUNTRIES;
2197 }
2198
2199
2200 /* this function to be moved into cstring.c later */
2201 static char gDecimal = 0;
2202
2203 static /* U_CAPI */
2204 double
2205 /* U_EXPORT2 */
2206 _uloc_strtod(const char *start, char **end) {
2207     char *decimal;
2208     char *myEnd;
2209     char buf[30];
2210     double rv;
2211     if (!gDecimal) {
2212         char rep[5];
2213         /* For machines that decide to change the decimal on you,
2214         and try to be too smart with localization.
2215         This normally should be just a '.'. */
2216         sprintf(rep, "%+1.1f", 1.0);
2217         gDecimal = rep[2];
2218     }
2219
2220     if(gDecimal == '.') {
2221         return uprv_strtod(start, end); /* fall through to OS */
2222     } else {
2223         uprv_strncpy(buf, start, 29);
2224         buf[29]=0;
2225         decimal = uprv_strchr(buf, '.');
2226         if(decimal) {
2227             *decimal = gDecimal;
2228         } else {
2229             return uprv_strtod(start, end); /* no decimal point */
2230         }
2231         rv = uprv_strtod(buf, &myEnd);
2232         if(end) {
2233             *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2234         }
2235         return rv;
2236     }
2237 }
2238
/* One parsed entry of an Accept-Language list. */
typedef struct {
    float q;        /* quality value of the entry */
    int32_t dummy;  /* always set to 0, to avoid uninitialized memory copy from qsort */
    char *locale;   /* heap-allocated locale ID string */
} _acceptLangItem;
2244
2245 static int32_t U_CALLCONV
2246 uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2247 {
2248     const _acceptLangItem *aa = (const _acceptLangItem*)a;
2249     const _acceptLangItem *bb = (const _acceptLangItem*)b;
2250
2251     int32_t rc = 0;
2252     if(bb->q < aa->q) {
2253         rc = -1;  /* A > B */
2254     } else if(bb->q > aa->q) {
2255         rc = 1;   /* A < B */
2256     } else {
2257         rc = 0;   /* A = B */
2258     }
2259
2260     if(rc==0) {
2261         rc = uprv_stricmp(aa->locale, bb->locale);
2262     }
2263
2264 #if defined(ULOC_DEBUG)
2265     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2266     aa->locale, aa->q,
2267     bb->locale, bb->q,
2268     rc);*/
2269 #endif
2270
2271     return rc;
2272 }
2273
2274 /*
2275 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2276 */
2277
2278 U_CAPI int32_t U_EXPORT2
2279 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2280                             const char *httpAcceptLanguage,
2281                             UEnumeration* availableLocales,
2282                             UErrorCode *status)
2283 {
2284     _acceptLangItem *j;
2285     _acceptLangItem smallBuffer[30];
2286     char **strs;
2287     char tmp[ULOC_FULLNAME_CAPACITY +1];
2288     int32_t n = 0;
2289     const char *itemEnd;
2290     const char *paramEnd;
2291     const char *s;
2292     const char *t;
2293     int32_t res;
2294     int32_t i;
2295     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2296     int32_t jSize;
2297     char *tempstr; /* Use for null pointer check */
2298
2299     j = smallBuffer;
2300     jSize = sizeof(smallBuffer)/sizeof(smallBuffer[0]);
2301     if(U_FAILURE(*status)) {
2302         return -1;
2303     }
2304
2305     for(s=httpAcceptLanguage;s&&*s;) {
2306         while(isspace(*s)) /* eat space at the beginning */
2307             s++;
2308         itemEnd=uprv_strchr(s,',');
2309         paramEnd=uprv_strchr(s,';');
2310         if(!itemEnd) {
2311             itemEnd = httpAcceptLanguage+l; /* end of string */
2312         }
2313         if(paramEnd && paramEnd<itemEnd) {
2314             /* semicolon (;) is closer than end (,) */
2315             t = paramEnd+1;
2316             if(*t=='q') {
2317                 t++;
2318             }
2319             while(isspace(*t)) {
2320                 t++;
2321             }
2322             if(*t=='=') {
2323                 t++;
2324             }
2325             while(isspace(*t)) {
2326                 t++;
2327             }
2328             j[n].q = (float)_uloc_strtod(t,NULL);
2329         } else {
2330             /* no semicolon - it's 1.0 */
2331             j[n].q = 1.0f;
2332             paramEnd = itemEnd;
2333         }
2334         j[n].dummy=0;
2335         /* eat spaces prior to semi */
2336         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2337             ;
2338         /* Check for null pointer from uprv_strndup */
2339         tempstr = uprv_strndup(s,(int32_t)((t+1)-s));
2340         if (tempstr == NULL) {
2341             *status = U_MEMORY_ALLOCATION_ERROR;
2342             return -1;
2343         }
2344         j[n].locale = tempstr;
2345         uloc_canonicalize(j[n].locale,tmp,sizeof(tmp)/sizeof(tmp[0]),status);
2346         if(strcmp(j[n].locale,tmp)) {
2347             uprv_free(j[n].locale);
2348             j[n].locale=uprv_strdup(tmp);
2349         }
2350 #if defined(ULOC_DEBUG)
2351         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2352 #endif
2353         n++;
2354         s = itemEnd;
2355         while(*s==',') { /* eat duplicate commas */
2356             s++;
2357         }
2358         if(n>=jSize) {
2359             if(j==smallBuffer) {  /* overflowed the small buffer. */
2360                 j = static_cast<_acceptLangItem *>(uprv_malloc(sizeof(j[0])*(jSize*2)));
2361                 if(j!=NULL) {
2362                     uprv_memcpy(j,smallBuffer,sizeof(j[0])*jSize);
2363                 }
2364 #if defined(ULOC_DEBUG)
2365                 fprintf(stderr,"malloced at size %d\n", jSize);
2366 #endif
2367             } else {
2368                 j = static_cast<_acceptLangItem *>(uprv_realloc(j, sizeof(j[0])*jSize*2));
2369 #if defined(ULOC_DEBUG)
2370                 fprintf(stderr,"re-alloced at size %d\n", jSize);
2371 #endif
2372             }
2373             jSize *= 2;
2374             if(j==NULL) {
2375                 *status = U_MEMORY_ALLOCATION_ERROR;
2376                 return -1;
2377             }
2378         }
2379     }
2380     uprv_sortArray(j, n, sizeof(j[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2381     if(U_FAILURE(*status)) {
2382         if(j != smallBuffer) {
2383 #if defined(ULOC_DEBUG)
2384             fprintf(stderr,"freeing j %p\n", j);
2385 #endif
2386             uprv_free(j);
2387         }
2388         return -1;
2389     }
2390     strs = static_cast<char **>(uprv_malloc((size_t)(sizeof(strs[0])*n)));
2391     /* Check for null pointer */
2392     if (strs == NULL) {
2393         uprv_free(j); /* Free to avoid memory leak */
2394         *status = U_MEMORY_ALLOCATION_ERROR;
2395         return -1;
2396     }
2397     for(i=0;i<n;i++) {
2398 #if defined(ULOC_DEBUG)
2399         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2400 #endif
2401         strs[i]=j[i].locale;
2402     }
2403     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2404         (const char**)strs, n, availableLocales, status);
2405     for(i=0;i<n;i++) {
2406         uprv_free(strs[i]);
2407     }
2408     uprv_free(strs);
2409     if(j != smallBuffer) {
2410 #if defined(ULOC_DEBUG)
2411         fprintf(stderr,"freeing j %p\n", j);
2412 #endif
2413         uprv_free(j);
2414     }
2415     return res;
2416 }
2417
2418
2419 U_CAPI int32_t U_EXPORT2
2420 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2421                     UAcceptResult *outResult, const char **acceptList,
2422                     int32_t acceptListCount,
2423                     UEnumeration* availableLocales,
2424                     UErrorCode *status)
2425 {
2426     int32_t i,j;
2427     int32_t len;
2428     int32_t maxLen=0;
2429     char tmp[ULOC_FULLNAME_CAPACITY+1];
2430     const char *l;
2431     char **fallbackList;
2432     if(U_FAILURE(*status)) {
2433         return -1;
2434     }
2435     fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2436     if(fallbackList==NULL) {
2437         *status = U_MEMORY_ALLOCATION_ERROR;
2438         return -1;
2439     }
2440     for(i=0;i<acceptListCount;i++) {
2441 #if defined(ULOC_DEBUG)
2442         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2443 #endif
2444         while((l=uenum_next(availableLocales, NULL, status))) {
2445 #if defined(ULOC_DEBUG)
2446             fprintf(stderr,"  %s\n", l);
2447 #endif
2448             len = (int32_t)uprv_strlen(l);
2449             if(!uprv_strcmp(acceptList[i], l)) {
2450                 if(outResult) {
2451                     *outResult = ULOC_ACCEPT_VALID;
2452                 }
2453 #if defined(ULOC_DEBUG)
2454                 fprintf(stderr, "MATCH! %s\n", l);
2455 #endif
2456                 if(len>0) {
2457                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2458                 }
2459                 for(j=0;j<i;j++) {
2460                     uprv_free(fallbackList[j]);
2461                 }
2462                 uprv_free(fallbackList);
2463                 return u_terminateChars(result, resultAvailable, len, status);
2464             }
2465             if(len>maxLen) {
2466                 maxLen = len;
2467             }
2468         }
2469         uenum_reset(availableLocales, status);
2470         /* save off parent info */
2471         if(uloc_getParent(acceptList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2472             fallbackList[i] = uprv_strdup(tmp);
2473         } else {
2474             fallbackList[i]=0;
2475         }
2476     }
2477
2478     for(maxLen--;maxLen>0;maxLen--) {
2479         for(i=0;i<acceptListCount;i++) {
2480             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2481 #if defined(ULOC_DEBUG)
2482                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2483 #endif
2484                 while((l=uenum_next(availableLocales, NULL, status))) {
2485 #if defined(ULOC_DEBUG)
2486                     fprintf(stderr,"  %s\n", l);
2487 #endif
2488                     len = (int32_t)uprv_strlen(l);
2489                     if(!uprv_strcmp(fallbackList[i], l)) {
2490                         if(outResult) {
2491                             *outResult = ULOC_ACCEPT_FALLBACK;
2492                         }
2493 #if defined(ULOC_DEBUG)
2494                         fprintf(stderr, "fallback MATCH! %s\n", l);
2495 #endif
2496                         if(len>0) {
2497                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2498                         }
2499                         for(j=0;j<acceptListCount;j++) {
2500                             uprv_free(fallbackList[j]);
2501                         }
2502                         uprv_free(fallbackList);
2503                         return u_terminateChars(result, resultAvailable, len, status);
2504                     }
2505                 }
2506                 uenum_reset(availableLocales, status);
2507
2508                 if(uloc_getParent(fallbackList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2509                     uprv_free(fallbackList[i]);
2510                     fallbackList[i] = uprv_strdup(tmp);
2511                 } else {
2512                     uprv_free(fallbackList[i]);
2513                     fallbackList[i]=0;
2514                 }
2515             }
2516         }
2517         if(outResult) {
2518             *outResult = ULOC_ACCEPT_FAILED;
2519         }
2520     }
2521     for(i=0;i<acceptListCount;i++) {
2522         uprv_free(fallbackList[i]);
2523     }
2524     uprv_free(fallbackList);
2525     return -1;
2526 }
2527
2528 /*eof*/