icuSources/common/uloc.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 1997-2014, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *
   7 * File ULOC.CPP
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   04/01/97    aliu        Creation.
  13 *   08/21/98    stephen     JDK 1.2 sync
  14 *   12/08/98    rtg         New Locale implementation and C API
  15 *   03/15/99    damiba      overhaul.
  16 *   04/06/99    stephen     changed setDefault() to realloc and copy
  17 *   06/14/99    stephen     Changed calls to ures_open for new params
  18 *   07/21/99    stephen     Modified setDefault() to propagate to C++
  19 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
  20 *                           brought canonicalization code into line with spec
  21 *****************************************************************************/
  22
  23 /*
  24    POSIX's locale format, from putil.c: [no spaces]
  25
  26      ll [ _CC ] [ . MM ] [ @ VV]
  27
  28      l = lang, C = ctry, M = charmap, V = variant
  29 */
  30
  31 #include "unicode/utypes.h"
  32 #include "unicode/ustring.h"
  33 #include "unicode/uloc.h"
  34
  35 #include "putilimp.h"
  36 #include "ustr_imp.h"
  37 #include "ulocimp.h"
  38 #include "umutex.h"
  39 #include "cstring.h"
  40 #include "cmemory.h"
  41 #include "locmap.h"
  42 #include "uarrsort.h"
  43 #include "uenumimp.h"
  44 #include "uassert.h"
  45
  46 #include <stdio.h> /* for sprintf */
  47
  48 /* ### Declarations **************************************************/
  49
  50 /* Locale stuff from locid.cpp */
  51 U_CFUNC void locale_set_default(const char *id);
  52 U_CFUNC const char *locale_get_default(void);
  53 U_CFUNC int32_t
  54 locale_getKeywords(const char *localeID,
  55             char prev,
  56             char *keywords, int32_t keywordCapacity,
  57             char *values, int32_t valuesCapacity, int32_t *valLen,
  58             UBool valuesToo,
  59             UErrorCode *status);
  60
  61 /* ### Data tables **************************************************/
  62
  63 /**
  64  * Table of language codes, both 2- and 3-letter, with preference
  65  * given to 2-letter codes where possible.  Includes 3-letter codes
  66  * that lack a 2-letter equivalent.
  67  *
  68  * This list must be in sorted order.  This list is returned directly
  69  * to the user by some API.
  70  *
  71  * This list must be kept in sync with LANGUAGES_3, with corresponding
  72  * entries matched.
  73  *
  74  * This table should be terminated with a NULL entry, followed by a
  75  * second list, and another NULL entry.  The first list is visible to
  76  * user code when this array is returned by API.  The second list
  77  * contains codes we support, but do not expose through user API.
  78  *
  79  * Notes
  80  *
  81  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
  82  * include the revisions up to 2001/7/27 *CWB*
  83  *
  84  * The 3 character codes are the terminology codes like RFC 3066.  This
  85  * is compatible with prior ICU codes
  86  *
   87  * "in", "iw", "ji", "jw" and "sh" have been withdrawn.  They are kept,
   88  * but moved to the end of the table, because their 3-character codes
   89  * duplicate those of other entries.  This avoids bad searches going
   90  * from 3- to 2-character codes.
  91  *
  92  * The range qaa-qtz is reserved for local use
  93  */
  94 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
  95 /* ISO639 table version is 20130531 */
   96 static const char * const LANGUAGES[] = { /* index-parallel to LANGUAGES_3; two NULL-terminated lists back to back */
   97     "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",
   98     "afa", "afh", "agq", "ain", "ak",  "akk", "ale", "alg",
   99     "alt", "am",  "an",  "ang", "anp", "apa", "ar",  "arc",
  100     "arn", "arp", "art", "arw", "as",  "asa", "ast", "ath",
  101     "aus", "av",  "awa", "ay",  "az",
  102     "ba",  "bad", "bai", "bal", "ban", "bas", "bat", "bax",
  103     "bbj", "be",  "bej", "bem", "ber", "bez", "bfd", "bg",
  104     "bh",  "bho", "bi",  "bik", "bin", "bkm", "bla", "bm",
  105     "bn",  "bnt", "bo",  "br",  "bra", "brx", "bs",  "bss",
  106     "btk", "bua", "bug", "bum", "byn", "byv",
  107     "ca",  "cad", "cai", "car", "cau", "cay", "cch", "ce",
  108     "ceb", "cel", "cgg", "ch",  "chb", "chg", "chk", "chm",
  109     "chn", "cho", "chp", "chr", "chy", "ckb", "cmc", "co",
  110     "cop", "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",
  111     "csb", "cu",  "cus", "cv",  "cy",
  112     "da",  "dak", "dar", "dav", "day", "de",  "del", "den",
  113     "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum",
  114     "dv",  "dyo", "dyu", "dz",  "dzg",
  115     "ebu", "ee",  "efi", "egy", "eka", "el",  "elx", "en",
  116     "enm", "eo",  "es",  "et",  "eu",  "ewo",
  117     "fa",  "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",
  118     "fo",  "fon", "fr",  "frm", "fro", "frr", "frs", "fur",
  119     "fy",
  120     "ga",  "gaa", "gay", "gba", "gd",  "gem", "gez", "gil",
  121     "gl",  "gmh", "gn",  "goh", "gon", "gor", "got", "grb",
  122     "grc", "gsw", "gu",  "guz", "gv",  "gwi",
  123     "ha",  "hai", "haw", "he",  "hi",  "hil", "him", "hit",
  124     "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",
  125     "hz",
  126     "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ijo",
  127     "ik",  "ilo", "inc", "ine", "inh", "io",  "ira", "iro",
  128     "is",  "it",  "iu",
  129     "ja",  "jbo", "jgo", "jmc", "jpr", "jrb", "jv",
  130     "ka",  "kaa", "kab", "kac", "kaj", "kam", "kar", "kaw",
  131     "kbd", "kbl", "kcg", "kde", "kea", "kfo", "kg",  "kha",
  132     "khi", "kho", "khq", "ki",  "kj",  "kk",  "kkj", "kl",
  133     "kln", "km",  "kmb", "kn",  "ko",  "kok", "kos", "kpe",
  134     "kr",  "krc", "krl", "kro", "kru", "ks",  "ksb", "ksf",
  135     "ksh", "ku",  "kum", "kut", "kv",  "kw",  "ky",
  136     "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lg",
  137     "li",  "lkt", "ln",  "lo",  "lol", "loz", "lt",  "lu",
  138     "lua", "lui", "lun", "luo", "lus", "luy", "lv",
  139     "mad", "maf", "mag", "mai", "mak", "man", "map", "mas",
  140     "mde", "mdf", "mdr", "men", "mer", "mfe", "mg",  "mga",
  141     "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
  142     "mkh", "ml",  "mn",  "mnc", "mni", "mno", "mo",  "moh",
  143     "mos", "mr",  "ms",  "mt",  "mua", "mul", "mun", "mus",
  144     "mwl", "mwr", "my",  "mye", "myn", "myv",
  145     "na",  "nah", "nai", "nap", "naq", "nb",  "nd",  "nds",
  146     "ne",  "new", "ng",  "nia", "nic", "niu", "nl",  "nmg",
  147     "nn",  "nnh", "no",  "nog", "non", "nqo", "nr",  "nso",
  148     "nub", "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo",
  149     "nzi",
  150     "oc",  "oj",  "om",  "or",  "os",  "osa", "ota", "oto",
  151     "pa",  "paa", "pag", "pal", "pam", "pap", "pau", "peo",
  152     "phi", "phn", "pi",  "pl",  "pon", "pra", "pro", "ps",
  153     "pt",
  154     "qu",
  155     "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rof",
  156     "rom", "ru",  "rup", "rw",  "rwk",
  157     "sa",  "sad", "sah", "sai", "sal", "sam", "saq", "sas",
  158     "sat", "sba", "sbp", "sc",  "scn", "sco", "sd",  "se",
  159     "see", "seh", "sel", "sem", "ses", "sg",  "sga", "sgn",
  160     "shi", "shn", "shu", "si",  "sid", "sio", "sit",
  161     "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",
  162     "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",
  163     "srn", "srr", "ss",  "ssa", "ssy", "st",  "su",  "suk",
  164     "sus", "sux", "sv",  "sw",  "swb", "swc", "syc", "syr",
  165     "ta",  "tai", "te",  "tem", "teo", "ter", "tet", "tg",
  166     "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tl",  "tlh",
  167     "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr",  "trv",
  168     "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",
  169     "twq", "ty",  "tyv", "tzm",
  170     "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
  171     "vai", "ve",  "vi",  "vo",  "vot", "vun",
  172     "wa",  "wae", "wak", "wal", "war", "was", "wen", "wo",
  173     "xal", "xh",  "xog",
  174     "yao", "yap", "yav", "ybb", "yi",  "yo",  "ypk", "yue",
  175     "za",  "zap", "zbl", "zen", "zgh", "zh",  "znd", "zu",
  176     "zun", "zxx", "zza",
  177 NULL, /* ends the first list (the one visible to user code when this array is returned by API) */
  178     "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
  179 NULL /* ends the second list (codes supported but not exposed through user API) */
  180 };
 181
  182 static const char* const DEPRECATED_LANGUAGES[]={ /* withdrawn 2-letter codes; entry i pairs with REPLACEMENT_LANGUAGES[i] */
  183     "in", "iw", "ji", "jw", NULL, NULL /* note: "sh" is in the obsolete list of LANGUAGES but has no entry here */
  184 };
  185 static const char* const REPLACEMENT_LANGUAGES[]={ /* current codes, same order: in->id, iw->he, ji->yi, jw->jv */
  186     "id", "he", "yi", "jv", NULL, NULL /* two NULLs, presumably so the two-list search used for LANGUAGES also works here -- confirm */
  187 };
 188
 189 /**
 190  * Table of 3-letter language codes.
 191  *
 192  * This is a lookup table used to convert 3-letter language codes to
 193  * their 2-letter equivalent, where possible.  It must be kept in sync
 194  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
 195  * same language as LANGUAGES_3[i].  The commented-out lines are
 196  * copied from LANGUAGES to make eyeballing this baby easier.
 197  *
 198  * Where a 3-letter language code has no 2-letter equivalent, the
 199  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 200  *
 201  * This table should be terminated with a NULL entry, followed by a
 202  * second list, and another NULL entry.  The two lists correspond to
 203  * the two lists in LANGUAGES.
 204  */
 205 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
 206 /* ISO639 table version is 20130531 */
  207 static const char * const LANGUAGES_3[] = { /* LANGUAGES_3[i] is the 3-letter code for LANGUAGES[i] */
  208     "aar", "abk", "ace", "ach", "ada", "ady", "ave", "afr",
  209     "afa", "afh", "agq", "ain", "aka", "akk", "ale", "alg",
  210     "alt", "amh", "arg", "ang", "anp", "apa", "ara", "arc",
  211     "arn", "arp", "art", "arw", "asm", "asa", "ast", "ath",
  212     "aus", "ava", "awa", "aym", "aze",
  213     "bak", "bad", "bai", "bal", "ban", "bas", "bat", "bax",
  214     "bbj", "bel", "bej", "bem", "ber", "bez", "bfd", "bul",
  215     "bih", "bho", "bis", "bik", "bin", "bkm", "bla", "bam",
  216     "ben", "bnt", "bod", "bre", "bra", "brx", "bos", "bss",
  217     "btk", "bua", "bug", "bum", "byn", "byv",
  218     "cat", "cad", "cai", "car", "cau", "cay", "cch", "che",
  219     "ceb", "cel", "cgg", "cha", "chb", "chg", "chk", "chm",
  220     "chn", "cho", "chp", "chr", "chy", "ckb", "cmc", "cos",
  221     "cop", "cpe", "cpf", "cpp", "cre", "crh", "crp", "ces",
  222     "csb", "chu", "cus", "chv", "cym",
  223     "dan", "dak", "dar", "dav", "day", "deu", "del", "den",
  224     "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum",
  225     "div", "dyo", "dyu", "dzo", "dzg",
  226     "ebu", "ewe", "efi", "egy", "eka", "ell", "elx", "eng",
  227     "enm", "epo", "spa", "est", "eus", "ewo",
  228     "fas", "fan", "fat", "ful", "fin", "fil", "fiu", "fij",
  229     "fao", "fon", "fra", "frm", "fro", "frr", "frs", "fur",
  230     "fry",
  231     "gle", "gaa", "gay", "gba", "gla", "gem", "gez", "gil",
  232     "glg", "gmh", "grn", "goh", "gon", "gor", "got", "grb",
  233     "grc", "gsw", "guj", "guz", "glv", "gwi",
  234     "hau", "hai", "haw", "heb", "hin", "hil", "him", "hit",
  235     "hmn", "hmo", "hrv", "hsb", "hat", "hun", "hup", "hye",
  236     "her",
  237     "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ijo",
  238     "ipk", "ilo", "inc", "ine", "inh", "ido", "ira", "iro",
  239     "isl", "ita", "iku",
  240     "jpn", "jbo", "jgo", "jmc", "jpr", "jrb", "jav",
  241     "kat", "kaa", "kab", "kac", "kaj", "kam", "kar", "kaw",
  242     "kbd", "kbl", "kcg", "kde", "kea", "kfo", "kon", "kha",
  243     "khi", "kho", "khq", "kik", "kua", "kaz", "kkj", "kal",
  244     "kln", "khm", "kmb", "kan", "kor", "kok", "kos", "kpe",
  245     "kau", "krc", "krl", "kro", "kru", "kas", "ksb", "ksf",
  246     "ksh", "kur", "kum", "kut", "kom", "cor", "kir",
  247     "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lug",
  248     "lim", "lkt", "lin", "lao", "lol", "loz", "lit", "lub",
  249     "lua", "lui", "lun", "luo", "lus", "luy", "lav",
  250     "mad", "maf", "mag", "mai", "mak", "man", "map", "mas",
  251     "mde", "mdf", "mdr", "men", "mer", "mfe", "mlg", "mga",
  252     "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
  253     "mkh", "mal", "mon", "mnc", "mni", "mno", "mol", "moh",
  254     "mos", "mar", "msa", "mlt", "mua", "mul", "mun", "mus",
  255     "mwl", "mwr", "mya", "mye", "myn", "myv",
  256     "nau", "nah", "nai", "nap", "naq", "nob", "nde", "nds",
  257     "nep", "new", "ndo", "nia", "nic", "niu", "nld", "nmg",
  258     "nno", "nnh", "nor", "nog", "non", "nqo", "nbl", "nso",
  259     "nub", "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo",
  260     "nzi",
  261     "oci", "oji", "orm", "ori", "oss", "osa", "ota", "oto",
  262     "pan", "paa", "pag", "pal", "pam", "pap", "pau", "peo",
  263     "phi", "phn", "pli", "pol", "pon", "pra", "pro", "pus",
  264     "por",
  265     "que",
  266     "raj", "rap", "rar", "roh", "run", "ron", "roa", "rof",
  267     "rom", "rus", "rup", "kin", "rwk",
  268     "san", "sad", "sah", "sai", "sal", "sam", "saq", "sas",
  269     "sat", "sba", "sbp", "srd", "scn", "sco", "snd", "sme",
  270     "see", "seh", "sel", "sem", "ses", "sag", "sga", "sgn",
  271     "shi", "shn", "shu", "sin", "sid", "sio", "sit",
  272     "slk", "slv", "sla", "smo", "sma", "smi", "smj", "smn",
  273     "sms", "sna", "snk", "som", "sog", "son", "sqi", "srp",
  274     "srn", "srr", "ssw", "ssa", "ssy", "sot", "sun", "suk",
  275     "sus", "sux", "swe", "swa", "swb", "swc", "syc", "syr",
  276     "tam", "tai", "tel", "tem", "teo", "ter", "tet", "tgk",
  277     "tha", "tir", "tig", "tiv", "tuk", "tkl", "tgl", "tlh",
  278     "tli", "tmh", "tsn", "ton", "tog", "tpi", "tur", "trv",
  279     "tso", "tsi", "tat", "tum", "tup", "tut", "tvl", "twi",
  280     "twq", "tah", "tyv", "tzm",
  281     "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
  282     "vai", "ven", "vie", "vol", "vot", "vun",
  283     "wln", "wae", "wak", "wal", "war", "was", "wen", "wol",
  284     "xal", "xho", "xog",
  285     "yao", "yap", "yav", "ybb", "yid", "yor", "ypk", "yue",
  286     "zha", "zap", "zbl", "zen", "zgh", "zho", "znd", "zul",
  287     "zun", "zxx", "zza",
  288 NULL, /* ends the first list; lines up with the first NULL in LANGUAGES */
  289 /*  "in",  "iw",  "ji",  "jw",  "sh",                          */
  290     "ind", "heb", "yid", "jaw", "srp", /* these 3-letter codes for the obsolete 2-letter codes also occur in the first list, except "jaw" */
  291 NULL /* ends the second list */
  292 };
 293
 294 /**
 295  * Table of 2-letter country codes.
 296  *
 297  * This list must be in sorted order.  This list is returned directly
 298  * to the user by some API.
 299  *
 300  * This list must be kept in sync with COUNTRIES_3, with corresponding
 301  * entries matched.
 302  *
 303  * This table should be terminated with a NULL entry, followed by a
 304  * second list, and another NULL entry.  The first list is visible to
 305  * user code when this array is returned by API.  The second list
 306  * contains codes we support, but do not expose through user API.
 307  *
 308  * Notes:
 309  *
 310  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 311  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
 312  * new codes keeping the old ones for compatibility updated to include
 313  * 1999/12/03 revisions *CWB*
 314  *
 315  * RO(ROM) is now RO(ROU) according to
 316  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
 317  */
  318 static const char * const COUNTRIES[] = { /* index-parallel to COUNTRIES_3; two NULL-terminated lists back to back */
  319     "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
  320     "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
  321     "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
  322     "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
  323     "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
  324     "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
  325     "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
  326     "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
  327     "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
  328     "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
  329     "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
  330     "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
  331     "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
  332     "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
  333     "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
  334     "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
  335     "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
  336     "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
  337     "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
  338     "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
  339     "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
  340     "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
  341     "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
  342     "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
  343     "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
  344     "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
  345     "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
  346     "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
  347     "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
  348     "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
  349 NULL, /* ends the first list (the one visible to user code when this array is returned by API) */
  350     "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
  351 NULL /* ends the second list (codes supported but not exposed through user API) */
  352 };
 353
  354 static const char* const DEPRECATED_COUNTRIES[] = { /* entry i pairs with REPLACEMENT_COUNTRIES[i] */
  355     "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
  356 };
  357 static const char* const REPLACEMENT_COUNTRIES[] = { /* current code for each deprecated code, in the same order */
  358 /*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
  359     "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
  360 };
 361
 362 /**
 363  * Table of 3-letter country codes.
 364  *
 365  * This is a lookup table used to convert 3-letter country codes to
 366  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
 367  * For all valid i, COUNTRIES[i] must refer to the same country as
 368  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
 369  * to make eyeballing this baby easier.
 370  *
 371  * This table should be terminated with a NULL entry, followed by a
 372  * second list, and another NULL entry.  The two lists correspond to
 373  * the two lists in COUNTRIES.
 374  */
  375 static const char * const COUNTRIES_3[] = { /* COUNTRIES_3[i] is the 3-letter code for COUNTRIES[i] */
  376 /*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
  377     "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
  378 /*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
  379     "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
  380 /*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
  381     "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
  382 /*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
  383     "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
  384 /*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
  385     "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
  386 /*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
  387     "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
  388 /*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
  389     "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
  390 /*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
  391     "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
  392 /*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
  393     "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
  394 /*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
  395     "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
  396 /*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
  397     "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
  398 /*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
  399     "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
  400 /*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
  401     "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
  402 /*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
  403     "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
  404 /*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
  405     "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
  406 /*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
  407     "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
  408 /*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
  409     "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
  410 /*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
  411     "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
  412 /*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
  413     "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
  414 /*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
  415     "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
  416 /*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
  417     "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
  418 /*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
  419     "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
  420 /*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
  421     "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
  422 /*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
  423     "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
  424 /*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
  425     "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
  426 /*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
  427     "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
  428 /*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
  429     "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
  430 /*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
  431     "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
  432 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
  433     "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
  434 /*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
  435     "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
  436 NULL, /* ends the first list; lines up with the first NULL in COUNTRIES */
  437 /*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
  438     "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR", /* includes ROM, the old 3-letter code for RO (ROU in the first list) */
  439 NULL /* ends the second list */
  440 };
 441
  442 typedef struct CanonicalizationMap { /* one row of CANONICALIZE_MAP */
  443     const char *id;          /* input ID */
  444     const char *canonicalID; /* canonicalized output ID */
  445     const char *keyword;     /* keyword, or NULL if none */
  446     const char *value;       /* keyword value, or NULL if keyword==NULL */
  447 } CanonicalizationMap;
 448
 449 /**
 450  * A map to canonicalize locale IDs.  This handles a variety of
 451  * different semantic kinds of transformations.
 452  */
  453 static const CanonicalizationMap CANONICALIZE_MAP[] = { /* rows are { id, canonicalID, keyword, value } */
  454     { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
  455     { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
  456     { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
  457     { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
  458     { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
  459     { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
  460     { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" }, /* *_PREEURO rows: the variant is dropped and a pre-euro "currency" keyword value is supplied */
  461     { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
  462     { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
  463     { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
  464     { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
  465     { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
  466     { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
  467     { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
  468     { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
  469     { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
  470     { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
  471     { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
  472     { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
  473     { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
  474     { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
  475     { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
  476     { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
  477     { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
  478     { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
  479     { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
  480     { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
  481     { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
  482     { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
  483     { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
  484     { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
  485     { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
  486     { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
  487     { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
  488     { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
  489     { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
  490     { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
  491     { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
  492     { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
  493     { "zh_GAN",         "gan", NULL, NULL }, /* registered name */
  494     { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
  495     { "zh_HAKKA",       "hak", NULL, NULL }, /* registered name */
  496     { "zh_MIN_NAN",     "nan", NULL, NULL }, /* registered name */
  497     { "zh_WUU",         "wuu", NULL, NULL }, /* registered name */
  498     { "zh_XIANG",       "hsn", NULL, NULL }, /* registered name */
  499     { "zh_YUE",         "yue", NULL, NULL }, /* registered name */
  500 };
 501
  502 typedef struct VariantMap { /* one row of VARIANT_MAP */
  503     const char *variant;          /* input variant name */
  504     const char *keyword;     /* keyword, or NULL if none */
  505     const char *value;       /* keyword value, or NULL if keyword==NULL */
  506 } VariantMap;
 507
  508 static const VariantMap VARIANT_MAP[] = { /* rows are { variant, keyword, value } */
  509     { "EURO",   "currency", "EUR" }, /* EURO variant expressed as a currency keyword */
  510     { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
  511     { "STROKE", "collation", "stroke" }  /* Solaris variant */
  512 };
 513
 514 /* ### BCP47 Conversion *******************************************/
 515 /* Test if the locale id has BCP47 u extension and does not have '@' */
 516 #define _hasBCP47Extension(id) (id && uprv_strstr(id, "@") == NULL && getShortestSubtagLength(localeID) == 1)
 517 /* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
 518 #define _ConvertBCP47(finalID, id, buffer, length,err) \
 519         if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 || U_FAILURE(*err)) { \
 520             finalID=id; \
 521         } else { \
 522             finalID=buffer; \
 523         }
 524 /* Gets the size of the shortest subtag in the given localeID. */
 525 static int32_t getShortestSubtagLength(const char *localeID) {
 526     int32_t localeIDLength = uprv_strlen(localeID);
 527     int32_t length = localeIDLength;
 528     int32_t tmpLength = 0;
 529     int32_t i;
 530     UBool reset = TRUE;
 531
 532     for (i = 0; i < localeIDLength; i++) {
 533         if (localeID[i] != '_' && localeID[i] != '-') {
 534             if (reset) {
 535                 tmpLength = 0;
 536                 reset = FALSE;
 537             }
 538             tmpLength++;
 539         } else {
 540             if (tmpLength != 0 && tmpLength < length) {
 541                 length = tmpLength;
 542             }
 543             reset = TRUE;
 544         }
 545     }
 546
 547     return length;
 548 }
 549
 550 /* ### Keywords **************************************************/
 551
  552 #define ULOC_KEYWORD_BUFFER_LEN 25 /* size of KeywordStruct.keyword; keyword names of this length or longer are rejected */
  553 #define ULOC_MAX_NO_KEYWORDS 25 /* capacity of the keywordList array in _getKeywords */
 554
 555 U_CAPI const char * U_EXPORT2
 556 locale_getKeywordsStart(const char *localeID) {
 557     const char *result = NULL;
 558     if((result = uprv_strchr(localeID, '@')) != NULL) {
 559         return result;
 560     }
 561 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
 562     else {
 563         /* We do this because the @ sign is variant, and the @ sign used on one
 564         EBCDIC machine won't be compiled the same way on other EBCDIC based
 565         machines. */
 566         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
 567         const uint8_t *charToFind = ebcdicSigns;
 568         while(*charToFind) {
 569             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
 570                 return result;
 571             }
 572             charToFind++;
 573         }
 574     }
 575 #endif
 576     return NULL;
 577 }
 578
 579 /**
 580  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
 581  * @param keywordName incoming name to be canonicalized
 582  * @param status return status (keyword too long)
 583  * @return length of the keyword name
 584  */
 585 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
 586 {
 587   int32_t i;
 588   int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
 589
 590   if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
 591     /* keyword name too long for internal buffer */
 592     *status = U_INTERNAL_PROGRAM_ERROR;
 593           return 0;
 594   }
 595
 596   /* normalize the keyword name */
 597   for(i = 0; i < keywordNameLen; i++) {
 598     buf[i] = uprv_tolower(keywordName[i]);
 599   }
 600   buf[i] = 0;
 601
 602   return keywordNameLen;
 603 }
 604
  605 typedef struct { /* one keyword entry; element type of keywordList in _getKeywords */
  606     char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* keyword name; _getKeywords fills it lowercased with spaces dropped; sort key of compareKeywordStructs */
  607     int32_t keywordLen; /* presumably the length of keyword -- confirm against _getKeywords */
  608     const char *valueStart; /* presumably the start of the value text for this keyword -- confirm against _getKeywords */
  609     int32_t valueLen; /* presumably the length of the value at valueStart -- confirm against _getKeywords */
  610 } KeywordStruct;
 611
 612 static int32_t U_CALLCONV
 613 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
 614     const char* leftString = ((const KeywordStruct *)left)->keyword;
 615     const char* rightString = ((const KeywordStruct *)right)->keyword;
 616     return uprv_strcmp(leftString, rightString);
 617 }
 618
 619 /**
 620  * Both addKeyword and addValue must already be in canonical form.
 621  * Either both addKeyword and addValue are NULL, or neither is NULL.
 622  * If they are not NULL they must be zero terminated.
  623  * If addKeyword is not NULL it must have length small enough to fit in KeywordStruct.keyword.
 624  */
 625 static int32_t
 626 _getKeywords(const char *localeID,
 627              char prev,
 628              char *keywords, int32_t keywordCapacity,
 629              char *values, int32_t valuesCapacity, int32_t *valLen,
 630              UBool valuesToo,
 631              const char* addKeyword,
 632              const char* addValue,
 633              UErrorCode *status)
 634 {
 635     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
 636
 637     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
 638     int32_t numKeywords = 0;
 639     const char* pos = localeID;
 640     const char* equalSign = NULL;
 641     const char* semicolon = NULL;
 642     int32_t i = 0, j, n;
 643     int32_t keywordsLen = 0;
 644     int32_t valuesLen = 0;
 645
 646     if(prev == '@') { /* start of keyword definition */
 647         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
 648         do {
 649             UBool duplicate = FALSE;
 650             /* skip leading spaces */
 651             while(*pos == ' ') {
 652                 pos++;
 653             }
 654             if (!*pos) { /* handle trailing "; " */
 655                 break;
 656             }
 657             if(numKeywords == maxKeywords) {
 658                 *status = U_INTERNAL_PROGRAM_ERROR;
 659                 return 0;
 660             }
 661             equalSign = uprv_strchr(pos, '=');
 662             semicolon = uprv_strchr(pos, ';');
 663             /* lack of '=' [foo@currency] is illegal */
 664             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
 665             if(!equalSign || (semicolon && semicolon<equalSign)) {
 666                 *status = U_INVALID_FORMAT_ERROR;
 667                 return 0;
 668             }
 669             /* need to normalize both keyword and keyword name */
 670             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
 671                 /* keyword name too long for internal buffer */
 672                 *status = U_INTERNAL_PROGRAM_ERROR;
 673                 return 0;
 674             }
 675             for(i = 0, n = 0; i < equalSign - pos; ++i) {
 676                 if (pos[i] != ' ') {
 677                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
 678                 }
 679             }
 680
 681             /* zero-length keyword is an error. */
 682             if (n == 0) {
 683                 *status = U_INVALID_FORMAT_ERROR;
 684                 return 0;
 685             }
 686
 687             keywordList[numKeywords].keyword[n] = 0;
 688             keywordList[numKeywords].keywordLen = n;
 689             /* now grab the value part. First we skip the '=' */
 690             equalSign++;
 691             /* then we leading spaces */
 692             while(*equalSign == ' ') {
 693                 equalSign++;
 694             }
 695
 696             /* Premature end or zero-length value */
 697             if (!equalSign || equalSign == semicolon) {
 698                 *status = U_INVALID_FORMAT_ERROR;
 699                 return 0;
 700             }
 701
 702             keywordList[numKeywords].valueStart = equalSign;
 703
 704             pos = semicolon;
 705             i = 0;
 706             if(pos) {
 707                 while(*(pos - i - 1) == ' ') {
 708                     i++;
 709                 }
 710                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
 711                 pos++;
 712             } else {
 713                 i = (int32_t)uprv_strlen(equalSign);
 714                 while(i && equalSign[i-1] == ' ') {
 715                     i--;
 716                 }
 717                 keywordList[numKeywords].valueLen = i;
 718             }
 719             /* If this is a duplicate keyword, then ignore it */
 720             for (j=0; j<numKeywords; ++j) {
 721                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
 722                     duplicate = TRUE;
 723                     break;
 724                 }
 725             }
 726             if (!duplicate) {
 727                 ++numKeywords;
 728             }
 729         } while(pos);
 730
 731         /* Handle addKeyword/addValue. */
 732         if (addKeyword != NULL) {
 733             UBool duplicate = FALSE;
 734             U_ASSERT(addValue != NULL);
 735             /* Search for duplicate; if found, do nothing. Explicit keyword
 736                overrides addKeyword. */
 737             for (j=0; j<numKeywords; ++j) {
 738                 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
 739                     duplicate = TRUE;
 740                     break;
 741                 }
 742             }
 743             if (!duplicate) {
 744                 if (numKeywords == maxKeywords) {
 745                     *status = U_INTERNAL_PROGRAM_ERROR;
 746                     return 0;
 747                 }
 748                 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
 749                 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
 750                 keywordList[numKeywords].valueStart = addValue;
 751                 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
 752                 ++numKeywords;
 753             }
 754         } else {
 755             U_ASSERT(addValue == NULL);
 756         }
 757
 758         /* now we have a list of keywords */
 759         /* we need to sort it */
 760         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
 761
 762         /* Now construct the keyword part */
 763         for(i = 0; i < numKeywords; i++) {
 764             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
 765                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
 766                 if(valuesToo) {
 767                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
 768                 } else {
 769                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
 770                 }
 771             }
 772             keywordsLen += keywordList[i].keywordLen + 1;
 773             if(valuesToo) {
 774                 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
 775                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
 776                 }
 777                 keywordsLen += keywordList[i].valueLen;
 778
 779                 if(i < numKeywords - 1) {
 780                     if(keywordsLen < keywordCapacity) {
 781                         keywords[keywordsLen] = ';';
 782                     }
 783                     keywordsLen++;
 784                 }
 785             }
 786             if(values) {
 787                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
 788                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
 789                     values[valuesLen + keywordList[i].valueLen] = 0;
 790                 }
 791                 valuesLen += keywordList[i].valueLen + 1;
 792             }
 793         }
 794         if(values) {
 795             values[valuesLen] = 0;
 796             if(valLen) {
 797                 *valLen = valuesLen;
 798             }
 799         }
 800         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
 801     } else {
 802         return 0;
 803     }
 804 }
 805
 806 U_CFUNC int32_t
 807 locale_getKeywords(const char *localeID,
 808                    char prev,
 809                    char *keywords, int32_t keywordCapacity,
 810                    char *values, int32_t valuesCapacity, int32_t *valLen,
 811                    UBool valuesToo,
 812                    UErrorCode *status) {
 813     return _getKeywords(localeID, prev, keywords, keywordCapacity,
 814                         values, valuesCapacity, valLen, valuesToo,
 815                         NULL, NULL, status);
 816 }
 817
 818 U_CAPI int32_t U_EXPORT2
 819 uloc_getKeywordValue(const char* localeID,
 820                      const char* keywordName,
 821                      char* buffer, int32_t bufferCapacity,
 822                      UErrorCode* status)
 823 {
 824     const char* startSearchHere = NULL;
 825     const char* nextSeparator = NULL;
 826     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 827     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 828     int32_t i = 0;
 829     int32_t result = 0;
 830
 831     if(status && U_SUCCESS(*status) && localeID) {
 832       char tempBuffer[ULOC_FULLNAME_CAPACITY];
 833       const char* tmpLocaleID;
 834
 835       if (_hasBCP47Extension(localeID)) {
 836           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
 837       } else {
 838           tmpLocaleID=localeID;
 839       }
 840
 841       startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
 842       if(startSearchHere == NULL) {
 843           /* no keywords, return at once */
 844           return 0;
 845       }
 846
 847       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 848       if(U_FAILURE(*status)) {
 849         return 0;
 850       }
 851
 852       /* find the first keyword */
 853       while(startSearchHere) {
 854           startSearchHere++;
 855           /* skip leading spaces (allowed?) */
 856           while(*startSearchHere == ' ') {
 857               startSearchHere++;
 858           }
 859           nextSeparator = uprv_strchr(startSearchHere, '=');
 860           /* need to normalize both keyword and keyword name */
 861           if(!nextSeparator) {
 862               break;
 863           }
 864           if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
 865               /* keyword name too long for internal buffer */
 866               *status = U_INTERNAL_PROGRAM_ERROR;
 867               return 0;
 868           }
 869           for(i = 0; i < nextSeparator - startSearchHere; i++) {
 870               localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
 871           }
 872           /* trim trailing spaces */
 873           while(startSearchHere[i-1] == ' ') {
 874               i--;
 875               U_ASSERT(i>=0);
 876           }
 877           localeKeywordNameBuffer[i] = 0;
 878
 879           startSearchHere = uprv_strchr(nextSeparator, ';');
 880
 881           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
 882               nextSeparator++;
 883               while(*nextSeparator == ' ') {
 884                   nextSeparator++;
 885               }
 886               /* we actually found the keyword. Copy the value */
 887               if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
 888                   while(*(startSearchHere-1) == ' ') {
 889                       startSearchHere--;
 890                   }
 891                   uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
 892                   result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
 893               } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
 894                   i = (int32_t)uprv_strlen(nextSeparator);
 895                   while(nextSeparator[i - 1] == ' ') {
 896                       i--;
 897                   }
 898                   uprv_strncpy(buffer, nextSeparator, i);
 899                   result = u_terminateChars(buffer, bufferCapacity, i, status);
 900               } else {
 901                   /* give a bigger buffer, please */
 902                   *status = U_BUFFER_OVERFLOW_ERROR;
 903                   if(startSearchHere) {
 904                       result = (int32_t)(startSearchHere - nextSeparator);
 905                   } else {
 906                       result = (int32_t)uprv_strlen(nextSeparator);
 907                   }
 908               }
 909               return result;
 910           }
 911       }
 912     }
 913     return 0;
 914 }
 915
 916 U_CAPI int32_t U_EXPORT2
 917 uloc_setKeywordValue(const char* keywordName,
 918                      const char* keywordValue,
 919                      char* buffer, int32_t bufferCapacity,
 920                      UErrorCode* status)
 921 {
 922     /* TODO: sorting. removal. */
 923     int32_t keywordNameLen;
 924     int32_t keywordValueLen;
 925     int32_t bufLen;
 926     int32_t needLen = 0;
 927     int32_t foundValueLen;
 928     int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
 929     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 930     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 931     int32_t i = 0;
 932     int32_t rc;
 933     char* nextSeparator = NULL;
 934     char* nextEqualsign = NULL;
 935     char* startSearchHere = NULL;
 936     char* keywordStart = NULL;
 937     char *insertHere = NULL;
 938     if(U_FAILURE(*status)) {
 939         return -1;
 940     }
 941     if(bufferCapacity>1) {
 942         bufLen = (int32_t)uprv_strlen(buffer);
 943     } else {
 944         *status = U_ILLEGAL_ARGUMENT_ERROR;
 945         return 0;
 946     }
 947     if(bufferCapacity<bufLen) {
 948         /* The capacity is less than the length?! Is this NULL terminated? */
 949         *status = U_ILLEGAL_ARGUMENT_ERROR;
 950         return 0;
 951     }
 952     if(keywordValue && !*keywordValue) {
 953         keywordValue = NULL;
 954     }
 955     if(keywordValue) {
 956         keywordValueLen = (int32_t)uprv_strlen(keywordValue);
 957     } else {
 958         keywordValueLen = 0;
 959     }
 960     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 961     if(U_FAILURE(*status)) {
 962         return 0;
 963     }
 964     startSearchHere = (char*)locale_getKeywordsStart(buffer);
 965     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
 966         if(!keywordValue) { /* no keywords = nothing to remove */
 967             return bufLen;
 968         }
 969
 970         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
 971         if(startSearchHere) { /* had a single @ */
 972             needLen--; /* already had the @ */
 973             /* startSearchHere points at the @ */
 974         } else {
 975             startSearchHere=buffer+bufLen;
 976         }
 977         if(needLen >= bufferCapacity) {
 978             *status = U_BUFFER_OVERFLOW_ERROR;
 979             return needLen; /* no change */
 980         }
 981         *startSearchHere = '@';
 982         startSearchHere++;
 983         uprv_strcpy(startSearchHere, keywordNameBuffer);
 984         startSearchHere += keywordNameLen;
 985         *startSearchHere = '=';
 986         startSearchHere++;
 987         uprv_strcpy(startSearchHere, keywordValue);
 988         startSearchHere+=keywordValueLen;
 989         return needLen;
 990     } /* end shortcut - no @ */
 991
 992     keywordStart = startSearchHere;
 993     /* search for keyword */
 994     while(keywordStart) {
 995         keywordStart++;
 996         /* skip leading spaces (allowed?) */
 997         while(*keywordStart == ' ') {
 998             keywordStart++;
 999         }
1000         nextEqualsign = uprv_strchr(keywordStart, '=');
1001         /* need to normalize both keyword and keyword name */
1002         if(!nextEqualsign) {
1003             break;
1004         }
1005         if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1006             /* keyword name too long for internal buffer */
1007             *status = U_INTERNAL_PROGRAM_ERROR;
1008             return 0;
1009         }
1010         for(i = 0; i < nextEqualsign - keywordStart; i++) {
1011             localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1012         }
1013         /* trim trailing spaces */
1014         while(keywordStart[i-1] == ' ') {
1015             i--;
1016         }
1017         U_ASSERT(i>=0 && i<ULOC_KEYWORD_BUFFER_LEN);
1018         localeKeywordNameBuffer[i] = 0;
1019
1020         nextSeparator = uprv_strchr(nextEqualsign, ';');
1021         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1022         if(rc == 0) {
1023             nextEqualsign++;
1024             while(*nextEqualsign == ' ') {
1025                 nextEqualsign++;
1026             }
1027             /* we actually found the keyword. Change the value */
1028             if (nextSeparator) {
1029                 keywordAtEnd = 0;
1030                 foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1031             } else {
1032                 keywordAtEnd = 1;
1033                 foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1034             }
1035             if(keywordValue) { /* adding a value - not removing */
1036               if(foundValueLen == keywordValueLen) {
1037                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1038                 return bufLen; /* no change in size */
1039               } else if(foundValueLen > keywordValueLen) {
1040                 int32_t delta = foundValueLen - keywordValueLen;
1041                 if(nextSeparator) { /* RH side */
1042                   uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1043                 }
1044                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1045                 bufLen -= delta;
1046                 buffer[bufLen]=0;
1047                 return bufLen;
1048               } else { /* FVL < KVL */
1049                 int32_t delta = keywordValueLen - foundValueLen;
1050                 if((bufLen+delta) >= bufferCapacity) {
1051                   *status = U_BUFFER_OVERFLOW_ERROR;
1052                   return bufLen+delta;
1053                 }
1054                 if(nextSeparator) { /* RH side */
1055                   uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1056                 }
1057                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1058                 bufLen += delta;
1059                 buffer[bufLen]=0;
1060                 return bufLen;
1061               }
1062             } else { /* removing a keyword */
1063               if(keywordAtEnd) {
1064                 /* zero out the ';' or '@' just before startSearchhere */
1065                 keywordStart[-1] = 0;
1066                 return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1067               } else {
1068                 uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1069                 keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1070                 return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1071               }
1072             }
1073         } else if(rc<0){ /* end match keyword */
1074           /* could insert at this location. */
1075           insertHere = keywordStart;
1076         }
1077         keywordStart = nextSeparator;
1078     } /* end loop searching */
1079
1080     if(!keywordValue) {
1081       return bufLen; /* removal of non-extant keyword - no change */
1082     }
1083
1084     /* we know there is at least one keyword. */
1085     needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1086     if(needLen >= bufferCapacity) {
1087         *status = U_BUFFER_OVERFLOW_ERROR;
1088         return needLen; /* no change */
1089     }
1090
1091     if(insertHere) {
1092       uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1093       keywordStart = insertHere;
1094     } else {
1095       keywordStart = buffer+bufLen;
1096       *keywordStart = ';';
1097       keywordStart++;
1098     }
1099     uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1100     keywordStart += keywordNameLen;
1101     *keywordStart = '=';
1102     keywordStart++;
1103     uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1104     keywordStart+=keywordValueLen;
1105     if(insertHere) {
1106       *keywordStart = ';';
1107       keywordStart++;
1108     }
1109     buffer[needLen]=0;
1110     return needLen;
1111 }
1112
1113 /* ### ID parsing implementation **************************************************/
1114
/* TRUE if 'a' is one of the letters that can start a special prefix.
   The argument is parenthesized so that any expression can be passed;
   note that it is still evaluated more than once. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1125
1126 static char* _strnchr(const char* str, int32_t len, char c) {
1127     U_ASSERT(str != 0 && len >= 0);
1128     while (len-- != 0) {
1129         char d = *str;
1130         if (d == c) {
1131             return (char*) str;
1132         } else if (d == 0) {
1133             break;
1134         }
1135         ++str;
1136     }
1137     return NULL;
1138 }
1139
1140 /**
1141  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1142  * a NULL entry, followed by more entries, and a second NULL entry.
1143  *
1144  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1145  * COUNTRIES_3.
1146  */
1147 static int16_t _findIndex(const char* const* list, const char* key)
1148 {
1149     const char* const* anchor = list;
1150     int32_t pass = 0;
1151
1152     /* Make two passes through two NULL-terminated arrays at 'list' */
1153     while (pass++ < 2) {
1154         while (*list) {
1155             if (uprv_strcmp(key, *list) == 0) {
1156                 return (int16_t)(list - anchor);
1157             }
1158             list++;
1159         }
1160         ++list;     /* skip final NULL *CWB*/
1161     }
1162     return -1;
1163 }
1164
/* Copy as much of src as fits into dest (at most destCapacity chars, no NUL
   is written) and return the full length of src, i.e. strlen(src). */
static inline int32_t
_copyCount(char *dest, int32_t destCapacity, const char *src) {
    int32_t length = 0;

    /* copy phase: stops at the end of src or when dest is full */
    while (src[length] != 0 && length < destCapacity) {
        dest[length] = src[length];
        ++length;
    }
    /* count phase: measure whatever did not fit */
    while (src[length] != 0) {
        ++length;
    }
    return length;
}
1184
1185 U_CFUNC const char*
1186 uloc_getCurrentCountryID(const char* oldID){
1187     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1188     if (offset >= 0) {
1189         return REPLACEMENT_COUNTRIES[offset];
1190     }
1191     return oldID;
1192 }
1193 U_CFUNC const char*
1194 uloc_getCurrentLanguageID(const char* oldID){
1195     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1196     if (offset >= 0) {
1197         return REPLACEMENT_LANGUAGES[offset];
1198     }
1199     return oldID;
1200 }
1201 /*
1202  * the internal functions _getLanguage(), _getCountry(), _getVariant()
1203  * avoid duplicating code to handle the earlier locale ID pieces
1204  * in the functions for the later ones by
1205  * setting the *pEnd pointer to where they stopped parsing
1206  *
1207  * TODO try to use this in Locale
1208  */
1209 U_CFUNC int32_t
1210 ulocimp_getLanguage(const char *localeID,
1211                     char *language, int32_t languageCapacity,
1212                     const char **pEnd) {
1213     int32_t i=0;
1214     int32_t offset;
1215     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1216
1217     /* if it starts with i- or x- then copy that prefix */
1218     if(_isIDPrefix(localeID)) {
1219         if(i<languageCapacity) {
1220             language[i]=(char)uprv_tolower(*localeID);
1221         }
1222         if(i<languageCapacity) {
1223             language[i+1]='-';
1224         }
1225         i+=2;
1226         localeID+=2;
1227     }
1228
1229     /* copy the language as far as possible and count its length */
1230     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1231         if(i<languageCapacity) {
1232             language[i]=(char)uprv_tolower(*localeID);
1233         }
1234         if(i<3) {
1235             U_ASSERT(i>=0);
1236             lang[i]=(char)uprv_tolower(*localeID);
1237         }
1238         i++;
1239         localeID++;
1240     }
1241
1242     if(i==3) {
1243         /* convert 3 character code to 2 character code if possible *CWB*/
1244         offset=_findIndex(LANGUAGES_3, lang);
1245         if(offset>=0) {
1246             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1247         }
1248     }
1249
1250     if(pEnd!=NULL) {
1251         *pEnd=localeID;
1252     }
1253     return i;
1254 }
1255
1256 U_CFUNC int32_t
1257 ulocimp_getScript(const char *localeID,
1258                   char *script, int32_t scriptCapacity,
1259                   const char **pEnd)
1260 {
1261     int32_t idLen = 0;
1262
1263     if (pEnd != NULL) {
1264         *pEnd = localeID;
1265     }
1266
1267     /* copy the second item as far as possible and count its length */
1268     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1269             && uprv_isASCIILetter(localeID[idLen])) {
1270         idLen++;
1271     }
1272
1273     /* If it's exactly 4 characters long, then it's a script and not a country. */
1274     if (idLen == 4) {
1275         int32_t i;
1276         if (pEnd != NULL) {
1277             *pEnd = localeID+idLen;
1278         }
1279         if(idLen > scriptCapacity) {
1280             idLen = scriptCapacity;
1281         }
1282         if (idLen >= 1) {
1283             script[0]=(char)uprv_toupper(*(localeID++));
1284         }
1285         for (i = 1; i < idLen; i++) {
1286             script[i]=(char)uprv_tolower(*(localeID++));
1287         }
1288     }
1289     else {
1290         idLen = 0;
1291     }
1292     return idLen;
1293 }
1294
1295 U_CFUNC int32_t
1296 ulocimp_getCountry(const char *localeID,
1297                    char *country, int32_t countryCapacity,
1298                    const char **pEnd)
1299 {
1300     int32_t idLen=0;
1301     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1302     int32_t offset;
1303
1304     /* copy the country as far as possible and count its length */
1305     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1306         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1307             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1308         }
1309         idLen++;
1310     }
1311
1312     /* the country should be either length 2 or 3 */
1313     if (idLen == 2 || idLen == 3) {
1314         UBool gotCountry = FALSE;
1315         /* convert 3 character code to 2 character code if possible *CWB*/
1316         if(idLen==3) {
1317             offset=_findIndex(COUNTRIES_3, cnty);
1318             if(offset>=0) {
1319                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1320                 gotCountry = TRUE;
1321             }
1322         }
1323         if (!gotCountry) {
1324             int32_t i = 0;
1325             for (i = 0; i < idLen; i++) {
1326                 if (i < countryCapacity) {
1327                     country[i]=(char)uprv_toupper(localeID[i]);
1328                 }
1329             }
1330         }
1331         localeID+=idLen;
1332     } else {
1333         idLen = 0;
1334     }
1335
1336     if(pEnd!=NULL) {
1337         *pEnd=localeID;
1338     }
1339
1340     return idLen;
1341 }
1342
1343 /**
1344  * @param needSeparator if true, then add leading '_' if any variants
1345  * are added to 'variant'
1346  */
1347 static int32_t
1348 _getVariantEx(const char *localeID,
1349               char prev,
1350               char *variant, int32_t variantCapacity,
1351               UBool needSeparator) {
1352     int32_t i=0;
1353
1354     /* get one or more variant tags and separate them with '_' */
1355     if(_isIDSeparator(prev)) {
1356         /* get a variant string after a '-' or '_' */
1357         while(!_isTerminator(*localeID)) {
1358             if (needSeparator) {
1359                 if (i<variantCapacity) {
1360                     variant[i] = '_';
1361                 }
1362                 ++i;
1363                 needSeparator = FALSE;
1364             }
1365             if(i<variantCapacity) {
1366                 variant[i]=(char)uprv_toupper(*localeID);
1367                 if(variant[i]=='-') {
1368                     variant[i]='_';
1369                 }
1370             }
1371             i++;
1372             localeID++;
1373         }
1374     }
1375
1376     /* if there is no variant tag after a '-' or '_' then look for '@' */
1377     if(i==0) {
1378         if(prev=='@') {
1379             /* keep localeID */
1380         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1381             ++localeID; /* point after the '@' */
1382         } else {
1383             return 0;
1384         }
1385         while(!_isTerminator(*localeID)) {
1386             if (needSeparator) {
1387                 if (i<variantCapacity) {
1388                     variant[i] = '_';
1389                 }
1390                 ++i;
1391                 needSeparator = FALSE;
1392             }
1393             if(i<variantCapacity) {
1394                 variant[i]=(char)uprv_toupper(*localeID);
1395                 if(variant[i]=='-' || variant[i]==',') {
1396                     variant[i]='_';
1397                 }
1398             }
1399             i++;
1400             localeID++;
1401         }
1402     }
1403
1404     return i;
1405 }
1406
1407 static int32_t
1408 _getVariant(const char *localeID,
1409             char prev,
1410             char *variant, int32_t variantCapacity) {
1411     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1412 }
1413
1414 /**
1415  * Delete ALL instances of a variant from the given list of one or
1416  * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1417  * @param variants the source string of one or more variants,
1418  * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1419  * terminated; if it is, trailing zero will NOT be maintained.
1420  * @param variantsLen length of variants
1421  * @param toDelete variant to delete, without separators, e.g.  "EURO"
1422  * or "PREEURO"; not zero terminated
1423  * @param toDeleteLen length of toDelete
1424  * @return number of characters deleted from variants
1425  */
1426 static int32_t
1427 _deleteVariant(char* variants, int32_t variantsLen,
1428                const char* toDelete, int32_t toDeleteLen)
1429 {
1430     int32_t delta = 0; /* number of chars deleted */
1431     for (;;) {
1432         UBool flag = FALSE;
1433         if (variantsLen < toDeleteLen) {
1434             return delta;
1435         }
1436         if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1437             (variantsLen == toDeleteLen ||
1438              (flag=(variants[toDeleteLen] == '_'))))
1439         {
1440             int32_t d = toDeleteLen + (flag?1:0);
1441             variantsLen -= d;
1442             delta += d;
1443             if (variantsLen > 0) {
1444                 uprv_memmove(variants, variants+d, variantsLen);
1445             }
1446         } else {
1447             char* p = _strnchr(variants, variantsLen, '_');
1448             if (p == NULL) {
1449                 return delta;
1450             }
1451             ++p;
1452             variantsLen -= (int32_t)(p - variants);
1453             variants = p;
1454         }
1455     }
1456 }
1457
1458 /* Keyword enumeration */
1459
/* Per-enumeration state stored in UEnumeration.context by the keyword enumerator. */
typedef struct UKeywordsContext {
    /* the keyword list: consecutive NUL-terminated strings ended by an empty
       string; released with uprv_free() when the enumeration is closed */
    char* keywords;
    /* iteration cursor: the keyword that the next call will return */
    char* current;
} UKeywordsContext;
1464
1465 static void U_CALLCONV
1466 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1467     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1468     uprv_free(enumerator->context);
1469     uprv_free(enumerator);
1470 }
1471
1472 static int32_t U_CALLCONV
1473 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1474     char *kw = ((UKeywordsContext *)en->context)->keywords;
1475     int32_t result = 0;
1476     while(*kw) {
1477         result++;
1478         kw += uprv_strlen(kw)+1;
1479     }
1480     return result;
1481 }
1482
1483 static const char* U_CALLCONV
1484 uloc_kw_nextKeyword(UEnumeration* en,
1485                     int32_t* resultLength,
1486                     UErrorCode* /*status*/) {
1487     const char* result = ((UKeywordsContext *)en->context)->current;
1488     int32_t len = 0;
1489     if(*result) {
1490         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1491         ((UKeywordsContext *)en->context)->current += len+1;
1492     } else {
1493         result = NULL;
1494     }
1495     if (resultLength) {
1496         *resultLength = len;
1497     }
1498     return result;
1499 }
1500
1501 static void U_CALLCONV
1502 uloc_kw_resetKeywords(UEnumeration* en,
1503                       UErrorCode* /*status*/) {
1504     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1505 }
1506
1507 static const UEnumeration gKeywordsEnum = {
1508     NULL,
1509     NULL,
1510     uloc_kw_closeKeywords,
1511     uloc_kw_countKeywords,
1512     uenum_unextDefault,
1513     uloc_kw_nextKeyword,
1514     uloc_kw_resetKeywords
1515 };
1516
1517 U_CAPI UEnumeration* U_EXPORT2
1518 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1519 {
1520     UKeywordsContext *myContext = NULL;
1521     UEnumeration *result = NULL;
1522
1523     if(U_FAILURE(*status)) {
1524         return NULL;
1525     }
1526     result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1527     /* Null pointer test */
1528     if (result == NULL) {
1529         *status = U_MEMORY_ALLOCATION_ERROR;
1530         return NULL;
1531     }
1532     uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1533     myContext = static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)));
1534     if (myContext == NULL) {
1535         *status = U_MEMORY_ALLOCATION_ERROR;
1536         uprv_free(result);
1537         return NULL;
1538     }
1539     myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1540     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1541     myContext->keywords[keywordListSize] = 0;
1542     myContext->current = myContext->keywords;
1543     result->context = myContext;
1544     return result;
1545 }
1546
1547 U_CAPI UEnumeration* U_EXPORT2
1548 uloc_openKeywords(const char* localeID,
1549                         UErrorCode* status)
1550 {
1551     int32_t i=0;
1552     char keywords[256];
1553     int32_t keywordsCapacity = 256;
1554     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1555     const char* tmpLocaleID;
1556
1557     if(status==NULL || U_FAILURE(*status)) {
1558         return 0;
1559     }
1560
1561     if (_hasBCP47Extension(localeID)) {
1562         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1563     } else {
1564         if (localeID==NULL) {
1565            localeID=uloc_getDefault();
1566         }
1567         tmpLocaleID=localeID;
1568     }
1569
1570     /* Skip the language */
1571     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1572     if(_isIDSeparator(*tmpLocaleID)) {
1573         const char *scriptID;
1574         /* Skip the script if available */
1575         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1576         if(scriptID != tmpLocaleID+1) {
1577             /* Found optional script */
1578             tmpLocaleID = scriptID;
1579         }
1580         /* Skip the Country */
1581         if (_isIDSeparator(*tmpLocaleID)) {
1582             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1583             if(_isIDSeparator(*tmpLocaleID)) {
1584                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1585             }
1586         }
1587     }
1588
1589     /* keywords are located after '@' */
1590     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1591         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1592     }
1593
1594     if(i) {
1595         return uloc_openKeywordList(keywords, i, status);
1596     } else {
1597         return NULL;
1598     }
1599 }
1600
1601
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/*
 * Non-zero when at least one bit of mask is set in options.
 * Both arguments are parenthesized so that compound expressions such as
 * (A | B) can be passed without being regrouped by operator precedence.
 */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The "i-default" language tag, deliberately stored without a terminating zero. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
/* Number of characters in i_default. */
#define I_DEFAULT_LENGTH (sizeof i_default / sizeof i_default[0])
1610
1611 /**
1612  * Canonicalize the given localeID, to level 1 or to level 2,
1613  * depending on the options.  To specify level 1, pass in options=0.
1614  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1615  *
1616  * This is the code underlying uloc_getName and uloc_canonicalize.
1617  */
1618 static int32_t
1619 _canonicalize(const char* localeID,
1620               char* result,
1621               int32_t resultCapacity,
1622               uint32_t options,
1623               UErrorCode* err) {
1624     int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1625     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1626     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1627     const char* origLocaleID;
1628     const char* tmpLocaleID;
1629     const char* keywordAssign = NULL;
1630     const char* separatorIndicator = NULL;
1631     const char* addKeyword = NULL;
1632     const char* addValue = NULL;
1633     char* name;
1634     char* variant = NULL; /* pointer into name, or NULL */
1635
1636     if (U_FAILURE(*err)) {
1637         return 0;
1638     }
1639
1640     if (_hasBCP47Extension(localeID)) {
1641         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1642     } else {
1643         if (localeID==NULL) {
1644            localeID=uloc_getDefault();
1645         }
1646         tmpLocaleID=localeID;
1647     }
1648
1649     origLocaleID=tmpLocaleID;
1650
1651     /* if we are doing a full canonicalization, then put results in
1652        localeBuffer, if necessary; otherwise send them to result. */
1653     if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1654         (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
1655         name = localeBuffer;
1656         nameCapacity = (int32_t)sizeof(localeBuffer);
1657     } else {
1658         name = result;
1659         nameCapacity = resultCapacity;
1660     }
1661
1662     /* get all pieces, one after another, and separate with '_' */
1663     len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1664
1665     if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1666         const char *d = uloc_getDefault();
1667
1668         len = (int32_t)uprv_strlen(d);
1669
1670         if (name != NULL) {
1671             uprv_strncpy(name, d, len);
1672         }
1673     } else if(_isIDSeparator(*tmpLocaleID)) {
1674         const char *scriptID;
1675
1676         ++fieldCount;
1677         if(len<nameCapacity) {
1678             name[len]='_';
1679         }
1680         ++len;
1681
1682         scriptSize=ulocimp_getScript(tmpLocaleID+1,
1683             (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
1684         if(scriptSize > 0) {
1685             /* Found optional script */
1686             tmpLocaleID = scriptID;
1687             ++fieldCount;
1688             len+=scriptSize;
1689             if (_isIDSeparator(*tmpLocaleID)) {
1690                 /* If there is something else, then we add the _ */
1691                 if(len<nameCapacity) {
1692                     name[len]='_';
1693                 }
1694                 ++len;
1695             }
1696         }
1697
1698         if (_isIDSeparator(*tmpLocaleID)) {
1699             const char *cntryID;
1700             int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
1701                 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
1702             if (cntrySize > 0) {
1703                 /* Found optional country */
1704                 tmpLocaleID = cntryID;
1705                 len+=cntrySize;
1706             }
1707             if(_isIDSeparator(*tmpLocaleID)) {
1708                 /* If there is something else, then we add the _  if we found country before. */
1709                 if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
1710                     ++fieldCount;
1711                     if(len<nameCapacity) {
1712                         name[len]='_';
1713                     }
1714                     ++len;
1715                 }
1716
1717                 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
1718                     (len<nameCapacity ? name+len : NULL), nameCapacity-len);
1719                 if (variantSize > 0) {
1720                     variant = len<nameCapacity ? name+len : NULL;
1721                     len += variantSize;
1722                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1723                 }
1724             }
1725         }
1726     }
1727
1728     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1729     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1730         UBool done = FALSE;
1731         do {
1732             char c = *tmpLocaleID;
1733             switch (c) {
1734             case 0:
1735             case '@':
1736                 done = TRUE;
1737                 break;
1738             default:
1739                 if (len<nameCapacity) {
1740                     name[len] = c;
1741                 }
1742                 ++len;
1743                 ++tmpLocaleID;
1744                 break;
1745             }
1746         } while (!done);
1747     }
1748
1749     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1750        After this, tmpLocaleID either points to '@' or is NULL */
1751     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1752         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1753         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1754     }
1755
1756     /* Copy POSIX-style variant, if any [mr@FOO] */
1757     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1758         tmpLocaleID != NULL && keywordAssign == NULL) {
1759         for (;;) {
1760             char c = *tmpLocaleID;
1761             if (c == 0) {
1762                 break;
1763             }
1764             if (len<nameCapacity) {
1765                 name[len] = c;
1766             }
1767             ++len;
1768             ++tmpLocaleID;
1769         }
1770     }
1771
1772     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1773         /* Handle @FOO variant if @ is present and not followed by = */
1774         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1775             int32_t posixVariantSize;
1776             /* Add missing '_' if needed */
1777             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1778                 do {
1779                     if(len<nameCapacity) {
1780                         name[len]='_';
1781                     }
1782                     ++len;
1783                     ++fieldCount;
1784                 } while(fieldCount<2);
1785             }
1786             posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1787                                              (UBool)(variantSize > 0));
1788             if (posixVariantSize > 0) {
1789                 if (variant == NULL) {
1790                     variant = name+len;
1791                 }
1792                 len += posixVariantSize;
1793                 variantSize += posixVariantSize;
1794             }
1795         }
1796
1797         /* Handle generic variants first */
1798         if (variant) {
1799             for (j=0; j<(int32_t)(sizeof(VARIANT_MAP)/sizeof(VARIANT_MAP[0])); j++) {
1800                 const char* variantToCompare = VARIANT_MAP[j].variant;
1801                 int32_t n = (int32_t)uprv_strlen(variantToCompare);
1802                 int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1803                 len -= variantLen;
1804                 if (variantLen > 0) {
1805                     if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
1806                         --len;
1807                     }
1808                     addKeyword = VARIANT_MAP[j].keyword;
1809                     addValue = VARIANT_MAP[j].value;
1810                     break;
1811                 }
1812             }
1813             if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
1814                 --len;
1815             }
1816         }
1817
1818         /* Look up the ID in the canonicalization map */
1819         for (j=0; j<(int32_t)(sizeof(CANONICALIZE_MAP)/sizeof(CANONICALIZE_MAP[0])); j++) {
1820             const char* id = CANONICALIZE_MAP[j].id;
1821             int32_t n = (int32_t)uprv_strlen(id);
1822             if (len == n && uprv_strncmp(name, id, n) == 0) {
1823                 if (n == 0 && tmpLocaleID != NULL) {
1824                     break; /* Don't remap "" if keywords present */
1825                 }
1826                 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1827                 if (CANONICALIZE_MAP[j].keyword) {
1828                     addKeyword = CANONICALIZE_MAP[j].keyword;
1829                     addValue = CANONICALIZE_MAP[j].value;
1830                 }
1831                 break;
1832             }
1833         }
1834     }
1835
1836     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1837         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1838             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1839             if(len<nameCapacity) {
1840                 name[len]='@';
1841             }
1842             ++len;
1843             ++fieldCount;
1844             len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
1845                                 NULL, 0, NULL, TRUE, addKeyword, addValue, err);
1846         } else if (addKeyword != NULL) {
1847             U_ASSERT(addValue != NULL && len < nameCapacity);
1848             /* inelegant but works -- later make _getKeywords do this? */
1849             len += _copyCount(name+len, nameCapacity-len, "@");
1850             len += _copyCount(name+len, nameCapacity-len, addKeyword);
1851             len += _copyCount(name+len, nameCapacity-len, "=");
1852             len += _copyCount(name+len, nameCapacity-len, addValue);
1853         }
1854     }
1855
1856     if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1857         uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1858     }
1859
1860     return u_terminateChars(result, resultCapacity, len, err);
1861 }
1862
1863 /* ### ID parsing API **************************************************/
1864
1865 U_CAPI int32_t  U_EXPORT2
1866 uloc_getParent(const char*    localeID,
1867                char* parent,
1868                int32_t parentCapacity,
1869                UErrorCode* err)
1870 {
1871     const char *lastUnderscore;
1872     int32_t i;
1873
1874     if (U_FAILURE(*err))
1875         return 0;
1876
1877     if (localeID == NULL)
1878         localeID = uloc_getDefault();
1879
1880     lastUnderscore=uprv_strrchr(localeID, '_');
1881     if(lastUnderscore!=NULL) {
1882         i=(int32_t)(lastUnderscore-localeID);
1883     } else {
1884         i=0;
1885     }
1886
1887     if(i>0 && parent != localeID) {
1888         uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1889     }
1890     return u_terminateChars(parent, parentCapacity, i, err);
1891 }
1892
1893 U_CAPI int32_t U_EXPORT2
1894 uloc_getLanguage(const char*    localeID,
1895          char* language,
1896          int32_t languageCapacity,
1897          UErrorCode* err)
1898 {
1899     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1900     int32_t i=0;
1901
1902     if (err==NULL || U_FAILURE(*err)) {
1903         return 0;
1904     }
1905
1906     if(localeID==NULL) {
1907         localeID=uloc_getDefault();
1908     }
1909
1910     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1911     return u_terminateChars(language, languageCapacity, i, err);
1912 }
1913
1914 U_CAPI int32_t U_EXPORT2
1915 uloc_getScript(const char*    localeID,
1916          char* script,
1917          int32_t scriptCapacity,
1918          UErrorCode* err)
1919 {
1920     int32_t i=0;
1921
1922     if(err==NULL || U_FAILURE(*err)) {
1923         return 0;
1924     }
1925
1926     if(localeID==NULL) {
1927         localeID=uloc_getDefault();
1928     }
1929
1930     /* skip the language */
1931     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1932     if(_isIDSeparator(*localeID)) {
1933         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1934     }
1935     return u_terminateChars(script, scriptCapacity, i, err);
1936 }
1937
1938 U_CAPI int32_t  U_EXPORT2
1939 uloc_getCountry(const char* localeID,
1940             char* country,
1941             int32_t countryCapacity,
1942             UErrorCode* err)
1943 {
1944     int32_t i=0;
1945
1946     if(err==NULL || U_FAILURE(*err)) {
1947         return 0;
1948     }
1949
1950     if(localeID==NULL) {
1951         localeID=uloc_getDefault();
1952     }
1953
1954     /* Skip the language */
1955     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1956     if(_isIDSeparator(*localeID)) {
1957         const char *scriptID;
1958         /* Skip the script if available */
1959         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1960         if(scriptID != localeID+1) {
1961             /* Found optional script */
1962             localeID = scriptID;
1963         }
1964         if(_isIDSeparator(*localeID)) {
1965             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1966         }
1967     }
1968     return u_terminateChars(country, countryCapacity, i, err);
1969 }
1970
1971 U_CAPI int32_t  U_EXPORT2
1972 uloc_getVariant(const char* localeID,
1973                 char* variant,
1974                 int32_t variantCapacity,
1975                 UErrorCode* err)
1976 {
1977     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1978     const char* tmpLocaleID;
1979     int32_t i=0;
1980
1981     if(err==NULL || U_FAILURE(*err)) {
1982         return 0;
1983     }
1984
1985     if (_hasBCP47Extension(localeID)) {
1986         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1987     } else {
1988         if (localeID==NULL) {
1989            localeID=uloc_getDefault();
1990         }
1991         tmpLocaleID=localeID;
1992     }
1993
1994     /* Skip the language */
1995     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1996     if(_isIDSeparator(*tmpLocaleID)) {
1997         const char *scriptID;
1998         /* Skip the script if available */
1999         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2000         if(scriptID != tmpLocaleID+1) {
2001             /* Found optional script */
2002             tmpLocaleID = scriptID;
2003         }
2004         /* Skip the Country */
2005         if (_isIDSeparator(*tmpLocaleID)) {
2006             const char *cntryID;
2007             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2008             if (cntryID != tmpLocaleID+1) {
2009                 /* Found optional country */
2010                 tmpLocaleID = cntryID;
2011             }
2012             if(_isIDSeparator(*tmpLocaleID)) {
2013                 /* If there was no country ID, skip a possible extra IDSeparator */
2014                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2015                     tmpLocaleID++;
2016                 }
2017                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2018             }
2019         }
2020     }
2021
2022     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2023     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2024 /*
2025     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2026         i=_getVariant(localeID+1, '@', variant, variantCapacity);
2027     }
2028 */
2029     return u_terminateChars(variant, variantCapacity, i, err);
2030 }
2031
2032 U_CAPI int32_t  U_EXPORT2
2033 uloc_getName(const char* localeID,
2034              char* name,
2035              int32_t nameCapacity,
2036              UErrorCode* err)
2037 {
2038     return _canonicalize(localeID, name, nameCapacity, 0, err);
2039 }
2040
2041 U_CAPI int32_t  U_EXPORT2
2042 uloc_getBaseName(const char* localeID,
2043                  char* name,
2044                  int32_t nameCapacity,
2045                  UErrorCode* err)
2046 {
2047     return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2048 }
2049
2050 U_CAPI int32_t  U_EXPORT2
2051 uloc_canonicalize(const char* localeID,
2052                   char* name,
2053                   int32_t nameCapacity,
2054                   UErrorCode* err)
2055 {
2056     return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2057 }
2058
2059 U_CAPI const char*  U_EXPORT2
2060 uloc_getISO3Language(const char* localeID)
2061 {
2062     int16_t offset;
2063     char lang[ULOC_LANG_CAPACITY];
2064     UErrorCode err = U_ZERO_ERROR;
2065
2066     if (localeID == NULL)
2067     {
2068         localeID = uloc_getDefault();
2069     }
2070     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2071     if (U_FAILURE(err))
2072         return "";
2073     offset = _findIndex(LANGUAGES, lang);
2074     if (offset < 0)
2075         return "";
2076     return LANGUAGES_3[offset];
2077 }
2078
2079 U_CAPI const char*  U_EXPORT2
2080 uloc_getISO3Country(const char* localeID)
2081 {
2082     int16_t offset;
2083     char cntry[ULOC_LANG_CAPACITY];
2084     UErrorCode err = U_ZERO_ERROR;
2085
2086     if (localeID == NULL)
2087     {
2088         localeID = uloc_getDefault();
2089     }
2090     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2091     if (U_FAILURE(err))
2092         return "";
2093     offset = _findIndex(COUNTRIES, cntry);
2094     if (offset < 0)
2095         return "";
2096
2097     return COUNTRIES_3[offset];
2098 }
2099
2100 U_CAPI uint32_t  U_EXPORT2
2101 uloc_getLCID(const char* localeID)
2102 {
2103     UErrorCode status = U_ZERO_ERROR;
2104     char       langID[ULOC_FULLNAME_CAPACITY];
2105
2106     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2107     if (U_FAILURE(status)) {
2108         return 0;
2109     }
2110
2111     if (uprv_strchr(localeID, '@')) {
2112         // uprv_convertToLCID does not support keywords other than collation.
2113         // Remove all keywords except collation.
2114         int32_t len;
2115         char collVal[ULOC_KEYWORDS_CAPACITY];
2116         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2117
2118         len = uloc_getKeywordValue(localeID, "collation", collVal,
2119             sizeof(collVal)/sizeof(collVal[0]) - 1, &status);
2120
2121         if (U_SUCCESS(status) && len > 0) {
2122             collVal[len] = 0;
2123
2124             len = uloc_getBaseName(localeID, tmpLocaleID,
2125                 sizeof(tmpLocaleID)/sizeof(tmpLocaleID[0]) - 1, &status);
2126
2127             if (U_SUCCESS(status)) {
2128                 tmpLocaleID[len] = 0;
2129
2130                 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2131                     sizeof(tmpLocaleID)/sizeof(tmpLocaleID[0]) - len - 1, &status);
2132
2133                 if (U_SUCCESS(status)) {
2134                     tmpLocaleID[len] = 0;
2135                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
2136                 }
2137             }
2138         }
2139
2140         // fall through - all keywords are simply ignored
2141         status = U_ZERO_ERROR;
2142     }
2143
2144     return uprv_convertToLCID(langID, localeID, &status);
2145 }
2146
2147 U_CAPI int32_t U_EXPORT2
2148 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2149                 UErrorCode *status)
2150 {
2151     return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2152 }
2153
2154 /* ### Default locale **************************************************/
2155
2156 U_CAPI const char*  U_EXPORT2
2157 uloc_getDefault()
2158 {
2159     return locale_get_default();
2160 }
2161
2162 U_CAPI void  U_EXPORT2
2163 uloc_setDefault(const char*   newDefaultLocale,
2164              UErrorCode* err)
2165 {
2166     if (U_FAILURE(*err))
2167         return;
2168     /* the error code isn't currently used for anything by this function*/
2169
2170     /* propagate change to C++ */
2171     locale_set_default(newDefaultLocale);
2172 }
2173
2174 /**
2175  * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2176  * to an array of pointers to arrays of char.  All of these pointers are owned
2177  * by ICU-- do not delete them, and do not write through them.  The array is
2178  * terminated with a null pointer.
2179  */
2180 U_CAPI const char* const*  U_EXPORT2
2181 uloc_getISOLanguages()
2182 {
2183     return LANGUAGES;
2184 }
2185
2186 /**
2187  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2188  * pointer to an array of pointers to arrays of char.  All of these pointers are
2189  * owned by ICU-- do not delete them, and do not write through them.  The array is
2190  * terminated with a null pointer.
2191  */
2192 U_CAPI const char* const*  U_EXPORT2
2193 uloc_getISOCountries()
2194 {
2195     return COUNTRIES;
2196 }
2197
2198
2199 /* this function to be moved into cstring.c later */
2200 static char gDecimal = 0;
2201
2202 static /* U_CAPI */
2203 double
2204 /* U_EXPORT2 */
2205 _uloc_strtod(const char *start, char **end) {
2206     char *decimal;
2207     char *myEnd;
2208     char buf[30];
2209     double rv;
2210     if (!gDecimal) {
2211         char rep[5];
2212         /* For machines that decide to change the decimal on you,
2213         and try to be too smart with localization.
2214         This normally should be just a '.'. */
2215         sprintf(rep, "%+1.1f", 1.0);
2216         gDecimal = rep[2];
2217     }
2218
2219     if(gDecimal == '.') {
2220         return uprv_strtod(start, end); /* fall through to OS */
2221     } else {
2222         uprv_strncpy(buf, start, 29);
2223         buf[29]=0;
2224         decimal = uprv_strchr(buf, '.');
2225         if(decimal) {
2226             *decimal = gDecimal;
2227         } else {
2228             return uprv_strtod(start, end); /* no decimal point */
2229         }
2230         rv = uprv_strtod(buf, &myEnd);
2231         if(end) {
2232             *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2233         }
2234         return rv;
2235     }
2236 }
2237
/* One parsed entry of an HTTP Accept-Language list. */
typedef struct {
    /* weight of the entry; entries with a larger q sort first */
    float q;
    /* to avoid uninitialized memory copy from qsort */
    int32_t dummy;
    /* heap-allocated locale ID of the entry */
    char *locale;
} _acceptLangItem;
2243
2244 static int32_t U_CALLCONV
2245 uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2246 {
2247     const _acceptLangItem *aa = (const _acceptLangItem*)a;
2248     const _acceptLangItem *bb = (const _acceptLangItem*)b;
2249
2250     int32_t rc = 0;
2251     if(bb->q < aa->q) {
2252         rc = -1;  /* A > B */
2253     } else if(bb->q > aa->q) {
2254         rc = 1;   /* A < B */
2255     } else {
2256         rc = 0;   /* A = B */
2257     }
2258
2259     if(rc==0) {
2260         rc = uprv_stricmp(aa->locale, bb->locale);
2261     }
2262
2263 #if defined(ULOC_DEBUG)
2264     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2265     aa->locale, aa->q,
2266     bb->locale, bb->q,
2267     rc);*/
2268 #endif
2269
2270     return rc;
2271 }
2272
2273 /*
2274 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2275 */
2276
2277 U_CAPI int32_t U_EXPORT2
2278 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2279                             const char *httpAcceptLanguage,
2280                             UEnumeration* availableLocales,
2281                             UErrorCode *status)
2282 {
2283     _acceptLangItem *j;
2284     _acceptLangItem smallBuffer[30];
2285     char **strs;
2286     char tmp[ULOC_FULLNAME_CAPACITY +1];
2287     int32_t n = 0;
2288     const char *itemEnd;
2289     const char *paramEnd;
2290     const char *s;
2291     const char *t;
2292     int32_t res;
2293     int32_t i;
2294     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2295     int32_t jSize;
2296     char *tempstr; /* Use for null pointer check */
2297
2298     j = smallBuffer;
2299     jSize = sizeof(smallBuffer)/sizeof(smallBuffer[0]);
2300     if(U_FAILURE(*status)) {
2301         return -1;
2302     }
2303
2304     for(s=httpAcceptLanguage;s&&*s;) {
2305         while(isspace(*s)) /* eat space at the beginning */
2306             s++;
2307         itemEnd=uprv_strchr(s,',');
2308         paramEnd=uprv_strchr(s,';');
2309         if(!itemEnd) {
2310             itemEnd = httpAcceptLanguage+l; /* end of string */
2311         }
2312         if(paramEnd && paramEnd<itemEnd) {
2313             /* semicolon (;) is closer than end (,) */
2314             t = paramEnd+1;
2315             if(*t=='q') {
2316                 t++;
2317             }
2318             while(isspace(*t)) {
2319                 t++;
2320             }
2321             if(*t=='=') {
2322                 t++;
2323             }
2324             while(isspace(*t)) {
2325                 t++;
2326             }
2327             j[n].q = (float)_uloc_strtod(t,NULL);
2328         } else {
2329             /* no semicolon - it's 1.0 */
2330             j[n].q = 1.0f;
2331             paramEnd = itemEnd;
2332         }
2333         j[n].dummy=0;
2334         /* eat spaces prior to semi */
2335         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2336             ;
2337         /* Check for null pointer from uprv_strndup */
2338         tempstr = uprv_strndup(s,(int32_t)((t+1)-s));
2339         if (tempstr == NULL) {
2340             *status = U_MEMORY_ALLOCATION_ERROR;
2341             return -1;
2342         }
2343         j[n].locale = tempstr;
2344         uloc_canonicalize(j[n].locale,tmp,sizeof(tmp)/sizeof(tmp[0]),status);
2345         if(strcmp(j[n].locale,tmp)) {
2346             uprv_free(j[n].locale);
2347             j[n].locale=uprv_strdup(tmp);
2348         }
2349 #if defined(ULOC_DEBUG)
2350         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2351 #endif
2352         n++;
2353         s = itemEnd;
2354         while(*s==',') { /* eat duplicate commas */
2355             s++;
2356         }
2357         if(n>=jSize) {
2358             if(j==smallBuffer) {  /* overflowed the small buffer. */
2359                 j = static_cast<_acceptLangItem *>(uprv_malloc(sizeof(j[0])*(jSize*2)));
2360                 if(j!=NULL) {
2361                     uprv_memcpy(j,smallBuffer,sizeof(j[0])*jSize);
2362                 }
2363 #if defined(ULOC_DEBUG)
2364                 fprintf(stderr,"malloced at size %d\n", jSize);
2365 #endif
2366             } else {
2367                 j = static_cast<_acceptLangItem *>(uprv_realloc(j, sizeof(j[0])*jSize*2));
2368 #if defined(ULOC_DEBUG)
2369                 fprintf(stderr,"re-alloced at size %d\n", jSize);
2370 #endif
2371             }
2372             jSize *= 2;
2373             if(j==NULL) {
2374                 *status = U_MEMORY_ALLOCATION_ERROR;
2375                 return -1;
2376             }
2377         }
2378     }
2379     uprv_sortArray(j, n, sizeof(j[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2380     if(U_FAILURE(*status)) {
2381         if(j != smallBuffer) {
2382 #if defined(ULOC_DEBUG)
2383             fprintf(stderr,"freeing j %p\n", j);
2384 #endif
2385             uprv_free(j);
2386         }
2387         return -1;
2388     }
2389     strs = static_cast<char **>(uprv_malloc((size_t)(sizeof(strs[0])*n)));
2390     /* Check for null pointer */
2391     if (strs == NULL) {
2392         uprv_free(j); /* Free to avoid memory leak */
2393         *status = U_MEMORY_ALLOCATION_ERROR;
2394         return -1;
2395     }
2396     for(i=0;i<n;i++) {
2397 #if defined(ULOC_DEBUG)
2398         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2399 #endif
2400         strs[i]=j[i].locale;
2401     }
2402     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2403         (const char**)strs, n, availableLocales, status);
2404     for(i=0;i<n;i++) {
2405         uprv_free(strs[i]);
2406     }
2407     uprv_free(strs);
2408     if(j != smallBuffer) {
2409 #if defined(ULOC_DEBUG)
2410         fprintf(stderr,"freeing j %p\n", j);
2411 #endif
2412         uprv_free(j);
2413     }
2414     return res;
2415 }
2416
2417
2418 U_CAPI int32_t U_EXPORT2
2419 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2420                     UAcceptResult *outResult, const char **acceptList,
2421                     int32_t acceptListCount,
2422                     UEnumeration* availableLocales,
2423                     UErrorCode *status)
2424 {
2425     int32_t i,j;
2426     int32_t len;
2427     int32_t maxLen=0;
2428     char tmp[ULOC_FULLNAME_CAPACITY+1];
2429     const char *l;
2430     char **fallbackList;
2431     if(U_FAILURE(*status)) {
2432         return -1;
2433     }
2434     fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2435     if(fallbackList==NULL) {
2436         *status = U_MEMORY_ALLOCATION_ERROR;
2437         return -1;
2438     }
2439     for(i=0;i<acceptListCount;i++) {
2440 #if defined(ULOC_DEBUG)
2441         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2442 #endif
2443         while((l=uenum_next(availableLocales, NULL, status))) {
2444 #if defined(ULOC_DEBUG)
2445             fprintf(stderr,"  %s\n", l);
2446 #endif
2447             len = (int32_t)uprv_strlen(l);
2448             if(!uprv_strcmp(acceptList[i], l)) {
2449                 if(outResult) {
2450                     *outResult = ULOC_ACCEPT_VALID;
2451                 }
2452 #if defined(ULOC_DEBUG)
2453                 fprintf(stderr, "MATCH! %s\n", l);
2454 #endif
2455                 if(len>0) {
2456                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2457                 }
2458                 for(j=0;j<i;j++) {
2459                     uprv_free(fallbackList[j]);
2460                 }
2461                 uprv_free(fallbackList);
2462                 return u_terminateChars(result, resultAvailable, len, status);
2463             }
2464             if(len>maxLen) {
2465                 maxLen = len;
2466             }
2467         }
2468         uenum_reset(availableLocales, status);
2469         /* save off parent info */
2470         if(uloc_getParent(acceptList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2471             fallbackList[i] = uprv_strdup(tmp);
2472         } else {
2473             fallbackList[i]=0;
2474         }
2475     }
2476
2477     for(maxLen--;maxLen>0;maxLen--) {
2478         for(i=0;i<acceptListCount;i++) {
2479             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2480 #if defined(ULOC_DEBUG)
2481                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2482 #endif
2483                 while((l=uenum_next(availableLocales, NULL, status))) {
2484 #if defined(ULOC_DEBUG)
2485                     fprintf(stderr,"  %s\n", l);
2486 #endif
2487                     len = (int32_t)uprv_strlen(l);
2488                     if(!uprv_strcmp(fallbackList[i], l)) {
2489                         if(outResult) {
2490                             *outResult = ULOC_ACCEPT_FALLBACK;
2491                         }
2492 #if defined(ULOC_DEBUG)
2493                         fprintf(stderr, "fallback MATCH! %s\n", l);
2494 #endif
2495                         if(len>0) {
2496                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2497                         }
2498                         for(j=0;j<acceptListCount;j++) {
2499                             uprv_free(fallbackList[j]);
2500                         }
2501                         uprv_free(fallbackList);
2502                         return u_terminateChars(result, resultAvailable, len, status);
2503                     }
2504                 }
2505                 uenum_reset(availableLocales, status);
2506
2507                 if(uloc_getParent(fallbackList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2508                     uprv_free(fallbackList[i]);
2509                     fallbackList[i] = uprv_strdup(tmp);
2510                 } else {
2511                     uprv_free(fallbackList[i]);
2512                     fallbackList[i]=0;
2513                 }
2514             }
2515         }
2516         if(outResult) {
2517             *outResult = ULOC_ACCEPT_FAILED;
2518         }
2519     }
2520     for(i=0;i<acceptListCount;i++) {
2521         uprv_free(fallbackList[i]);
2522     }
2523     uprv_free(fallbackList);
2524     return -1;
2525 }
2526
2527 U_CAPI const char* U_EXPORT2
2528 uloc_toUnicodeLocaleKey(const char* keyword)
2529 {
2530     const char* bcpKey = ulocimp_toBcpKey(keyword);
2531     if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2532         // unknown keyword, but syntax is fine..
2533         return keyword;
2534     }
2535     return bcpKey;
2536 }
2537
2538 U_CAPI const char* U_EXPORT2
2539 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2540 {
2541     const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2542     if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2543         // unknown keyword, but syntax is fine..
2544         return value;
2545     }
2546     return bcpType;
2547 }
2548
/*
 * Character classification helpers used by the legacy key/type syntax checks.
 * UPRV_ISDIGIT tests for '0'..'9'; UPRV_ISALPHANUM additionally accepts
 * whatever uprv_isASCIILetter() accepts.
 * Both macros evaluate their argument more than once, so the argument must
 * not have side effects.
 */
#define UPRV_ISDIGIT(c) ((c) >= '0' && (c) <= '9')
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c))
2551
2552 static UBool
2553 isWellFormedLegacyKey(const char* legacyKey)
2554 {
2555     const char* p = legacyKey;
2556     while (*p) {
2557         if (!UPRV_ISALPHANUM(*p)) {
2558             return FALSE;
2559         }
2560         p++;
2561     }
2562     return TRUE;
2563 }
2564
2565 static UBool
2566 isWellFormedLegacyType(const char* legacyType)
2567 {
2568     const char* p = legacyType;
2569     int32_t alphaNumLen = 0;
2570     while (*p) {
2571         if (*p == '_' || *p == '/' || *p == '-') {
2572             if (alphaNumLen == 0) {
2573                 return FALSE;
2574             }
2575             alphaNumLen = 0;
2576         } else if (UPRV_ISALPHANUM(*p)) {
2577             alphaNumLen++;
2578         } else {
2579             return FALSE;
2580         }
2581         p++;
2582     }
2583     return (alphaNumLen != 0);
2584 }
2585
2586 U_CAPI const char* U_EXPORT2
2587 uloc_toLegacyKey(const char* keyword)
2588 {
2589     const char* legacyKey = ulocimp_toLegacyKey(keyword);
2590     if (legacyKey == NULL) {
2591         // Checks if the specified locale key is well-formed with the legacy locale syntax.
2592         //
2593         // Note:
2594         //  Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
2595         //  However, a key should not contain '=' obviously. For now, all existing
2596         //  keys are using ASCII alphabetic letters only. We won't add any new key
2597         //  that is not compatible with the BCP 47 syntax. Therefore, we assume
2598         //  a valid key consist from [0-9a-zA-Z], no symbols.
2599         if (isWellFormedLegacyKey(keyword)) {
2600             return keyword;
2601         }
2602     }
2603     return legacyKey;
2604 }
2605
2606 U_CAPI const char* U_EXPORT2
2607 uloc_toLegacyType(const char* keyword, const char* value)
2608 {
2609     const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2610     if (legacyType == NULL) {
2611         // Checks if the specified locale type is well-formed with the legacy locale syntax.
2612         //
2613         // Note:
2614         //  Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
2615         //  However, a type should not contain '=' obviously. For now, all existing
2616         //  types are using ASCII alphabetic letters with a few symbol letters. We won't
2617         //  add any new type that is not compatible with the BCP 47 syntax except timezone
2618         //  IDs. For now, we assume a valid type start with [0-9a-zA-Z], but may contain
2619         //  '-' '_' '/' in the middle.
2620         if (isWellFormedLegacyType(value)) {
2621             return value;
2622         }
2623     }
2624     return legacyType;
2625 }
2626
2627 /*eof*/