icuSources/common/uloc.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 1997-2016, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *
   9 * File ULOC.CPP
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   04/01/97    aliu        Creation.
  15 *   08/21/98    stephen     JDK 1.2 sync
  16 *   12/08/98    rtg         New Locale implementation and C API
  17 *   03/15/99    damiba      overhaul.
  18 *   04/06/99    stephen     changed setDefault() to realloc and copy
  19 *   06/14/99    stephen     Changed calls to ures_open for new params
  20 *   07/21/99    stephen     Modified setDefault() to propagate to C++
  21 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
  22 *                           brought canonicalization code into line with spec
  23 *****************************************************************************/
  24
  25 /*
  26    POSIX's locale format, from putil.c: [no spaces]
  27
  28      ll [ _CC ] [ . MM ] [ @ VV]
  29
  30      l = lang, C = ctry, M = charmap, V = variant
  31 */
  32
  33 #include "unicode/utypes.h"
  34 #include "unicode/ustring.h"
  35 #include "unicode/uloc.h"
  36
  37 #include "putilimp.h"
  38 #include "ustr_imp.h"
  39 #include "ulocimp.h"
  40 #include "umutex.h"
  41 #include "cstring.h"
  42 #include "cmemory.h"
  43 #include "locmap.h"
  44 #include "uarrsort.h"
  45 #include "uenumimp.h"
  46 #include "uassert.h"
  47 #include "charstr.h"
  48
  49 #include <stdio.h> /* for sprintf */
  50
  51 U_NAMESPACE_USE
  52
  53 /* ### Declarations **************************************************/
  54
  55 /* Locale stuff from locid.cpp */
  56 U_CFUNC void locale_set_default(const char *id);
  57 U_CFUNC const char *locale_get_default(void);
  58 U_CFUNC int32_t
  59 locale_getKeywords(const char *localeID,
  60             char prev,
  61             char *keywords, int32_t keywordCapacity,
  62             char *values, int32_t valuesCapacity, int32_t *valLen,
  63             UBool valuesToo,
  64             UErrorCode *status);
  65
  66 /* ### Data tables **************************************************/
  67
  68 /**
  69  * Table of language codes, both 2- and 3-letter, with preference
  70  * given to 2-letter codes where possible.  Includes 3-letter codes
  71  * that lack a 2-letter equivalent.
  72  *
  73  * This list must be in sorted order.  This list is returned directly
  74  * to the user by some API.
  75  *
  76  * This list must be kept in sync with LANGUAGES_3, with corresponding
  77  * entries matched.
  78  *
  79  * This table should be terminated with a NULL entry, followed by a
  80  * second list, and another NULL entry.  The first list is visible to
  81  * user code when this array is returned by API.  The second list
  82  * contains codes we support, but do not expose through user API.
  83  *
  84  * Notes
  85  *
  86  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
  87  * include the revisions up to 2001/7/27 *CWB*
  88  *
  89  * The 3 character codes are the terminology codes like RFC 3066.  This
  90  * is compatible with prior ICU codes
  91  *
  92  * "in" "iw" "ji" "jw" & "sh" have been withdrawn.  They remain in the
  93  * table, but have been moved to the end because their 3 character codes
  94  * duplicate those of other entries.  This avoids bad searches going from
  95  * 3 to 2 character codes.
  96  *
  97  * The range qaa-qtz is reserved for local use
  98  */
  99 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
 100 /* ISO639 table version is 20150505 */
 101 /* Subsequent hand addition of selected languages */
 102 static const char * const LANGUAGES[] = { /* sorted; entries correspond one-to-one with LANGUAGES_3 */
 103     "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
 104     "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
 105     "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
 106     "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
 107     "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
 108     "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
 109     "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
 110     "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
 111     "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
 112     "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
 113     "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
 114     "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
 115     "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
 116     "cs",  "csb", "cu",  "cv",  "cy",
 117     "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
 118     "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
 119     "dyo", "dyu", "dz",  "dzg",
 120     "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
 121     "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
 122     "ext",
 123     "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
 124     "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
 125     "frs", "fur", "fy",
 126     "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
 127     "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
 128     "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
 129     "gur", "guz", "gv",  "gwi",
 130     "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
 131     "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
 132     "hup", "hy",  "hz",
 133     "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
 134     "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
 135     "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
 136     "jv",
 137     "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
 138     "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
 139     "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
 140     "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
 141     "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
 142     "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
 143     "kv",  "kw",  "ky",
 144     "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
 145     "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
 146     "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
 147     "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
 148     "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
 149     "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
 150     "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
 151     "ml",  "mn",  "mnc", "mni", "moh", "mos", "mr",  "mrj",
 152     "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
 153     "my",  "mye", "myv", "mzn",
 154     "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
 155     "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
 156     "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
 157     "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
 158     "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
 159     "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
 160     "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
 161     "pon", "prg", "pro", "ps",  "pt",
 162     "qu",  "quc", "qug",
 163     "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
 164     "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
 165     "rw",  "rwk",
 166     "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
 167     "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
 168     "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
 169     "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
 170     "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
 171     "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
 172     "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
 173     "sv",  "sw",  "swb", "swc", "syc", "syr", "szl",
 174     "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
 175     "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr", "tl",
 176     "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
 177     "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
 178     "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
 179     "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
 180     "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
 181     "vot", "vro", "vun",
 182     "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
 183     "xal", "xh",  "xmf", "xog",
 184     "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
 185     "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
 186     "zun", "zxx", "zza",
 187 NULL, /* terminates the first list (the codes visible through API) */
 188     "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
 189 NULL /* terminates the second list (supported but not exposed) */
 190 };
 191
 192 static const char* const DEPRECATED_LANGUAGES[]={ /* withdrawn 2-letter codes, NULL-terminated */
 193     "in", "iw", "ji", "jw", NULL, NULL
 194 };
 195 static const char* const REPLACEMENT_LANGUAGES[]={ /* presumably index-parallel with DEPRECATED_LANGUAGES (lookup code not shown here) */
 196     "id", "he", "yi", "jv", NULL, NULL
 197 };
 198
 199 /**
 200  * Table of 3-letter language codes.
 201  *
 202  * This is a lookup table used to convert 3-letter language codes to
 203  * their 2-letter equivalent, where possible.  It must be kept in sync
 204  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
 205  * same language as LANGUAGES_3[i].  Commented-out lines, where present,
 206  * are copied from LANGUAGES so the two tables can be compared by eye.
 207  *
 208  * Where a 3-letter language code has no 2-letter equivalent, the
 209  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 210  *
 211  * This table should be terminated with a NULL entry, followed by a
 212  * second list, and another NULL entry.  The two lists correspond to
 213  * the two lists in LANGUAGES.
 214  */
 215 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
 216 /* ISO639 table version is 20150505 */
 217 /* Subsequent hand addition of selected languages */
 218 static const char * const LANGUAGES_3[] = { /* entry i is the 3-letter form of LANGUAGES[i] */
 219     "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
 220     "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
 221     "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
 222     "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
 223     "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
 224     "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
 225     "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
 226     "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
 227     "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
 228     "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
 229     "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
 230     "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
 231     "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
 232     "ces", "csb", "chu", "chv", "cym",
 233     "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
 234     "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
 235     "dyo", "dyu", "dzo", "dzg",
 236     "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
 237     "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
 238     "ext",
 239     "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
 240     "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
 241     "frs", "fur", "fry",
 242     "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
 243     "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
 244     "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
 245     "gur", "guz", "glv", "gwi",
 246     "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
 247     "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
 248     "hup", "hye", "her",
 249     "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
 250     "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
 251     "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
 252     "jav",
 253     "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
 254     "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
 255     "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
 256     "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
 257     "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
 258     "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
 259     "kom", "cor", "kir",
 260     "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
 261     "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
 262     "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
 263     "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
 264     "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
 265     "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
 266     "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
 267     "mal", "mon", "mnc", "mni", "moh", "mos", "mar", "mrj",
 268     "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
 269     "mya", "mye", "myv", "mzn",
 270     "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
 271     "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
 272     "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
 273     "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
 274     "oci", "oji", "orm", "ori", "oss", "osa", "ota",
 275     "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
 276     "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
 277     "pon", "prg", "pro", "pus", "por",
 278     "que", "quc", "qug",
 279     "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
 280     "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
 281     "kin", "rwk",
 282     "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
 283     "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
 284     "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
 285     "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
 286     "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
 287     "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
 288     "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
 289     "swe", "swa", "swb", "swc", "syc", "syr", "szl",
 290     "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
 291     "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
 292     "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
 293     "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
 294     "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
 295     "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
 296     "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
 297     "vot", "vro", "vun",
 298     "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
 299     "xal", "xho", "xmf", "xog",
 300     "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
 301     "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
 302     "zun", "zxx", "zza",
 303 NULL, /* terminates the first list */
 304 /*  "in",  "iw",  "ji",  "jw",  "sh",                          */
 305     "ind", "heb", "yid", "jaw", "srp",
 306 NULL /* terminates the second list (3-letter forms of the obsolete codes) */
 307 };
 308
 309 /**
 310  * Table of 2-letter country codes.
 311  *
 312  * This list must be in sorted order.  This list is returned directly
 313  * to the user by some API.
 314  *
 315  * This list must be kept in sync with COUNTRIES_3, with corresponding
 316  * entries matched.
 317  *
 318  * This table should be terminated with a NULL entry, followed by a
 319  * second list, and another NULL entry.  The first list is visible to
 320  * user code when this array is returned by API.  The second list
 321  * contains codes we support, but do not expose through user API.
 322  *
 323  * Notes:
 324  *
 325  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 326  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html; added
 327  * new codes, keeping the old ones for compatibility; updated to include
 328  * 1999/12/03 revisions *CWB*
 329  *
 330  * RO(ROM) is now RO(ROU) according to
 331  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
 332  */
 333 static const char * const COUNTRIES[] = { /* sorted; entries correspond one-to-one with COUNTRIES_3 */
 334     "AC",  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
 335     "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
 336     "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
 337     "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
 338     "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
 339     "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CP",  "CR",
 340     "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
 341     "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
 342     "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
 343     "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
 344     "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
 345     "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
 346     "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
 347     "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
 348     "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
 349     "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
 350     "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
 351     "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
 352     "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
 353     "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
 354     "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
 355     "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
 356     "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
 357     "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
 358     "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
 359     "SX",  "SY",  "SZ",  "TA",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
 360     "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
 361     "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
 362     "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
 363     "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
 364 NULL, /* terminates the first list (the codes visible through API) */
 365     "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
 366 NULL /* terminates the second list (supported but not exposed) */
 367 };
 368
 369 static const char* const DEPRECATED_COUNTRIES[] = { /* withdrawn 2-letter region codes, NULL-terminated */
 370     "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
 371 };
 372 static const char* const REPLACEMENT_COUNTRIES[] = { /* listed in the same order as DEPRECATED_COUNTRIES; the commented row below repeats that list for alignment */
 373 /*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
 374     "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
 375 };
 376
 377 /**
 378  * Table of 3-letter country codes.
 379  *
 380  * This is a lookup table used to convert 3-letter country codes to
 381  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
 382  * For all valid i, COUNTRIES[i] must refer to the same country as
 383  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
 384  * to make eyeballing this baby easier.
 385  *
 386  * This table should be terminated with a NULL entry, followed by a
 387  * second list, and another NULL entry.  The two lists correspond to
 388  * the two lists in COUNTRIES.
 389  */
 390 static const char * const COUNTRIES_3[] = { /* entry i is the 3-letter form of COUNTRIES[i]; each commented row repeats the matching COUNTRIES row */
 391 /*  "AC",  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
 392     "ASC", "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
 393 /*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
 394     "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
 395 /*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
 396     "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
 397 /*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
 398     "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
 399 /*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
 400     "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
 401 /*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CP",  "CR",     */
 402     "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CPT", "CRI",
 403 /*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
 404     "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
 405 /*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
 406     "DMA", "DOM", "DZA", "EA ", "ECU", "EST", "EGY", "ESH", "ERI", /* no valid 3-letter code for EA; the entry is "EA" padded with a space */
 407 /*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
 408     "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
 409 /*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
 410     "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
 411 /*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
 412     "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
 413 /*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
 414     "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
 415 /*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
 416     "IC ", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL", /* no valid 3-letter code for IC; the entry is "IC" padded with a space */
 417 /*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
 418     "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
 419 /*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
 420     "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
 421 /*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
 422     "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
 423 /*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
 424     "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
 425 /*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
 426     "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
 427 /*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
 428     "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
 429 /*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
 430     "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
 431 /*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
 432     "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
 433 /*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
 434     "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
 435 /*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
 436     "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
 437 /*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
 438     "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
 439 /*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
 440     "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
 441 /*  "SX",  "SY",  "SZ",  "TA",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
 442     "SXM", "SYR", "SWZ", "TAA", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
 443 /*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
 444     "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
 445 /*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
 446     "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
 447 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
 448     "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
 449 /*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
 450     "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
 451 NULL, /* terminates the first list */
 452 /*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
 453     "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
 454 NULL /* terminates the second list (3-letter forms of the obsolete codes) */
 455 };
 456
 457 typedef struct CanonicalizationMap { /* one row of CANONICALIZE_MAP: an input ID paired with its canonical form */
 458     const char *id;          /* input ID */
 459     const char *canonicalID; /* canonicalized output ID */
 460 } CanonicalizationMap;
 461
 462 /**
 463  * A map to canonicalize locale IDs.  This handles a variety of
 464  * different semantic kinds of transformations.
 465  */
 466 static const CanonicalizationMap CANONICALIZE_MAP[] = { /* each row: { input ID, canonical ID } */
 467     { "",               "en_US_POSIX" }, /* .NET name (empty ID) */ // entry deleted in open-source ICU 64; restored in this copy
 468     { "c",              "en_US_POSIX" }, /* POSIX name */ // entry deleted in open-source ICU 64; restored in this copy
 469     { "posix",          "en_US_POSIX" }, /* POSIX name (alias of C) */ // entry deleted in open-source ICU 64; restored in this copy
 470     { "art_LOJBAN",     "jbo" }, /* registered name */
 471     { "hy__AREVELA",    "hy" }, /* Registered IANA variant */
 472     { "hy__AREVMDA",    "hyw" }, /* Registered IANA variant */
 473     { "zh_GAN",         "gan" }, /* registered name */
 474     { "zh_GUOYU",       "zh" }, /* registered name */
 475     { "zh_HAKKA",       "hak" }, /* registered name */
 476     { "zh_MIN_NAN",     "nan" }, /* registered name */
 477     { "zh_WUU",         "wuu" }, /* registered name */
 478     { "zh_XIANG",       "hsn" }, /* registered name */
 479     { "zh_YUE",         "yue" }, /* registered name */
 480 };
 481
 482 /* ### BCP47 Conversion *******************************************/
 483 /* Tests whether id is non-NULL, contains no '@', and getShortestSubtagLength(id) reports 1 (a one-character, BCP47-singleton-style subtag). Every check uses the macro argument itself, so no caller variable named localeID is needed; id is evaluated more than once. */
 484 #define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
 485 /* Converts the BCP47 id to a Unicode locale id written into buffer and points finalID at buffer. If conversion fails, finalID is set to the unconverted id and a U_STRING_NOT_TERMINATED_WARNING is turned into U_BUFFER_OVERFLOW_ERROR. Expands to a bare if/else: use it only as a complete statement inside braces. */
 486 #define _ConvertBCP47(finalID, id, buffer, length,err) \
 487         if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 ||  \
 488                 U_FAILURE(*err) || *err == U_STRING_NOT_TERMINATED_WARNING) { \
 489             finalID=id; \
 490             if (*err == U_STRING_NOT_TERMINATED_WARNING) { *err = U_BUFFER_OVERFLOW_ERROR; } \
 491         } else { \
 492             finalID=buffer; \
 493         }
 494 /**
 495  * Returns the length of the shortest non-empty subtag of localeID that is
 496  * followed by a '_' or '-' separator.  The final subtag (nothing after it)
 497  * is never measured; if no terminated subtag is shorter, the full string
 498  * length is returned.
 499  */
 500 static int32_t getShortestSubtagLength(const char *localeID) {
 501     const int32_t totalLength = static_cast<int32_t>(uprv_strlen(localeID));
 502     int32_t shortest = totalLength;
 503     int32_t runLength = 0;   /* characters seen since the last separator */
 504
 505     for (int32_t idx = 0; idx < totalLength; ++idx) {
 506         const char ch = localeID[idx];
 507         if (ch == '_' || ch == '-') {
 508             /* A separator closes the current subtag; empty runs are ignored. */
 509             if (runLength != 0 && runLength < shortest) {
 510                 shortest = runLength;
 511             }
 512             runLength = 0;
 513         } else {
 514             ++runLength;
 515         }
 516     }
 517     return shortest;
 518 }
 519
 520 /* ### Keywords **************************************************/
 521 #define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9')) /* true for '0'..'9'; evaluates c twice */
 522 #define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) ) /* letter-or-digit test; evaluates c more than once */
 523 /* Punctuation/symbols allowed in legacy key values */
 524 #define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')
 525 /* Size of a keyword-name buffer, terminating NUL included (see KeywordStruct and locale_canonKeywordName) */
 526 #define ULOC_KEYWORD_BUFFER_LEN 25
 527 #define ULOC_MAX_NO_KEYWORDS 25 /* capacity of the keywordList array in _getKeywords */
 528
 529 /**
 530  * Finds where the keyword section of a locale ID begins.
 531  * @return pointer to the at-sign inside localeID, or NULL if none is found
 532  */
 533 U_CAPI const char * U_EXPORT2
 534 locale_getKeywordsStart(const char *localeID) {
 535     const char *atSign = uprv_strchr(localeID, '@');
 536     if (atSign != NULL) {
 537         return atSign;
 538     }
 539 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
 540     /* '@' is a variant character: its byte value differs between EBCDIC
 541        machines, so each known encoding of it is tried in turn. */
 542     static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
 543     for (const uint8_t *candidate = ebcdicSigns; *candidate != 0; ++candidate) {
 544         const char *variantSign = uprv_strchr(localeID, *candidate);
 545         if (variantSign != NULL) {
 546             return variantSign;
 547         }
 548     }
 549 #endif
 550     return NULL;
 551 }
 552
 553 /**
 554  * Writes the lowercased form of keywordName into buf, accepting only
 555  * characters that pass UPRV_ISALPHANUM.
 556  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
 557  * @param keywordName NUL-terminated name to canonicalize
 558  * @param status set on failure (empty, malformed, or too-long name)
 559  * @return length of the canonical name, or 0 on failure
 560  */
 561 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
 562 {
 563   int32_t written = 0;
 564
 565   while (*keywordName != 0) {
 566     const char ch = *keywordName++;
 567     if (!UPRV_ISALPHANUM(ch)) {
 568       *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
 569       return 0;
 570     }
 571     if (written >= ULOC_KEYWORD_BUFFER_LEN - 1) {
 572       *status = U_INTERNAL_PROGRAM_ERROR; /* keyword name too long for internal buffer */
 573       return 0;
 574     }
 575     buf[written++] = uprv_tolower(ch);
 576   }
 577   if (written == 0) {
 578     *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
 579     return 0;
 580   }
 581   buf[written] = 0; /* terminate */
 582   return written;
 583 }
 584
 585 typedef struct { /* one keyword=value pair collected by _getKeywords */
 586     char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* lowercased keyword name with spaces removed, NUL-terminated */
 587     int32_t keywordLen; /* length of keyword, not counting the terminating NUL */
 588     const char *valueStart; /* first non-space character of the value, pointing into the locale ID string being parsed */
 589     int32_t valueLen; /* length of the value with trailing spaces excluded */
 590 } KeywordStruct;
 591
 592 static int32_t U_CALLCONV
 593 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) { /* orders two KeywordStruct entries by keyword name; context is unused */
 594     const KeywordStruct *lhs = static_cast<const KeywordStruct *>(left);
 595     const KeywordStruct *rhs = static_cast<const KeywordStruct *>(right);
 596     return uprv_strcmp(lhs->keyword, rhs->keyword);
 597 }
 598
 599 static int32_t
 600 _getKeywords(const char *localeID,
 601              char prev,
 602              char *keywords, int32_t keywordCapacity,
 603              char *values, int32_t valuesCapacity, int32_t *valLen,
 604              UBool valuesToo,
 605              UErrorCode *status)
 606 {
 607     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
 608
 609     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
 610     int32_t numKeywords = 0;
 611     const char* pos = localeID;
 612     const char* equalSign = NULL;
 613     const char* semicolon = NULL;
 614     int32_t i = 0, j, n;
 615     int32_t keywordsLen = 0;
 616     int32_t valuesLen = 0;
 617
 618     if(prev == '@') { /* start of keyword definition */
 619         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
 620         do {
 621             UBool duplicate = FALSE;
 622             /* skip leading spaces */
 623             while(*pos == ' ') {
 624                 pos++;
 625             }
 626             if (!*pos) { /* handle trailing "; " */
 627                 break;
 628             }
 629             if(numKeywords == maxKeywords) {
 630                 *status = U_INTERNAL_PROGRAM_ERROR;
 631                 return 0;
 632             }
 633             equalSign = uprv_strchr(pos, '=');
 634             semicolon = uprv_strchr(pos, ';');
 635             /* lack of '=' [foo@currency] is illegal */
 636             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
 637             if(!equalSign || (semicolon && semicolon<equalSign)) {
 638                 *status = U_INVALID_FORMAT_ERROR;
 639                 return 0;
 640             }
 641             /* need to normalize both keyword and keyword name */
 642             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
 643                 /* keyword name too long for internal buffer */
 644                 *status = U_INTERNAL_PROGRAM_ERROR;
 645                 return 0;
 646             }
 647             for(i = 0, n = 0; i < equalSign - pos; ++i) {
 648                 if (pos[i] != ' ') {
 649                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
 650                 }
 651             }
 652
 653             /* zero-length keyword is an error. */
 654             if (n == 0) {
 655                 *status = U_INVALID_FORMAT_ERROR;
 656                 return 0;
 657             }
 658
 659             keywordList[numKeywords].keyword[n] = 0;
 660             keywordList[numKeywords].keywordLen = n;
 661             /* now grab the value part. First we skip the '=' */
 662             equalSign++;
 663             /* then we leading spaces */
 664             while(*equalSign == ' ') {
 665                 equalSign++;
 666             }
 667
 668             /* Premature end or zero-length value */
 669             if (!*equalSign || equalSign == semicolon) {
 670                 *status = U_INVALID_FORMAT_ERROR;
 671                 return 0;
 672             }
 673
 674             keywordList[numKeywords].valueStart = equalSign;
 675
 676             pos = semicolon;
 677             i = 0;
 678             if(pos) {
 679                 while(*(pos - i - 1) == ' ') {
 680                     i++;
 681                 }
 682                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
 683                 pos++;
 684             } else {
 685                 i = (int32_t)uprv_strlen(equalSign);
 686                 while(i && equalSign[i-1] == ' ') {
 687                     i--;
 688                 }
 689                 keywordList[numKeywords].valueLen = i;
 690             }
 691             /* If this is a duplicate keyword, then ignore it */
 692             for (j=0; j<numKeywords; ++j) {
 693                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
 694                     duplicate = TRUE;
 695                     break;
 696                 }
 697             }
 698             if (!duplicate) {
 699                 ++numKeywords;
 700             }
 701         } while(pos);
 702
 703         /* now we have a list of keywords */
 704         /* we need to sort it */
 705         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
 706
 707         /* Now construct the keyword part */
 708         for(i = 0; i < numKeywords; i++) {
 709             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
 710                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
 711                 if(valuesToo) {
 712                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
 713                 } else {
 714                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
 715                 }
 716             }
 717             keywordsLen += keywordList[i].keywordLen + 1;
 718             if(valuesToo) {
 719                 if(keywordsLen + keywordList[i].valueLen <= keywordCapacity) {
 720                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
 721                 }
 722                 keywordsLen += keywordList[i].valueLen;
 723
 724                 if(i < numKeywords - 1) {
 725                     if(keywordsLen < keywordCapacity) {
 726                         keywords[keywordsLen] = ';';
 727                     }
 728                     keywordsLen++;
 729                 }
 730             }
 731             if(values) {
 732                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
 733                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
 734                     values[valuesLen + keywordList[i].valueLen] = 0;
 735                 }
 736                 valuesLen += keywordList[i].valueLen + 1;
 737             }
 738         }
 739         if(values) {
 740             values[valuesLen] = 0;
 741             if(valLen) {
 742                 *valLen = valuesLen;
 743             }
 744         }
 745         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
 746     } else {
 747         return 0;
 748     }
 749 }
 750
 751 U_CFUNC int32_t
 752 locale_getKeywords(const char *localeID,
 753                    char prev,
 754                    char *keywords, int32_t keywordCapacity,
 755                    char *values, int32_t valuesCapacity, int32_t *valLen,
 756                    UBool valuesToo,
 757                    UErrorCode *status) {
 758     return _getKeywords(localeID, prev, keywords, keywordCapacity,
 759                         values, valuesCapacity, valLen, valuesToo,
 760                         status);
 761 }
 762
 763 U_CAPI int32_t U_EXPORT2
 764 uloc_getKeywordValue(const char* localeID,
 765                      const char* keywordName,
 766                      char* buffer, int32_t bufferCapacity,
 767                      UErrorCode* status)
 768 {
 769     const char* startSearchHere = NULL;
 770     const char* nextSeparator = NULL;
 771     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 772     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 773     int32_t result = 0;
 774
 775     if(status && U_SUCCESS(*status) && localeID) {
 776       char tempBuffer[ULOC_FULLNAME_CAPACITY];
 777       const char* tmpLocaleID;
 778
 779       if (keywordName == NULL || keywordName[0] == 0) {
 780         *status = U_ILLEGAL_ARGUMENT_ERROR;
 781         return 0;
 782       }
 783
 784       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 785       if(U_FAILURE(*status)) {
 786         return 0;
 787       }
 788
 789       if (_hasBCP47Extension(localeID)) {
 790           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
 791       } else {
 792           tmpLocaleID=localeID;
 793       }
 794
 795       startSearchHere = locale_getKeywordsStart(tmpLocaleID);
 796       if(startSearchHere == NULL) {
 797           /* no keywords, return at once */
 798           return 0;
 799       }
 800
 801       /* find the first keyword */
 802       while(startSearchHere) {
 803           const char* keyValueTail;
 804           int32_t keyValueLen;
 805
 806           startSearchHere++; /* skip @ or ; */
 807           nextSeparator = uprv_strchr(startSearchHere, '=');
 808           if(!nextSeparator) {
 809               *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
 810               return 0;
 811           }
 812           /* strip leading & trailing spaces (TC decided to tolerate these) */
 813           while(*startSearchHere == ' ') {
 814               startSearchHere++;
 815           }
 816           keyValueTail = nextSeparator;
 817           while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
 818               keyValueTail--;
 819           }
 820           /* now keyValueTail points to first char after the keyName */
 821           /* copy & normalize keyName from locale */
 822           if (startSearchHere == keyValueTail) {
 823               *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
 824               return 0;
 825           }
 826           keyValueLen = 0;
 827           while (startSearchHere < keyValueTail) {
 828             if (!UPRV_ISALPHANUM(*startSearchHere)) {
 829               *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
 830               return 0;
 831             }
 832             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
 833               localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
 834             } else {
 835               /* keyword name too long for internal buffer */
 836               *status = U_INTERNAL_PROGRAM_ERROR;
 837               return 0;
 838             }
 839           }
 840           localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
 841
 842           startSearchHere = uprv_strchr(nextSeparator, ';');
 843
 844           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
 845                /* current entry matches the keyword. */
 846              nextSeparator++; /* skip '=' */
 847               /* First strip leading & trailing spaces (TC decided to tolerate these) */
 848               while(*nextSeparator == ' ') {
 849                 nextSeparator++;
 850               }
 851               keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
 852               while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
 853                 keyValueTail--;
 854               }
 855               /* Now copy the value, but check well-formedness */
 856               if (nextSeparator == keyValueTail) {
 857                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
 858                 return 0;
 859               }
 860               keyValueLen = 0;
 861               while (nextSeparator < keyValueTail) {
 862                 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
 863                   *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
 864                   return 0;
 865                 }
 866                 if (keyValueLen < bufferCapacity) {
 867                   /* Should we lowercase value to return here? Tests expect as-is. */
 868                   buffer[keyValueLen++] = *nextSeparator++;
 869                 } else { /* keep advancing so we return correct length in case of overflow */
 870                   keyValueLen++;
 871                   nextSeparator++;
 872                 }
 873               }
 874               result = u_terminateChars(buffer, bufferCapacity, keyValueLen, status);
 875               return result;
 876           }
 877       }
 878     }
 879     return 0;
 880 }
 881
 882 U_CAPI int32_t U_EXPORT2
 883 uloc_setKeywordValue(const char* keywordName,
 884                      const char* keywordValue,
 885                      char* buffer, int32_t bufferCapacity,
 886                      UErrorCode* status)
 887 {
 888     /* TODO: sorting. removal. */
 889     int32_t keywordNameLen;
 890     int32_t keywordValueLen;
 891     int32_t bufLen;
 892     int32_t needLen = 0;
 893     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 894     char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
 895     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 896     int32_t rc;
 897     char* nextSeparator = NULL;
 898     char* nextEqualsign = NULL;
 899     char* startSearchHere = NULL;
 900     char* keywordStart = NULL;
 901     CharString updatedKeysAndValues;
 902     int32_t updatedKeysAndValuesLen;
 903     UBool handledInputKeyAndValue = FALSE;
 904     char keyValuePrefix = '@';
 905
 906     if(U_FAILURE(*status)) {
 907         return -1;
 908     }
 909     if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
 910         *status = U_ILLEGAL_ARGUMENT_ERROR;
 911         return 0;
 912     }
 913     bufLen = (int32_t)uprv_strlen(buffer);
 914     if(bufferCapacity<bufLen) {
 915         /* The capacity is less than the length?! Is this NULL terminated? */
 916         *status = U_ILLEGAL_ARGUMENT_ERROR;
 917         return 0;
 918     }
 919     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 920     if(U_FAILURE(*status)) {
 921         return 0;
 922     }
 923
 924     keywordValueLen = 0;
 925     if(keywordValue) {
 926         while (*keywordValue != 0) {
 927             if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
 928                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
 929                 return 0;
 930             }
 931             if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
 932                 /* Should we force lowercase in value to set? */
 933                 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
 934             } else {
 935                 /* keywordValue too long for internal buffer */
 936                 *status = U_INTERNAL_PROGRAM_ERROR;
 937                 return 0;
 938             }
 939         }
 940     }
 941     keywordValueBuffer[keywordValueLen] = 0; /* terminate */
 942
 943     startSearchHere = (char*)locale_getKeywordsStart(buffer);
 944     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
 945         if(keywordValueLen == 0) { /* no keywords = nothing to remove */
 946             return bufLen;
 947         }
 948
 949         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
 950         if(startSearchHere) { /* had a single @ */
 951             needLen--; /* already had the @ */
 952             /* startSearchHere points at the @ */
 953         } else {
 954             startSearchHere=buffer+bufLen;
 955         }
 956         if(needLen >= bufferCapacity) {
 957             *status = U_BUFFER_OVERFLOW_ERROR;
 958             return needLen; /* no change */
 959         }
 960         *startSearchHere++ = '@';
 961         uprv_strcpy(startSearchHere, keywordNameBuffer);
 962         startSearchHere += keywordNameLen;
 963         *startSearchHere++ = '=';
 964         uprv_strcpy(startSearchHere, keywordValueBuffer);
 965         return needLen;
 966     } /* end shortcut - no @ */
 967
 968     keywordStart = startSearchHere;
 969     /* search for keyword */
 970     while(keywordStart) {
 971         const char* keyValueTail;
 972         int32_t keyValueLen;
 973
 974         keywordStart++; /* skip @ or ; */
 975         nextEqualsign = uprv_strchr(keywordStart, '=');
 976         if (!nextEqualsign) {
 977             *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
 978             return 0;
 979         }
 980         /* strip leading & trailing spaces (TC decided to tolerate these) */
 981         while(*keywordStart == ' ') {
 982             keywordStart++;
 983         }
 984         keyValueTail = nextEqualsign;
 985         while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
 986             keyValueTail--;
 987         }
 988         /* now keyValueTail points to first char after the keyName */
 989         /* copy & normalize keyName from locale */
 990         if (keywordStart == keyValueTail) {
 991             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
 992             return 0;
 993         }
 994         keyValueLen = 0;
 995         while (keywordStart < keyValueTail) {
 996             if (!UPRV_ISALPHANUM(*keywordStart)) {
 997                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
 998                 return 0;
 999             }
1000             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
1001                 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
1002             } else {
1003                 /* keyword name too long for internal buffer */
1004                 *status = U_INTERNAL_PROGRAM_ERROR;
1005                 return 0;
1006             }
1007         }
1008         localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
1009
1010         nextSeparator = uprv_strchr(nextEqualsign, ';');
1011
1012         /* start processing the value part */
1013         nextEqualsign++; /* skip '=' */
1014         /* First strip leading & trailing spaces (TC decided to tolerate these) */
1015         while(*nextEqualsign == ' ') {
1016             nextEqualsign++;
1017         }
1018         keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1019         while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
1020             keyValueTail--;
1021         }
1022         if (nextEqualsign == keyValueTail) {
1023             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
1024             return 0;
1025         }
1026
1027         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1028         if(rc == 0) {
1029             /* Current entry matches the input keyword. Update the entry */
1030             if(keywordValueLen > 0) { /* updating a value */
1031                 updatedKeysAndValues.append(keyValuePrefix, *status);
1032                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1033                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1034                 updatedKeysAndValues.append('=', *status);
1035                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1036             } /* else removing this entry, don't emit anything */
1037             handledInputKeyAndValue = TRUE;
1038         } else {
1039            /* input keyword sorts earlier than current entry, add before current entry */
1040             if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
1041                 /* insert new entry at this location */
1042                 updatedKeysAndValues.append(keyValuePrefix, *status);
1043                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1044                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1045                 updatedKeysAndValues.append('=', *status);
1046                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1047                 handledInputKeyAndValue = TRUE;
1048             }
1049             /* copy the current entry */
1050             updatedKeysAndValues.append(keyValuePrefix, *status);
1051             keyValuePrefix = ';'; /* for any subsequent key-value pair */
1052             updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1053             updatedKeysAndValues.append('=', *status);
1054             updatedKeysAndValues.append(nextEqualsign, static_cast<int32_t>(keyValueTail-nextEqualsign), *status);
1055         }
1056         if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
1057             /* append new entry at the end, it sorts later than existing entries */
1058             updatedKeysAndValues.append(keyValuePrefix, *status);
1059             /* skip keyValuePrefix update, no subsequent key-value pair */
1060             updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1061             updatedKeysAndValues.append('=', *status);
1062             updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1063             handledInputKeyAndValue = TRUE;
1064         }
1065         keywordStart = nextSeparator;
1066     } /* end loop searching */
1067
1068     /* Any error from updatedKeysAndValues.append above would be internal and not due to
1069      * problems with the passed-in locale. So if we did encounter problems with the
1070      * passed-in locale above, those errors took precedence and overrode any error
1071      * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1072      * are errors here they are from updatedKeysAndValues.append; they do cause an
1073      * error return but the passed-in locale is unmodified and the original bufLen is
1074      * returned.
1075      */
1076     if (!handledInputKeyAndValue || U_FAILURE(*status)) {
1077         /* if input key/value specified removal of a keyword not present in locale, or
1078          * there was an error in CharString.append, leave original locale alone. */
1079         return bufLen;
1080     }
1081
1082     updatedKeysAndValuesLen = updatedKeysAndValues.length();
1083     /* needLen = length of the part before '@' + length of updated key-value part including '@' */
1084     needLen = (int32_t)(startSearchHere - buffer) + updatedKeysAndValuesLen;
1085     if(needLen >= bufferCapacity) {
1086         *status = U_BUFFER_OVERFLOW_ERROR;
1087         return needLen; /* no change */
1088     }
1089     if (updatedKeysAndValuesLen > 0) {
1090         uprv_strncpy(startSearchHere, updatedKeysAndValues.data(), updatedKeysAndValuesLen);
1091     }
1092     buffer[needLen]=0;
1093     return needLen;
1094 }
1095
1096 /* ### ID parsing implementation **************************************************/
1097
/* TRUE if a is one of the letters that may start a special 'x-' / 'i-' prefix.
 * Note: the argument is evaluated several times; do not pass expressions with side effects. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 * Note: the argument is evaluated several times; do not pass expressions with side effects.
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1108
1109 /**
1110  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1111  * a NULL entry, followed by more entries, and a second NULL entry.
1112  *
1113  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1114  * COUNTRIES_3.
1115  */
1116 static int16_t _findIndex(const char* const* list, const char* key)
1117 {
1118     const char* const* anchor = list;
1119     int32_t pass = 0;
1120
1121     /* Make two passes through two NULL-terminated arrays at 'list' */
1122     while (pass++ < 2) {
1123         while (*list) {
1124             if (uprv_strcmp(key, *list) == 0) {
1125                 return (int16_t)(list - anchor);
1126             }
1127             list++;
1128         }
1129         ++list;     /* skip final NULL *CWB*/
1130     }
1131     return -1;
1132 }
1133
1134 /* count the length of src while copying it to dest; return strlen(src) */
1135 static inline int32_t
1136 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1137     const char *anchor;
1138     char c;
1139
1140     anchor=src;
1141     for(;;) {
1142         if((c=*src)==0) {
1143             return (int32_t)(src-anchor);
1144         }
1145         if(destCapacity<=0) {
1146             return (int32_t)((src-anchor)+uprv_strlen(src));
1147         }
1148         ++src;
1149         *dest++=c;
1150         --destCapacity;
1151     }
1152 }
1153
1154 U_CFUNC const char*
1155 uloc_getCurrentCountryID(const char* oldID){
1156     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1157     if (offset >= 0) {
1158         return REPLACEMENT_COUNTRIES[offset];
1159     }
1160     return oldID;
1161 }
1162 U_CFUNC const char*
1163 uloc_getCurrentLanguageID(const char* oldID){
1164     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1165     if (offset >= 0) {
1166         return REPLACEMENT_LANGUAGES[offset];
1167     }
1168     return oldID;
1169 }
/*
 * The internal functions ulocimp_getLanguage(), ulocimp_getScript(),
 * ulocimp_getCountry() and _getVariant() each parse one field of a locale ID.
 * The functions for the later fields avoid duplicating the code that handles
 * the earlier fields: a function that takes a pEnd parameter sets *pEnd to
 * where it stopped parsing, and the next one continues from there.
 *
 * TODO try to use this in Locale
 */
1178 U_CFUNC int32_t
1179 ulocimp_getLanguage(const char *localeID,
1180                     char *language, int32_t languageCapacity,
1181                     const char **pEnd) {
1182     int32_t i=0;
1183     int32_t offset;
1184     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1185
1186     /* if it starts with i- or x- then copy that prefix */
1187     if(_isIDPrefix(localeID)) {
1188         if(i<languageCapacity) {
1189             language[i]=(char)uprv_tolower(*localeID);
1190         }
1191         if(i<languageCapacity) {
1192             language[i+1]='-';
1193         }
1194         i+=2;
1195         localeID+=2;
1196     }
1197
1198     /* copy the language as far as possible and count its length */
1199     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1200         if(i<languageCapacity) {
1201             language[i]=(char)uprv_tolower(*localeID);
1202         }
1203         if(i<3) {
1204             U_ASSERT(i>=0);
1205             lang[i]=(char)uprv_tolower(*localeID);
1206         }
1207         i++;
1208         localeID++;
1209     }
1210
1211     if(i==3) {
1212         /* convert 3 character code to 2 character code if possible *CWB*/
1213         offset=_findIndex(LANGUAGES_3, lang);
1214         if(offset>=0) {
1215             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1216         }
1217     }
1218
1219     if(pEnd!=NULL) {
1220         *pEnd=localeID;
1221     }
1222     return i;
1223 }
1224
1225 U_CFUNC int32_t
1226 ulocimp_getScript(const char *localeID,
1227                   char *script, int32_t scriptCapacity,
1228                   const char **pEnd)
1229 {
1230     int32_t idLen = 0;
1231
1232     if (pEnd != NULL) {
1233         *pEnd = localeID;
1234     }
1235
1236     /* copy the second item as far as possible and count its length */
1237     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1238             && uprv_isASCIILetter(localeID[idLen])) {
1239         idLen++;
1240     }
1241
1242     /* If it's exactly 4 characters long, then it's a script and not a country. */
1243     if (idLen == 4) {
1244         int32_t i;
1245         if (pEnd != NULL) {
1246             *pEnd = localeID+idLen;
1247         }
1248         if(idLen > scriptCapacity) {
1249             idLen = scriptCapacity;
1250         }
1251         if (idLen >= 1) {
1252             script[0]=(char)uprv_toupper(*(localeID++));
1253         }
1254         for (i = 1; i < idLen; i++) {
1255             script[i]=(char)uprv_tolower(*(localeID++));
1256         }
1257     }
1258     else {
1259         idLen = 0;
1260     }
1261     return idLen;
1262 }
1263
1264 U_CFUNC int32_t
1265 ulocimp_getCountry(const char *localeID,
1266                    char *country, int32_t countryCapacity,
1267                    const char **pEnd)
1268 {
1269     int32_t idLen=0;
1270     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1271     int32_t offset;
1272
1273     /* copy the country as far as possible and count its length */
1274     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1275         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1276             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1277         }
1278         idLen++;
1279     }
1280
1281     /* the country should be either length 2 or 3 */
1282     if (idLen == 2 || idLen == 3) {
1283         UBool gotCountry = FALSE;
1284         /* convert 3 character code to 2 character code if possible *CWB*/
1285         if(idLen==3) {
1286             offset=_findIndex(COUNTRIES_3, cnty);
1287             if(offset>=0) {
1288                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1289                 gotCountry = TRUE;
1290             }
1291         }
1292         if (!gotCountry) {
1293             int32_t i = 0;
1294             for (i = 0; i < idLen; i++) {
1295                 if (i < countryCapacity) {
1296                     country[i]=(char)uprv_toupper(localeID[i]);
1297                 }
1298             }
1299         }
1300         localeID+=idLen;
1301     } else {
1302         idLen = 0;
1303     }
1304
1305     if(pEnd!=NULL) {
1306         *pEnd=localeID;
1307     }
1308
1309     return idLen;
1310 }
1311
1312 /**
1313  * @param needSeparator if true, then add leading '_' if any variants
1314  * are added to 'variant'
1315  */
1316 static int32_t
1317 _getVariantEx(const char *localeID,
1318               char prev,
1319               char *variant, int32_t variantCapacity,
1320               UBool needSeparator) {
1321     int32_t i=0;
1322
1323     /* get one or more variant tags and separate them with '_' */
1324     if(_isIDSeparator(prev)) {
1325         /* get a variant string after a '-' or '_' */
1326         while(!_isTerminator(*localeID)) {
1327             if (needSeparator) {
1328                 if (i<variantCapacity) {
1329                     variant[i] = '_';
1330                 }
1331                 ++i;
1332                 needSeparator = FALSE;
1333             }
1334             if(i<variantCapacity) {
1335                 variant[i]=(char)uprv_toupper(*localeID);
1336                 if(variant[i]=='-') {
1337                     variant[i]='_';
1338                 }
1339             }
1340             i++;
1341             localeID++;
1342         }
1343     }
1344
1345     /* if there is no variant tag after a '-' or '_' then look for '@' */
1346     if(i==0) {
1347         if(prev=='@') {
1348             /* keep localeID */
1349         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1350             ++localeID; /* point after the '@' */
1351         } else {
1352             return 0;
1353         }
1354         while(!_isTerminator(*localeID)) {
1355             if (needSeparator) {
1356                 if (i<variantCapacity) {
1357                     variant[i] = '_';
1358                 }
1359                 ++i;
1360                 needSeparator = FALSE;
1361             }
1362             if(i<variantCapacity) {
1363                 variant[i]=(char)uprv_toupper(*localeID);
1364                 if(variant[i]=='-' || variant[i]==',') {
1365                     variant[i]='_';
1366                 }
1367             }
1368             i++;
1369             localeID++;
1370         }
1371     }
1372
1373     return i;
1374 }
1375
1376 static int32_t
1377 _getVariant(const char *localeID,
1378             char prev,
1379             char *variant, int32_t variantCapacity) {
1380     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1381 }
1382
1383 /* Keyword enumeration */
1384
/*
 * State of one keyword enumeration; stored in UEnumeration::context by
 * uloc_openKeywordList() and released by uloc_kw_closeKeywords().
 */
typedef struct UKeywordsContext {
    char* keywords; /* heap copy of the keyword list: NUL-terminated strings back to back, ended by an empty string */
    char* current;  /* iteration cursor into keywords: the string the next call to next() will return */
} UKeywordsContext;
1389
1390 U_CDECL_BEGIN
1391
1392 static void U_CALLCONV
1393 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1394     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1395     uprv_free(enumerator->context);
1396     uprv_free(enumerator);
1397 }
1398
1399 static int32_t U_CALLCONV
1400 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1401     char *kw = ((UKeywordsContext *)en->context)->keywords;
1402     int32_t result = 0;
1403     while(*kw) {
1404         result++;
1405         kw += uprv_strlen(kw)+1;
1406     }
1407     return result;
1408 }
1409
1410 static const char * U_CALLCONV
1411 uloc_kw_nextKeyword(UEnumeration* en,
1412                     int32_t* resultLength,
1413                     UErrorCode* /*status*/) {
1414     const char* result = ((UKeywordsContext *)en->context)->current;
1415     int32_t len = 0;
1416     if(*result) {
1417         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1418         ((UKeywordsContext *)en->context)->current += len+1;
1419     } else {
1420         result = NULL;
1421     }
1422     if (resultLength) {
1423         *resultLength = len;
1424     }
1425     return result;
1426 }
1427
1428 static void U_CALLCONV
1429 uloc_kw_resetKeywords(UEnumeration* en,
1430                       UErrorCode* /*status*/) {
1431     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1432 }
1433
1434 U_CDECL_END
1435
1436
/*
 * Template for keyword enumerations. uloc_openKeywordList() copies this
 * structure into each new UEnumeration and then sets the copy's context
 * to a freshly allocated UKeywordsContext; the NULL slots are per-instance
 * data that the template leaves empty.
 */
static const UEnumeration gKeywordsEnum = {
    NULL,
    NULL,
    uloc_kw_closeKeywords,
    uloc_kw_countKeywords,
    uenum_unextDefault,
    uloc_kw_nextKeyword,
    uloc_kw_resetKeywords
};
1446
1447 U_CAPI UEnumeration* U_EXPORT2
1448 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1449 {
1450     UKeywordsContext *myContext = NULL;
1451     UEnumeration *result = NULL;
1452
1453     if(U_FAILURE(*status)) {
1454         return NULL;
1455     }
1456     result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1457     /* Null pointer test */
1458     if (result == NULL) {
1459         *status = U_MEMORY_ALLOCATION_ERROR;
1460         return NULL;
1461     }
1462     uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1463     myContext = static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)));
1464     if (myContext == NULL) {
1465         *status = U_MEMORY_ALLOCATION_ERROR;
1466         uprv_free(result);
1467         return NULL;
1468     }
1469     myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1470     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1471     myContext->keywords[keywordListSize] = 0;
1472     myContext->current = myContext->keywords;
1473     result->context = myContext;
1474     return result;
1475 }
1476
1477 U_CAPI UEnumeration* U_EXPORT2
1478 uloc_openKeywords(const char* localeID,
1479                         UErrorCode* status)
1480 {
1481     int32_t i=0;
1482     char keywords[256];
1483     int32_t keywordsCapacity = 256;
1484     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1485     const char* tmpLocaleID;
1486
1487     if(status==NULL || U_FAILURE(*status)) {
1488         return 0;
1489     }
1490
1491     if (_hasBCP47Extension(localeID)) {
1492         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1493     } else {
1494         if (localeID==NULL) {
1495            localeID=uloc_getDefault();
1496         }
1497         tmpLocaleID=localeID;
1498     }
1499
1500     /* Skip the language */
1501     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1502     if(_isIDSeparator(*tmpLocaleID)) {
1503         const char *scriptID;
1504         /* Skip the script if available */
1505         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1506         if(scriptID != tmpLocaleID+1) {
1507             /* Found optional script */
1508             tmpLocaleID = scriptID;
1509         }
1510         /* Skip the Country */
1511         if (_isIDSeparator(*tmpLocaleID)) {
1512             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1513             if(_isIDSeparator(*tmpLocaleID)) {
1514                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1515             }
1516         }
1517     }
1518
1519     /* keywords are located after '@' */
1520     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1521         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1522     }
1523
1524     if(i) {
1525         return uloc_openKeywordList(keywords, i, status);
1526     } else {
1527         return NULL;
1528     }
1529 }
1530
1531
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* Non-zero when any bit of `mask` is set in `options`. Both arguments are
   parenthesized so that compound expressions such as A|B group as intended. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The "i-default" tag, stored without a terminating NUL; always use
   I_DEFAULT_LENGTH together with a length-bounded comparison. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1540
1541 /**
1542  * Canonicalize the given localeID, to level 1 or to level 2,
1543  * depending on the options.  To specify level 1, pass in options=0.
1544  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1545  *
1546  * This is the code underlying uloc_getName and uloc_canonicalize.
1547  */
1548 static int32_t
1549 _canonicalize(const char* localeID,
1550               char* result,
1551               int32_t resultCapacity,
1552               uint32_t options,
1553               UErrorCode* err) {
1554     int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1555     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1556     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1557     const char* origLocaleID;
1558     const char* tmpLocaleID;
1559     const char* keywordAssign = NULL;
1560     const char* separatorIndicator = NULL;
1561     char* name;
1562     char* variant = NULL; /* pointer into name, or NULL */
1563
1564     if (U_FAILURE(*err)) {
1565         return 0;
1566     }
1567
1568     if (_hasBCP47Extension(localeID)) {
1569         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1570     } else {
1571         if (localeID==NULL) {
1572            localeID=uloc_getDefault();
1573         }
1574         tmpLocaleID=localeID;
1575     }
1576
1577     origLocaleID=tmpLocaleID;
1578
1579     /* if we are doing a full canonicalization, then put results in
1580        localeBuffer, if necessary; otherwise send them to result. */
1581     if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1582         (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
1583         name = localeBuffer;
1584         nameCapacity = (int32_t)sizeof(localeBuffer);
1585     } else {
1586         name = result;
1587         nameCapacity = resultCapacity;
1588     }
1589
1590     /* get all pieces, one after another, and separate with '_' */
1591     len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1592
1593     if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1594         const char *d = uloc_getDefault();
1595
1596         len = (int32_t)uprv_strlen(d);
1597
1598         if (name != NULL) {
1599             uprv_memcpy(name, d, len);
1600         }
1601     } else if(_isIDSeparator(*tmpLocaleID)) {
1602         const char *scriptID;
1603
1604         ++fieldCount;
1605         if(len<nameCapacity) {
1606             name[len]='_';
1607         }
1608         ++len;
1609
1610         scriptSize=ulocimp_getScript(tmpLocaleID+1,
1611             (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
1612         if(scriptSize > 0) {
1613             /* Found optional script */
1614             tmpLocaleID = scriptID;
1615             ++fieldCount;
1616             len+=scriptSize;
1617             if (_isIDSeparator(*tmpLocaleID)) {
1618                 /* If there is something else, then we add the _ */
1619                 if(len<nameCapacity) {
1620                     name[len]='_';
1621                 }
1622                 ++len;
1623             }
1624         }
1625
1626         if (_isIDSeparator(*tmpLocaleID)) {
1627             const char *cntryID;
1628             int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
1629                 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
1630             if (cntrySize > 0) {
1631                 /* Found optional country */
1632                 tmpLocaleID = cntryID;
1633                 len+=cntrySize;
1634             }
1635             if(_isIDSeparator(*tmpLocaleID)) {
1636                 /* If there is something else, then we add the _  if we found country before. */
1637                 if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
1638                     ++fieldCount;
1639                     if(len<nameCapacity) {
1640                         name[len]='_';
1641                     }
1642                     ++len;
1643                 }
1644
1645                 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
1646                     (len<nameCapacity ? name+len : NULL), nameCapacity-len);
1647                 if (variantSize > 0) {
1648                     variant = len<nameCapacity ? name+len : NULL;
1649                     len += variantSize;
1650                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1651                 }
1652             }
1653         }
1654     }
1655
1656     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1657     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1658         UBool done = FALSE;
1659         do {
1660             char c = *tmpLocaleID;
1661             switch (c) {
1662             case 0:
1663             case '@':
1664                 done = TRUE;
1665                 break;
1666             default:
1667                 if (len<nameCapacity) {
1668                     name[len] = c;
1669                 }
1670                 ++len;
1671                 ++tmpLocaleID;
1672                 break;
1673             }
1674         } while (!done);
1675     }
1676
1677     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1678        After this, tmpLocaleID either points to '@' or is NULL */
1679     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1680         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1681         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1682     }
1683
1684     /* Copy POSIX-style variant, if any [mr@FOO] */
1685     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1686         tmpLocaleID != NULL && keywordAssign == NULL) {
1687         for (;;) {
1688             char c = *tmpLocaleID;
1689             if (c == 0) {
1690                 break;
1691             }
1692             if (len<nameCapacity) {
1693                 name[len] = c;
1694             }
1695             ++len;
1696             ++tmpLocaleID;
1697         }
1698     }
1699
1700     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1701         /* Handle @FOO variant if @ is present and not followed by = */
1702         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1703             int32_t posixVariantSize;
1704             /* Add missing '_' if needed */
1705             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1706                 do {
1707                     if(len<nameCapacity) {
1708                         name[len]='_';
1709                     }
1710                     ++len;
1711                     ++fieldCount;
1712                 } while(fieldCount<2);
1713             }
1714             posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1715                                              (UBool)(variantSize > 0));
1716             if (posixVariantSize > 0) {
1717                 if (variant == NULL) {
1718                     variant = name+len;
1719                 }
1720                 len += posixVariantSize;
1721                 variantSize += posixVariantSize;
1722             }
1723         }
1724
1725         /* Look up the ID in the canonicalization map */
1726         for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1727             const char* id = CANONICALIZE_MAP[j].id;
1728             int32_t n = (int32_t)uprv_strlen(id);
1729             if (len == n && uprv_strncmp(name, id, n) == 0) {
1730                 if (n == 0 && tmpLocaleID != NULL) {
1731                     break; /* Don't remap "" if keywords present */
1732                 }
1733                 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1734                 break;
1735             }
1736         }
1737     }
1738
1739     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1740         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1741             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1742             if(len<nameCapacity) {
1743                 name[len]='@';
1744             }
1745             ++len;
1746             ++fieldCount;
1747             len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
1748                                 NULL, 0, NULL, TRUE, err);
1749         }
1750     }
1751
1752     if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1753         uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1754     }
1755
1756     return u_terminateChars(result, resultCapacity, len, err);
1757 }
1758
1759 /* ### ID parsing API **************************************************/
1760
1761 U_CAPI int32_t  U_EXPORT2
1762 uloc_getParent(const char*    localeID,
1763                char* parent,
1764                int32_t parentCapacity,
1765                UErrorCode* err)
1766 {
1767     const char *lastUnderscore;
1768     int32_t i;
1769
1770     if (U_FAILURE(*err))
1771         return 0;
1772
1773     if (localeID == NULL)
1774         localeID = uloc_getDefault();
1775
1776     lastUnderscore=uprv_strrchr(localeID, '_');
1777     if(lastUnderscore!=NULL) {
1778         i=(int32_t)(lastUnderscore-localeID);
1779     } else {
1780         i=0;
1781     }
1782
1783     if(i>0 && parent != localeID) {
1784         uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1785     }
1786
1787     return u_terminateChars(parent, parentCapacity, i, err);
1788 }
1789
1790 U_CAPI int32_t U_EXPORT2
1791 uloc_getLanguage(const char*    localeID,
1792          char* language,
1793          int32_t languageCapacity,
1794          UErrorCode* err)
1795 {
1796     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1797     int32_t i=0;
1798
1799     if (err==NULL || U_FAILURE(*err)) {
1800         return 0;
1801     }
1802
1803     if(localeID==NULL) {
1804         localeID=uloc_getDefault();
1805     }
1806
1807     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1808     return u_terminateChars(language, languageCapacity, i, err);
1809 }
1810
1811 U_CAPI int32_t U_EXPORT2
1812 uloc_getScript(const char*    localeID,
1813          char* script,
1814          int32_t scriptCapacity,
1815          UErrorCode* err)
1816 {
1817     int32_t i=0;
1818
1819     if(err==NULL || U_FAILURE(*err)) {
1820         return 0;
1821     }
1822
1823     if(localeID==NULL) {
1824         localeID=uloc_getDefault();
1825     }
1826
1827     /* skip the language */
1828     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1829     if(_isIDSeparator(*localeID)) {
1830         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1831     }
1832     return u_terminateChars(script, scriptCapacity, i, err);
1833 }
1834
1835 U_CAPI int32_t  U_EXPORT2
1836 uloc_getCountry(const char* localeID,
1837             char* country,
1838             int32_t countryCapacity,
1839             UErrorCode* err)
1840 {
1841     int32_t i=0;
1842
1843     if(err==NULL || U_FAILURE(*err)) {
1844         return 0;
1845     }
1846
1847     if(localeID==NULL) {
1848         localeID=uloc_getDefault();
1849     }
1850
1851     /* Skip the language */
1852     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1853     if(_isIDSeparator(*localeID)) {
1854         const char *scriptID;
1855         /* Skip the script if available */
1856         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1857         if(scriptID != localeID+1) {
1858             /* Found optional script */
1859             localeID = scriptID;
1860         }
1861         if(_isIDSeparator(*localeID)) {
1862             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1863         }
1864     }
1865     return u_terminateChars(country, countryCapacity, i, err);
1866 }
1867
1868 U_CAPI int32_t  U_EXPORT2
1869 uloc_getVariant(const char* localeID,
1870                 char* variant,
1871                 int32_t variantCapacity,
1872                 UErrorCode* err)
1873 {
1874     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1875     const char* tmpLocaleID;
1876     int32_t i=0;
1877
1878     if(err==NULL || U_FAILURE(*err)) {
1879         return 0;
1880     }
1881
1882     if (_hasBCP47Extension(localeID)) {
1883         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1884     } else {
1885         if (localeID==NULL) {
1886            localeID=uloc_getDefault();
1887         }
1888         tmpLocaleID=localeID;
1889     }
1890
1891     /* Skip the language */
1892     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1893     if(_isIDSeparator(*tmpLocaleID)) {
1894         const char *scriptID;
1895         /* Skip the script if available */
1896         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1897         if(scriptID != tmpLocaleID+1) {
1898             /* Found optional script */
1899             tmpLocaleID = scriptID;
1900         }
1901         /* Skip the Country */
1902         if (_isIDSeparator(*tmpLocaleID)) {
1903             const char *cntryID;
1904             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
1905             if (cntryID != tmpLocaleID+1) {
1906                 /* Found optional country */
1907                 tmpLocaleID = cntryID;
1908             }
1909             if(_isIDSeparator(*tmpLocaleID)) {
1910                 /* If there was no country ID, skip a possible extra IDSeparator */
1911                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
1912                     tmpLocaleID++;
1913                 }
1914                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
1915             }
1916         }
1917     }
1918
1919     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
1920     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
1921 /*
1922     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
1923         i=_getVariant(localeID+1, '@', variant, variantCapacity);
1924     }
1925 */
1926     return u_terminateChars(variant, variantCapacity, i, err);
1927 }
1928
1929 U_CAPI int32_t  U_EXPORT2
1930 uloc_getName(const char* localeID,
1931              char* name,
1932              int32_t nameCapacity,
1933              UErrorCode* err)
1934 {
1935     return _canonicalize(localeID, name, nameCapacity, 0, err);
1936 }
1937
1938 U_CAPI int32_t  U_EXPORT2
1939 uloc_getBaseName(const char* localeID,
1940                  char* name,
1941                  int32_t nameCapacity,
1942                  UErrorCode* err)
1943 {
1944     return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
1945 }
1946
1947 U_CAPI int32_t  U_EXPORT2
1948 uloc_canonicalize(const char* localeID,
1949                   char* name,
1950                   int32_t nameCapacity,
1951                   UErrorCode* err)
1952 {
1953     return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
1954 }
1955
1956 U_CAPI const char*  U_EXPORT2
1957 uloc_getISO3Language(const char* localeID)
1958 {
1959     int16_t offset;
1960     char lang[ULOC_LANG_CAPACITY];
1961     UErrorCode err = U_ZERO_ERROR;
1962
1963     if (localeID == NULL)
1964     {
1965         localeID = uloc_getDefault();
1966     }
1967     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
1968     if (U_FAILURE(err))
1969         return "";
1970     offset = _findIndex(LANGUAGES, lang);
1971     if (offset < 0)
1972         return "";
1973     return LANGUAGES_3[offset];
1974 }
1975
1976 U_CAPI const char*  U_EXPORT2
1977 uloc_getISO3Country(const char* localeID)
1978 {
1979     int16_t offset;
1980     char cntry[ULOC_LANG_CAPACITY];
1981     UErrorCode err = U_ZERO_ERROR;
1982
1983     if (localeID == NULL)
1984     {
1985         localeID = uloc_getDefault();
1986     }
1987     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
1988     if (U_FAILURE(err))
1989         return "";
1990     offset = _findIndex(COUNTRIES, cntry);
1991     if (offset < 0)
1992         return "";
1993
1994     return COUNTRIES_3[offset];
1995 }
1996
1997 U_CAPI uint32_t  U_EXPORT2
1998 uloc_getLCID(const char* localeID)
1999 {
2000     UErrorCode status = U_ZERO_ERROR;
2001     char       langID[ULOC_FULLNAME_CAPACITY];
2002     uint32_t   lcid = 0;
2003
2004     /* Check for incomplete id. */
2005     if (!localeID || uprv_strlen(localeID) < 2) {
2006         return 0;
2007     }
2008
2009     // First, attempt Windows platform lookup if available, but fall
2010     // through to catch any special cases (ICU vs Windows name differences).
2011     lcid = uprv_convertToLCIDPlatform(localeID, &status);
2012     if (U_FAILURE(status)) {
2013         return 0;
2014     }
2015     if (lcid > 0) {
2016         // Windows found an LCID, return that
2017         return lcid;
2018     }
2019
2020     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2021     if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) {
2022         return 0;
2023     }
2024
2025     if (uprv_strchr(localeID, '@')) {
2026         // uprv_convertToLCID does not support keywords other than collation.
2027         // Remove all keywords except collation.
2028         int32_t len;
2029         char collVal[ULOC_KEYWORDS_CAPACITY];
2030         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2031
2032         len = uloc_getKeywordValue(localeID, "collation", collVal,
2033             UPRV_LENGTHOF(collVal) - 1, &status);
2034
2035         if (U_SUCCESS(status) && len > 0) {
2036             collVal[len] = 0;
2037
2038             len = uloc_getBaseName(localeID, tmpLocaleID,
2039                 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2040
2041             if (U_SUCCESS(status) && len > 0) {
2042                 tmpLocaleID[len] = 0;
2043
2044                 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2045                     UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2046
2047                 if (U_SUCCESS(status) && len > 0) {
2048                     tmpLocaleID[len] = 0;
2049                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
2050                 }
2051             }
2052         }
2053
2054         // fall through - all keywords are simply ignored
2055         status = U_ZERO_ERROR;
2056     }
2057
2058     return uprv_convertToLCID(langID, localeID, &status);
2059 }
2060
2061 U_CAPI int32_t U_EXPORT2
2062 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2063                 UErrorCode *status)
2064 {
2065     return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2066 }
2067
2068 /* ### Default locale **************************************************/
2069
2070 U_CAPI const char*  U_EXPORT2
2071 uloc_getDefault()
2072 {
2073     return locale_get_default();
2074 }
2075
2076 U_CAPI void  U_EXPORT2
2077 uloc_setDefault(const char*   newDefaultLocale,
2078              UErrorCode* err)
2079 {
2080     if (U_FAILURE(*err))
2081         return;
2082     /* the error code isn't currently used for anything by this function*/
2083
2084     /* propagate change to C++ */
2085     locale_set_default(newDefaultLocale);
2086 }
2087
2088 /**
2089  * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2090  * to an array of pointers to arrays of char.  All of these pointers are owned
2091  * by ICU-- do not delete them, and do not write through them.  The array is
2092  * terminated with a null pointer.
2093  */
2094 U_CAPI const char* const*  U_EXPORT2
2095 uloc_getISOLanguages()
2096 {
2097     return LANGUAGES;
2098 }
2099
2100 /**
2101  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2102  * pointer to an array of pointers to arrays of char.  All of these pointers are
2103  * owned by ICU-- do not delete them, and do not write through them.  The array is
2104  * terminated with a null pointer.
2105  */
2106 U_CAPI const char* const*  U_EXPORT2
2107 uloc_getISOCountries()
2108 {
2109     return COUNTRIES;
2110 }
2111
2112
2113 /* this function to be moved into cstring.c later */
2114 static char gDecimal = 0;
2115
2116 static /* U_CAPI */
2117 double
2118 /* U_EXPORT2 */
2119 _uloc_strtod(const char *start, char **end) {
2120     char *decimal;
2121     char *myEnd;
2122     char buf[30];
2123     double rv;
2124     if (!gDecimal) {
2125         char rep[5];
2126         /* For machines that decide to change the decimal on you,
2127         and try to be too smart with localization.
2128         This normally should be just a '.'. */
2129         sprintf(rep, "%+1.1f", 1.0);
2130         gDecimal = rep[2];
2131     }
2132
2133     if(gDecimal == '.') {
2134         return uprv_strtod(start, end); /* fall through to OS */
2135     } else {
2136         uprv_strncpy(buf, start, 29);
2137         buf[29]=0;
2138         decimal = uprv_strchr(buf, '.');
2139         if(decimal) {
2140             *decimal = gDecimal;
2141         } else {
2142             return uprv_strtod(start, end); /* no decimal point */
2143         }
2144         rv = uprv_strtod(buf, &myEnd);
2145         if(end) {
2146             *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2147         }
2148         return rv;
2149     }
2150 }
2151
2152 typedef struct {
2153     float q;
2154     int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
2155     char locale[ULOC_FULLNAME_CAPACITY+1];
2156 } _acceptLangItem;
2157
2158 static int32_t U_CALLCONV
2159 uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2160 {
2161     const _acceptLangItem *aa = (const _acceptLangItem*)a;
2162     const _acceptLangItem *bb = (const _acceptLangItem*)b;
2163
2164     int32_t rc = 0;
2165     if(bb->q < aa->q) {
2166         rc = -1;  /* A > B */
2167     } else if(bb->q > aa->q) {
2168         rc = 1;   /* A < B */
2169     } else {
2170         rc = 0;   /* A = B */
2171     }
2172
2173     if(rc==0) {
2174         rc = uprv_stricmp(aa->locale, bb->locale);
2175     }
2176
2177 #if defined(ULOC_DEBUG)
2178     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2179     aa->locale, aa->q,
2180     bb->locale, bb->q,
2181     rc);*/
2182 #endif
2183
2184     return rc;
2185 }
2186
2187 /*
2188 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2189 */
2190
2191 U_CAPI int32_t U_EXPORT2
2192 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2193                             const char *httpAcceptLanguage,
2194                             UEnumeration* availableLocales,
2195                             UErrorCode *status)
2196 {
2197   MaybeStackArray<_acceptLangItem, 4> items; // Struct for collecting items.
2198     char tmp[ULOC_FULLNAME_CAPACITY +1];
2199     int32_t n = 0;
2200     const char *itemEnd;
2201     const char *paramEnd;
2202     const char *s;
2203     const char *t;
2204     int32_t res;
2205     int32_t i;
2206     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2207
2208     if(U_FAILURE(*status)) {
2209         return -1;
2210     }
2211
2212     for(s=httpAcceptLanguage;s&&*s;) {
2213         while(isspace(*s)) /* eat space at the beginning */
2214             s++;
2215         itemEnd=uprv_strchr(s,',');
2216         paramEnd=uprv_strchr(s,';');
2217         if(!itemEnd) {
2218             itemEnd = httpAcceptLanguage+l; /* end of string */
2219         }
2220         if(paramEnd && paramEnd<itemEnd) {
2221             /* semicolon (;) is closer than end (,) */
2222             t = paramEnd+1;
2223             if(*t=='q') {
2224                 t++;
2225             }
2226             while(isspace(*t)) {
2227                 t++;
2228             }
2229             if(*t=='=') {
2230                 t++;
2231             }
2232             while(isspace(*t)) {
2233                 t++;
2234             }
2235             items[n].q = (float)_uloc_strtod(t,NULL);
2236         } else {
2237             /* no semicolon - it's 1.0 */
2238             items[n].q = 1.0f;
2239             paramEnd = itemEnd;
2240         }
2241         items[n].dummy=0;
2242         /* eat spaces prior to semi */
2243         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2244             ;
2245         int32_t slen = static_cast<int32_t>(((t+1)-s));
2246         if(slen > ULOC_FULLNAME_CAPACITY) {
2247           *status = U_BUFFER_OVERFLOW_ERROR;
2248           return -1; // too big
2249         }
2250         uprv_strncpy(items[n].locale, s, slen);
2251         items[n].locale[slen]=0; // terminate
2252         int32_t clen = uloc_canonicalize(items[n].locale, tmp, UPRV_LENGTHOF(tmp)-1, status);
2253         if(U_FAILURE(*status)) return -1;
2254         if((clen!=slen) || (uprv_strncmp(items[n].locale, tmp, slen))) {
2255             // canonicalization had an effect- copy back
2256             uprv_strncpy(items[n].locale, tmp, clen);
2257             items[n].locale[clen] = 0; // terminate
2258         }
2259 #if defined(ULOC_DEBUG)
2260         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2261 #endif
2262         n++;
2263         s = itemEnd;
2264         while(*s==',') { /* eat duplicate commas */
2265             s++;
2266         }
2267         if(n>=items.getCapacity()) { // If we need more items
2268           if(NULL == items.resize(items.getCapacity()*2, items.getCapacity())) {
2269               *status = U_MEMORY_ALLOCATION_ERROR;
2270               return -1;
2271           }
2272 #if defined(ULOC_DEBUG)
2273           fprintf(stderr,"malloced at size %d\n", items.getCapacity());
2274 #endif
2275         }
2276     }
2277     uprv_sortArray(items.getAlias(), n, sizeof(items[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2278     if (U_FAILURE(*status)) {
2279         return -1;
2280     }
2281     LocalMemory<const char*> strs(NULL);
2282     if (strs.allocateInsteadAndReset(n) == NULL) {
2283         *status = U_MEMORY_ALLOCATION_ERROR;
2284         return -1;
2285     }
2286     for(i=0;i<n;i++) {
2287 #if defined(ULOC_DEBUG)
2288         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2289 #endif
2290         strs[i]=items[i].locale;
2291     }
2292     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2293                                strs.getAlias(), n, availableLocales, status);
2294     return res;
2295 }
2296
2297
2298 U_CAPI int32_t U_EXPORT2
2299 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2300                     UAcceptResult *outResult, const char **acceptList,
2301                     int32_t acceptListCount,
2302                     UEnumeration* availableLocales,
2303                     UErrorCode *status)
2304 {
2305     int32_t i,j;
2306     int32_t len;
2307     int32_t maxLen=0;
2308     char tmp[ULOC_FULLNAME_CAPACITY+1];
2309     const char *l;
2310     char **fallbackList;
2311     if(U_FAILURE(*status)) {
2312         return -1;
2313     }
2314     fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2315     if(fallbackList==NULL) {
2316         *status = U_MEMORY_ALLOCATION_ERROR;
2317         return -1;
2318     }
2319     for(i=0;i<acceptListCount;i++) {
2320 #if defined(ULOC_DEBUG)
2321         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2322 #endif
2323         while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
2324 #if defined(ULOC_DEBUG)
2325             fprintf(stderr,"  %s\n", l);
2326 #endif
2327             len = (int32_t)uprv_strlen(l);
2328             if(!uprv_strcmp(acceptList[i], l)) {
2329                 if(outResult) {
2330                     *outResult = ULOC_ACCEPT_VALID;
2331                 }
2332 #if defined(ULOC_DEBUG)
2333                 fprintf(stderr, "MATCH! %s\n", l);
2334 #endif
2335                 if(len>0) {
2336                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2337                 }
2338                 for(j=0;j<i;j++) {
2339                     uprv_free(fallbackList[j]);
2340                 }
2341                 uprv_free(fallbackList);
2342                 return u_terminateChars(result, resultAvailable, len, status);
2343             }
2344             if(len>maxLen) {
2345                 maxLen = len;
2346             }
2347         }
2348         uenum_reset(availableLocales, status);
2349         /* save off parent info */
2350         if(uloc_getParent(acceptList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2351             fallbackList[i] = uprv_strdup(tmp);
2352         } else {
2353             fallbackList[i]=0;
2354         }
2355     }
2356
2357     for(maxLen--;maxLen>0;maxLen--) {
2358         for(i=0;i<acceptListCount;i++) {
2359             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2360 #if defined(ULOC_DEBUG)
2361                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2362 #endif
2363                 while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
2364 #if defined(ULOC_DEBUG)
2365                     fprintf(stderr,"  %s\n", l);
2366 #endif
2367                     len = (int32_t)uprv_strlen(l);
2368                     if(!uprv_strcmp(fallbackList[i], l)) {
2369                         if(outResult) {
2370                             *outResult = ULOC_ACCEPT_FALLBACK;
2371                         }
2372 #if defined(ULOC_DEBUG)
2373                         fprintf(stderr, "fallback MATCH! %s\n", l);
2374 #endif
2375                         if(len>0) {
2376                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2377                         }
2378                         for(j=0;j<acceptListCount;j++) {
2379                             uprv_free(fallbackList[j]);
2380                         }
2381                         uprv_free(fallbackList);
2382                         return u_terminateChars(result, resultAvailable, len, status);
2383                     }
2384                 }
2385                 uenum_reset(availableLocales, status);
2386
2387                 if(uloc_getParent(fallbackList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2388                     uprv_free(fallbackList[i]);
2389                     fallbackList[i] = uprv_strdup(tmp);
2390                 } else {
2391                     uprv_free(fallbackList[i]);
2392                     fallbackList[i]=0;
2393                 }
2394             }
2395         }
2396         if(outResult) {
2397             *outResult = ULOC_ACCEPT_FAILED;
2398         }
2399     }
2400     for(i=0;i<acceptListCount;i++) {
2401         uprv_free(fallbackList[i]);
2402     }
2403     uprv_free(fallbackList);
2404     return -1;
2405 }
2406
2407 U_CAPI const char* U_EXPORT2
2408 uloc_toUnicodeLocaleKey(const char* keyword)
2409 {
2410     const char* bcpKey = ulocimp_toBcpKey(keyword);
2411     if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2412         // unknown keyword, but syntax is fine..
2413         return keyword;
2414     }
2415     return bcpKey;
2416 }
2417
2418 U_CAPI const char* U_EXPORT2
2419 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2420 {
2421     const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2422     if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2423         // unknown keyword, but syntax is fine..
2424         return value;
2425     }
2426     return bcpType;
2427 }
2428
2429 static UBool
2430 isWellFormedLegacyKey(const char* legacyKey)
2431 {
2432     const char* p = legacyKey;
2433     while (*p) {
2434         if (!UPRV_ISALPHANUM(*p)) {
2435             return FALSE;
2436         }
2437         p++;
2438     }
2439     return TRUE;
2440 }
2441
2442 static UBool
2443 isWellFormedLegacyType(const char* legacyType)
2444 {
2445     const char* p = legacyType;
2446     int32_t alphaNumLen = 0;
2447     while (*p) {
2448         if (*p == '_' || *p == '/' || *p == '-') {
2449             if (alphaNumLen == 0) {
2450                 return FALSE;
2451             }
2452             alphaNumLen = 0;
2453         } else if (UPRV_ISALPHANUM(*p)) {
2454             alphaNumLen++;
2455         } else {
2456             return FALSE;
2457         }
2458         p++;
2459     }
2460     return (alphaNumLen != 0);
2461 }
2462
2463 U_CAPI const char* U_EXPORT2
2464 uloc_toLegacyKey(const char* keyword)
2465 {
2466     const char* legacyKey = ulocimp_toLegacyKey(keyword);
2467     if (legacyKey == NULL) {
2468         // Checks if the specified locale key is well-formed with the legacy locale syntax.
2469         //
2470         // Note:
2471         //  LDML/CLDR provides some definition of keyword syntax in
2472         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2473         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2474         //  Keys can only consist of [0-9a-zA-Z].
2475         if (isWellFormedLegacyKey(keyword)) {
2476             return keyword;
2477         }
2478     }
2479     return legacyKey;
2480 }
2481
2482 U_CAPI const char* U_EXPORT2
2483 uloc_toLegacyType(const char* keyword, const char* value)
2484 {
2485     const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2486     if (legacyType == NULL) {
2487         // Checks if the specified locale type is well-formed with the legacy locale syntax.
2488         //
2489         // Note:
2490         //  LDML/CLDR provides some definition of keyword syntax in
2491         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2492         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2493         //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2494         //  we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2495         if (isWellFormedLegacyType(value)) {
2496             return value;
2497         }
2498     }
2499     return legacyType;
2500 }
2501
2502 /*eof*/