icuSources/common/uloc.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 1997-2016, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *
   9 * File ULOC.CPP
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   04/01/97    aliu        Creation.
  15 *   08/21/98    stephen     JDK 1.2 sync
  16 *   12/08/98    rtg         New Locale implementation and C API
  17 *   03/15/99    damiba      overhaul.
  18 *   04/06/99    stephen     changed setDefault() to realloc and copy
  19 *   06/14/99    stephen     Changed calls to ures_open for new params
  20 *   07/21/99    stephen     Modified setDefault() to propagate to C++
  21 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
  22 *                           brought canonicalization code into line with spec
  23 *****************************************************************************/
  24
  25 /*
  26    POSIX's locale format, from putil.c: [no spaces]
  27
  28      ll [ _CC ] [ . MM ] [ @ VV]
  29
  30      l = lang, C = ctry, M = charmap, V = variant
  31 */
  32
  33 #include "unicode/utypes.h"
  34 #include "unicode/ustring.h"
  35 #include "unicode/uloc.h"
  36
  37 #include "putilimp.h"
  38 #include "ustr_imp.h"
  39 #include "ulocimp.h"
  40 #include "umutex.h"
  41 #include "cstring.h"
  42 #include "cmemory.h"
  43 #include "locmap.h"
  44 #include "uarrsort.h"
  45 #include "uenumimp.h"
  46 #include "uassert.h"
  47 #include "charstr.h"
  48
  49 #include <stdio.h> /* for sprintf */
  50
  51 U_NAMESPACE_USE
  52
  53 /* ### Declarations **************************************************/
  54
  55 /* Locale stuff from locid.cpp (prototypes written out here by hand) */
  56 U_CFUNC void locale_set_default(const char *id);
  57 U_CFUNC const char *locale_get_default(void);
  58 U_CFUNC int32_t
  59 locale_getKeywords(const char *localeID, /* parameter list matches the static _getKeywords() in this file, minus addKeyword/addValue */
  60             char prev,
  61             char *keywords, int32_t keywordCapacity,
  62             char *values, int32_t valuesCapacity, int32_t *valLen,
  63             UBool valuesToo,
  64             UErrorCode *status);
  65
  66 /* ### Data tables **************************************************/
  67
  68 /**
  69  * Table of language codes, both 2- and 3-letter, with preference
  70  * given to 2-letter codes where possible.  Includes 3-letter codes
  71  * that lack a 2-letter equivalent.
  72  *
  73  * This list must be in sorted order.  This list is returned directly
  74  * to the user by some API.
  75  *
  76  * This list must be kept in sync with LANGUAGES_3, with corresponding
  77  * entries matched.
  78  *
  79  * This table should be terminated with a NULL entry, followed by a
  80  * second list, and another NULL entry.  The first list is visible to
  81  * user code when this array is returned by API.  The second list
  82  * contains codes we support, but do not expose through user API.
  83  *
  84  * Notes
  85  *
  86  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
  87  * include the revisions up to 2001/7/27 *CWB*
  88  *
  89  * The 3 character codes are the terminology codes like RFC 3066.  This
  90  * is compatible with prior ICU codes
  91  *
  92  * "in" "iw" "ji" "jw" & "sh" have been withdrawn.  They are still in
  93  * the table, but now at the end of it, because their 3 character codes
  94  * are duplicates.  This avoids bad searches going from 3 to 2 character
  95  * codes.
  96  *
  97  * The range qaa-qtz is reserved for local use
  98  */
  99 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
 100 /* ISO639 table version is 20150505 */
 101 /* Subsequent hand addition of selected languages */
 102 static const char * const LANGUAGES[] = { /* first list: sorted and user-visible; entry i pairs with LANGUAGES_3[i] */
 103     "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
 104     "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
 105     "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
 106     "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
 107     "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
 108     "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
 109     "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
 110     "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
 111     "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
 112     "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
 113     "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
 114     "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
 115     "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
 116     "cs",  "csb", "cu",  "cv",  "cy",
 117     "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
 118     "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
 119     "dyo", "dyu", "dz",  "dzg",
 120     "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
 121     "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
 122     "ext",
 123     "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
 124     "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
 125     "frs", "fur", "fy",
 126     "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
 127     "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
 128     "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
 129     "gur", "guz", "gv",  "gwi",
 130     "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
 131     "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
 132     "hup", "hy",  "hz",
 133     "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
 134     "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
 135     "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
 136     "jv",
 137     "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
 138     "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
 139     "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
 140     "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
 141     "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
 142     "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
 143     "kv",  "kw",  "ky",
 144     "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
 145     "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
 146     "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
 147     "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
 148     "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
 149     "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
 150     "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
 151     "ml",  "mn",  "mnc", "mni", "moh", "mos", "mr",  "mrj",
 152     "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
 153     "my",  "mye", "myv", "mzn",
 154     "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
 155     "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
 156     "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
 157     "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
 158     "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
 159     "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
 160     "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
 161     "pon", "prg", "pro", "ps",  "pt",
 162     "qu",  "quc", "qug",
 163     "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
 164     "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
 165     "rw",  "rwk",
 166     "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
 167     "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
 168     "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
 169     "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
 170     "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
 171     "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
 172     "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
 173     "sv",  "sw",  "swb", "swc", "syc", "syr", "szl",
 174     "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
 175     "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr", "tl",
 176     "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
 177     "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
 178     "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
 179     "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
 180     "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
 181     "vot", "vro", "vun",
 182     "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
 183     "xal", "xh",  "xmf", "xog",
 184     "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
 185     "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
 186     "zun", "zxx", "zza",
 187 NULL, /* end of the first (user-visible) list */
 188     "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
 189 NULL /* end of the second list (supported but not exposed through user API) */
 190 };
 191
 192 static const char* const DEPRECATED_LANGUAGES[]={ /* same length and order as REPLACEMENT_LANGUAGES; presumably paired by index - confirm at the lookup site */
 193     "in", "iw", "ji", "jw", NULL, NULL /* "sh" from the obsolete part of LANGUAGES is not listed here; ends with two NULLs */
 194 };
 195 static const char* const REPLACEMENT_LANGUAGES[]={ /* same length and order as DEPRECATED_LANGUAGES; presumably paired by index - confirm at the lookup site */
 196     "id", "he", "yi", "jv", NULL, NULL /* positions line up with "in", "iw", "ji", "jw"; ends with two NULLs */
 197 };
 198
 199 /**
 200  * Table of 3-letter language codes.
 201  *
 202  * This is a lookup table used to convert 3-letter language codes to
 203  * their 2-letter equivalent, where possible.  It must be kept in sync
 204  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
 205  * same language as LANGUAGES_3[i].  The commented-out lines are
 206  * copied from LANGUAGES to make eyeballing this baby easier.
 207  *
 208  * Where a 3-letter language code has no 2-letter equivalent, the
 209  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 210  *
 211  * This table should be terminated with a NULL entry, followed by a
 212  * second list, and another NULL entry.  The two lists correspond to
 213  * the two lists in LANGUAGES.
 214  */
 215 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
 216 /* ISO639 table version is 20150505 */
 217 /* Subsequent hand addition of selected languages */
 218 static const char * const LANGUAGES_3[] = { /* same order as LANGUAGES: entry i is the 3-letter code for LANGUAGES[i] */
 219     "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
 220     "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
 221     "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
 222     "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
 223     "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
 224     "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
 225     "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
 226     "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
 227     "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
 228     "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
 229     "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
 230     "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
 231     "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
 232     "ces", "csb", "chu", "chv", "cym",
 233     "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
 234     "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
 235     "dyo", "dyu", "dzo", "dzg",
 236     "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
 237     "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
 238     "ext",
 239     "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
 240     "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
 241     "frs", "fur", "fry",
 242     "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
 243     "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
 244     "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
 245     "gur", "guz", "glv", "gwi",
 246     "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
 247     "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
 248     "hup", "hye", "her",
 249     "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
 250     "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
 251     "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
 252     "jav",
 253     "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
 254     "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
 255     "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
 256     "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
 257     "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
 258     "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
 259     "kom", "cor", "kir",
 260     "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
 261     "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
 262     "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
 263     "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
 264     "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
 265     "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
 266     "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
 267     "mal", "mon", "mnc", "mni", "moh", "mos", "mar", "mrj",
 268     "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
 269     "mya", "mye", "myv", "mzn",
 270     "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
 271     "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
 272     "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
 273     "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
 274     "oci", "oji", "orm", "ori", "oss", "osa", "ota",
 275     "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
 276     "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
 277     "pon", "prg", "pro", "pus", "por",
 278     "que", "quc", "qug",
 279     "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
 280     "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
 281     "kin", "rwk",
 282     "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
 283     "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
 284     "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
 285     "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
 286     "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
 287     "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
 288     "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
 289     "swe", "swa", "swb", "swc", "syc", "syr", "szl",
 290     "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
 291     "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
 292     "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
 293     "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
 294     "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
 295     "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
 296     "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
 297     "vot", "vro", "vun",
 298     "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
 299     "xal", "xho", "xmf", "xog",
 300     "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
 301     "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
 302     "zun", "zxx", "zza",
 303 NULL, /* end of the first list; lines up with the first NULL in LANGUAGES */
 304 /*  "in",  "iw",  "ji",  "jw",  "sh",                          */
 305     "ind", "heb", "yid", "jaw", "srp",
 306 NULL /* end of the second list; lines up with the final NULL in LANGUAGES */
 307 };
 308
 309 /**
 310  * Table of 2-letter country codes.
 311  *
 312  * This list must be in sorted order.  This list is returned directly
 313  * to the user by some API.
 314  *
 315  * This list must be kept in sync with COUNTRIES_3, with corresponding
 316  * entries matched.
 317  *
 318  * This table should be terminated with a NULL entry, followed by a
 319  * second list, and another NULL entry.  The first list is visible to
 320  * user code when this array is returned by API.  The second list
 321  * contains codes we support, but do not expose through user API.
 322  *
 323  * Notes:
 324  *
 325  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 326  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
 327  * new codes keeping the old ones for compatibility updated to include
 328  * 1999/12/03 revisions *CWB*
 329  *
 330  * RO(ROM) is now RO(ROU) according to
 331  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
 332  */
 333 static const char * const COUNTRIES[] = { /* first list: sorted and user-visible; entry i pairs with COUNTRIES_3[i] */
 334     "AC",  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
 335     "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
 336     "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
 337     "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
 338     "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
 339     "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CP",  "CR",
 340     "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
 341     "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
 342     "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
 343     "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
 344     "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
 345     "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
 346     "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
 347     "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
 348     "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
 349     "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
 350     "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
 351     "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
 352     "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
 353     "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
 354     "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
 355     "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
 356     "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
 357     "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
 358     "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
 359     "SX",  "SY",  "SZ",  "TA",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
 360     "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
 361     "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
 362     "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
 363     "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
 364 NULL, /* end of the first (user-visible) list */
 365     "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
 366 NULL /* end of the second list (supported but not exposed through user API) */
 367 };
 368
 369 static const char* const DEPRECATED_COUNTRIES[] = { /* lines up position by position with REPLACEMENT_COUNTRIES */
 370     "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
 371 };
 372 static const char* const REPLACEMENT_COUNTRIES[] = { /* entry i lines up with DEPRECATED_COUNTRIES[i]; the next line repeats that list for reference */
 373 /*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
 374     "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
 375 };
 376
 377 /**
 378  * Table of 3-letter country codes.
 379  *
 380  * This is a lookup table used to convert 3-letter country codes to
 381  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
 382  * For all valid i, COUNTRIES[i] must refer to the same country as
 383  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
 384  * to make eyeballing this baby easier.
 385  *
 386  * This table should be terminated with a NULL entry, followed by a
 387  * second list, and another NULL entry.  The two lists correspond to
 388  * the two lists in COUNTRIES.
 389  */
 390 static const char * const COUNTRIES_3[] = { /* same order as COUNTRIES: entry i is the 3-letter code for COUNTRIES[i] */
 391 /*  "AC",  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
 392     "ASC", "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
 393 /*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
 394     "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
 395 /*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
 396     "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
 397 /*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
 398     "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
 399 /*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
 400     "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
 401 /*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CP",  "CR",     */
 402     "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CPT", "CRI",
 403 /*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
 404     "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
 405 /*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
 406     "DMA", "DOM", "DZA", "EA ", "ECU", "EST", "EGY", "ESH", "ERI", /* no valid 3-letter code for EA */
 407 /*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
 408     "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
 409 /*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
 410     "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
 411 /*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
 412     "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
 413 /*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
 414     "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
 415 /*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
 416     "IC ", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL", /* no valid 3-letter code for IC */
 417 /*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
 418     "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
 419 /*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
 420     "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
 421 /*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
 422     "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
 423 /*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
 424     "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
 425 /*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
 426     "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
 427 /*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
 428     "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
 429 /*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
 430     "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
 431 /*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
 432     "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
 433 /*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
 434     "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
 435 /*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
 436     "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
 437 /*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
 438     "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
 439 /*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
 440     "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
 441 /*  "SX",  "SY",  "SZ",  "TA",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
 442     "SXM", "SYR", "SWZ", "TAA", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
 443 /*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
 444     "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
 445 /*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
 446     "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
 447 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
 448     "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
 449 /*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
 450     "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
 451 NULL, /* end of the first list; lines up with the first NULL in COUNTRIES */
 452 /*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
 453     "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
 454 NULL /* end of the second list; lines up with the final NULL in COUNTRIES */
 455 };
 456
 457 typedef struct CanonicalizationMap { /* row type of CANONICALIZE_MAP */
 458     const char *id;          /* input ID */
 459     const char *canonicalID; /* canonicalized output ID */
 460     const char *keyword;     /* keyword name (e.g. "currency"), or NULL if none */
 461     const char *value;       /* value for keyword, or NULL if keyword==NULL */
 462 } CanonicalizationMap;
 463
 464 /**
 465  * A map to canonicalize locale IDs.  This handles a variety of
 466  * different semantic kinds of transformations.
 467  */
 468 static const CanonicalizationMap CANONICALIZE_MAP[] = { /* each row: { id, canonicalID, keyword, value } */
 469     { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
 470     { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
 471     { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
 472     { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
 473     { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
 474     { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
 475     { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" }, /* *_PREEURO rows: the variant is dropped and a currency keyword is attached */
 476     { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
 477     { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
 478     { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
 479     { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
 480     { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
 481     { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
 482     { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
 483     { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
 484     { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
 485     { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
 486     { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
 487     { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
 488     { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
 489     { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
 490     { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
 491     { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
 492     { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
 493     { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
 494     { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
 495     { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
 496     { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
 497     { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
 498     { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
 499     { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
 500     { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
 501     { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
 502     { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
 503     { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
 504     { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
 505     { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
 506     { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
 507     { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
 508     { "zh_GAN",         "gan", NULL, NULL }, /* registered name */
 509     { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
 510     { "zh_HAKKA",       "hak", NULL, NULL }, /* registered name */
 511     { "zh_MIN_NAN",     "nan", NULL, NULL }, /* registered name */
 512     { "zh_WUU",         "wuu", NULL, NULL }, /* registered name */
 513     { "zh_XIANG",       "hsn", NULL, NULL }, /* registered name */
 514     { "zh_YUE",         "yue", NULL, NULL }, /* registered name */
 515 };
 516
 517 typedef struct VariantMap { /* row type of VARIANT_MAP */
 518     const char *variant;          /* input variant name */
 519     const char *keyword;     /* keyword name (e.g. "currency") */
 520     const char *value;       /* value for keyword */
 521 } VariantMap;
 522
 523 static const VariantMap VARIANT_MAP[] = { /* each row: { variant, keyword, value } */
 524     { "EURO",   "currency", "EUR" }, /* EURO variant -> currency=EUR */
 525     { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
 526     { "STROKE", "collation", "stroke" }  /* Solaris variant */
 527 };
 528
 529 /* ### BCP47 Conversion *******************************************/
 530 /* Test if the locale id has BCP47 u extension and does not have '@': true when id is non-NULL, contains no '@', and getShortestSubtagLength(id) is 1. Every check uses the id argument itself (id is evaluated more than once). */
 531 #define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
 532 /* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
 533 #define _ConvertBCP47(finalID, id, buffer, length,err) \
 534         if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 ||  \
 535                 U_FAILURE(*err) || *err == U_STRING_NOT_TERMINATED_WARNING) { \
 536             finalID=id; \
 537             if (*err == U_STRING_NOT_TERMINATED_WARNING) { *err = U_BUFFER_OVERFLOW_ERROR; } \
 538         } else { \
 539             finalID=buffer; \
 540         }
 541 /*
 542  * Gets the size of the shortest subtag in the given localeID, where subtags
 543  * are the non-empty runs between '_' or '-' separators.  A run is only
 544  * measured when a separator ends it, so the trailing run is never compared;
 545  * if no measured run is shorter, the full length of localeID is returned.
 546  */
 547 static int32_t getShortestSubtagLength(const char *localeID) {
 548     const int32_t total = static_cast<int32_t>(uprv_strlen(localeID));
 549     int32_t shortest = total;
 550     int32_t run = 0;   /* length of the subtag currently being scanned */
 551
 552     for (int32_t pos = 0; pos < total; ++pos) {
 553         const char c = localeID[pos];
 554         if (c == '_' || c == '-') {
 555             /* Separator: the run that just ended competes for shortest. */
 556             if (run > 0 && run < shortest) {
 557                 shortest = run;
 558             }
 559             run = 0;
 560         } else {
 561             ++run;
 562         }
 563     }
 564     return shortest;
 565 }
 566
 567 /* ### Keywords **************************************************/
 568 #define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9')) /* true for '0'..'9'; c is evaluated twice */
 569 #define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) ) /* uprv_isASCIILetter(c) or a digit; c may be evaluated up to three times, so pass an expression without side effects */
 570 /* Punctuation/symbols allowed in legacy key values: '_', '-', '+' and '/' */
 571 #define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')
 572
 573 #define ULOC_KEYWORD_BUFFER_LEN 25 /* size of KeywordStruct.keyword and of the buffer given to locale_canonKeywordName; one byte is kept for the terminating NUL */
 574 #define ULOC_MAX_NO_KEYWORDS 25 /* number of entries in the keywordList array of _getKeywords */
 575
 576 /*
 577  * Returns a pointer to the first '@' in localeID, or NULL if there is none.
 578  * On EBCDIC builds a list of alternative '@' byte values is searched as well.
 579  */
 580 U_CAPI const char * U_EXPORT2
 581 locale_getKeywordsStart(const char *localeID) {
 582     const char *atSign = uprv_strchr(localeID, '@');
 583     if (atSign != NULL) {
 584         return atSign;
 585     }
 586 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
 587     /* The @ sign is variant: the one compiled in on this EBCDIC machine
 588        may differ from the one used on other EBCDIC based machines. */
 589     static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
 590     for (const uint8_t *sign = ebcdicSigns; *sign != 0; ++sign) {
 591         atSign = uprv_strchr(localeID, *sign);
 592         if (atSign != NULL) {
 593             return atSign;
 594         }
 595     }
 596 #endif
 597     return NULL;
 598 }
 599
 600 /**
 601  * Lowercases keywordName into buf and NUL-terminates it on success.
 602  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
 603  * @param keywordName incoming name to be canonicalized
 604  * @param status set to U_ILLEGAL_ARGUMENT_ERROR for an empty name or a
 605  *               character rejected by UPRV_ISALPHANUM, and to
 606  *               U_INTERNAL_PROGRAM_ERROR if the name does not fit in buf
 607  * @return length of the keyword name, or 0 on error
 608  */
 609 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
 610 {
 611   int32_t written = 0;
 612   while (*keywordName != 0) {
 613     const char ch = *keywordName++;
 614     if (!UPRV_ISALPHANUM(ch)) {
 615       *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
 616       return 0;
 617     }
 618     if (written >= ULOC_KEYWORD_BUFFER_LEN - 1) {
 619       *status = U_INTERNAL_PROGRAM_ERROR; /* too long for internal buffer */
 620       return 0;
 621     }
 622     buf[written++] = uprv_tolower(ch);
 623   }
 624   if (written == 0) {
 625     *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
 626     return 0;
 627   }
 628   buf[written] = 0; /* terminate */
 629   return written;
 630 }
 631
 632 typedef struct {
 633     char keyword[ULOC_KEYWORD_BUFFER_LEN];
 634     int32_t keywordLen;
 635     const char *valueStart;
 636     int32_t valueLen;
 637 } KeywordStruct;
 638
 639 static int32_t U_CALLCONV
 640 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
 641     const char* leftString = ((const KeywordStruct *)left)->keyword;
 642     const char* rightString = ((const KeywordStruct *)right)->keyword;
 643     return uprv_strcmp(leftString, rightString);
 644 }
 645
 646 /**
 647  * Both addKeyword and addValue must already be in canonical form.
 648  * Either both addKeyword and addValue are NULL, or neither is NULL.
 649  * If they are not NULL they must be zero terminated.
 650  * If addKeyword is not NULL is must have length small enough to fit in KeywordStruct.keyword.
 651  */
 652 static int32_t
 653 _getKeywords(const char *localeID,
 654              char prev,
 655              char *keywords, int32_t keywordCapacity,
 656              char *values, int32_t valuesCapacity, int32_t *valLen,
 657              UBool valuesToo,
 658              const char* addKeyword,
 659              const char* addValue,
 660              UErrorCode *status)
 661 {
 662     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
 663
 664     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
 665     int32_t numKeywords = 0;
 666     const char* pos = localeID;
 667     const char* equalSign = NULL;
 668     const char* semicolon = NULL;
 669     int32_t i = 0, j, n;
 670     int32_t keywordsLen = 0;
 671     int32_t valuesLen = 0;
 672
 673     if(prev == '@') { /* start of keyword definition */
 674         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
 675         do {
 676             UBool duplicate = FALSE;
 677             /* skip leading spaces */
 678             while(*pos == ' ') {
 679                 pos++;
 680             }
 681             if (!*pos) { /* handle trailing "; " */
 682                 break;
 683             }
 684             if(numKeywords == maxKeywords) {
 685                 *status = U_INTERNAL_PROGRAM_ERROR;
 686                 return 0;
 687             }
 688             equalSign = uprv_strchr(pos, '=');
 689             semicolon = uprv_strchr(pos, ';');
 690             /* lack of '=' [foo@currency] is illegal */
 691             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
 692             if(!equalSign || (semicolon && semicolon<equalSign)) {
 693                 *status = U_INVALID_FORMAT_ERROR;
 694                 return 0;
 695             }
 696             /* need to normalize both keyword and keyword name */
 697             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
 698                 /* keyword name too long for internal buffer */
 699                 *status = U_INTERNAL_PROGRAM_ERROR;
 700                 return 0;
 701             }
 702             for(i = 0, n = 0; i < equalSign - pos; ++i) {
 703                 if (pos[i] != ' ') {
 704                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
 705                 }
 706             }
 707
 708             /* zero-length keyword is an error. */
 709             if (n == 0) {
 710                 *status = U_INVALID_FORMAT_ERROR;
 711                 return 0;
 712             }
 713
 714             keywordList[numKeywords].keyword[n] = 0;
 715             keywordList[numKeywords].keywordLen = n;
 716             /* now grab the value part. First we skip the '=' */
 717             equalSign++;
 718             /* then we leading spaces */
 719             while(*equalSign == ' ') {
 720                 equalSign++;
 721             }
 722
 723             /* Premature end or zero-length value */
 724             if (!*equalSign || equalSign == semicolon) {
 725                 *status = U_INVALID_FORMAT_ERROR;
 726                 return 0;
 727             }
 728
 729             keywordList[numKeywords].valueStart = equalSign;
 730
 731             pos = semicolon;
 732             i = 0;
 733             if(pos) {
 734                 while(*(pos - i - 1) == ' ') {
 735                     i++;
 736                 }
 737                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
 738                 pos++;
 739             } else {
 740                 i = (int32_t)uprv_strlen(equalSign);
 741                 while(i && equalSign[i-1] == ' ') {
 742                     i--;
 743                 }
 744                 keywordList[numKeywords].valueLen = i;
 745             }
 746             /* If this is a duplicate keyword, then ignore it */
 747             for (j=0; j<numKeywords; ++j) {
 748                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
 749                     duplicate = TRUE;
 750                     break;
 751                 }
 752             }
 753             if (!duplicate) {
 754                 ++numKeywords;
 755             }
 756         } while(pos);
 757
 758         /* Handle addKeyword/addValue. */
 759         if (addKeyword != NULL) {
 760             UBool duplicate = FALSE;
 761             U_ASSERT(addValue != NULL);
 762             /* Search for duplicate; if found, do nothing. Explicit keyword
 763                overrides addKeyword. */
 764             for (j=0; j<numKeywords; ++j) {
 765                 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
 766                     duplicate = TRUE;
 767                     break;
 768                 }
 769             }
 770             if (!duplicate) {
 771                 if (numKeywords == maxKeywords) {
 772                     *status = U_INTERNAL_PROGRAM_ERROR;
 773                     return 0;
 774                 }
 775                 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
 776                 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
 777                 keywordList[numKeywords].valueStart = addValue;
 778                 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
 779                 ++numKeywords;
 780             }
 781         } else {
 782             U_ASSERT(addValue == NULL);
 783         }
 784
 785         /* now we have a list of keywords */
 786         /* we need to sort it */
 787         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
 788
 789         /* Now construct the keyword part */
 790         for(i = 0; i < numKeywords; i++) {
 791             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
 792                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
 793                 if(valuesToo) {
 794                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
 795                 } else {
 796                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
 797                 }
 798             }
 799             keywordsLen += keywordList[i].keywordLen + 1;
 800             if(valuesToo) {
 801                 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
 802                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
 803                 }
 804                 keywordsLen += keywordList[i].valueLen;
 805
 806                 if(i < numKeywords - 1) {
 807                     if(keywordsLen < keywordCapacity) {
 808                         keywords[keywordsLen] = ';';
 809                     }
 810                     keywordsLen++;
 811                 }
 812             }
 813             if(values) {
 814                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
 815                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
 816                     values[valuesLen + keywordList[i].valueLen] = 0;
 817                 }
 818                 valuesLen += keywordList[i].valueLen + 1;
 819             }
 820         }
 821         if(values) {
 822             values[valuesLen] = 0;
 823             if(valLen) {
 824                 *valLen = valuesLen;
 825             }
 826         }
 827         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
 828     } else {
 829         return 0;
 830     }
 831 }
 832
 833 U_CFUNC int32_t
 834 locale_getKeywords(const char *localeID,
 835                    char prev,
 836                    char *keywords, int32_t keywordCapacity,
 837                    char *values, int32_t valuesCapacity, int32_t *valLen,
 838                    UBool valuesToo,
 839                    UErrorCode *status) {
 840     return _getKeywords(localeID, prev, keywords, keywordCapacity,
 841                         values, valuesCapacity, valLen, valuesToo,
 842                         NULL, NULL, status);
 843 }
 844
 845 U_CAPI int32_t U_EXPORT2
 846 uloc_getKeywordValue(const char* localeID,
 847                      const char* keywordName,
 848                      char* buffer, int32_t bufferCapacity,
 849                      UErrorCode* status)
 850 {
 851     const char* startSearchHere = NULL;
 852     const char* nextSeparator = NULL;
 853     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 854     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 855     int32_t result = 0;
 856
 857     if(status && U_SUCCESS(*status) && localeID) {
 858       char tempBuffer[ULOC_FULLNAME_CAPACITY];
 859       const char* tmpLocaleID;
 860
 861       if (keywordName == NULL || keywordName[0] == 0) {
 862         *status = U_ILLEGAL_ARGUMENT_ERROR;
 863         return 0;
 864       }
 865
 866       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 867       if(U_FAILURE(*status)) {
 868         return 0;
 869       }
 870
 871       if (_hasBCP47Extension(localeID)) {
 872           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
 873       } else {
 874           tmpLocaleID=localeID;
 875       }
 876
 877       startSearchHere = locale_getKeywordsStart(tmpLocaleID);
 878       if(startSearchHere == NULL) {
 879           /* no keywords, return at once */
 880           return 0;
 881       }
 882
 883       /* find the first keyword */
 884       while(startSearchHere) {
 885           const char* keyValueTail;
 886           int32_t keyValueLen;
 887
 888           startSearchHere++; /* skip @ or ; */
 889           nextSeparator = uprv_strchr(startSearchHere, '=');
 890           if(!nextSeparator) {
 891               *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
 892               return 0;
 893           }
 894           /* strip leading & trailing spaces (TC decided to tolerate these) */
 895           while(*startSearchHere == ' ') {
 896               startSearchHere++;
 897           }
 898           keyValueTail = nextSeparator;
 899           while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
 900               keyValueTail--;
 901           }
 902           /* now keyValueTail points to first char after the keyName */
 903           /* copy & normalize keyName from locale */
 904           if (startSearchHere == keyValueTail) {
 905               *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
 906               return 0;
 907           }
 908           keyValueLen = 0;
 909           while (startSearchHere < keyValueTail) {
 910             if (!UPRV_ISALPHANUM(*startSearchHere)) {
 911               *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
 912               return 0;
 913             }
 914             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
 915               localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
 916             } else {
 917               /* keyword name too long for internal buffer */
 918               *status = U_INTERNAL_PROGRAM_ERROR;
 919               return 0;
 920             }
 921           }
 922           localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
 923
 924           startSearchHere = uprv_strchr(nextSeparator, ';');
 925
 926           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
 927                /* current entry matches the keyword. */
 928              nextSeparator++; /* skip '=' */
 929               /* First strip leading & trailing spaces (TC decided to tolerate these) */
 930               while(*nextSeparator == ' ') {
 931                 nextSeparator++;
 932               }
 933               keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
 934               while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
 935                 keyValueTail--;
 936               }
 937               /* Now copy the value, but check well-formedness */
 938               if (nextSeparator == keyValueTail) {
 939                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
 940                 return 0;
 941               }
 942               keyValueLen = 0;
 943               while (nextSeparator < keyValueTail) {
 944                 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
 945                   *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
 946                   return 0;
 947                 }
 948                 if (keyValueLen < bufferCapacity) {
 949                   /* Should we lowercase value to return here? Tests expect as-is. */
 950                   buffer[keyValueLen++] = *nextSeparator++;
 951                 } else { /* keep advancing so we return correct length in case of overflow */
 952                   keyValueLen++;
 953                   nextSeparator++;
 954                 }
 955               }
 956               result = u_terminateChars(buffer, bufferCapacity, keyValueLen, status);
 957               return result;
 958           }
 959       }
 960     }
 961     return 0;
 962 }
 963
 964 U_CAPI int32_t U_EXPORT2
 965 uloc_setKeywordValue(const char* keywordName,
 966                      const char* keywordValue,
 967                      char* buffer, int32_t bufferCapacity,
 968                      UErrorCode* status)
 969 {
 970     /* TODO: sorting. removal. */
 971     int32_t keywordNameLen;
 972     int32_t keywordValueLen;
 973     int32_t bufLen;
 974     int32_t needLen = 0;
 975     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 976     char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
 977     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 978     int32_t rc;
 979     char* nextSeparator = NULL;
 980     char* nextEqualsign = NULL;
 981     char* startSearchHere = NULL;
 982     char* keywordStart = NULL;
 983     CharString updatedKeysAndValues;
 984     int32_t updatedKeysAndValuesLen;
 985     UBool handledInputKeyAndValue = FALSE;
 986     char keyValuePrefix = '@';
 987
 988     if(U_FAILURE(*status)) {
 989         return -1;
 990     }
 991     if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
 992         *status = U_ILLEGAL_ARGUMENT_ERROR;
 993         return 0;
 994     }
 995     bufLen = (int32_t)uprv_strlen(buffer);
 996     if(bufferCapacity<bufLen) {
 997         /* The capacity is less than the length?! Is this NULL terminated? */
 998         *status = U_ILLEGAL_ARGUMENT_ERROR;
 999         return 0;
1000     }
1001     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
1002     if(U_FAILURE(*status)) {
1003         return 0;
1004     }
1005
1006     keywordValueLen = 0;
1007     if(keywordValue) {
1008         while (*keywordValue != 0) {
1009             if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
1010                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
1011                 return 0;
1012             }
1013             if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
1014                 /* Should we force lowercase in value to set? */
1015                 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
1016             } else {
1017                 /* keywordValue too long for internal buffer */
1018                 *status = U_INTERNAL_PROGRAM_ERROR;
1019                 return 0;
1020             }
1021         }
1022     }
1023     keywordValueBuffer[keywordValueLen] = 0; /* terminate */
1024
1025     startSearchHere = (char*)locale_getKeywordsStart(buffer);
1026     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
1027         if(keywordValueLen == 0) { /* no keywords = nothing to remove */
1028             return bufLen;
1029         }
1030
1031         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1032         if(startSearchHere) { /* had a single @ */
1033             needLen--; /* already had the @ */
1034             /* startSearchHere points at the @ */
1035         } else {
1036             startSearchHere=buffer+bufLen;
1037         }
1038         if(needLen >= bufferCapacity) {
1039             *status = U_BUFFER_OVERFLOW_ERROR;
1040             return needLen; /* no change */
1041         }
1042         *startSearchHere++ = '@';
1043         uprv_strcpy(startSearchHere, keywordNameBuffer);
1044         startSearchHere += keywordNameLen;
1045         *startSearchHere++ = '=';
1046         uprv_strcpy(startSearchHere, keywordValueBuffer);
1047         return needLen;
1048     } /* end shortcut - no @ */
1049
1050     keywordStart = startSearchHere;
1051     /* search for keyword */
1052     while(keywordStart) {
1053         const char* keyValueTail;
1054         int32_t keyValueLen;
1055
1056         keywordStart++; /* skip @ or ; */
1057         nextEqualsign = uprv_strchr(keywordStart, '=');
1058         if (!nextEqualsign) {
1059             *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
1060             return 0;
1061         }
1062         /* strip leading & trailing spaces (TC decided to tolerate these) */
1063         while(*keywordStart == ' ') {
1064             keywordStart++;
1065         }
1066         keyValueTail = nextEqualsign;
1067         while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
1068             keyValueTail--;
1069         }
1070         /* now keyValueTail points to first char after the keyName */
1071         /* copy & normalize keyName from locale */
1072         if (keywordStart == keyValueTail) {
1073             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
1074             return 0;
1075         }
1076         keyValueLen = 0;
1077         while (keywordStart < keyValueTail) {
1078             if (!UPRV_ISALPHANUM(*keywordStart)) {
1079                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
1080                 return 0;
1081             }
1082             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
1083                 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
1084             } else {
1085                 /* keyword name too long for internal buffer */
1086                 *status = U_INTERNAL_PROGRAM_ERROR;
1087                 return 0;
1088             }
1089         }
1090         localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
1091
1092         nextSeparator = uprv_strchr(nextEqualsign, ';');
1093
1094         /* start processing the value part */
1095         nextEqualsign++; /* skip '=' */
1096         /* First strip leading & trailing spaces (TC decided to tolerate these) */
1097         while(*nextEqualsign == ' ') {
1098             nextEqualsign++;
1099         }
1100         keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1101         while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
1102             keyValueTail--;
1103         }
1104         if (nextEqualsign == keyValueTail) {
1105             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
1106             return 0;
1107         }
1108
1109         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1110         if(rc == 0) {
1111             /* Current entry matches the input keyword. Update the entry */
1112             if(keywordValueLen > 0) { /* updating a value */
1113                 updatedKeysAndValues.append(keyValuePrefix, *status);
1114                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1115                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1116                 updatedKeysAndValues.append('=', *status);
1117                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1118             } /* else removing this entry, don't emit anything */
1119             handledInputKeyAndValue = TRUE;
1120         } else {
1121            /* input keyword sorts earlier than current entry, add before current entry */
1122             if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
1123                 /* insert new entry at this location */
1124                 updatedKeysAndValues.append(keyValuePrefix, *status);
1125                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1126                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1127                 updatedKeysAndValues.append('=', *status);
1128                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1129                 handledInputKeyAndValue = TRUE;
1130             }
1131             /* copy the current entry */
1132             updatedKeysAndValues.append(keyValuePrefix, *status);
1133             keyValuePrefix = ';'; /* for any subsequent key-value pair */
1134             updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1135             updatedKeysAndValues.append('=', *status);
1136             updatedKeysAndValues.append(nextEqualsign, keyValueTail-nextEqualsign, *status);
1137         }
1138         if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
1139             /* append new entry at the end, it sorts later than existing entries */
1140             updatedKeysAndValues.append(keyValuePrefix, *status);
1141             /* skip keyValuePrefix update, no subsequent key-value pair */
1142             updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1143             updatedKeysAndValues.append('=', *status);
1144             updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1145             handledInputKeyAndValue = TRUE;
1146         }
1147         keywordStart = nextSeparator;
1148     } /* end loop searching */
1149
1150     /* Any error from updatedKeysAndValues.append above would be internal and not due to
1151      * problems with the passed-in locale. So if we did encounter problems with the
1152      * passed-in locale above, those errors took precedence and overrode any error
1153      * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1154      * are errors here they are from updatedKeysAndValues.append; they do cause an
1155      * error return but the passed-in locale is unmodified and the original bufLen is
1156      * returned.
1157      */
1158     if (!handledInputKeyAndValue || U_FAILURE(*status)) {
1159         /* if input key/value specified removal of a keyword not present in locale, or
1160          * there was an error in CharString.append, leave original locale alone. */
1161         return bufLen;
1162     }
1163
1164     updatedKeysAndValuesLen = updatedKeysAndValues.length();
1165     /* needLen = length of the part before '@' + length of updated key-value part including '@' */
1166     needLen = (int32_t)(startSearchHere - buffer) + updatedKeysAndValuesLen;
1167     if(needLen >= bufferCapacity) {
1168         *status = U_BUFFER_OVERFLOW_ERROR;
1169         return needLen; /* no change */
1170     }
1171     if (updatedKeysAndValuesLen > 0) {
1172         uprv_strncpy(startSearchHere, updatedKeysAndValues.data(), updatedKeysAndValuesLen);
1173     }
1174     buffer[needLen]=0;
1175     return needLen;
1176 }
1177
1178 /* ### ID parsing implementation **************************************************/
1179
/* TRUE for the letters that may start a special prefix: 'x', 'X', 'i' or 'I'.
 * The argument is parenthesized so that expression arguments expand correctly. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1190
1191 static char* _strnchr(const char* str, int32_t len, char c) {
1192     U_ASSERT(str != 0 && len >= 0);
1193     while (len-- != 0) {
1194         char d = *str;
1195         if (d == c) {
1196             return (char*) str;
1197         } else if (d == 0) {
1198             break;
1199         }
1200         ++str;
1201     }
1202     return NULL;
1203 }
1204
1205 /**
1206  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1207  * a NULL entry, followed by more entries, and a second NULL entry.
1208  *
1209  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1210  * COUNTRIES_3.
1211  */
1212 static int16_t _findIndex(const char* const* list, const char* key)
1213 {
1214     const char* const* anchor = list;
1215     int32_t pass = 0;
1216
1217     /* Make two passes through two NULL-terminated arrays at 'list' */
1218     while (pass++ < 2) {
1219         while (*list) {
1220             if (uprv_strcmp(key, *list) == 0) {
1221                 return (int16_t)(list - anchor);
1222             }
1223             list++;
1224         }
1225         ++list;     /* skip final NULL *CWB*/
1226     }
1227     return -1;
1228 }
1229
1230 /* count the length of src while copying it to dest; return strlen(src) */
1231 static inline int32_t
1232 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1233     const char *anchor;
1234     char c;
1235
1236     anchor=src;
1237     for(;;) {
1238         if((c=*src)==0) {
1239             return (int32_t)(src-anchor);
1240         }
1241         if(destCapacity<=0) {
1242             return (int32_t)((src-anchor)+uprv_strlen(src));
1243         }
1244         ++src;
1245         *dest++=c;
1246         --destCapacity;
1247     }
1248 }
1249
1250 U_CFUNC const char*
1251 uloc_getCurrentCountryID(const char* oldID){
1252     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1253     if (offset >= 0) {
1254         return REPLACEMENT_COUNTRIES[offset];
1255     }
1256     return oldID;
1257 }
1258 U_CFUNC const char*
1259 uloc_getCurrentLanguageID(const char* oldID){
1260     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1261     if (offset >= 0) {
1262         return REPLACEMENT_LANGUAGES[offset];
1263     }
1264     return oldID;
1265 }
1266 /*
1267  * the internal functions _getLanguage(), _getCountry(), _getVariant()
1268  * avoid duplicating code to handle the earlier locale ID pieces
1269  * in the functions for the later ones by
1270  * setting the *pEnd pointer to where they stopped parsing
1271  *
1272  * TODO try to use this in Locale
1273  */
1274 U_CFUNC int32_t
1275 ulocimp_getLanguage(const char *localeID,
1276                     char *language, int32_t languageCapacity,
1277                     const char **pEnd) {
1278     int32_t i=0;
1279     int32_t offset;
1280     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1281
1282     /* if it starts with i- or x- then copy that prefix */
1283     if(_isIDPrefix(localeID)) {
1284         if(i<languageCapacity) {
1285             language[i]=(char)uprv_tolower(*localeID);
1286         }
1287         if(i<languageCapacity) {
1288             language[i+1]='-';
1289         }
1290         i+=2;
1291         localeID+=2;
1292     }
1293
1294     /* copy the language as far as possible and count its length */
1295     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1296         if(i<languageCapacity) {
1297             language[i]=(char)uprv_tolower(*localeID);
1298         }
1299         if(i<3) {
1300             U_ASSERT(i>=0);
1301             lang[i]=(char)uprv_tolower(*localeID);
1302         }
1303         i++;
1304         localeID++;
1305     }
1306
1307     if(i==3) {
1308         /* convert 3 character code to 2 character code if possible *CWB*/
1309         offset=_findIndex(LANGUAGES_3, lang);
1310         if(offset>=0) {
1311             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1312         }
1313     }
1314
1315     if(pEnd!=NULL) {
1316         *pEnd=localeID;
1317     }
1318     return i;
1319 }
1320
1321 U_CFUNC int32_t
1322 ulocimp_getScript(const char *localeID,
1323                   char *script, int32_t scriptCapacity,
1324                   const char **pEnd)
1325 {
1326     int32_t idLen = 0;
1327
1328     if (pEnd != NULL) {
1329         *pEnd = localeID;
1330     }
1331
1332     /* copy the second item as far as possible and count its length */
1333     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1334             && uprv_isASCIILetter(localeID[idLen])) {
1335         idLen++;
1336     }
1337
1338     /* If it's exactly 4 characters long, then it's a script and not a country. */
1339     if (idLen == 4) {
1340         int32_t i;
1341         if (pEnd != NULL) {
1342             *pEnd = localeID+idLen;
1343         }
1344         if(idLen > scriptCapacity) {
1345             idLen = scriptCapacity;
1346         }
1347         if (idLen >= 1) {
1348             script[0]=(char)uprv_toupper(*(localeID++));
1349         }
1350         for (i = 1; i < idLen; i++) {
1351             script[i]=(char)uprv_tolower(*(localeID++));
1352         }
1353     }
1354     else {
1355         idLen = 0;
1356     }
1357     return idLen;
1358 }
1359
1360 U_CFUNC int32_t
1361 ulocimp_getCountry(const char *localeID,
1362                    char *country, int32_t countryCapacity,
1363                    const char **pEnd)
1364 {
1365     int32_t idLen=0;
1366     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1367     int32_t offset;
1368
1369     /* copy the country as far as possible and count its length */
1370     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1371         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1372             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1373         }
1374         idLen++;
1375     }
1376
1377     /* the country should be either length 2 or 3 */
1378     if (idLen == 2 || idLen == 3) {
1379         UBool gotCountry = FALSE;
1380         /* convert 3 character code to 2 character code if possible *CWB*/
1381         if(idLen==3) {
1382             offset=_findIndex(COUNTRIES_3, cnty);
1383             if(offset>=0) {
1384                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1385                 gotCountry = TRUE;
1386             }
1387         }
1388         if (!gotCountry) {
1389             int32_t i = 0;
1390             for (i = 0; i < idLen; i++) {
1391                 if (i < countryCapacity) {
1392                     country[i]=(char)uprv_toupper(localeID[i]);
1393                 }
1394             }
1395         }
1396         localeID+=idLen;
1397     } else {
1398         idLen = 0;
1399     }
1400
1401     if(pEnd!=NULL) {
1402         *pEnd=localeID;
1403     }
1404
1405     return idLen;
1406 }
1407
1408 /**
1409  * @param needSeparator if true, then add leading '_' if any variants
1410  * are added to 'variant'
1411  */
1412 static int32_t
1413 _getVariantEx(const char *localeID,
1414               char prev,
1415               char *variant, int32_t variantCapacity,
1416               UBool needSeparator) {
1417     int32_t i=0;
1418
1419     /* get one or more variant tags and separate them with '_' */
1420     if(_isIDSeparator(prev)) {
1421         /* get a variant string after a '-' or '_' */
1422         while(!_isTerminator(*localeID)) {
1423             if (needSeparator) {
1424                 if (i<variantCapacity) {
1425                     variant[i] = '_';
1426                 }
1427                 ++i;
1428                 needSeparator = FALSE;
1429             }
1430             if(i<variantCapacity) {
1431                 variant[i]=(char)uprv_toupper(*localeID);
1432                 if(variant[i]=='-') {
1433                     variant[i]='_';
1434                 }
1435             }
1436             i++;
1437             localeID++;
1438         }
1439     }
1440
1441     /* if there is no variant tag after a '-' or '_' then look for '@' */
1442     if(i==0) {
1443         if(prev=='@') {
1444             /* keep localeID */
1445         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1446             ++localeID; /* point after the '@' */
1447         } else {
1448             return 0;
1449         }
1450         while(!_isTerminator(*localeID)) {
1451             if (needSeparator) {
1452                 if (i<variantCapacity) {
1453                     variant[i] = '_';
1454                 }
1455                 ++i;
1456                 needSeparator = FALSE;
1457             }
1458             if(i<variantCapacity) {
1459                 variant[i]=(char)uprv_toupper(*localeID);
1460                 if(variant[i]=='-' || variant[i]==',') {
1461                     variant[i]='_';
1462                 }
1463             }
1464             i++;
1465             localeID++;
1466         }
1467     }
1468
1469     return i;
1470 }
1471
1472 static int32_t
1473 _getVariant(const char *localeID,
1474             char prev,
1475             char *variant, int32_t variantCapacity) {
1476     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1477 }
1478
1479 /**
1480  * Delete ALL instances of a variant from the given list of one or
1481  * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1482  * @param variants the source string of one or more variants,
1483  * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1484  * terminated; if it is, trailing zero will NOT be maintained.
1485  * @param variantsLen length of variants
1486  * @param toDelete variant to delete, without separators, e.g.  "EURO"
1487  * or "PREEURO"; not zero terminated
1488  * @param toDeleteLen length of toDelete
1489  * @return number of characters deleted from variants
1490  */
1491 static int32_t
1492 _deleteVariant(char* variants, int32_t variantsLen,
1493                const char* toDelete, int32_t toDeleteLen)
1494 {
1495     int32_t delta = 0; /* number of chars deleted */
1496     for (;;) {
1497         UBool flag = FALSE;
1498         if (variantsLen < toDeleteLen) {
1499             return delta;
1500         }
1501         if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1502             (variantsLen == toDeleteLen ||
1503              (flag=(variants[toDeleteLen] == '_'))))
1504         {
1505             int32_t d = toDeleteLen + (flag?1:0);
1506             variantsLen -= d;
1507             delta += d;
1508             if (variantsLen > 0) {
1509                 uprv_memmove(variants, variants+d, variantsLen);
1510             }
1511         } else {
1512             char* p = _strnchr(variants, variantsLen, '_');
1513             if (p == NULL) {
1514                 return delta;
1515             }
1516             ++p;
1517             variantsLen -= (int32_t)(p - variants);
1518             variants = p;
1519         }
1520     }
1521 }
1522
1523 /* Keyword enumeration */
1524
/* Iteration state shared by the uloc_kw_* enumeration callbacks. */
typedef struct UKeywordsContext {
    char *keywords;   /* start of the list of zero-terminated keywords; freed on close */
    char *current;    /* keyword that the next call to "next" will return */
} UKeywordsContext;
1529
1530 U_CDECL_BEGIN
1531
1532 static void U_CALLCONV
1533 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1534     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1535     uprv_free(enumerator->context);
1536     uprv_free(enumerator);
1537 }
1538
1539 static int32_t U_CALLCONV
1540 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1541     char *kw = ((UKeywordsContext *)en->context)->keywords;
1542     int32_t result = 0;
1543     while(*kw) {
1544         result++;
1545         kw += uprv_strlen(kw)+1;
1546     }
1547     return result;
1548 }
1549
1550 static const char * U_CALLCONV
1551 uloc_kw_nextKeyword(UEnumeration* en,
1552                     int32_t* resultLength,
1553                     UErrorCode* /*status*/) {
1554     const char* result = ((UKeywordsContext *)en->context)->current;
1555     int32_t len = 0;
1556     if(*result) {
1557         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1558         ((UKeywordsContext *)en->context)->current += len+1;
1559     } else {
1560         result = NULL;
1561     }
1562     if (resultLength) {
1563         *resultLength = len;
1564     }
1565     return result;
1566 }
1567
1568 static void U_CALLCONV
1569 uloc_kw_resetKeywords(UEnumeration* en,
1570                       UErrorCode* /*status*/) {
1571     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1572 }
1573
1574 U_CDECL_END
1575
1576
1577 static const UEnumeration gKeywordsEnum = {
1578     NULL,
1579     NULL,
1580     uloc_kw_closeKeywords,
1581     uloc_kw_countKeywords,
1582     uenum_unextDefault,
1583     uloc_kw_nextKeyword,
1584     uloc_kw_resetKeywords
1585 };
1586
1587 U_CAPI UEnumeration* U_EXPORT2
1588 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1589 {
1590     UKeywordsContext *myContext = NULL;
1591     UEnumeration *result = NULL;
1592
1593     if(U_FAILURE(*status)) {
1594         return NULL;
1595     }
1596     result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1597     /* Null pointer test */
1598     if (result == NULL) {
1599         *status = U_MEMORY_ALLOCATION_ERROR;
1600         return NULL;
1601     }
1602     uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1603     myContext = static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)));
1604     if (myContext == NULL) {
1605         *status = U_MEMORY_ALLOCATION_ERROR;
1606         uprv_free(result);
1607         return NULL;
1608     }
1609     myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1610     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1611     myContext->keywords[keywordListSize] = 0;
1612     myContext->current = myContext->keywords;
1613     result->context = myContext;
1614     return result;
1615 }
1616
1617 U_CAPI UEnumeration* U_EXPORT2
1618 uloc_openKeywords(const char* localeID,
1619                         UErrorCode* status)
1620 {
1621     int32_t i=0;
1622     char keywords[256];
1623     int32_t keywordsCapacity = 256;
1624     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1625     const char* tmpLocaleID;
1626
1627     if(status==NULL || U_FAILURE(*status)) {
1628         return 0;
1629     }
1630
1631     if (_hasBCP47Extension(localeID)) {
1632         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1633     } else {
1634         if (localeID==NULL) {
1635            localeID=uloc_getDefault();
1636         }
1637         tmpLocaleID=localeID;
1638     }
1639
1640     /* Skip the language */
1641     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1642     if(_isIDSeparator(*tmpLocaleID)) {
1643         const char *scriptID;
1644         /* Skip the script if available */
1645         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1646         if(scriptID != tmpLocaleID+1) {
1647             /* Found optional script */
1648             tmpLocaleID = scriptID;
1649         }
1650         /* Skip the Country */
1651         if (_isIDSeparator(*tmpLocaleID)) {
1652             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1653             if(_isIDSeparator(*tmpLocaleID)) {
1654                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1655             }
1656         }
1657     }
1658
1659     /* keywords are located after '@' */
1660     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1661         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1662     }
1663
1664     if(i) {
1665         return uloc_openKeywordList(keywords, i, status);
1666     } else {
1667         return NULL;
1668     }
1669 }
1670
1671
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/*
 * True if any bit of 'mask' is set in 'options'.  Both arguments are
 * parenthesized so that compound expressions such as (A | B) are tested
 * as a whole.
 */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The "i-default" language tag; deliberately not zero terminated. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1680
1681 /**
1682  * Canonicalize the given localeID, to level 1 or to level 2,
1683  * depending on the options.  To specify level 1, pass in options=0.
1684  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1685  *
1686  * This is the code underlying uloc_getName and uloc_canonicalize.
1687  */
1688 static int32_t
1689 _canonicalize(const char* localeID,
1690               char* result,
1691               int32_t resultCapacity,
1692               uint32_t options,
1693               UErrorCode* err) {
1694     int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1695     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1696     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1697     const char* origLocaleID;
1698     const char* tmpLocaleID;
1699     const char* keywordAssign = NULL;
1700     const char* separatorIndicator = NULL;
1701     const char* addKeyword = NULL;
1702     const char* addValue = NULL;
1703     char* name;
1704     char* variant = NULL; /* pointer into name, or NULL */
1705
1706     if (U_FAILURE(*err)) {
1707         return 0;
1708     }
1709
1710     if (_hasBCP47Extension(localeID)) {
1711         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1712     } else {
1713         if (localeID==NULL) {
1714            localeID=uloc_getDefault();
1715         }
1716         tmpLocaleID=localeID;
1717     }
1718
1719     origLocaleID=tmpLocaleID;
1720
1721     /* if we are doing a full canonicalization, then put results in
1722        localeBuffer, if necessary; otherwise send them to result. */
1723     if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1724         (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
1725         name = localeBuffer;
1726         nameCapacity = (int32_t)sizeof(localeBuffer);
1727     } else {
1728         name = result;
1729         nameCapacity = resultCapacity;
1730     }
1731
1732     /* get all pieces, one after another, and separate with '_' */
1733     len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1734
1735     if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1736         const char *d = uloc_getDefault();
1737
1738         len = (int32_t)uprv_strlen(d);
1739
1740         if (name != NULL) {
1741             uprv_strncpy(name, d, len);
1742         }
1743     } else if(_isIDSeparator(*tmpLocaleID)) {
1744         const char *scriptID;
1745
1746         ++fieldCount;
1747         if(len<nameCapacity) {
1748             name[len]='_';
1749         }
1750         ++len;
1751
1752         scriptSize=ulocimp_getScript(tmpLocaleID+1,
1753             (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
1754         if(scriptSize > 0) {
1755             /* Found optional script */
1756             tmpLocaleID = scriptID;
1757             ++fieldCount;
1758             len+=scriptSize;
1759             if (_isIDSeparator(*tmpLocaleID)) {
1760                 /* If there is something else, then we add the _ */
1761                 if(len<nameCapacity) {
1762                     name[len]='_';
1763                 }
1764                 ++len;
1765             }
1766         }
1767
1768         if (_isIDSeparator(*tmpLocaleID)) {
1769             const char *cntryID;
1770             int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
1771                 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
1772             if (cntrySize > 0) {
1773                 /* Found optional country */
1774                 tmpLocaleID = cntryID;
1775                 len+=cntrySize;
1776             }
1777             if(_isIDSeparator(*tmpLocaleID)) {
1778                 /* If there is something else, then we add the _  if we found country before. */
1779                 if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
1780                     ++fieldCount;
1781                     if(len<nameCapacity) {
1782                         name[len]='_';
1783                     }
1784                     ++len;
1785                 }
1786
1787                 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
1788                     (len<nameCapacity ? name+len : NULL), nameCapacity-len);
1789                 if (variantSize > 0) {
1790                     variant = len<nameCapacity ? name+len : NULL;
1791                     len += variantSize;
1792                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1793                 }
1794             }
1795         }
1796     }
1797
1798     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1799     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1800         UBool done = FALSE;
1801         do {
1802             char c = *tmpLocaleID;
1803             switch (c) {
1804             case 0:
1805             case '@':
1806                 done = TRUE;
1807                 break;
1808             default:
1809                 if (len<nameCapacity) {
1810                     name[len] = c;
1811                 }
1812                 ++len;
1813                 ++tmpLocaleID;
1814                 break;
1815             }
1816         } while (!done);
1817     }
1818
1819     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1820        After this, tmpLocaleID either points to '@' or is NULL */
1821     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1822         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1823         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1824     }
1825
1826     /* Copy POSIX-style variant, if any [mr@FOO] */
1827     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1828         tmpLocaleID != NULL && keywordAssign == NULL) {
1829         for (;;) {
1830             char c = *tmpLocaleID;
1831             if (c == 0) {
1832                 break;
1833             }
1834             if (len<nameCapacity) {
1835                 name[len] = c;
1836             }
1837             ++len;
1838             ++tmpLocaleID;
1839         }
1840     }
1841
1842     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1843         /* Handle @FOO variant if @ is present and not followed by = */
1844         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1845             int32_t posixVariantSize;
1846             /* Add missing '_' if needed */
1847             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1848                 do {
1849                     if(len<nameCapacity) {
1850                         name[len]='_';
1851                     }
1852                     ++len;
1853                     ++fieldCount;
1854                 } while(fieldCount<2);
1855             }
1856             posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1857                                              (UBool)(variantSize > 0));
1858             if (posixVariantSize > 0) {
1859                 if (variant == NULL) {
1860                     variant = name+len;
1861                 }
1862                 len += posixVariantSize;
1863                 variantSize += posixVariantSize;
1864             }
1865         }
1866
1867         /* Handle generic variants first */
1868         if (variant) {
1869             for (j=0; j<UPRV_LENGTHOF(VARIANT_MAP); j++) {
1870                 const char* variantToCompare = VARIANT_MAP[j].variant;
1871                 int32_t n = (int32_t)uprv_strlen(variantToCompare);
1872                 int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1873                 len -= variantLen;
1874                 if (variantLen > 0) {
1875                     if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
1876                         --len;
1877                     }
1878                     addKeyword = VARIANT_MAP[j].keyword;
1879                     addValue = VARIANT_MAP[j].value;
1880                     break;
1881                 }
1882             }
1883             if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
1884                 --len;
1885             }
1886         }
1887
1888         /* Look up the ID in the canonicalization map */
1889         for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1890             const char* id = CANONICALIZE_MAP[j].id;
1891             int32_t n = (int32_t)uprv_strlen(id);
1892             if (len == n && uprv_strncmp(name, id, n) == 0) {
1893                 if (n == 0 && tmpLocaleID != NULL) {
1894                     break; /* Don't remap "" if keywords present */
1895                 }
1896                 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1897                 if (CANONICALIZE_MAP[j].keyword) {
1898                     addKeyword = CANONICALIZE_MAP[j].keyword;
1899                     addValue = CANONICALIZE_MAP[j].value;
1900                 }
1901                 break;
1902             }
1903         }
1904     }
1905
1906     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1907         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1908             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1909             if(len<nameCapacity) {
1910                 name[len]='@';
1911             }
1912             ++len;
1913             ++fieldCount;
1914             len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
1915                                 NULL, 0, NULL, TRUE, addKeyword, addValue, err);
1916         } else if (addKeyword != NULL) {
1917             U_ASSERT(addValue != NULL && len < nameCapacity);
1918             /* inelegant but works -- later make _getKeywords do this? */
1919             len += _copyCount(name+len, nameCapacity-len, "@");
1920             len += _copyCount(name+len, nameCapacity-len, addKeyword);
1921             len += _copyCount(name+len, nameCapacity-len, "=");
1922             len += _copyCount(name+len, nameCapacity-len, addValue);
1923         }
1924     }
1925
1926     if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1927         uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1928     }
1929
1930     return u_terminateChars(result, resultCapacity, len, err);
1931 }
1932
1933 /* ### ID parsing API **************************************************/
1934
1935 U_CAPI int32_t  U_EXPORT2
1936 uloc_getParent(const char*    localeID,
1937                char* parent,
1938                int32_t parentCapacity,
1939                UErrorCode* err)
1940 {
1941     const char *lastUnderscore;
1942     int32_t i;
1943
1944     if (U_FAILURE(*err))
1945         return 0;
1946
1947     if (localeID == NULL)
1948         localeID = uloc_getDefault();
1949
1950     lastUnderscore=uprv_strrchr(localeID, '_');
1951     if(lastUnderscore!=NULL) {
1952         i=(int32_t)(lastUnderscore-localeID);
1953     } else {
1954         i=0;
1955     }
1956
1957     if(i>0 && parent != localeID) {
1958         uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1959     }
1960     return u_terminateChars(parent, parentCapacity, i, err);
1961 }
1962
1963 U_CAPI int32_t U_EXPORT2
1964 uloc_getLanguage(const char*    localeID,
1965          char* language,
1966          int32_t languageCapacity,
1967          UErrorCode* err)
1968 {
1969     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1970     int32_t i=0;
1971
1972     if (err==NULL || U_FAILURE(*err)) {
1973         return 0;
1974     }
1975
1976     if(localeID==NULL) {
1977         localeID=uloc_getDefault();
1978     }
1979
1980     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1981     return u_terminateChars(language, languageCapacity, i, err);
1982 }
1983
1984 U_CAPI int32_t U_EXPORT2
1985 uloc_getScript(const char*    localeID,
1986          char* script,
1987          int32_t scriptCapacity,
1988          UErrorCode* err)
1989 {
1990     int32_t i=0;
1991
1992     if(err==NULL || U_FAILURE(*err)) {
1993         return 0;
1994     }
1995
1996     if(localeID==NULL) {
1997         localeID=uloc_getDefault();
1998     }
1999
2000     /* skip the language */
2001     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
2002     if(_isIDSeparator(*localeID)) {
2003         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
2004     }
2005     return u_terminateChars(script, scriptCapacity, i, err);
2006 }
2007
2008 U_CAPI int32_t  U_EXPORT2
2009 uloc_getCountry(const char* localeID,
2010             char* country,
2011             int32_t countryCapacity,
2012             UErrorCode* err)
2013 {
2014     int32_t i=0;
2015
2016     if(err==NULL || U_FAILURE(*err)) {
2017         return 0;
2018     }
2019
2020     if(localeID==NULL) {
2021         localeID=uloc_getDefault();
2022     }
2023
2024     /* Skip the language */
2025     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
2026     if(_isIDSeparator(*localeID)) {
2027         const char *scriptID;
2028         /* Skip the script if available */
2029         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
2030         if(scriptID != localeID+1) {
2031             /* Found optional script */
2032             localeID = scriptID;
2033         }
2034         if(_isIDSeparator(*localeID)) {
2035             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
2036         }
2037     }
2038     return u_terminateChars(country, countryCapacity, i, err);
2039 }
2040
2041 U_CAPI int32_t  U_EXPORT2
2042 uloc_getVariant(const char* localeID,
2043                 char* variant,
2044                 int32_t variantCapacity,
2045                 UErrorCode* err)
2046 {
2047     char tempBuffer[ULOC_FULLNAME_CAPACITY];
2048     const char* tmpLocaleID;
2049     int32_t i=0;
2050
2051     if(err==NULL || U_FAILURE(*err)) {
2052         return 0;
2053     }
2054
2055     if (_hasBCP47Extension(localeID)) {
2056         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
2057     } else {
2058         if (localeID==NULL) {
2059            localeID=uloc_getDefault();
2060         }
2061         tmpLocaleID=localeID;
2062     }
2063
2064     /* Skip the language */
2065     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
2066     if(_isIDSeparator(*tmpLocaleID)) {
2067         const char *scriptID;
2068         /* Skip the script if available */
2069         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2070         if(scriptID != tmpLocaleID+1) {
2071             /* Found optional script */
2072             tmpLocaleID = scriptID;
2073         }
2074         /* Skip the Country */
2075         if (_isIDSeparator(*tmpLocaleID)) {
2076             const char *cntryID;
2077             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2078             if (cntryID != tmpLocaleID+1) {
2079                 /* Found optional country */
2080                 tmpLocaleID = cntryID;
2081             }
2082             if(_isIDSeparator(*tmpLocaleID)) {
2083                 /* If there was no country ID, skip a possible extra IDSeparator */
2084                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2085                     tmpLocaleID++;
2086                 }
2087                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2088             }
2089         }
2090     }
2091
2092     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2093     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2094 /*
2095     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2096         i=_getVariant(localeID+1, '@', variant, variantCapacity);
2097     }
2098 */
2099     return u_terminateChars(variant, variantCapacity, i, err);
2100 }
2101
2102 U_CAPI int32_t  U_EXPORT2
2103 uloc_getName(const char* localeID,
2104              char* name,
2105              int32_t nameCapacity,
2106              UErrorCode* err)
2107 {
2108     return _canonicalize(localeID, name, nameCapacity, 0, err);
2109 }
2110
2111 U_CAPI int32_t  U_EXPORT2
2112 uloc_getBaseName(const char* localeID,
2113                  char* name,
2114                  int32_t nameCapacity,
2115                  UErrorCode* err)
2116 {
2117     return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2118 }
2119
2120 U_CAPI int32_t  U_EXPORT2
2121 uloc_canonicalize(const char* localeID,
2122                   char* name,
2123                   int32_t nameCapacity,
2124                   UErrorCode* err)
2125 {
2126     return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2127 }
2128
2129 U_CAPI const char*  U_EXPORT2
2130 uloc_getISO3Language(const char* localeID)
2131 {
2132     int16_t offset;
2133     char lang[ULOC_LANG_CAPACITY];
2134     UErrorCode err = U_ZERO_ERROR;
2135
2136     if (localeID == NULL)
2137     {
2138         localeID = uloc_getDefault();
2139     }
2140     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2141     if (U_FAILURE(err))
2142         return "";
2143     offset = _findIndex(LANGUAGES, lang);
2144     if (offset < 0)
2145         return "";
2146     return LANGUAGES_3[offset];
2147 }
2148
2149 U_CAPI const char*  U_EXPORT2
2150 uloc_getISO3Country(const char* localeID)
2151 {
2152     int16_t offset;
2153     char cntry[ULOC_LANG_CAPACITY];
2154     UErrorCode err = U_ZERO_ERROR;
2155
2156     if (localeID == NULL)
2157     {
2158         localeID = uloc_getDefault();
2159     }
2160     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2161     if (U_FAILURE(err))
2162         return "";
2163     offset = _findIndex(COUNTRIES, cntry);
2164     if (offset < 0)
2165         return "";
2166
2167     return COUNTRIES_3[offset];
2168 }
2169
2170 U_CAPI uint32_t  U_EXPORT2
2171 uloc_getLCID(const char* localeID)
2172 {
2173     UErrorCode status = U_ZERO_ERROR;
2174     char       langID[ULOC_FULLNAME_CAPACITY];
2175     uint32_t   lcid = 0;
2176
2177     /* Check for incomplete id. */
2178     if (!localeID || uprv_strlen(localeID) < 2) {
2179         return 0;
2180     }
2181
2182     // Attempt platform lookup if available
2183     lcid = uprv_convertToLCIDPlatform(localeID);
2184     if (lcid > 0)
2185     {
2186         // Windows found an LCID, return that
2187         return lcid;
2188     }
2189
2190     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2191     if (U_FAILURE(status)) {
2192         return 0;
2193     }
2194
2195     if (uprv_strchr(localeID, '@')) {
2196         // uprv_convertToLCID does not support keywords other than collation.
2197         // Remove all keywords except collation.
2198         int32_t len;
2199         char collVal[ULOC_KEYWORDS_CAPACITY];
2200         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2201
2202         len = uloc_getKeywordValue(localeID, "collation", collVal,
2203             UPRV_LENGTHOF(collVal) - 1, &status);
2204
2205         if (U_SUCCESS(status) && len > 0) {
2206             collVal[len] = 0;
2207
2208             len = uloc_getBaseName(localeID, tmpLocaleID,
2209                 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2210
2211             if (U_SUCCESS(status) && len > 0) {
2212                 tmpLocaleID[len] = 0;
2213
2214                 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2215                     UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2216
2217                 if (U_SUCCESS(status) && len > 0) {
2218                     tmpLocaleID[len] = 0;
2219                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
2220                 }
2221             }
2222         }
2223
2224         // fall through - all keywords are simply ignored
2225         status = U_ZERO_ERROR;
2226     }
2227
2228     return uprv_convertToLCID(langID, localeID, &status);
2229 }
2230
2231 U_CAPI int32_t U_EXPORT2
2232 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2233                 UErrorCode *status)
2234 {
2235     return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2236 }
2237
2238 /* ### Default locale **************************************************/
2239
2240 U_CAPI const char*  U_EXPORT2
2241 uloc_getDefault()
2242 {
2243     return locale_get_default();
2244 }
2245
2246 U_CAPI void  U_EXPORT2
2247 uloc_setDefault(const char*   newDefaultLocale,
2248              UErrorCode* err)
2249 {
2250     if (U_FAILURE(*err))
2251         return;
2252     /* the error code isn't currently used for anything by this function*/
2253
2254     /* propagate change to C++ */
2255     locale_set_default(newDefaultLocale);
2256 }
2257
2258 /**
2259  * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2260  * to an array of pointers to arrays of char.  All of these pointers are owned
2261  * by ICU-- do not delete them, and do not write through them.  The array is
2262  * terminated with a null pointer.
2263  */
2264 U_CAPI const char* const*  U_EXPORT2
2265 uloc_getISOLanguages()
2266 {
2267     return LANGUAGES;
2268 }
2269
2270 /**
2271  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2272  * pointer to an array of pointers to arrays of char.  All of these pointers are
2273  * owned by ICU-- do not delete them, and do not write through them.  The array is
2274  * terminated with a null pointer.
2275  */
2276 U_CAPI const char* const*  U_EXPORT2
2277 uloc_getISOCountries()
2278 {
2279     return COUNTRIES;
2280 }
2281
2282
2283 /* this function to be moved into cstring.c later */
2284 static char gDecimal = 0;
2285
2286 static /* U_CAPI */
2287 double
2288 /* U_EXPORT2 */
2289 _uloc_strtod(const char *start, char **end) {
2290     char *decimal;
2291     char *myEnd;
2292     char buf[30];
2293     double rv;
2294     if (!gDecimal) {
2295         char rep[5];
2296         /* For machines that decide to change the decimal on you,
2297         and try to be too smart with localization.
2298         This normally should be just a '.'. */
2299         sprintf(rep, "%+1.1f", 1.0);
2300         gDecimal = rep[2];
2301     }
2302
2303     if(gDecimal == '.') {
2304         return uprv_strtod(start, end); /* fall through to OS */
2305     } else {
2306         uprv_strncpy(buf, start, 29);
2307         buf[29]=0;
2308         decimal = uprv_strchr(buf, '.');
2309         if(decimal) {
2310             *decimal = gDecimal;
2311         } else {
2312             return uprv_strtod(start, end); /* no decimal point */
2313         }
2314         rv = uprv_strtod(buf, &myEnd);
2315         if(end) {
2316             *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2317         }
2318         return rv;
2319     }
2320 }
2321
2322 typedef struct {
2323     float q;
2324     int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
2325     char locale[ULOC_FULLNAME_CAPACITY+1];
2326 } _acceptLangItem;
2327
2328 static int32_t U_CALLCONV
2329 uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2330 {
2331     const _acceptLangItem *aa = (const _acceptLangItem*)a;
2332     const _acceptLangItem *bb = (const _acceptLangItem*)b;
2333
2334     int32_t rc = 0;
2335     if(bb->q < aa->q) {
2336         rc = -1;  /* A > B */
2337     } else if(bb->q > aa->q) {
2338         rc = 1;   /* A < B */
2339     } else {
2340         rc = 0;   /* A = B */
2341     }
2342
2343     if(rc==0) {
2344         rc = uprv_stricmp(aa->locale, bb->locale);
2345     }
2346
2347 #if defined(ULOC_DEBUG)
2348     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2349     aa->locale, aa->q,
2350     bb->locale, bb->q,
2351     rc);*/
2352 #endif
2353
2354     return rc;
2355 }
2356
/*
Sample HTTP Accept-Language header value (comma-separated entries, each with an optional ";q=" weight):
mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
*/
2360
2361 U_CAPI int32_t U_EXPORT2
2362 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2363                             const char *httpAcceptLanguage,
2364                             UEnumeration* availableLocales,
2365                             UErrorCode *status)
2366 {
2367   MaybeStackArray<_acceptLangItem, 4> items; // Struct for collecting items.
2368     char tmp[ULOC_FULLNAME_CAPACITY +1];
2369     int32_t n = 0;
2370     const char *itemEnd;
2371     const char *paramEnd;
2372     const char *s;
2373     const char *t;
2374     int32_t res;
2375     int32_t i;
2376     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2377
2378     if(U_FAILURE(*status)) {
2379         return -1;
2380     }
2381
2382     for(s=httpAcceptLanguage;s&&*s;) {
2383         while(isspace(*s)) /* eat space at the beginning */
2384             s++;
2385         itemEnd=uprv_strchr(s,',');
2386         paramEnd=uprv_strchr(s,';');
2387         if(!itemEnd) {
2388             itemEnd = httpAcceptLanguage+l; /* end of string */
2389         }
2390         if(paramEnd && paramEnd<itemEnd) {
2391             /* semicolon (;) is closer than end (,) */
2392             t = paramEnd+1;
2393             if(*t=='q') {
2394                 t++;
2395             }
2396             while(isspace(*t)) {
2397                 t++;
2398             }
2399             if(*t=='=') {
2400                 t++;
2401             }
2402             while(isspace(*t)) {
2403                 t++;
2404             }
2405             items[n].q = (float)_uloc_strtod(t,NULL);
2406         } else {
2407             /* no semicolon - it's 1.0 */
2408             items[n].q = 1.0f;
2409             paramEnd = itemEnd;
2410         }
2411         items[n].dummy=0;
2412         /* eat spaces prior to semi */
2413         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2414             ;
2415         int32_t slen = ((t+1)-s);
2416         if(slen > ULOC_FULLNAME_CAPACITY) {
2417           *status = U_BUFFER_OVERFLOW_ERROR;
2418           return -1; // too big
2419         }
2420         uprv_strncpy(items[n].locale, s, slen);
2421         items[n].locale[slen]=0; // terminate
2422         int32_t clen = uloc_canonicalize(items[n].locale, tmp, UPRV_LENGTHOF(tmp)-1, status);
2423         if(U_FAILURE(*status)) return -1;
2424         if((clen!=slen) || (uprv_strncmp(items[n].locale, tmp, slen))) {
2425             // canonicalization had an effect- copy back
2426             uprv_strncpy(items[n].locale, tmp, clen);
2427             items[n].locale[clen] = 0; // terminate
2428         }
2429 #if defined(ULOC_DEBUG)
2430         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2431 #endif
2432         n++;
2433         s = itemEnd;
2434         while(*s==',') { /* eat duplicate commas */
2435             s++;
2436         }
2437         if(n>=items.getCapacity()) { // If we need more items
2438           if(NULL == items.resize(items.getCapacity()*2, items.getCapacity())) {
2439               *status = U_MEMORY_ALLOCATION_ERROR;
2440               return -1;
2441           }
2442 #if defined(ULOC_DEBUG)
2443           fprintf(stderr,"malloced at size %d\n", items.getCapacity());
2444 #endif
2445         }
2446     }
2447     uprv_sortArray(items.getAlias(), n, sizeof(items[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2448     if (U_FAILURE(*status)) {
2449         return -1;
2450     }
2451     LocalMemory<const char*> strs(NULL);
2452     if (strs.allocateInsteadAndReset(n) == NULL) {
2453         *status = U_MEMORY_ALLOCATION_ERROR;
2454         return -1;
2455     }
2456     for(i=0;i<n;i++) {
2457 #if defined(ULOC_DEBUG)
2458         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2459 #endif
2460         strs[i]=items[i].locale;
2461     }
2462     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2463                                strs.getAlias(), n, availableLocales, status);
2464     return res;
2465 }
2466
2467
2468 U_CAPI int32_t U_EXPORT2
2469 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2470                     UAcceptResult *outResult, const char **acceptList,
2471                     int32_t acceptListCount,
2472                     UEnumeration* availableLocales,
2473                     UErrorCode *status)
2474 {
2475     int32_t i,j;
2476     int32_t len;
2477     int32_t maxLen=0;
2478     char tmp[ULOC_FULLNAME_CAPACITY+1];
2479     const char *l;
2480     char **fallbackList;
2481     if(U_FAILURE(*status)) {
2482         return -1;
2483     }
2484     fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2485     if(fallbackList==NULL) {
2486         *status = U_MEMORY_ALLOCATION_ERROR;
2487         return -1;
2488     }
2489     for(i=0;i<acceptListCount;i++) {
2490 #if defined(ULOC_DEBUG)
2491         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2492 #endif
2493         while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
2494 #if defined(ULOC_DEBUG)
2495             fprintf(stderr,"  %s\n", l);
2496 #endif
2497             len = (int32_t)uprv_strlen(l);
2498             if(!uprv_strcmp(acceptList[i], l)) {
2499                 if(outResult) {
2500                     *outResult = ULOC_ACCEPT_VALID;
2501                 }
2502 #if defined(ULOC_DEBUG)
2503                 fprintf(stderr, "MATCH! %s\n", l);
2504 #endif
2505                 if(len>0) {
2506                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2507                 }
2508                 for(j=0;j<i;j++) {
2509                     uprv_free(fallbackList[j]);
2510                 }
2511                 uprv_free(fallbackList);
2512                 return u_terminateChars(result, resultAvailable, len, status);
2513             }
2514             if(len>maxLen) {
2515                 maxLen = len;
2516             }
2517         }
2518         uenum_reset(availableLocales, status);
2519         /* save off parent info */
2520         if(uloc_getParent(acceptList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2521             fallbackList[i] = uprv_strdup(tmp);
2522         } else {
2523             fallbackList[i]=0;
2524         }
2525     }
2526
2527     for(maxLen--;maxLen>0;maxLen--) {
2528         for(i=0;i<acceptListCount;i++) {
2529             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2530 #if defined(ULOC_DEBUG)
2531                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2532 #endif
2533                 while((l=uenum_next(availableLocales, NULL, status)) != NULL) {
2534 #if defined(ULOC_DEBUG)
2535                     fprintf(stderr,"  %s\n", l);
2536 #endif
2537                     len = (int32_t)uprv_strlen(l);
2538                     if(!uprv_strcmp(fallbackList[i], l)) {
2539                         if(outResult) {
2540                             *outResult = ULOC_ACCEPT_FALLBACK;
2541                         }
2542 #if defined(ULOC_DEBUG)
2543                         fprintf(stderr, "fallback MATCH! %s\n", l);
2544 #endif
2545                         if(len>0) {
2546                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2547                         }
2548                         for(j=0;j<acceptListCount;j++) {
2549                             uprv_free(fallbackList[j]);
2550                         }
2551                         uprv_free(fallbackList);
2552                         return u_terminateChars(result, resultAvailable, len, status);
2553                     }
2554                 }
2555                 uenum_reset(availableLocales, status);
2556
2557                 if(uloc_getParent(fallbackList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2558                     uprv_free(fallbackList[i]);
2559                     fallbackList[i] = uprv_strdup(tmp);
2560                 } else {
2561                     uprv_free(fallbackList[i]);
2562                     fallbackList[i]=0;
2563                 }
2564             }
2565         }
2566         if(outResult) {
2567             *outResult = ULOC_ACCEPT_FAILED;
2568         }
2569     }
2570     for(i=0;i<acceptListCount;i++) {
2571         uprv_free(fallbackList[i]);
2572     }
2573     uprv_free(fallbackList);
2574     return -1;
2575 }
2576
2577 U_CAPI const char* U_EXPORT2
2578 uloc_toUnicodeLocaleKey(const char* keyword)
2579 {
2580     const char* bcpKey = ulocimp_toBcpKey(keyword);
2581     if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2582         // unknown keyword, but syntax is fine..
2583         return keyword;
2584     }
2585     return bcpKey;
2586 }
2587
2588 U_CAPI const char* U_EXPORT2
2589 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2590 {
2591     const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2592     if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2593         // unknown keyword, but syntax is fine..
2594         return value;
2595     }
2596     return bcpType;
2597 }
2598
2599 static UBool
2600 isWellFormedLegacyKey(const char* legacyKey)
2601 {
2602     const char* p = legacyKey;
2603     while (*p) {
2604         if (!UPRV_ISALPHANUM(*p)) {
2605             return FALSE;
2606         }
2607         p++;
2608     }
2609     return TRUE;
2610 }
2611
2612 static UBool
2613 isWellFormedLegacyType(const char* legacyType)
2614 {
2615     const char* p = legacyType;
2616     int32_t alphaNumLen = 0;
2617     while (*p) {
2618         if (*p == '_' || *p == '/' || *p == '-') {
2619             if (alphaNumLen == 0) {
2620                 return FALSE;
2621             }
2622             alphaNumLen = 0;
2623         } else if (UPRV_ISALPHANUM(*p)) {
2624             alphaNumLen++;
2625         } else {
2626             return FALSE;
2627         }
2628         p++;
2629     }
2630     return (alphaNumLen != 0);
2631 }
2632
2633 U_CAPI const char* U_EXPORT2
2634 uloc_toLegacyKey(const char* keyword)
2635 {
2636     const char* legacyKey = ulocimp_toLegacyKey(keyword);
2637     if (legacyKey == NULL) {
2638         // Checks if the specified locale key is well-formed with the legacy locale syntax.
2639         //
2640         // Note:
2641         //  LDML/CLDR provides some definition of keyword syntax in
2642         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2643         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2644         //  Keys can only consist of [0-9a-zA-Z].
2645         if (isWellFormedLegacyKey(keyword)) {
2646             return keyword;
2647         }
2648     }
2649     return legacyKey;
2650 }
2651
2652 U_CAPI const char* U_EXPORT2
2653 uloc_toLegacyType(const char* keyword, const char* value)
2654 {
2655     const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2656     if (legacyType == NULL) {
2657         // Checks if the specified locale type is well-formed with the legacy locale syntax.
2658         //
2659         // Note:
2660         //  LDML/CLDR provides some definition of keyword syntax in
2661         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2662         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2663         //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2664         //  we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2665         if (isWellFormedLegacyType(value)) {
2666             return value;
2667         }
2668     }
2669     return legacyType;
2670 }
2671
2672 /*eof*/