icuSources/common/uloc.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 1997-2016, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *
   9 * File ULOC.CPP
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   04/01/97    aliu        Creation.
  15 *   08/21/98    stephen     JDK 1.2 sync
  16 *   12/08/98    rtg         New Locale implementation and C API
  17 *   03/15/99    damiba      overhaul.
  18 *   04/06/99    stephen     changed setDefault() to realloc and copy
  19 *   06/14/99    stephen     Changed calls to ures_open for new params
  20 *   07/21/99    stephen     Modified setDefault() to propagate to C++
  21 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
  22 *                           brought canonicalization code into line with spec
  23 *****************************************************************************/
  24
  25 /*
  26    POSIX's locale format, from putil.c: [no spaces]
  27
  28      ll [ _CC ] [ . MM ] [ @ VV]
  29
  30      l = lang, C = ctry, M = charmap, V = variant
  31 */
  32
  33 #include "unicode/utypes.h"
  34 #include "unicode/ustring.h"
  35 #include "unicode/uloc.h"
  36
  37 #include "putilimp.h"
  38 #include "ustr_imp.h"
  39 #include "ulocimp.h"
  40 #include "umutex.h"
  41 #include "cstring.h"
  42 #include "cmemory.h"
  43 #include "locmap.h"
  44 #include "uarrsort.h"
  45 #include "uenumimp.h"
  46 #include "uassert.h"
  47 #include "charstr.h"
  48
  49 #include <stdio.h> /* for sprintf */
  50
  51 U_NAMESPACE_USE
  52
  53 /* ### Declarations **************************************************/
  54
/*
 * Locale stuff from locid.cpp.
 * These are forward declarations only; the definitions are not in this part
 * of the file.
 */
U_CFUNC void locale_set_default(const char *id);
U_CFUNC const char *locale_get_default(void);
/*
 * Takes the same parameter list as the static _getKeywords() helper further
 * down in this file, minus that helper's addKeyword/addValue pair.
 */
U_CFUNC int32_t
locale_getKeywords(const char *localeID,
            char prev,
            char *keywords, int32_t keywordCapacity,
            char *values, int32_t valuesCapacity, int32_t *valLen,
            UBool valuesToo,
            UErrorCode *status);
  65
  66 /* ### Data tables **************************************************/
  67
  68 /**
  69  * Table of language codes, both 2- and 3-letter, with preference
  70  * given to 2-letter codes where possible.  Includes 3-letter codes
  71  * that lack a 2-letter equivalent.
  72  *
  73  * This list must be in sorted order.  This list is returned directly
  74  * to the user by some API.
  75  *
  76  * This list must be kept in sync with LANGUAGES_3, with corresponding
  77  * entries matched.
  78  *
  79  * This table should be terminated with a NULL entry, followed by a
  80  * second list, and another NULL entry.  The first list is visible to
  81  * user code when this array is returned by API.  The second list
  82  * contains codes we support, but do not expose through user API.
  83  *
  84  * Notes
  85  *
  86  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
  87  * include the revisions up to 2001/7/27 *CWB*
  88  *
  89  * The 3 character codes are the terminology codes like RFC 3066.  This
  90  * is compatible with prior ICU codes
  91  *
 * "in" "iw" "ji" "jw" & "sh" have been withdrawn.  They are still in the
 * table, but now at the end of it, because their 3 character codes are
 * duplicates.  This avoids bad searches going from 3 to 2 character
 * codes.
  96  *
  97  * The range qaa-qtz is reserved for local use
  98  */
  99 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
 100 /* ISO639 table version is 20150505 */
static const char * const LANGUAGES[] = {
    /* Primary list: kept in sorted order; entry i corresponds to LANGUAGES_3[i]. */
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    "ca",  "cad", "car", "cay", "cch", "ce",  "ceb", "cgg",
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    "cs",  "csb", "cu",  "cv",  "cy",
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz",  "dzg",
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    "ext",
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    "gur", "guz", "gv",  "gwi",
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    "hup", "hy",  "hz",
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    "kv",  "kw",  "ky",
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    "ml",  "mn",  "mnc", "mni", "moh", "mos", "mr",  "mrj",
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my",  "mye", "myv", "mzn",
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    "pon", "prg", "pro", "ps",  "pt",
    "qu",  "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    "rw",  "rwk",
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swb", "swc", "syc", "syr", "szl",
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr", "tl",
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
    "vot", "vro", "vun",
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    "xal", "xh",  "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    "zun", "zxx", "zza",
NULL, /* end of the primary list */
    /* Second list: codes that are still supported but sit after the first terminator. */
    "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
NULL  /* end of the second list */
};
 190
/*
 * Deprecated 2-letter language codes and their replacements.
 * The two arrays have the same number of entries and are laid out so that
 * DEPRECATED_LANGUAGES[i] lines up with REPLACEMENT_LANGUAGES[i]
 * ("in" with "id", "iw" with "he", "ji" with "yi", "jw" with "jv").
 * NOTE(review): both arrays end in two NULL entries, presumably so they can
 * be scanned by the same helper that walks the two-list LANGUAGES layout;
 * confirm against the lookup code.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", NULL, NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", NULL, NULL
};
 197
 198 /**
 199  * Table of 3-letter language codes.
 200  *
 201  * This is a lookup table used to convert 3-letter language codes to
 202  * their 2-letter equivalent, where possible.  It must be kept in sync
 203  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
 204  * same language as LANGUAGES_3[i].  The commented-out lines are
 205  * copied from LANGUAGES to make eyeballing this baby easier.
 206  *
 207  * Where a 3-letter language code has no 2-letter equivalent, the
 208  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 209  *
 210  * This table should be terminated with a NULL entry, followed by a
 211  * second list, and another NULL entry.  The two lists correspond to
 212  * the two lists in LANGUAGES.
 213  */
 214 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
 215 /* ISO639 table version is 20150505 */
static const char * const LANGUAGES_3[] = {
    /* Primary list: entry i is the 3-letter code for LANGUAGES[i]; row breaks mirror LANGUAGES. */
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni", "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "swc", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
NULL, /* end of the primary list */
/*  Second list: 3-letter codes for the obsolete 2-letter codes shown below. */
/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
    "ind", "heb", "yid", "jaw", "srp",
NULL  /* end of the second list */
};
 306
 307 /**
 308  * Table of 2-letter country codes.
 309  *
 310  * This list must be in sorted order.  This list is returned directly
 311  * to the user by some API.
 312  *
 313  * This list must be kept in sync with COUNTRIES_3, with corresponding
 314  * entries matched.
 315  *
 316  * This table should be terminated with a NULL entry, followed by a
 317  * second list, and another NULL entry.  The first list is visible to
 318  * user code when this array is returned by API.  The second list
 319  * contains codes we support, but do not expose through user API.
 320  *
 321  * Notes:
 322  *
 323  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 324  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
 325  * new codes keeping the old ones for compatibility updated to include
 326  * 1999/12/03 revisions *CWB*
 327  *
 328  * RO(ROM) is now RO(ROU) according to
 329  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
 330  */
static const char * const COUNTRIES[] = {
    /* Primary list: kept in sorted order; entry i corresponds to COUNTRIES_3[i]. */
    "AC",  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CP",  "CR",
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    "SX",  "SY",  "SZ",  "TA",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
NULL, /* end of the primary list */
    /* Second list; "RO" appears here again, paired with the old 3-letter code "ROM" in COUNTRIES_3. */
    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
NULL  /* end of the second list */
};
 366
/*
 * Deprecated 2-letter country codes and their replacements.
 * The arrays are index-aligned: REPLACEMENT_COUNTRIES[i] is the replacement
 * for DEPRECATED_COUNTRIES[i]; the commented-out row below repeats the
 * deprecated codes so the pairing can be checked by eye.
 * NOTE(review): both arrays end in two NULL entries, presumably to match the
 * two-list layout used by COUNTRIES so one search helper can scan either;
 * confirm against the lookup code.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
};
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
};
 374
 375 /**
 376  * Table of 3-letter country codes.
 377  *
 378  * This is a lookup table used to convert 3-letter country codes to
 379  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
 380  * For all valid i, COUNTRIES[i] must refer to the same country as
 381  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
 382  * to make eyeballing this baby easier.
 383  *
 384  * This table should be terminated with a NULL entry, followed by a
 385  * second list, and another NULL entry.  The two lists correspond to
 386  * the two lists in COUNTRIES.
 387  */
static const char * const COUNTRIES_3[] = {
/*  Each commented-out row repeats the COUNTRIES entries that the row after it translates. */
/*  "AC",  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "ASC", "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CP",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CPT", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "EA ", "ECU", "EST", "EGY", "ESH", "ERI", /* no valid 3-letter code for EA */
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "IC ", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL", /* no valid 3-letter code for IC */
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TA",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TAA", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL, /* end of the primary list */
/*  Second list: 3-letter codes for the obsolete 2-letter codes shown below. */
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
NULL  /* end of the second list */
};
 454
/*
 * One row of the CANONICALIZE_MAP table: maps an input locale ID to its
 * canonical form, optionally together with a keyword/value pair.
 * Rows are written with positional initializers, so the field order here
 * must not change.
 */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if kw==NULL */
} CanonicalizationMap;
 461
 462 /**
 463  * A map to canonicalize locale IDs.  This handles a variety of
 464  * different semantic kinds of transformations.
 465  */
/*
 * Rows are { input ID, canonical ID, keyword, value }.  Rows with a non-NULL
 * keyword (the *_PREEURO, collation and calendar entries) pair the canonical
 * ID with that keyword=value; the other rows are plain renames.
 * NOTE(review): the code that consumes this table is not in this part of the
 * file; how the input is matched (case handling, lookup order) should be
 * confirmed there before reordering or adding rows.
 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
    { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
    { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
    { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
    { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
    { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
    { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
    { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
    { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
    { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
    { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
    { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
    { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
    { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
    { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
    { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
    { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
    { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
    { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
    { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
    { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
    { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
    { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
    { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
    { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
    { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
    { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
    { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
    { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
    { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
    { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
    { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
    { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
    { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
    { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
    { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
    { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
    { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
    { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
    { "zh_GAN",         "gan", NULL, NULL }, /* registered name */
    { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
    { "zh_HAKKA",       "hak", NULL, NULL }, /* registered name */
    { "zh_MIN_NAN",     "nan", NULL, NULL }, /* registered name */
    { "zh_WUU",         "wuu", NULL, NULL }, /* registered name */
    { "zh_XIANG",       "hsn", NULL, NULL }, /* registered name */
    { "zh_YUE",         "yue", NULL, NULL }, /* registered name */
};
 514
/*
 * One row of the VARIANT_MAP table: associates a variant name with a
 * keyword/value pair.  Rows are written with positional initializers, so the
 * field order here must not change.
 */
typedef struct VariantMap {
    const char *variant;     /* input variant name */
    const char *keyword;     /* keyword associated with the variant */
    const char *value;       /* value for that keyword */
} VariantMap;
 520
/*
 * Variant names and the keyword=value pair each one stands for.
 * NOTE(review): the code that consumes this table is not in this part of the
 * file; presumably the variant is dropped and replaced by the keyword during
 * canonicalization, but that should be confirmed there.
 */
static const VariantMap VARIANT_MAP[] = {
    { "EURO",   "currency", "EUR" },
    { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
    { "STROKE", "collation", "stroke" }  /* Solaris variant */
};
 526
 527 /* ### BCP47 Conversion *******************************************/
 528 /* Test if the locale id has BCP47 u extension and does not have '@' */
 529 #define _hasBCP47Extension(id) (id && uprv_strstr(id, "@") == NULL && getShortestSubtagLength(localeID) == 1)
 530 /* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
 531 #define _ConvertBCP47(finalID, id, buffer, length,err) \
 532         if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 || U_FAILURE(*err)) { \
 533             finalID=id; \
 534         } else { \
 535             finalID=buffer; \
 536         }
 537 /* Gets the size of the shortest subtag in the given localeID. */
 538 static int32_t getShortestSubtagLength(const char *localeID) {
 539     int32_t localeIDLength = uprv_strlen(localeID);
 540     int32_t length = localeIDLength;
 541     int32_t tmpLength = 0;
 542     int32_t i;
 543     UBool reset = TRUE;
 544
 545     for (i = 0; i < localeIDLength; i++) {
 546         if (localeID[i] != '_' && localeID[i] != '-') {
 547             if (reset) {
 548                 tmpLength = 0;
 549                 reset = FALSE;
 550             }
 551             tmpLength++;
 552         } else {
 553             if (tmpLength != 0 && tmpLength < length) {
 554                 length = tmpLength;
 555             }
 556             reset = TRUE;
 557         }
 558     }
 559
 560     return length;
 561 }
 562
 563 /* ### Keywords **************************************************/
/* True for the ASCII digits '0'..'9'.  Evaluates its argument twice. */
#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
/* True for ASCII letters and digits.  Evaluates its argument more than once. */
#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
/* Punctuation/symbols allowed in legacy key values */
#define UPRV_OK_VALUE_PUNCTUATION(c) ((c) == '_' || (c) == '-' || (c) == '+' || (c) == '/')

/* Size of KeywordStruct.keyword, so a keyword name can have at most 24 characters plus the terminator. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Number of entries in the keywordList array that _getKeywords() declares. */
#define ULOC_MAX_NO_KEYWORDS 25
 571
 572 U_CAPI const char * U_EXPORT2
 573 locale_getKeywordsStart(const char *localeID) {
 574     const char *result = NULL;
 575     if((result = uprv_strchr(localeID, '@')) != NULL) {
 576         return result;
 577     }
 578 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
 579     else {
 580         /* We do this because the @ sign is variant, and the @ sign used on one
 581         EBCDIC machine won't be compiled the same way on other EBCDIC based
 582         machines. */
 583         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
 584         const uint8_t *charToFind = ebcdicSigns;
 585         while(*charToFind) {
 586             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
 587                 return result;
 588             }
 589             charToFind++;
 590         }
 591     }
 592 #endif
 593     return NULL;
 594 }
 595
 596 /**
 597  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
 598  * @param keywordName incoming name to be canonicalized
 599  * @param status return status (keyword too long)
 600  * @return length of the keyword name
 601  */
 602 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
 603 {
 604   int32_t keywordNameLen = 0;
 605
 606   for (; *keywordName != 0; keywordName++) {
 607     if (!UPRV_ISALPHANUM(*keywordName)) {
 608       *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
 609       return 0;
 610     }
 611     if (keywordNameLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
 612       buf[keywordNameLen++] = uprv_tolower(*keywordName);
 613     } else {
 614       /* keyword name too long for internal buffer */
 615       *status = U_INTERNAL_PROGRAM_ERROR;
 616       return 0;
 617     }
 618   }
 619   if (keywordNameLen == 0) {
 620     *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
 621     return 0;
 622   }
 623   buf[keywordNameLen] = 0; /* terminate */
 624
 625   return keywordNameLen;
 626 }
 627
/* One keyword=value pair as collected by _getKeywords(). */
typedef struct {
    /* zero-terminated keyword name; compareKeywordStructs() sorts on it */
    char keyword[ULOC_KEYWORD_BUFFER_LEN];
    /* length of keyword, not counting the terminator */
    int32_t keywordLen;
    /* first character of the value; the text is referenced, not copied */
    const char *valueStart;
    /* number of characters of the value starting at valueStart */
    int32_t valueLen;
} KeywordStruct;
 634
 635 static int32_t U_CALLCONV
 636 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
 637     const char* leftString = ((const KeywordStruct *)left)->keyword;
 638     const char* rightString = ((const KeywordStruct *)right)->keyword;
 639     return uprv_strcmp(leftString, rightString);
 640 }
 641
 642 /**
 643  * Both addKeyword and addValue must already be in canonical form.
 644  * Either both addKeyword and addValue are NULL, or neither is NULL.
 645  * If they are not NULL they must be zero terminated.
 646  * If addKeyword is not NULL is must have length small enough to fit in KeywordStruct.keyword.
 647  */
 648 static int32_t
 649 _getKeywords(const char *localeID,
 650              char prev,
 651              char *keywords, int32_t keywordCapacity,
 652              char *values, int32_t valuesCapacity, int32_t *valLen,
 653              UBool valuesToo,
 654              const char* addKeyword,
 655              const char* addValue,
 656              UErrorCode *status)
 657 {
 658     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
 659
 660     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
 661     int32_t numKeywords = 0;
 662     const char* pos = localeID;
 663     const char* equalSign = NULL;
 664     const char* semicolon = NULL;
 665     int32_t i = 0, j, n;
 666     int32_t keywordsLen = 0;
 667     int32_t valuesLen = 0;
 668
 669     if(prev == '@') { /* start of keyword definition */
 670         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
 671         do {
 672             UBool duplicate = FALSE;
 673             /* skip leading spaces */
 674             while(*pos == ' ') {
 675                 pos++;
 676             }
 677             if (!*pos) { /* handle trailing "; " */
 678                 break;
 679             }
 680             if(numKeywords == maxKeywords) {
 681                 *status = U_INTERNAL_PROGRAM_ERROR;
 682                 return 0;
 683             }
 684             equalSign = uprv_strchr(pos, '=');
 685             semicolon = uprv_strchr(pos, ';');
 686             /* lack of '=' [foo@currency] is illegal */
 687             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
 688             if(!equalSign || (semicolon && semicolon<equalSign)) {
 689                 *status = U_INVALID_FORMAT_ERROR;
 690                 return 0;
 691             }
 692             /* need to normalize both keyword and keyword name */
 693             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
 694                 /* keyword name too long for internal buffer */
 695                 *status = U_INTERNAL_PROGRAM_ERROR;
 696                 return 0;
 697             }
 698             for(i = 0, n = 0; i < equalSign - pos; ++i) {
 699                 if (pos[i] != ' ') {
 700                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
 701                 }
 702             }
 703
 704             /* zero-length keyword is an error. */
 705             if (n == 0) {
 706                 *status = U_INVALID_FORMAT_ERROR;
 707                 return 0;
 708             }
 709
 710             keywordList[numKeywords].keyword[n] = 0;
 711             keywordList[numKeywords].keywordLen = n;
 712             /* now grab the value part. First we skip the '=' */
 713             equalSign++;
 714             /* then we leading spaces */
 715             while(*equalSign == ' ') {
 716                 equalSign++;
 717             }
 718
 719             /* Premature end or zero-length value */
 720             if (!*equalSign || equalSign == semicolon) {
 721                 *status = U_INVALID_FORMAT_ERROR;
 722                 return 0;
 723             }
 724
 725             keywordList[numKeywords].valueStart = equalSign;
 726
 727             pos = semicolon;
 728             i = 0;
 729             if(pos) {
 730                 while(*(pos - i - 1) == ' ') {
 731                     i++;
 732                 }
 733                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
 734                 pos++;
 735             } else {
 736                 i = (int32_t)uprv_strlen(equalSign);
 737                 while(i && equalSign[i-1] == ' ') {
 738                     i--;
 739                 }
 740                 keywordList[numKeywords].valueLen = i;
 741             }
 742             /* If this is a duplicate keyword, then ignore it */
 743             for (j=0; j<numKeywords; ++j) {
 744                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
 745                     duplicate = TRUE;
 746                     break;
 747                 }
 748             }
 749             if (!duplicate) {
 750                 ++numKeywords;
 751             }
 752         } while(pos);
 753
 754         /* Handle addKeyword/addValue. */
 755         if (addKeyword != NULL) {
 756             UBool duplicate = FALSE;
 757             U_ASSERT(addValue != NULL);
 758             /* Search for duplicate; if found, do nothing. Explicit keyword
 759                overrides addKeyword. */
 760             for (j=0; j<numKeywords; ++j) {
 761                 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
 762                     duplicate = TRUE;
 763                     break;
 764                 }
 765             }
 766             if (!duplicate) {
 767                 if (numKeywords == maxKeywords) {
 768                     *status = U_INTERNAL_PROGRAM_ERROR;
 769                     return 0;
 770                 }
 771                 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
 772                 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
 773                 keywordList[numKeywords].valueStart = addValue;
 774                 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
 775                 ++numKeywords;
 776             }
 777         } else {
 778             U_ASSERT(addValue == NULL);
 779         }
 780
 781         /* now we have a list of keywords */
 782         /* we need to sort it */
 783         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
 784
 785         /* Now construct the keyword part */
 786         for(i = 0; i < numKeywords; i++) {
 787             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
 788                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
 789                 if(valuesToo) {
 790                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
 791                 } else {
 792                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
 793                 }
 794             }
 795             keywordsLen += keywordList[i].keywordLen + 1;
 796             if(valuesToo) {
 797                 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
 798                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
 799                 }
 800                 keywordsLen += keywordList[i].valueLen;
 801
 802                 if(i < numKeywords - 1) {
 803                     if(keywordsLen < keywordCapacity) {
 804                         keywords[keywordsLen] = ';';
 805                     }
 806                     keywordsLen++;
 807                 }
 808             }
 809             if(values) {
 810                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
 811                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
 812                     values[valuesLen + keywordList[i].valueLen] = 0;
 813                 }
 814                 valuesLen += keywordList[i].valueLen + 1;
 815             }
 816         }
 817         if(values) {
 818             values[valuesLen] = 0;
 819             if(valLen) {
 820                 *valLen = valuesLen;
 821             }
 822         }
 823         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
 824     } else {
 825         return 0;
 826     }
 827 }
 828
 829 U_CFUNC int32_t
 830 locale_getKeywords(const char *localeID,
 831                    char prev,
 832                    char *keywords, int32_t keywordCapacity,
 833                    char *values, int32_t valuesCapacity, int32_t *valLen,
 834                    UBool valuesToo,
 835                    UErrorCode *status) {
 836     return _getKeywords(localeID, prev, keywords, keywordCapacity,
 837                         values, valuesCapacity, valLen, valuesToo,
 838                         NULL, NULL, status);
 839 }
 840
 841 U_CAPI int32_t U_EXPORT2
 842 uloc_getKeywordValue(const char* localeID,
 843                      const char* keywordName,
 844                      char* buffer, int32_t bufferCapacity,
 845                      UErrorCode* status)
 846 {
 847     const char* startSearchHere = NULL;
 848     const char* nextSeparator = NULL;
 849     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 850     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 851     int32_t result = 0;
 852
 853     if(status && U_SUCCESS(*status) && localeID) {
 854       char tempBuffer[ULOC_FULLNAME_CAPACITY];
 855       const char* tmpLocaleID;
 856
 857       if (keywordName == NULL || keywordName[0] == 0) {
 858         *status = U_ILLEGAL_ARGUMENT_ERROR;
 859         return 0;
 860       }
 861
 862       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 863       if(U_FAILURE(*status)) {
 864         return 0;
 865       }
 866
 867       if (_hasBCP47Extension(localeID)) {
 868           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
 869       } else {
 870           tmpLocaleID=localeID;
 871       }
 872
 873       startSearchHere = locale_getKeywordsStart(tmpLocaleID);
 874       if(startSearchHere == NULL) {
 875           /* no keywords, return at once */
 876           return 0;
 877       }
 878
 879       /* find the first keyword */
 880       while(startSearchHere) {
 881           const char* keyValueTail;
 882           int32_t keyValueLen;
 883
 884           startSearchHere++; /* skip @ or ; */
 885           nextSeparator = uprv_strchr(startSearchHere, '=');
 886           if(!nextSeparator) {
 887               *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
 888               return 0;
 889           }
 890           /* strip leading & trailing spaces (TC decided to tolerate these) */
 891           while(*startSearchHere == ' ') {
 892               startSearchHere++;
 893           }
 894           keyValueTail = nextSeparator;
 895           while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
 896               keyValueTail--;
 897           }
 898           /* now keyValueTail points to first char after the keyName */
 899           /* copy & normalize keyName from locale */
 900           if (startSearchHere == keyValueTail) {
 901               *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
 902               return 0;
 903           }
 904           keyValueLen = 0;
 905           while (startSearchHere < keyValueTail) {
 906             if (!UPRV_ISALPHANUM(*startSearchHere)) {
 907               *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
 908               return 0;
 909             }
 910             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
 911               localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*startSearchHere++);
 912             } else {
 913               /* keyword name too long for internal buffer */
 914               *status = U_INTERNAL_PROGRAM_ERROR;
 915               return 0;
 916             }
 917           }
 918           localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
 919
 920           startSearchHere = uprv_strchr(nextSeparator, ';');
 921
 922           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
 923                /* current entry matches the keyword. */
 924              nextSeparator++; /* skip '=' */
 925               /* First strip leading & trailing spaces (TC decided to tolerate these) */
 926               while(*nextSeparator == ' ') {
 927                 nextSeparator++;
 928               }
 929               keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
 930               while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
 931                 keyValueTail--;
 932               }
 933               /* Now copy the value, but check well-formedness */
 934               if (nextSeparator == keyValueTail) {
 935                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
 936                 return 0;
 937               }
 938               keyValueLen = 0;
 939               while (nextSeparator < keyValueTail) {
 940                 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
 941                   *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
 942                   return 0;
 943                 }
 944                 if (keyValueLen < bufferCapacity) {
 945                   /* Should we lowercase value to return here? Tests expect as-is. */
 946                   buffer[keyValueLen++] = *nextSeparator++;
 947                 } else { /* keep advancing so we return correct length in case of overflow */
 948                   keyValueLen++;
 949                   nextSeparator++;
 950                 }
 951               }
 952               result = u_terminateChars(buffer, bufferCapacity, keyValueLen, status);
 953               return result;
 954           }
 955       }
 956     }
 957     return 0;
 958 }
 959
 960 U_CAPI int32_t U_EXPORT2
 961 uloc_setKeywordValue(const char* keywordName,
 962                      const char* keywordValue,
 963                      char* buffer, int32_t bufferCapacity,
 964                      UErrorCode* status)
 965 {
 966     /* TODO: sorting. removal. */
 967     int32_t keywordNameLen;
 968     int32_t keywordValueLen;
 969     int32_t bufLen;
 970     int32_t needLen = 0;
 971     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 972     char keywordValueBuffer[ULOC_KEYWORDS_CAPACITY+1];
 973     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 974     int32_t rc;
 975     char* nextSeparator = NULL;
 976     char* nextEqualsign = NULL;
 977     char* startSearchHere = NULL;
 978     char* keywordStart = NULL;
 979     CharString updatedKeysAndValues;
 980     int32_t updatedKeysAndValuesLen;
 981     UBool handledInputKeyAndValue = FALSE;
 982     char keyValuePrefix = '@';
 983
 984     if(U_FAILURE(*status)) {
 985         return -1;
 986     }
 987     if (keywordName == NULL || keywordName[0] == 0 || bufferCapacity <= 1) {
 988         *status = U_ILLEGAL_ARGUMENT_ERROR;
 989         return 0;
 990     }
 991     bufLen = (int32_t)uprv_strlen(buffer);
 992     if(bufferCapacity<bufLen) {
 993         /* The capacity is less than the length?! Is this NULL terminated? */
 994         *status = U_ILLEGAL_ARGUMENT_ERROR;
 995         return 0;
 996     }
 997     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 998     if(U_FAILURE(*status)) {
 999         return 0;
1000     }
1001
1002     keywordValueLen = 0;
1003     if(keywordValue) {
1004         while (*keywordValue != 0) {
1005             if (!UPRV_ISALPHANUM(*keywordValue) && !UPRV_OK_VALUE_PUNCTUATION(*keywordValue)) {
1006                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
1007                 return 0;
1008             }
1009             if (keywordValueLen < ULOC_KEYWORDS_CAPACITY) {
1010                 /* Should we force lowercase in value to set? */
1011                 keywordValueBuffer[keywordValueLen++] = *keywordValue++;
1012             } else {
1013                 /* keywordValue too long for internal buffer */
1014                 *status = U_INTERNAL_PROGRAM_ERROR;
1015                 return 0;
1016             }
1017         }
1018     }
1019     keywordValueBuffer[keywordValueLen] = 0; /* terminate */
1020
1021     startSearchHere = (char*)locale_getKeywordsStart(buffer);
1022     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
1023         if(keywordValueLen == 0) { /* no keywords = nothing to remove */
1024             return bufLen;
1025         }
1026
1027         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1028         if(startSearchHere) { /* had a single @ */
1029             needLen--; /* already had the @ */
1030             /* startSearchHere points at the @ */
1031         } else {
1032             startSearchHere=buffer+bufLen;
1033         }
1034         if(needLen >= bufferCapacity) {
1035             *status = U_BUFFER_OVERFLOW_ERROR;
1036             return needLen; /* no change */
1037         }
1038         *startSearchHere++ = '@';
1039         uprv_strcpy(startSearchHere, keywordNameBuffer);
1040         startSearchHere += keywordNameLen;
1041         *startSearchHere++ = '=';
1042         uprv_strcpy(startSearchHere, keywordValueBuffer);
1043         return needLen;
1044     } /* end shortcut - no @ */
1045
1046     keywordStart = startSearchHere;
1047     /* search for keyword */
1048     while(keywordStart) {
1049         const char* keyValueTail;
1050         int32_t keyValueLen;
1051
1052         keywordStart++; /* skip @ or ; */
1053         nextEqualsign = uprv_strchr(keywordStart, '=');
1054         if (!nextEqualsign) {
1055             *status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
1056             return 0;
1057         }
1058         /* strip leading & trailing spaces (TC decided to tolerate these) */
1059         while(*keywordStart == ' ') {
1060             keywordStart++;
1061         }
1062         keyValueTail = nextEqualsign;
1063         while (keyValueTail > keywordStart && *(keyValueTail-1) == ' ') {
1064             keyValueTail--;
1065         }
1066         /* now keyValueTail points to first char after the keyName */
1067         /* copy & normalize keyName from locale */
1068         if (keywordStart == keyValueTail) {
1069             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
1070             return 0;
1071         }
1072         keyValueLen = 0;
1073         while (keywordStart < keyValueTail) {
1074             if (!UPRV_ISALPHANUM(*keywordStart)) {
1075                 *status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
1076                 return 0;
1077             }
1078             if (keyValueLen < ULOC_KEYWORD_BUFFER_LEN - 1) {
1079                 localeKeywordNameBuffer[keyValueLen++] = uprv_tolower(*keywordStart++);
1080             } else {
1081                 /* keyword name too long for internal buffer */
1082                 *status = U_INTERNAL_PROGRAM_ERROR;
1083                 return 0;
1084             }
1085         }
1086         localeKeywordNameBuffer[keyValueLen] = 0; /* terminate */
1087
1088         nextSeparator = uprv_strchr(nextEqualsign, ';');
1089
1090         /* start processing the value part */
1091         nextEqualsign++; /* skip '=' */
1092         /* First strip leading & trailing spaces (TC decided to tolerate these) */
1093         while(*nextEqualsign == ' ') {
1094             nextEqualsign++;
1095         }
1096         keyValueTail = (nextSeparator)? nextSeparator: nextEqualsign + uprv_strlen(nextEqualsign);
1097         while(keyValueTail > nextEqualsign && *(keyValueTail-1) == ' ') {
1098             keyValueTail--;
1099         }
1100         if (nextEqualsign == keyValueTail) {
1101             *status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
1102             return 0;
1103         }
1104
1105         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1106         if(rc == 0) {
1107             /* Current entry matches the input keyword. Update the entry */
1108             if(keywordValueLen > 0) { /* updating a value */
1109                 updatedKeysAndValues.append(keyValuePrefix, *status);
1110                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1111                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1112                 updatedKeysAndValues.append('=', *status);
1113                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1114             } /* else removing this entry, don't emit anything */
1115             handledInputKeyAndValue = TRUE;
1116         } else {
1117            /* input keyword sorts earlier than current entry, add before current entry */
1118             if (rc < 0 && keywordValueLen > 0 && !handledInputKeyAndValue) {
1119                 /* insert new entry at this location */
1120                 updatedKeysAndValues.append(keyValuePrefix, *status);
1121                 keyValuePrefix = ';'; /* for any subsequent key-value pair */
1122                 updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1123                 updatedKeysAndValues.append('=', *status);
1124                 updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1125                 handledInputKeyAndValue = TRUE;
1126             }
1127             /* copy the current entry */
1128             updatedKeysAndValues.append(keyValuePrefix, *status);
1129             keyValuePrefix = ';'; /* for any subsequent key-value pair */
1130             updatedKeysAndValues.append(localeKeywordNameBuffer, keyValueLen, *status);
1131             updatedKeysAndValues.append('=', *status);
1132             updatedKeysAndValues.append(nextEqualsign, keyValueTail-nextEqualsign, *status);
1133         }
1134         if (!nextSeparator && keywordValueLen > 0 && !handledInputKeyAndValue) {
1135             /* append new entry at the end, it sorts later than existing entries */
1136             updatedKeysAndValues.append(keyValuePrefix, *status);
1137             /* skip keyValuePrefix update, no subsequent key-value pair */
1138             updatedKeysAndValues.append(keywordNameBuffer, keywordNameLen, *status);
1139             updatedKeysAndValues.append('=', *status);
1140             updatedKeysAndValues.append(keywordValueBuffer, keywordValueLen, *status);
1141             handledInputKeyAndValue = TRUE;
1142         }
1143         keywordStart = nextSeparator;
1144     } /* end loop searching */
1145
1146     /* Any error from updatedKeysAndValues.append above would be internal and not due to
1147      * problems with the passed-in locale. So if we did encounter problems with the
1148      * passed-in locale above, those errors took precedence and overrode any error
1149      * status from updatedKeysAndValues.append, and also caused a return of 0. If there
1150      * are errors here they are from updatedKeysAndValues.append; they do cause an
1151      * error return but the passed-in locale is unmodified and the original bufLen is
1152      * returned.
1153      */
1154     if (!handledInputKeyAndValue || U_FAILURE(*status)) {
1155         /* if input key/value specified removal of a keyword not present in locale, or
1156          * there was an error in CharString.append, leave original locale alone. */
1157         return bufLen;
1158     }
1159
1160     updatedKeysAndValuesLen = updatedKeysAndValues.length();
1161     /* needLen = length of the part before '@' + length of updated key-value part including '@' */
1162     needLen = (int32_t)(startSearchHere - buffer) + updatedKeysAndValuesLen;
1163     if(needLen >= bufferCapacity) {
1164         *status = U_BUFFER_OVERFLOW_ERROR;
1165         return needLen; /* no change */
1166     }
1167     if (updatedKeysAndValuesLen > 0) {
1168         uprv_strncpy(startSearchHere, updatedKeysAndValues.data(), updatedKeysAndValuesLen);
1169     }
1170     buffer[needLen]=0;
1171     return needLen;
1172 }
1173
1174 /* ### ID parsing implementation **************************************************/
1175
/* TRUE if a is one of the letters that can start a special prefix: x, X, i or I.
   The parameter is parenthesized so that any expression can be passed; it is
   still evaluated more than once, so it must not have side effects. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1186
1187 static char* _strnchr(const char* str, int32_t len, char c) {
1188     U_ASSERT(str != 0 && len >= 0);
1189     while (len-- != 0) {
1190         char d = *str;
1191         if (d == c) {
1192             return (char*) str;
1193         } else if (d == 0) {
1194             break;
1195         }
1196         ++str;
1197     }
1198     return NULL;
1199 }
1200
1201 /**
1202  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1203  * a NULL entry, followed by more entries, and a second NULL entry.
1204  *
1205  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1206  * COUNTRIES_3.
1207  */
1208 static int16_t _findIndex(const char* const* list, const char* key)
1209 {
1210     const char* const* anchor = list;
1211     int32_t pass = 0;
1212
1213     /* Make two passes through two NULL-terminated arrays at 'list' */
1214     while (pass++ < 2) {
1215         while (*list) {
1216             if (uprv_strcmp(key, *list) == 0) {
1217                 return (int16_t)(list - anchor);
1218             }
1219             list++;
1220         }
1221         ++list;     /* skip final NULL *CWB*/
1222     }
1223     return -1;
1224 }
1225
1226 /* count the length of src while copying it to dest; return strlen(src) */
1227 static inline int32_t
1228 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1229     const char *anchor;
1230     char c;
1231
1232     anchor=src;
1233     for(;;) {
1234         if((c=*src)==0) {
1235             return (int32_t)(src-anchor);
1236         }
1237         if(destCapacity<=0) {
1238             return (int32_t)((src-anchor)+uprv_strlen(src));
1239         }
1240         ++src;
1241         *dest++=c;
1242         --destCapacity;
1243     }
1244 }
1245
1246 U_CFUNC const char*
1247 uloc_getCurrentCountryID(const char* oldID){
1248     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1249     if (offset >= 0) {
1250         return REPLACEMENT_COUNTRIES[offset];
1251     }
1252     return oldID;
1253 }
1254 U_CFUNC const char*
1255 uloc_getCurrentLanguageID(const char* oldID){
1256     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1257     if (offset >= 0) {
1258         return REPLACEMENT_LANGUAGES[offset];
1259     }
1260     return oldID;
1261 }
/*
 * The internal functions ulocimp_getLanguage(), ulocimp_getScript() and
 * ulocimp_getCountry() avoid duplicating the code that handles the earlier
 * locale ID pieces in the functions for the later ones (such as _getVariant())
 * by setting the *pEnd pointer to where they stopped parsing.
 *
 * TODO try to use this in Locale
 */
1270 U_CFUNC int32_t
1271 ulocimp_getLanguage(const char *localeID,
1272                     char *language, int32_t languageCapacity,
1273                     const char **pEnd) {
1274     int32_t i=0;
1275     int32_t offset;
1276     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1277
1278     /* if it starts with i- or x- then copy that prefix */
1279     if(_isIDPrefix(localeID)) {
1280         if(i<languageCapacity) {
1281             language[i]=(char)uprv_tolower(*localeID);
1282         }
1283         if(i<languageCapacity) {
1284             language[i+1]='-';
1285         }
1286         i+=2;
1287         localeID+=2;
1288     }
1289
1290     /* copy the language as far as possible and count its length */
1291     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1292         if(i<languageCapacity) {
1293             language[i]=(char)uprv_tolower(*localeID);
1294         }
1295         if(i<3) {
1296             U_ASSERT(i>=0);
1297             lang[i]=(char)uprv_tolower(*localeID);
1298         }
1299         i++;
1300         localeID++;
1301     }
1302
1303     if(i==3) {
1304         /* convert 3 character code to 2 character code if possible *CWB*/
1305         offset=_findIndex(LANGUAGES_3, lang);
1306         if(offset>=0) {
1307             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1308         }
1309     }
1310
1311     if(pEnd!=NULL) {
1312         *pEnd=localeID;
1313     }
1314     return i;
1315 }
1316
1317 U_CFUNC int32_t
1318 ulocimp_getScript(const char *localeID,
1319                   char *script, int32_t scriptCapacity,
1320                   const char **pEnd)
1321 {
1322     int32_t idLen = 0;
1323
1324     if (pEnd != NULL) {
1325         *pEnd = localeID;
1326     }
1327
1328     /* copy the second item as far as possible and count its length */
1329     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1330             && uprv_isASCIILetter(localeID[idLen])) {
1331         idLen++;
1332     }
1333
1334     /* If it's exactly 4 characters long, then it's a script and not a country. */
1335     if (idLen == 4) {
1336         int32_t i;
1337         if (pEnd != NULL) {
1338             *pEnd = localeID+idLen;
1339         }
1340         if(idLen > scriptCapacity) {
1341             idLen = scriptCapacity;
1342         }
1343         if (idLen >= 1) {
1344             script[0]=(char)uprv_toupper(*(localeID++));
1345         }
1346         for (i = 1; i < idLen; i++) {
1347             script[i]=(char)uprv_tolower(*(localeID++));
1348         }
1349     }
1350     else {
1351         idLen = 0;
1352     }
1353     return idLen;
1354 }
1355
1356 U_CFUNC int32_t
1357 ulocimp_getCountry(const char *localeID,
1358                    char *country, int32_t countryCapacity,
1359                    const char **pEnd)
1360 {
1361     int32_t idLen=0;
1362     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1363     int32_t offset;
1364
1365     /* copy the country as far as possible and count its length */
1366     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1367         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1368             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1369         }
1370         idLen++;
1371     }
1372
1373     /* the country should be either length 2 or 3 */
1374     if (idLen == 2 || idLen == 3) {
1375         UBool gotCountry = FALSE;
1376         /* convert 3 character code to 2 character code if possible *CWB*/
1377         if(idLen==3) {
1378             offset=_findIndex(COUNTRIES_3, cnty);
1379             if(offset>=0) {
1380                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1381                 gotCountry = TRUE;
1382             }
1383         }
1384         if (!gotCountry) {
1385             int32_t i = 0;
1386             for (i = 0; i < idLen; i++) {
1387                 if (i < countryCapacity) {
1388                     country[i]=(char)uprv_toupper(localeID[i]);
1389                 }
1390             }
1391         }
1392         localeID+=idLen;
1393     } else {
1394         idLen = 0;
1395     }
1396
1397     if(pEnd!=NULL) {
1398         *pEnd=localeID;
1399     }
1400
1401     return idLen;
1402 }
1403
1404 /**
1405  * @param needSeparator if true, then add leading '_' if any variants
1406  * are added to 'variant'
1407  */
1408 static int32_t
1409 _getVariantEx(const char *localeID,
1410               char prev,
1411               char *variant, int32_t variantCapacity,
1412               UBool needSeparator) {
1413     int32_t i=0;
1414
1415     /* get one or more variant tags and separate them with '_' */
1416     if(_isIDSeparator(prev)) {
1417         /* get a variant string after a '-' or '_' */
1418         while(!_isTerminator(*localeID)) {
1419             if (needSeparator) {
1420                 if (i<variantCapacity) {
1421                     variant[i] = '_';
1422                 }
1423                 ++i;
1424                 needSeparator = FALSE;
1425             }
1426             if(i<variantCapacity) {
1427                 variant[i]=(char)uprv_toupper(*localeID);
1428                 if(variant[i]=='-') {
1429                     variant[i]='_';
1430                 }
1431             }
1432             i++;
1433             localeID++;
1434         }
1435     }
1436
1437     /* if there is no variant tag after a '-' or '_' then look for '@' */
1438     if(i==0) {
1439         if(prev=='@') {
1440             /* keep localeID */
1441         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1442             ++localeID; /* point after the '@' */
1443         } else {
1444             return 0;
1445         }
1446         while(!_isTerminator(*localeID)) {
1447             if (needSeparator) {
1448                 if (i<variantCapacity) {
1449                     variant[i] = '_';
1450                 }
1451                 ++i;
1452                 needSeparator = FALSE;
1453             }
1454             if(i<variantCapacity) {
1455                 variant[i]=(char)uprv_toupper(*localeID);
1456                 if(variant[i]=='-' || variant[i]==',') {
1457                     variant[i]='_';
1458                 }
1459             }
1460             i++;
1461             localeID++;
1462         }
1463     }
1464
1465     return i;
1466 }
1467
1468 static int32_t
1469 _getVariant(const char *localeID,
1470             char prev,
1471             char *variant, int32_t variantCapacity) {
1472     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1473 }
1474
1475 /**
1476  * Delete ALL instances of a variant from the given list of one or
1477  * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1478  * @param variants the source string of one or more variants,
1479  * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1480  * terminated; if it is, trailing zero will NOT be maintained.
1481  * @param variantsLen length of variants
1482  * @param toDelete variant to delete, without separators, e.g.  "EURO"
1483  * or "PREEURO"; not zero terminated
1484  * @param toDeleteLen length of toDelete
1485  * @return number of characters deleted from variants
1486  */
1487 static int32_t
1488 _deleteVariant(char* variants, int32_t variantsLen,
1489                const char* toDelete, int32_t toDeleteLen)
1490 {
1491     int32_t delta = 0; /* number of chars deleted */
1492     for (;;) {
1493         UBool flag = FALSE;
1494         if (variantsLen < toDeleteLen) {
1495             return delta;
1496         }
1497         if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1498             (variantsLen == toDeleteLen ||
1499              (flag=(variants[toDeleteLen] == '_'))))
1500         {
1501             int32_t d = toDeleteLen + (flag?1:0);
1502             variantsLen -= d;
1503             delta += d;
1504             if (variantsLen > 0) {
1505                 uprv_memmove(variants, variants+d, variantsLen);
1506             }
1507         } else {
1508             char* p = _strnchr(variants, variantsLen, '_');
1509             if (p == NULL) {
1510                 return delta;
1511             }
1512             ++p;
1513             variantsLen -= (int32_t)(p - variants);
1514             variants = p;
1515         }
1516     }
1517 }
1518
1519 /* Keyword enumeration */
1520
/*
 * Iteration state stored in UEnumeration::context by uloc_openKeywordList().
 */
typedef struct UKeywordsContext {
    /* Owned copy of the keyword list (released in uloc_kw_closeKeywords):
       consecutive NUL-terminated strings; an empty string ends the list. */
    char* keywords;
    /* Next keyword to hand out; points into 'keywords'. */
    char* current;
} UKeywordsContext;
1525
1526 U_CDECL_BEGIN
1527
1528 static void U_CALLCONV
1529 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1530     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1531     uprv_free(enumerator->context);
1532     uprv_free(enumerator);
1533 }
1534
1535 static int32_t U_CALLCONV
1536 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1537     char *kw = ((UKeywordsContext *)en->context)->keywords;
1538     int32_t result = 0;
1539     while(*kw) {
1540         result++;
1541         kw += uprv_strlen(kw)+1;
1542     }
1543     return result;
1544 }
1545
1546 static const char * U_CALLCONV
1547 uloc_kw_nextKeyword(UEnumeration* en,
1548                     int32_t* resultLength,
1549                     UErrorCode* /*status*/) {
1550     const char* result = ((UKeywordsContext *)en->context)->current;
1551     int32_t len = 0;
1552     if(*result) {
1553         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1554         ((UKeywordsContext *)en->context)->current += len+1;
1555     } else {
1556         result = NULL;
1557     }
1558     if (resultLength) {
1559         *resultLength = len;
1560     }
1561     return result;
1562 }
1563
1564 static void U_CALLCONV
1565 uloc_kw_resetKeywords(UEnumeration* en,
1566                       UErrorCode* /*status*/) {
1567     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1568 }
1569
1570 U_CDECL_END
1571
1572
/*
 * Template for keyword enumerations: uloc_openKeywordList() copies it into
 * every enumeration it creates and then stores the per-instance context.
 * NOTE(review): the slot meanings noted below are inferred from the handler
 * names; confirm against the UEnumeration declaration (uenumimp.h).
 */
static const UEnumeration gKeywordsEnum = {
    NULL,
    NULL,                   /* one of the two NULL slots is presumably 'context', assigned in uloc_openKeywordList() */
    uloc_kw_closeKeywords,  /* close handler */
    uloc_kw_countKeywords,  /* count handler */
    uenum_unextDefault,
    uloc_kw_nextKeyword,    /* next handler */
    uloc_kw_resetKeywords   /* reset handler */
};
1582
1583 U_CAPI UEnumeration* U_EXPORT2
1584 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1585 {
1586     UKeywordsContext *myContext = NULL;
1587     UEnumeration *result = NULL;
1588
1589     if(U_FAILURE(*status)) {
1590         return NULL;
1591     }
1592     result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1593     /* Null pointer test */
1594     if (result == NULL) {
1595         *status = U_MEMORY_ALLOCATION_ERROR;
1596         return NULL;
1597     }
1598     uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1599     myContext = static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)));
1600     if (myContext == NULL) {
1601         *status = U_MEMORY_ALLOCATION_ERROR;
1602         uprv_free(result);
1603         return NULL;
1604     }
1605     myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1606     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1607     myContext->keywords[keywordListSize] = 0;
1608     myContext->current = myContext->keywords;
1609     result->context = myContext;
1610     return result;
1611 }
1612
1613 U_CAPI UEnumeration* U_EXPORT2
1614 uloc_openKeywords(const char* localeID,
1615                         UErrorCode* status)
1616 {
1617     int32_t i=0;
1618     char keywords[256];
1619     int32_t keywordsCapacity = 256;
1620     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1621     const char* tmpLocaleID;
1622
1623     if(status==NULL || U_FAILURE(*status)) {
1624         return 0;
1625     }
1626
1627     if (_hasBCP47Extension(localeID)) {
1628         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1629     } else {
1630         if (localeID==NULL) {
1631            localeID=uloc_getDefault();
1632         }
1633         tmpLocaleID=localeID;
1634     }
1635
1636     /* Skip the language */
1637     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1638     if(_isIDSeparator(*tmpLocaleID)) {
1639         const char *scriptID;
1640         /* Skip the script if available */
1641         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1642         if(scriptID != tmpLocaleID+1) {
1643             /* Found optional script */
1644             tmpLocaleID = scriptID;
1645         }
1646         /* Skip the Country */
1647         if (_isIDSeparator(*tmpLocaleID)) {
1648             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1649             if(_isIDSeparator(*tmpLocaleID)) {
1650                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1651             }
1652         }
1653     }
1654
1655     /* keywords are located after '@' */
1656     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1657         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1658     }
1659
1660     if(i) {
1661         return uloc_openKeywordList(keywords, i, status);
1662     } else {
1663         return NULL;
1664     }
1665 }
1666
1667
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* Nonzero when at least one bit of 'mask' is set in 'options'.  Both
   arguments are parenthesized so that compound expressions such as
   (a | b) are evaluated as a whole rather than split by '&'. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The language tag that _canonicalize replaces with the default locale;
   deliberately not NUL-terminated, always use it with I_DEFAULT_LENGTH. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1676
1677 /**
1678  * Canonicalize the given localeID, to level 1 or to level 2,
1679  * depending on the options.  To specify level 1, pass in options=0.
1680  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1681  *
1682  * This is the code underlying uloc_getName and uloc_canonicalize.
1683  */
1684 static int32_t
1685 _canonicalize(const char* localeID,
1686               char* result,
1687               int32_t resultCapacity,
1688               uint32_t options,
1689               UErrorCode* err) {
1690     int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1691     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1692     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1693     const char* origLocaleID;
1694     const char* tmpLocaleID;
1695     const char* keywordAssign = NULL;
1696     const char* separatorIndicator = NULL;
1697     const char* addKeyword = NULL;
1698     const char* addValue = NULL;
1699     char* name;
1700     char* variant = NULL; /* pointer into name, or NULL */
1701
1702     if (U_FAILURE(*err)) {
1703         return 0;
1704     }
1705
1706     if (_hasBCP47Extension(localeID)) {
1707         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1708     } else {
1709         if (localeID==NULL) {
1710            localeID=uloc_getDefault();
1711         }
1712         tmpLocaleID=localeID;
1713     }
1714
1715     origLocaleID=tmpLocaleID;
1716
1717     /* if we are doing a full canonicalization, then put results in
1718        localeBuffer, if necessary; otherwise send them to result. */
1719     if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1720         (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
1721         name = localeBuffer;
1722         nameCapacity = (int32_t)sizeof(localeBuffer);
1723     } else {
1724         name = result;
1725         nameCapacity = resultCapacity;
1726     }
1727
1728     /* get all pieces, one after another, and separate with '_' */
1729     len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1730
1731     if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1732         const char *d = uloc_getDefault();
1733
1734         len = (int32_t)uprv_strlen(d);
1735
1736         if (name != NULL) {
1737             uprv_strncpy(name, d, len);
1738         }
1739     } else if(_isIDSeparator(*tmpLocaleID)) {
1740         const char *scriptID;
1741
1742         ++fieldCount;
1743         if(len<nameCapacity) {
1744             name[len]='_';
1745         }
1746         ++len;
1747
1748         scriptSize=ulocimp_getScript(tmpLocaleID+1,
1749             (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
1750         if(scriptSize > 0) {
1751             /* Found optional script */
1752             tmpLocaleID = scriptID;
1753             ++fieldCount;
1754             len+=scriptSize;
1755             if (_isIDSeparator(*tmpLocaleID)) {
1756                 /* If there is something else, then we add the _ */
1757                 if(len<nameCapacity) {
1758                     name[len]='_';
1759                 }
1760                 ++len;
1761             }
1762         }
1763
1764         if (_isIDSeparator(*tmpLocaleID)) {
1765             const char *cntryID;
1766             int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
1767                 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
1768             if (cntrySize > 0) {
1769                 /* Found optional country */
1770                 tmpLocaleID = cntryID;
1771                 len+=cntrySize;
1772             }
1773             if(_isIDSeparator(*tmpLocaleID)) {
1774                 /* If there is something else, then we add the _  if we found country before. */
1775                 if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
1776                     ++fieldCount;
1777                     if(len<nameCapacity) {
1778                         name[len]='_';
1779                     }
1780                     ++len;
1781                 }
1782
1783                 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
1784                     (len<nameCapacity ? name+len : NULL), nameCapacity-len);
1785                 if (variantSize > 0) {
1786                     variant = len<nameCapacity ? name+len : NULL;
1787                     len += variantSize;
1788                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1789                 }
1790             }
1791         }
1792     }
1793
1794     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1795     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1796         UBool done = FALSE;
1797         do {
1798             char c = *tmpLocaleID;
1799             switch (c) {
1800             case 0:
1801             case '@':
1802                 done = TRUE;
1803                 break;
1804             default:
1805                 if (len<nameCapacity) {
1806                     name[len] = c;
1807                 }
1808                 ++len;
1809                 ++tmpLocaleID;
1810                 break;
1811             }
1812         } while (!done);
1813     }
1814
1815     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1816        After this, tmpLocaleID either points to '@' or is NULL */
1817     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1818         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1819         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1820     }
1821
1822     /* Copy POSIX-style variant, if any [mr@FOO] */
1823     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1824         tmpLocaleID != NULL && keywordAssign == NULL) {
1825         for (;;) {
1826             char c = *tmpLocaleID;
1827             if (c == 0) {
1828                 break;
1829             }
1830             if (len<nameCapacity) {
1831                 name[len] = c;
1832             }
1833             ++len;
1834             ++tmpLocaleID;
1835         }
1836     }
1837
1838     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1839         /* Handle @FOO variant if @ is present and not followed by = */
1840         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1841             int32_t posixVariantSize;
1842             /* Add missing '_' if needed */
1843             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1844                 do {
1845                     if(len<nameCapacity) {
1846                         name[len]='_';
1847                     }
1848                     ++len;
1849                     ++fieldCount;
1850                 } while(fieldCount<2);
1851             }
1852             posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1853                                              (UBool)(variantSize > 0));
1854             if (posixVariantSize > 0) {
1855                 if (variant == NULL) {
1856                     variant = name+len;
1857                 }
1858                 len += posixVariantSize;
1859                 variantSize += posixVariantSize;
1860             }
1861         }
1862
1863         /* Handle generic variants first */
1864         if (variant) {
1865             for (j=0; j<UPRV_LENGTHOF(VARIANT_MAP); j++) {
1866                 const char* variantToCompare = VARIANT_MAP[j].variant;
1867                 int32_t n = (int32_t)uprv_strlen(variantToCompare);
1868                 int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1869                 len -= variantLen;
1870                 if (variantLen > 0) {
1871                     if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
1872                         --len;
1873                     }
1874                     addKeyword = VARIANT_MAP[j].keyword;
1875                     addValue = VARIANT_MAP[j].value;
1876                     break;
1877                 }
1878             }
1879             if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
1880                 --len;
1881             }
1882         }
1883
1884         /* Look up the ID in the canonicalization map */
1885         for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1886             const char* id = CANONICALIZE_MAP[j].id;
1887             int32_t n = (int32_t)uprv_strlen(id);
1888             if (len == n && uprv_strncmp(name, id, n) == 0) {
1889                 if (n == 0 && tmpLocaleID != NULL) {
1890                     break; /* Don't remap "" if keywords present */
1891                 }
1892                 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1893                 if (CANONICALIZE_MAP[j].keyword) {
1894                     addKeyword = CANONICALIZE_MAP[j].keyword;
1895                     addValue = CANONICALIZE_MAP[j].value;
1896                 }
1897                 break;
1898             }
1899         }
1900     }
1901
1902     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1903         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1904             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1905             if(len<nameCapacity) {
1906                 name[len]='@';
1907             }
1908             ++len;
1909             ++fieldCount;
1910             len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
1911                                 NULL, 0, NULL, TRUE, addKeyword, addValue, err);
1912         } else if (addKeyword != NULL) {
1913             U_ASSERT(addValue != NULL && len < nameCapacity);
1914             /* inelegant but works -- later make _getKeywords do this? */
1915             len += _copyCount(name+len, nameCapacity-len, "@");
1916             len += _copyCount(name+len, nameCapacity-len, addKeyword);
1917             len += _copyCount(name+len, nameCapacity-len, "=");
1918             len += _copyCount(name+len, nameCapacity-len, addValue);
1919         }
1920     }
1921
1922     if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1923         uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1924     }
1925
1926     return u_terminateChars(result, resultCapacity, len, err);
1927 }
1928
1929 /* ### ID parsing API **************************************************/
1930
1931 U_CAPI int32_t  U_EXPORT2
1932 uloc_getParent(const char*    localeID,
1933                char* parent,
1934                int32_t parentCapacity,
1935                UErrorCode* err)
1936 {
1937     const char *lastUnderscore;
1938     int32_t i;
1939
1940     if (U_FAILURE(*err))
1941         return 0;
1942
1943     if (localeID == NULL)
1944         localeID = uloc_getDefault();
1945
1946     lastUnderscore=uprv_strrchr(localeID, '_');
1947     if(lastUnderscore!=NULL) {
1948         i=(int32_t)(lastUnderscore-localeID);
1949     } else {
1950         i=0;
1951     }
1952
1953     if(i>0 && parent != localeID) {
1954         uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1955     }
1956     return u_terminateChars(parent, parentCapacity, i, err);
1957 }
1958
1959 U_CAPI int32_t U_EXPORT2
1960 uloc_getLanguage(const char*    localeID,
1961          char* language,
1962          int32_t languageCapacity,
1963          UErrorCode* err)
1964 {
1965     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1966     int32_t i=0;
1967
1968     if (err==NULL || U_FAILURE(*err)) {
1969         return 0;
1970     }
1971
1972     if(localeID==NULL) {
1973         localeID=uloc_getDefault();
1974     }
1975
1976     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1977     return u_terminateChars(language, languageCapacity, i, err);
1978 }
1979
1980 U_CAPI int32_t U_EXPORT2
1981 uloc_getScript(const char*    localeID,
1982          char* script,
1983          int32_t scriptCapacity,
1984          UErrorCode* err)
1985 {
1986     int32_t i=0;
1987
1988     if(err==NULL || U_FAILURE(*err)) {
1989         return 0;
1990     }
1991
1992     if(localeID==NULL) {
1993         localeID=uloc_getDefault();
1994     }
1995
1996     /* skip the language */
1997     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1998     if(_isIDSeparator(*localeID)) {
1999         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
2000     }
2001     return u_terminateChars(script, scriptCapacity, i, err);
2002 }
2003
2004 U_CAPI int32_t  U_EXPORT2
2005 uloc_getCountry(const char* localeID,
2006             char* country,
2007             int32_t countryCapacity,
2008             UErrorCode* err)
2009 {
2010     int32_t i=0;
2011
2012     if(err==NULL || U_FAILURE(*err)) {
2013         return 0;
2014     }
2015
2016     if(localeID==NULL) {
2017         localeID=uloc_getDefault();
2018     }
2019
2020     /* Skip the language */
2021     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
2022     if(_isIDSeparator(*localeID)) {
2023         const char *scriptID;
2024         /* Skip the script if available */
2025         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
2026         if(scriptID != localeID+1) {
2027             /* Found optional script */
2028             localeID = scriptID;
2029         }
2030         if(_isIDSeparator(*localeID)) {
2031             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
2032         }
2033     }
2034     return u_terminateChars(country, countryCapacity, i, err);
2035 }
2036
2037 U_CAPI int32_t  U_EXPORT2
2038 uloc_getVariant(const char* localeID,
2039                 char* variant,
2040                 int32_t variantCapacity,
2041                 UErrorCode* err)
2042 {
2043     char tempBuffer[ULOC_FULLNAME_CAPACITY];
2044     const char* tmpLocaleID;
2045     int32_t i=0;
2046
2047     if(err==NULL || U_FAILURE(*err)) {
2048         return 0;
2049     }
2050
2051     if (_hasBCP47Extension(localeID)) {
2052         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
2053     } else {
2054         if (localeID==NULL) {
2055            localeID=uloc_getDefault();
2056         }
2057         tmpLocaleID=localeID;
2058     }
2059
2060     /* Skip the language */
2061     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
2062     if(_isIDSeparator(*tmpLocaleID)) {
2063         const char *scriptID;
2064         /* Skip the script if available */
2065         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2066         if(scriptID != tmpLocaleID+1) {
2067             /* Found optional script */
2068             tmpLocaleID = scriptID;
2069         }
2070         /* Skip the Country */
2071         if (_isIDSeparator(*tmpLocaleID)) {
2072             const char *cntryID;
2073             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2074             if (cntryID != tmpLocaleID+1) {
2075                 /* Found optional country */
2076                 tmpLocaleID = cntryID;
2077             }
2078             if(_isIDSeparator(*tmpLocaleID)) {
2079                 /* If there was no country ID, skip a possible extra IDSeparator */
2080                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2081                     tmpLocaleID++;
2082                 }
2083                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2084             }
2085         }
2086     }
2087
2088     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2089     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2090 /*
2091     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2092         i=_getVariant(localeID+1, '@', variant, variantCapacity);
2093     }
2094 */
2095     return u_terminateChars(variant, variantCapacity, i, err);
2096 }
2097
2098 U_CAPI int32_t  U_EXPORT2
2099 uloc_getName(const char* localeID,
2100              char* name,
2101              int32_t nameCapacity,
2102              UErrorCode* err)
2103 {
2104     return _canonicalize(localeID, name, nameCapacity, 0, err);
2105 }
2106
2107 U_CAPI int32_t  U_EXPORT2
2108 uloc_getBaseName(const char* localeID,
2109                  char* name,
2110                  int32_t nameCapacity,
2111                  UErrorCode* err)
2112 {
2113     return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2114 }
2115
2116 U_CAPI int32_t  U_EXPORT2
2117 uloc_canonicalize(const char* localeID,
2118                   char* name,
2119                   int32_t nameCapacity,
2120                   UErrorCode* err)
2121 {
2122     return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2123 }
2124
2125 U_CAPI const char*  U_EXPORT2
2126 uloc_getISO3Language(const char* localeID)
2127 {
2128     int16_t offset;
2129     char lang[ULOC_LANG_CAPACITY];
2130     UErrorCode err = U_ZERO_ERROR;
2131
2132     if (localeID == NULL)
2133     {
2134         localeID = uloc_getDefault();
2135     }
2136     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2137     if (U_FAILURE(err))
2138         return "";
2139     offset = _findIndex(LANGUAGES, lang);
2140     if (offset < 0)
2141         return "";
2142     return LANGUAGES_3[offset];
2143 }
2144
2145 U_CAPI const char*  U_EXPORT2
2146 uloc_getISO3Country(const char* localeID)
2147 {
2148     int16_t offset;
2149     char cntry[ULOC_LANG_CAPACITY];
2150     UErrorCode err = U_ZERO_ERROR;
2151
2152     if (localeID == NULL)
2153     {
2154         localeID = uloc_getDefault();
2155     }
2156     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2157     if (U_FAILURE(err))
2158         return "";
2159     offset = _findIndex(COUNTRIES, cntry);
2160     if (offset < 0)
2161         return "";
2162
2163     return COUNTRIES_3[offset];
2164 }
2165
2166 U_CAPI uint32_t  U_EXPORT2
2167 uloc_getLCID(const char* localeID)
2168 {
2169     UErrorCode status = U_ZERO_ERROR;
2170     char       langID[ULOC_FULLNAME_CAPACITY];
2171     uint32_t   lcid = 0;
2172
2173     /* Check for incomplete id. */
2174     if (!localeID || uprv_strlen(localeID) < 2) {
2175         return 0;
2176     }
2177
2178     // Attempt platform lookup if available
2179     lcid = uprv_convertToLCIDPlatform(localeID);
2180     if (lcid > 0)
2181     {
2182         // Windows found an LCID, return that
2183         return lcid;
2184     }
2185
2186     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2187     if (U_FAILURE(status)) {
2188         return 0;
2189     }
2190
2191     if (uprv_strchr(localeID, '@')) {
2192         // uprv_convertToLCID does not support keywords other than collation.
2193         // Remove all keywords except collation.
2194         int32_t len;
2195         char collVal[ULOC_KEYWORDS_CAPACITY];
2196         char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2197
2198         len = uloc_getKeywordValue(localeID, "collation", collVal,
2199             UPRV_LENGTHOF(collVal) - 1, &status);
2200
2201         if (U_SUCCESS(status) && len > 0) {
2202             collVal[len] = 0;
2203
2204             len = uloc_getBaseName(localeID, tmpLocaleID,
2205                 UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2206
2207             if (U_SUCCESS(status) && len > 0) {
2208                 tmpLocaleID[len] = 0;
2209
2210                 len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2211                     UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2212
2213                 if (U_SUCCESS(status) && len > 0) {
2214                     tmpLocaleID[len] = 0;
2215                     return uprv_convertToLCID(langID, tmpLocaleID, &status);
2216                 }
2217             }
2218         }
2219
2220         // fall through - all keywords are simply ignored
2221         status = U_ZERO_ERROR;
2222     }
2223
2224     return uprv_convertToLCID(langID, localeID, &status);
2225 }
2226
2227 U_CAPI int32_t U_EXPORT2
2228 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2229                 UErrorCode *status)
2230 {
2231     return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2232 }
2233
2234 /* ### Default locale **************************************************/
2235
2236 U_CAPI const char*  U_EXPORT2
2237 uloc_getDefault()
2238 {
2239     return locale_get_default();
2240 }
2241
2242 U_CAPI void  U_EXPORT2
2243 uloc_setDefault(const char*   newDefaultLocale,
2244              UErrorCode* err)
2245 {
2246     if (U_FAILURE(*err))
2247         return;
2248     /* the error code isn't currently used for anything by this function*/
2249
2250     /* propagate change to C++ */
2251     locale_set_default(newDefaultLocale);
2252 }
2253
2254 /**
2255  * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2256  * to an array of pointers to arrays of char.  All of these pointers are owned
2257  * by ICU-- do not delete them, and do not write through them.  The array is
2258  * terminated with a null pointer.
2259  */
2260 U_CAPI const char* const*  U_EXPORT2
2261 uloc_getISOLanguages()
2262 {
2263     return LANGUAGES;
2264 }
2265
2266 /**
2267  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2268  * pointer to an array of pointers to arrays of char.  All of these pointers are
2269  * owned by ICU-- do not delete them, and do not write through them.  The array is
2270  * terminated with a null pointer.
2271  */
2272 U_CAPI const char* const*  U_EXPORT2
2273 uloc_getISOCountries()
2274 {
2275     return COUNTRIES;
2276 }
2277
2278
2279 /* this function to be moved into cstring.c later */
2280 static char gDecimal = 0;
2281
2282 static /* U_CAPI */
2283 double
2284 /* U_EXPORT2 */
2285 _uloc_strtod(const char *start, char **end) {
2286     char *decimal;
2287     char *myEnd;
2288     char buf[30];
2289     double rv;
2290     if (!gDecimal) {
2291         char rep[5];
2292         /* For machines that decide to change the decimal on you,
2293         and try to be too smart with localization.
2294         This normally should be just a '.'. */
2295         sprintf(rep, "%+1.1f", 1.0);
2296         gDecimal = rep[2];
2297     }
2298
2299     if(gDecimal == '.') {
2300         return uprv_strtod(start, end); /* fall through to OS */
2301     } else {
2302         uprv_strncpy(buf, start, 29);
2303         buf[29]=0;
2304         decimal = uprv_strchr(buf, '.');
2305         if(decimal) {
2306             *decimal = gDecimal;
2307         } else {
2308             return uprv_strtod(start, end); /* no decimal point */
2309         }
2310         rv = uprv_strtod(buf, &myEnd);
2311         if(end) {
2312             *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2313         }
2314         return rv;
2315     }
2316 }
2317
/**
 * One entry parsed out of an HTTP Accept-Language value by
 * uloc_acceptLanguageFromHTTP(); arrays of these are ordered with
 * uloc_acceptLanguageCompare().
 */
typedef struct {
    float q;        /* quality weight: the parsed ";q=" value, or 1.0 when none was given */
    int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
    char locale[ULOC_FULLNAME_CAPACITY+1];  /* NUL-terminated locale ID of the entry */
} _acceptLangItem;
2323
2324 static int32_t U_CALLCONV
2325 uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2326 {
2327     const _acceptLangItem *aa = (const _acceptLangItem*)a;
2328     const _acceptLangItem *bb = (const _acceptLangItem*)b;
2329
2330     int32_t rc = 0;
2331     if(bb->q < aa->q) {
2332         rc = -1;  /* A > B */
2333     } else if(bb->q > aa->q) {
2334         rc = 1;   /* A < B */
2335     } else {
2336         rc = 0;   /* A = B */
2337     }
2338
2339     if(rc==0) {
2340         rc = uprv_stricmp(aa->locale, bb->locale);
2341     }
2342
2343 #if defined(ULOC_DEBUG)
2344     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2345     aa->locale, aa->q,
2346     bb->locale, bb->q,
2347     rc);*/
2348 #endif
2349
2350     return rc;
2351 }
2352
2353 /*
2354 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2355 */
2356
2357 U_CAPI int32_t U_EXPORT2
2358 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2359                             const char *httpAcceptLanguage,
2360                             UEnumeration* availableLocales,
2361                             UErrorCode *status)
2362 {
2363   MaybeStackArray<_acceptLangItem, 4> items; // Struct for collecting items.
2364     char tmp[ULOC_FULLNAME_CAPACITY +1];
2365     int32_t n = 0;
2366     const char *itemEnd;
2367     const char *paramEnd;
2368     const char *s;
2369     const char *t;
2370     int32_t res;
2371     int32_t i;
2372     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2373
2374     if(U_FAILURE(*status)) {
2375         return -1;
2376     }
2377
2378     for(s=httpAcceptLanguage;s&&*s;) {
2379         while(isspace(*s)) /* eat space at the beginning */
2380             s++;
2381         itemEnd=uprv_strchr(s,',');
2382         paramEnd=uprv_strchr(s,';');
2383         if(!itemEnd) {
2384             itemEnd = httpAcceptLanguage+l; /* end of string */
2385         }
2386         if(paramEnd && paramEnd<itemEnd) {
2387             /* semicolon (;) is closer than end (,) */
2388             t = paramEnd+1;
2389             if(*t=='q') {
2390                 t++;
2391             }
2392             while(isspace(*t)) {
2393                 t++;
2394             }
2395             if(*t=='=') {
2396                 t++;
2397             }
2398             while(isspace(*t)) {
2399                 t++;
2400             }
2401             items[n].q = (float)_uloc_strtod(t,NULL);
2402         } else {
2403             /* no semicolon - it's 1.0 */
2404             items[n].q = 1.0f;
2405             paramEnd = itemEnd;
2406         }
2407         items[n].dummy=0;
2408         /* eat spaces prior to semi */
2409         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2410             ;
2411         int32_t slen = ((t+1)-s);
2412         if(slen > ULOC_FULLNAME_CAPACITY) {
2413           *status = U_BUFFER_OVERFLOW_ERROR;
2414           return -1; // too big
2415         }
2416         uprv_strncpy(items[n].locale, s, slen);
2417         items[n].locale[slen]=0; // terminate
2418         int32_t clen = uloc_canonicalize(items[n].locale, tmp, UPRV_LENGTHOF(tmp)-1, status);
2419         if(U_FAILURE(*status)) return -1;
2420         if((clen!=slen) || (uprv_strncmp(items[n].locale, tmp, slen))) {
2421             // canonicalization had an effect- copy back
2422             uprv_strncpy(items[n].locale, tmp, clen);
2423             items[n].locale[clen] = 0; // terminate
2424         }
2425 #if defined(ULOC_DEBUG)
2426         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2427 #endif
2428         n++;
2429         s = itemEnd;
2430         while(*s==',') { /* eat duplicate commas */
2431             s++;
2432         }
2433         if(n>=items.getCapacity()) { // If we need more items
2434           if(NULL == items.resize(items.getCapacity()*2, items.getCapacity())) {
2435               *status = U_MEMORY_ALLOCATION_ERROR;
2436               return -1;
2437           }
2438 #if defined(ULOC_DEBUG)
2439           fprintf(stderr,"malloced at size %d\n", items.getCapacity());
2440 #endif
2441         }
2442     }
2443     uprv_sortArray(items.getAlias(), n, sizeof(items[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2444     if (U_FAILURE(*status)) {
2445         return -1;
2446     }
2447     LocalMemory<const char*> strs(NULL);
2448     if (strs.allocateInsteadAndReset(n) == NULL) {
2449         *status = U_MEMORY_ALLOCATION_ERROR;
2450         return -1;
2451     }
2452     for(i=0;i<n;i++) {
2453 #if defined(ULOC_DEBUG)
2454         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2455 #endif
2456         strs[i]=items[i].locale;
2457     }
2458     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2459                                strs.getAlias(), n, availableLocales, status);
2460     return res;
2461 }
2462
2463
2464 U_CAPI int32_t U_EXPORT2
2465 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2466                     UAcceptResult *outResult, const char **acceptList,
2467                     int32_t acceptListCount,
2468                     UEnumeration* availableLocales,
2469                     UErrorCode *status)
2470 {
2471     int32_t i,j;
2472     int32_t len;
2473     int32_t maxLen=0;
2474     char tmp[ULOC_FULLNAME_CAPACITY+1];
2475     const char *l;
2476     char **fallbackList;
2477     if(U_FAILURE(*status)) {
2478         return -1;
2479     }
2480     fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2481     if(fallbackList==NULL) {
2482         *status = U_MEMORY_ALLOCATION_ERROR;
2483         return -1;
2484     }
2485     for(i=0;i<acceptListCount;i++) {
2486 #if defined(ULOC_DEBUG)
2487         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2488 #endif
2489         while((l=uenum_next(availableLocales, NULL, status))) {
2490 #if defined(ULOC_DEBUG)
2491             fprintf(stderr,"  %s\n", l);
2492 #endif
2493             len = (int32_t)uprv_strlen(l);
2494             if(!uprv_strcmp(acceptList[i], l)) {
2495                 if(outResult) {
2496                     *outResult = ULOC_ACCEPT_VALID;
2497                 }
2498 #if defined(ULOC_DEBUG)
2499                 fprintf(stderr, "MATCH! %s\n", l);
2500 #endif
2501                 if(len>0) {
2502                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2503                 }
2504                 for(j=0;j<i;j++) {
2505                     uprv_free(fallbackList[j]);
2506                 }
2507                 uprv_free(fallbackList);
2508                 return u_terminateChars(result, resultAvailable, len, status);
2509             }
2510             if(len>maxLen) {
2511                 maxLen = len;
2512             }
2513         }
2514         uenum_reset(availableLocales, status);
2515         /* save off parent info */
2516         if(uloc_getParent(acceptList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2517             fallbackList[i] = uprv_strdup(tmp);
2518         } else {
2519             fallbackList[i]=0;
2520         }
2521     }
2522
2523     for(maxLen--;maxLen>0;maxLen--) {
2524         for(i=0;i<acceptListCount;i++) {
2525             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2526 #if defined(ULOC_DEBUG)
2527                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2528 #endif
2529                 while((l=uenum_next(availableLocales, NULL, status))) {
2530 #if defined(ULOC_DEBUG)
2531                     fprintf(stderr,"  %s\n", l);
2532 #endif
2533                     len = (int32_t)uprv_strlen(l);
2534                     if(!uprv_strcmp(fallbackList[i], l)) {
2535                         if(outResult) {
2536                             *outResult = ULOC_ACCEPT_FALLBACK;
2537                         }
2538 #if defined(ULOC_DEBUG)
2539                         fprintf(stderr, "fallback MATCH! %s\n", l);
2540 #endif
2541                         if(len>0) {
2542                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2543                         }
2544                         for(j=0;j<acceptListCount;j++) {
2545                             uprv_free(fallbackList[j]);
2546                         }
2547                         uprv_free(fallbackList);
2548                         return u_terminateChars(result, resultAvailable, len, status);
2549                     }
2550                 }
2551                 uenum_reset(availableLocales, status);
2552
2553                 if(uloc_getParent(fallbackList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2554                     uprv_free(fallbackList[i]);
2555                     fallbackList[i] = uprv_strdup(tmp);
2556                 } else {
2557                     uprv_free(fallbackList[i]);
2558                     fallbackList[i]=0;
2559                 }
2560             }
2561         }
2562         if(outResult) {
2563             *outResult = ULOC_ACCEPT_FAILED;
2564         }
2565     }
2566     for(i=0;i<acceptListCount;i++) {
2567         uprv_free(fallbackList[i]);
2568     }
2569     uprv_free(fallbackList);
2570     return -1;
2571 }
2572
2573 U_CAPI const char* U_EXPORT2
2574 uloc_toUnicodeLocaleKey(const char* keyword)
2575 {
2576     const char* bcpKey = ulocimp_toBcpKey(keyword);
2577     if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2578         // unknown keyword, but syntax is fine..
2579         return keyword;
2580     }
2581     return bcpKey;
2582 }
2583
2584 U_CAPI const char* U_EXPORT2
2585 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2586 {
2587     const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2588     if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2589         // unknown keyword, but syntax is fine..
2590         return value;
2591     }
2592     return bcpType;
2593 }
2594
2595 static UBool
2596 isWellFormedLegacyKey(const char* legacyKey)
2597 {
2598     const char* p = legacyKey;
2599     while (*p) {
2600         if (!UPRV_ISALPHANUM(*p)) {
2601             return FALSE;
2602         }
2603         p++;
2604     }
2605     return TRUE;
2606 }
2607
2608 static UBool
2609 isWellFormedLegacyType(const char* legacyType)
2610 {
2611     const char* p = legacyType;
2612     int32_t alphaNumLen = 0;
2613     while (*p) {
2614         if (*p == '_' || *p == '/' || *p == '-') {
2615             if (alphaNumLen == 0) {
2616                 return FALSE;
2617             }
2618             alphaNumLen = 0;
2619         } else if (UPRV_ISALPHANUM(*p)) {
2620             alphaNumLen++;
2621         } else {
2622             return FALSE;
2623         }
2624         p++;
2625     }
2626     return (alphaNumLen != 0);
2627 }
2628
2629 U_CAPI const char* U_EXPORT2
2630 uloc_toLegacyKey(const char* keyword)
2631 {
2632     const char* legacyKey = ulocimp_toLegacyKey(keyword);
2633     if (legacyKey == NULL) {
2634         // Checks if the specified locale key is well-formed with the legacy locale syntax.
2635         //
2636         // Note:
2637         //  LDML/CLDR provides some definition of keyword syntax in
2638         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2639         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2640         //  Keys can only consist of [0-9a-zA-Z].
2641         if (isWellFormedLegacyKey(keyword)) {
2642             return keyword;
2643         }
2644     }
2645     return legacyKey;
2646 }
2647
2648 U_CAPI const char* U_EXPORT2
2649 uloc_toLegacyType(const char* keyword, const char* value)
2650 {
2651     const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2652     if (legacyType == NULL) {
2653         // Checks if the specified locale type is well-formed with the legacy locale syntax.
2654         //
2655         // Note:
2656         //  LDML/CLDR provides some definition of keyword syntax in
2657         //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
2658         //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
2659         //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
2660         //  we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
2661         if (isWellFormedLegacyType(value)) {
2662             return value;
2663         }
2664     }
2665     return legacyType;
2666 }
2667
2668 /*eof*/