icuSources/common/uloc.c

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 1997-2010, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *
   7 * File ULOC.CPP
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   04/01/97    aliu        Creation.
  13 *   08/21/98    stephen     JDK 1.2 sync
  14 *   12/08/98    rtg         New Locale implementation and C API
  15 *   03/15/99    damiba      overhaul.
  16 *   04/06/99    stephen     changed setDefault() to realloc and copy
  17 *   06/14/99    stephen     Changed calls to ures_open for new params
  18 *   07/21/99    stephen     Modified setDefault() to propagate to C++
  19 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
  20 *                           brought canonicalization code into line with spec
  21 *****************************************************************************/
  22
  23 /*
  24    POSIX's locale format, from putil.c: [no spaces]
  25
  26      ll [ _CC ] [ . MM ] [ @ VV]
  27
  28      l = lang, C = ctry, M = charmap, V = variant
  29 */
  30
  31 #include "unicode/utypes.h"
  32 #include "unicode/ustring.h"
  33 #include "unicode/uloc.h"
  34
  35 #include "putilimp.h"
  36 #include "ustr_imp.h"
  37 #include "ulocimp.h"
  38 #include "umutex.h"
  39 #include "cstring.h"
  40 #include "cmemory.h"
  41 #include "ucln_cmn.h"
  42 #include "locmap.h"
  43 #include "uarrsort.h"
  44 #include "uenumimp.h"
  45 #include "uassert.h"
  46
  47 #include <stdio.h> /* for sprintf */
  48
  49 /* ### Declarations **************************************************/
  50
  51 /* Locale stuff from locid.cpp */
  52 U_CFUNC void locale_set_default(const char *id);
  53 U_CFUNC const char *locale_get_default(void);
  54 U_CFUNC int32_t
  55 locale_getKeywords(const char *localeID,
  56             char prev,
  57             char *keywords, int32_t keywordCapacity,
  58             char *values, int32_t valuesCapacity, int32_t *valLen,
  59             UBool valuesToo,
  60             UErrorCode *status);
  61
  62 /* ### Data tables **************************************************/
  63
  64 /**
  65  * Table of language codes, both 2- and 3-letter, with preference
  66  * given to 2-letter codes where possible.  Includes 3-letter codes
  67  * that lack a 2-letter equivalent.
  68  *
  69  * This list must be in sorted order.  This list is returned directly
  70  * to the user by some API.
  71  *
  72  * This list must be kept in sync with LANGUAGES_3, with corresponding
  73  * entries matched.
  74  *
  75  * This table should be terminated with a NULL entry, followed by a
  76  * second list, and another NULL entry.  The first list is visible to
  77  * user code when this array is returned by API.  The second list
  78  * contains codes we support, but do not expose through user API.
  79  *
  80  * Notes
  81  *
  82  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
  83  * include the revisions up to 2001/7/27 *CWB*
  84  *
  85  * The 3 character codes are the terminology codes like RFC 3066.  This
  86  * is compatible with prior ICU codes
  87  *
  88  * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
  89  * table but now at the end of the table because 3 character codes are
  90  * duplicates.  This avoids bad searches going from 3 to 2 character
  91  * codes.
  92  *
  93  * The range qaa-qtz is reserved for local use
  94  */
  95 static const char * const LANGUAGES[] = {
  96     "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",
  97     "afh", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",
  98     "ang", "anp", "apa",
  99     "ar",  "arc", "arn", "arp", "art", "arw", "as",  "ast",
 100     "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",
 101     "bai", "bal", "ban", "bas", "bat", "be",  "bej",
 102     "bem", "ber", "bg",  "bh",  "bho", "bi",  "bik", "bin",
 103     "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "bs",
 104     "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",
 105     "cch", "ce",  "ceb", "cel", "ch",  "chb", "chg", "chk", "chm",
 106     "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",
 107     "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",
 108     "cv",  "cy",  "da",  "dak", "dar", "day", "de",  "del", "den",
 109     "dgr", "din", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyu",
 110     "dz",  "ee",  "efi", "egy", "eka", "el",  "elx", "en",
 111     "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",
 112     "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",
 113     "fr",  "frm", "fro", "frr", "frs", "fur", "fy",
 114     "ga",  "gaa", "gay", "gba", "gd",  "gem", "gez", "gil",
 115     "gl",  "gmh", "gn",  "goh", "gon", "gor", "got", "grb",
 116     "grc", "gsw", "gu",  "gv", "gwi",
 117     "ha",  "hai", "haw", "he",  "hi",  "hil", "him",
 118     "hit", "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",  "hz",
 119     "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",
 120     "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",
 121     "iu",  "ja",  "jbo", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",
 122     "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kfo", "kg",  "kha", "khi",
 123     "kho", "ki",  "kj",  "kk",  "kl",  "km",  "kmb", "kn",
 124     "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks",
 125     "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad",
 126     "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",
 127     "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus",
 128     "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",
 129     "mdf", "mdr", "men", "mfe", "mg",  "mga", "mh",  "mi",  "mic", "min",
 130     "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",
 131     "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mul", "mun",
 132     "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nap",
 133     "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",
 134     "niu", "nl",  "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub",
 135     "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",
 136     "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",
 137     "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
 138     "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",
 139     "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rom",
 140     "ru",  "rup", "rw",  "sa",  "sad", "sah", "sai", "sal", "sam",
 141     "sas", "sat", "sc",  "scn", "sco", "sd",  "se",  "sel", "sem",
 142     "sg",  "sga", "sgn", "shn", "si",  "sid", "sio", "sit",
 143     "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",
 144     "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",
 145     "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",
 146     "sv",  "sw",  "syc", "syr", "ta",  "tai", "te",  "tem", "ter",
 147     "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",
 148     "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr", "trv",
 149     "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",
 150     "ty",  "tyv", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",
 151     "uz",  "vai", "ve",  "vi",  "vo",  "vot", "wa",  "wak",
 152     "wal", "war", "was", "wen", "wo",  "xal", "xh",  "yao", "yap",
 153     "yi",  "yo",  "ypk", "za",  "zap", "zbl", "zen", "zh",  "znd",
 154     "zu",  "zun", "zxx", "zza",
 155 NULL,
 156     "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
 157 NULL
 158 };
 159 static const char* const DEPRECATED_LANGUAGES[]={
 160     "in", "iw", "ji", "jw", NULL, NULL
 161 };
 162 static const char* const REPLACEMENT_LANGUAGES[]={
 163     "id", "he", "yi", "jv", NULL, NULL
 164 };
 165
 166 /**
 167  * Table of 3-letter language codes.
 168  *
 169  * This is a lookup table used to convert 3-letter language codes to
 170  * their 2-letter equivalent, where possible.  It must be kept in sync
 171  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
 172  * same language as LANGUAGES_3[i].  The commented-out lines are
 173  * copied from LANGUAGES to make eyeballing this baby easier.
 174  *
 175  * Where a 3-letter language code has no 2-letter equivalent, the
 176  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 177  *
 178  * This table should be terminated with a NULL entry, followed by a
 179  * second list, and another NULL entry.  The two lists correspond to
 180  * the two lists in LANGUAGES.
 181  */
 182 static const char * const LANGUAGES_3[] = {
 183 /*  "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",    */
 184     "aar", "abk", "ace", "ach", "ada", "ady", "ave", "afr", "afa",
 185 /*  "afh", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",  "ang", "anp", "apa",    */
 186     "afh", "ain", "aka", "akk", "ale", "alg", "alt", "amh", "arg", "ang", "anp", "apa",
 187 /*  "ar",  "arc", "arn", "arp", "art", "arw", "as",  "ast",    */
 188     "ara", "arc", "arn", "arp", "art", "arw", "asm", "ast",
 189 /*  "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",    */
 190     "ath", "aus", "ava", "awa", "aym", "aze", "bak", "bad",
 191 /*  "bai", "bal", "ban", "bas", "bat", "be",  "bej",    */
 192     "bai", "bal", "ban", "bas", "bat", "bel", "bej",
 193 /*  "bem", "ber", "bg",  "bh",  "bho", "bi",  "bik", "bin",    */
 194     "bem", "ber", "bul", "bih", "bho", "bis", "bik", "bin",
 195 /*  "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "bs",     */
 196     "bla", "bam", "ben", "bnt", "bod", "bre", "bra", "bos",
 197 /*  "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",    */
 198     "btk", "bua", "bug", "byn", "cat", "cad", "cai", "car", "cau",
 199 /*  "cch", "ce",  "ceb", "cel", "ch",  "chb", "chg", "chk", "chm",    */
 200     "cch", "che", "ceb", "cel", "cha", "chb", "chg", "chk", "chm",
 201 /*  "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",    */
 202     "chn", "cho", "chp", "chr", "chy", "cmc", "cos", "cop",
 203 /*  "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",    */
 204     "cpe", "cpf", "cpp", "cre", "crh", "crp", "ces", "csb", "chu", "cus",
 205 /*  "cv",  "cy",  "da",  "dak", "dar", "day", "de",  "del", "den",    */
 206     "chv", "cym", "dan", "dak", "dar", "day", "deu", "del", "den",
 207 /*  "dgr", "din", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyu",    */
 208     "dgr", "din", "doi", "dra", "dsb", "dua", "dum", "div", "dyu",
 209 /*  "dz",  "ee",  "efi", "egy", "eka", "el",  "elx", "en",     */
 210     "dzo", "ewe", "efi", "egy", "eka", "ell", "elx", "eng",
 211 /*  "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",     */
 212     "enm", "epo", "spa", "est", "eus", "ewo", "fas",
 213 /*  "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",    */
 214     "fan", "fat", "ful", "fin", "fil", "fiu", "fij", "fao", "fon",
 215 /*  "fr",  "frm", "fro", "frr", "frs", "fur", "fy",  "ga",  "gaa", "gay",    */
 216     "fra", "frm", "fro", "frr", "frs", "fur", "fry", "gle", "gaa", "gay",
 217 /*  "gba", "gd",  "gem", "gez", "gil", "gl",  "gmh", "gn",     */
 218     "gba", "gla", "gem", "gez", "gil", "glg", "gmh", "grn",
 219 /*  "goh", "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "gv",     */
 220     "goh", "gon", "gor", "got", "grb", "grc", "gsw", "guj", "glv",
 221 /*  "gwi", "ha",  "hai", "haw", "he",  "hi",  "hil", "him",    */
 222     "gwi", "hau", "hai", "haw", "heb", "hin", "hil", "him",
 223 /*  "hit", "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",  "hz",     */
 224     "hit", "hmn", "hmo", "hrv", "hsb", "hat", "hun", "hup", "hye", "her",
 225 /*  "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",     */
 226     "ina", "iba", "ind", "ile", "ibo", "iii", "ijo", "ipk",
 227 /*  "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",      */
 228     "ilo", "inc", "ine", "inh", "ido", "ira", "iro", "isl", "ita",
 229 /*  "iu",  "ja",  "jbo", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",   */
 230     "iku", "jpn", "jbo", "jpr", "jrb", "jav", "kat", "kaa", "kab",
 231 /*  "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kfo", "kg",  "kha", "khi",*/
 232     "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kfo", "kg",  "kha", "khi",
 233 /*  "kho", "ki",  "kj",  "kk",  "kl",  "km",  "kmb", "kn",     */
 234     "kho", "kik", "kua", "kaz", "kal", "khm", "kmb", "kan",
 235 /*  "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks",     */
 236     "kor", "kok", "kos", "kpe", "kau", "krc", "krl", "kro", "kru", "kas",
 237 /*  "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad",    */
 238     "kur", "kum", "kut", "kom", "cor", "kir", "lat", "lad",
 239 /*  "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",    */
 240     "lah", "lam", "ltz", "lez", "lug", "lim", "lin", "lao", "lol",
 241 /*  "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus",    */
 242     "loz", "lit", "lub", "lua", "lui", "lun", "luo", "lus",
 243 /*  "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",    */
 244     "lav", "mad", "mag", "mai", "mak", "man", "map", "mas",
 245 /*  "mdf", "mdr", "men", "mfe", "mg",  "mga", "mh",  "mi",  "mic", "min",    */
 246     "mdf", "mdr", "men", "mfe", "mlg", "mga", "mah", "mri", "mic", "min",
 247 /*  "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",    */
 248     "mis", "mkd", "mkh", "mal", "mon", "mnc", "mni", "mno",
 249 /*  "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mul", "mun",    */
 250     "mol", "moh", "mos", "mar", "msa", "mlt", "mul", "mun",
 251 /*  "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nap",    */
 252     "mus", "mwl", "mwr", "mya", "myn", "myv", "nau", "nah", "nai", "nap",
 253 /*  "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",    */
 254     "nob", "nde", "nds", "nep", "new", "ndo", "nia", "nic",
 255 /*  "niu", "nl",  "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub",    */
 256     "niu", "nld", "nno", "nor", "nog", "non", "nqo", "nbl", "nso", "nub",
 257 /*  "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",     */
 258     "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi", "oci", "oji",
 259 /*  "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",    */
 260     "orm", "ori", "oss", "osa", "ota", "oto", "pan", "paa",
 261 /*  "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",    */
 262     "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
 263 /*  "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",     */
 264     "pli", "pol", "pon", "pra", "pro", "pus", "por", "que",
 265 /*  "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rom",    */
 266     "raj", "rap", "rar", "roh", "run", "ron", "roa", "rom",
 267 /*  "ru",  "rup", "rw",  "sa",  "sad", "sah", "sai", "sal", "sam",    */
 268     "rus", "rup", "kin", "san", "sad", "sah", "sai", "sal", "sam",
 269 /*  "sas", "sat", "sc",  "scn", "sco", "sd",  "se",  "sel", "sem",    */
 270     "sas", "sat", "srd", "scn", "sco", "snd", "sme", "sel", "sem",
 271 /*  "sg",  "sga", "sgn", "shn", "si",  "sid", "sio", "sit",    */
 272     "sag", "sga", "sgn", "shn", "sin", "sid", "sio", "sit",
 273 /*  "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",    */
 274     "slk", "slv", "sla", "smo", "sma", "smi", "smj", "smn",
 275 /*  "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",     */
 276     "sms", "sna", "snk", "som", "sog", "son", "sqi", "srp",
 277 /*  "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",    */
 278     "srn", "srr", "ssw", "ssa", "sot", "sun", "suk", "sus", "sux",
 279 /*  "sv",  "sw",  "syc", "syr", "ta",  "tai", "te",  "tem", "ter",    */
 280     "swe", "swa", "syc", "syr", "tam", "tai", "tel", "tem", "ter",
 281 /*  "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",    */
 282     "tet", "tgk", "tha", "tir", "tig", "tiv", "tuk", "tkl",
 283 /*  "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr", "trv",    */
 284     "tgl", "tlh", "tli", "tmh", "tsn", "ton", "tog", "tpi", "tur", "trv",
 285 /*  "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",     */
 286     "tso", "tsi", "tat", "tum", "tup", "tut", "tvl", "twi",
 287 /*  "ty",  "tyv", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",     */
 288     "tah", "tyv", "udm", "uig", "uga", "ukr", "umb", "und", "urd",
 289 /*  "uz",  "vai", "ve",  "vi",  "vo",  "vot", "wa",  "wak",    */
 290     "uzb", "vai", "ven", "vie", "vol", "vot", "wln", "wak",
 291 /*  "wal", "war", "was", "wen", "wo",  "xal", "xh",  "yao", "yap",    */
 292     "wal", "war", "was", "wen", "wol", "xal", "xho", "yao", "yap",
 293 /*  "yi",  "yo",  "ypk", "za",  "zap", "zbl", "zen", "zh",  "znd",    */
 294     "yid", "yor", "ypk", "zha", "zap", "zbl", "zen", "zho", "znd",
 295 /*  "zu",  "zun", "zxx", "zza",                                         */
 296     "zul", "zun", "zxx", "zza",
 297 NULL,
 298 /*  "in",  "iw",  "ji",  "jw",  "sh",                          */
 299     "ind", "heb", "yid", "jaw", "srp",
 300 NULL
 301 };
 302
 303 /**
 304  * Table of 2-letter country codes.
 305  *
 306  * This list must be in sorted order.  This list is returned directly
 307  * to the user by some API.
 308  *
 309  * This list must be kept in sync with COUNTRIES_3, with corresponding
 310  * entries matched.
 311  *
 312  * This table should be terminated with a NULL entry, followed by a
 313  * second list, and another NULL entry.  The first list is visible to
 314  * user code when this array is returned by API.  The second list
 315  * contains codes we support, but do not expose through user API.
 316  *
 317  * Notes:
 318  *
 319  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 320  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
 321  * new codes keeping the old ones for compatibility updated to include
 322  * 1999/12/03 revisions *CWB*
 323  *
 324  * RO(ROM) is now RO(ROU) according to
 325  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
 326  */
 327 static const char * const COUNTRIES[] = {
 328     "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",
 329     "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
 330     "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
 331     "BJ",  "BL",  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",
 332     "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
 333     "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
 334     "CU",  "CV",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
 335     "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
 336     "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
 337     "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
 338     "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
 339     "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
 340     "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
 341     "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
 342     "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
 343     "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
 344     "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
 345     "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
 346     "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
 347     "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
 348     "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
 349     "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
 350     "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
 351     "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
 352     "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",
 353     "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
 354     "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
 355     "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
 356     "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
 357     "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
 358 NULL,
 359     "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   /* obsolete country codes */
 360 NULL
 361 };
 362
 363 static const char* const DEPRECATED_COUNTRIES[] ={
 364     "BU", "CS", "DY", "FX", "HV", "NH", "RH", "TP", "YU", "ZR", NULL, NULL /* deprecated country list */
 365 };
 366 static const char* const REPLACEMENT_COUNTRIES[] = {
 367 /*  "BU", "CS", "DY", "FX", "HV", "NH", "RH", "TP", "YU", "ZR" */
 368     "MM", "RS", "BJ", "FR", "BF", "VU", "ZW", "TL", "RS", "CD", NULL, NULL  /* replacement country codes */
 369 };
 370
 371 /**
 372  * Table of 3-letter country codes.
 373  *
 374  * This is a lookup table used to convert 3-letter country codes to
 375  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
 376  * For all valid i, COUNTRIES[i] must refer to the same country as
 377  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
 378  * to make eyeballing this baby easier.
 379  *
 380  * This table should be terminated with a NULL entry, followed by a
 381  * second list, and another NULL entry.  The two lists correspond to
 382  * the two lists in COUNTRIES.
 383  */
 384 static const char * const COUNTRIES_3[] = {
 385 /*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",     */
 386     "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM", "ANT",
 387 /*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
 388     "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
 389 /*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
 390     "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
 391 /*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",     */
 392     "BEN", "BLM", "BMU", "BRN", "BOL", "BRA", "BHS", "BTN", "BVT",
 393 /*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
 394     "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
 395 /*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
 396     "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
 397 /*  "CU",  "CV",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
 398     "CUB", "CPV", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
 399 /*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
 400     "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
 401 /*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
 402     "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
 403 /*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
 404     "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
 405 /*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
 406     "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
 407 /*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
 408     "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
 409 /*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
 410     "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
 411 /*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
 412     "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
 413 /*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
 414     "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
 415 /*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
 416     "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
 417 /*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
 418     "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
 419 /*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
 420     "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
 421 /*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
 422     "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
 423 /*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
 424     "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
 425 /*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
 426     "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
 427 /*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
 428     "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
 429 /*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
 430     "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
 431 /*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
 432     "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
 433 /*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",     */
 434     "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "STP", "SLV",
 435 /*  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
 436     "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
 437 /*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
 438     "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
 439 /*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
 440     "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
 441 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
 442     "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
 443 /*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
 444     "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
 445 NULL,
 446 /*  "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   */
 447     "FXX", "SCG", "ROM", "TMP", "YUG", "ZAR",
 448 NULL
 449 };
 450
 451 typedef struct CanonicalizationMap {
 452     const char *id;          /* input ID */
 453     const char *canonicalID; /* canonicalized output ID */
 454     const char *keyword;     /* keyword, or NULL if none */
 455     const char *value;       /* keyword value, or NULL if kw==NULL */
 456 } CanonicalizationMap;
 457
 458 /**
 459  * A map to canonicalize locale IDs.  This handles a variety of
 460  * different semantic kinds of transformations.
 461  */
 462 static const CanonicalizationMap CANONICALIZE_MAP[] = {
 463     { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
 464     { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
 465     { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
 466     { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
 467     { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
 468     { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
 469     { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
 470     { "cel_GAULISH",    "cel__GAULISH", NULL, NULL }, /* registered name */
 471     { "de_1901",        "de__1901", NULL, NULL }, /* registered name */
 472     { "de_1906",        "de__1906", NULL, NULL }, /* registered name */
 473     { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
 474     { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
 475     { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
 476     { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
 477     { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
 478     { "en_BOONT",       "en__BOONT", NULL, NULL }, /* registered name */
 479     { "en_SCOUSE",      "en__SCOUSE", NULL, NULL }, /* registered name */
 480     { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
 481     { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
 482     { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
 483     { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
 484     { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
 485     { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
 486     { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
 487     { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
 488     { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
 489     { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
 490     { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
 491     { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
 492     { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
 493     { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
 494     { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
 495     { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
 496     { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
 497     { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
 498     { "sl_ROZAJ",       "sl__ROZAJ", NULL, NULL }, /* registered name */
 499     { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
 500     { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
 501     { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
 502     { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
 503     { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
 504     { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
 505     { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
 506     { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
 507     { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
 508     { "zh_GAN",         "zh__GAN", NULL, NULL }, /* registered name */
 509     { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
 510     { "zh_HAKKA",       "zh__HAKKA", NULL, NULL }, /* registered name */
 511     { "zh_MIN",         "zh__MIN", NULL, NULL }, /* registered name */
 512     { "zh_MIN_NAN",     "zh__MINNAN", NULL, NULL }, /* registered name */
 513     { "zh_WUU",         "zh__WUU", NULL, NULL }, /* registered name */
 514     { "zh_XIANG",       "zh__XIANG", NULL, NULL }, /* registered name */
 515     { "zh_YUE",         "zh__YUE", NULL, NULL }, /* registered name */
 516 };
 517
 518 typedef struct VariantMap {
 519     const char *variant;          /* input ID */
 520     const char *keyword;     /* keyword, or NULL if none */
 521     const char *value;       /* keyword value, or NULL if kw==NULL */
 522 } VariantMap;
 523
 524 static const VariantMap VARIANT_MAP[] = {
 525     { "EURO",   "currency", "EUR" },
 526     { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
 527     { "STROKE", "collation", "stroke" }  /* Solaris variant */
 528 };
 529
 530 /* ### BCP47 Conversion *******************************************/
 531 /* Test if the locale id has BCP47 u extension and does not have '@' */
 532 #define _hasBCP47Extension(id) (id && uprv_strstr(id, "@") == NULL && getShortestSubtagLength(localeID) == 1)
 533 /* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
 534 #define _ConvertBCP47(finalID, id, buffer, length,err) \
 535         if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 || U_FAILURE(*err)) { \
 536             finalID=id; \
 537         } else { \
 538             finalID=buffer; \
 539         }
 540 /* Gets the size of the shortest subtag in the given localeID. */
 541 static int32_t getShortestSubtagLength(const char *localeID) {
 542     int32_t localeIDLength = uprv_strlen(localeID);
 543     int32_t length = localeIDLength;
 544     int32_t tmpLength = 0;
 545     int32_t i;
 546     UBool reset = TRUE;
 547
 548     for (i = 0; i < localeIDLength; i++) {
 549         if (localeID[i] != '_' && localeID[i] != '-') {
 550             if (reset) {
 551                 tmpLength = 0;
 552                 reset = FALSE;
 553             }
 554             tmpLength++;
 555         } else {
 556             if (tmpLength != 0 && tmpLength < length) {
 557                 length = tmpLength;
 558             }
 559             reset = TRUE;
 560         }
 561     }
 562
 563     return length;
 564 }
 565
 566 /* ### Keywords **************************************************/
 567
 /* Size of a buffer holding one canonicalized keyword name, terminator included. */
 #define ULOC_KEYWORD_BUFFER_LEN 25
 /* Number of entries in the keyword list built while parsing keywords. */
 #define ULOC_MAX_NO_KEYWORDS 25
 570
 571 U_CAPI const char * U_EXPORT2
 572 locale_getKeywordsStart(const char *localeID) {
 573     const char *result = NULL;
 574     if((result = uprv_strchr(localeID, '@')) != NULL) {
 575         return result;
 576     }
 577 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
 578     else {
 579         /* We do this because the @ sign is variant, and the @ sign used on one
 580         EBCDIC machine won't be compiled the same way on other EBCDIC based
 581         machines. */
 582         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
 583         const uint8_t *charToFind = ebcdicSigns;
 584         while(*charToFind) {
 585             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
 586                 return result;
 587             }
 588             charToFind++;
 589         }
 590     }
 591 #endif
 592     return NULL;
 593 }
 594
 595 /**
 596  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
 597  * @param keywordName incoming name to be canonicalized
 598  * @param status return status (keyword too long)
 599  * @return length of the keyword name
 600  */
 601 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
 602 {
 603   int32_t i;
 604   int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
 605
 606   if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
 607     /* keyword name too long for internal buffer */
 608     *status = U_INTERNAL_PROGRAM_ERROR;
 609           return 0;
 610   }
 611
 612   /* normalize the keyword name */
 613   for(i = 0; i < keywordNameLen; i++) {
 614     buf[i] = uprv_tolower(keywordName[i]);
 615   }
 616   buf[i] = 0;
 617
 618   return keywordNameLen;
 619 }
 620
 621 typedef struct {
 622     char keyword[ULOC_KEYWORD_BUFFER_LEN];
 623     int32_t keywordLen;
 624     const char *valueStart;
 625     int32_t valueLen;
 626 } KeywordStruct;
 627
 628 static int32_t U_CALLCONV
 629 compareKeywordStructs(const void *context, const void *left, const void *right) {
 630     const char* leftString = ((const KeywordStruct *)left)->keyword;
 631     const char* rightString = ((const KeywordStruct *)right)->keyword;
 632     return uprv_strcmp(leftString, rightString);
 633 }
 634
 635 /**
 636  * Both addKeyword and addValue must already be in canonical form.
 637  * Either both addKeyword and addValue are NULL, or neither is NULL.
 638  * If they are not NULL they must be zero terminated.
 639  * If addKeyword is not NULL is must have length small enough to fit in KeywordStruct.keyword.
 640  */
 641 static int32_t
 642 _getKeywords(const char *localeID,
 643              char prev,
 644              char *keywords, int32_t keywordCapacity,
 645              char *values, int32_t valuesCapacity, int32_t *valLen,
 646              UBool valuesToo,
 647              const char* addKeyword,
 648              const char* addValue,
 649              UErrorCode *status)
 650 {
 651     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
 652
 653     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
 654     int32_t numKeywords = 0;
 655     const char* pos = localeID;
 656     const char* equalSign = NULL;
 657     const char* semicolon = NULL;
 658     int32_t i = 0, j, n;
 659     int32_t keywordsLen = 0;
 660     int32_t valuesLen = 0;
 661
 662     if(prev == '@') { /* start of keyword definition */
 663         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
 664         do {
 665             UBool duplicate = FALSE;
 666             /* skip leading spaces */
 667             while(*pos == ' ') {
 668                 pos++;
 669             }
 670             if (!*pos) { /* handle trailing "; " */
 671                 break;
 672             }
 673             if(numKeywords == maxKeywords) {
 674                 *status = U_INTERNAL_PROGRAM_ERROR;
 675                 return 0;
 676             }
 677             equalSign = uprv_strchr(pos, '=');
 678             semicolon = uprv_strchr(pos, ';');
 679             /* lack of '=' [foo@currency] is illegal */
 680             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
 681             if(!equalSign || (semicolon && semicolon<equalSign)) {
 682                 *status = U_INVALID_FORMAT_ERROR;
 683                 return 0;
 684             }
 685             /* need to normalize both keyword and keyword name */
 686             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
 687                 /* keyword name too long for internal buffer */
 688                 *status = U_INTERNAL_PROGRAM_ERROR;
 689                 return 0;
 690             }
 691             for(i = 0, n = 0; i < equalSign - pos; ++i) {
 692                 if (pos[i] != ' ') {
 693                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
 694                 }
 695             }
 696             keywordList[numKeywords].keyword[n] = 0;
 697             keywordList[numKeywords].keywordLen = n;
 698             /* now grab the value part. First we skip the '=' */
 699             equalSign++;
 700             /* then we leading spaces */
 701             while(*equalSign == ' ') {
 702                 equalSign++;
 703             }
 704             keywordList[numKeywords].valueStart = equalSign;
 705
 706             pos = semicolon;
 707             i = 0;
 708             if(pos) {
 709                 while(*(pos - i - 1) == ' ') {
 710                     i++;
 711                 }
 712                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
 713                 pos++;
 714             } else {
 715                 i = (int32_t)uprv_strlen(equalSign);
 716                 while(equalSign[i-1] == ' ') {
 717                     i--;
 718                 }
 719                 keywordList[numKeywords].valueLen = i;
 720             }
 721             /* If this is a duplicate keyword, then ignore it */
 722             for (j=0; j<numKeywords; ++j) {
 723                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
 724                     duplicate = TRUE;
 725                     break;
 726                 }
 727             }
 728             if (!duplicate) {
 729                 ++numKeywords;
 730             }
 731         } while(pos);
 732
 733         /* Handle addKeyword/addValue. */
 734         if (addKeyword != NULL) {
 735             UBool duplicate = FALSE;
 736             U_ASSERT(addValue != NULL);
 737             /* Search for duplicate; if found, do nothing. Explicit keyword
 738                overrides addKeyword. */
 739             for (j=0; j<numKeywords; ++j) {
 740                 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
 741                     duplicate = TRUE;
 742                     break;
 743                 }
 744             }
 745             if (!duplicate) {
 746                 if (numKeywords == maxKeywords) {
 747                     *status = U_INTERNAL_PROGRAM_ERROR;
 748                     return 0;
 749                 }
 750                 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
 751                 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
 752                 keywordList[numKeywords].valueStart = addValue;
 753                 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
 754                 ++numKeywords;
 755             }
 756         } else {
 757             U_ASSERT(addValue == NULL);
 758         }
 759
 760         /* now we have a list of keywords */
 761         /* we need to sort it */
 762         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
 763
 764         /* Now construct the keyword part */
 765         for(i = 0; i < numKeywords; i++) {
 766             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
 767                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
 768                 if(valuesToo) {
 769                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
 770                 } else {
 771                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
 772                 }
 773             }
 774             keywordsLen += keywordList[i].keywordLen + 1;
 775             if(valuesToo) {
 776                 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
 777                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
 778                 }
 779                 keywordsLen += keywordList[i].valueLen;
 780
 781                 if(i < numKeywords - 1) {
 782                     if(keywordsLen < keywordCapacity) {
 783                         keywords[keywordsLen] = ';';
 784                     }
 785                     keywordsLen++;
 786                 }
 787             }
 788             if(values) {
 789                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
 790                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
 791                     values[valuesLen + keywordList[i].valueLen] = 0;
 792                 }
 793                 valuesLen += keywordList[i].valueLen + 1;
 794             }
 795         }
 796         if(values) {
 797             values[valuesLen] = 0;
 798             if(valLen) {
 799                 *valLen = valuesLen;
 800             }
 801         }
 802         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
 803     } else {
 804         return 0;
 805     }
 806 }
 807
 808 U_CFUNC int32_t
 809 locale_getKeywords(const char *localeID,
 810                    char prev,
 811                    char *keywords, int32_t keywordCapacity,
 812                    char *values, int32_t valuesCapacity, int32_t *valLen,
 813                    UBool valuesToo,
 814                    UErrorCode *status) {
 815     return _getKeywords(localeID, prev, keywords, keywordCapacity,
 816                         values, valuesCapacity, valLen, valuesToo,
 817                         NULL, NULL, status);
 818 }
 819
 820 U_CAPI int32_t U_EXPORT2
 821 uloc_getKeywordValue(const char* localeID,
 822                      const char* keywordName,
 823                      char* buffer, int32_t bufferCapacity,
 824                      UErrorCode* status)
 825 {
 826     const char* startSearchHere = NULL;
 827     const char* nextSeparator = NULL;
 828     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 829     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 830     int32_t i = 0;
 831     int32_t result = 0;
 832
 833     if(status && U_SUCCESS(*status) && localeID) {
 834       char tempBuffer[ULOC_FULLNAME_CAPACITY];
 835       const char* tmpLocaleID;
 836
 837       if (_hasBCP47Extension(localeID)) {
 838           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
 839       } else {
 840           tmpLocaleID=localeID;
 841       }
 842
 843       startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
 844       if(startSearchHere == NULL) {
 845           /* no keywords, return at once */
 846           return 0;
 847       }
 848
 849       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 850       if(U_FAILURE(*status)) {
 851         return 0;
 852       }
 853
 854       /* find the first keyword */
 855       while(startSearchHere) {
 856           startSearchHere++;
 857           /* skip leading spaces (allowed?) */
 858           while(*startSearchHere == ' ') {
 859               startSearchHere++;
 860           }
 861           nextSeparator = uprv_strchr(startSearchHere, '=');
 862           /* need to normalize both keyword and keyword name */
 863           if(!nextSeparator) {
 864               break;
 865           }
 866           if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
 867               /* keyword name too long for internal buffer */
 868               *status = U_INTERNAL_PROGRAM_ERROR;
 869               return 0;
 870           }
 871           for(i = 0; i < nextSeparator - startSearchHere; i++) {
 872               localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
 873           }
 874           /* trim trailing spaces */
 875           while(startSearchHere[i-1] == ' ') {
 876               i--;
 877           }
 878           localeKeywordNameBuffer[i] = 0;
 879
 880           startSearchHere = uprv_strchr(nextSeparator, ';');
 881
 882           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
 883               nextSeparator++;
 884               while(*nextSeparator == ' ') {
 885                   nextSeparator++;
 886               }
 887               /* we actually found the keyword. Copy the value */
 888               if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
 889                   while(*(startSearchHere-1) == ' ') {
 890                       startSearchHere--;
 891                   }
 892                   uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
 893                   result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
 894               } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
 895                   i = (int32_t)uprv_strlen(nextSeparator);
 896                   while(nextSeparator[i - 1] == ' ') {
 897                       i--;
 898                   }
 899                   uprv_strncpy(buffer, nextSeparator, i);
 900                   result = u_terminateChars(buffer, bufferCapacity, i, status);
 901               } else {
 902                   /* give a bigger buffer, please */
 903                   *status = U_BUFFER_OVERFLOW_ERROR;
 904                   if(startSearchHere) {
 905                       result = (int32_t)(startSearchHere - nextSeparator);
 906                   } else {
 907                       result = (int32_t)uprv_strlen(nextSeparator);
 908                   }
 909               }
 910               return result;
 911           }
 912       }
 913     }
 914     return 0;
 915 }
 916
 917 U_CAPI int32_t U_EXPORT2
 918 uloc_setKeywordValue(const char* keywordName,
 919                      const char* keywordValue,
 920                      char* buffer, int32_t bufferCapacity,
 921                      UErrorCode* status)
 922 {
 923     /* TODO: sorting. removal. */
 924     int32_t keywordNameLen;
 925     int32_t keywordValueLen;
 926     int32_t bufLen;
 927     int32_t needLen = 0;
 928     int32_t foundValueLen;
 929     int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
 930     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 931     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 932     int32_t i = 0;
 933     int32_t rc;
 934     char* nextSeparator = NULL;
 935     char* nextEqualsign = NULL;
 936     char* startSearchHere = NULL;
 937     char* keywordStart = NULL;
 938     char *insertHere = NULL;
 939     if(U_FAILURE(*status)) {
 940         return -1;
 941     }
 942     if(bufferCapacity>1) {
 943         bufLen = (int32_t)uprv_strlen(buffer);
 944     } else {
 945         *status = U_ILLEGAL_ARGUMENT_ERROR;
 946         return 0;
 947     }
 948     if(bufferCapacity<bufLen) {
 949         /* The capacity is less than the length?! Is this NULL terminated? */
 950         *status = U_ILLEGAL_ARGUMENT_ERROR;
 951         return 0;
 952     }
 953     if(keywordValue && !*keywordValue) {
 954         keywordValue = NULL;
 955     }
 956     if(keywordValue) {
 957         keywordValueLen = (int32_t)uprv_strlen(keywordValue);
 958     } else {
 959         keywordValueLen = 0;
 960     }
 961     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 962     if(U_FAILURE(*status)) {
 963         return 0;
 964     }
 965     startSearchHere = (char*)locale_getKeywordsStart(buffer);
 966     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
 967         if(!keywordValue) { /* no keywords = nothing to remove */
 968             return bufLen;
 969         }
 970
 971         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
 972         if(startSearchHere) { /* had a single @ */
 973             needLen--; /* already had the @ */
 974             /* startSearchHere points at the @ */
 975         } else {
 976             startSearchHere=buffer+bufLen;
 977         }
 978         if(needLen >= bufferCapacity) {
 979             *status = U_BUFFER_OVERFLOW_ERROR;
 980             return needLen; /* no change */
 981         }
 982         *startSearchHere = '@';
 983         startSearchHere++;
 984         uprv_strcpy(startSearchHere, keywordNameBuffer);
 985         startSearchHere += keywordNameLen;
 986         *startSearchHere = '=';
 987         startSearchHere++;
 988         uprv_strcpy(startSearchHere, keywordValue);
 989         startSearchHere+=keywordValueLen;
 990         return needLen;
 991     } /* end shortcut - no @ */
 992
 993     keywordStart = startSearchHere;
 994     /* search for keyword */
 995     while(keywordStart) {
 996         keywordStart++;
 997         /* skip leading spaces (allowed?) */
 998         while(*keywordStart == ' ') {
 999             keywordStart++;
1000         }
1001         nextEqualsign = uprv_strchr(keywordStart, '=');
1002         /* need to normalize both keyword and keyword name */
1003         if(!nextEqualsign) {
1004             break;
1005         }
1006         if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1007             /* keyword name too long for internal buffer */
1008             *status = U_INTERNAL_PROGRAM_ERROR;
1009             return 0;
1010         }
1011         for(i = 0; i < nextEqualsign - keywordStart; i++) {
1012             localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1013         }
1014         /* trim trailing spaces */
1015         while(keywordStart[i-1] == ' ') {
1016             i--;
1017         }
1018         localeKeywordNameBuffer[i] = 0;
1019
1020         nextSeparator = uprv_strchr(nextEqualsign, ';');
1021         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1022         if(rc == 0) {
1023             nextEqualsign++;
1024             while(*nextEqualsign == ' ') {
1025                 nextEqualsign++;
1026             }
1027             /* we actually found the keyword. Change the value */
1028             if (nextSeparator) {
1029                 keywordAtEnd = 0;
1030                 foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1031             } else {
1032                 keywordAtEnd = 1;
1033                 foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1034             }
1035             if(keywordValue) { /* adding a value - not removing */
1036               if(foundValueLen == keywordValueLen) {
1037                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1038                 return bufLen; /* no change in size */
1039               } else if(foundValueLen > keywordValueLen) {
1040                 int32_t delta = foundValueLen - keywordValueLen;
1041                 if(nextSeparator) { /* RH side */
1042                   uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1043                 }
1044                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1045                 bufLen -= delta;
1046                 buffer[bufLen]=0;
1047                 return bufLen;
1048               } else { /* FVL < KVL */
1049                 int32_t delta = keywordValueLen - foundValueLen;
1050                 if((bufLen+delta) >= bufferCapacity) {
1051                   *status = U_BUFFER_OVERFLOW_ERROR;
1052                   return bufLen+delta;
1053                 }
1054                 if(nextSeparator) { /* RH side */
1055                   uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1056                 }
1057                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1058                 bufLen += delta;
1059                 buffer[bufLen]=0;
1060                 return bufLen;
1061               }
1062             } else { /* removing a keyword */
1063               if(keywordAtEnd) {
1064                 /* zero out the ';' or '@' just before startSearchhere */
1065                 keywordStart[-1] = 0;
1066                 return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1067               } else {
1068                 uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1069                 keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1070                 return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1071               }
1072             }
1073         } else if(rc<0){ /* end match keyword */
1074           /* could insert at this location. */
1075           insertHere = keywordStart;
1076         }
1077         keywordStart = nextSeparator;
1078     } /* end loop searching */
1079
1080     if(!keywordValue) {
1081       return bufLen; /* removal of non-extant keyword - no change */
1082     }
1083
1084     /* we know there is at least one keyword. */
1085     needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1086     if(needLen >= bufferCapacity) {
1087         *status = U_BUFFER_OVERFLOW_ERROR;
1088         return needLen; /* no change */
1089     }
1090
1091     if(insertHere) {
1092       uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1093       keywordStart = insertHere;
1094     } else {
1095       keywordStart = buffer+bufLen;
1096       *keywordStart = ';';
1097       keywordStart++;
1098     }
1099     uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1100     keywordStart += keywordNameLen;
1101     *keywordStart = '=';
1102     keywordStart++;
1103     uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1104     keywordStart+=keywordValueLen;
1105     if(insertHere) {
1106       *keywordStart = ';';
1107       keywordStart++;
1108     }
1109     buffer[needLen]=0;
1110     return needLen;
1111 }
1112
1113 /* ### ID parsing implementation **************************************************/
1114
/* TRUE if the character is one of the special prefix letters 'x' or 'i'
   (either case). The argument is evaluated more than once. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1125
/**
 * Bounded strchr: look for c in the first len characters of str, giving up
 * early at a zero byte. A match is tested before the zero-byte check, so
 * searching for 0 finds the terminator if it lies within len.
 *
 * @return pointer to the first match, or NULL if there is none
 */
static char* _strnchr(const char* str, int32_t len, char c) {
    const char *cursor;
    U_ASSERT(str != 0 && len >= 0);
    for (cursor = str; len != 0; --len, ++cursor) {
        if (*cursor == c) {
            return (char*) cursor;
        }
        if (*cursor == 0) {
            return NULL;
        }
    }
    return NULL;
}
1139
/**
 * Lookup 'key' in the array 'list'.  The array 'list' should contain
 * a NULL entry, followed by more entries, and a second NULL entry.
 *
 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
 * COUNTRIES_3.
 *
 * @return index of the matching entry counted from the start of 'list'
 *         (the first NULL occupies a slot), or -1 if 'key' is not found
 */
static int16_t _findIndex(const char* const* list, const char* key)
{
    const char* const* entry = list;
    int32_t section;

    /* Walk the two consecutive NULL-terminated sections at 'list' */
    for (section = 0; section < 2; ++section) {
        for (; *entry != NULL; ++entry) {
            if (uprv_strcmp(key, *entry) == 0) {
                return (int16_t)(entry - list);
            }
        }
        ++entry;    /* step over the NULL that ends this section */
    }
    return -1;
}
1164
1165 /* count the length of src while copying it to dest; return strlen(src) */
1166 static U_INLINE int32_t
1167 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1168     const char *anchor;
1169     char c;
1170
1171     anchor=src;
1172     for(;;) {
1173         if((c=*src)==0) {
1174             return (int32_t)(src-anchor);
1175         }
1176         if(destCapacity<=0) {
1177             return (int32_t)((src-anchor)+uprv_strlen(src));
1178         }
1179         ++src;
1180         *dest++=c;
1181         --destCapacity;
1182     }
1183 }
1184
1185 U_CFUNC const char*
1186 uloc_getCurrentCountryID(const char* oldID){
1187     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1188     if (offset >= 0) {
1189         return REPLACEMENT_COUNTRIES[offset];
1190     }
1191     return oldID;
1192 }
1193 U_CFUNC const char*
1194 uloc_getCurrentLanguageID(const char* oldID){
1195     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1196     if (offset >= 0) {
1197         return REPLACEMENT_LANGUAGES[offset];
1198     }
1199     return oldID;
1200 }
1201 /*
1202  * the internal functions _getLanguage(), _getCountry(), _getVariant()
1203  * avoid duplicating code to handle the earlier locale ID pieces
1204  * in the functions for the later ones by
1205  * setting the *pEnd pointer to where they stopped parsing
1206  *
1207  * TODO try to use this in Locale
1208  */
1209 U_CFUNC int32_t
1210 ulocimp_getLanguage(const char *localeID,
1211                     char *language, int32_t languageCapacity,
1212                     const char **pEnd) {
1213     int32_t i=0;
1214     int32_t offset;
1215     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1216
1217     /* if it starts with i- or x- then copy that prefix */
1218     if(_isIDPrefix(localeID)) {
1219         if(i<languageCapacity) {
1220             language[i]=(char)uprv_tolower(*localeID);
1221         }
1222         if(i<languageCapacity) {
1223             language[i+1]='-';
1224         }
1225         i+=2;
1226         localeID+=2;
1227     }
1228
1229     /* copy the language as far as possible and count its length */
1230     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1231         if(i<languageCapacity) {
1232             language[i]=(char)uprv_tolower(*localeID);
1233         }
1234         if(i<3) {
1235             lang[i]=(char)uprv_tolower(*localeID);
1236         }
1237         i++;
1238         localeID++;
1239     }
1240
1241     if(i==3) {
1242         /* convert 3 character code to 2 character code if possible *CWB*/
1243         offset=_findIndex(LANGUAGES_3, lang);
1244         if(offset>=0) {
1245             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1246         }
1247     }
1248
1249     if(pEnd!=NULL) {
1250         *pEnd=localeID;
1251     }
1252     return i;
1253 }
1254
1255 U_CFUNC int32_t
1256 ulocimp_getScript(const char *localeID,
1257                   char *script, int32_t scriptCapacity,
1258                   const char **pEnd)
1259 {
1260     int32_t idLen = 0;
1261
1262     if (pEnd != NULL) {
1263         *pEnd = localeID;
1264     }
1265
1266     /* copy the second item as far as possible and count its length */
1267     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1268         idLen++;
1269     }
1270
1271     /* If it's exactly 4 characters long, then it's a script and not a country. */
1272     if (idLen == 4) {
1273         int32_t i;
1274         if (pEnd != NULL) {
1275             *pEnd = localeID+idLen;
1276         }
1277         if(idLen > scriptCapacity) {
1278             idLen = scriptCapacity;
1279         }
1280         if (idLen >= 1) {
1281             script[0]=(char)uprv_toupper(*(localeID++));
1282         }
1283         for (i = 1; i < idLen; i++) {
1284             script[i]=(char)uprv_tolower(*(localeID++));
1285         }
1286     }
1287     else {
1288         idLen = 0;
1289     }
1290     return idLen;
1291 }
1292
1293 U_CFUNC int32_t
1294 ulocimp_getCountry(const char *localeID,
1295                    char *country, int32_t countryCapacity,
1296                    const char **pEnd)
1297 {
1298     int32_t idLen=0;
1299     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1300     int32_t offset;
1301
1302     /* copy the country as far as possible and count its length */
1303     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1304         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1305             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1306         }
1307         idLen++;
1308     }
1309
1310     /* the country should be either length 2 or 3 */
1311     if (idLen == 2 || idLen == 3) {
1312         UBool gotCountry = FALSE;
1313         /* convert 3 character code to 2 character code if possible *CWB*/
1314         if(idLen==3) {
1315             offset=_findIndex(COUNTRIES_3, cnty);
1316             if(offset>=0) {
1317                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1318                 gotCountry = TRUE;
1319             }
1320         }
1321         if (!gotCountry) {
1322             int32_t i = 0;
1323             for (i = 0; i < idLen; i++) {
1324                 if (i < countryCapacity) {
1325                     country[i]=(char)uprv_toupper(localeID[i]);
1326                 }
1327             }
1328         }
1329         localeID+=idLen;
1330     } else {
1331         idLen = 0;
1332     }
1333
1334     if(pEnd!=NULL) {
1335         *pEnd=localeID;
1336     }
1337
1338     return idLen;
1339 }
1340
1341 /**
1342  * @param needSeparator if true, then add leading '_' if any variants
1343  * are added to 'variant'
1344  */
1345 static int32_t
1346 _getVariantEx(const char *localeID,
1347               char prev,
1348               char *variant, int32_t variantCapacity,
1349               UBool needSeparator) {
1350     int32_t i=0;
1351
1352     /* get one or more variant tags and separate them with '_' */
1353     if(_isIDSeparator(prev)) {
1354         /* get a variant string after a '-' or '_' */
1355         while(!_isTerminator(*localeID)) {
1356             if (needSeparator) {
1357                 if (i<variantCapacity) {
1358                     variant[i] = '_';
1359                 }
1360                 ++i;
1361                 needSeparator = FALSE;
1362             }
1363             if(i<variantCapacity) {
1364                 variant[i]=(char)uprv_toupper(*localeID);
1365                 if(variant[i]=='-') {
1366                     variant[i]='_';
1367                 }
1368             }
1369             i++;
1370             localeID++;
1371         }
1372     }
1373
1374     /* if there is no variant tag after a '-' or '_' then look for '@' */
1375     if(i==0) {
1376         if(prev=='@') {
1377             /* keep localeID */
1378         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1379             ++localeID; /* point after the '@' */
1380         } else {
1381             return 0;
1382         }
1383         while(!_isTerminator(*localeID)) {
1384             if (needSeparator) {
1385                 if (i<variantCapacity) {
1386                     variant[i] = '_';
1387                 }
1388                 ++i;
1389                 needSeparator = FALSE;
1390             }
1391             if(i<variantCapacity) {
1392                 variant[i]=(char)uprv_toupper(*localeID);
1393                 if(variant[i]=='-' || variant[i]==',') {
1394                     variant[i]='_';
1395                 }
1396             }
1397             i++;
1398             localeID++;
1399         }
1400     }
1401
1402     return i;
1403 }
1404
1405 static int32_t
1406 _getVariant(const char *localeID,
1407             char prev,
1408             char *variant, int32_t variantCapacity) {
1409     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1410 }
1411
1412 /**
1413  * Delete ALL instances of a variant from the given list of one or
1414  * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1415  * @param variants the source string of one or more variants,
1416  * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1417  * terminated; if it is, trailing zero will NOT be maintained.
1418  * @param variantsLen length of variants
1419  * @param toDelete variant to delete, without separators, e.g.  "EURO"
1420  * or "PREEURO"; not zero terminated
1421  * @param toDeleteLen length of toDelete
1422  * @return number of characters deleted from variants
1423  */
1424 static int32_t
1425 _deleteVariant(char* variants, int32_t variantsLen,
1426                const char* toDelete, int32_t toDeleteLen)
1427 {
1428     int32_t delta = 0; /* number of chars deleted */
1429     for (;;) {
1430         UBool flag = FALSE;
1431         if (variantsLen < toDeleteLen) {
1432             return delta;
1433         }
1434         if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1435             (variantsLen == toDeleteLen ||
1436              (flag=(variants[toDeleteLen] == '_'))))
1437         {
1438             int32_t d = toDeleteLen + (flag?1:0);
1439             variantsLen -= d;
1440             delta += d;
1441             if (variantsLen > 0) {
1442                 uprv_memmove(variants, variants+d, variantsLen);
1443             }
1444         } else {
1445             char* p = _strnchr(variants, variantsLen, '_');
1446             if (p == NULL) {
1447                 return delta;
1448             }
1449             ++p;
1450             variantsLen -= (int32_t)(p - variants);
1451             variants = p;
1452         }
1453     }
1454 }
1455
1456 /* Keyword enumeration */
1457
/**
 * Per-enumeration state for the keyword enumerators below.
 */
typedef struct UKeywordsContext {
    char *keywords; /* start of the list of zero-terminated keywords; the list ends at an empty string */
    char *current;  /* keyword that the next call to the 'next' callback returns */
} UKeywordsContext;
1462
1463 static void U_CALLCONV
1464 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1465     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1466     uprv_free(enumerator->context);
1467     uprv_free(enumerator);
1468 }
1469
1470 static int32_t U_CALLCONV
1471 uloc_kw_countKeywords(UEnumeration *en, UErrorCode *status) {
1472     char *kw = ((UKeywordsContext *)en->context)->keywords;
1473     int32_t result = 0;
1474     while(*kw) {
1475         result++;
1476         kw += uprv_strlen(kw)+1;
1477     }
1478     return result;
1479 }
1480
1481 static const char* U_CALLCONV
1482 uloc_kw_nextKeyword(UEnumeration* en,
1483                     int32_t* resultLength,
1484                     UErrorCode* status) {
1485     const char* result = ((UKeywordsContext *)en->context)->current;
1486     int32_t len = 0;
1487     if(*result) {
1488         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1489         ((UKeywordsContext *)en->context)->current += len+1;
1490     } else {
1491         result = NULL;
1492     }
1493     if (resultLength) {
1494         *resultLength = len;
1495     }
1496     return result;
1497 }
1498
1499 static void U_CALLCONV
1500 uloc_kw_resetKeywords(UEnumeration* en,
1501                       UErrorCode* status) {
1502     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1503 }
1504
1505 static const UEnumeration gKeywordsEnum = {
1506     NULL,
1507     NULL,
1508     uloc_kw_closeKeywords,
1509     uloc_kw_countKeywords,
1510     uenum_unextDefault,
1511     uloc_kw_nextKeyword,
1512     uloc_kw_resetKeywords
1513 };
1514
1515 U_CAPI UEnumeration* U_EXPORT2
1516 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1517 {
1518     UKeywordsContext *myContext = NULL;
1519     UEnumeration *result = NULL;
1520
1521     if(U_FAILURE(*status)) {
1522         return NULL;
1523     }
1524     result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1525     /* Null pointer test */
1526     if (result == NULL) {
1527         *status = U_MEMORY_ALLOCATION_ERROR;
1528         return NULL;
1529     }
1530     uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1531     myContext = uprv_malloc(sizeof(UKeywordsContext));
1532     if (myContext == NULL) {
1533         *status = U_MEMORY_ALLOCATION_ERROR;
1534         uprv_free(result);
1535         return NULL;
1536     }
1537     myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1538     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1539     myContext->keywords[keywordListSize] = 0;
1540     myContext->current = myContext->keywords;
1541     result->context = myContext;
1542     return result;
1543 }
1544
1545 U_CAPI UEnumeration* U_EXPORT2
1546 uloc_openKeywords(const char* localeID,
1547                         UErrorCode* status)
1548 {
1549     int32_t i=0;
1550     char keywords[256];
1551     int32_t keywordsCapacity = 256;
1552     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1553     const char* tmpLocaleID;
1554
1555     if(status==NULL || U_FAILURE(*status)) {
1556         return 0;
1557     }
1558
1559     if (_hasBCP47Extension(localeID)) {
1560         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1561     } else {
1562         if (localeID==NULL) {
1563            localeID=uloc_getDefault();
1564         }
1565         tmpLocaleID=localeID;
1566     }
1567
1568     /* Skip the language */
1569     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1570     if(_isIDSeparator(*tmpLocaleID)) {
1571         const char *scriptID;
1572         /* Skip the script if available */
1573         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1574         if(scriptID != tmpLocaleID+1) {
1575             /* Found optional script */
1576             tmpLocaleID = scriptID;
1577         }
1578         /* Skip the Country */
1579         if (_isIDSeparator(*tmpLocaleID)) {
1580             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1581             if(_isIDSeparator(*tmpLocaleID)) {
1582                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1583             }
1584         }
1585     }
1586
1587     /* keywords are located after '@' */
1588     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1589         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1590     }
1591
1592     if(i) {
1593         return uloc_openKeywordList(keywords, i, status);
1594     } else {
1595         return NULL;
1596     }
1597 }
1598
1599
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* Nonzero if any bit of 'mask' is set in 'options'.  Both arguments are
   parenthesized so that compound expressions such as (A | B) expand
   correctly. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The language tag "i-default"; deliberately NOT zero terminated. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
/* Number of chars in i_default, as int32_t so that comparisons with
   int32_t lengths are not silently converted to unsigned. */
#define I_DEFAULT_LENGTH ((int32_t)(sizeof i_default / sizeof i_default[0]))
1608
1609 /**
1610  * Canonicalize the given localeID, to level 1 or to level 2,
1611  * depending on the options.  To specify level 1, pass in options=0.
1612  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1613  *
1614  * This is the code underlying uloc_getName and uloc_canonicalize.
1615  */
1616 static int32_t
1617 _canonicalize(const char* localeID,
1618               char* result,
1619               int32_t resultCapacity,
1620               uint32_t options,
1621               UErrorCode* err) {
1622     int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1623     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1624     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1625     const char* origLocaleID;
1626     const char* tmpLocaleID;
1627     const char* keywordAssign = NULL;
1628     const char* separatorIndicator = NULL;
1629     const char* addKeyword = NULL;
1630     const char* addValue = NULL;
1631     char* name;
1632     char* variant = NULL; /* pointer into name, or NULL */
1633
1634     if (U_FAILURE(*err)) {
1635         return 0;
1636     }
1637
1638     if (_hasBCP47Extension(localeID)) {
1639         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1640     } else {
1641         if (localeID==NULL) {
1642            localeID=uloc_getDefault();
1643         }
1644         tmpLocaleID=localeID;
1645     }
1646
1647     origLocaleID=tmpLocaleID;
1648
1649     /* if we are doing a full canonicalization, then put results in
1650        localeBuffer, if necessary; otherwise send them to result. */
1651     if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1652         (result == NULL || resultCapacity <  sizeof(localeBuffer))) {
1653         name = localeBuffer;
1654         nameCapacity = sizeof(localeBuffer);
1655     } else {
1656         name = result;
1657         nameCapacity = resultCapacity;
1658     }
1659
1660     /* get all pieces, one after another, and separate with '_' */
1661     len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1662
1663     if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1664         const char *d = uloc_getDefault();
1665
1666         len = (int32_t)uprv_strlen(d);
1667
1668         if (name != NULL) {
1669             uprv_strncpy(name, d, len);
1670         }
1671     } else if(_isIDSeparator(*tmpLocaleID)) {
1672         const char *scriptID;
1673
1674         ++fieldCount;
1675         if(len<nameCapacity) {
1676             name[len]='_';
1677         }
1678         ++len;
1679
1680         scriptSize=ulocimp_getScript(tmpLocaleID+1, name+len, nameCapacity-len, &scriptID);
1681         if(scriptSize > 0) {
1682             /* Found optional script */
1683             tmpLocaleID = scriptID;
1684             ++fieldCount;
1685             len+=scriptSize;
1686             if (_isIDSeparator(*tmpLocaleID)) {
1687                 /* If there is something else, then we add the _ */
1688                 if(len<nameCapacity) {
1689                     name[len]='_';
1690                 }
1691                 ++len;
1692             }
1693         }
1694
1695         if (_isIDSeparator(*tmpLocaleID)) {
1696             const char *cntryID;
1697             int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1, name+len, nameCapacity-len, &cntryID);
1698             if (cntrySize > 0) {
1699                 /* Found optional country */
1700                 tmpLocaleID = cntryID;
1701                 len+=cntrySize;
1702             }
1703             if(_isIDSeparator(*tmpLocaleID)) {
1704                 /* If there is something else, then we add the _  if we found country before.*/
1705                 if (cntrySize > 0) {
1706                     ++fieldCount;
1707                     if(len<nameCapacity) {
1708                         name[len]='_';
1709                     }
1710                     ++len;
1711                 }
1712
1713                 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID, name+len, nameCapacity-len);
1714                 if (variantSize > 0) {
1715                     variant = name+len;
1716                     len += variantSize;
1717                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1718                 }
1719             }
1720         }
1721     }
1722
1723     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1724     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1725         UBool done = FALSE;
1726         do {
1727             char c = *tmpLocaleID;
1728             switch (c) {
1729             case 0:
1730             case '@':
1731                 done = TRUE;
1732                 break;
1733             default:
1734                 if (len<nameCapacity) {
1735                     name[len] = c;
1736                 }
1737                 ++len;
1738                 ++tmpLocaleID;
1739                 break;
1740             }
1741         } while (!done);
1742     }
1743
1744     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1745        After this, tmpLocaleID either points to '@' or is NULL */
1746     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1747         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1748         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1749     }
1750
1751     /* Copy POSIX-style variant, if any [mr@FOO] */
1752     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1753         tmpLocaleID != NULL && keywordAssign == NULL) {
1754         for (;;) {
1755             char c = *tmpLocaleID;
1756             if (c == 0) {
1757                 break;
1758             }
1759             if (len<nameCapacity) {
1760                 name[len] = c;
1761             }
1762             ++len;
1763             ++tmpLocaleID;
1764         }
1765     }
1766
1767     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1768         /* Handle @FOO variant if @ is present and not followed by = */
1769         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1770             int32_t posixVariantSize;
1771             /* Add missing '_' if needed */
1772             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1773                 do {
1774                     if(len<nameCapacity) {
1775                         name[len]='_';
1776                     }
1777                     ++len;
1778                     ++fieldCount;
1779                 } while(fieldCount<2);
1780             }
1781             posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1782                                              (UBool)(variantSize > 0));
1783             if (posixVariantSize > 0) {
1784                 if (variant == NULL) {
1785                     variant = name+len;
1786                 }
1787                 len += posixVariantSize;
1788                 variantSize += posixVariantSize;
1789             }
1790         }
1791
1792         /* Handle generic variants first */
1793         if (variant) {
1794             for (j=0; j<(int32_t)(sizeof(VARIANT_MAP)/sizeof(VARIANT_MAP[0])); j++) {
1795                 const char* variantToCompare = VARIANT_MAP[j].variant;
1796                 int32_t n = (int32_t)uprv_strlen(variantToCompare);
1797                 int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1798                 len -= variantLen;
1799                 if (variantLen > 0) {
1800                     if (name[len-1] == '_') { /* delete trailing '_' */
1801                         --len;
1802                     }
1803                     addKeyword = VARIANT_MAP[j].keyword;
1804                     addValue = VARIANT_MAP[j].value;
1805                     break;
1806                 }
1807             }
1808             if (name[len-1] == '_') { /* delete trailing '_' */
1809                 --len;
1810             }
1811         }
1812
1813         /* Look up the ID in the canonicalization map */
1814         for (j=0; j<(int32_t)(sizeof(CANONICALIZE_MAP)/sizeof(CANONICALIZE_MAP[0])); j++) {
1815             const char* id = CANONICALIZE_MAP[j].id;
1816             int32_t n = (int32_t)uprv_strlen(id);
1817             if (len == n && uprv_strncmp(name, id, n) == 0) {
1818                 if (n == 0 && tmpLocaleID != NULL) {
1819                     break; /* Don't remap "" if keywords present */
1820                 }
1821                 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1822                 if (CANONICALIZE_MAP[j].keyword) {
1823                     addKeyword = CANONICALIZE_MAP[j].keyword;
1824                     addValue = CANONICALIZE_MAP[j].value;
1825                 }
1826                 break;
1827             }
1828         }
1829     }
1830
1831     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1832         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1833             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1834             if(len<nameCapacity) {
1835                 name[len]='@';
1836             }
1837             ++len;
1838             ++fieldCount;
1839             len += _getKeywords(tmpLocaleID+1, '@', name+len, nameCapacity-len, NULL, 0, NULL, TRUE,
1840                                 addKeyword, addValue, err);
1841         } else if (addKeyword != NULL) {
1842             U_ASSERT(addValue != NULL);
1843             /* inelegant but works -- later make _getKeywords do this? */
1844             len += _copyCount(name+len, nameCapacity-len, "@");
1845             len += _copyCount(name+len, nameCapacity-len, addKeyword);
1846             len += _copyCount(name+len, nameCapacity-len, "=");
1847             len += _copyCount(name+len, nameCapacity-len, addValue);
1848         }
1849     }
1850
1851     if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1852         uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1853     }
1854
1855     return u_terminateChars(result, resultCapacity, len, err);
1856 }
1857
1858 /* ### ID parsing API **************************************************/
1859
1860 U_CAPI int32_t  U_EXPORT2
1861 uloc_getParent(const char*    localeID,
1862                char* parent,
1863                int32_t parentCapacity,
1864                UErrorCode* err)
1865 {
1866     const char *lastUnderscore;
1867     int32_t i;
1868
1869     if (U_FAILURE(*err))
1870         return 0;
1871
1872     if (localeID == NULL)
1873         localeID = uloc_getDefault();
1874
1875     lastUnderscore=uprv_strrchr(localeID, '_');
1876     if(lastUnderscore!=NULL) {
1877         i=(int32_t)(lastUnderscore-localeID);
1878     } else {
1879         i=0;
1880     }
1881
1882     if(i>0 && parent != localeID) {
1883         uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1884     }
1885     return u_terminateChars(parent, parentCapacity, i, err);
1886 }
1887
1888 U_CAPI int32_t U_EXPORT2
1889 uloc_getLanguage(const char*    localeID,
1890          char* language,
1891          int32_t languageCapacity,
1892          UErrorCode* err)
1893 {
1894     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1895     int32_t i=0;
1896
1897     if (err==NULL || U_FAILURE(*err)) {
1898         return 0;
1899     }
1900
1901     if(localeID==NULL) {
1902         localeID=uloc_getDefault();
1903     }
1904
1905     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1906     return u_terminateChars(language, languageCapacity, i, err);
1907 }
1908
1909 U_CAPI int32_t U_EXPORT2
1910 uloc_getScript(const char*    localeID,
1911          char* script,
1912          int32_t scriptCapacity,
1913          UErrorCode* err)
1914 {
1915     int32_t i=0;
1916
1917     if(err==NULL || U_FAILURE(*err)) {
1918         return 0;
1919     }
1920
1921     if(localeID==NULL) {
1922         localeID=uloc_getDefault();
1923     }
1924
1925     /* skip the language */
1926     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1927     if(_isIDSeparator(*localeID)) {
1928         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1929     }
1930     return u_terminateChars(script, scriptCapacity, i, err);
1931 }
1932
1933 U_CAPI int32_t  U_EXPORT2
1934 uloc_getCountry(const char* localeID,
1935             char* country,
1936             int32_t countryCapacity,
1937             UErrorCode* err)
1938 {
1939     int32_t i=0;
1940
1941     if(err==NULL || U_FAILURE(*err)) {
1942         return 0;
1943     }
1944
1945     if(localeID==NULL) {
1946         localeID=uloc_getDefault();
1947     }
1948
1949     /* Skip the language */
1950     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1951     if(_isIDSeparator(*localeID)) {
1952         const char *scriptID;
1953         /* Skip the script if available */
1954         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1955         if(scriptID != localeID+1) {
1956             /* Found optional script */
1957             localeID = scriptID;
1958         }
1959         if(_isIDSeparator(*localeID)) {
1960             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1961         }
1962     }
1963     return u_terminateChars(country, countryCapacity, i, err);
1964 }
1965
1966 U_CAPI int32_t  U_EXPORT2
1967 uloc_getVariant(const char* localeID,
1968                 char* variant,
1969                 int32_t variantCapacity,
1970                 UErrorCode* err)
1971 {
1972     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1973     const char* tmpLocaleID;
1974     int32_t i=0;
1975
1976     if(err==NULL || U_FAILURE(*err)) {
1977         return 0;
1978     }
1979
1980     if (_hasBCP47Extension(localeID)) {
1981         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1982     } else {
1983         if (localeID==NULL) {
1984            localeID=uloc_getDefault();
1985         }
1986         tmpLocaleID=localeID;
1987     }
1988
1989     /* Skip the language */
1990     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1991     if(_isIDSeparator(*tmpLocaleID)) {
1992         const char *scriptID;
1993         /* Skip the script if available */
1994         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1995         if(scriptID != tmpLocaleID+1) {
1996             /* Found optional script */
1997             tmpLocaleID = scriptID;
1998         }
1999         /* Skip the Country */
2000         if (_isIDSeparator(*tmpLocaleID)) {
2001             const char *cntryID;
2002             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2003             if (cntryID != tmpLocaleID+1) {
2004                 /* Found optional country */
2005                 tmpLocaleID = cntryID;
2006             }
2007             if(_isIDSeparator(*tmpLocaleID)) {
2008                 /* If there was no country ID, skip a possible extra IDSeparator */
2009                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2010                     tmpLocaleID++;
2011                 }
2012                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2013             }
2014         }
2015     }
2016
2017     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2018     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2019 /*
2020     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2021         i=_getVariant(localeID+1, '@', variant, variantCapacity);
2022     }
2023 */
2024     return u_terminateChars(variant, variantCapacity, i, err);
2025 }
2026
2027 U_CAPI int32_t  U_EXPORT2
2028 uloc_getName(const char* localeID,
2029              char* name,
2030              int32_t nameCapacity,
2031              UErrorCode* err)
2032 {
2033     return _canonicalize(localeID, name, nameCapacity, 0, err);
2034 }
2035
2036 U_CAPI int32_t  U_EXPORT2
2037 uloc_getBaseName(const char* localeID,
2038                  char* name,
2039                  int32_t nameCapacity,
2040                  UErrorCode* err)
2041 {
2042     return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2043 }
2044
2045 U_CAPI int32_t  U_EXPORT2
2046 uloc_canonicalize(const char* localeID,
2047                   char* name,
2048                   int32_t nameCapacity,
2049                   UErrorCode* err)
2050 {
2051     return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2052 }
2053
2054 U_CAPI const char*  U_EXPORT2
2055 uloc_getISO3Language(const char* localeID)
2056 {
2057     int16_t offset;
2058     char lang[ULOC_LANG_CAPACITY];
2059     UErrorCode err = U_ZERO_ERROR;
2060
2061     if (localeID == NULL)
2062     {
2063         localeID = uloc_getDefault();
2064     }
2065     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2066     if (U_FAILURE(err))
2067         return "";
2068     offset = _findIndex(LANGUAGES, lang);
2069     if (offset < 0)
2070         return "";
2071     return LANGUAGES_3[offset];
2072 }
2073
2074 U_CAPI const char*  U_EXPORT2
2075 uloc_getISO3Country(const char* localeID)
2076 {
2077     int16_t offset;
2078     char cntry[ULOC_LANG_CAPACITY];
2079     UErrorCode err = U_ZERO_ERROR;
2080
2081     if (localeID == NULL)
2082     {
2083         localeID = uloc_getDefault();
2084     }
2085     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2086     if (U_FAILURE(err))
2087         return "";
2088     offset = _findIndex(COUNTRIES, cntry);
2089     if (offset < 0)
2090         return "";
2091
2092     return COUNTRIES_3[offset];
2093 }
2094
2095 U_CAPI uint32_t  U_EXPORT2
2096 uloc_getLCID(const char* localeID)
2097 {
2098     UErrorCode status = U_ZERO_ERROR;
2099     char       langID[ULOC_FULLNAME_CAPACITY];
2100
2101     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2102     if (U_FAILURE(status)) {
2103         return 0;
2104     }
2105
2106     return uprv_convertToLCID(langID, localeID, &status);
2107 }
2108
2109 U_CAPI int32_t U_EXPORT2
2110 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2111                 UErrorCode *status)
2112 {
2113     int32_t length;
2114     const char *posix = uprv_convertToPosix(hostid, status);
2115     if (U_FAILURE(*status) || posix == NULL) {
2116         return 0;
2117     }
2118     length = (int32_t)uprv_strlen(posix);
2119     if (length+1 > localeCapacity) {
2120         *status = U_BUFFER_OVERFLOW_ERROR;
2121     }
2122     else {
2123         uprv_strcpy(locale, posix);
2124     }
2125     return length;
2126 }
2127
2128 /* ### Default locale **************************************************/
2129
2130 U_CAPI const char*  U_EXPORT2
2131 uloc_getDefault()
2132 {
2133     return locale_get_default();
2134 }
2135
2136 U_CAPI void  U_EXPORT2
2137 uloc_setDefault(const char*   newDefaultLocale,
2138              UErrorCode* err)
2139 {
2140     if (U_FAILURE(*err))
2141         return;
2142     /* the error code isn't currently used for anything by this function*/
2143
2144     /* propagate change to C++ */
2145     locale_set_default(newDefaultLocale);
2146 }
2147
2148 /**
2149  * Returns a list of all language codes defined in ISO 639.  This is a pointer
2150  * to an array of pointers to arrays of char.  All of these pointers are owned
2151  * by ICU-- do not delete them, and do not write through them.  The array is
2152  * terminated with a null pointer.
2153  */
2154 U_CAPI const char* const*  U_EXPORT2
2155 uloc_getISOLanguages()
2156 {
2157     return LANGUAGES;
2158 }
2159
2160 /**
2161  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2162  * pointer to an array of pointers to arrays of char.  All of these pointers are
2163  * owned by ICU-- do not delete them, and do not write through them.  The array is
2164  * terminated with a null pointer.
2165  */
2166 U_CAPI const char* const*  U_EXPORT2
2167 uloc_getISOCountries()
2168 {
2169     return COUNTRIES;
2170 }
2171
2172
/* this function to be moved into cstring.c later */
/* Decimal separator that the C library currently formats with; 0 until
   first use. */
static char gDecimal = 0;

/**
 * Parse a number that is written with '.' as its decimal separator,
 * even when the C library expects a different separator.
 *
 * @param start text to parse
 * @param end if not NULL, receives a pointer to the first unparsed
 * character in 'start'
 * @return the parsed value
 */
static /* U_CAPI */
double
/* U_EXPORT2 */
_uloc_strtod(const char *start, char **end) {
    char *decimal;
    char *myEnd;
    char buf[30];
    double rv;
    if (!gDecimal) {
        /* "+1.0" plus its terminating zero needs exactly 5 bytes when the
           separator is one byte; leave room for a longer separator so
           that sprintf cannot write past the buffer. */
        char rep[16];
        /* For machines that decide to change the decimal on you,
        and try to be too smart with localization.
        This normally should be just a '.'. */
        sprintf(rep, "%+1.1f", 1.0);
        gDecimal = rep[2];
    }

    if(gDecimal == '.') {
        return uprv_strtod(start, end); /* fall through to OS */
    } else {
        /* work on a copy (at most 29 chars) in which the first '.' is
           replaced by the separator the C library expects */
        uprv_strncpy(buf, start, 29);
        buf[29]=0;
        decimal = uprv_strchr(buf, '.');
        if(decimal) {
            *decimal = gDecimal;
        } else {
            return uprv_strtod(start, end); /* no decimal point */
        }
        rv = uprv_strtod(buf, &myEnd);
        if(end) {
            /* map the end position in the copy back onto 'start' */
            *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
        }
        return rv;
    }
}
2211
/**
 * One entry of a parsed accept-language list.
 */
typedef struct {
    float q;        /* quality value of the entry; entries with higher q sort first */
    int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
    char *locale;   /* locale ID of the entry */
} _acceptLangItem;
2217
2218 static int32_t U_CALLCONV
2219 uloc_acceptLanguageCompare(const void *context, const void *a, const void *b)
2220 {
2221     const _acceptLangItem *aa = (const _acceptLangItem*)a;
2222     const _acceptLangItem *bb = (const _acceptLangItem*)b;
2223
2224     int32_t rc = 0;
2225     if(bb->q < aa->q) {
2226         rc = -1;  /* A > B */
2227     } else if(bb->q > aa->q) {
2228         rc = 1;   /* A < B */
2229     } else {
2230         rc = 0;   /* A = B */
2231     }
2232
2233     if(rc==0) {
2234         rc = uprv_stricmp(aa->locale, bb->locale);
2235     }
2236
2237 #if defined(ULOC_DEBUG)
2238     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2239     aa->locale, aa->q,
2240     bb->locale, bb->q,
2241     rc);*/
2242 #endif
2243
2244     return rc;
2245 }
2246
2247 /*
2248 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2249 */
2250
2251 U_CAPI int32_t U_EXPORT2
2252 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2253                             const char *httpAcceptLanguage,
2254                             UEnumeration* availableLocales,
2255                             UErrorCode *status)
2256 {
2257     _acceptLangItem *j;
2258     _acceptLangItem smallBuffer[30];
2259     char **strs;
2260     char tmp[ULOC_FULLNAME_CAPACITY +1];
2261     int32_t n = 0;
2262     const char *itemEnd;
2263     const char *paramEnd;
2264     const char *s;
2265     const char *t;
2266     int32_t res;
2267     int32_t i;
2268     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2269     int32_t jSize;
2270     char *tempstr; /* Use for null pointer check */
2271
2272     j = smallBuffer;
2273     jSize = sizeof(smallBuffer)/sizeof(smallBuffer[0]);
2274     if(U_FAILURE(*status)) {
2275         return -1;
2276     }
2277
2278     for(s=httpAcceptLanguage;s&&*s;) {
2279         while(isspace(*s)) /* eat space at the beginning */
2280             s++;
2281         itemEnd=uprv_strchr(s,',');
2282         paramEnd=uprv_strchr(s,';');
2283         if(!itemEnd) {
2284             itemEnd = httpAcceptLanguage+l; /* end of string */
2285         }
2286         if(paramEnd && paramEnd<itemEnd) {
2287             /* semicolon (;) is closer than end (,) */
2288             t = paramEnd+1;
2289             if(*t=='q') {
2290                 t++;
2291             }
2292             while(isspace(*t)) {
2293                 t++;
2294             }
2295             if(*t=='=') {
2296                 t++;
2297             }
2298             while(isspace(*t)) {
2299                 t++;
2300             }
2301             j[n].q = (float)_uloc_strtod(t,NULL);
2302         } else {
2303             /* no semicolon - it's 1.0 */
2304             j[n].q = 1.0f;
2305             paramEnd = itemEnd;
2306         }
2307         j[n].dummy=0;
2308         /* eat spaces prior to semi */
2309         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2310             ;
2311         /* Check for null pointer from uprv_strndup */
2312         tempstr = uprv_strndup(s,(int32_t)((t+1)-s));
2313         if (tempstr == NULL) {
2314             *status = U_MEMORY_ALLOCATION_ERROR;
2315             return -1;
2316         }
2317         j[n].locale = tempstr;
2318         uloc_canonicalize(j[n].locale,tmp,sizeof(tmp)/sizeof(tmp[0]),status);
2319         if(strcmp(j[n].locale,tmp)) {
2320             uprv_free(j[n].locale);
2321             j[n].locale=uprv_strdup(tmp);
2322         }
2323 #if defined(ULOC_DEBUG)
2324         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2325 #endif
2326         n++;
2327         s = itemEnd;
2328         while(*s==',') { /* eat duplicate commas */
2329             s++;
2330         }
2331         if(n>=jSize) {
2332             if(j==smallBuffer) {  /* overflowed the small buffer. */
2333                 j = uprv_malloc(sizeof(j[0])*(jSize*2));
2334                 if(j!=NULL) {
2335                     uprv_memcpy(j,smallBuffer,sizeof(j[0])*jSize);
2336                 }
2337 #if defined(ULOC_DEBUG)
2338                 fprintf(stderr,"malloced at size %d\n", jSize);
2339 #endif
2340             } else {
2341                 j = uprv_realloc(j, sizeof(j[0])*jSize*2);
2342 #if defined(ULOC_DEBUG)
2343                 fprintf(stderr,"re-alloced at size %d\n", jSize);
2344 #endif
2345             }
2346             jSize *= 2;
2347             if(j==NULL) {
2348                 *status = U_MEMORY_ALLOCATION_ERROR;
2349                 return -1;
2350             }
2351         }
2352     }
2353     uprv_sortArray(j, n, sizeof(j[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2354     if(U_FAILURE(*status)) {
2355         if(j != smallBuffer) {
2356 #if defined(ULOC_DEBUG)
2357             fprintf(stderr,"freeing j %p\n", j);
2358 #endif
2359             uprv_free(j);
2360         }
2361         return -1;
2362     }
2363     strs = uprv_malloc((size_t)(sizeof(strs[0])*n));
2364     /* Check for null pointer */
2365     if (strs == NULL) {
2366         uprv_free(j); /* Free to avoid memory leak */
2367         *status = U_MEMORY_ALLOCATION_ERROR;
2368         return -1;
2369     }
2370     for(i=0;i<n;i++) {
2371 #if defined(ULOC_DEBUG)
2372         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2373 #endif
2374         strs[i]=j[i].locale;
2375     }
2376     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2377         (const char**)strs, n, availableLocales, status);
2378     for(i=0;i<n;i++) {
2379         uprv_free(strs[i]);
2380     }
2381     uprv_free(strs);
2382     if(j != smallBuffer) {
2383 #if defined(ULOC_DEBUG)
2384         fprintf(stderr,"freeing j %p\n", j);
2385 #endif
2386         uprv_free(j);
2387     }
2388     return res;
2389 }
2390
2391
2392 U_CAPI int32_t U_EXPORT2
2393 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2394                     UAcceptResult *outResult, const char **acceptList,
2395                     int32_t acceptListCount,
2396                     UEnumeration* availableLocales,
2397                     UErrorCode *status)
2398 {
2399     int32_t i,j;
2400     int32_t len;
2401     int32_t maxLen=0;
2402     char tmp[ULOC_FULLNAME_CAPACITY+1];
2403     const char *l;
2404     char **fallbackList;
2405     if(U_FAILURE(*status)) {
2406         return -1;
2407     }
2408     fallbackList = uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount));
2409     if(fallbackList==NULL) {
2410         *status = U_MEMORY_ALLOCATION_ERROR;
2411         return -1;
2412     }
2413     for(i=0;i<acceptListCount;i++) {
2414 #if defined(ULOC_DEBUG)
2415         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2416 #endif
2417         while((l=uenum_next(availableLocales, NULL, status))) {
2418 #if defined(ULOC_DEBUG)
2419             fprintf(stderr,"  %s\n", l);
2420 #endif
2421             len = (int32_t)uprv_strlen(l);
2422             if(!uprv_strcmp(acceptList[i], l)) {
2423                 if(outResult) {
2424                     *outResult = ULOC_ACCEPT_VALID;
2425                 }
2426 #if defined(ULOC_DEBUG)
2427                 fprintf(stderr, "MATCH! %s\n", l);
2428 #endif
2429                 if(len>0) {
2430                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2431                 }
2432                 for(j=0;j<i;j++) {
2433                     uprv_free(fallbackList[j]);
2434                 }
2435                 uprv_free(fallbackList);
2436                 return u_terminateChars(result, resultAvailable, len, status);
2437             }
2438             if(len>maxLen) {
2439                 maxLen = len;
2440             }
2441         }
2442         uenum_reset(availableLocales, status);
2443         /* save off parent info */
2444         if(uloc_getParent(acceptList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2445             fallbackList[i] = uprv_strdup(tmp);
2446         } else {
2447             fallbackList[i]=0;
2448         }
2449     }
2450
2451     for(maxLen--;maxLen>0;maxLen--) {
2452         for(i=0;i<acceptListCount;i++) {
2453             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2454 #if defined(ULOC_DEBUG)
2455                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2456 #endif
2457                 while((l=uenum_next(availableLocales, NULL, status))) {
2458 #if defined(ULOC_DEBUG)
2459                     fprintf(stderr,"  %s\n", l);
2460 #endif
2461                     len = (int32_t)uprv_strlen(l);
2462                     if(!uprv_strcmp(fallbackList[i], l)) {
2463                         if(outResult) {
2464                             *outResult = ULOC_ACCEPT_FALLBACK;
2465                         }
2466 #if defined(ULOC_DEBUG)
2467                         fprintf(stderr, "fallback MATCH! %s\n", l);
2468 #endif
2469                         if(len>0) {
2470                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2471                         }
2472                         for(j=0;j<acceptListCount;j++) {
2473                             uprv_free(fallbackList[j]);
2474                         }
2475                         uprv_free(fallbackList);
2476                         return u_terminateChars(result, resultAvailable, len, status);
2477                     }
2478                 }
2479                 uenum_reset(availableLocales, status);
2480
2481                 if(uloc_getParent(fallbackList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2482                     uprv_free(fallbackList[i]);
2483                     fallbackList[i] = uprv_strdup(tmp);
2484                 } else {
2485                     uprv_free(fallbackList[i]);
2486                     fallbackList[i]=0;
2487                 }
2488             }
2489         }
2490         if(outResult) {
2491             *outResult = ULOC_ACCEPT_FAILED;
2492         }
2493     }
2494     for(i=0;i<acceptListCount;i++) {
2495         uprv_free(fallbackList[i]);
2496     }
2497     uprv_free(fallbackList);
2498     return -1;
2499 }
2500
2501 /*eof*/