icuSources/common/uloc.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 1997-2012, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *
   7 * File ULOC.CPP
   8 *
   9 * Modification History:
  10 *
  11 *   Date        Name        Description
  12 *   04/01/97    aliu        Creation.
  13 *   08/21/98    stephen     JDK 1.2 sync
  14 *   12/08/98    rtg         New Locale implementation and C API
  15 *   03/15/99    damiba      overhaul.
  16 *   04/06/99    stephen     changed setDefault() to realloc and copy
  17 *   06/14/99    stephen     Changed calls to ures_open for new params
  18 *   07/21/99    stephen     Modified setDefault() to propagate to C++
  19 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
  20 *                           brought canonicalization code into line with spec
  21 *****************************************************************************/
  22
  23 /*
  24    POSIX's locale format, from putil.c: [no spaces]
  25
  26      ll [ _CC ] [ . MM ] [ @ VV]
  27
  28      l = lang, C = ctry, M = charmap, V = variant
  29 */
  30
  31 #include "unicode/utypes.h"
  32 #include "unicode/ustring.h"
  33 #include "unicode/uloc.h"
  34
  35 #include "putilimp.h"
  36 #include "ustr_imp.h"
  37 #include "ulocimp.h"
  38 #include "umutex.h"
  39 #include "cstring.h"
  40 #include "cmemory.h"
  41 #include "ucln_cmn.h"
  42 #include "locmap.h"
  43 #include "uarrsort.h"
  44 #include "uenumimp.h"
  45 #include "uassert.h"
  46
  47 #include <stdio.h> /* for sprintf */
  48
  49 /* ### Declarations **************************************************/
  50
  51 /* Locale stuff from locid.cpp */
  52 U_CFUNC void locale_set_default(const char *id);
  53 U_CFUNC const char *locale_get_default(void);
  54 U_CFUNC int32_t
  55 locale_getKeywords(const char *localeID,
  56             char prev,
  57             char *keywords, int32_t keywordCapacity,
  58             char *values, int32_t valuesCapacity, int32_t *valLen,
  59             UBool valuesToo,
  60             UErrorCode *status);
  61
  62 /* ### Data tables **************************************************/
  63
  64 /**
  65  * Table of language codes, both 2- and 3-letter, with preference
  66  * given to 2-letter codes where possible.  Includes 3-letter codes
  67  * that lack a 2-letter equivalent.
  68  *
  69  * This list must be in sorted order.  This list is returned directly
  70  * to the user by some API.
  71  *
  72  * This list must be kept in sync with LANGUAGES_3, with corresponding
  73  * entries matched.
  74  *
  75  * This table should be terminated with a NULL entry, followed by a
  76  * second list, and another NULL entry.  The first list is visible to
  77  * user code when this array is returned by API.  The second list
  78  * contains codes we support, but do not expose through user API.
  79  *
  80  * Notes
  81  *
  82  * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
  83  * include the revisions up to 2001/7/27 *CWB*
  84  *
  85  * The 3 character codes are the terminology codes like RFC 3066.  This
  86  * is compatible with prior ICU codes
  87  *
  88  * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
  89  * table but now at the end of the table because 3 character codes are
  90  * duplicates.  This avoids bad searches going from 3 to 2 character
  91  * codes.
  92  *
  93  * The range qaa-qtz is reserved for local use
  94  */
  95 static const char * const LANGUAGES[] = {
  96     "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",
  97     "afh", "agq", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",
  98     "ang", "anp", "apa",
  99     "ar",  "arc", "arn", "arp", "art", "arw", "as", "asa", "ast",
 100     "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",
 101     "bai", "bal", "ban", "bas", "bat", "be",  "bej",
 102     "bem", "ber", "bez", "bg",  "bh",  "bho", "bi",  "bik", "bin",
 103     "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "brx", "bs",
 104     "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",
 105     "cch", "ce",  "ceb", "cel", "cgg", "ch",  "chb", "chg", "chk", "chm",
 106     "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",
 107     "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",
 108     "cv",  "cy",  "da",  "dak", "dar", "dav", "day", "de",  "del", "den",
 109     "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyo", "dyu",
 110     "dz",  "ebu", "ee",  "efi", "egy", "eka", "el",  "elx", "en",
 111     "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",
 112     "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",
 113     "fr",  "frm", "fro", "frr", "frs", "fur", "fy",
 114     "ga",  "gaa", "gan", "gay", "gba", "gd",  "gem", "gez", "gil",
 115     "gl",  "gmh", "gn",  "goh", "gon", "gor", "got", "grb",
 116     "grc", "gsw", "gu",  "guz", "gv",  "gwi",
 117     "ha",  "hai", "hak", "haw", "he",  "hi",  "hil", "him",
 118     "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",  "hup", "hy",  "hz",
 119     "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",
 120     "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",
 121     "iu",  "ja",  "jbo", "jmc", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",
 122     "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kde", "kea", "kfo", "kg",  "kha", "khi",
 123     "kho", "khq", "ki",  "kj",  "kk",  "kl",  "kln", "km",  "kmb", "kn",
 124     "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks", "ksb", "ksf",
 125     "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad", "lag",
 126     "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",
 127     "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus", "luy",
 128     "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",
 129     "mdf", "mdr", "men", "mer", "mfe", "mg",  "mga", "mgh", "mh",  "mi",  "mic", "min",
 130     "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",
 131     "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mua", "mul", "mun",
 132     "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nan", "nap", "naq",
 133     "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",
 134     "niu", "nl",  "nmg", "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub", "nus",
 135     "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",
 136     "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",
 137     "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
 138     "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",
 139     "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rof", "rom",
 140     "ru",  "rup", "rw",  "rwk", "sa",  "sad", "sah", "sai", "sal", "sam", "saq",
 141     "sas", "sat", "sbp", "sc",  "scn", "sco", "sd",  "se",  "seh", "sel", "sem", "ses",
 142     "sg",  "sga", "sgn", "shi", "shn", "si",  "sid", "sio", "sit",
 143     "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",
 144     "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",
 145     "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",
 146     "sv",  "sw",  "swc", "syc", "syr", "ta",  "tai", "te",  "tem", "teo", "ter",
 147     "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",
 148     "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr", "trv",
 149     "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw", "twq",
 150     "ty",  "tyv", "tzm", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",
 151     "uz",  "vai", "ve",  "vi",  "vo",  "vot", "vun", "wa",  "wak",
 152     "wal", "war", "was", "wen", "wo",  "wuu", "xal", "xh",  "xog", "yao", "yap", "yav",
 153     "yi",  "yo",  "ypk", "yue", "za",  "zap", "zbl", "zen", "zh",  "znd",
 154     "zu",  "zun", "zxx", "zza",
 155 NULL,
 156     "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
 157 NULL
 158 };
 159 static const char* const DEPRECATED_LANGUAGES[]={
 160     "in", "iw", "ji", "jw", NULL, NULL
 161 };
 162 static const char* const REPLACEMENT_LANGUAGES[]={
 163     "id", "he", "yi", "jv", NULL, NULL
 164 };
 165
 166 /**
 167  * Table of 3-letter language codes.
 168  *
 169  * This is a lookup table used to convert 3-letter language codes to
 170  * their 2-letter equivalent, where possible.  It must be kept in sync
 171  * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
 172  * same language as LANGUAGES_3[i].  The commented-out lines are
 173  * copied from LANGUAGES to make eyeballing this baby easier.
 174  *
 175  * Where a 3-letter language code has no 2-letter equivalent, the
 176  * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
 177  *
 178  * This table should be terminated with a NULL entry, followed by a
 179  * second list, and another NULL entry.  The two lists correspond to
 180  * the two lists in LANGUAGES.
 181  */
 182 static const char * const LANGUAGES_3[] = {
 183 /*  "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",    */
 184     "aar", "abk", "ace", "ach", "ada", "ady", "ave", "afr", "afa",
 185 /*  "afh", "agq", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",  "ang", "anp", "apa",    */
 186     "afh", "agq", "ain", "aka", "akk", "ale", "alg", "alt", "amh", "arg", "ang", "anp", "apa",
 187 /*  "ar",  "arc", "arn", "arp", "art", "arw", "as",  "asa", "ast",    */
 188     "ara", "arc", "arn", "arp", "art", "arw", "asm", "asa", "ast",
 189 /*  "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",    */
 190     "ath", "aus", "ava", "awa", "aym", "aze", "bak", "bad",
 191 /*  "bai", "bal", "ban", "bas", "bat", "be",  "bej",    */
 192     "bai", "bal", "ban", "bas", "bat", "bel", "bej",
 193 /*  "bem", "ber", "bez", "bg",  "bh",  "bho", "bi",  "bik", "bin",    */
 194     "bem", "ber", "bez", "bul", "bih", "bho", "bis", "bik", "bin",
 195 /*  "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "brx", "bs",     */
 196     "bla", "bam", "ben", "bnt", "bod", "bre", "bra", "brx", "bos",
 197 /*  "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",    */
 198     "btk", "bua", "bug", "byn", "cat", "cad", "cai", "car", "cau",
 199 /*  "cch", "ce",  "ceb", "cel", "cgg", "ch",  "chb", "chg", "chk", "chm",    */
 200     "cch", "che", "ceb", "cel", "cgg", "cha", "chb", "chg", "chk", "chm",
 201 /*  "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",    */
 202     "chn", "cho", "chp", "chr", "chy", "cmc", "cos", "cop",
 203 /*  "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",    */
 204     "cpe", "cpf", "cpp", "cre", "crh", "crp", "ces", "csb", "chu", "cus",
 205 /*  "cv",  "cy",  "da",  "dak", "dar", "dav", "day", "de",  "del", "den",    */
 206     "chv", "cym", "dan", "dak", "dar", "dav", "day", "deu", "del", "den",
 207 /*  "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyo", "dyu",    */
 208     "dgr", "din", "dje", "doi", "dra", "dsb", "dua", "dum", "div", "dyo", "dyu",
 209 /*  "dz",  "ebu", "ee",  "efi", "egy", "eka", "el",  "elx", "en",     */
 210     "dzo", "ebu", "ewe", "efi", "egy", "eka", "ell", "elx", "eng",
 211 /*  "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",     */
 212     "enm", "epo", "spa", "est", "eus", "ewo", "fas",
 213 /*  "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",    */
 214     "fan", "fat", "ful", "fin", "fil", "fiu", "fij", "fao", "fon",
 215 /*  "fr",  "frm", "fro", "frr", "frs", "fur", "fy",  "ga",  "gaa", "gan", "gay",    */
 216     "fra", "frm", "fro", "frr", "frs", "fur", "fry", "gle", "gaa", "gan", "gay",
 217 /*  "gba", "gd",  "gem", "gez", "gil", "gl",  "gmh", "gn",     */
 218     "gba", "gla", "gem", "gez", "gil", "glg", "gmh", "grn",
 219 /*  "goh", "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guz", "gv",     */
 220     "goh", "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guz", "glv",
 221 /*  "gwi", "ha",  "hai", "hak", "haw", "he",  "hi",  "hil", "him",    */
 222     "gwi", "hau", "hai", "hak", "haw", "heb", "hin", "hil", "him",
 223 /*  "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",  "hup", "hy",  "hz",     */
 224     "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun", "hup", "hye", "her",
 225 /*  "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",     */
 226     "ina", "iba", "ind", "ile", "ibo", "iii", "ijo", "ipk",
 227 /*  "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",      */
 228     "ilo", "inc", "ine", "inh", "ido", "ira", "iro", "isl", "ita",
 229 /*  "iu",  "ja",  "jbo", "jmc", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",   */
 230     "iku", "jpn", "jbo", "jmc", "jpr", "jrb", "jav", "kat", "kaa", "kab",
 231 /*  "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kde", "kea", "kfo", "kg",  "kha", "khi",*/
 232     "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kde", "kea", "kfo", "kg",  "kha", "khi",
 233 /*  "kho", "khq", "ki",  "kj",  "kk",  "kl",  "kln", "km",  "kmb", "kn",     */
 234     "kho", "khq", "kik", "kua", "kaz", "kal", "kln", "khm", "kmb", "kan",
 235 /*  "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks",  "ksb", "ksf", */
 236     "kor", "kok", "kos", "kpe", "kau", "krc", "krl", "kro", "kru", "kas", "ksb", "ksf",
 237 /*  "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad", "lag",    */
 238     "kur", "kum", "kut", "kom", "cor", "kir", "lat", "lad", "lag",
 239 /*  "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",    */
 240     "lah", "lam", "ltz", "lez", "lug", "lim", "lin", "lao", "lol",
 241 /*  "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus", "luy",   */
 242     "loz", "lit", "lub", "lua", "lui", "lun", "luo", "lus", "luy",
 243 /*  "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",    */
 244     "lav", "mad", "mag", "mai", "mak", "man", "map", "mas",
 245 /*  "mdf", "mdr", "men", "mer", "mfe", "mg",  "mga", "mgh", "mh",  "mi",  "mic", "min",    */
 246     "mdf", "mdr", "men", "mer", "mfe", "mlg", "mga", "mgh", "mah", "mri", "mic", "min",
 247 /*  "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",    */
 248     "mis", "mkd", "mkh", "mal", "mon", "mnc", "mni", "mno",
 249 /*  "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mua", "mul", "mun",    */
 250     "mol", "moh", "mos", "mar", "msa", "mlt", "mua", "mul", "mun",
 251 /*  "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nan", "nap", "naq",   */
 252     "mus", "mwl", "mwr", "mya", "myn", "myv", "nau", "nah", "nai", "nan", "nap", "naq",
 253 /*  "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",    */
 254     "nob", "nde", "nds", "nep", "new", "ndo", "nia", "nic",
 255 /*  "niu", "nl",  "nmg", "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub", "nus",   */
 256     "niu", "nld", "nmg", "nno", "nor", "nog", "non", "nqo", "nbl", "nso", "nub", "nus",
 257 /*  "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",     */
 258     "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi", "oci", "oji",
 259 /*  "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",    */
 260     "orm", "ori", "oss", "osa", "ota", "oto", "pan", "paa",
 261 /*  "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",    */
 262     "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
 263 /*  "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",     */
 264     "pli", "pol", "pon", "pra", "pro", "pus", "por", "que",
 265 /*  "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rof", "rom",    */
 266     "raj", "rap", "rar", "roh", "run", "ron", "roa", "rof", "rom",
 267 /*  "ru",  "rup", "rw",  "rwk", "sa",  "sad", "sah", "sai", "sal", "sam", "saq",    */
 268     "rus", "rup", "kin", "rwk", "san", "sad", "sah", "sai", "sal", "sam", "saq",
 269 /*  "sas", "sat", "sbp", "sc",  "scn", "sco", "sd",  "se",  "seh", "sel", "sem", "ses",    */
 270     "sas", "sat", "sbp", "srd", "scn", "sco", "snd", "sme", "seh", "sel", "sem", "ses",
 271 /*  "sg",  "sga", "sgn", "shi", "shn", "si",  "sid", "sio", "sit",    */
 272     "sag", "sga", "sgn", "shi", "shn", "sin", "sid", "sio", "sit",
 273 /*  "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",    */
 274     "slk", "slv", "sla", "smo", "sma", "smi", "smj", "smn",
 275 /*  "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",     */
 276     "sms", "sna", "snk", "som", "sog", "son", "sqi", "srp",
 277 /*  "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",    */
 278     "srn", "srr", "ssw", "ssa", "sot", "sun", "suk", "sus", "sux",
 279 /*  "sv",  "sw",  "swc", "syc", "syr", "ta",  "tai", "te",  "tem", "teo", "ter",    */
 280     "swe", "swa", "swc", "syc", "syr", "tam", "tai", "tel", "tem", "teo", "ter",
 281 /*  "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",    */
 282     "tet", "tgk", "tha", "tir", "tig", "tiv", "tuk", "tkl",
 283 /*  "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr", "trv",    */
 284     "tgl", "tlh", "tli", "tmh", "tsn", "ton", "tog", "tpi", "tur", "trv",
 285 /*  "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",  "twq"   */
 286     "tso", "tsi", "tat", "tum", "tup", "tut", "tvl", "twi", "twq",
 287 /*  "ty",  "tyv", "tzm", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",     */
 288     "tah", "tyv", "tzm", "udm", "uig", "uga", "ukr", "umb", "und", "urd",
 289 /*  "uz",  "vai", "ve",  "vi",  "vo",  "vot", "vun", "wa",  "wak",    */
 290     "uzb", "vai", "ven", "vie", "vol", "vot", "vun", "wln", "wak",
 291 /*  "wal", "war", "was", "wen", "wo",  "wuu", "xal", "xh",  "xog", "yao", "yap", "yav",   */
 292     "wal", "war", "was", "wen", "wol", "wuu", "xal", "xho", "xog", "yao", "yap", "yav",
 293 /*  "yi",  "yo",  "ypk", "yue", "za",  "zap", "zbl", "zen", "zh",  "znd",    */
 294     "yid", "yor", "ypk", "yue", "zha", "zap", "zbl", "zen", "zho", "znd",
 295 /*  "zu",  "zun", "zxx", "zza",                                         */
 296     "zul", "zun", "zxx", "zza",
 297 NULL,
 298 /*  "in",  "iw",  "ji",  "jw",  "sh",                          */
 299     "ind", "heb", "yid", "jaw", "srp",
 300 NULL
 301 };
 302
 303 /**
 304  * Table of 2-letter country codes.
 305  *
 306  * This list must be in sorted order.  This list is returned directly
 307  * to the user by some API.
 308  *
 309  * This list must be kept in sync with COUNTRIES_3, with corresponding
 310  * entries matched.
 311  *
 312  * This table should be terminated with a NULL entry, followed by a
 313  * second list, and another NULL entry.  The first list is visible to
 314  * user code when this array is returned by API.  The second list
 315  * contains codes we support, but do not expose through user API.
 316  *
 317  * Notes:
 318  *
 319  * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 320  * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
 321  * new codes keeping the old ones for compatibility updated to include
 322  * 1999/12/03 revisions *CWB*
 323  *
 324  * RO(ROM) is now RO(ROU) according to
 325  * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
 326  */
 327 static const char * const COUNTRIES[] = {
 328     "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",
 329     "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
 330     "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
 331     "BJ",  "BL",  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",
 332     "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
 333     "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
 334     "CU",  "CV",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
 335     "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
 336     "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
 337     "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
 338     "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
 339     "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
 340     "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
 341     "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
 342     "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
 343     "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
 344     "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
 345     "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
 346     "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
 347     "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
 348     "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
 349     "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
 350     "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
 351     "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
 352     "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",
 353     "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
 354     "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
 355     "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
 356     "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
 357     "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
 358 NULL,
 359     "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   /* obsolete country codes */
 360 NULL
 361 };
 362
 363 static const char* const DEPRECATED_COUNTRIES[] ={
 364     "BU", "CS", "DY", "FX", "HV", "NH", "RH", "TP", "YU", "ZR", NULL, NULL /* deprecated country list */
 365 };
 366 static const char* const REPLACEMENT_COUNTRIES[] = {
 367 /*  "BU", "CS", "DY", "FX", "HV", "NH", "RH", "TP", "YU", "ZR" */
 368     "MM", "RS", "BJ", "FR", "BF", "VU", "ZW", "TL", "RS", "CD", NULL, NULL  /* replacement country codes */
 369 };
 370
 371 /**
 372  * Table of 3-letter country codes.
 373  *
 374  * This is a lookup table used to convert 3-letter country codes to
 375  * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
 376  * For all valid i, COUNTRIES[i] must refer to the same country as
 377  * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
 378  * to make eyeballing this baby easier.
 379  *
 380  * This table should be terminated with a NULL entry, followed by a
 381  * second list, and another NULL entry.  The two lists correspond to
 382  * the two lists in COUNTRIES.
 383  */
 384 static const char * const COUNTRIES_3[] = {
 385 /*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",     */
 386     "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM", "ANT",
 387 /*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
 388     "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
 389 /*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
 390     "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
 391 /*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",     */
 392     "BEN", "BLM", "BMU", "BRN", "BOL", "BRA", "BHS", "BTN", "BVT",
 393 /*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
 394     "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
 395 /*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
 396     "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
 397 /*  "CU",  "CV",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
 398     "CUB", "CPV", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
 399 /*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
 400     "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
 401 /*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
 402     "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
 403 /*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
 404     "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
 405 /*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
 406     "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
 407 /*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
 408     "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
 409 /*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
 410     "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
 411 /*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
 412     "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
 413 /*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
 414     "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
 415 /*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
 416     "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
 417 /*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
 418     "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
 419 /*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
 420     "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
 421 /*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
 422     "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
 423 /*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
 424     "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
 425 /*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
 426     "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
 427 /*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
 428     "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
 429 /*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
 430     "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
 431 /*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
 432     "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
 433 /*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",     */
 434     "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "STP", "SLV",
 435 /*  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
 436     "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
 437 /*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
 438     "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
 439 /*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
 440     "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
 441 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
 442     "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
 443 /*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
 444     "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
 445 NULL,
 446 /*  "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   */
 447     "FXX", "SCG", "ROM", "TMP", "YUG", "ZAR",
 448 NULL
 449 };
 450
 451 typedef struct CanonicalizationMap {
 452     const char *id;          /* input ID */
 453     const char *canonicalID; /* canonicalized output ID */
 454     const char *keyword;     /* keyword, or NULL if none */
 455     const char *value;       /* keyword value, or NULL if kw==NULL */
 456 } CanonicalizationMap;
 457
 458 /**
 459  * A map to canonicalize locale IDs.  This handles a variety of
 460  * different semantic kinds of transformations.
 461  */
 462 static const CanonicalizationMap CANONICALIZE_MAP[] = {
 463     { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
 464     { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
 465     { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
 466     { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
 467     { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
 468     { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
 469     { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
 470     { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
 471     { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
 472     { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
 473     { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
 474     { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
 475     { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
 476     { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
 477     { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
 478     { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
 479     { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
 480     { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
 481     { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
 482     { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
 483     { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
 484     { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
 485     { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
 486     { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
 487     { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
 488     { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
 489     { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
 490     { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
 491     { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
 492     { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
 493     { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
 494     { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
 495     { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
 496     { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
 497     { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
 498     { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
 499     { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
 500     { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
 501     { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
 502     { "zh_GAN",         "gan", NULL, NULL }, /* registered name */
 503     { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
 504     { "zh_HAKKA",       "hak", NULL, NULL }, /* registered name */
 505     { "zh_MIN_NAN",     "nan", NULL, NULL }, /* registered name */
 506     { "zh_WUU",         "wuu", NULL, NULL }, /* registered name */
 507     { "zh_XIANG",       "hsn", NULL, NULL }, /* registered name */
 508     { "zh_YUE",         "yue", NULL, NULL }, /* registered name */
 509 };
 510
 511 typedef struct VariantMap {
 512     const char *variant;          /* input ID */
 513     const char *keyword;     /* keyword, or NULL if none */
 514     const char *value;       /* keyword value, or NULL if kw==NULL */
 515 } VariantMap;
 516
 517 static const VariantMap VARIANT_MAP[] = {
 518     { "EURO",   "currency", "EUR" },
 519     { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
 520     { "STROKE", "collation", "stroke" }  /* Solaris variant */
 521 };
 522
 523 /* ### BCP47 Conversion *******************************************/
 524 /* Test if the locale id has BCP47 u extension and does not have '@' */
 525 #define _hasBCP47Extension(id) (id && uprv_strstr(id, "@") == NULL && getShortestSubtagLength(localeID) == 1)
 526 /* Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails */
 527 #define _ConvertBCP47(finalID, id, buffer, length,err) \
 528         if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 || U_FAILURE(*err)) { \
 529             finalID=id; \
 530         } else { \
 531             finalID=buffer; \
 532         }
 533 /* Gets the size of the shortest subtag in the given localeID. */
 534 static int32_t getShortestSubtagLength(const char *localeID) {
 535     int32_t localeIDLength = uprv_strlen(localeID);
 536     int32_t length = localeIDLength;
 537     int32_t tmpLength = 0;
 538     int32_t i;
 539     UBool reset = TRUE;
 540
 541     for (i = 0; i < localeIDLength; i++) {
 542         if (localeID[i] != '_' && localeID[i] != '-') {
 543             if (reset) {
 544                 tmpLength = 0;
 545                 reset = FALSE;
 546             }
 547             tmpLength++;
 548         } else {
 549             if (tmpLength != 0 && tmpLength < length) {
 550                 length = tmpLength;
 551             }
 552             reset = TRUE;
 553         }
 554     }
 555
 556     return length;
 557 }
 558
 559 /* ### Keywords **************************************************/
 560
/* Size in bytes of the buffers that hold one lowercased keyword name,
   including the terminating NUL; names of this length or longer are rejected. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Number of entries in the keyword list that _getKeywords collects into;
   a locale ID needing more entries is reported as an error. */
#define ULOC_MAX_NO_KEYWORDS 25
 563
 564 U_CAPI const char * U_EXPORT2
 565 locale_getKeywordsStart(const char *localeID) {
 566     const char *result = NULL;
 567     if((result = uprv_strchr(localeID, '@')) != NULL) {
 568         return result;
 569     }
 570 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
 571     else {
 572         /* We do this because the @ sign is variant, and the @ sign used on one
 573         EBCDIC machine won't be compiled the same way on other EBCDIC based
 574         machines. */
 575         static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
 576         const uint8_t *charToFind = ebcdicSigns;
 577         while(*charToFind) {
 578             if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
 579                 return result;
 580             }
 581             charToFind++;
 582         }
 583     }
 584 #endif
 585     return NULL;
 586 }
 587
 588 /**
 589  * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
 590  * @param keywordName incoming name to be canonicalized
 591  * @param status return status (keyword too long)
 592  * @return length of the keyword name
 593  */
 594 static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
 595 {
 596   int32_t i;
 597   int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
 598
 599   if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
 600     /* keyword name too long for internal buffer */
 601     *status = U_INTERNAL_PROGRAM_ERROR;
 602           return 0;
 603   }
 604
 605   /* normalize the keyword name */
 606   for(i = 0; i < keywordNameLen; i++) {
 607     buf[i] = uprv_tolower(keywordName[i]);
 608   }
 609   buf[i] = 0;
 610
 611   return keywordNameLen;
 612 }
 613
/**
 * One keyword/value pair collected while parsing the keyword section of a
 * locale ID. The keyword name is stored by value; the value is referenced in
 * place (pointer plus length) and is not NUL-terminated at valueLen.
 */
typedef struct {
    char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* lowercased, NUL-terminated keyword name */
    int32_t keywordLen;                    /* length of keyword, without the NUL */
    const char *valueStart;                /* first character of the value */
    int32_t valueLen;                      /* number of value characters at valueStart */
} KeywordStruct;
 620
 621 static int32_t U_CALLCONV
 622 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
 623     const char* leftString = ((const KeywordStruct *)left)->keyword;
 624     const char* rightString = ((const KeywordStruct *)right)->keyword;
 625     return uprv_strcmp(leftString, rightString);
 626 }
 627
 628 /**
 629  * Both addKeyword and addValue must already be in canonical form.
 630  * Either both addKeyword and addValue are NULL, or neither is NULL.
 631  * If they are not NULL they must be zero terminated.
 632  * If addKeyword is not NULL is must have length small enough to fit in KeywordStruct.keyword.
 633  */
 634 static int32_t
 635 _getKeywords(const char *localeID,
 636              char prev,
 637              char *keywords, int32_t keywordCapacity,
 638              char *values, int32_t valuesCapacity, int32_t *valLen,
 639              UBool valuesToo,
 640              const char* addKeyword,
 641              const char* addValue,
 642              UErrorCode *status)
 643 {
 644     KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
 645
 646     int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
 647     int32_t numKeywords = 0;
 648     const char* pos = localeID;
 649     const char* equalSign = NULL;
 650     const char* semicolon = NULL;
 651     int32_t i = 0, j, n;
 652     int32_t keywordsLen = 0;
 653     int32_t valuesLen = 0;
 654
 655     if(prev == '@') { /* start of keyword definition */
 656         /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
 657         do {
 658             UBool duplicate = FALSE;
 659             /* skip leading spaces */
 660             while(*pos == ' ') {
 661                 pos++;
 662             }
 663             if (!*pos) { /* handle trailing "; " */
 664                 break;
 665             }
 666             if(numKeywords == maxKeywords) {
 667                 *status = U_INTERNAL_PROGRAM_ERROR;
 668                 return 0;
 669             }
 670             equalSign = uprv_strchr(pos, '=');
 671             semicolon = uprv_strchr(pos, ';');
 672             /* lack of '=' [foo@currency] is illegal */
 673             /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
 674             if(!equalSign || (semicolon && semicolon<equalSign)) {
 675                 *status = U_INVALID_FORMAT_ERROR;
 676                 return 0;
 677             }
 678             /* need to normalize both keyword and keyword name */
 679             if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
 680                 /* keyword name too long for internal buffer */
 681                 *status = U_INTERNAL_PROGRAM_ERROR;
 682                 return 0;
 683             }
 684             for(i = 0, n = 0; i < equalSign - pos; ++i) {
 685                 if (pos[i] != ' ') {
 686                     keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
 687                 }
 688             }
 689             keywordList[numKeywords].keyword[n] = 0;
 690             keywordList[numKeywords].keywordLen = n;
 691             /* now grab the value part. First we skip the '=' */
 692             equalSign++;
 693             /* then we leading spaces */
 694             while(*equalSign == ' ') {
 695                 equalSign++;
 696             }
 697             keywordList[numKeywords].valueStart = equalSign;
 698
 699             pos = semicolon;
 700             i = 0;
 701             if(pos) {
 702                 while(*(pos - i - 1) == ' ') {
 703                     i++;
 704                 }
 705                 keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
 706                 pos++;
 707             } else {
 708                 i = (int32_t)uprv_strlen(equalSign);
 709                 while(i && equalSign[i-1] == ' ') {
 710                     i--;
 711                 }
 712                 keywordList[numKeywords].valueLen = i;
 713             }
 714             /* If this is a duplicate keyword, then ignore it */
 715             for (j=0; j<numKeywords; ++j) {
 716                 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
 717                     duplicate = TRUE;
 718                     break;
 719                 }
 720             }
 721             if (!duplicate) {
 722                 ++numKeywords;
 723             }
 724         } while(pos);
 725
 726         /* Handle addKeyword/addValue. */
 727         if (addKeyword != NULL) {
 728             UBool duplicate = FALSE;
 729             U_ASSERT(addValue != NULL);
 730             /* Search for duplicate; if found, do nothing. Explicit keyword
 731                overrides addKeyword. */
 732             for (j=0; j<numKeywords; ++j) {
 733                 if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
 734                     duplicate = TRUE;
 735                     break;
 736                 }
 737             }
 738             if (!duplicate) {
 739                 if (numKeywords == maxKeywords) {
 740                     *status = U_INTERNAL_PROGRAM_ERROR;
 741                     return 0;
 742                 }
 743                 uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
 744                 keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
 745                 keywordList[numKeywords].valueStart = addValue;
 746                 keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
 747                 ++numKeywords;
 748             }
 749         } else {
 750             U_ASSERT(addValue == NULL);
 751         }
 752
 753         /* now we have a list of keywords */
 754         /* we need to sort it */
 755         uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
 756
 757         /* Now construct the keyword part */
 758         for(i = 0; i < numKeywords; i++) {
 759             if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
 760                 uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
 761                 if(valuesToo) {
 762                     keywords[keywordsLen + keywordList[i].keywordLen] = '=';
 763                 } else {
 764                     keywords[keywordsLen + keywordList[i].keywordLen] = 0;
 765                 }
 766             }
 767             keywordsLen += keywordList[i].keywordLen + 1;
 768             if(valuesToo) {
 769                 if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
 770                     uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
 771                 }
 772                 keywordsLen += keywordList[i].valueLen;
 773
 774                 if(i < numKeywords - 1) {
 775                     if(keywordsLen < keywordCapacity) {
 776                         keywords[keywordsLen] = ';';
 777                     }
 778                     keywordsLen++;
 779                 }
 780             }
 781             if(values) {
 782                 if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
 783                     uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
 784                     values[valuesLen + keywordList[i].valueLen] = 0;
 785                 }
 786                 valuesLen += keywordList[i].valueLen + 1;
 787             }
 788         }
 789         if(values) {
 790             values[valuesLen] = 0;
 791             if(valLen) {
 792                 *valLen = valuesLen;
 793             }
 794         }
 795         return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
 796     } else {
 797         return 0;
 798     }
 799 }
 800
 801 U_CFUNC int32_t
 802 locale_getKeywords(const char *localeID,
 803                    char prev,
 804                    char *keywords, int32_t keywordCapacity,
 805                    char *values, int32_t valuesCapacity, int32_t *valLen,
 806                    UBool valuesToo,
 807                    UErrorCode *status) {
 808     return _getKeywords(localeID, prev, keywords, keywordCapacity,
 809                         values, valuesCapacity, valLen, valuesToo,
 810                         NULL, NULL, status);
 811 }
 812
 813 U_CAPI int32_t U_EXPORT2
 814 uloc_getKeywordValue(const char* localeID,
 815                      const char* keywordName,
 816                      char* buffer, int32_t bufferCapacity,
 817                      UErrorCode* status)
 818 {
 819     const char* startSearchHere = NULL;
 820     const char* nextSeparator = NULL;
 821     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 822     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 823     int32_t i = 0;
 824     int32_t result = 0;
 825
 826     if(status && U_SUCCESS(*status) && localeID) {
 827       char tempBuffer[ULOC_FULLNAME_CAPACITY];
 828       const char* tmpLocaleID;
 829
 830       if (_hasBCP47Extension(localeID)) {
 831           _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
 832       } else {
 833           tmpLocaleID=localeID;
 834       }
 835
 836       startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
 837       if(startSearchHere == NULL) {
 838           /* no keywords, return at once */
 839           return 0;
 840       }
 841
 842       locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 843       if(U_FAILURE(*status)) {
 844         return 0;
 845       }
 846
 847       /* find the first keyword */
 848       while(startSearchHere) {
 849           startSearchHere++;
 850           /* skip leading spaces (allowed?) */
 851           while(*startSearchHere == ' ') {
 852               startSearchHere++;
 853           }
 854           nextSeparator = uprv_strchr(startSearchHere, '=');
 855           /* need to normalize both keyword and keyword name */
 856           if(!nextSeparator) {
 857               break;
 858           }
 859           if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
 860               /* keyword name too long for internal buffer */
 861               *status = U_INTERNAL_PROGRAM_ERROR;
 862               return 0;
 863           }
 864           for(i = 0; i < nextSeparator - startSearchHere; i++) {
 865               localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
 866           }
 867           /* trim trailing spaces */
 868           while(startSearchHere[i-1] == ' ') {
 869               i--;
 870               U_ASSERT(i>=0);
 871           }
 872           localeKeywordNameBuffer[i] = 0;
 873
 874           startSearchHere = uprv_strchr(nextSeparator, ';');
 875
 876           if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
 877               nextSeparator++;
 878               while(*nextSeparator == ' ') {
 879                   nextSeparator++;
 880               }
 881               /* we actually found the keyword. Copy the value */
 882               if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
 883                   while(*(startSearchHere-1) == ' ') {
 884                       startSearchHere--;
 885                   }
 886                   uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
 887                   result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
 888               } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
 889                   i = (int32_t)uprv_strlen(nextSeparator);
 890                   while(nextSeparator[i - 1] == ' ') {
 891                       i--;
 892                   }
 893                   uprv_strncpy(buffer, nextSeparator, i);
 894                   result = u_terminateChars(buffer, bufferCapacity, i, status);
 895               } else {
 896                   /* give a bigger buffer, please */
 897                   *status = U_BUFFER_OVERFLOW_ERROR;
 898                   if(startSearchHere) {
 899                       result = (int32_t)(startSearchHere - nextSeparator);
 900                   } else {
 901                       result = (int32_t)uprv_strlen(nextSeparator);
 902                   }
 903               }
 904               return result;
 905           }
 906       }
 907     }
 908     return 0;
 909 }
 910
 911 U_CAPI int32_t U_EXPORT2
 912 uloc_setKeywordValue(const char* keywordName,
 913                      const char* keywordValue,
 914                      char* buffer, int32_t bufferCapacity,
 915                      UErrorCode* status)
 916 {
 917     /* TODO: sorting. removal. */
 918     int32_t keywordNameLen;
 919     int32_t keywordValueLen;
 920     int32_t bufLen;
 921     int32_t needLen = 0;
 922     int32_t foundValueLen;
 923     int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
 924     char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 925     char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
 926     int32_t i = 0;
 927     int32_t rc;
 928     char* nextSeparator = NULL;
 929     char* nextEqualsign = NULL;
 930     char* startSearchHere = NULL;
 931     char* keywordStart = NULL;
 932     char *insertHere = NULL;
 933     if(U_FAILURE(*status)) {
 934         return -1;
 935     }
 936     if(bufferCapacity>1) {
 937         bufLen = (int32_t)uprv_strlen(buffer);
 938     } else {
 939         *status = U_ILLEGAL_ARGUMENT_ERROR;
 940         return 0;
 941     }
 942     if(bufferCapacity<bufLen) {
 943         /* The capacity is less than the length?! Is this NULL terminated? */
 944         *status = U_ILLEGAL_ARGUMENT_ERROR;
 945         return 0;
 946     }
 947     if(keywordValue && !*keywordValue) {
 948         keywordValue = NULL;
 949     }
 950     if(keywordValue) {
 951         keywordValueLen = (int32_t)uprv_strlen(keywordValue);
 952     } else {
 953         keywordValueLen = 0;
 954     }
 955     keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
 956     if(U_FAILURE(*status)) {
 957         return 0;
 958     }
 959     startSearchHere = (char*)locale_getKeywordsStart(buffer);
 960     if(startSearchHere == NULL || (startSearchHere[1]==0)) {
 961         if(!keywordValue) { /* no keywords = nothing to remove */
 962             return bufLen;
 963         }
 964
 965         needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
 966         if(startSearchHere) { /* had a single @ */
 967             needLen--; /* already had the @ */
 968             /* startSearchHere points at the @ */
 969         } else {
 970             startSearchHere=buffer+bufLen;
 971         }
 972         if(needLen >= bufferCapacity) {
 973             *status = U_BUFFER_OVERFLOW_ERROR;
 974             return needLen; /* no change */
 975         }
 976         *startSearchHere = '@';
 977         startSearchHere++;
 978         uprv_strcpy(startSearchHere, keywordNameBuffer);
 979         startSearchHere += keywordNameLen;
 980         *startSearchHere = '=';
 981         startSearchHere++;
 982         uprv_strcpy(startSearchHere, keywordValue);
 983         startSearchHere+=keywordValueLen;
 984         return needLen;
 985     } /* end shortcut - no @ */
 986
 987     keywordStart = startSearchHere;
 988     /* search for keyword */
 989     while(keywordStart) {
 990         keywordStart++;
 991         /* skip leading spaces (allowed?) */
 992         while(*keywordStart == ' ') {
 993             keywordStart++;
 994         }
 995         nextEqualsign = uprv_strchr(keywordStart, '=');
 996         /* need to normalize both keyword and keyword name */
 997         if(!nextEqualsign) {
 998             break;
 999         }
1000         if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1001             /* keyword name too long for internal buffer */
1002             *status = U_INTERNAL_PROGRAM_ERROR;
1003             return 0;
1004         }
1005         for(i = 0; i < nextEqualsign - keywordStart; i++) {
1006             localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1007         }
1008         /* trim trailing spaces */
1009         while(keywordStart[i-1] == ' ') {
1010             i--;
1011         }
1012         U_ASSERT(i>=0);
1013         localeKeywordNameBuffer[i] = 0;
1014
1015         nextSeparator = uprv_strchr(nextEqualsign, ';');
1016         rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1017         if(rc == 0) {
1018             nextEqualsign++;
1019             while(*nextEqualsign == ' ') {
1020                 nextEqualsign++;
1021             }
1022             /* we actually found the keyword. Change the value */
1023             if (nextSeparator) {
1024                 keywordAtEnd = 0;
1025                 foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1026             } else {
1027                 keywordAtEnd = 1;
1028                 foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1029             }
1030             if(keywordValue) { /* adding a value - not removing */
1031               if(foundValueLen == keywordValueLen) {
1032                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1033                 return bufLen; /* no change in size */
1034               } else if(foundValueLen > keywordValueLen) {
1035                 int32_t delta = foundValueLen - keywordValueLen;
1036                 if(nextSeparator) { /* RH side */
1037                   uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1038                 }
1039                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1040                 bufLen -= delta;
1041                 buffer[bufLen]=0;
1042                 return bufLen;
1043               } else { /* FVL < KVL */
1044                 int32_t delta = keywordValueLen - foundValueLen;
1045                 if((bufLen+delta) >= bufferCapacity) {
1046                   *status = U_BUFFER_OVERFLOW_ERROR;
1047                   return bufLen+delta;
1048                 }
1049                 if(nextSeparator) { /* RH side */
1050                   uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1051                 }
1052                 uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1053                 bufLen += delta;
1054                 buffer[bufLen]=0;
1055                 return bufLen;
1056               }
1057             } else { /* removing a keyword */
1058               if(keywordAtEnd) {
1059                 /* zero out the ';' or '@' just before startSearchhere */
1060                 keywordStart[-1] = 0;
1061                 return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1062               } else {
1063                 uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1064                 keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1065                 return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1066               }
1067             }
1068         } else if(rc<0){ /* end match keyword */
1069           /* could insert at this location. */
1070           insertHere = keywordStart;
1071         }
1072         keywordStart = nextSeparator;
1073     } /* end loop searching */
1074
1075     if(!keywordValue) {
1076       return bufLen; /* removal of non-extant keyword - no change */
1077     }
1078
1079     /* we know there is at least one keyword. */
1080     needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1081     if(needLen >= bufferCapacity) {
1082         *status = U_BUFFER_OVERFLOW_ERROR;
1083         return needLen; /* no change */
1084     }
1085
1086     if(insertHere) {
1087       uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1088       keywordStart = insertHere;
1089     } else {
1090       keywordStart = buffer+bufLen;
1091       *keywordStart = ';';
1092       keywordStart++;
1093     }
1094     uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1095     keywordStart += keywordNameLen;
1096     *keywordStart = '=';
1097     keywordStart++;
1098     uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1099     keywordStart+=keywordValueLen;
1100     if(insertHere) {
1101       *keywordStart = ';';
1102       keywordStart++;
1103     }
1104     buffer[needLen]=0;
1105     return needLen;
1106 }
1107
1108 /* ### ID parsing implementation **************************************************/
1109
/* TRUE if a is one of the letters that can start a special prefix: x, X, i, I.
   The argument is parenthesized in the expansion so that any expression may
   be passed; it is evaluated more than once, so it must have no side effects. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1120
1121 static char* _strnchr(const char* str, int32_t len, char c) {
1122     U_ASSERT(str != 0 && len >= 0);
1123     while (len-- != 0) {
1124         char d = *str;
1125         if (d == c) {
1126             return (char*) str;
1127         } else if (d == 0) {
1128             break;
1129         }
1130         ++str;
1131     }
1132     return NULL;
1133 }
1134
1135 /**
1136  * Lookup 'key' in the array 'list'.  The array 'list' should contain
1137  * a NULL entry, followed by more entries, and a second NULL entry.
1138  *
1139  * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1140  * COUNTRIES_3.
1141  */
1142 static int16_t _findIndex(const char* const* list, const char* key)
1143 {
1144     const char* const* anchor = list;
1145     int32_t pass = 0;
1146
1147     /* Make two passes through two NULL-terminated arrays at 'list' */
1148     while (pass++ < 2) {
1149         while (*list) {
1150             if (uprv_strcmp(key, *list) == 0) {
1151                 return (int16_t)(list - anchor);
1152             }
1153             list++;
1154         }
1155         ++list;     /* skip final NULL *CWB*/
1156     }
1157     return -1;
1158 }
1159
1160 /* count the length of src while copying it to dest; return strlen(src) */
1161 static inline int32_t
1162 _copyCount(char *dest, int32_t destCapacity, const char *src) {
1163     const char *anchor;
1164     char c;
1165
1166     anchor=src;
1167     for(;;) {
1168         if((c=*src)==0) {
1169             return (int32_t)(src-anchor);
1170         }
1171         if(destCapacity<=0) {
1172             return (int32_t)((src-anchor)+uprv_strlen(src));
1173         }
1174         ++src;
1175         *dest++=c;
1176         --destCapacity;
1177     }
1178 }
1179
1180 U_CFUNC const char*
1181 uloc_getCurrentCountryID(const char* oldID){
1182     int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1183     if (offset >= 0) {
1184         return REPLACEMENT_COUNTRIES[offset];
1185     }
1186     return oldID;
1187 }
1188 U_CFUNC const char*
1189 uloc_getCurrentLanguageID(const char* oldID){
1190     int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1191     if (offset >= 0) {
1192         return REPLACEMENT_LANGUAGES[offset];
1193     }
1194     return oldID;
1195 }
1196 /*
1197  * the internal functions _getLanguage(), _getCountry(), _getVariant()
1198  * avoid duplicating code to handle the earlier locale ID pieces
1199  * in the functions for the later ones by
1200  * setting the *pEnd pointer to where they stopped parsing
1201  *
1202  * TODO try to use this in Locale
1203  */
1204 U_CFUNC int32_t
1205 ulocimp_getLanguage(const char *localeID,
1206                     char *language, int32_t languageCapacity,
1207                     const char **pEnd) {
1208     int32_t i=0;
1209     int32_t offset;
1210     char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1211
1212     /* if it starts with i- or x- then copy that prefix */
1213     if(_isIDPrefix(localeID)) {
1214         if(i<languageCapacity) {
1215             language[i]=(char)uprv_tolower(*localeID);
1216         }
1217         if(i<languageCapacity) {
1218             language[i+1]='-';
1219         }
1220         i+=2;
1221         localeID+=2;
1222     }
1223
1224     /* copy the language as far as possible and count its length */
1225     while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1226         if(i<languageCapacity) {
1227             language[i]=(char)uprv_tolower(*localeID);
1228         }
1229         if(i<3) {
1230             U_ASSERT(i>=0);
1231             lang[i]=(char)uprv_tolower(*localeID);
1232         }
1233         i++;
1234         localeID++;
1235     }
1236
1237     if(i==3) {
1238         /* convert 3 character code to 2 character code if possible *CWB*/
1239         offset=_findIndex(LANGUAGES_3, lang);
1240         if(offset>=0) {
1241             i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1242         }
1243     }
1244
1245     if(pEnd!=NULL) {
1246         *pEnd=localeID;
1247     }
1248     return i;
1249 }
1250
1251 U_CFUNC int32_t
1252 ulocimp_getScript(const char *localeID,
1253                   char *script, int32_t scriptCapacity,
1254                   const char **pEnd)
1255 {
1256     int32_t idLen = 0;
1257
1258     if (pEnd != NULL) {
1259         *pEnd = localeID;
1260     }
1261
1262     /* copy the second item as far as possible and count its length */
1263     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1264             && uprv_isASCIILetter(localeID[idLen])) {
1265         idLen++;
1266     }
1267
1268     /* If it's exactly 4 characters long, then it's a script and not a country. */
1269     if (idLen == 4) {
1270         int32_t i;
1271         if (pEnd != NULL) {
1272             *pEnd = localeID+idLen;
1273         }
1274         if(idLen > scriptCapacity) {
1275             idLen = scriptCapacity;
1276         }
1277         if (idLen >= 1) {
1278             script[0]=(char)uprv_toupper(*(localeID++));
1279         }
1280         for (i = 1; i < idLen; i++) {
1281             script[i]=(char)uprv_tolower(*(localeID++));
1282         }
1283     }
1284     else {
1285         idLen = 0;
1286     }
1287     return idLen;
1288 }
1289
1290 U_CFUNC int32_t
1291 ulocimp_getCountry(const char *localeID,
1292                    char *country, int32_t countryCapacity,
1293                    const char **pEnd)
1294 {
1295     int32_t idLen=0;
1296     char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1297     int32_t offset;
1298
1299     /* copy the country as far as possible and count its length */
1300     while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1301         if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1302             cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1303         }
1304         idLen++;
1305     }
1306
1307     /* the country should be either length 2 or 3 */
1308     if (idLen == 2 || idLen == 3) {
1309         UBool gotCountry = FALSE;
1310         /* convert 3 character code to 2 character code if possible *CWB*/
1311         if(idLen==3) {
1312             offset=_findIndex(COUNTRIES_3, cnty);
1313             if(offset>=0) {
1314                 idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1315                 gotCountry = TRUE;
1316             }
1317         }
1318         if (!gotCountry) {
1319             int32_t i = 0;
1320             for (i = 0; i < idLen; i++) {
1321                 if (i < countryCapacity) {
1322                     country[i]=(char)uprv_toupper(localeID[i]);
1323                 }
1324             }
1325         }
1326         localeID+=idLen;
1327     } else {
1328         idLen = 0;
1329     }
1330
1331     if(pEnd!=NULL) {
1332         *pEnd=localeID;
1333     }
1334
1335     return idLen;
1336 }
1337
1338 /**
1339  * @param needSeparator if true, then add leading '_' if any variants
1340  * are added to 'variant'
1341  */
1342 static int32_t
1343 _getVariantEx(const char *localeID,
1344               char prev,
1345               char *variant, int32_t variantCapacity,
1346               UBool needSeparator) {
1347     int32_t i=0;
1348
1349     /* get one or more variant tags and separate them with '_' */
1350     if(_isIDSeparator(prev)) {
1351         /* get a variant string after a '-' or '_' */
1352         while(!_isTerminator(*localeID)) {
1353             if (needSeparator) {
1354                 if (i<variantCapacity) {
1355                     variant[i] = '_';
1356                 }
1357                 ++i;
1358                 needSeparator = FALSE;
1359             }
1360             if(i<variantCapacity) {
1361                 variant[i]=(char)uprv_toupper(*localeID);
1362                 if(variant[i]=='-') {
1363                     variant[i]='_';
1364                 }
1365             }
1366             i++;
1367             localeID++;
1368         }
1369     }
1370
1371     /* if there is no variant tag after a '-' or '_' then look for '@' */
1372     if(i==0) {
1373         if(prev=='@') {
1374             /* keep localeID */
1375         } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1376             ++localeID; /* point after the '@' */
1377         } else {
1378             return 0;
1379         }
1380         while(!_isTerminator(*localeID)) {
1381             if (needSeparator) {
1382                 if (i<variantCapacity) {
1383                     variant[i] = '_';
1384                 }
1385                 ++i;
1386                 needSeparator = FALSE;
1387             }
1388             if(i<variantCapacity) {
1389                 variant[i]=(char)uprv_toupper(*localeID);
1390                 if(variant[i]=='-' || variant[i]==',') {
1391                     variant[i]='_';
1392                 }
1393             }
1394             i++;
1395             localeID++;
1396         }
1397     }
1398
1399     return i;
1400 }
1401
1402 static int32_t
1403 _getVariant(const char *localeID,
1404             char prev,
1405             char *variant, int32_t variantCapacity) {
1406     return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1407 }
1408
1409 /**
1410  * Delete ALL instances of a variant from the given list of one or
1411  * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1412  * @param variants the source string of one or more variants,
1413  * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1414  * terminated; if it is, trailing zero will NOT be maintained.
1415  * @param variantsLen length of variants
1416  * @param toDelete variant to delete, without separators, e.g.  "EURO"
1417  * or "PREEURO"; not zero terminated
1418  * @param toDeleteLen length of toDelete
1419  * @return number of characters deleted from variants
1420  */
1421 static int32_t
1422 _deleteVariant(char* variants, int32_t variantsLen,
1423                const char* toDelete, int32_t toDeleteLen)
1424 {
1425     int32_t delta = 0; /* number of chars deleted */
1426     for (;;) {
1427         UBool flag = FALSE;
1428         if (variantsLen < toDeleteLen) {
1429             return delta;
1430         }
1431         if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1432             (variantsLen == toDeleteLen ||
1433              (flag=(variants[toDeleteLen] == '_'))))
1434         {
1435             int32_t d = toDeleteLen + (flag?1:0);
1436             variantsLen -= d;
1437             delta += d;
1438             if (variantsLen > 0) {
1439                 uprv_memmove(variants, variants+d, variantsLen);
1440             }
1441         } else {
1442             char* p = _strnchr(variants, variantsLen, '_');
1443             if (p == NULL) {
1444                 return delta;
1445             }
1446             ++p;
1447             variantsLen -= (int32_t)(p - variants);
1448             variants = p;
1449         }
1450     }
1451 }
1452
1453 /* Keyword enumeration */
1454
/* State behind a keyword enumeration (stored in UEnumeration::context). */
typedef struct UKeywordsContext {
    char *keywords; /* owned list of zero-terminated keywords; an empty string ends the list */
    char *current;  /* cursor into 'keywords': the next keyword to hand out */
} UKeywordsContext;
1459
1460 static void U_CALLCONV
1461 uloc_kw_closeKeywords(UEnumeration *enumerator) {
1462     uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1463     uprv_free(enumerator->context);
1464     uprv_free(enumerator);
1465 }
1466
1467 static int32_t U_CALLCONV
1468 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1469     char *kw = ((UKeywordsContext *)en->context)->keywords;
1470     int32_t result = 0;
1471     while(*kw) {
1472         result++;
1473         kw += uprv_strlen(kw)+1;
1474     }
1475     return result;
1476 }
1477
1478 static const char* U_CALLCONV
1479 uloc_kw_nextKeyword(UEnumeration* en,
1480                     int32_t* resultLength,
1481                     UErrorCode* /*status*/) {
1482     const char* result = ((UKeywordsContext *)en->context)->current;
1483     int32_t len = 0;
1484     if(*result) {
1485         len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1486         ((UKeywordsContext *)en->context)->current += len+1;
1487     } else {
1488         result = NULL;
1489     }
1490     if (resultLength) {
1491         *resultLength = len;
1492     }
1493     return result;
1494 }
1495
1496 static void U_CALLCONV
1497 uloc_kw_resetKeywords(UEnumeration* en,
1498                       UErrorCode* /*status*/) {
1499     ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1500 }
1501
1502 static const UEnumeration gKeywordsEnum = {
1503     NULL,
1504     NULL,
1505     uloc_kw_closeKeywords,
1506     uloc_kw_countKeywords,
1507     uenum_unextDefault,
1508     uloc_kw_nextKeyword,
1509     uloc_kw_resetKeywords
1510 };
1511
1512 U_CAPI UEnumeration* U_EXPORT2
1513 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1514 {
1515     UKeywordsContext *myContext = NULL;
1516     UEnumeration *result = NULL;
1517
1518     if(U_FAILURE(*status)) {
1519         return NULL;
1520     }
1521     result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1522     /* Null pointer test */
1523     if (result == NULL) {
1524         *status = U_MEMORY_ALLOCATION_ERROR;
1525         return NULL;
1526     }
1527     uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1528     myContext = reinterpret_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)));
1529     if (myContext == NULL) {
1530         *status = U_MEMORY_ALLOCATION_ERROR;
1531         uprv_free(result);
1532         return NULL;
1533     }
1534     myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1535     uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1536     myContext->keywords[keywordListSize] = 0;
1537     myContext->current = myContext->keywords;
1538     result->context = myContext;
1539     return result;
1540 }
1541
1542 U_CAPI UEnumeration* U_EXPORT2
1543 uloc_openKeywords(const char* localeID,
1544                         UErrorCode* status)
1545 {
1546     int32_t i=0;
1547     char keywords[256];
1548     int32_t keywordsCapacity = 256;
1549     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1550     const char* tmpLocaleID;
1551
1552     if(status==NULL || U_FAILURE(*status)) {
1553         return 0;
1554     }
1555
1556     if (_hasBCP47Extension(localeID)) {
1557         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1558     } else {
1559         if (localeID==NULL) {
1560            localeID=uloc_getDefault();
1561         }
1562         tmpLocaleID=localeID;
1563     }
1564
1565     /* Skip the language */
1566     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1567     if(_isIDSeparator(*tmpLocaleID)) {
1568         const char *scriptID;
1569         /* Skip the script if available */
1570         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1571         if(scriptID != tmpLocaleID+1) {
1572             /* Found optional script */
1573             tmpLocaleID = scriptID;
1574         }
1575         /* Skip the Country */
1576         if (_isIDSeparator(*tmpLocaleID)) {
1577             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1578             if(_isIDSeparator(*tmpLocaleID)) {
1579                 _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1580             }
1581         }
1582     }
1583
1584     /* keywords are located after '@' */
1585     if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1586         i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1587     }
1588
1589     if(i) {
1590         return uloc_openKeywordList(keywords, i, status);
1591     } else {
1592         return NULL;
1593     }
1594 }
1595
1596
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* True if any bit of 'mask' is set in 'options'.  Both arguments are
   parenthesized so that compound expressions such as (A|B) are tested as
   a whole rather than being split by operator precedence. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The language tag "i-default"; deliberately not zero terminated, always
   used together with I_DEFAULT_LENGTH. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH (sizeof i_default / sizeof i_default[0])
1605
1606 /**
1607  * Canonicalize the given localeID, to level 1 or to level 2,
1608  * depending on the options.  To specify level 1, pass in options=0.
1609  * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1610  *
1611  * This is the code underlying uloc_getName and uloc_canonicalize.
1612  */
1613 static int32_t
1614 _canonicalize(const char* localeID,
1615               char* result,
1616               int32_t resultCapacity,
1617               uint32_t options,
1618               UErrorCode* err) {
1619     int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1620     char localeBuffer[ULOC_FULLNAME_CAPACITY];
1621     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1622     const char* origLocaleID;
1623     const char* tmpLocaleID;
1624     const char* keywordAssign = NULL;
1625     const char* separatorIndicator = NULL;
1626     const char* addKeyword = NULL;
1627     const char* addValue = NULL;
1628     char* name;
1629     char* variant = NULL; /* pointer into name, or NULL */
1630
1631     if (U_FAILURE(*err)) {
1632         return 0;
1633     }
1634
1635     if (_hasBCP47Extension(localeID)) {
1636         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1637     } else {
1638         if (localeID==NULL) {
1639            localeID=uloc_getDefault();
1640         }
1641         tmpLocaleID=localeID;
1642     }
1643
1644     origLocaleID=tmpLocaleID;
1645
1646     /* if we are doing a full canonicalization, then put results in
1647        localeBuffer, if necessary; otherwise send them to result. */
1648     if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1649         (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
1650         name = localeBuffer;
1651         nameCapacity = (int32_t)sizeof(localeBuffer);
1652     } else {
1653         name = result;
1654         nameCapacity = resultCapacity;
1655     }
1656
1657     /* get all pieces, one after another, and separate with '_' */
1658     len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1659
1660     if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1661         const char *d = uloc_getDefault();
1662
1663         len = (int32_t)uprv_strlen(d);
1664
1665         if (name != NULL) {
1666             uprv_strncpy(name, d, len);
1667         }
1668     } else if(_isIDSeparator(*tmpLocaleID)) {
1669         const char *scriptID;
1670
1671         ++fieldCount;
1672         if(len<nameCapacity) {
1673             name[len]='_';
1674         }
1675         ++len;
1676
1677         scriptSize=ulocimp_getScript(tmpLocaleID+1,
1678             (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
1679         if(scriptSize > 0) {
1680             /* Found optional script */
1681             tmpLocaleID = scriptID;
1682             ++fieldCount;
1683             len+=scriptSize;
1684             if (_isIDSeparator(*tmpLocaleID)) {
1685                 /* If there is something else, then we add the _ */
1686                 if(len<nameCapacity) {
1687                     name[len]='_';
1688                 }
1689                 ++len;
1690             }
1691         }
1692
1693         if (_isIDSeparator(*tmpLocaleID)) {
1694             const char *cntryID;
1695             int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
1696                 (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
1697             if (cntrySize > 0) {
1698                 /* Found optional country */
1699                 tmpLocaleID = cntryID;
1700                 len+=cntrySize;
1701             }
1702             if(_isIDSeparator(*tmpLocaleID)) {
1703                 /* If there is something else, then we add the _  if we found country before.*/
1704                 if (cntrySize > 0) {
1705                     ++fieldCount;
1706                     if(len<nameCapacity) {
1707                         name[len]='_';
1708                     }
1709                     ++len;
1710                 }
1711
1712                 variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
1713                     (len<nameCapacity ? name+len : NULL), nameCapacity-len);
1714                 if (variantSize > 0) {
1715                     variant = len<nameCapacity ? name+len : NULL;
1716                     len += variantSize;
1717                     tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1718                 }
1719             }
1720         }
1721     }
1722
1723     /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1724     if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1725         UBool done = FALSE;
1726         do {
1727             char c = *tmpLocaleID;
1728             switch (c) {
1729             case 0:
1730             case '@':
1731                 done = TRUE;
1732                 break;
1733             default:
1734                 if (len<nameCapacity) {
1735                     name[len] = c;
1736                 }
1737                 ++len;
1738                 ++tmpLocaleID;
1739                 break;
1740             }
1741         } while (!done);
1742     }
1743
1744     /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1745        After this, tmpLocaleID either points to '@' or is NULL */
1746     if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1747         keywordAssign = uprv_strchr(tmpLocaleID, '=');
1748         separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1749     }
1750
1751     /* Copy POSIX-style variant, if any [mr@FOO] */
1752     if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1753         tmpLocaleID != NULL && keywordAssign == NULL) {
1754         for (;;) {
1755             char c = *tmpLocaleID;
1756             if (c == 0) {
1757                 break;
1758             }
1759             if (len<nameCapacity) {
1760                 name[len] = c;
1761             }
1762             ++len;
1763             ++tmpLocaleID;
1764         }
1765     }
1766
1767     if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1768         /* Handle @FOO variant if @ is present and not followed by = */
1769         if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1770             int32_t posixVariantSize;
1771             /* Add missing '_' if needed */
1772             if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1773                 do {
1774                     if(len<nameCapacity) {
1775                         name[len]='_';
1776                     }
1777                     ++len;
1778                     ++fieldCount;
1779                 } while(fieldCount<2);
1780             }
1781             posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1782                                              (UBool)(variantSize > 0));
1783             if (posixVariantSize > 0) {
1784                 if (variant == NULL) {
1785                     variant = name+len;
1786                 }
1787                 len += posixVariantSize;
1788                 variantSize += posixVariantSize;
1789             }
1790         }
1791
1792         /* Handle generic variants first */
1793         if (variant) {
1794             for (j=0; j<(int32_t)(sizeof(VARIANT_MAP)/sizeof(VARIANT_MAP[0])); j++) {
1795                 const char* variantToCompare = VARIANT_MAP[j].variant;
1796                 int32_t n = (int32_t)uprv_strlen(variantToCompare);
1797                 int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1798                 len -= variantLen;
1799                 if (variantLen > 0) {
1800                     if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
1801                         --len;
1802                     }
1803                     addKeyword = VARIANT_MAP[j].keyword;
1804                     addValue = VARIANT_MAP[j].value;
1805                     break;
1806                 }
1807             }
1808             if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
1809                 --len;
1810             }
1811         }
1812
1813         /* Look up the ID in the canonicalization map */
1814         for (j=0; j<(int32_t)(sizeof(CANONICALIZE_MAP)/sizeof(CANONICALIZE_MAP[0])); j++) {
1815             const char* id = CANONICALIZE_MAP[j].id;
1816             int32_t n = (int32_t)uprv_strlen(id);
1817             if (len == n && uprv_strncmp(name, id, n) == 0) {
1818                 if (n == 0 && tmpLocaleID != NULL) {
1819                     break; /* Don't remap "" if keywords present */
1820                 }
1821                 len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1822                 if (CANONICALIZE_MAP[j].keyword) {
1823                     addKeyword = CANONICALIZE_MAP[j].keyword;
1824                     addValue = CANONICALIZE_MAP[j].value;
1825                 }
1826                 break;
1827             }
1828         }
1829     }
1830
1831     if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1832         if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1833             (!separatorIndicator || separatorIndicator > keywordAssign)) {
1834             if(len<nameCapacity) {
1835                 name[len]='@';
1836             }
1837             ++len;
1838             ++fieldCount;
1839             len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
1840                                 NULL, 0, NULL, TRUE, addKeyword, addValue, err);
1841         } else if (addKeyword != NULL) {
1842             U_ASSERT(addValue != NULL);
1843             /* inelegant but works -- later make _getKeywords do this? */
1844             len += _copyCount(name+len, nameCapacity-len, "@");
1845             len += _copyCount(name+len, nameCapacity-len, addKeyword);
1846             len += _copyCount(name+len, nameCapacity-len, "=");
1847             len += _copyCount(name+len, nameCapacity-len, addValue);
1848         }
1849     }
1850
1851     if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1852         uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1853     }
1854
1855     return u_terminateChars(result, resultCapacity, len, err);
1856 }
1857
1858 /* ### ID parsing API **************************************************/
1859
1860 U_CAPI int32_t  U_EXPORT2
1861 uloc_getParent(const char*    localeID,
1862                char* parent,
1863                int32_t parentCapacity,
1864                UErrorCode* err)
1865 {
1866     const char *lastUnderscore;
1867     int32_t i;
1868
1869     if (U_FAILURE(*err))
1870         return 0;
1871
1872     if (localeID == NULL)
1873         localeID = uloc_getDefault();
1874
1875     lastUnderscore=uprv_strrchr(localeID, '_');
1876     if(lastUnderscore!=NULL) {
1877         i=(int32_t)(lastUnderscore-localeID);
1878     } else {
1879         i=0;
1880     }
1881
1882     if(i>0 && parent != localeID) {
1883         uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1884     }
1885     return u_terminateChars(parent, parentCapacity, i, err);
1886 }
1887
1888 U_CAPI int32_t U_EXPORT2
1889 uloc_getLanguage(const char*    localeID,
1890          char* language,
1891          int32_t languageCapacity,
1892          UErrorCode* err)
1893 {
1894     /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1895     int32_t i=0;
1896
1897     if (err==NULL || U_FAILURE(*err)) {
1898         return 0;
1899     }
1900
1901     if(localeID==NULL) {
1902         localeID=uloc_getDefault();
1903     }
1904
1905     i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1906     return u_terminateChars(language, languageCapacity, i, err);
1907 }
1908
1909 U_CAPI int32_t U_EXPORT2
1910 uloc_getScript(const char*    localeID,
1911          char* script,
1912          int32_t scriptCapacity,
1913          UErrorCode* err)
1914 {
1915     int32_t i=0;
1916
1917     if(err==NULL || U_FAILURE(*err)) {
1918         return 0;
1919     }
1920
1921     if(localeID==NULL) {
1922         localeID=uloc_getDefault();
1923     }
1924
1925     /* skip the language */
1926     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1927     if(_isIDSeparator(*localeID)) {
1928         i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1929     }
1930     return u_terminateChars(script, scriptCapacity, i, err);
1931 }
1932
1933 U_CAPI int32_t  U_EXPORT2
1934 uloc_getCountry(const char* localeID,
1935             char* country,
1936             int32_t countryCapacity,
1937             UErrorCode* err)
1938 {
1939     int32_t i=0;
1940
1941     if(err==NULL || U_FAILURE(*err)) {
1942         return 0;
1943     }
1944
1945     if(localeID==NULL) {
1946         localeID=uloc_getDefault();
1947     }
1948
1949     /* Skip the language */
1950     ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1951     if(_isIDSeparator(*localeID)) {
1952         const char *scriptID;
1953         /* Skip the script if available */
1954         ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1955         if(scriptID != localeID+1) {
1956             /* Found optional script */
1957             localeID = scriptID;
1958         }
1959         if(_isIDSeparator(*localeID)) {
1960             i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1961         }
1962     }
1963     return u_terminateChars(country, countryCapacity, i, err);
1964 }
1965
1966 U_CAPI int32_t  U_EXPORT2
1967 uloc_getVariant(const char* localeID,
1968                 char* variant,
1969                 int32_t variantCapacity,
1970                 UErrorCode* err)
1971 {
1972     char tempBuffer[ULOC_FULLNAME_CAPACITY];
1973     const char* tmpLocaleID;
1974     int32_t i=0;
1975
1976     if(err==NULL || U_FAILURE(*err)) {
1977         return 0;
1978     }
1979
1980     if (_hasBCP47Extension(localeID)) {
1981         _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1982     } else {
1983         if (localeID==NULL) {
1984            localeID=uloc_getDefault();
1985         }
1986         tmpLocaleID=localeID;
1987     }
1988
1989     /* Skip the language */
1990     ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1991     if(_isIDSeparator(*tmpLocaleID)) {
1992         const char *scriptID;
1993         /* Skip the script if available */
1994         ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1995         if(scriptID != tmpLocaleID+1) {
1996             /* Found optional script */
1997             tmpLocaleID = scriptID;
1998         }
1999         /* Skip the Country */
2000         if (_isIDSeparator(*tmpLocaleID)) {
2001             const char *cntryID;
2002             ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2003             if (cntryID != tmpLocaleID+1) {
2004                 /* Found optional country */
2005                 tmpLocaleID = cntryID;
2006             }
2007             if(_isIDSeparator(*tmpLocaleID)) {
2008                 /* If there was no country ID, skip a possible extra IDSeparator */
2009                 if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2010                     tmpLocaleID++;
2011                 }
2012                 i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2013             }
2014         }
2015     }
2016
2017     /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2018     /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2019 /*
2020     if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2021         i=_getVariant(localeID+1, '@', variant, variantCapacity);
2022     }
2023 */
2024     return u_terminateChars(variant, variantCapacity, i, err);
2025 }
2026
2027 U_CAPI int32_t  U_EXPORT2
2028 uloc_getName(const char* localeID,
2029              char* name,
2030              int32_t nameCapacity,
2031              UErrorCode* err)
2032 {
2033     return _canonicalize(localeID, name, nameCapacity, 0, err);
2034 }
2035
2036 U_CAPI int32_t  U_EXPORT2
2037 uloc_getBaseName(const char* localeID,
2038                  char* name,
2039                  int32_t nameCapacity,
2040                  UErrorCode* err)
2041 {
2042     return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2043 }
2044
2045 U_CAPI int32_t  U_EXPORT2
2046 uloc_canonicalize(const char* localeID,
2047                   char* name,
2048                   int32_t nameCapacity,
2049                   UErrorCode* err)
2050 {
2051     return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2052 }
2053
2054 U_CAPI const char*  U_EXPORT2
2055 uloc_getISO3Language(const char* localeID)
2056 {
2057     int16_t offset;
2058     char lang[ULOC_LANG_CAPACITY];
2059     UErrorCode err = U_ZERO_ERROR;
2060
2061     if (localeID == NULL)
2062     {
2063         localeID = uloc_getDefault();
2064     }
2065     uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2066     if (U_FAILURE(err))
2067         return "";
2068     offset = _findIndex(LANGUAGES, lang);
2069     if (offset < 0)
2070         return "";
2071     return LANGUAGES_3[offset];
2072 }
2073
2074 U_CAPI const char*  U_EXPORT2
2075 uloc_getISO3Country(const char* localeID)
2076 {
2077     int16_t offset;
2078     char cntry[ULOC_LANG_CAPACITY];
2079     UErrorCode err = U_ZERO_ERROR;
2080
2081     if (localeID == NULL)
2082     {
2083         localeID = uloc_getDefault();
2084     }
2085     uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2086     if (U_FAILURE(err))
2087         return "";
2088     offset = _findIndex(COUNTRIES, cntry);
2089     if (offset < 0)
2090         return "";
2091
2092     return COUNTRIES_3[offset];
2093 }
2094
2095 U_CAPI uint32_t  U_EXPORT2
2096 uloc_getLCID(const char* localeID)
2097 {
2098     UErrorCode status = U_ZERO_ERROR;
2099     char       langID[ULOC_FULLNAME_CAPACITY];
2100
2101     uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2102     if (U_FAILURE(status)) {
2103         return 0;
2104     }
2105
2106     return uprv_convertToLCID(langID, localeID, &status);
2107 }
2108
2109 U_CAPI int32_t U_EXPORT2
2110 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2111                 UErrorCode *status)
2112 {
2113     int32_t length;
2114     const char *posix = uprv_convertToPosix(hostid, status);
2115     if (U_FAILURE(*status) || posix == NULL) {
2116         return 0;
2117     }
2118     length = (int32_t)uprv_strlen(posix);
2119     if (length+1 > localeCapacity) {
2120         *status = U_BUFFER_OVERFLOW_ERROR;
2121     }
2122     else {
2123         uprv_strcpy(locale, posix);
2124     }
2125     return length;
2126 }
2127
2128 /* ### Default locale **************************************************/
2129
2130 U_CAPI const char*  U_EXPORT2
2131 uloc_getDefault()
2132 {
2133     return locale_get_default();
2134 }
2135
2136 U_CAPI void  U_EXPORT2
2137 uloc_setDefault(const char*   newDefaultLocale,
2138              UErrorCode* err)
2139 {
2140     if (U_FAILURE(*err))
2141         return;
2142     /* the error code isn't currently used for anything by this function*/
2143
2144     /* propagate change to C++ */
2145     locale_set_default(newDefaultLocale);
2146 }
2147
2148 /**
2149  * Returns a list of all language codes defined in ISO 639.  This is a pointer
2150  * to an array of pointers to arrays of char.  All of these pointers are owned
2151  * by ICU-- do not delete them, and do not write through them.  The array is
2152  * terminated with a null pointer.
2153  */
2154 U_CAPI const char* const*  U_EXPORT2
2155 uloc_getISOLanguages()
2156 {
2157     return LANGUAGES;
2158 }
2159
2160 /**
2161  * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2162  * pointer to an array of pointers to arrays of char.  All of these pointers are
2163  * owned by ICU-- do not delete them, and do not write through them.  The array is
2164  * terminated with a null pointer.
2165  */
2166 U_CAPI const char* const*  U_EXPORT2
2167 uloc_getISOCountries()
2168 {
2169     return COUNTRIES;
2170 }
2171
2172
2173 /* this function to be moved into cstring.c later */
2174 static char gDecimal = 0;
2175
2176 static /* U_CAPI */
2177 double
2178 /* U_EXPORT2 */
2179 _uloc_strtod(const char *start, char **end) {
2180     char *decimal;
2181     char *myEnd;
2182     char buf[30];
2183     double rv;
2184     if (!gDecimal) {
2185         char rep[5];
2186         /* For machines that decide to change the decimal on you,
2187         and try to be too smart with localization.
2188         This normally should be just a '.'. */
2189         sprintf(rep, "%+1.1f", 1.0);
2190         gDecimal = rep[2];
2191     }
2192
2193     if(gDecimal == '.') {
2194         return uprv_strtod(start, end); /* fall through to OS */
2195     } else {
2196         uprv_strncpy(buf, start, 29);
2197         buf[29]=0;
2198         decimal = uprv_strchr(buf, '.');
2199         if(decimal) {
2200             *decimal = gDecimal;
2201         } else {
2202             return uprv_strtod(start, end); /* no decimal point */
2203         }
2204         rv = uprv_strtod(buf, &myEnd);
2205         if(end) {
2206             *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2207         }
2208         return rv;
2209     }
2210 }
2211
/* One entry of a parsed Accept-Language list. */
typedef struct {
    float q;        /* quality value of the entry; higher sorts first */
    int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
    char *locale;   /* locale ID of the entry */
} _acceptLangItem;
2217
2218 static int32_t U_CALLCONV
2219 uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2220 {
2221     const _acceptLangItem *aa = (const _acceptLangItem*)a;
2222     const _acceptLangItem *bb = (const _acceptLangItem*)b;
2223
2224     int32_t rc = 0;
2225     if(bb->q < aa->q) {
2226         rc = -1;  /* A > B */
2227     } else if(bb->q > aa->q) {
2228         rc = 1;   /* A < B */
2229     } else {
2230         rc = 0;   /* A = B */
2231     }
2232
2233     if(rc==0) {
2234         rc = uprv_stricmp(aa->locale, bb->locale);
2235     }
2236
2237 #if defined(ULOC_DEBUG)
2238     /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2239     aa->locale, aa->q,
2240     bb->locale, bb->q,
2241     rc);*/
2242 #endif
2243
2244     return rc;
2245 }
2246
2247 /*
2248 mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2249 */
2250
2251 U_CAPI int32_t U_EXPORT2
2252 uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2253                             const char *httpAcceptLanguage,
2254                             UEnumeration* availableLocales,
2255                             UErrorCode *status)
2256 {
2257     _acceptLangItem *j;
2258     _acceptLangItem smallBuffer[30];
2259     char **strs;
2260     char tmp[ULOC_FULLNAME_CAPACITY +1];
2261     int32_t n = 0;
2262     const char *itemEnd;
2263     const char *paramEnd;
2264     const char *s;
2265     const char *t;
2266     int32_t res;
2267     int32_t i;
2268     int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2269     int32_t jSize;
2270     char *tempstr; /* Use for null pointer check */
2271
2272     j = smallBuffer;
2273     jSize = sizeof(smallBuffer)/sizeof(smallBuffer[0]);
2274     if(U_FAILURE(*status)) {
2275         return -1;
2276     }
2277
2278     for(s=httpAcceptLanguage;s&&*s;) {
2279         while(isspace(*s)) /* eat space at the beginning */
2280             s++;
2281         itemEnd=uprv_strchr(s,',');
2282         paramEnd=uprv_strchr(s,';');
2283         if(!itemEnd) {
2284             itemEnd = httpAcceptLanguage+l; /* end of string */
2285         }
2286         if(paramEnd && paramEnd<itemEnd) {
2287             /* semicolon (;) is closer than end (,) */
2288             t = paramEnd+1;
2289             if(*t=='q') {
2290                 t++;
2291             }
2292             while(isspace(*t)) {
2293                 t++;
2294             }
2295             if(*t=='=') {
2296                 t++;
2297             }
2298             while(isspace(*t)) {
2299                 t++;
2300             }
2301             j[n].q = (float)_uloc_strtod(t,NULL);
2302         } else {
2303             /* no semicolon - it's 1.0 */
2304             j[n].q = 1.0f;
2305             paramEnd = itemEnd;
2306         }
2307         j[n].dummy=0;
2308         /* eat spaces prior to semi */
2309         for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2310             ;
2311         /* Check for null pointer from uprv_strndup */
2312         tempstr = uprv_strndup(s,(int32_t)((t+1)-s));
2313         if (tempstr == NULL) {
2314             *status = U_MEMORY_ALLOCATION_ERROR;
2315             return -1;
2316         }
2317         j[n].locale = tempstr;
2318         uloc_canonicalize(j[n].locale,tmp,sizeof(tmp)/sizeof(tmp[0]),status);
2319         if(strcmp(j[n].locale,tmp)) {
2320             uprv_free(j[n].locale);
2321             j[n].locale=uprv_strdup(tmp);
2322         }
2323 #if defined(ULOC_DEBUG)
2324         /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2325 #endif
2326         n++;
2327         s = itemEnd;
2328         while(*s==',') { /* eat duplicate commas */
2329             s++;
2330         }
2331         if(n>=jSize) {
2332             if(j==smallBuffer) {  /* overflowed the small buffer. */
2333                 j = reinterpret_cast<_acceptLangItem *>(uprv_malloc(sizeof(j[0])*(jSize*2)));
2334                 if(j!=NULL) {
2335                     uprv_memcpy(j,smallBuffer,sizeof(j[0])*jSize);
2336                 }
2337 #if defined(ULOC_DEBUG)
2338                 fprintf(stderr,"malloced at size %d\n", jSize);
2339 #endif
2340             } else {
2341                 j = reinterpret_cast<_acceptLangItem *>(uprv_realloc(j, sizeof(j[0])*jSize*2));
2342 #if defined(ULOC_DEBUG)
2343                 fprintf(stderr,"re-alloced at size %d\n", jSize);
2344 #endif
2345             }
2346             jSize *= 2;
2347             if(j==NULL) {
2348                 *status = U_MEMORY_ALLOCATION_ERROR;
2349                 return -1;
2350             }
2351         }
2352     }
2353     uprv_sortArray(j, n, sizeof(j[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2354     if(U_FAILURE(*status)) {
2355         if(j != smallBuffer) {
2356 #if defined(ULOC_DEBUG)
2357             fprintf(stderr,"freeing j %p\n", j);
2358 #endif
2359             uprv_free(j);
2360         }
2361         return -1;
2362     }
2363     strs = reinterpret_cast<char **>(uprv_malloc((size_t)(sizeof(strs[0])*n)));
2364     /* Check for null pointer */
2365     if (strs == NULL) {
2366         uprv_free(j); /* Free to avoid memory leak */
2367         *status = U_MEMORY_ALLOCATION_ERROR;
2368         return -1;
2369     }
2370     for(i=0;i<n;i++) {
2371 #if defined(ULOC_DEBUG)
2372         /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2373 #endif
2374         strs[i]=j[i].locale;
2375     }
2376     res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2377         (const char**)strs, n, availableLocales, status);
2378     for(i=0;i<n;i++) {
2379         uprv_free(strs[i]);
2380     }
2381     uprv_free(strs);
2382     if(j != smallBuffer) {
2383 #if defined(ULOC_DEBUG)
2384         fprintf(stderr,"freeing j %p\n", j);
2385 #endif
2386         uprv_free(j);
2387     }
2388     return res;
2389 }
2390
2391
2392 U_CAPI int32_t U_EXPORT2
2393 uloc_acceptLanguage(char *result, int32_t resultAvailable,
2394                     UAcceptResult *outResult, const char **acceptList,
2395                     int32_t acceptListCount,
2396                     UEnumeration* availableLocales,
2397                     UErrorCode *status)
2398 {
2399     int32_t i,j;
2400     int32_t len;
2401     int32_t maxLen=0;
2402     char tmp[ULOC_FULLNAME_CAPACITY+1];
2403     const char *l;
2404     char **fallbackList;
2405     if(U_FAILURE(*status)) {
2406         return -1;
2407     }
2408     fallbackList = reinterpret_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2409     if(fallbackList==NULL) {
2410         *status = U_MEMORY_ALLOCATION_ERROR;
2411         return -1;
2412     }
2413     for(i=0;i<acceptListCount;i++) {
2414 #if defined(ULOC_DEBUG)
2415         fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2416 #endif
2417         while((l=uenum_next(availableLocales, NULL, status))) {
2418 #if defined(ULOC_DEBUG)
2419             fprintf(stderr,"  %s\n", l);
2420 #endif
2421             len = (int32_t)uprv_strlen(l);
2422             if(!uprv_strcmp(acceptList[i], l)) {
2423                 if(outResult) {
2424                     *outResult = ULOC_ACCEPT_VALID;
2425                 }
2426 #if defined(ULOC_DEBUG)
2427                 fprintf(stderr, "MATCH! %s\n", l);
2428 #endif
2429                 if(len>0) {
2430                     uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2431                 }
2432                 for(j=0;j<i;j++) {
2433                     uprv_free(fallbackList[j]);
2434                 }
2435                 uprv_free(fallbackList);
2436                 return u_terminateChars(result, resultAvailable, len, status);
2437             }
2438             if(len>maxLen) {
2439                 maxLen = len;
2440             }
2441         }
2442         uenum_reset(availableLocales, status);
2443         /* save off parent info */
2444         if(uloc_getParent(acceptList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2445             fallbackList[i] = uprv_strdup(tmp);
2446         } else {
2447             fallbackList[i]=0;
2448         }
2449     }
2450
2451     for(maxLen--;maxLen>0;maxLen--) {
2452         for(i=0;i<acceptListCount;i++) {
2453             if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2454 #if defined(ULOC_DEBUG)
2455                 fprintf(stderr,"Try: [%s]", fallbackList[i]);
2456 #endif
2457                 while((l=uenum_next(availableLocales, NULL, status))) {
2458 #if defined(ULOC_DEBUG)
2459                     fprintf(stderr,"  %s\n", l);
2460 #endif
2461                     len = (int32_t)uprv_strlen(l);
2462                     if(!uprv_strcmp(fallbackList[i], l)) {
2463                         if(outResult) {
2464                             *outResult = ULOC_ACCEPT_FALLBACK;
2465                         }
2466 #if defined(ULOC_DEBUG)
2467                         fprintf(stderr, "fallback MATCH! %s\n", l);
2468 #endif
2469                         if(len>0) {
2470                             uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2471                         }
2472                         for(j=0;j<acceptListCount;j++) {
2473                             uprv_free(fallbackList[j]);
2474                         }
2475                         uprv_free(fallbackList);
2476                         return u_terminateChars(result, resultAvailable, len, status);
2477                     }
2478                 }
2479                 uenum_reset(availableLocales, status);
2480
2481                 if(uloc_getParent(fallbackList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2482                     uprv_free(fallbackList[i]);
2483                     fallbackList[i] = uprv_strdup(tmp);
2484                 } else {
2485                     uprv_free(fallbackList[i]);
2486                     fallbackList[i]=0;
2487                 }
2488             }
2489         }
2490         if(outResult) {
2491             *outResult = ULOC_ACCEPT_FAILED;
2492         }
2493     }
2494     for(i=0;i<acceptListCount;i++) {
2495         uprv_free(fallbackList[i]);
2496     }
2497     uprv_free(fallbackList);
2498     return -1;
2499 }
2500
2501 /*eof*/