]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/uloc_tag.cpp
1c10c48182c8876ff97b30cdf44f5819eb74efba
[apple/icu.git] / icuSources / common / uloc_tag.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2009-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10 #include "unicode/bytestream.h"
11 #include "unicode/utypes.h"
12 #include "unicode/ures.h"
13 #include "unicode/localpointer.h"
14 #include "unicode/putil.h"
15 #include "unicode/uenum.h"
16 #include "unicode/uloc.h"
17 #include "ustr_imp.h"
18 #include "charstr.h"
19 #include "cmemory.h"
20 #include "cstring.h"
21 #include "putilimp.h"
22 #include "uinvchar.h"
23 #include "ulocimp.h"
24 #include "uassert.h"
25
26
/*
 * Node of a singly linked list of variant subtags.
 * `variant` is a borrowed pointer to a NUL-terminated subtag; freeing a node
 * never frees the string it points at.
 */
struct VariantListEntry {
    const char*       variant;
    VariantListEntry* next;
};
32
/*
 * Node of a singly linked list of Unicode locale attribute values.
 * Derives from icu::UMemory; in this file the nodes are created through an
 * icu::MemoryPool.
 */
struct AttributeListEntry : public icu::UMemory {
    const char *attribute;              /* NUL-terminated attribute text; not owned by the node */
    struct AttributeListEntry *next;    /* next node, or NULL at the end of the list */
};
38
/*
 * Node of a singly linked list of extensions, each a key/value pair.
 * _addExtensionToList() keeps the list ordered by key and rejects duplicate keys.
 */
struct ExtensionListEntry : public icu::UMemory {
    const char *key;                    /* extension key (a singleton or a longer keyword key) */
    const char *value;                  /* value associated with the key */
    struct ExtensionListEntry *next;    /* next node, or NULL at the end of the list */
};
45
46 #define MAXEXTLANG 3
47 typedef struct ULanguageTag {
48 char *buf; /* holding parsed subtags */
49 const char *language;
50 const char *extlang[MAXEXTLANG];
51 const char *script;
52 const char *region;
53 VariantListEntry *variants;
54 ExtensionListEntry *extensions;
55 const char *privateuse;
56 const char *grandfathered;
57 } ULanguageTag;
58
/* NOTE(review): MINLEN is not referenced in the visible part of this file. */
#define MINLEN 2
/* Separator between subtags of a language tag. */
#define SEP '-'
/* Singleton that introduces the private use section; never a valid extension singleton. */
#define PRIVATEUSE 'x'
/* Singleton used to position multi-character keyword keys when ordering extensions. */
#define LDMLEXT 'u'

/* Separators of the ICU locale ID syntax. */
#define LOCALE_SEP '_'
#define LOCALE_EXT_SEP '@'
#define LOCALE_KEYWORD_SEP ';'
#define LOCALE_KEY_TYPE_SEP '='

/* ASCII-only character classification. ISNUMERIC evaluates its argument twice. */
#define ISALPHA(c) uprv_isASCIILetter(c)
#define ISNUMERIC(c) ((c)>='0' && (c)<='9')
71
/* Default value for the string fields of a freshly initialized ULanguageTag. */
static const char EMPTY[] = "";
/* Language subtag emitted when a locale has no usable language code. */
static const char LANG_UND[] = "und";
static const char PRIVATEUSE_KEY[] = "x";
static const char _POSIX[] = "_POSIX";
static const char POSIX_KEY[] = "va";
/* Lower-cased variant that is treated as the POSIX variant rather than as a regular variant. */
static const char POSIX_VALUE[] = "posix";
/* Special keyword key used for representing Unicode locale attributes. */
static const char LOCALE_ATTRIBUTE_KEY[] = "attribute";
static const char PRIVUSE_VARIANT_PREFIX[] = "lvariant";
static const char LOCALE_TYPE_YES[] = "yes";

/* Length of LANG_UND, excluding the terminating NUL. */
#define LANG_UND_LEN 3
83
/*
 Updated on 2018-09-12 from
 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .

 This table has 2 parts. The part for grandfathered tags is generated by the
 following script from the IANA language tag registry.

 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
 egrep -A 7 'Type: grandfathered' | \
 egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \
 awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\
 tr 'A-Z' 'a-z'


 The 2nd part is made of five ICU-specific entries. They are kept for
 backward compatibility for now, even though they have no preferred
 values. They may have to be removed for strict BCP 47 compliance.

 The array is a flat list of pairs: element 2*k is a grandfathered tag and
 element 2*k+1 is its replacement. All entries are lower case.
*/
static const char* const GRANDFATHERED[] = {
    /* grandfathered preferred */
    "art-lojban", "jbo",
    "en-gb-oed", "en-gb-oxendict",
    "i-ami", "ami",
    "i-bnn", "bnn",
    "i-hak", "hak",
    "i-klingon", "tlh",
    "i-lux", "lb",
    "i-navajo", "nv",
    "i-pwn", "pwn",
    "i-tao", "tao",
    "i-tay", "tay",
    "i-tsu", "tsu",
    "no-bok", "nb",
    "no-nyn", "nn",
    "sgn-be-fr", "sfb",
    "sgn-be-nl", "vgt",
    "sgn-ch-de", "sgg",
    "zh-guoyu", "cmn",
    "zh-hakka", "hak",
    "zh-min-nan", "nan",
    "zh-xiang", "hsn",

    // Grandfathered tags with no preferred value in the IANA
    // registry. Kept for now for backward compatibility
    // because ICU has mapped them this way.
    "cel-gaulish", "xtg-x-cel-gaulish",
    "i-default", "en-x-i-default",
    "i-enochian", "und-x-i-enochian",
    "i-mingo", "see-x-i-mingo",
    "zh-min", "nan-x-zh-min",
};
136
/*
 Updated on 2018-09-12 from
 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .

 The table lists redundant tags with a preferred value in the IANA language tag registry.
 It's generated with the following command:

 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
 grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \
 awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \
 tr 'A-Z' 'a-z'

 In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because
 the variant tag 'hepburn-heploc' has the preferred subtag 'alalc97'.

 The array is a flat list of pairs: element 2*k is a redundant tag and
 element 2*k+1 is its preferred replacement.
*/

static const char* const REDUNDANT[] = {
    // redundant preferred
    "sgn-br", "bzs",
    "sgn-co", "csn",
    "sgn-de", "gsg",
    "sgn-dk", "dsl",
    "sgn-es", "ssp",
    "sgn-fr", "fsl",
    "sgn-gb", "bfi",
    "sgn-gr", "gss",
    "sgn-ie", "isg",
    "sgn-it", "ise",
    "sgn-jp", "jsl",
    "sgn-mx", "mfs",
    "sgn-ni", "ncs",
    "sgn-nl", "dse",
    "sgn-no", "nsl",
    "sgn-pt", "psr",
    "sgn-se", "swl",
    "sgn-us", "ase",
    "sgn-za", "sfs",
    "zh-cmn", "cmn",
    "zh-cmn-hans", "cmn-hans",
    "zh-cmn-hant", "cmn-hant",
    "zh-gan", "gan",
    "zh-wuu", "wuu",
    "zh-yue", "yue",

    // variant tag with preferred value
    "ja-latn-hepburn-heploc", "ja-latn-alalc97",
};
184
/*
 Updated on 2018-09-12 from
 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .

 grep 'Type: language' -A 7 language-subtag-registry | egrep 'Subtag|Prefe' | \
 grep -B1 'Preferred' | grep -v '^--' | \
 awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'

 Make sure that 2-letter language subtags come before 3-letter subtags:
 _appendLanguageToLanguageTag() stops scanning at the first deprecated entry
 that is longer than the code being looked up.

 The array is a flat list of pairs: element 2*k is a deprecated code and
 element 2*k+1 is its replacement. Each element holds at most 3 characters
 plus the terminating NUL.
*/
static const char DEPRECATEDLANGS[][4] = {
    /* deprecated new */
    "in", "id",
    "iw", "he",
    "ji", "yi",
    "jw", "jv",
    "mo", "ro",
    "aam", "aas",
    "adp", "dz",
    "aue", "ktz",
    "ayx", "nun",
    "bgm", "bcg",
    "bjd", "drl",
    "ccq", "rki",
    "cjr", "mom",
    "cka", "cmr",
    "cmk", "xch",
    "coy", "pij",
    "cqu", "quh",
    "drh", "khk",
    "drw", "prs",
    "gav", "dev",
    "gfx", "vaj",
    "ggn", "gvr",
    "gti", "nyc",
    "guv", "duz",
    "hrr", "jal",
    "ibi", "opa",
    "ilw", "gal",
    "jeg", "oyb",
    "kgc", "tdf",
    "kgh", "kml",
    "koj", "kwv",
    "krm", "bmf",
    "ktr", "dtp",
    "kvs", "gdj",
    "kwq", "yam",
    "kxe", "tvd",
    "kzj", "dtp",
    "kzt", "dtp",
    "lii", "raq",
    "lmm", "rmx",
    "meg", "cir",
    "mst", "mry",
    "mwj", "vaj",
    "myt", "mry",
    "nad", "xny",
    "ncp", "kdz",
    "nnx", "ngv",
    "nts", "pij",
    "oun", "vaj",
    "pcr", "adx",
    "pmc", "huw",
    "pmu", "phr",
    "ppa", "bfy",
    "ppr", "lcq",
    "pry", "prt",
    "puz", "pub",
    "sca", "hle",
    "skk", "oyb",
    "tdu", "dtp",
    "thc", "tpo",
    "thx", "oyb",
    "tie", "ras",
    "tkk", "twm",
    "tlw", "weo",
    "tmp", "tyj",
    "tne", "kak",
    "tnf", "prs",
    "tsf", "taj",
    "uok", "ema",
    "xba", "cax",
    "xia", "acn",
    "xkh", "waw",
    "xsj", "suj",
    "ybd", "rki",
    "yma", "lrr",
    "ymt", "mtm",
    "yos", "zom",
    "yuu", "yug",
};
276
/*
 Updated on 2018-04-24 from

 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \
 grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \
 grep -B1 'Preferred' | \
 awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'

 The array is a flat list of pairs: element 2*k is a deprecated region code
 and element 2*k+1 is its replacement. Each element holds 2 characters plus
 the terminating NUL.
*/
static const char DEPRECATEDREGIONS[][3] = {
    /* deprecated new */
    "BU", "MM",
    "DD", "DE",
    "FX", "FR",
    "TP", "TL",
    "YD", "YE",
    "ZR", "CD",
};
294
/*
* -------------------------------------------------
*
* These ultag_ functions may be exposed as APIs later
*
* -------------------------------------------------
*
* Forward declarations only. They are declared here so that code placed
* before their definitions (for example the LocalULanguageTagPointer helper,
* which needs ultag_close) can refer to them. The two declarations inside
* "#if 0" are compiled out.
*/

/* Parses a language tag; *parsedLen and *status are output parameters. */
static ULanguageTag*
ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status);

/* Releases a ULanguageTag. */
static void
ultag_close(ULanguageTag* langtag);

static const char*
ultag_getLanguage(const ULanguageTag* langtag);

#if 0
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag);
#endif

/* Indexed accessor paired with ultag_getExtlangSize(). */
static const char*
ultag_getExtlang(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getExtlangSize(const ULanguageTag* langtag);

static const char*
ultag_getScript(const ULanguageTag* langtag);

static const char*
ultag_getRegion(const ULanguageTag* langtag);

/* Indexed accessor paired with ultag_getVariantsSize(). */
static const char*
ultag_getVariant(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getVariantsSize(const ULanguageTag* langtag);

/* Indexed accessors paired with ultag_getExtensionsSize(). */
static const char*
ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx);

static const char*
ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx);

static int32_t
ultag_getExtensionsSize(const ULanguageTag* langtag);

static const char*
ultag_getPrivateUse(const ULanguageTag* langtag);

#if 0
static const char*
ultag_getGrandfathered(const ULanguageTag* langtag);
#endif
351
U_NAMESPACE_BEGIN

/**
 * \class LocalULanguageTagPointer
 * "Smart pointer" class, closes a ULanguageTag via ultag_close().
 * For most methods see the LocalPointerBase base class.
 *
 * The class is generated by U_DEFINE_LOCAL_OPEN_POINTER, so it must appear
 * after the declaration of ultag_close().
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @internal
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalULanguageTagPointer, ULanguageTag, ultag_close);

U_NAMESPACE_END
366
367 /*
368 * -------------------------------------------------
369 *
370 * Language subtag syntax validation functions
371 *
372 * -------------------------------------------------
373 */
374
375 static UBool
376 _isAlphaString(const char* s, int32_t len) {
377 int32_t i;
378 for (i = 0; i < len; i++) {
379 if (!ISALPHA(*(s + i))) {
380 return FALSE;
381 }
382 }
383 return TRUE;
384 }
385
386 static UBool
387 _isNumericString(const char* s, int32_t len) {
388 int32_t i;
389 for (i = 0; i < len; i++) {
390 if (!ISNUMERIC(*(s + i))) {
391 return FALSE;
392 }
393 }
394 return TRUE;
395 }
396
397 static UBool
398 _isAlphaNumericString(const char* s, int32_t len) {
399 int32_t i;
400 for (i = 0; i < len; i++) {
401 if (!ISALPHA(*(s + i)) && !ISNUMERIC(*(s + i))) {
402 return FALSE;
403 }
404 }
405 return TRUE;
406 }
407
408 static UBool
409 _isAlphaNumericStringLimitedLength(const char* s, int32_t len, int32_t min, int32_t max) {
410 if (len < 0) {
411 len = (int32_t)uprv_strlen(s);
412 }
413 if (len >= min && len <= max && _isAlphaNumericString(s, len)) {
414 return TRUE;
415 }
416 return FALSE;
417 }
418
419 U_CFUNC UBool
420 ultag_isLanguageSubtag(const char* s, int32_t len) {
421 /*
422 * unicode_language_subtag = alpha{2,3} | alpha{5,8};
423 * NOTE: Per ICUTC 2019/01/23- accepting alpha 4
424 * See ICU-20372
425 */
426 if (len < 0) {
427 len = (int32_t)uprv_strlen(s);
428 }
429 if (len >= 2 && len <= 8 && _isAlphaString(s, len)) {
430 return TRUE;
431 }
432 return FALSE;
433 }
434
435 static UBool
436 _isExtlangSubtag(const char* s, int32_t len) {
437 /*
438 * extlang = 3ALPHA ; selected ISO 639 codes
439 * *2("-" 3ALPHA) ; permanently reserved
440 */
441 if (len < 0) {
442 len = (int32_t)uprv_strlen(s);
443 }
444 if (len == 3 && _isAlphaString(s, len)) {
445 return TRUE;
446 }
447 return FALSE;
448 }
449
450 U_CFUNC UBool
451 ultag_isScriptSubtag(const char* s, int32_t len) {
452 /*
453 * script = 4ALPHA ; ISO 15924 code
454 */
455 if (len < 0) {
456 len = (int32_t)uprv_strlen(s);
457 }
458 if (len == 4 && _isAlphaString(s, len)) {
459 return TRUE;
460 }
461 return FALSE;
462 }
463
464 U_CFUNC UBool
465 ultag_isRegionSubtag(const char* s, int32_t len) {
466 /*
467 * region = 2ALPHA ; ISO 3166-1 code
468 * / 3DIGIT ; UN M.49 code
469 */
470 if (len < 0) {
471 len = (int32_t)uprv_strlen(s);
472 }
473 if (len == 2 && _isAlphaString(s, len)) {
474 return TRUE;
475 }
476 if (len == 3 && _isNumericString(s, len)) {
477 return TRUE;
478 }
479 return FALSE;
480 }
481
482 static UBool
483 _isVariantSubtag(const char* s, int32_t len) {
484 /*
485 * variant = 5*8alphanum ; registered variants
486 * / (DIGIT 3alphanum)
487 */
488 if (len < 0) {
489 len = (int32_t)uprv_strlen(s);
490 }
491 if (_isAlphaNumericStringLimitedLength(s, len, 5, 8)) {
492 return TRUE;
493 }
494 if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) {
495 return TRUE;
496 }
497 return FALSE;
498 }
499
500 static UBool
501 _isSepListOf(UBool (*test)(const char*, int32_t), const char* s, int32_t len) {
502 const char *p = s;
503 const char *pSubtag = NULL;
504
505 if (len < 0) {
506 len = (int32_t)uprv_strlen(s);
507 }
508
509 while ((p - s) < len) {
510 if (*p == SEP) {
511 if (pSubtag == NULL) {
512 return FALSE;
513 }
514 if (!test(pSubtag, (int32_t)(p - pSubtag))) {
515 return FALSE;
516 }
517 pSubtag = NULL;
518 } else if (pSubtag == NULL) {
519 pSubtag = p;
520 }
521 p++;
522 }
523 if (pSubtag == NULL) {
524 return FALSE;
525 }
526 return test(pSubtag, (int32_t)(p - pSubtag));
527 }
528
529 U_CFUNC UBool
530 ultag_isVariantSubtags(const char* s, int32_t len) {
531 return _isSepListOf(&_isVariantSubtag, s, len);
532 }
533
534 // This is for the ICU-specific "lvariant" handling.
535 static UBool
536 _isPrivateuseVariantSubtag(const char* s, int32_t len) {
537 /*
538 * variant = 1*8alphanum ; registered variants
539 * / (DIGIT 3alphanum)
540 */
541 return _isAlphaNumericStringLimitedLength(s, len , 1, 8);
542 }
543
544 static UBool
545 _isExtensionSingleton(const char* s, int32_t len) {
546 /*
547 * extension = singleton 1*("-" (2*8alphanum))
548 *
549 * singleton = DIGIT ; 0 - 9
550 * / %x41-57 ; A - W
551 * / %x59-5A ; Y - Z
552 * / %x61-77 ; a - w
553 * / %x79-7A ; y - z
554 */
555 if (len < 0) {
556 len = (int32_t)uprv_strlen(s);
557 }
558 if (len == 1 && (ISALPHA(*s) || ISNUMERIC(*s)) && (uprv_tolower(*s) != PRIVATEUSE)) {
559 return TRUE;
560 }
561 return FALSE;
562 }
563
564 static UBool
565 _isExtensionSubtag(const char* s, int32_t len) {
566 /*
567 * extension = singleton 1*("-" (2*8alphanum))
568 */
569 return _isAlphaNumericStringLimitedLength(s, len, 2, 8);
570 }
571
572 U_CFUNC UBool
573 ultag_isExtensionSubtags(const char* s, int32_t len) {
574 return _isSepListOf(&_isExtensionSubtag, s, len);
575 }
576
577 static UBool
578 _isPrivateuseValueSubtag(const char* s, int32_t len) {
579 /*
580 * privateuse = "x" 1*("-" (1*8alphanum))
581 */
582 return _isAlphaNumericStringLimitedLength(s, len, 1, 8);
583 }
584
585 U_CFUNC UBool
586 ultag_isPrivateuseValueSubtags(const char* s, int32_t len) {
587 return _isSepListOf(&_isPrivateuseValueSubtag, s, len);
588 }
589
590 U_CFUNC UBool
591 ultag_isUnicodeLocaleAttribute(const char* s, int32_t len) {
592 /*
593 * attribute = alphanum{3,8} ;
594 */
595 return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
596 }
597
598 U_CFUNC UBool
599 ultag_isUnicodeLocaleAttributes(const char* s, int32_t len) {
600 return _isSepListOf(&ultag_isUnicodeLocaleAttribute, s, len);
601 }
602
603 U_CFUNC UBool
604 ultag_isUnicodeLocaleKey(const char* s, int32_t len) {
605 /*
606 * key = alphanum alpha ;
607 */
608 if (len < 0) {
609 len = (int32_t)uprv_strlen(s);
610 }
611 if (len == 2 && (ISALPHA(*s) || ISNUMERIC(*s)) && ISALPHA(s[1])) {
612 return TRUE;
613 }
614 return FALSE;
615 }
616
617 U_CFUNC UBool
618 _isUnicodeLocaleTypeSubtag(const char*s, int32_t len) {
619 /*
620 * alphanum{3,8}
621 */
622 return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
623 }
624
625 U_CFUNC UBool
626 ultag_isUnicodeLocaleType(const char*s, int32_t len) {
627 /*
628 * type = alphanum{3,8} (sep alphanum{3,8})* ;
629 */
630 return _isSepListOf(&_isUnicodeLocaleTypeSubtag, s, len);
631 }
632
633 static UBool
634 _isTKey(const char* s, int32_t len)
635 {
636 /*
637 * tkey = alpha digit ;
638 */
639 if (len < 0) {
640 len = (int32_t)uprv_strlen(s);
641 }
642 if (len == 2 && ISALPHA(*s) && ISNUMERIC(*(s + 1))) {
643 return TRUE;
644 }
645 return FALSE;
646 }
647
648 static UBool
649 _isTValue(const char* s, int32_t len)
650 {
651 /*
652 * tvalue = (sep alphanum{3,8})+ ;
653 */
654 return _isAlphaNumericStringLimitedLength(s, len , 3, 8);
655 }
656
657 static UBool
658 _isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len)
659 {
660 const int32_t kStart = 0; // Start, wait for unicode_language_subtag, tkey or end
661 const int32_t kGotLanguage = 1; // Got unicode_language_subtag, wait for unicode_script_subtag,
662 // unicode_region_subtag, unicode_variant_subtag, tkey or end
663 const int32_t kGotScript = 2; // Got unicode_script_subtag, wait for unicode_region_subtag,
664 // unicode_variant_subtag, tkey, or end
665 const int32_t kGotRegion = 3; // Got unicode_region_subtag, wait for unicode_variant_subtag,
666 // tkey, or end.
667 const int32_t kGotVariant = 4; // Got unicode_variant_subtag, wait for unicode_variant_subtag
668 // tkey or end.
669 const int32_t kGotTKey = -1; // Got tkey, wait for tvalue. ERROR if stop here.
670 const int32_t kGotTValue = 6; // Got tvalue, wait for tkey, tvalue or end
671
672 switch (state) {
673 case kStart:
674 if (ultag_isLanguageSubtag(s, len)) {
675 state = kGotLanguage;
676 return TRUE;
677 }
678 if (_isTKey(s, len)) {
679 state = kGotTKey;
680 return TRUE;
681 }
682 return FALSE;
683 case kGotLanguage:
684 if (ultag_isScriptSubtag(s, len)) {
685 state = kGotScript;
686 return TRUE;
687 }
688 U_FALLTHROUGH;
689 case kGotScript:
690 if (ultag_isRegionSubtag(s, len)) {
691 state = kGotRegion;
692 return TRUE;
693 }
694 U_FALLTHROUGH;
695 case kGotRegion:
696 U_FALLTHROUGH;
697 case kGotVariant:
698 if (_isVariantSubtag(s, len)) {
699 state = kGotVariant;
700 return TRUE;
701 }
702 if (_isTKey(s, len)) {
703 state = kGotTKey;
704 return TRUE;
705 }
706 return FALSE;
707 case kGotTKey:
708 if (_isTValue(s, len)) {
709 state = kGotTValue;
710 return TRUE;
711 }
712 return FALSE;
713 case kGotTValue:
714 if (_isTKey(s, len)) {
715 state = kGotTKey;
716 return TRUE;
717 }
718 if (_isTValue(s, len)) {
719 return TRUE;
720 }
721 return FALSE;
722 }
723 return FALSE;
724 }
725
726 static UBool
727 _isUnicodeExtensionSubtag(int32_t& state, const char* s, int32_t len)
728 {
729 const int32_t kStart = 0; // Start, wait for a key or attribute or end
730 const int32_t kGotKey = 1; // Got a key, wait for type or key or end
731 const int32_t kGotType = 2; // Got a type, wait for key or end
732
733 switch (state) {
734 case kStart:
735 if (ultag_isUnicodeLocaleKey(s, len)) {
736 state = kGotKey;
737 return TRUE;
738 }
739 if (ultag_isUnicodeLocaleAttribute(s, len)) {
740 return TRUE;
741 }
742 return FALSE;
743 case kGotKey:
744 if (ultag_isUnicodeLocaleKey(s, len)) {
745 return TRUE;
746 }
747 if (_isUnicodeLocaleTypeSubtag(s, len)) {
748 state = kGotType;
749 return TRUE;
750 }
751 return FALSE;
752 case kGotType:
753 if (ultag_isUnicodeLocaleKey(s, len)) {
754 state = kGotKey;
755 return TRUE;
756 }
757 if (_isUnicodeLocaleTypeSubtag(s, len)) {
758 return TRUE;
759 }
760 return FALSE;
761 }
762 return FALSE;
763 }
764
765 static UBool
766 _isStatefulSepListOf(UBool (*test)(int32_t&, const char*, int32_t), const char* s, int32_t len)
767 {
768 int32_t state = 0;
769 const char* p;
770 const char* start = s;
771 int32_t subtagLen = 0;
772
773 if (len < 0) {
774 len = (int32_t)uprv_strlen(s);
775 }
776
777 for (p = s; len > 0; p++, len--) {
778 if (*p == SEP) {
779 if (!test(state, start, subtagLen)) {
780 return FALSE;
781 }
782 subtagLen = 0;
783 start = p + 1;
784 } else {
785 subtagLen++;
786 }
787 }
788
789 if (test(state, start, subtagLen) && state >= 0) {
790 return TRUE;
791 }
792 return FALSE;
793 }
794
795 U_CFUNC UBool
796 ultag_isTransformedExtensionSubtags(const char* s, int32_t len)
797 {
798 return _isStatefulSepListOf(&_isTransformedExtensionSubtag, s, len);
799 }
800
801 U_CFUNC UBool
802 ultag_isUnicodeExtensionSubtags(const char* s, int32_t len) {
803 return _isStatefulSepListOf(&_isUnicodeExtensionSubtag, s, len);
804 }
805
806
807 /*
808 * -------------------------------------------------
809 *
810 * Helper functions
811 *
812 * -------------------------------------------------
813 */
814
815 static UBool
816 _addVariantToList(VariantListEntry **first, VariantListEntry *var) {
817 UBool bAdded = TRUE;
818
819 if (*first == NULL) {
820 var->next = NULL;
821 *first = var;
822 } else {
823 VariantListEntry *prev, *cur;
824 int32_t cmp;
825
826 /* variants order should be preserved */
827 prev = NULL;
828 cur = *first;
829 while (TRUE) {
830 if (cur == NULL) {
831 prev->next = var;
832 var->next = NULL;
833 break;
834 }
835
836 /* Checking for duplicate variant */
837 cmp = uprv_compareInvCharsAsAscii(var->variant, cur->variant);
838 if (cmp == 0) {
839 /* duplicated variant */
840 bAdded = FALSE;
841 break;
842 }
843 prev = cur;
844 cur = cur->next;
845 }
846 }
847
848 return bAdded;
849 }
850
851 static UBool
852 _addAttributeToList(AttributeListEntry **first, AttributeListEntry *attr) {
853 UBool bAdded = TRUE;
854
855 if (*first == NULL) {
856 attr->next = NULL;
857 *first = attr;
858 } else {
859 AttributeListEntry *prev, *cur;
860 int32_t cmp;
861
862 /* reorder variants in alphabetical order */
863 prev = NULL;
864 cur = *first;
865 while (TRUE) {
866 if (cur == NULL) {
867 prev->next = attr;
868 attr->next = NULL;
869 break;
870 }
871 cmp = uprv_compareInvCharsAsAscii(attr->attribute, cur->attribute);
872 if (cmp < 0) {
873 if (prev == NULL) {
874 *first = attr;
875 } else {
876 prev->next = attr;
877 }
878 attr->next = cur;
879 break;
880 }
881 if (cmp == 0) {
882 /* duplicated variant */
883 bAdded = FALSE;
884 break;
885 }
886 prev = cur;
887 cur = cur->next;
888 }
889 }
890
891 return bAdded;
892 }
893
894
895 static UBool
896 _addExtensionToList(ExtensionListEntry **first, ExtensionListEntry *ext, UBool localeToBCP) {
897 UBool bAdded = TRUE;
898
899 if (*first == NULL) {
900 ext->next = NULL;
901 *first = ext;
902 } else {
903 ExtensionListEntry *prev, *cur;
904 int32_t cmp;
905
906 /* reorder variants in alphabetical order */
907 prev = NULL;
908 cur = *first;
909 while (TRUE) {
910 if (cur == NULL) {
911 prev->next = ext;
912 ext->next = NULL;
913 break;
914 }
915 if (localeToBCP) {
916 /* special handling for locale to bcp conversion */
917 int32_t len, curlen;
918
919 len = (int32_t)uprv_strlen(ext->key);
920 curlen = (int32_t)uprv_strlen(cur->key);
921
922 if (len == 1 && curlen == 1) {
923 if (*(ext->key) == *(cur->key)) {
924 cmp = 0;
925 } else if (*(ext->key) == PRIVATEUSE) {
926 cmp = 1;
927 } else if (*(cur->key) == PRIVATEUSE) {
928 cmp = -1;
929 } else {
930 cmp = *(ext->key) - *(cur->key);
931 }
932 } else if (len == 1) {
933 cmp = *(ext->key) - LDMLEXT;
934 } else if (curlen == 1) {
935 cmp = LDMLEXT - *(cur->key);
936 } else {
937 cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key);
938 /* Both are u extension keys - we need special handling for 'attribute' */
939 if (cmp != 0) {
940 if (uprv_strcmp(cur->key, LOCALE_ATTRIBUTE_KEY) == 0) {
941 cmp = 1;
942 } else if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
943 cmp = -1;
944 }
945 }
946 }
947 } else {
948 cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key);
949 }
950 if (cmp < 0) {
951 if (prev == NULL) {
952 *first = ext;
953 } else {
954 prev->next = ext;
955 }
956 ext->next = cur;
957 break;
958 }
959 if (cmp == 0) {
960 /* duplicated extension key */
961 bAdded = FALSE;
962 break;
963 }
964 prev = cur;
965 cur = cur->next;
966 }
967 }
968
969 return bAdded;
970 }
971
972 static void
973 _initializeULanguageTag(ULanguageTag* langtag) {
974 int32_t i;
975
976 langtag->buf = NULL;
977
978 langtag->language = EMPTY;
979 for (i = 0; i < MAXEXTLANG; i++) {
980 langtag->extlang[i] = NULL;
981 }
982
983 langtag->script = EMPTY;
984 langtag->region = EMPTY;
985
986 langtag->variants = NULL;
987 langtag->extensions = NULL;
988
989 langtag->grandfathered = EMPTY;
990 langtag->privateuse = EMPTY;
991 }
992
993 static void
994 _appendLanguageToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
995 char buf[ULOC_LANG_CAPACITY];
996 UErrorCode tmpStatus = U_ZERO_ERROR;
997 int32_t len, i;
998
999 if (U_FAILURE(*status)) {
1000 return;
1001 }
1002
1003 len = uloc_getLanguage(localeID, buf, sizeof(buf), &tmpStatus);
1004 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1005 if (strict) {
1006 *status = U_ILLEGAL_ARGUMENT_ERROR;
1007 return;
1008 }
1009 len = 0;
1010 }
1011
1012 /* Note: returned language code is in lower case letters */
1013
1014 if (len == 0) {
1015 sink.Append(LANG_UND, LANG_UND_LEN);
1016 } else if (!ultag_isLanguageSubtag(buf, len)) {
1017 /* invalid language code */
1018 if (strict) {
1019 *status = U_ILLEGAL_ARGUMENT_ERROR;
1020 return;
1021 }
1022 sink.Append(LANG_UND, LANG_UND_LEN);
1023 } else {
1024 /* resolve deprecated */
1025 for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) {
1026 // 2-letter deprecated subtags are listede before 3-letter
1027 // ones in DEPRECATEDLANGS[]. Get out of loop on coming
1028 // across the 1st 3-letter subtag, if the input is a 2-letter code.
1029 // to avoid continuing to try when there's no match.
1030 if (uprv_strlen(buf) < uprv_strlen(DEPRECATEDLANGS[i])) break;
1031 if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) {
1032 uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]);
1033 len = (int32_t)uprv_strlen(buf);
1034 break;
1035 }
1036 }
1037 sink.Append(buf, len);
1038 }
1039 }
1040
1041 static void
1042 _appendScriptToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1043 char buf[ULOC_SCRIPT_CAPACITY];
1044 UErrorCode tmpStatus = U_ZERO_ERROR;
1045 int32_t len;
1046
1047 if (U_FAILURE(*status)) {
1048 return;
1049 }
1050
1051 len = uloc_getScript(localeID, buf, sizeof(buf), &tmpStatus);
1052 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1053 if (strict) {
1054 *status = U_ILLEGAL_ARGUMENT_ERROR;
1055 }
1056 return;
1057 }
1058
1059 if (len > 0) {
1060 if (!ultag_isScriptSubtag(buf, len)) {
1061 /* invalid script code */
1062 if (strict) {
1063 *status = U_ILLEGAL_ARGUMENT_ERROR;
1064 }
1065 return;
1066 } else {
1067 sink.Append("-", 1);
1068 sink.Append(buf, len);
1069 }
1070 }
1071 }
1072
1073 static void
1074 _appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UErrorCode* status) {
1075 char buf[ULOC_COUNTRY_CAPACITY];
1076 UErrorCode tmpStatus = U_ZERO_ERROR;
1077 int32_t len;
1078
1079 if (U_FAILURE(*status)) {
1080 return;
1081 }
1082
1083 len = uloc_getCountry(localeID, buf, sizeof(buf), &tmpStatus);
1084 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1085 if (strict) {
1086 *status = U_ILLEGAL_ARGUMENT_ERROR;
1087 }
1088 return;
1089 }
1090
1091 if (len > 0) {
1092 if (!ultag_isRegionSubtag(buf, len)) {
1093 /* invalid region code */
1094 if (strict) {
1095 *status = U_ILLEGAL_ARGUMENT_ERROR;
1096 }
1097 return;
1098 } else {
1099 sink.Append("-", 1);
1100 /* resolve deprecated */
1101 for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) {
1102 if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) {
1103 uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]);
1104 len = (int32_t)uprv_strlen(buf);
1105 break;
1106 }
1107 }
1108 sink.Append(buf, len);
1109 }
1110 }
1111 }
1112
1113 static void
1114 _appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool *hadPosix, UErrorCode* status) {
1115 char buf[ULOC_FULLNAME_CAPACITY];
1116 UErrorCode tmpStatus = U_ZERO_ERROR;
1117 int32_t len, i;
1118
1119 if (U_FAILURE(*status)) {
1120 return;
1121 }
1122
1123 len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
1124 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1125 if (strict) {
1126 *status = U_ILLEGAL_ARGUMENT_ERROR;
1127 }
1128 return;
1129 }
1130
1131 if (len > 0) {
1132 char *p, *pVar;
1133 UBool bNext = TRUE;
1134 VariantListEntry *var;
1135 VariantListEntry *varFirst = NULL;
1136
1137 pVar = NULL;
1138 p = buf;
1139 while (bNext) {
1140 if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
1141 if (*p == 0) {
1142 bNext = FALSE;
1143 } else {
1144 *p = 0; /* terminate */
1145 }
1146 if (pVar == NULL) {
1147 if (strict) {
1148 *status = U_ILLEGAL_ARGUMENT_ERROR;
1149 break;
1150 }
1151 /* ignore empty variant */
1152 } else {
1153 /* ICU uses upper case letters for variants, but
1154 the canonical format is lowercase in BCP47 */
1155 for (i = 0; *(pVar + i) != 0; i++) {
1156 *(pVar + i) = uprv_tolower(*(pVar + i));
1157 }
1158
1159 /* validate */
1160 if (_isVariantSubtag(pVar, -1)) {
1161 if (uprv_strcmp(pVar,POSIX_VALUE) || len != (int32_t)uprv_strlen(POSIX_VALUE)) {
1162 /* emit the variant to the list */
1163 var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
1164 if (var == NULL) {
1165 *status = U_MEMORY_ALLOCATION_ERROR;
1166 break;
1167 }
1168 var->variant = pVar;
1169 if (!_addVariantToList(&varFirst, var)) {
1170 /* duplicated variant */
1171 uprv_free(var);
1172 if (strict) {
1173 *status = U_ILLEGAL_ARGUMENT_ERROR;
1174 break;
1175 }
1176 }
1177 } else {
1178 /* Special handling for POSIX variant, need to remember that we had it and then */
1179 /* treat it like an extension later. */
1180 *hadPosix = TRUE;
1181 }
1182 } else if (strict) {
1183 *status = U_ILLEGAL_ARGUMENT_ERROR;
1184 break;
1185 } else if (_isPrivateuseValueSubtag(pVar, -1)) {
1186 /* Handle private use subtags separately */
1187 break;
1188 }
1189 }
1190 /* reset variant starting position */
1191 pVar = NULL;
1192 } else if (pVar == NULL) {
1193 pVar = p;
1194 }
1195 p++;
1196 }
1197
1198 if (U_SUCCESS(*status)) {
1199 if (varFirst != NULL) {
1200 int32_t varLen;
1201
1202 /* write out validated/normalized variants to the target */
1203 var = varFirst;
1204 while (var != NULL) {
1205 sink.Append("-", 1);
1206 varLen = (int32_t)uprv_strlen(var->variant);
1207 sink.Append(var->variant, varLen);
1208 var = var->next;
1209 }
1210 }
1211 }
1212
1213 /* clean up */
1214 var = varFirst;
1215 while (var != NULL) {
1216 VariantListEntry *tmpVar = var->next;
1217 uprv_free(var);
1218 var = tmpVar;
1219 }
1220
1221 if (U_FAILURE(*status)) {
1222 return;
1223 }
1224 }
1225 }
1226
1227 static void
1228 _appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
1229 char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 };
1230 int32_t attrBufLength = 0;
1231
1232 icu::MemoryPool<AttributeListEntry> attrPool;
1233 icu::MemoryPool<ExtensionListEntry> extPool;
1234 icu::MemoryPool<icu::CharString> strPool;
1235
1236 icu::LocalUEnumerationPointer keywordEnum(uloc_openKeywords(localeID, status));
1237 if (U_FAILURE(*status) && !hadPosix) {
1238 return;
1239 }
1240 if (keywordEnum.isValid() || hadPosix) {
1241 /* reorder extensions */
1242 int32_t len;
1243 const char *key;
1244 ExtensionListEntry *firstExt = NULL;
1245 ExtensionListEntry *ext;
1246 AttributeListEntry *firstAttr = NULL;
1247 AttributeListEntry *attr;
1248 icu::MemoryPool<icu::CharString> extBufPool;
1249 const char *bcpKey=nullptr, *bcpValue=nullptr;
1250 UErrorCode tmpStatus = U_ZERO_ERROR;
1251 int32_t keylen;
1252 UBool isBcpUExt;
1253
1254 while (TRUE) {
1255 icu::CharString buf;
1256 key = uenum_next(keywordEnum.getAlias(), NULL, status);
1257 if (key == NULL) {
1258 break;
1259 }
1260 char* buffer;
1261 int32_t resultCapacity = ULOC_KEYWORD_AND_VALUES_CAPACITY;
1262
1263 for (;;) {
1264 buffer = buf.getAppendBuffer(
1265 /*minCapacity=*/resultCapacity,
1266 /*desiredCapacityHint=*/resultCapacity,
1267 resultCapacity,
1268 tmpStatus);
1269
1270 if (U_FAILURE(tmpStatus)) {
1271 break;
1272 }
1273
1274 len = uloc_getKeywordValue(
1275 localeID, key, buffer, resultCapacity, &tmpStatus);
1276
1277 if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
1278 break;
1279 }
1280
1281 resultCapacity = len;
1282 tmpStatus = U_ZERO_ERROR;
1283 }
1284
1285 if (U_FAILURE(tmpStatus)) {
1286 if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) {
1287 *status = U_MEMORY_ALLOCATION_ERROR;
1288 break;
1289 }
1290 if (strict) {
1291 *status = U_ILLEGAL_ARGUMENT_ERROR;
1292 break;
1293 }
1294 /* ignore this keyword */
1295 tmpStatus = U_ZERO_ERROR;
1296 continue;
1297 }
1298
1299 buf.append(buffer, len, tmpStatus);
1300 if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1301 tmpStatus = U_ZERO_ERROR; // Terminators provided by CharString.
1302 }
1303
1304 keylen = (int32_t)uprv_strlen(key);
1305 isBcpUExt = (keylen > 1);
1306
1307 /* special keyword used for representing Unicode locale attributes */
1308 if (uprv_strcmp(key, LOCALE_ATTRIBUTE_KEY) == 0) {
1309 if (len > 0) {
1310 int32_t i = 0;
1311 while (TRUE) {
1312 attrBufLength = 0;
1313 for (; i < len; i++) {
1314 if (buf[i] != '-') {
1315 attrBuf[attrBufLength++] = buf[i];
1316 } else {
1317 i++;
1318 break;
1319 }
1320 }
1321 if (attrBufLength > 0) {
1322 attrBuf[attrBufLength] = 0;
1323
1324 } else if (i >= len){
1325 break;
1326 }
1327
1328 /* create AttributeListEntry */
1329 attr = attrPool.create();
1330 if (attr == NULL) {
1331 *status = U_MEMORY_ALLOCATION_ERROR;
1332 break;
1333 }
1334 icu::CharString* attrValue =
1335 strPool.create(attrBuf, attrBufLength, *status);
1336 if (attrValue == NULL) {
1337 *status = U_MEMORY_ALLOCATION_ERROR;
1338 break;
1339 }
1340 if (U_FAILURE(*status)) {
1341 break;
1342 }
1343 attr->attribute = attrValue->data();
1344
1345 if (!_addAttributeToList(&firstAttr, attr)) {
1346 if (strict) {
1347 *status = U_ILLEGAL_ARGUMENT_ERROR;
1348 break;
1349 }
1350 }
1351 }
1352 /* for a place holder ExtensionListEntry */
1353 bcpKey = LOCALE_ATTRIBUTE_KEY;
1354 bcpValue = NULL;
1355 }
1356 } else if (isBcpUExt) {
1357 bcpKey = uloc_toUnicodeLocaleKey(key);
1358 if (bcpKey == NULL) {
1359 if (strict) {
1360 *status = U_ILLEGAL_ARGUMENT_ERROR;
1361 break;
1362 }
1363 continue;
1364 }
1365
1366 /* we've checked buf is null-terminated above */
1367 bcpValue = uloc_toUnicodeLocaleType(key, buf.data());
1368 if (bcpValue == NULL) {
1369 if (strict) {
1370 *status = U_ILLEGAL_ARGUMENT_ERROR;
1371 break;
1372 }
1373 continue;
1374 }
1375 if (bcpValue == buf.data()) {
1376 /*
1377 When uloc_toUnicodeLocaleType(key, buf) returns the
1378 input value as is, the value is well-formed, but has
1379 no known mapping. This implementation normalizes the
1380 value to lower case
1381 */
1382 icu::CharString* extBuf = extBufPool.create();
1383 if (extBuf == nullptr) {
1384 *status = U_MEMORY_ALLOCATION_ERROR;
1385 break;
1386 }
1387 int32_t bcpValueLen = static_cast<int32_t>(uprv_strlen(bcpValue));
1388 int32_t resultCapacity;
1389 char* pExtBuf = extBuf->getAppendBuffer(
1390 /*minCapacity=*/bcpValueLen,
1391 /*desiredCapacityHint=*/bcpValueLen,
1392 resultCapacity,
1393 tmpStatus);
1394 if (U_FAILURE(tmpStatus)) {
1395 *status = tmpStatus;
1396 break;
1397 }
1398
1399 uprv_strcpy(pExtBuf, bcpValue);
1400 T_CString_toLowerCase(pExtBuf);
1401
1402 extBuf->append(pExtBuf, bcpValueLen, tmpStatus);
1403 if (U_FAILURE(tmpStatus)) {
1404 *status = tmpStatus;
1405 break;
1406 }
1407
1408 bcpValue = extBuf->data();
1409 }
1410 } else {
1411 if (*key == PRIVATEUSE) {
1412 if (!ultag_isPrivateuseValueSubtags(buf.data(), len)) {
1413 if (strict) {
1414 *status = U_ILLEGAL_ARGUMENT_ERROR;
1415 break;
1416 }
1417 continue;
1418 }
1419 } else {
1420 if (!_isExtensionSingleton(key, keylen) || !ultag_isExtensionSubtags(buf.data(), len)) {
1421 if (strict) {
1422 *status = U_ILLEGAL_ARGUMENT_ERROR;
1423 break;
1424 }
1425 continue;
1426 }
1427 }
1428 bcpKey = key;
1429 icu::CharString* extBuf =
1430 extBufPool.create(buf.data(), len, tmpStatus);
1431 if (extBuf == nullptr) {
1432 *status = U_MEMORY_ALLOCATION_ERROR;
1433 break;
1434 }
1435 if (U_FAILURE(tmpStatus)) {
1436 *status = tmpStatus;
1437 break;
1438 }
1439 bcpValue = extBuf->data();
1440 }
1441
1442 /* create ExtensionListEntry */
1443 ext = extPool.create();
1444 if (ext == NULL) {
1445 *status = U_MEMORY_ALLOCATION_ERROR;
1446 break;
1447 }
1448 ext->key = bcpKey;
1449 ext->value = bcpValue;
1450
1451 if (!_addExtensionToList(&firstExt, ext, TRUE)) {
1452 if (strict) {
1453 *status = U_ILLEGAL_ARGUMENT_ERROR;
1454 break;
1455 }
1456 }
1457 }
1458
1459 /* Special handling for POSIX variant - add the keywords for POSIX */
1460 if (hadPosix) {
1461 /* create ExtensionListEntry for POSIX */
1462 ext = extPool.create();
1463 if (ext == NULL) {
1464 *status = U_MEMORY_ALLOCATION_ERROR;
1465 return;
1466 }
1467 ext->key = POSIX_KEY;
1468 ext->value = POSIX_VALUE;
1469
1470 if (!_addExtensionToList(&firstExt, ext, TRUE)) {
1471 // Silently ignore errors.
1472 }
1473 }
1474
1475 if (U_SUCCESS(*status) && (firstExt != NULL || firstAttr != NULL)) {
1476 UBool startLDMLExtension = FALSE;
1477 for (ext = firstExt; ext; ext = ext->next) {
1478 if (!startLDMLExtension && uprv_strlen(ext->key) > 1) {
1479 /* first LDML u singlton extension */
1480 sink.Append("-u", 2);
1481 startLDMLExtension = TRUE;
1482 }
1483
1484 /* write out the sorted BCP47 attributes, extensions and private use */
1485 if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) {
1486 /* write the value for the attributes */
1487 for (attr = firstAttr; attr; attr = attr->next) {
1488 sink.Append("-", 1);
1489 sink.Append(
1490 attr->attribute, static_cast<int32_t>(uprv_strlen(attr->attribute)));
1491 }
1492 } else {
1493 sink.Append("-", 1);
1494 sink.Append(ext->key, static_cast<int32_t>(uprv_strlen(ext->key)));
1495 sink.Append("-", 1);
1496 sink.Append(ext->value, static_cast<int32_t>(uprv_strlen(ext->value)));
1497 }
1498 }
1499 }
1500 }
1501 }
1502
1503 /**
1504 * Append keywords parsed from LDML extension value
1505 * e.g. "u-ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional}
1506 * Note: char* buf is used for storing keywords
1507 */
1508 static void
1509 _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendTo, icu::MemoryPool<ExtensionListEntry>& extPool, icu::MemoryPool<icu::CharString>& kwdBuf, UBool *posixVariant, UErrorCode *status) {
1510 const char *pTag; /* beginning of current subtag */
1511 const char *pKwds; /* beginning of key-type pairs */
1512 UBool variantExists = *posixVariant;
1513
1514 ExtensionListEntry *kwdFirst = NULL; /* first LDML keyword */
1515 ExtensionListEntry *kwd, *nextKwd;
1516
1517 int32_t len;
1518
1519 /* Reset the posixVariant value */
1520 *posixVariant = FALSE;
1521
1522 pTag = ldmlext;
1523 pKwds = NULL;
1524
1525 {
1526 AttributeListEntry *attrFirst = NULL; /* first attribute */
1527 AttributeListEntry *attr, *nextAttr;
1528
1529 char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1530 int32_t attrBufIdx = 0;
1531
1532 icu::MemoryPool<AttributeListEntry> attrPool;
1533
1534 /* Iterate through u extension attributes */
1535 while (*pTag) {
1536 /* locate next separator char */
1537 for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);
1538
1539 if (ultag_isUnicodeLocaleKey(pTag, len)) {
1540 pKwds = pTag;
1541 break;
1542 }
1543
1544 /* add this attribute to the list */
1545 attr = attrPool.create();
1546 if (attr == NULL) {
1547 *status = U_MEMORY_ALLOCATION_ERROR;
1548 return;
1549 }
1550
1551 if (len < (int32_t)sizeof(attrBuf) - attrBufIdx) {
1552 uprv_memcpy(&attrBuf[attrBufIdx], pTag, len);
1553 attrBuf[attrBufIdx + len] = 0;
1554 attr->attribute = &attrBuf[attrBufIdx];
1555 attrBufIdx += (len + 1);
1556 } else {
1557 *status = U_ILLEGAL_ARGUMENT_ERROR;
1558 return;
1559 }
1560
1561 // duplicate attribute is ignored, causes no error.
1562 _addAttributeToList(&attrFirst, attr);
1563
1564 /* next tag */
1565 pTag += len;
1566 if (*pTag) {
1567 /* next to the separator */
1568 pTag++;
1569 }
1570 }
1571
1572 if (attrFirst) {
1573 /* emit attributes as an LDML keyword, e.g. attribute=attr1-attr2 */
1574
1575 kwd = extPool.create();
1576 if (kwd == NULL) {
1577 *status = U_MEMORY_ALLOCATION_ERROR;
1578 return;
1579 }
1580
1581 icu::CharString* value = kwdBuf.create();
1582 if (value == NULL) {
1583 *status = U_MEMORY_ALLOCATION_ERROR;
1584 return;
1585 }
1586
1587 /* attribute subtags sorted in alphabetical order as type */
1588 attr = attrFirst;
1589 while (attr != NULL) {
1590 nextAttr = attr->next;
1591 if (attr != attrFirst) {
1592 value->append('-', *status);
1593 }
1594 value->append(attr->attribute, *status);
1595 attr = nextAttr;
1596 }
1597 if (U_FAILURE(*status)) {
1598 return;
1599 }
1600
1601 kwd->key = LOCALE_ATTRIBUTE_KEY;
1602 kwd->value = value->data();
1603
1604 if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1605 *status = U_ILLEGAL_ARGUMENT_ERROR;
1606 return;
1607 }
1608 }
1609 }
1610
1611 if (pKwds) {
1612 const char *pBcpKey = NULL; /* u extenstion key subtag */
1613 const char *pBcpType = NULL; /* beginning of u extension type subtag(s) */
1614 int32_t bcpKeyLen = 0;
1615 int32_t bcpTypeLen = 0;
1616 UBool isDone = FALSE;
1617
1618 pTag = pKwds;
1619 /* BCP47 representation of LDML key/type pairs */
1620 while (!isDone) {
1621 const char *pNextBcpKey = NULL;
1622 int32_t nextBcpKeyLen = 0;
1623 UBool emitKeyword = FALSE;
1624
1625 if (*pTag) {
1626 /* locate next separator char */
1627 for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++);
1628
1629 if (ultag_isUnicodeLocaleKey(pTag, len)) {
1630 if (pBcpKey) {
1631 emitKeyword = TRUE;
1632 pNextBcpKey = pTag;
1633 nextBcpKeyLen = len;
1634 } else {
1635 pBcpKey = pTag;
1636 bcpKeyLen = len;
1637 }
1638 } else {
1639 U_ASSERT(pBcpKey != NULL);
1640 /* within LDML type subtags */
1641 if (pBcpType) {
1642 bcpTypeLen += (len + 1);
1643 } else {
1644 pBcpType = pTag;
1645 bcpTypeLen = len;
1646 }
1647 }
1648
1649 /* next tag */
1650 pTag += len;
1651 if (*pTag) {
1652 /* next to the separator */
1653 pTag++;
1654 }
1655 } else {
1656 /* processing last one */
1657 emitKeyword = TRUE;
1658 isDone = TRUE;
1659 }
1660
1661 if (emitKeyword) {
1662 const char *pKey = NULL; /* LDML key */
1663 const char *pType = NULL; /* LDML type */
1664
1665 char bcpKeyBuf[9]; /* BCP key length is always 2 for now */
1666
1667 U_ASSERT(pBcpKey != NULL);
1668
1669 if (bcpKeyLen >= (int32_t)sizeof(bcpKeyBuf)) {
1670 /* the BCP key is invalid */
1671 *status = U_ILLEGAL_ARGUMENT_ERROR;
1672 return;
1673 }
1674
1675 uprv_strncpy(bcpKeyBuf, pBcpKey, bcpKeyLen);
1676 bcpKeyBuf[bcpKeyLen] = 0;
1677
1678 /* u extension key to LDML key */
1679 pKey = uloc_toLegacyKey(bcpKeyBuf);
1680 if (pKey == NULL) {
1681 *status = U_ILLEGAL_ARGUMENT_ERROR;
1682 return;
1683 }
1684 if (pKey == bcpKeyBuf) {
1685 /*
1686 The key returned by toLegacyKey points to the input buffer.
1687 We normalize the result key to lower case.
1688 */
1689 T_CString_toLowerCase(bcpKeyBuf);
1690 icu::CharString* key = kwdBuf.create(bcpKeyBuf, bcpKeyLen, *status);
1691 if (key == NULL) {
1692 *status = U_MEMORY_ALLOCATION_ERROR;
1693 return;
1694 }
1695 if (U_FAILURE(*status)) {
1696 return;
1697 }
1698 pKey = key->data();
1699 }
1700
1701 if (pBcpType) {
1702 char bcpTypeBuf[128]; /* practically long enough even considering multiple subtag type */
1703 if (bcpTypeLen >= (int32_t)sizeof(bcpTypeBuf)) {
1704 /* the BCP type is too long */
1705 *status = U_ILLEGAL_ARGUMENT_ERROR;
1706 return;
1707 }
1708
1709 uprv_strncpy(bcpTypeBuf, pBcpType, bcpTypeLen);
1710 bcpTypeBuf[bcpTypeLen] = 0;
1711
1712 /* BCP type to locale type */
1713 pType = uloc_toLegacyType(pKey, bcpTypeBuf);
1714 if (pType == NULL) {
1715 *status = U_ILLEGAL_ARGUMENT_ERROR;
1716 return;
1717 }
1718 if (pType == bcpTypeBuf) {
1719 /*
1720 The type returned by toLegacyType points to the input buffer.
1721 We normalize the result type to lower case.
1722 */
1723 /* normalize to lower case */
1724 T_CString_toLowerCase(bcpTypeBuf);
1725 icu::CharString* type = kwdBuf.create(bcpTypeBuf, bcpTypeLen, *status);
1726 if (type == NULL) {
1727 *status = U_MEMORY_ALLOCATION_ERROR;
1728 return;
1729 }
1730 if (U_FAILURE(*status)) {
1731 return;
1732 }
1733 pType = type->data();
1734 }
1735 } else {
1736 /* typeless - default type value is "yes" */
1737 pType = LOCALE_TYPE_YES;
1738 }
1739
1740 /* Special handling for u-va-posix, since we want to treat this as a variant,
1741 not as a keyword */
1742 if (!variantExists && !uprv_strcmp(pKey, POSIX_KEY) && !uprv_strcmp(pType, POSIX_VALUE) ) {
1743 *posixVariant = TRUE;
1744 } else {
1745 /* create an ExtensionListEntry for this keyword */
1746 kwd = extPool.create();
1747 if (kwd == NULL) {
1748 *status = U_MEMORY_ALLOCATION_ERROR;
1749 return;
1750 }
1751
1752 kwd->key = pKey;
1753 kwd->value = pType;
1754
1755 if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1756 // duplicate keyword is allowed, Only the first
1757 // is honored.
1758 }
1759 }
1760
1761 pBcpKey = pNextBcpKey;
1762 bcpKeyLen = pNextBcpKey != NULL ? nextBcpKeyLen : 0;
1763 pBcpType = NULL;
1764 bcpTypeLen = 0;
1765 }
1766 }
1767 }
1768
1769 kwd = kwdFirst;
1770 while (kwd != NULL) {
1771 nextKwd = kwd->next;
1772 _addExtensionToList(appendTo, kwd, FALSE);
1773 kwd = nextKwd;
1774 }
1775 }
1776
1777
1778 static void
1779 _appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode* status) {
1780 int32_t i, n;
1781 int32_t len;
1782 ExtensionListEntry *kwdFirst = NULL;
1783 ExtensionListEntry *kwd;
1784 const char *key, *type;
1785 icu::MemoryPool<ExtensionListEntry> extPool;
1786 icu::MemoryPool<icu::CharString> kwdBuf;
1787 UBool posixVariant = FALSE;
1788
1789 if (U_FAILURE(*status)) {
1790 return;
1791 }
1792
1793 /* Determine if variants already exists */
1794 if (ultag_getVariantsSize(langtag)) {
1795 posixVariant = TRUE;
1796 }
1797
1798 n = ultag_getExtensionsSize(langtag);
1799
1800 /* resolve locale keywords and reordering keys */
1801 for (i = 0; i < n; i++) {
1802 key = ultag_getExtensionKey(langtag, i);
1803 type = ultag_getExtensionValue(langtag, i);
1804 if (*key == LDMLEXT) {
1805 _appendLDMLExtensionAsKeywords(type, &kwdFirst, extPool, kwdBuf, &posixVariant, status);
1806 if (U_FAILURE(*status)) {
1807 break;
1808 }
1809 } else {
1810 kwd = extPool.create();
1811 if (kwd == NULL) {
1812 *status = U_MEMORY_ALLOCATION_ERROR;
1813 break;
1814 }
1815 kwd->key = key;
1816 kwd->value = type;
1817 if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1818 *status = U_ILLEGAL_ARGUMENT_ERROR;
1819 break;
1820 }
1821 }
1822 }
1823
1824 if (U_SUCCESS(*status)) {
1825 type = ultag_getPrivateUse(langtag);
1826 if ((int32_t)uprv_strlen(type) > 0) {
1827 /* add private use as a keyword */
1828 kwd = extPool.create();
1829 if (kwd == NULL) {
1830 *status = U_MEMORY_ALLOCATION_ERROR;
1831 } else {
1832 kwd->key = PRIVATEUSE_KEY;
1833 kwd->value = type;
1834 if (!_addExtensionToList(&kwdFirst, kwd, FALSE)) {
1835 *status = U_ILLEGAL_ARGUMENT_ERROR;
1836 }
1837 }
1838 }
1839 }
1840
1841 /* If a POSIX variant was in the extensions, write it out before writing the keywords. */
1842
1843 if (U_SUCCESS(*status) && posixVariant) {
1844 len = (int32_t) uprv_strlen(_POSIX);
1845 sink.Append(_POSIX, len);
1846 }
1847
1848 if (U_SUCCESS(*status) && kwdFirst != NULL) {
1849 /* write out the sorted keywords */
1850 UBool firstValue = TRUE;
1851 kwd = kwdFirst;
1852 do {
1853 if (firstValue) {
1854 sink.Append("@", 1);
1855 firstValue = FALSE;
1856 } else {
1857 sink.Append(";", 1);
1858 }
1859
1860 /* key */
1861 len = (int32_t)uprv_strlen(kwd->key);
1862 sink.Append(kwd->key, len);
1863 sink.Append("=", 1);
1864
1865 /* type */
1866 len = (int32_t)uprv_strlen(kwd->value);
1867 sink.Append(kwd->value, len);
1868
1869 kwd = kwd->next;
1870 } while (kwd);
1871 }
1872 }
1873
1874 static void
1875 _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, UBool strict, UBool hadPosix, UErrorCode* status) {
1876 (void)hadPosix;
1877 char buf[ULOC_FULLNAME_CAPACITY];
1878 char tmpAppend[ULOC_FULLNAME_CAPACITY];
1879 UErrorCode tmpStatus = U_ZERO_ERROR;
1880 int32_t len, i;
1881 int32_t reslen = 0;
1882 int32_t capacity = sizeof tmpAppend;
1883
1884 if (U_FAILURE(*status)) {
1885 return;
1886 }
1887
1888 len = uloc_getVariant(localeID, buf, sizeof(buf), &tmpStatus);
1889 if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
1890 if (strict) {
1891 *status = U_ILLEGAL_ARGUMENT_ERROR;
1892 }
1893 return;
1894 }
1895
1896 if (len > 0) {
1897 char *p, *pPriv;
1898 UBool bNext = TRUE;
1899 UBool firstValue = TRUE;
1900 UBool writeValue;
1901
1902 pPriv = NULL;
1903 p = buf;
1904 while (bNext) {
1905 writeValue = FALSE;
1906 if (*p == SEP || *p == LOCALE_SEP || *p == 0) {
1907 if (*p == 0) {
1908 bNext = FALSE;
1909 } else {
1910 *p = 0; /* terminate */
1911 }
1912 if (pPriv != NULL) {
1913 /* Private use in the canonical format is lowercase in BCP47 */
1914 for (i = 0; *(pPriv + i) != 0; i++) {
1915 *(pPriv + i) = uprv_tolower(*(pPriv + i));
1916 }
1917
1918 /* validate */
1919 if (_isPrivateuseValueSubtag(pPriv, -1)) {
1920 if (firstValue) {
1921 if (!_isVariantSubtag(pPriv, -1)) {
1922 writeValue = TRUE;
1923 }
1924 } else {
1925 writeValue = TRUE;
1926 }
1927 } else if (strict) {
1928 *status = U_ILLEGAL_ARGUMENT_ERROR;
1929 break;
1930 } else {
1931 break;
1932 }
1933
1934 if (writeValue) {
1935 if (reslen < capacity) {
1936 tmpAppend[reslen++] = SEP;
1937 }
1938
1939 if (firstValue) {
1940 if (reslen < capacity) {
1941 tmpAppend[reslen++] = *PRIVATEUSE_KEY;
1942 }
1943
1944 if (reslen < capacity) {
1945 tmpAppend[reslen++] = SEP;
1946 }
1947
1948 len = (int32_t)uprv_strlen(PRIVUSE_VARIANT_PREFIX);
1949 if (reslen < capacity) {
1950 uprv_memcpy(tmpAppend + reslen, PRIVUSE_VARIANT_PREFIX, uprv_min(len, capacity - reslen));
1951 }
1952 reslen += len;
1953
1954 if (reslen < capacity) {
1955 tmpAppend[reslen++] = SEP;
1956 }
1957
1958 firstValue = FALSE;
1959 }
1960
1961 len = (int32_t)uprv_strlen(pPriv);
1962 if (reslen < capacity) {
1963 uprv_memcpy(tmpAppend + reslen, pPriv, uprv_min(len, capacity - reslen));
1964 }
1965 reslen += len;
1966 }
1967 }
1968 /* reset private use starting position */
1969 pPriv = NULL;
1970 } else if (pPriv == NULL) {
1971 pPriv = p;
1972 }
1973 p++;
1974 }
1975
1976 if (U_FAILURE(*status)) {
1977 return;
1978 }
1979 }
1980
1981 if (U_SUCCESS(*status)) {
1982 len = reslen;
1983 sink.Append(tmpAppend, len);
1984 }
1985 }
1986
1987 /*
1988 * -------------------------------------------------
1989 *
1990 * ultag_ functions
1991 *
1992 * -------------------------------------------------
1993 */
1994
1995 /* Bit flags used by the parser */
1996 #define LANG 0x0001
1997 #define EXTL 0x0002
1998 #define SCRT 0x0004
1999 #define REGN 0x0008
2000 #define VART 0x0010
2001 #define EXTS 0x0020
2002 #define EXTV 0x0040
2003 #define PRIV 0x0080
2004
2005 /**
2006 * Ticket #12705 - Visual Studio 2015 Update 3 contains a new code optimizer which has problems optimizing
2007 * this function. (See https://blogs.msdn.microsoft.com/vcblog/2016/05/04/new-code-optimizer/ )
2008 * As a workaround, we will turn off optimization just for this function on VS2015 Update 3 and above.
2009 */
2010 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
2011 #pragma optimize( "", off )
2012 #endif
2013
2014 static ULanguageTag*
2015 ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* status) {
2016 char *tagBuf;
2017 int16_t next;
2018 char *pSubtag, *pNext, *pLastGoodPosition;
2019 int32_t subtagLen;
2020 int32_t extlangIdx;
2021 ExtensionListEntry *pExtension;
2022 char *pExtValueSubtag, *pExtValueSubtagEnd;
2023 int32_t i;
2024 UBool privateuseVar = FALSE;
2025 int32_t grandfatheredLen = 0;
2026
2027 if (parsedLen != NULL) {
2028 *parsedLen = 0;
2029 }
2030
2031 if (U_FAILURE(*status)) {
2032 return NULL;
2033 }
2034
2035 if (tagLen < 0) {
2036 tagLen = (int32_t)uprv_strlen(tag);
2037 }
2038
2039 /* copy the entire string */
2040 tagBuf = (char*)uprv_malloc(tagLen + 1);
2041 if (tagBuf == NULL) {
2042 *status = U_MEMORY_ALLOCATION_ERROR;
2043 return NULL;
2044 }
2045 uprv_memcpy(tagBuf, tag, tagLen);
2046 *(tagBuf + tagLen) = 0;
2047
2048 /* create a ULanguageTag */
2049 icu::LocalULanguageTagPointer t(
2050 (ULanguageTag*)uprv_malloc(sizeof(ULanguageTag)));
2051 if (t.isNull()) {
2052 uprv_free(tagBuf);
2053 *status = U_MEMORY_ALLOCATION_ERROR;
2054 return NULL;
2055 }
2056 _initializeULanguageTag(t.getAlias());
2057 t->buf = tagBuf;
2058
2059 if (tagLen < MINLEN) {
2060 /* the input tag is too short - return empty ULanguageTag */
2061 return t.orphan();
2062 }
2063
2064 size_t parsedLenDelta = 0;
2065 // Grandfathered tag will be consider together. Grandfathered tag with intervening
2066 // script and region such as art-DE-lojban or art-Latn-lojban won't be
2067 // matched.
2068 /* check if the tag is grandfathered */
2069 for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) {
2070 int32_t checkGrandfatheredLen = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i]));
2071 if (tagLen < checkGrandfatheredLen) {
2072 continue;
2073 }
2074 if (tagLen > checkGrandfatheredLen && tagBuf[checkGrandfatheredLen] != '-') {
2075 // make sure next char is '-'.
2076 continue;
2077 }
2078 if (uprv_strnicmp(GRANDFATHERED[i], tagBuf, checkGrandfatheredLen) == 0) {
2079 int32_t newTagLength;
2080
2081 grandfatheredLen = checkGrandfatheredLen; /* back up for output parsedLen */
2082 int32_t replacementLen = static_cast<int32_t>(uprv_strlen(GRANDFATHERED[i+1]));
2083 newTagLength = replacementLen + tagLen - checkGrandfatheredLen;
2084 if (tagLen < newTagLength) {
2085 uprv_free(tagBuf);
2086 tagBuf = (char*)uprv_malloc(newTagLength + 1);
2087 if (tagBuf == NULL) {
2088 *status = U_MEMORY_ALLOCATION_ERROR;
2089 return NULL;
2090 }
2091 t->buf = tagBuf;
2092 tagLen = newTagLength;
2093 }
2094 parsedLenDelta = checkGrandfatheredLen - replacementLen;
2095 uprv_strcpy(t->buf, GRANDFATHERED[i + 1]);
2096 if (checkGrandfatheredLen != tagLen) {
2097 uprv_strcpy(t->buf + replacementLen, tag + checkGrandfatheredLen);
2098 }
2099 break;
2100 }
2101 }
2102
2103 if (grandfatheredLen == 0) {
2104 for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
2105 const char* redundantTag = REDUNDANT[i];
2106 size_t redundantTagLen = uprv_strlen(redundantTag);
2107 // The preferred tag for a redundant tag is always shorter than redundant
2108 // tag. A redundant tag may or may not be followed by other subtags.
2109 // (i.e. "zh-yue" or "zh-yue-u-co-pinyin").
2110 if (uprv_strnicmp(redundantTag, tagBuf, static_cast<uint32_t>(redundantTagLen)) == 0) {
2111 const char* redundantTagEnd = tagBuf + redundantTagLen;
2112 if (*redundantTagEnd == '\0' || *redundantTagEnd == SEP) {
2113 const char* preferredTag = REDUNDANT[i + 1];
2114 size_t preferredTagLen = uprv_strlen(preferredTag);
2115 uprv_strncpy(t->buf, preferredTag, preferredTagLen);
2116 if (*redundantTagEnd == SEP) {
2117 uprv_memmove(tagBuf + preferredTagLen,
2118 redundantTagEnd,
2119 tagLen - redundantTagLen + 1);
2120 } else {
2121 tagBuf[preferredTagLen] = '\0';
2122 }
2123 // parsedLen should be the length of the input
2124 // before redundantTag is replaced by preferredTag.
2125 // Save the delta to add it back later.
2126 parsedLenDelta = redundantTagLen - preferredTagLen;
2127 break;
2128 }
2129 }
2130 }
2131 }
2132
2133 /*
2134 * langtag = language
2135 * ["-" script]
2136 * ["-" region]
2137 * *("-" variant)
2138 * *("-" extension)
2139 * ["-" privateuse]
2140 */
2141
2142 next = LANG | PRIV;
2143 pNext = pLastGoodPosition = tagBuf;
2144 extlangIdx = 0;
2145 pExtension = NULL;
2146 pExtValueSubtag = NULL;
2147 pExtValueSubtagEnd = NULL;
2148
2149 while (pNext) {
2150 char *pSep;
2151
2152 pSubtag = pNext;
2153
2154 /* locate next separator char */
2155 pSep = pSubtag;
2156 while (*pSep) {
2157 if (*pSep == SEP) {
2158 break;
2159 }
2160 pSep++;
2161 }
2162 if (*pSep == 0) {
2163 /* last subtag */
2164 pNext = NULL;
2165 } else {
2166 pNext = pSep + 1;
2167 }
2168 subtagLen = (int32_t)(pSep - pSubtag);
2169
2170 if (next & LANG) {
2171 if (ultag_isLanguageSubtag(pSubtag, subtagLen)) {
2172 *pSep = 0; /* terminate */
2173 // TODO: move deprecated language code handling here.
2174 t->language = T_CString_toLowerCase(pSubtag);
2175
2176 pLastGoodPosition = pSep;
2177 next = SCRT | REGN | VART | EXTS | PRIV;
2178 if (subtagLen <= 3)
2179 next |= EXTL;
2180 continue;
2181 }
2182 }
2183 if (next & EXTL) {
2184 if (_isExtlangSubtag(pSubtag, subtagLen)) {
2185 *pSep = 0;
2186 t->extlang[extlangIdx++] = T_CString_toLowerCase(pSubtag);
2187
2188 pLastGoodPosition = pSep;
2189 if (extlangIdx < 3) {
2190 next = EXTL | SCRT | REGN | VART | EXTS | PRIV;
2191 } else {
2192 next = SCRT | REGN | VART | EXTS | PRIV;
2193 }
2194 continue;
2195 }
2196 }
2197 if (next & SCRT) {
2198 if (ultag_isScriptSubtag(pSubtag, subtagLen)) {
2199 char *p = pSubtag;
2200
2201 *pSep = 0;
2202
2203 /* to title case */
2204 *p = uprv_toupper(*p);
2205 p++;
2206 for (; *p; p++) {
2207 *p = uprv_tolower(*p);
2208 }
2209
2210 t->script = pSubtag;
2211
2212 pLastGoodPosition = pSep;
2213 next = REGN | VART | EXTS | PRIV;
2214 continue;
2215 }
2216 }
2217 if (next & REGN) {
2218 if (ultag_isRegionSubtag(pSubtag, subtagLen)) {
2219 *pSep = 0;
2220 // TODO: move deprecated region code handling here.
2221 t->region = T_CString_toUpperCase(pSubtag);
2222
2223 pLastGoodPosition = pSep;
2224 next = VART | EXTS | PRIV;
2225 continue;
2226 }
2227 }
2228 if (next & VART) {
2229 if (_isVariantSubtag(pSubtag, subtagLen) ||
2230 (privateuseVar && _isPrivateuseVariantSubtag(pSubtag, subtagLen))) {
2231 VariantListEntry *var;
2232 UBool isAdded;
2233
2234 var = (VariantListEntry*)uprv_malloc(sizeof(VariantListEntry));
2235 if (var == NULL) {
2236 *status = U_MEMORY_ALLOCATION_ERROR;
2237 return NULL;
2238 }
2239 *pSep = 0;
2240 var->variant = T_CString_toUpperCase(pSubtag);
2241 isAdded = _addVariantToList(&(t->variants), var);
2242 if (!isAdded) {
2243 /* duplicated variant entry */
2244 uprv_free(var);
2245 break;
2246 }
2247 pLastGoodPosition = pSep;
2248 next = VART | EXTS | PRIV;
2249 continue;
2250 }
2251 }
2252 if (next & EXTS) {
2253 if (_isExtensionSingleton(pSubtag, subtagLen)) {
2254 if (pExtension != NULL) {
2255 if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2256 /* the previous extension is incomplete */
2257 uprv_free(pExtension);
2258 pExtension = NULL;
2259 break;
2260 }
2261
2262 /* terminate the previous extension value */
2263 *pExtValueSubtagEnd = 0;
2264 pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2265
2266 /* insert the extension to the list */
2267 if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2268 pLastGoodPosition = pExtValueSubtagEnd;
2269 } else {
2270 /* stop parsing here */
2271 uprv_free(pExtension);
2272 pExtension = NULL;
2273 break;
2274 }
2275 }
2276
2277 /* create a new extension */
2278 pExtension = (ExtensionListEntry*)uprv_malloc(sizeof(ExtensionListEntry));
2279 if (pExtension == NULL) {
2280 *status = U_MEMORY_ALLOCATION_ERROR;
2281 return NULL;
2282 }
2283 *pSep = 0;
2284 pExtension->key = T_CString_toLowerCase(pSubtag);
2285 pExtension->value = NULL; /* will be set later */
2286
2287 /*
2288 * reset the start and the end location of extension value
2289 * subtags for this extension
2290 */
2291 pExtValueSubtag = NULL;
2292 pExtValueSubtagEnd = NULL;
2293
2294 next = EXTV;
2295 continue;
2296 }
2297 }
2298 if (next & EXTV) {
2299 if (_isExtensionSubtag(pSubtag, subtagLen)) {
2300 if (pExtValueSubtag == NULL) {
2301 /* if the start postion of this extension's value is not yet,
2302 this one is the first value subtag */
2303 pExtValueSubtag = pSubtag;
2304 }
2305
2306 /* Mark the end of this subtag */
2307 pExtValueSubtagEnd = pSep;
2308 next = EXTS | EXTV | PRIV;
2309
2310 continue;
2311 }
2312 }
2313 if (next & PRIV) {
2314 if (uprv_tolower(*pSubtag) == PRIVATEUSE && subtagLen == 1) {
2315 char *pPrivuseVal;
2316
2317 if (pExtension != NULL) {
2318 /* Process the last extension */
2319 if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2320 /* the previous extension is incomplete */
2321 uprv_free(pExtension);
2322 pExtension = NULL;
2323 break;
2324 } else {
2325 /* terminate the previous extension value */
2326 *pExtValueSubtagEnd = 0;
2327 pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2328
2329 /* insert the extension to the list */
2330 if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2331 pLastGoodPosition = pExtValueSubtagEnd;
2332 pExtension = NULL;
2333 } else {
2334 /* stop parsing here */
2335 uprv_free(pExtension);
2336 pExtension = NULL;
2337 break;
2338 }
2339 }
2340 }
2341
2342 /* The rest of part will be private use value subtags */
2343 if (pNext == NULL) {
2344 /* empty private use subtag */
2345 break;
2346 }
2347 /* back up the private use value start position */
2348 pPrivuseVal = pNext;
2349
2350 /* validate private use value subtags */
2351 while (pNext) {
2352 pSubtag = pNext;
2353 pSep = pSubtag;
2354 while (*pSep) {
2355 if (*pSep == SEP) {
2356 break;
2357 }
2358 pSep++;
2359 }
2360 if (*pSep == 0) {
2361 /* last subtag */
2362 pNext = NULL;
2363 } else {
2364 pNext = pSep + 1;
2365 }
2366 subtagLen = (int32_t)(pSep - pSubtag);
2367
2368 if (uprv_strncmp(pSubtag, PRIVUSE_VARIANT_PREFIX, uprv_strlen(PRIVUSE_VARIANT_PREFIX)) == 0) {
2369 *pSep = 0;
2370 next = VART;
2371 privateuseVar = TRUE;
2372 break;
2373 } else if (_isPrivateuseValueSubtag(pSubtag, subtagLen)) {
2374 pLastGoodPosition = pSep;
2375 } else {
2376 break;
2377 }
2378 }
2379
2380 if (next == VART) {
2381 continue;
2382 }
2383
2384 if (pLastGoodPosition - pPrivuseVal > 0) {
2385 *pLastGoodPosition = 0;
2386 t->privateuse = T_CString_toLowerCase(pPrivuseVal);
2387 }
2388 /* No more subtags, exiting the parse loop */
2389 break;
2390 }
2391 break;
2392 }
2393
2394 /* If we fell through here, it means this subtag is illegal - quit parsing */
2395 break;
2396 }
2397
2398 if (pExtension != NULL) {
2399 /* Process the last extension */
2400 if (pExtValueSubtag == NULL || pExtValueSubtagEnd == NULL) {
2401 /* the previous extension is incomplete */
2402 uprv_free(pExtension);
2403 } else {
2404 /* terminate the previous extension value */
2405 *pExtValueSubtagEnd = 0;
2406 pExtension->value = T_CString_toLowerCase(pExtValueSubtag);
2407 /* insert the extension to the list */
2408 if (_addExtensionToList(&(t->extensions), pExtension, FALSE)) {
2409 pLastGoodPosition = pExtValueSubtagEnd;
2410 } else {
2411 uprv_free(pExtension);
2412 }
2413 }
2414 }
2415
2416 if (parsedLen != NULL) {
2417 *parsedLen = (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta);
2418 }
2419
2420 return t.orphan();
2421 }
2422
2423 /**
2424 * Ticket #12705 - Turn optimization back on.
2425 */
2426 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
2427 #pragma optimize( "", on )
2428 #endif
2429
2430 static void
2431 ultag_close(ULanguageTag* langtag) {
2432
2433 if (langtag == NULL) {
2434 return;
2435 }
2436
2437 uprv_free(langtag->buf);
2438
2439 if (langtag->variants) {
2440 VariantListEntry *curVar = langtag->variants;
2441 while (curVar) {
2442 VariantListEntry *nextVar = curVar->next;
2443 uprv_free(curVar);
2444 curVar = nextVar;
2445 }
2446 }
2447
2448 if (langtag->extensions) {
2449 ExtensionListEntry *curExt = langtag->extensions;
2450 while (curExt) {
2451 ExtensionListEntry *nextExt = curExt->next;
2452 uprv_free(curExt);
2453 curExt = nextExt;
2454 }
2455 }
2456
2457 uprv_free(langtag);
2458 }
2459
2460 static const char*
2461 ultag_getLanguage(const ULanguageTag* langtag) {
2462 return langtag->language;
2463 }
2464
#if 0
/*
 * Currently compiled out.
 * Scans DEPRECATEDLANGS two entries at a time; when the first entry of a
 * pair equals the tag's language, the second entry of that pair is
 * returned.  Without a match the tag's own language is returned.
 */
static const char*
ultag_getJDKLanguage(const ULanguageTag* langtag) {
    const char *lang = langtag->language;
    for (int32_t pair = 0; DEPRECATEDLANGS[pair] != NULL; pair += 2) {
        if (uprv_compareInvCharsAsAscii(DEPRECATEDLANGS[pair], lang) == 0) {
            return DEPRECATEDLANGS[pair + 1];
        }
    }
    return lang;
}
#endif
2477
2478 static const char*
2479 ultag_getExtlang(const ULanguageTag* langtag, int32_t idx) {
2480 if (idx >= 0 && idx < MAXEXTLANG) {
2481 return langtag->extlang[idx];
2482 }
2483 return NULL;
2484 }
2485
2486 static int32_t
2487 ultag_getExtlangSize(const ULanguageTag* langtag) {
2488 int32_t size = 0;
2489 int32_t i;
2490 for (i = 0; i < MAXEXTLANG; i++) {
2491 if (langtag->extlang[i]) {
2492 size++;
2493 }
2494 }
2495 return size;
2496 }
2497
2498 static const char*
2499 ultag_getScript(const ULanguageTag* langtag) {
2500 return langtag->script;
2501 }
2502
2503 static const char*
2504 ultag_getRegion(const ULanguageTag* langtag) {
2505 return langtag->region;
2506 }
2507
2508 static const char*
2509 ultag_getVariant(const ULanguageTag* langtag, int32_t idx) {
2510 const char *var = NULL;
2511 VariantListEntry *cur = langtag->variants;
2512 int32_t i = 0;
2513 while (cur) {
2514 if (i == idx) {
2515 var = cur->variant;
2516 break;
2517 }
2518 cur = cur->next;
2519 i++;
2520 }
2521 return var;
2522 }
2523
2524 static int32_t
2525 ultag_getVariantsSize(const ULanguageTag* langtag) {
2526 int32_t size = 0;
2527 VariantListEntry *cur = langtag->variants;
2528 while (TRUE) {
2529 if (cur == NULL) {
2530 break;
2531 }
2532 size++;
2533 cur = cur->next;
2534 }
2535 return size;
2536 }
2537
2538 static const char*
2539 ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx) {
2540 const char *key = NULL;
2541 ExtensionListEntry *cur = langtag->extensions;
2542 int32_t i = 0;
2543 while (cur) {
2544 if (i == idx) {
2545 key = cur->key;
2546 break;
2547 }
2548 cur = cur->next;
2549 i++;
2550 }
2551 return key;
2552 }
2553
2554 static const char*
2555 ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx) {
2556 const char *val = NULL;
2557 ExtensionListEntry *cur = langtag->extensions;
2558 int32_t i = 0;
2559 while (cur) {
2560 if (i == idx) {
2561 val = cur->value;
2562 break;
2563 }
2564 cur = cur->next;
2565 i++;
2566 }
2567 return val;
2568 }
2569
2570 static int32_t
2571 ultag_getExtensionsSize(const ULanguageTag* langtag) {
2572 int32_t size = 0;
2573 ExtensionListEntry *cur = langtag->extensions;
2574 while (TRUE) {
2575 if (cur == NULL) {
2576 break;
2577 }
2578 size++;
2579 cur = cur->next;
2580 }
2581 return size;
2582 }
2583
2584 static const char*
2585 ultag_getPrivateUse(const ULanguageTag* langtag) {
2586 return langtag->privateuse;
2587 }
2588
#if 0
/*
 * Currently compiled out.
 * Returns the grandfathered field recorded in the parsed tag.
 */
static const char*
ultag_getGrandfathered(const ULanguageTag* langtag) {
    return langtag->grandfathered;
}
#endif
2595
2596
2597 /*
2598 * -------------------------------------------------
2599 *
2600 * Locale/BCP47 conversion APIs, exposed as uloc_*
2601 *
2602 * -------------------------------------------------
2603 */
2604 U_CAPI int32_t U_EXPORT2
2605 uloc_toLanguageTag(const char* localeID,
2606 char* langtag,
2607 int32_t langtagCapacity,
2608 UBool strict,
2609 UErrorCode* status) {
2610 if (U_FAILURE(*status)) {
2611 return 0;
2612 }
2613
2614 icu::CheckedArrayByteSink sink(langtag, langtagCapacity);
2615 ulocimp_toLanguageTag(localeID, sink, strict, status);
2616
2617 int32_t reslen = sink.NumberOfBytesAppended();
2618
2619 if (U_FAILURE(*status)) {
2620 return reslen;
2621 }
2622
2623 if (sink.Overflowed()) {
2624 *status = U_BUFFER_OVERFLOW_ERROR;
2625 } else {
2626 u_terminateChars(langtag, langtagCapacity, reslen, status);
2627 }
2628
2629 return reslen;
2630 }
2631
2632
2633 U_CAPI void U_EXPORT2
2634 ulocimp_toLanguageTag(const char* localeID,
2635 icu::ByteSink& sink,
2636 UBool strict,
2637 UErrorCode* status) {
2638 icu::CharString canonical;
2639 int32_t reslen;
2640 UErrorCode tmpStatus = U_ZERO_ERROR;
2641 UBool hadPosix = FALSE;
2642 const char* pKeywordStart;
2643
2644 /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "". See #6835 */
2645 int32_t resultCapacity = static_cast<int32_t>(uprv_strlen(localeID));
2646 if (resultCapacity > 0) {
2647 char* buffer;
2648
2649 for (;;) {
2650 buffer = canonical.getAppendBuffer(
2651 /*minCapacity=*/resultCapacity,
2652 /*desiredCapacityHint=*/resultCapacity,
2653 resultCapacity,
2654 tmpStatus);
2655
2656 if (U_FAILURE(tmpStatus)) {
2657 *status = tmpStatus;
2658 return;
2659 }
2660
2661 reslen =
2662 uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus);
2663
2664 if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
2665 break;
2666 }
2667
2668 resultCapacity = reslen;
2669 tmpStatus = U_ZERO_ERROR;
2670 }
2671
2672 if (U_FAILURE(tmpStatus)) {
2673 *status = U_ILLEGAL_ARGUMENT_ERROR;
2674 return;
2675 }
2676
2677 canonical.append(buffer, reslen, tmpStatus);
2678 if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
2679 tmpStatus = U_ZERO_ERROR; // Terminators provided by CharString.
2680 }
2681
2682 if (U_FAILURE(tmpStatus)) {
2683 *status = tmpStatus;
2684 return;
2685 }
2686 }
2687
2688 /* For handling special case - private use only tag */
2689 pKeywordStart = locale_getKeywordsStart(canonical.data());
2690 if (pKeywordStart == canonical.data()) {
2691 int kwdCnt = 0;
2692 UBool done = FALSE;
2693
2694 icu::LocalUEnumerationPointer kwdEnum(uloc_openKeywords(canonical.data(), &tmpStatus));
2695 if (U_SUCCESS(tmpStatus)) {
2696 kwdCnt = uenum_count(kwdEnum.getAlias(), &tmpStatus);
2697 if (kwdCnt == 1) {
2698 const char *key;
2699 int32_t len = 0;
2700
2701 key = uenum_next(kwdEnum.getAlias(), &len, &tmpStatus);
2702 if (len == 1 && *key == PRIVATEUSE) {
2703 char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
2704 buf[0] = PRIVATEUSE;
2705 buf[1] = SEP;
2706 len = uloc_getKeywordValue(localeID, key, &buf[2], sizeof(buf) - 2, &tmpStatus);
2707 if (U_SUCCESS(tmpStatus)) {
2708 if (ultag_isPrivateuseValueSubtags(&buf[2], len)) {
2709 /* return private use only tag */
2710 sink.Append(buf, len + 2);
2711 done = TRUE;
2712 } else if (strict) {
2713 *status = U_ILLEGAL_ARGUMENT_ERROR;
2714 done = TRUE;
2715 }
2716 /* if not strict mode, then "und" will be returned */
2717 } else {
2718 *status = U_ILLEGAL_ARGUMENT_ERROR;
2719 done = TRUE;
2720 }
2721 }
2722 }
2723 if (done) {
2724 return;
2725 }
2726 }
2727 }
2728
2729 _appendLanguageToLanguageTag(canonical.data(), sink, strict, status);
2730 _appendScriptToLanguageTag(canonical.data(), sink, strict, status);
2731 _appendRegionToLanguageTag(canonical.data(), sink, strict, status);
2732 _appendVariantsToLanguageTag(canonical.data(), sink, strict, &hadPosix, status);
2733 _appendKeywordsToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
2734 _appendPrivateuseToLanguageTag(canonical.data(), sink, strict, hadPosix, status);
2735 }
2736
2737
2738 U_CAPI int32_t U_EXPORT2
2739 uloc_forLanguageTag(const char* langtag,
2740 char* localeID,
2741 int32_t localeIDCapacity,
2742 int32_t* parsedLength,
2743 UErrorCode* status) {
2744 if (U_FAILURE(*status)) {
2745 return 0;
2746 }
2747
2748 icu::CheckedArrayByteSink sink(localeID, localeIDCapacity);
2749 ulocimp_forLanguageTag(langtag, -1, sink, parsedLength, status);
2750
2751 int32_t reslen = sink.NumberOfBytesAppended();
2752
2753 if (U_FAILURE(*status)) {
2754 return reslen;
2755 }
2756
2757 if (sink.Overflowed()) {
2758 *status = U_BUFFER_OVERFLOW_ERROR;
2759 } else {
2760 u_terminateChars(localeID, localeIDCapacity, reslen, status);
2761 }
2762
2763 return reslen;
2764 }
2765
2766
2767 U_CAPI void U_EXPORT2
2768 ulocimp_forLanguageTag(const char* langtag,
2769 int32_t tagLen,
2770 icu::ByteSink& sink,
2771 int32_t* parsedLength,
2772 UErrorCode* status) {
2773 UBool isEmpty = TRUE;
2774 const char *subtag, *p;
2775 int32_t len;
2776 int32_t i, n;
2777 UBool noRegion = TRUE;
2778
2779 icu::LocalULanguageTagPointer lt(ultag_parse(langtag, tagLen, parsedLength, status));
2780 if (U_FAILURE(*status)) {
2781 return;
2782 }
2783
2784 /* language */
2785 subtag = ultag_getExtlangSize(lt.getAlias()) > 0 ? ultag_getExtlang(lt.getAlias(), 0) : ultag_getLanguage(lt.getAlias());
2786 if (uprv_compareInvCharsAsAscii(subtag, LANG_UND) != 0) {
2787 len = (int32_t)uprv_strlen(subtag);
2788 if (len > 0) {
2789 sink.Append(subtag, len);
2790 isEmpty = FALSE;
2791 }
2792 }
2793
2794 /* script */
2795 subtag = ultag_getScript(lt.getAlias());
2796 len = (int32_t)uprv_strlen(subtag);
2797 if (len > 0) {
2798 sink.Append("_", 1);
2799 isEmpty = FALSE;
2800
2801 /* write out the script in title case */
2802 char c = uprv_toupper(*subtag);
2803 sink.Append(&c, 1);
2804 sink.Append(subtag + 1, len - 1);
2805 }
2806
2807 /* region */
2808 subtag = ultag_getRegion(lt.getAlias());
2809 len = (int32_t)uprv_strlen(subtag);
2810 if (len > 0) {
2811 sink.Append("_", 1);
2812 isEmpty = FALSE;
2813
2814 /* write out the region in upper case */
2815 p = subtag;
2816 while (*p) {
2817 char c = uprv_toupper(*p);
2818 sink.Append(&c, 1);
2819 p++;
2820 }
2821 noRegion = FALSE;
2822 }
2823
2824 /* variants */
2825 n = ultag_getVariantsSize(lt.getAlias());
2826 if (n > 0) {
2827 if (noRegion) {
2828 sink.Append("_", 1);
2829 isEmpty = FALSE;
2830 }
2831
2832 for (i = 0; i < n; i++) {
2833 subtag = ultag_getVariant(lt.getAlias(), i);
2834 sink.Append("_", 1);
2835
2836 /* write out the variant in upper case */
2837 p = subtag;
2838 while (*p) {
2839 char c = uprv_toupper(*p);
2840 sink.Append(&c, 1);
2841 p++;
2842 }
2843 }
2844 }
2845
2846 /* keywords */
2847 n = ultag_getExtensionsSize(lt.getAlias());
2848 subtag = ultag_getPrivateUse(lt.getAlias());
2849 if (n > 0 || uprv_strlen(subtag) > 0) {
2850 if (isEmpty && n > 0) {
2851 /* need a language */
2852 sink.Append(LANG_UND, LANG_UND_LEN);
2853 }
2854 _appendKeywords(lt.getAlias(), sink, status);
2855 }
2856 }