1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2009-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
10 #include "unicode/bytestream.h"
11 #include "unicode/utypes.h"
12 #include "unicode/ures.h"
13 #include "unicode/localpointer.h"
14 #include "unicode/putil.h"
15 #include "unicode/uenum.h"
16 #include "unicode/uloc.h"
27 /* struct holding a single variant */
28 typedef struct VariantListEntry
{
30 struct VariantListEntry
*next
;
33 /* struct holding a single attribute value */
34 struct AttributeListEntry
: public icu::UMemory
{
35 const char *attribute
;
36 struct AttributeListEntry
*next
;
39 /* struct holding a single extension */
40 struct ExtensionListEntry
: public icu::UMemory
{
43 struct ExtensionListEntry
*next
;
47 typedef struct ULanguageTag
{
48 char *buf
; /* holding parsed subtags */
50 const char *extlang
[MAXEXTLANG
];
53 VariantListEntry
*variants
;
54 ExtensionListEntry
*extensions
;
55 const char *privateuse
;
56 const char *grandfathered
;
61 #define PRIVATEUSE 'x'
64 #define LOCALE_SEP '_'
65 #define LOCALE_EXT_SEP '@'
66 #define LOCALE_KEYWORD_SEP ';'
67 #define LOCALE_KEY_TYPE_SEP '='
69 #define ISALPHA(c) uprv_isASCIILetter(c)
70 #define ISNUMERIC(c) ((c)>='0' && (c)<='9')
72 static const char EMPTY
[] = "";
73 static const char LANG_UND
[] = "und";
74 static const char PRIVATEUSE_KEY
[] = "x";
75 static const char _POSIX
[] = "_POSIX";
76 static const char POSIX_KEY
[] = "va";
77 static const char POSIX_VALUE
[] = "posix";
78 static const char LOCALE_ATTRIBUTE_KEY
[] = "attribute";
79 static const char PRIVUSE_VARIANT_PREFIX
[] = "lvariant";
80 static const char LOCALE_TYPE_YES
[] = "yes";
82 #define LANG_UND_LEN 3
85 Updated on 2018-09-12 from
86 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
88 This table has 2 parts. The part for Grandfathered tags is generated by the
89 following scripts from the IANA language tag registry.
91 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
92 egrep -A 7 'Type: grandfathered' | \
93 egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \
94 awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\
98 The 2nd part is made of five ICU-specific entries. They're kept for
99 the backward compatibility for now, even though there are no preferred
100 values. They may have to be removed for the strict BCP 47 compliance.
103 static const char* const GRANDFATHERED
[] = {
104 /* grandfathered preferred */
106 "en-gb-oed", "en-gb-oxendict",
127 // Grandfathered tags with no preferred value in the IANA
128 // registry. Kept for now for the backward compatibility
129 // because ICU has mapped them this way.
130 "cel-gaulish", "xtg-x-cel-gaulish",
131 "i-default", "en-x-i-default",
132 "i-enochian", "und-x-i-enochian",
133 "i-mingo", "see-x-i-mingo",
134 "zh-min", "nan-x-zh-min",
138 Updated on 2018-09-12 from
139 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
141 The table lists redundant tags with preferred value in the IANA language tag registry.
142 It's generated with the following command:
144 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
145 grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \
146 awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \
149 In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because
150 a variant tag 'hepburn-heploc' has the preferred subtag, 'alalc97'.
153 static const char* const REDUNDANT
[] = {
154 // redundant preferred
175 "zh-cmn-hans", "cmn-hans",
176 "zh-cmn-hant", "cmn-hant",
181 // variant tag with preferred value
182 "ja-latn-hepburn-heploc", "ja-latn-alalc97",
186 Updated on 2018-09-12 from
187 https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
189 grep 'Type: language' -A 7 language-subtag-registry | egrep 'Subtag|Prefe' | \
190 grep -B1 'Preferred' | grep -v '^--' | \
191 awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
193 Make sure that 2-letter language subtags come before 3-letter subtags.
195 static const char DEPRECATEDLANGS
[][4] = {
278 Updated on 2018-04-24 from
280 curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \
281 grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \
282 grep -B1 'Preferred' | \
283 awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
285 static const char DEPRECATEDREGIONS
[][3] = {
296 * -------------------------------------------------
298 * These ultag_ functions may be exposed as APIs later
300 * -------------------------------------------------
304 ultag_parse(const char* tag
, int32_t tagLen
, int32_t* parsedLen
, UErrorCode
* status
);
307 ultag_close(ULanguageTag
* langtag
);
310 ultag_getLanguage(const ULanguageTag
* langtag
);
314 ultag_getJDKLanguage(const ULanguageTag
* langtag
);
318 ultag_getExtlang(const ULanguageTag
* langtag
, int32_t idx
);
321 ultag_getExtlangSize(const ULanguageTag
* langtag
);
324 ultag_getScript(const ULanguageTag
* langtag
);
327 ultag_getRegion(const ULanguageTag
* langtag
);
330 ultag_getVariant(const ULanguageTag
* langtag
, int32_t idx
);
333 ultag_getVariantsSize(const ULanguageTag
* langtag
);
336 ultag_getExtensionKey(const ULanguageTag
* langtag
, int32_t idx
);
339 ultag_getExtensionValue(const ULanguageTag
* langtag
, int32_t idx
);
342 ultag_getExtensionsSize(const ULanguageTag
* langtag
);
345 ultag_getPrivateUse(const ULanguageTag
* langtag
);
349 ultag_getGrandfathered(const ULanguageTag
* langtag
);
355 * \class LocalULanguageTagPointer
356 * "Smart pointer" class, closes a ULanguageTag via ultag_close().
357 * For most methods see the LocalPointerBase base class.
359 * @see LocalPointerBase
363 U_DEFINE_LOCAL_OPEN_POINTER(LocalULanguageTagPointer
, ULanguageTag
, ultag_close
);
368 * -------------------------------------------------
370 * Language subtag syntax validation functions
372 * -------------------------------------------------
376 _isAlphaString(const char* s
, int32_t len
) {
378 for (i
= 0; i
< len
; i
++) {
379 if (!ISALPHA(*(s
+ i
))) {
387 _isNumericString(const char* s
, int32_t len
) {
389 for (i
= 0; i
< len
; i
++) {
390 if (!ISNUMERIC(*(s
+ i
))) {
398 _isAlphaNumericString(const char* s
, int32_t len
) {
400 for (i
= 0; i
< len
; i
++) {
401 if (!ISALPHA(*(s
+ i
)) && !ISNUMERIC(*(s
+ i
))) {
409 _isAlphaNumericStringLimitedLength(const char* s
, int32_t len
, int32_t min
, int32_t max
) {
411 len
= (int32_t)uprv_strlen(s
);
413 if (len
>= min
&& len
<= max
&& _isAlphaNumericString(s
, len
)) {
420 ultag_isLanguageSubtag(const char* s
, int32_t len
) {
422 * unicode_language_subtag = alpha{2,3} | alpha{5,8};
423 * NOTE: Per ICUTC 2019/01/23- accepting alpha 4
427 len
= (int32_t)uprv_strlen(s
);
429 if (len
>= 2 && len
<= 8 && _isAlphaString(s
, len
)) {
436 _isExtlangSubtag(const char* s
, int32_t len
) {
438 * extlang = 3ALPHA ; selected ISO 639 codes
439 * *2("-" 3ALPHA) ; permanently reserved
442 len
= (int32_t)uprv_strlen(s
);
444 if (len
== 3 && _isAlphaString(s
, len
)) {
451 ultag_isScriptSubtag(const char* s
, int32_t len
) {
453 * script = 4ALPHA ; ISO 15924 code
456 len
= (int32_t)uprv_strlen(s
);
458 if (len
== 4 && _isAlphaString(s
, len
)) {
465 ultag_isRegionSubtag(const char* s
, int32_t len
) {
467 * region = 2ALPHA ; ISO 3166-1 code
468 * / 3DIGIT ; UN M.49 code
471 len
= (int32_t)uprv_strlen(s
);
473 if (len
== 2 && _isAlphaString(s
, len
)) {
476 if (len
== 3 && _isNumericString(s
, len
)) {
483 _isVariantSubtag(const char* s
, int32_t len
) {
485 * variant = 5*8alphanum ; registered variants
486 * / (DIGIT 3alphanum)
489 len
= (int32_t)uprv_strlen(s
);
491 if (_isAlphaNumericStringLimitedLength(s
, len
, 5, 8)) {
494 if (len
== 4 && ISNUMERIC(*s
) && _isAlphaNumericString(s
+ 1, 3)) {
501 _isSepListOf(UBool (*test
)(const char*, int32_t), const char* s
, int32_t len
) {
503 const char *pSubtag
= NULL
;
506 len
= (int32_t)uprv_strlen(s
);
509 while ((p
- s
) < len
) {
511 if (pSubtag
== NULL
) {
514 if (!test(pSubtag
, (int32_t)(p
- pSubtag
))) {
518 } else if (pSubtag
== NULL
) {
523 if (pSubtag
== NULL
) {
526 return test(pSubtag
, (int32_t)(p
- pSubtag
));
530 ultag_isVariantSubtags(const char* s
, int32_t len
) {
531 return _isSepListOf(&_isVariantSubtag
, s
, len
);
534 // This is for the ICU-specific "lvariant" handling.
536 _isPrivateuseVariantSubtag(const char* s
, int32_t len
) {
538 * variant = 1*8alphanum ; registered variants
539 * / (DIGIT 3alphanum)
541 return _isAlphaNumericStringLimitedLength(s
, len
, 1, 8);
545 _isExtensionSingleton(const char* s
, int32_t len
) {
547 * extension = singleton 1*("-" (2*8alphanum))
549 * singleton = DIGIT ; 0 - 9
556 len
= (int32_t)uprv_strlen(s
);
558 if (len
== 1 && (ISALPHA(*s
) || ISNUMERIC(*s
)) && (uprv_tolower(*s
) != PRIVATEUSE
)) {
565 _isExtensionSubtag(const char* s
, int32_t len
) {
567 * extension = singleton 1*("-" (2*8alphanum))
569 return _isAlphaNumericStringLimitedLength(s
, len
, 2, 8);
573 ultag_isExtensionSubtags(const char* s
, int32_t len
) {
574 return _isSepListOf(&_isExtensionSubtag
, s
, len
);
578 _isPrivateuseValueSubtag(const char* s
, int32_t len
) {
580 * privateuse = "x" 1*("-" (1*8alphanum))
582 return _isAlphaNumericStringLimitedLength(s
, len
, 1, 8);
586 ultag_isPrivateuseValueSubtags(const char* s
, int32_t len
) {
587 return _isSepListOf(&_isPrivateuseValueSubtag
, s
, len
);
591 ultag_isUnicodeLocaleAttribute(const char* s
, int32_t len
) {
593 * attribute = alphanum{3,8} ;
595 return _isAlphaNumericStringLimitedLength(s
, len
, 3, 8);
599 ultag_isUnicodeLocaleAttributes(const char* s
, int32_t len
) {
600 return _isSepListOf(&ultag_isUnicodeLocaleAttribute
, s
, len
);
604 ultag_isUnicodeLocaleKey(const char* s
, int32_t len
) {
606 * key = alphanum alpha ;
609 len
= (int32_t)uprv_strlen(s
);
611 if (len
== 2 && (ISALPHA(*s
) || ISNUMERIC(*s
)) && ISALPHA(s
[1])) {
618 _isUnicodeLocaleTypeSubtag(const char*s
, int32_t len
) {
622 return _isAlphaNumericStringLimitedLength(s
, len
, 3, 8);
626 ultag_isUnicodeLocaleType(const char*s
, int32_t len
) {
628 * type = alphanum{3,8} (sep alphanum{3,8})* ;
630 return _isSepListOf(&_isUnicodeLocaleTypeSubtag
, s
, len
);
634 _isTKey(const char* s
, int32_t len
)
637 * tkey = alpha digit ;
640 len
= (int32_t)uprv_strlen(s
);
642 if (len
== 2 && ISALPHA(*s
) && ISNUMERIC(*(s
+ 1))) {
649 _isTValue(const char* s
, int32_t len
)
652 * tvalue = (sep alphanum{3,8})+ ;
654 return _isAlphaNumericStringLimitedLength(s
, len
, 3, 8);
658 _isTransformedExtensionSubtag(int32_t& state
, const char* s
, int32_t len
)
660 const int32_t kStart
= 0; // Start, wait for unicode_language_subtag, tkey or end
661 const int32_t kGotLanguage
= 1; // Got unicode_language_subtag, wait for unicode_script_subtag,
662 // unicode_region_subtag, unicode_variant_subtag, tkey or end
663 const int32_t kGotScript
= 2; // Got unicode_script_subtag, wait for unicode_region_subtag,
664 // unicode_variant_subtag, tkey, or end
665 const int32_t kGotRegion
= 3; // Got unicode_region_subtag, wait for unicode_variant_subtag,
667 const int32_t kGotVariant
= 4; // Got unicode_variant_subtag, wait for unicode_variant_subtag
669 const int32_t kGotTKey
= -1; // Got tkey, wait for tvalue. ERROR if stop here.
670 const int32_t kGotTValue
= 6; // Got tvalue, wait for tkey, tvalue or end
674 if (ultag_isLanguageSubtag(s
, len
)) {
675 state
= kGotLanguage
;
678 if (_isTKey(s
, len
)) {
684 if (ultag_isScriptSubtag(s
, len
)) {
690 if (ultag_isRegionSubtag(s
, len
)) {
698 if (_isVariantSubtag(s
, len
)) {
702 if (_isTKey(s
, len
)) {
708 if (_isTValue(s
, len
)) {
714 if (_isTKey(s
, len
)) {
718 if (_isTValue(s
, len
)) {
727 _isUnicodeExtensionSubtag(int32_t& state
, const char* s
, int32_t len
)
729 const int32_t kStart
= 0; // Start, wait for a key or attribute or end
730 const int32_t kGotKey
= 1; // Got a key, wait for type or key or end
731 const int32_t kGotType
= 2; // Got a type, wait for key or end
735 if (ultag_isUnicodeLocaleKey(s
, len
)) {
739 if (ultag_isUnicodeLocaleAttribute(s
, len
)) {
744 if (ultag_isUnicodeLocaleKey(s
, len
)) {
747 if (_isUnicodeLocaleTypeSubtag(s
, len
)) {
753 if (ultag_isUnicodeLocaleKey(s
, len
)) {
757 if (_isUnicodeLocaleTypeSubtag(s
, len
)) {
766 _isStatefulSepListOf(UBool (*test
)(int32_t&, const char*, int32_t), const char* s
, int32_t len
)
770 const char* start
= s
;
771 int32_t subtagLen
= 0;
774 len
= (int32_t)uprv_strlen(s
);
777 for (p
= s
; len
> 0; p
++, len
--) {
779 if (!test(state
, start
, subtagLen
)) {
789 if (test(state
, start
, subtagLen
) && state
>= 0) {
796 ultag_isTransformedExtensionSubtags(const char* s
, int32_t len
)
798 return _isStatefulSepListOf(&_isTransformedExtensionSubtag
, s
, len
);
802 ultag_isUnicodeExtensionSubtags(const char* s
, int32_t len
) {
803 return _isStatefulSepListOf(&_isUnicodeExtensionSubtag
, s
, len
);
808 * -------------------------------------------------
812 * -------------------------------------------------
816 _addVariantToList(VariantListEntry
**first
, VariantListEntry
*var
) {
819 if (*first
== NULL
) {
823 VariantListEntry
*prev
, *cur
;
826 /* variants order should be preserved */
836 /* Checking for duplicate variant */
837 cmp
= uprv_compareInvCharsAsAscii(var
->variant
, cur
->variant
);
839 /* duplicated variant */
852 _addAttributeToList(AttributeListEntry
**first
, AttributeListEntry
*attr
) {
855 if (*first
== NULL
) {
859 AttributeListEntry
*prev
, *cur
;
862 /* reorder attributes in alphabetical order */
871 cmp
= uprv_compareInvCharsAsAscii(attr
->attribute
, cur
->attribute
);
882 /* duplicated attribute */
896 _addExtensionToList(ExtensionListEntry
**first
, ExtensionListEntry
*ext
, UBool localeToBCP
) {
899 if (*first
== NULL
) {
903 ExtensionListEntry
*prev
, *cur
;
906 /* reorder extensions in alphabetical order */
916 /* special handling for locale to bcp conversion */
919 len
= (int32_t)uprv_strlen(ext
->key
);
920 curlen
= (int32_t)uprv_strlen(cur
->key
);
922 if (len
== 1 && curlen
== 1) {
923 if (*(ext
->key
) == *(cur
->key
)) {
925 } else if (*(ext
->key
) == PRIVATEUSE
) {
927 } else if (*(cur
->key
) == PRIVATEUSE
) {
930 cmp
= *(ext
->key
) - *(cur
->key
);
932 } else if (len
== 1) {
933 cmp
= *(ext
->key
) - LDMLEXT
;
934 } else if (curlen
== 1) {
935 cmp
= LDMLEXT
- *(cur
->key
);
937 cmp
= uprv_compareInvCharsAsAscii(ext
->key
, cur
->key
);
938 /* Both are u extension keys - we need special handling for 'attribute' */
940 if (uprv_strcmp(cur
->key
, LOCALE_ATTRIBUTE_KEY
) == 0) {
942 } else if (uprv_strcmp(ext
->key
, LOCALE_ATTRIBUTE_KEY
) == 0) {
948 cmp
= uprv_compareInvCharsAsAscii(ext
->key
, cur
->key
);
960 /* duplicated extension key */
973 _initializeULanguageTag(ULanguageTag
* langtag
) {
978 langtag
->language
= EMPTY
;
979 for (i
= 0; i
< MAXEXTLANG
; i
++) {
980 langtag
->extlang
[i
] = NULL
;
983 langtag
->script
= EMPTY
;
984 langtag
->region
= EMPTY
;
986 langtag
->variants
= NULL
;
987 langtag
->extensions
= NULL
;
989 langtag
->grandfathered
= EMPTY
;
990 langtag
->privateuse
= EMPTY
;
994 _appendLanguageToLanguageTag(const char* localeID
, icu::ByteSink
& sink
, UBool strict
, UErrorCode
* status
) {
995 char buf
[ULOC_LANG_CAPACITY
];
996 UErrorCode tmpStatus
= U_ZERO_ERROR
;
999 if (U_FAILURE(*status
)) {
1003 len
= uloc_getLanguage(localeID
, buf
, sizeof(buf
), &tmpStatus
);
1004 if (U_FAILURE(tmpStatus
) || tmpStatus
== U_STRING_NOT_TERMINATED_WARNING
) {
1006 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1012 /* Note: returned language code is in lower case letters */
1015 sink
.Append(LANG_UND
, LANG_UND_LEN
);
1016 } else if (!ultag_isLanguageSubtag(buf
, len
)) {
1017 /* invalid language code */
1019 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1022 sink
.Append(LANG_UND
, LANG_UND_LEN
);
1024 /* resolve deprecated */
1025 for (i
= 0; i
< UPRV_LENGTHOF(DEPRECATEDLANGS
); i
+= 2) {
1026 // 2-letter deprecated subtags are listed before 3-letter
1027 // ones in DEPRECATEDLANGS[]. Get out of loop on coming
1028 // across the 1st 3-letter subtag, if the input is a 2-letter code.
1029 // to avoid continuing to try when there's no match.
1030 if (uprv_strlen(buf
) < uprv_strlen(DEPRECATEDLANGS
[i
])) break;
1031 if (uprv_compareInvCharsAsAscii(buf
, DEPRECATEDLANGS
[i
]) == 0) {
1032 uprv_strcpy(buf
, DEPRECATEDLANGS
[i
+ 1]);
1033 len
= (int32_t)uprv_strlen(buf
);
1037 sink
.Append(buf
, len
);
1042 _appendScriptToLanguageTag(const char* localeID
, icu::ByteSink
& sink
, UBool strict
, UErrorCode
* status
) {
1043 char buf
[ULOC_SCRIPT_CAPACITY
];
1044 UErrorCode tmpStatus
= U_ZERO_ERROR
;
1047 if (U_FAILURE(*status
)) {
1051 len
= uloc_getScript(localeID
, buf
, sizeof(buf
), &tmpStatus
);
1052 if (U_FAILURE(tmpStatus
) || tmpStatus
== U_STRING_NOT_TERMINATED_WARNING
) {
1054 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1060 if (!ultag_isScriptSubtag(buf
, len
)) {
1061 /* invalid script code */
1063 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1067 sink
.Append("-", 1);
1068 sink
.Append(buf
, len
);
1074 _appendRegionToLanguageTag(const char* localeID
, icu::ByteSink
& sink
, UBool strict
, UErrorCode
* status
) {
1075 char buf
[ULOC_COUNTRY_CAPACITY
];
1076 UErrorCode tmpStatus
= U_ZERO_ERROR
;
1079 if (U_FAILURE(*status
)) {
1083 len
= uloc_getCountry(localeID
, buf
, sizeof(buf
), &tmpStatus
);
1084 if (U_FAILURE(tmpStatus
) || tmpStatus
== U_STRING_NOT_TERMINATED_WARNING
) {
1086 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1092 if (!ultag_isRegionSubtag(buf
, len
)) {
1093 /* invalid region code */
1095 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1099 sink
.Append("-", 1);
1100 /* resolve deprecated */
1101 for (int i
= 0; i
< UPRV_LENGTHOF(DEPRECATEDREGIONS
); i
+= 2) {
1102 if (uprv_compareInvCharsAsAscii(buf
, DEPRECATEDREGIONS
[i
]) == 0) {
1103 uprv_strcpy(buf
, DEPRECATEDREGIONS
[i
+ 1]);
1104 len
= (int32_t)uprv_strlen(buf
);
1108 sink
.Append(buf
, len
);
1114 _appendVariantsToLanguageTag(const char* localeID
, icu::ByteSink
& sink
, UBool strict
, UBool
*hadPosix
, UErrorCode
* status
) {
1115 char buf
[ULOC_FULLNAME_CAPACITY
];
1116 UErrorCode tmpStatus
= U_ZERO_ERROR
;
1119 if (U_FAILURE(*status
)) {
1123 len
= uloc_getVariant(localeID
, buf
, sizeof(buf
), &tmpStatus
);
1124 if (U_FAILURE(tmpStatus
) || tmpStatus
== U_STRING_NOT_TERMINATED_WARNING
) {
1126 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1134 VariantListEntry
*var
;
1135 VariantListEntry
*varFirst
= NULL
;
1140 if (*p
== SEP
|| *p
== LOCALE_SEP
|| *p
== 0) {
1144 *p
= 0; /* terminate */
1148 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1151 /* ignore empty variant */
1153 /* ICU uses upper case letters for variants, but
1154 the canonical format is lowercase in BCP47 */
1155 for (i
= 0; *(pVar
+ i
) != 0; i
++) {
1156 *(pVar
+ i
) = uprv_tolower(*(pVar
+ i
));
1160 if (_isVariantSubtag(pVar
, -1)) {
1161 if (uprv_strcmp(pVar
,POSIX_VALUE
) || len
!= (int32_t)uprv_strlen(POSIX_VALUE
)) {
1162 /* emit the variant to the list */
1163 var
= (VariantListEntry
*)uprv_malloc(sizeof(VariantListEntry
));
1165 *status
= U_MEMORY_ALLOCATION_ERROR
;
1168 var
->variant
= pVar
;
1169 if (!_addVariantToList(&varFirst
, var
)) {
1170 /* duplicated variant */
1173 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1178 /* Special handling for POSIX variant, need to remember that we had it and then */
1179 /* treat it like an extension later. */
1182 } else if (strict
) {
1183 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1185 } else if (_isPrivateuseValueSubtag(pVar
, -1)) {
1186 /* Handle private use subtags separately */
1190 /* reset variant starting position */
1192 } else if (pVar
== NULL
) {
1198 if (U_SUCCESS(*status
)) {
1199 if (varFirst
!= NULL
) {
1202 /* write out validated/normalized variants to the target */
1204 while (var
!= NULL
) {
1205 sink
.Append("-", 1);
1206 varLen
= (int32_t)uprv_strlen(var
->variant
);
1207 sink
.Append(var
->variant
, varLen
);
1215 while (var
!= NULL
) {
1216 VariantListEntry
*tmpVar
= var
->next
;
1221 if (U_FAILURE(*status
)) {
1228 _appendKeywordsToLanguageTag(const char* localeID
, icu::ByteSink
& sink
, UBool strict
, UBool hadPosix
, UErrorCode
* status
) {
1229 char attrBuf
[ULOC_KEYWORD_AND_VALUES_CAPACITY
] = { 0 };
1230 int32_t attrBufLength
= 0;
1232 icu::MemoryPool
<AttributeListEntry
> attrPool
;
1233 icu::MemoryPool
<ExtensionListEntry
> extPool
;
1234 icu::MemoryPool
<icu::CharString
> strPool
;
1236 icu::LocalUEnumerationPointer
keywordEnum(uloc_openKeywords(localeID
, status
));
1237 if (U_FAILURE(*status
) && !hadPosix
) {
1240 if (keywordEnum
.isValid() || hadPosix
) {
1241 /* reorder extensions */
1244 ExtensionListEntry
*firstExt
= NULL
;
1245 ExtensionListEntry
*ext
;
1246 AttributeListEntry
*firstAttr
= NULL
;
1247 AttributeListEntry
*attr
;
1248 icu::MemoryPool
<icu::CharString
> extBufPool
;
1249 const char *bcpKey
=nullptr, *bcpValue
=nullptr;
1250 UErrorCode tmpStatus
= U_ZERO_ERROR
;
1255 icu::CharString buf
;
1256 key
= uenum_next(keywordEnum
.getAlias(), NULL
, status
);
1261 int32_t resultCapacity
= ULOC_KEYWORD_AND_VALUES_CAPACITY
;
1264 buffer
= buf
.getAppendBuffer(
1265 /*minCapacity=*/resultCapacity
,
1266 /*desiredCapacityHint=*/resultCapacity
,
1270 if (U_FAILURE(tmpStatus
)) {
1274 len
= uloc_getKeywordValue(
1275 localeID
, key
, buffer
, resultCapacity
, &tmpStatus
);
1277 if (tmpStatus
!= U_BUFFER_OVERFLOW_ERROR
) {
1281 resultCapacity
= len
;
1282 tmpStatus
= U_ZERO_ERROR
;
1285 if (U_FAILURE(tmpStatus
)) {
1286 if (tmpStatus
== U_MEMORY_ALLOCATION_ERROR
) {
1287 *status
= U_MEMORY_ALLOCATION_ERROR
;
1291 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1294 /* ignore this keyword */
1295 tmpStatus
= U_ZERO_ERROR
;
1299 buf
.append(buffer
, len
, tmpStatus
);
1300 if (tmpStatus
== U_STRING_NOT_TERMINATED_WARNING
) {
1301 tmpStatus
= U_ZERO_ERROR
; // Terminators provided by CharString.
1304 keylen
= (int32_t)uprv_strlen(key
);
1305 isBcpUExt
= (keylen
> 1);
1307 /* special keyword used for representing Unicode locale attributes */
1308 if (uprv_strcmp(key
, LOCALE_ATTRIBUTE_KEY
) == 0) {
1313 for (; i
< len
; i
++) {
1314 if (buf
[i
] != '-') {
1315 attrBuf
[attrBufLength
++] = buf
[i
];
1321 if (attrBufLength
> 0) {
1322 attrBuf
[attrBufLength
] = 0;
1324 } else if (i
>= len
){
1328 /* create AttributeListEntry */
1329 attr
= attrPool
.create();
1331 *status
= U_MEMORY_ALLOCATION_ERROR
;
1334 icu::CharString
* attrValue
=
1335 strPool
.create(attrBuf
, attrBufLength
, *status
);
1336 if (attrValue
== NULL
) {
1337 *status
= U_MEMORY_ALLOCATION_ERROR
;
1340 if (U_FAILURE(*status
)) {
1343 attr
->attribute
= attrValue
->data();
1345 if (!_addAttributeToList(&firstAttr
, attr
)) {
1347 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1352 /* for a place holder ExtensionListEntry */
1353 bcpKey
= LOCALE_ATTRIBUTE_KEY
;
1356 } else if (isBcpUExt
) {
1357 bcpKey
= uloc_toUnicodeLocaleKey(key
);
1358 if (bcpKey
== NULL
) {
1360 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1366 /* we've checked buf is null-terminated above */
1367 bcpValue
= uloc_toUnicodeLocaleType(key
, buf
.data());
1368 if (bcpValue
== NULL
) {
1370 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1375 if (bcpValue
== buf
.data()) {
1377 When uloc_toUnicodeLocaleType(key, buf) returns the
1378 input value as is, the value is well-formed, but has
1379 no known mapping. This implementation normalizes the
1382 icu::CharString
* extBuf
= extBufPool
.create();
1383 if (extBuf
== nullptr) {
1384 *status
= U_MEMORY_ALLOCATION_ERROR
;
1387 int32_t bcpValueLen
= static_cast<int32_t>(uprv_strlen(bcpValue
));
1388 int32_t resultCapacity
;
1389 char* pExtBuf
= extBuf
->getAppendBuffer(
1390 /*minCapacity=*/bcpValueLen
,
1391 /*desiredCapacityHint=*/bcpValueLen
,
1394 if (U_FAILURE(tmpStatus
)) {
1395 *status
= tmpStatus
;
1399 uprv_strcpy(pExtBuf
, bcpValue
);
1400 T_CString_toLowerCase(pExtBuf
);
1402 extBuf
->append(pExtBuf
, bcpValueLen
, tmpStatus
);
1403 if (U_FAILURE(tmpStatus
)) {
1404 *status
= tmpStatus
;
1408 bcpValue
= extBuf
->data();
1411 if (*key
== PRIVATEUSE
) {
1412 if (!ultag_isPrivateuseValueSubtags(buf
.data(), len
)) {
1414 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1420 if (!_isExtensionSingleton(key
, keylen
) || !ultag_isExtensionSubtags(buf
.data(), len
)) {
1422 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1429 icu::CharString
* extBuf
=
1430 extBufPool
.create(buf
.data(), len
, tmpStatus
);
1431 if (extBuf
== nullptr) {
1432 *status
= U_MEMORY_ALLOCATION_ERROR
;
1435 if (U_FAILURE(tmpStatus
)) {
1436 *status
= tmpStatus
;
1439 bcpValue
= extBuf
->data();
1442 /* create ExtensionListEntry */
1443 ext
= extPool
.create();
1445 *status
= U_MEMORY_ALLOCATION_ERROR
;
1449 ext
->value
= bcpValue
;
1451 if (!_addExtensionToList(&firstExt
, ext
, TRUE
)) {
1453 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1459 /* Special handling for POSIX variant - add the keywords for POSIX */
1461 /* create ExtensionListEntry for POSIX */
1462 ext
= extPool
.create();
1464 *status
= U_MEMORY_ALLOCATION_ERROR
;
1467 ext
->key
= POSIX_KEY
;
1468 ext
->value
= POSIX_VALUE
;
1470 if (!_addExtensionToList(&firstExt
, ext
, TRUE
)) {
1471 // Silently ignore errors.
1475 if (U_SUCCESS(*status
) && (firstExt
!= NULL
|| firstAttr
!= NULL
)) {
1476 UBool startLDMLExtension
= FALSE
;
1477 for (ext
= firstExt
; ext
; ext
= ext
->next
) {
1478 if (!startLDMLExtension
&& uprv_strlen(ext
->key
) > 1) {
1479 /* first LDML u singleton extension */
1480 sink
.Append("-u", 2);
1481 startLDMLExtension
= TRUE
;
1484 /* write out the sorted BCP47 attributes, extensions and private use */
1485 if (uprv_strcmp(ext
->key
, LOCALE_ATTRIBUTE_KEY
) == 0) {
1486 /* write the value for the attributes */
1487 for (attr
= firstAttr
; attr
; attr
= attr
->next
) {
1488 sink
.Append("-", 1);
1490 attr
->attribute
, static_cast<int32_t>(uprv_strlen(attr
->attribute
)));
1493 sink
.Append("-", 1);
1494 sink
.Append(ext
->key
, static_cast<int32_t>(uprv_strlen(ext
->key
)));
1495 sink
.Append("-", 1);
1496 sink
.Append(ext
->value
, static_cast<int32_t>(uprv_strlen(ext
->value
)));
1504 * Append keywords parsed from LDML extension value
1505 * e.g. "u-ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional}
1506 * Note: char* buf is used for storing keywords
1509 _appendLDMLExtensionAsKeywords(const char* ldmlext
, ExtensionListEntry
** appendTo
, icu::MemoryPool
<ExtensionListEntry
>& extPool
, icu::MemoryPool
<icu::CharString
>& kwdBuf
, UBool
*posixVariant
, UErrorCode
*status
) {
1510 const char *pTag
; /* beginning of current subtag */
1511 const char *pKwds
; /* beginning of key-type pairs */
1512 UBool variantExists
= *posixVariant
;
1514 ExtensionListEntry
*kwdFirst
= NULL
; /* first LDML keyword */
1515 ExtensionListEntry
*kwd
, *nextKwd
;
1519 /* Reset the posixVariant value */
1520 *posixVariant
= FALSE
;
1526 AttributeListEntry
*attrFirst
= NULL
; /* first attribute */
1527 AttributeListEntry
*attr
, *nextAttr
;
1529 char attrBuf
[ULOC_KEYWORD_AND_VALUES_CAPACITY
];
1530 int32_t attrBufIdx
= 0;
1532 icu::MemoryPool
<AttributeListEntry
> attrPool
;
1534 /* Iterate through u extension attributes */
1536 /* locate next separator char */
1537 for (len
= 0; *(pTag
+ len
) && *(pTag
+ len
) != SEP
; len
++);
1539 if (ultag_isUnicodeLocaleKey(pTag
, len
)) {
1544 /* add this attribute to the list */
1545 attr
= attrPool
.create();
1547 *status
= U_MEMORY_ALLOCATION_ERROR
;
1551 if (len
< (int32_t)sizeof(attrBuf
) - attrBufIdx
) {
1552 uprv_memcpy(&attrBuf
[attrBufIdx
], pTag
, len
);
1553 attrBuf
[attrBufIdx
+ len
] = 0;
1554 attr
->attribute
= &attrBuf
[attrBufIdx
];
1555 attrBufIdx
+= (len
+ 1);
1557 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1561 if (!_addAttributeToList(&attrFirst
, attr
)) {
1562 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1569 /* next to the separator */
1575 /* emit attributes as an LDML keyword, e.g. attribute=attr1-attr2 */
1577 kwd
= extPool
.create();
1579 *status
= U_MEMORY_ALLOCATION_ERROR
;
1583 icu::CharString
* value
= kwdBuf
.create();
1584 if (value
== NULL
) {
1585 *status
= U_MEMORY_ALLOCATION_ERROR
;
1589 /* attribute subtags sorted in alphabetical order as type */
1591 while (attr
!= NULL
) {
1592 nextAttr
= attr
->next
;
1593 if (attr
!= attrFirst
) {
1594 value
->append('-', *status
);
1596 value
->append(attr
->attribute
, *status
);
1599 if (U_FAILURE(*status
)) {
1603 kwd
->key
= LOCALE_ATTRIBUTE_KEY
;
1604 kwd
->value
= value
->data();
1606 if (!_addExtensionToList(&kwdFirst
, kwd
, FALSE
)) {
1607 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1614 const char *pBcpKey
= NULL
; /* u extension key subtag */
1615 const char *pBcpType
= NULL
; /* beginning of u extension type subtag(s) */
1616 int32_t bcpKeyLen
= 0;
1617 int32_t bcpTypeLen
= 0;
1618 UBool isDone
= FALSE
;
1621 /* BCP47 representation of LDML key/type pairs */
1623 const char *pNextBcpKey
= NULL
;
1624 int32_t nextBcpKeyLen
= 0;
1625 UBool emitKeyword
= FALSE
;
1628 /* locate next separator char */
1629 for (len
= 0; *(pTag
+ len
) && *(pTag
+ len
) != SEP
; len
++);
1631 if (ultag_isUnicodeLocaleKey(pTag
, len
)) {
1635 nextBcpKeyLen
= len
;
1641 U_ASSERT(pBcpKey
!= NULL
);
1642 /* within LDML type subtags */
1644 bcpTypeLen
+= (len
+ 1);
1654 /* next to the separator */
1658 /* processing last one */
1664 const char *pKey
= NULL
; /* LDML key */
1665 const char *pType
= NULL
; /* LDML type */
1667 char bcpKeyBuf
[9]; /* BCP key length is always 2 for now */
1669 U_ASSERT(pBcpKey
!= NULL
);
1671 if (bcpKeyLen
>= (int32_t)sizeof(bcpKeyBuf
)) {
1672 /* the BCP key is invalid */
1673 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1677 uprv_strncpy(bcpKeyBuf
, pBcpKey
, bcpKeyLen
);
1678 bcpKeyBuf
[bcpKeyLen
] = 0;
1680 /* u extension key to LDML key */
1681 pKey
= uloc_toLegacyKey(bcpKeyBuf
);
1683 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1686 if (pKey
== bcpKeyBuf
) {
1688 The key returned by toLegacyKey points to the input buffer.
1689 We normalize the result key to lower case.
1691 T_CString_toLowerCase(bcpKeyBuf
);
1692 icu::CharString
* key
= kwdBuf
.create(bcpKeyBuf
, bcpKeyLen
, *status
);
1694 *status
= U_MEMORY_ALLOCATION_ERROR
;
1697 if (U_FAILURE(*status
)) {
1704 char bcpTypeBuf
[128]; /* practically long enough even considering multiple subtag type */
1705 if (bcpTypeLen
>= (int32_t)sizeof(bcpTypeBuf
)) {
1706 /* the BCP type is too long */
1707 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1711 uprv_strncpy(bcpTypeBuf
, pBcpType
, bcpTypeLen
);
1712 bcpTypeBuf
[bcpTypeLen
] = 0;
1714 /* BCP type to locale type */
1715 pType
= uloc_toLegacyType(pKey
, bcpTypeBuf
);
1716 if (pType
== NULL
) {
1717 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1720 if (pType
== bcpTypeBuf
) {
1722 The type returned by toLegacyType points to the input buffer.
1723 We normalize the result type to lower case.
1725 /* normalize to lower case */
1726 T_CString_toLowerCase(bcpTypeBuf
);
1727 icu::CharString
* type
= kwdBuf
.create(bcpTypeBuf
, bcpTypeLen
, *status
);
1729 *status
= U_MEMORY_ALLOCATION_ERROR
;
1732 if (U_FAILURE(*status
)) {
1735 pType
= type
->data();
1738 /* typeless - default type value is "yes" */
1739 pType
= LOCALE_TYPE_YES
;
1742 /* Special handling for u-va-posix, since we want to treat this as a variant,
1744 if (!variantExists
&& !uprv_strcmp(pKey
, POSIX_KEY
) && !uprv_strcmp(pType
, POSIX_VALUE
) ) {
1745 *posixVariant
= TRUE
;
1747 /* create an ExtensionListEntry for this keyword */
1748 kwd
= extPool
.create();
1750 *status
= U_MEMORY_ALLOCATION_ERROR
;
1757 if (!_addExtensionToList(&kwdFirst
, kwd
, FALSE
)) {
1758 // duplicate keyword is allowed, Only the first
1763 pBcpKey
= pNextBcpKey
;
1764 bcpKeyLen
= pNextBcpKey
!= NULL
? nextBcpKeyLen
: 0;
1772 while (kwd
!= NULL
) {
1773 nextKwd
= kwd
->next
;
1774 _addExtensionToList(appendTo
, kwd
, FALSE
);
1781 _appendKeywords(ULanguageTag
* langtag
, icu::ByteSink
& sink
, UErrorCode
* status
) {
1784 ExtensionListEntry
*kwdFirst
= NULL
;
1785 ExtensionListEntry
*kwd
;
1786 const char *key
, *type
;
1787 icu::MemoryPool
<ExtensionListEntry
> extPool
;
1788 icu::MemoryPool
<icu::CharString
> kwdBuf
;
1789 UBool posixVariant
= FALSE
;
1791 if (U_FAILURE(*status
)) {
1795 /* Determine if variants already exists */
1796 if (ultag_getVariantsSize(langtag
)) {
1797 posixVariant
= TRUE
;
1800 n
= ultag_getExtensionsSize(langtag
);
1802 /* resolve locale keywords and reordering keys */
1803 for (i
= 0; i
< n
; i
++) {
1804 key
= ultag_getExtensionKey(langtag
, i
);
1805 type
= ultag_getExtensionValue(langtag
, i
);
1806 if (*key
== LDMLEXT
) {
1807 _appendLDMLExtensionAsKeywords(type
, &kwdFirst
, extPool
, kwdBuf
, &posixVariant
, status
);
1808 if (U_FAILURE(*status
)) {
1812 kwd
= extPool
.create();
1814 *status
= U_MEMORY_ALLOCATION_ERROR
;
1819 if (!_addExtensionToList(&kwdFirst
, kwd
, FALSE
)) {
1820 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1826 if (U_SUCCESS(*status
)) {
1827 type
= ultag_getPrivateUse(langtag
);
1828 if ((int32_t)uprv_strlen(type
) > 0) {
1829 /* add private use as a keyword */
1830 kwd
= extPool
.create();
1832 *status
= U_MEMORY_ALLOCATION_ERROR
;
1834 kwd
->key
= PRIVATEUSE_KEY
;
1836 if (!_addExtensionToList(&kwdFirst
, kwd
, FALSE
)) {
1837 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1843 /* If a POSIX variant was in the extensions, write it out before writing the keywords. */
1845 if (U_SUCCESS(*status
) && posixVariant
) {
1846 len
= (int32_t) uprv_strlen(_POSIX
);
1847 sink
.Append(_POSIX
, len
);
1850 if (U_SUCCESS(*status
) && kwdFirst
!= NULL
) {
1851 /* write out the sorted keywords */
1852 UBool firstValue
= TRUE
;
1856 sink
.Append("@", 1);
1859 sink
.Append(";", 1);
1863 len
= (int32_t)uprv_strlen(kwd
->key
);
1864 sink
.Append(kwd
->key
, len
);
1865 sink
.Append("=", 1);
1868 len
= (int32_t)uprv_strlen(kwd
->value
);
1869 sink
.Append(kwd
->value
, len
);
1877 _appendPrivateuseToLanguageTag(const char* localeID
, icu::ByteSink
& sink
, UBool strict
, UBool hadPosix
, UErrorCode
* status
) {
1879 char buf
[ULOC_FULLNAME_CAPACITY
];
1880 char tmpAppend
[ULOC_FULLNAME_CAPACITY
];
1881 UErrorCode tmpStatus
= U_ZERO_ERROR
;
1884 int32_t capacity
= sizeof tmpAppend
;
1886 if (U_FAILURE(*status
)) {
1890 len
= uloc_getVariant(localeID
, buf
, sizeof(buf
), &tmpStatus
);
1891 if (U_FAILURE(tmpStatus
) || tmpStatus
== U_STRING_NOT_TERMINATED_WARNING
) {
1893 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1901 UBool firstValue
= TRUE
;
1908 if (*p
== SEP
|| *p
== LOCALE_SEP
|| *p
== 0) {
1912 *p
= 0; /* terminate */
1914 if (pPriv
!= NULL
) {
1915 /* Private use in the canonical format is lowercase in BCP47 */
1916 for (i
= 0; *(pPriv
+ i
) != 0; i
++) {
1917 *(pPriv
+ i
) = uprv_tolower(*(pPriv
+ i
));
1921 if (_isPrivateuseValueSubtag(pPriv
, -1)) {
1923 if (!_isVariantSubtag(pPriv
, -1)) {
1929 } else if (strict
) {
1930 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1937 if (reslen
< capacity
) {
1938 tmpAppend
[reslen
++] = SEP
;
1942 if (reslen
< capacity
) {
1943 tmpAppend
[reslen
++] = *PRIVATEUSE_KEY
;
1946 if (reslen
< capacity
) {
1947 tmpAppend
[reslen
++] = SEP
;
1950 len
= (int32_t)uprv_strlen(PRIVUSE_VARIANT_PREFIX
);
1951 if (reslen
< capacity
) {
1952 uprv_memcpy(tmpAppend
+ reslen
, PRIVUSE_VARIANT_PREFIX
, uprv_min(len
, capacity
- reslen
));
1956 if (reslen
< capacity
) {
1957 tmpAppend
[reslen
++] = SEP
;
1963 len
= (int32_t)uprv_strlen(pPriv
);
1964 if (reslen
< capacity
) {
1965 uprv_memcpy(tmpAppend
+ reslen
, pPriv
, uprv_min(len
, capacity
- reslen
));
1970 /* reset private use starting position */
1972 } else if (pPriv
== NULL
) {
1978 if (U_FAILURE(*status
)) {
1983 if (U_SUCCESS(*status
)) {
1985 sink
.Append(tmpAppend
, len
);
1990 * -------------------------------------------------
1994 * -------------------------------------------------
1997 /* Bit flags used by the parser */
2008 * Ticket #12705 - Visual Studio 2015 Update 3 contains a new code optimizer which has problems optimizing
2009 * this function. (See https://blogs.msdn.microsoft.com/vcblog/2016/05/04/new-code-optimizer/ )
2010 * As a workaround, we will turn off optimization just for this function on VS2015 Update 3 and above.
2012 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
2013 #pragma optimize( "", off )
2016 static ULanguageTag
*
2017 ultag_parse(const char* tag
, int32_t tagLen
, int32_t* parsedLen
, UErrorCode
* status
) {
2020 char *pSubtag
, *pNext
, *pLastGoodPosition
;
2023 ExtensionListEntry
*pExtension
;
2024 char *pExtValueSubtag
, *pExtValueSubtagEnd
;
2026 UBool privateuseVar
= FALSE
;
2027 int32_t grandfatheredLen
= 0;
2029 if (parsedLen
!= NULL
) {
2033 if (U_FAILURE(*status
)) {
2038 tagLen
= (int32_t)uprv_strlen(tag
);
2041 /* copy the entire string */
2042 tagBuf
= (char*)uprv_malloc(tagLen
+ 1);
2043 if (tagBuf
== NULL
) {
2044 *status
= U_MEMORY_ALLOCATION_ERROR
;
2047 uprv_memcpy(tagBuf
, tag
, tagLen
);
2048 *(tagBuf
+ tagLen
) = 0;
2050 /* create a ULanguageTag */
2051 icu::LocalULanguageTagPointer
t(
2052 (ULanguageTag
*)uprv_malloc(sizeof(ULanguageTag
)));
2055 *status
= U_MEMORY_ALLOCATION_ERROR
;
2058 _initializeULanguageTag(t
.getAlias());
2061 if (tagLen
< MINLEN
) {
2062 /* the input tag is too short - return empty ULanguageTag */
2066 size_t parsedLenDelta
= 0;
2067 // Grandfathered tag will be consider together. Grandfathered tag with intervening
2068 // script and region such as art-DE-lojban or art-Latn-lojban won't be
2070 /* check if the tag is grandfathered */
2071 for (i
= 0; i
< UPRV_LENGTHOF(GRANDFATHERED
); i
+= 2) {
2072 int32_t checkGrandfatheredLen
= static_cast<int32_t>(uprv_strlen(GRANDFATHERED
[i
]));
2073 if (tagLen
< checkGrandfatheredLen
) {
2076 if (tagLen
> checkGrandfatheredLen
&& tagBuf
[checkGrandfatheredLen
] != '-') {
2077 // make sure next char is '-'.
2080 if (uprv_strnicmp(GRANDFATHERED
[i
], tagBuf
, checkGrandfatheredLen
) == 0) {
2081 int32_t newTagLength
;
2083 grandfatheredLen
= checkGrandfatheredLen
; /* back up for output parsedLen */
2084 int32_t replacementLen
= static_cast<int32_t>(uprv_strlen(GRANDFATHERED
[i
+1]));
2085 newTagLength
= replacementLen
+ tagLen
- checkGrandfatheredLen
;
2086 if (tagLen
< newTagLength
) {
2088 tagBuf
= (char*)uprv_malloc(newTagLength
+ 1);
2089 if (tagBuf
== NULL
) {
2090 *status
= U_MEMORY_ALLOCATION_ERROR
;
2094 tagLen
= newTagLength
;
2096 parsedLenDelta
= checkGrandfatheredLen
- replacementLen
;
2097 uprv_strcpy(t
->buf
, GRANDFATHERED
[i
+ 1]);
2098 if (checkGrandfatheredLen
!= tagLen
) {
2099 uprv_strcpy(t
->buf
+ replacementLen
, tag
+ checkGrandfatheredLen
);
2105 if (grandfatheredLen
== 0) {
2106 for (i
= 0; i
< UPRV_LENGTHOF(REDUNDANT
); i
+= 2) {
2107 const char* redundantTag
= REDUNDANT
[i
];
2108 size_t redundantTagLen
= uprv_strlen(redundantTag
);
2109 // The preferred tag for a redundant tag is always shorter than redundant
2110 // tag. A redundant tag may or may not be followed by other subtags.
2111 // (i.e. "zh-yue" or "zh-yue-u-co-pinyin").
2112 if (uprv_strnicmp(redundantTag
, tagBuf
, static_cast<uint32_t>(redundantTagLen
)) == 0) {
2113 const char* redundantTagEnd
= tagBuf
+ redundantTagLen
;
2114 if (*redundantTagEnd
== '\0' || *redundantTagEnd
== SEP
) {
2115 const char* preferredTag
= REDUNDANT
[i
+ 1];
2116 size_t preferredTagLen
= uprv_strlen(preferredTag
);
2117 uprv_strncpy(t
->buf
, preferredTag
, preferredTagLen
);
2118 if (*redundantTagEnd
== SEP
) {
2119 uprv_memmove(tagBuf
+ preferredTagLen
,
2121 tagLen
- redundantTagLen
+ 1);
2123 tagBuf
[preferredTagLen
] = '\0';
2125 // parsedLen should be the length of the input
2126 // before redundantTag is replaced by preferredTag.
2127 // Save the delta to add it back later.
2128 parsedLenDelta
= redundantTagLen
- preferredTagLen
;
2136 * langtag = language
2145 pNext
= pLastGoodPosition
= tagBuf
;
2148 pExtValueSubtag
= NULL
;
2149 pExtValueSubtagEnd
= NULL
;
2156 /* locate next separator char */
2170 subtagLen
= (int32_t)(pSep
- pSubtag
);
2173 if (ultag_isLanguageSubtag(pSubtag
, subtagLen
)) {
2174 *pSep
= 0; /* terminate */
2175 // TODO: move deprecated language code handling here.
2176 t
->language
= T_CString_toLowerCase(pSubtag
);
2178 pLastGoodPosition
= pSep
;
2179 next
= SCRT
| REGN
| VART
| EXTS
| PRIV
;
2186 if (_isExtlangSubtag(pSubtag
, subtagLen
)) {
2188 t
->extlang
[extlangIdx
++] = T_CString_toLowerCase(pSubtag
);
2190 pLastGoodPosition
= pSep
;
2191 if (extlangIdx
< 3) {
2192 next
= EXTL
| SCRT
| REGN
| VART
| EXTS
| PRIV
;
2194 next
= SCRT
| REGN
| VART
| EXTS
| PRIV
;
2200 if (ultag_isScriptSubtag(pSubtag
, subtagLen
)) {
2206 *p
= uprv_toupper(*p
);
2209 *p
= uprv_tolower(*p
);
2212 t
->script
= pSubtag
;
2214 pLastGoodPosition
= pSep
;
2215 next
= REGN
| VART
| EXTS
| PRIV
;
2220 if (ultag_isRegionSubtag(pSubtag
, subtagLen
)) {
2222 // TODO: move deprecated region code handling here.
2223 t
->region
= T_CString_toUpperCase(pSubtag
);
2225 pLastGoodPosition
= pSep
;
2226 next
= VART
| EXTS
| PRIV
;
2231 if (_isVariantSubtag(pSubtag
, subtagLen
) ||
2232 (privateuseVar
&& _isPrivateuseVariantSubtag(pSubtag
, subtagLen
))) {
2233 VariantListEntry
*var
;
2236 var
= (VariantListEntry
*)uprv_malloc(sizeof(VariantListEntry
));
2238 *status
= U_MEMORY_ALLOCATION_ERROR
;
2242 var
->variant
= T_CString_toUpperCase(pSubtag
);
2243 isAdded
= _addVariantToList(&(t
->variants
), var
);
2245 /* duplicated variant entry */
2249 pLastGoodPosition
= pSep
;
2250 next
= VART
| EXTS
| PRIV
;
2255 if (_isExtensionSingleton(pSubtag
, subtagLen
)) {
2256 if (pExtension
!= NULL
) {
2257 if (pExtValueSubtag
== NULL
|| pExtValueSubtagEnd
== NULL
) {
2258 /* the previous extension is incomplete */
2259 uprv_free(pExtension
);
2264 /* terminate the previous extension value */
2265 *pExtValueSubtagEnd
= 0;
2266 pExtension
->value
= T_CString_toLowerCase(pExtValueSubtag
);
2268 /* insert the extension to the list */
2269 if (_addExtensionToList(&(t
->extensions
), pExtension
, FALSE
)) {
2270 pLastGoodPosition
= pExtValueSubtagEnd
;
2272 /* stop parsing here */
2273 uprv_free(pExtension
);
2279 /* create a new extension */
2280 pExtension
= (ExtensionListEntry
*)uprv_malloc(sizeof(ExtensionListEntry
));
2281 if (pExtension
== NULL
) {
2282 *status
= U_MEMORY_ALLOCATION_ERROR
;
2286 pExtension
->key
= T_CString_toLowerCase(pSubtag
);
2287 pExtension
->value
= NULL
; /* will be set later */
2290 * reset the start and the end location of extension value
2291 * subtags for this extension
2293 pExtValueSubtag
= NULL
;
2294 pExtValueSubtagEnd
= NULL
;
2301 if (_isExtensionSubtag(pSubtag
, subtagLen
)) {
2302 if (pExtValueSubtag
== NULL
) {
2303 /* if the start postion of this extension's value is not yet,
2304 this one is the first value subtag */
2305 pExtValueSubtag
= pSubtag
;
2308 /* Mark the end of this subtag */
2309 pExtValueSubtagEnd
= pSep
;
2310 next
= EXTS
| EXTV
| PRIV
;
2316 if (uprv_tolower(*pSubtag
) == PRIVATEUSE
&& subtagLen
== 1) {
2319 if (pExtension
!= NULL
) {
2320 /* Process the last extension */
2321 if (pExtValueSubtag
== NULL
|| pExtValueSubtagEnd
== NULL
) {
2322 /* the previous extension is incomplete */
2323 uprv_free(pExtension
);
2327 /* terminate the previous extension value */
2328 *pExtValueSubtagEnd
= 0;
2329 pExtension
->value
= T_CString_toLowerCase(pExtValueSubtag
);
2331 /* insert the extension to the list */
2332 if (_addExtensionToList(&(t
->extensions
), pExtension
, FALSE
)) {
2333 pLastGoodPosition
= pExtValueSubtagEnd
;
2336 /* stop parsing here */
2337 uprv_free(pExtension
);
2344 /* The rest of part will be private use value subtags */
2345 if (pNext
== NULL
) {
2346 /* empty private use subtag */
2349 /* back up the private use value start position */
2350 pPrivuseVal
= pNext
;
2352 /* validate private use value subtags */
2368 subtagLen
= (int32_t)(pSep
- pSubtag
);
2370 if (uprv_strncmp(pSubtag
, PRIVUSE_VARIANT_PREFIX
, uprv_strlen(PRIVUSE_VARIANT_PREFIX
)) == 0) {
2373 privateuseVar
= TRUE
;
2375 } else if (_isPrivateuseValueSubtag(pSubtag
, subtagLen
)) {
2376 pLastGoodPosition
= pSep
;
2386 if (pLastGoodPosition
- pPrivuseVal
> 0) {
2387 *pLastGoodPosition
= 0;
2388 t
->privateuse
= T_CString_toLowerCase(pPrivuseVal
);
2390 /* No more subtags, exiting the parse loop */
2396 /* If we fell through here, it means this subtag is illegal - quit parsing */
2400 if (pExtension
!= NULL
) {
2401 /* Process the last extension */
2402 if (pExtValueSubtag
== NULL
|| pExtValueSubtagEnd
== NULL
) {
2403 /* the previous extension is incomplete */
2404 uprv_free(pExtension
);
2406 /* terminate the previous extension value */
2407 *pExtValueSubtagEnd
= 0;
2408 pExtension
->value
= T_CString_toLowerCase(pExtValueSubtag
);
2409 /* insert the extension to the list */
2410 if (_addExtensionToList(&(t
->extensions
), pExtension
, FALSE
)) {
2411 pLastGoodPosition
= pExtValueSubtagEnd
;
2413 uprv_free(pExtension
);
2418 if (parsedLen
!= NULL
) {
2419 *parsedLen
= (int32_t)(pLastGoodPosition
- t
->buf
+ parsedLenDelta
);
2426 * Ticket #12705 - Turn optimization back on.
2428 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
2429 #pragma optimize( "", on )
2433 ultag_close(ULanguageTag
* langtag
) {
2435 if (langtag
== NULL
) {
2439 uprv_free(langtag
->buf
);
2441 if (langtag
->variants
) {
2442 VariantListEntry
*curVar
= langtag
->variants
;
2444 VariantListEntry
*nextVar
= curVar
->next
;
2450 if (langtag
->extensions
) {
2451 ExtensionListEntry
*curExt
= langtag
->extensions
;
2453 ExtensionListEntry
*nextExt
= curExt
->next
;
2463 ultag_getLanguage(const ULanguageTag
* langtag
) {
2464 return langtag
->language
;
2469 ultag_getJDKLanguage(const ULanguageTag
* langtag
) {
2471 for (i
= 0; DEPRECATEDLANGS
[i
] != NULL
; i
+= 2) {
2472 if (uprv_compareInvCharsAsAscii(DEPRECATEDLANGS
[i
], langtag
->language
) == 0) {
2473 return DEPRECATEDLANGS
[i
+ 1];
2476 return langtag
->language
;
2481 ultag_getExtlang(const ULanguageTag
* langtag
, int32_t idx
) {
2482 if (idx
>= 0 && idx
< MAXEXTLANG
) {
2483 return langtag
->extlang
[idx
];
2489 ultag_getExtlangSize(const ULanguageTag
* langtag
) {
2492 for (i
= 0; i
< MAXEXTLANG
; i
++) {
2493 if (langtag
->extlang
[i
]) {
2501 ultag_getScript(const ULanguageTag
* langtag
) {
2502 return langtag
->script
;
2506 ultag_getRegion(const ULanguageTag
* langtag
) {
2507 return langtag
->region
;
2511 ultag_getVariant(const ULanguageTag
* langtag
, int32_t idx
) {
2512 const char *var
= NULL
;
2513 VariantListEntry
*cur
= langtag
->variants
;
2527 ultag_getVariantsSize(const ULanguageTag
* langtag
) {
2529 VariantListEntry
*cur
= langtag
->variants
;
2541 ultag_getExtensionKey(const ULanguageTag
* langtag
, int32_t idx
) {
2542 const char *key
= NULL
;
2543 ExtensionListEntry
*cur
= langtag
->extensions
;
2557 ultag_getExtensionValue(const ULanguageTag
* langtag
, int32_t idx
) {
2558 const char *val
= NULL
;
2559 ExtensionListEntry
*cur
= langtag
->extensions
;
2573 ultag_getExtensionsSize(const ULanguageTag
* langtag
) {
2575 ExtensionListEntry
*cur
= langtag
->extensions
;
2587 ultag_getPrivateUse(const ULanguageTag
* langtag
) {
2588 return langtag
->privateuse
;
2593 ultag_getGrandfathered(const ULanguageTag
* langtag
) {
2594 return langtag
->grandfathered
;
2600 * -------------------------------------------------
2602 * Locale/BCP47 conversion APIs, exposed as uloc_*
2604 * -------------------------------------------------
2606 U_CAPI
int32_t U_EXPORT2
2607 uloc_toLanguageTag(const char* localeID
,
2609 int32_t langtagCapacity
,
2611 UErrorCode
* status
) {
2612 if (U_FAILURE(*status
)) {
2616 icu::CheckedArrayByteSink
sink(langtag
, langtagCapacity
);
2617 ulocimp_toLanguageTag(localeID
, sink
, strict
, status
);
2619 int32_t reslen
= sink
.NumberOfBytesAppended();
2621 if (U_FAILURE(*status
)) {
2625 if (sink
.Overflowed()) {
2626 *status
= U_BUFFER_OVERFLOW_ERROR
;
2628 u_terminateChars(langtag
, langtagCapacity
, reslen
, status
);
2635 U_CAPI
void U_EXPORT2
2636 ulocimp_toLanguageTag(const char* localeID
,
2637 icu::ByteSink
& sink
,
2639 UErrorCode
* status
) {
2640 icu::CharString canonical
;
2642 UErrorCode tmpStatus
= U_ZERO_ERROR
;
2643 UBool hadPosix
= FALSE
;
2644 const char* pKeywordStart
;
2646 /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "". See #6835 */
2647 int32_t resultCapacity
= static_cast<int32_t>(uprv_strlen(localeID
));
2648 if (resultCapacity
> 0) {
2652 buffer
= canonical
.getAppendBuffer(
2653 /*minCapacity=*/resultCapacity
,
2654 /*desiredCapacityHint=*/resultCapacity
,
2658 if (U_FAILURE(tmpStatus
)) {
2659 *status
= tmpStatus
;
2664 uloc_canonicalize(localeID
, buffer
, resultCapacity
, &tmpStatus
);
2666 if (tmpStatus
!= U_BUFFER_OVERFLOW_ERROR
) {
2670 resultCapacity
= reslen
;
2671 tmpStatus
= U_ZERO_ERROR
;
2674 if (U_FAILURE(tmpStatus
)) {
2675 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2679 canonical
.append(buffer
, reslen
, tmpStatus
);
2680 if (tmpStatus
== U_STRING_NOT_TERMINATED_WARNING
) {
2681 tmpStatus
= U_ZERO_ERROR
; // Terminators provided by CharString.
2684 if (U_FAILURE(tmpStatus
)) {
2685 *status
= tmpStatus
;
2690 /* For handling special case - private use only tag */
2691 pKeywordStart
= locale_getKeywordsStart(canonical
.data());
2692 if (pKeywordStart
== canonical
.data()) {
2696 icu::LocalUEnumerationPointer
kwdEnum(uloc_openKeywords(canonical
.data(), &tmpStatus
));
2697 if (U_SUCCESS(tmpStatus
)) {
2698 kwdCnt
= uenum_count(kwdEnum
.getAlias(), &tmpStatus
);
2703 key
= uenum_next(kwdEnum
.getAlias(), &len
, &tmpStatus
);
2704 if (len
== 1 && *key
== PRIVATEUSE
) {
2705 char buf
[ULOC_KEYWORD_AND_VALUES_CAPACITY
];
2706 buf
[0] = PRIVATEUSE
;
2708 len
= uloc_getKeywordValue(localeID
, key
, &buf
[2], sizeof(buf
) - 2, &tmpStatus
);
2709 if (U_SUCCESS(tmpStatus
)) {
2710 if (ultag_isPrivateuseValueSubtags(&buf
[2], len
)) {
2711 /* return private use only tag */
2712 sink
.Append(buf
, len
+ 2);
2714 } else if (strict
) {
2715 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2718 /* if not strict mode, then "und" will be returned */
2720 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2731 _appendLanguageToLanguageTag(canonical
.data(), sink
, strict
, status
);
2732 _appendScriptToLanguageTag(canonical
.data(), sink
, strict
, status
);
2733 _appendRegionToLanguageTag(canonical
.data(), sink
, strict
, status
);
2734 _appendVariantsToLanguageTag(canonical
.data(), sink
, strict
, &hadPosix
, status
);
2735 _appendKeywordsToLanguageTag(canonical
.data(), sink
, strict
, hadPosix
, status
);
2736 _appendPrivateuseToLanguageTag(canonical
.data(), sink
, strict
, hadPosix
, status
);
2740 U_CAPI
int32_t U_EXPORT2
2741 uloc_forLanguageTag(const char* langtag
,
2743 int32_t localeIDCapacity
,
2744 int32_t* parsedLength
,
2745 UErrorCode
* status
) {
2746 if (U_FAILURE(*status
)) {
2750 icu::CheckedArrayByteSink
sink(localeID
, localeIDCapacity
);
2751 ulocimp_forLanguageTag(langtag
, -1, sink
, parsedLength
, status
);
2753 int32_t reslen
= sink
.NumberOfBytesAppended();
2755 if (U_FAILURE(*status
)) {
2759 if (sink
.Overflowed()) {
2760 *status
= U_BUFFER_OVERFLOW_ERROR
;
2762 u_terminateChars(localeID
, localeIDCapacity
, reslen
, status
);
2769 U_CAPI
void U_EXPORT2
2770 ulocimp_forLanguageTag(const char* langtag
,
2772 icu::ByteSink
& sink
,
2773 int32_t* parsedLength
,
2774 UErrorCode
* status
) {
2775 UBool isEmpty
= TRUE
;
2776 const char *subtag
, *p
;
2779 UBool noRegion
= TRUE
;
2781 icu::LocalULanguageTagPointer
lt(ultag_parse(langtag
, tagLen
, parsedLength
, status
));
2782 if (U_FAILURE(*status
)) {
2787 subtag
= ultag_getExtlangSize(lt
.getAlias()) > 0 ? ultag_getExtlang(lt
.getAlias(), 0) : ultag_getLanguage(lt
.getAlias());
2788 if (uprv_compareInvCharsAsAscii(subtag
, LANG_UND
) != 0) {
2789 len
= (int32_t)uprv_strlen(subtag
);
2791 sink
.Append(subtag
, len
);
2797 subtag
= ultag_getScript(lt
.getAlias());
2798 len
= (int32_t)uprv_strlen(subtag
);
2800 sink
.Append("_", 1);
2803 /* write out the script in title case */
2804 char c
= uprv_toupper(*subtag
);
2806 sink
.Append(subtag
+ 1, len
- 1);
2810 subtag
= ultag_getRegion(lt
.getAlias());
2811 len
= (int32_t)uprv_strlen(subtag
);
2813 sink
.Append("_", 1);
2816 /* write out the region in upper case */
2819 char c
= uprv_toupper(*p
);
2827 n
= ultag_getVariantsSize(lt
.getAlias());
2830 sink
.Append("_", 1);
2834 for (i
= 0; i
< n
; i
++) {
2835 subtag
= ultag_getVariant(lt
.getAlias(), i
);
2836 sink
.Append("_", 1);
2838 /* write out the variant in upper case */
2841 char c
= uprv_toupper(*p
);
2849 n
= ultag_getExtensionsSize(lt
.getAlias());
2850 subtag
= ultag_getPrivateUse(lt
.getAlias());
2851 if (n
> 0 || uprv_strlen(subtag
) > 0) {
2852 if (isEmpty
&& n
> 0) {
2853 /* need a language */
2854 sink
.Append(LANG_UND
, LANG_UND_LEN
);
2856 _appendKeywords(lt
.getAlias(), sink
, status
);