2 *******************************************************************************
4 * Copyright (C) 1997-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: loclikely.cpp
10 * tab size: 8 (not used)
13 * created on: 2010feb25
14 * created by: Markus W. Scherer
16 * Code for likely and minimized locale subtags, separated out from other .cpp files
17 * that then do not depend on resource bundle code and likely-subtags data.
20 #include "unicode/utypes.h"
21 #include "unicode/putil.h"
22 #include "unicode/uloc.h"
23 #include "unicode/ures.h"
30 * This function looks for the localeID in the likelySubtags resource.
32 * @param localeID The tag to find.
33 * @param buffer A buffer to hold the matching entry
34 * @param bufferLength The length of the output buffer
35 * @return A pointer to "buffer" if found, or a null pointer if not.
37 static const char* U_CALLCONV
38 findLikelySubtags(const char* localeID
,
42 const char* result
= NULL
;
44 if (!U_FAILURE(*err
)) {
46 const UChar
* s
= NULL
;
47 UErrorCode tmpErr
= U_ZERO_ERROR
;
48 UResourceBundle
* subtags
= ures_openDirect(NULL
, "likelySubtags", &tmpErr
);
49 if (U_SUCCESS(tmpErr
)) {
50 s
= ures_getStringByKey(subtags
, localeID
, &resLen
, &tmpErr
);
52 if (U_FAILURE(tmpErr
)) {
54 * If a resource is missing, it's not really an error, it's
55 * just that we don't have any data for that particular locale ID.
57 if (tmpErr
!= U_MISSING_RESOURCE_ERROR
) {
61 else if (resLen
>= bufferLength
) {
62 /* The buffer should never overflow. */
63 *err
= U_INTERNAL_PROGRAM_ERROR
;
66 u_UCharsToChars(s
, buffer
, resLen
+ 1);
80 * Append a tag to a buffer, adding the separator if necessary. The buffer
81 * must be large enough to contain the resulting tag plus any separator
82 * necessary. The tag must not be a zero-length string.
84 * @param tag The tag to add.
85 * @param tagLength The length of the tag.
86 * @param buffer The output buffer.
87 * @param bufferLength The length of the output buffer. This is an input/output parameter.
89 static void U_CALLCONV
94 int32_t* bufferLength
) {
96 if (*bufferLength
> 0) {
97 buffer
[*bufferLength
] = '_';
102 &buffer
[*bufferLength
],
106 *bufferLength
+= tagLength
;
110 * These are the canonical strings for unknown languages, scripts and regions.
112 static const char* const unknownLanguage
= "und";
113 static const char* const unknownScript
= "Zzzz";
114 static const char* const unknownRegion
= "ZZ";
117 * Create a tag string from the supplied parameters. The lang, script and region
118 * parameters may be NULL pointers. If they are, their corresponding length parameters
119 * must be less than or equal to 0.
121 * If any of the language, script or region parameters are empty, and the alternateTags
122 * parameter is not NULL, it will be parsed for potential language, script and region tags
123 * to be used when constructing the new tag. If the alternateTags parameter is NULL, or
124 * it contains no language tag, the default tag for the unknown language is used.
126 * If the length of the new string exceeds the capacity of the output buffer,
127 * the function copies as many bytes to the output buffer as it can, and returns
128 * the error U_BUFFER_OVERFLOW_ERROR.
130 * If an illegal argument is provided, the function returns the error
131 * U_ILLEGAL_ARGUMENT_ERROR.
133 * Note that this function can return the warning U_STRING_NOT_TERMINATED_WARNING if
134 * the tag string fits in the output buffer, but the null terminator doesn't.
136 * @param lang The language tag to use.
137 * @param langLength The length of the language tag.
138 * @param script The script tag to use.
139 * @param scriptLength The length of the script tag.
140 * @param region The region tag to use.
141 * @param regionLength The length of the region tag.
142 * @param trailing Any trailing data to append to the new tag.
143 * @param trailingLength The length of the trailing data.
144 * @param alternateTags A string containing any alternate tags.
145 * @param tag The output buffer.
146 * @param tagCapacity The capacity of the output buffer.
147 * @param err A pointer to a UErrorCode for error reporting.
148 * @return The length of the tag string, which may be greater than tagCapacity, or -1 on error.
150 static int32_t U_CALLCONV
151 createTagStringWithAlternates(
155 int32_t scriptLength
,
157 int32_t regionLength
,
158 const char* trailing
,
159 int32_t trailingLength
,
160 const char* alternateTags
,
165 if (U_FAILURE(*err
)) {
168 else if (tag
== NULL
||
170 langLength
>= ULOC_LANG_CAPACITY
||
171 scriptLength
>= ULOC_SCRIPT_CAPACITY
||
172 regionLength
>= ULOC_COUNTRY_CAPACITY
) {
177 * ULOC_FULLNAME_CAPACITY will provide enough capacity
178 * that we can build a string that contains the language,
179 * script and region code without worrying about overrunning
180 * the user-supplied buffer.
182 char tagBuffer
[ULOC_FULLNAME_CAPACITY
];
183 int32_t tagLength
= 0;
184 int32_t capacityRemaining
= tagCapacity
;
185 UBool regionAppended
= FALSE
;
187 if (langLength
> 0) {
194 else if (alternateTags
== NULL
) {
196 * Append the value for an unknown language, if
197 * we found no language.
201 (int32_t)uprv_strlen(unknownLanguage
),
207 * Parse the alternateTags string for the language.
209 char alternateLang
[ULOC_LANG_CAPACITY
];
210 int32_t alternateLangLength
= sizeof(alternateLang
);
212 alternateLangLength
=
218 if(U_FAILURE(*err
) ||
219 alternateLangLength
>= ULOC_LANG_CAPACITY
) {
222 else if (alternateLangLength
== 0) {
224 * Append the value for an unknown language, if
225 * we found no language.
229 (int32_t)uprv_strlen(unknownLanguage
),
242 if (scriptLength
> 0) {
249 else if (alternateTags
!= NULL
) {
251 * Parse the alternateTags string for the script.
253 char alternateScript
[ULOC_SCRIPT_CAPACITY
];
255 const int32_t alternateScriptLength
=
259 sizeof(alternateScript
),
262 if (U_FAILURE(*err
) ||
263 alternateScriptLength
>= ULOC_SCRIPT_CAPACITY
) {
266 else if (alternateScriptLength
> 0) {
269 alternateScriptLength
,
275 if (regionLength
> 0) {
282 regionAppended
= TRUE
;
284 else if (alternateTags
!= NULL
) {
286 * Parse the alternateTags string for the region.
288 char alternateRegion
[ULOC_COUNTRY_CAPACITY
];
290 const int32_t alternateRegionLength
=
294 sizeof(alternateRegion
),
296 if (U_FAILURE(*err
) ||
297 alternateRegionLength
>= ULOC_COUNTRY_CAPACITY
) {
300 else if (alternateRegionLength
> 0) {
303 alternateRegionLength
,
307 regionAppended
= TRUE
;
312 const int32_t toCopy
=
313 tagLength
>= tagCapacity
? tagCapacity
: tagLength
;
316 * Copy the partial tag from our internal buffer to the supplied
324 capacityRemaining
-= toCopy
;
327 if (trailingLength
> 0) {
328 if (*trailing
!= '@' && capacityRemaining
> 0) {
329 tag
[tagLength
++] = '_';
331 if (capacityRemaining
> 0 && !regionAppended
) {
332 /* extra separator is required */
333 tag
[tagLength
++] = '_';
338 if (capacityRemaining
> 0) {
340 * Copy the trailing data into the supplied buffer. Use uprv_memmove, since we
341 * don't know if the user-supplied buffers overlap.
343 const int32_t toCopy
=
344 trailingLength
>= capacityRemaining
? capacityRemaining
: trailingLength
;
353 tagLength
+= trailingLength
;
355 return u_terminateChars(
365 * An overflow indicates the locale ID passed in
366 * is ill-formed. If we got here, and there was
367 * no previous error, it's an implicit overflow.
369 if (*err
== U_BUFFER_OVERFLOW_ERROR
||
371 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
378 * Create a tag string from the supplied parameters. The lang, script and region
379 * parameters may be NULL pointers. If they are, their corresponding length parameters
380 * must be less than or equal to 0. If the lang parameter is an empty string, the
381 * default value for an unknown language is written to the output buffer.
383 * If the length of the new string exceeds the capacity of the output buffer,
384 * the function copies as many bytes to the output buffer as it can, and returns
385 * the error U_BUFFER_OVERFLOW_ERROR.
387 * If an illegal argument is provided, the function returns the error
388 * U_ILLEGAL_ARGUMENT_ERROR.
390 * @param lang The language tag to use.
391 * @param langLength The length of the language tag.
392 * @param script The script tag to use.
393 * @param scriptLength The length of the script tag.
394 * @param region The region tag to use.
395 * @param regionLength The length of the region tag.
396 * @param trailing Any trailing data to append to the new tag.
397 * @param trailingLength The length of the trailing data.
398 * @param tag The output buffer.
399 * @param tagCapacity The capacity of the output buffer.
400 * @param err A pointer to a UErrorCode for error reporting.
401 * @return The length of the tag string, which may be greater than tagCapacity.
403 static int32_t U_CALLCONV
408 int32_t scriptLength
,
410 int32_t regionLength
,
411 const char* trailing
,
412 int32_t trailingLength
,
417 return createTagStringWithAlternates(
433 * Parse the language, script, and region subtags from a tag string, and copy the
434 * results into the corresponding output parameters. The buffers are null-terminated,
435 * unless overflow occurs.
437 * The langLength, scriptLength, and regionLength parameters are input/output
438 * parameters, and must contain the capacity of their corresponding buffers on
439 * input. On output, they will contain the actual length of the buffers, not
440 * including the null terminator.
442 * If the length of any of the output subtags exceeds the capacity of the corresponding
443 * buffer, the function copies as many bytes to the output buffer as it can, and returns
444 * the error U_BUFFER_OVERFLOW_ERROR. It will not parse any more subtags once overflow
447 * If an illegal argument is provided, the function returns the error
448 * U_ILLEGAL_ARGUMENT_ERROR.
450 * @param localeID The locale ID to parse.
451 * @param lang The language tag buffer.
452 * @param langLength The length of the language tag.
453 * @param script The script tag buffer.
454 * @param scriptLength The length of the script tag.
455 * @param region The region tag buffer.
456 * @param regionLength The length of the region tag.
457 * @param err A pointer to a UErrorCode for error reporting.
458 * @return The number of chars of the localeID parameter consumed.
460 static int32_t U_CALLCONV
462 const char* localeID
,
466 int32_t* scriptLength
,
468 int32_t* regionLength
,
471 const char* position
= localeID
;
472 int32_t subtagLength
= 0;
474 if(U_FAILURE(*err
) ||
477 langLength
== NULL
||
479 scriptLength
== NULL
||
481 regionLength
== NULL
) {
485 subtagLength
= ulocimp_getLanguage(position
, lang
, *langLength
, &position
);
486 u_terminateChars(lang
, *langLength
, subtagLength
, err
);
489 * Note that we explicitly consider U_STRING_NOT_TERMINATED_WARNING
490 * to be an error, because it indicates the user-supplied tag is
493 if(U_FAILURE(*err
)) {
497 *langLength
= subtagLength
;
500 * If no language was present, use the value of unknownLanguage
501 * instead. Otherwise, move past any separator.
503 if (*langLength
== 0) {
507 *langLength
= (int32_t)uprv_strlen(lang
);
509 else if (_isIDSeparator(*position
)) {
513 subtagLength
= ulocimp_getScript(position
, script
, *scriptLength
, &position
);
514 u_terminateChars(script
, *scriptLength
, subtagLength
, err
);
516 if(U_FAILURE(*err
)) {
520 *scriptLength
= subtagLength
;
522 if (*scriptLength
> 0) {
523 if (uprv_strnicmp(script
, unknownScript
, *scriptLength
) == 0) {
525 * If the script part is the "unknown" script, then don't return it.
531 * Move past any separator.
533 if (_isIDSeparator(*position
)) {
538 subtagLength
= ulocimp_getCountry(position
, region
, *regionLength
, &position
);
539 u_terminateChars(region
, *regionLength
, subtagLength
, err
);
541 if(U_FAILURE(*err
)) {
545 *regionLength
= subtagLength
;
547 if (*regionLength
> 0) {
548 if (uprv_strnicmp(region
, unknownRegion
, *regionLength
) == 0) {
550 * If the region part is the "unknown" region, then don't return it.
554 } else if (*position
!= 0 && *position
!= '@') {
555 /* back up over consumed trailing separator */
561 return (int32_t)(position
- localeID
);
566 * If we get here, we have no explicit error, it's the result of an
569 if (!U_FAILURE(*err
)) {
570 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
576 static int32_t U_CALLCONV
577 createLikelySubtagsString(
581 int32_t scriptLength
,
583 int32_t regionLength
,
584 const char* variants
,
585 int32_t variantsLength
,
591 * ULOC_FULLNAME_CAPACITY will provide enough capacity
592 * that we can build a string that contains the language,
593 * script and region code without worrying about overrunning
594 * the user-supplied buffer.
596 char tagBuffer
[ULOC_FULLNAME_CAPACITY
];
597 char likelySubtagsBuffer
[ULOC_FULLNAME_CAPACITY
];
598 int32_t tagBufferLength
= 0;
600 if(U_FAILURE(*err
)) {
605 * Try the language with the script and region first.
607 if (scriptLength
> 0 && regionLength
> 0) {
609 const char* likelySubtags
= NULL
;
611 tagBufferLength
= createTagString(
623 if(U_FAILURE(*err
)) {
631 sizeof(likelySubtagsBuffer
),
633 if(U_FAILURE(*err
)) {
637 if (likelySubtags
!= NULL
) {
638 /* Always use the language tag from the
639 maximal string, since it may be more
640 specific than the one provided. */
641 return createTagStringWithAlternates(
658 * Try the language with just the script.
660 if (scriptLength
> 0) {
662 const char* likelySubtags
= NULL
;
664 tagBufferLength
= createTagString(
676 if(U_FAILURE(*err
)) {
684 sizeof(likelySubtagsBuffer
),
686 if(U_FAILURE(*err
)) {
690 if (likelySubtags
!= NULL
) {
691 /* Always use the language tag from the
692 maximal string, since it may be more
693 specific than the one provided. */
694 return createTagStringWithAlternates(
711 * Try the language with just the region.
713 if (regionLength
> 0) {
715 const char* likelySubtags
= NULL
;
729 if(U_FAILURE(*err
)) {
737 sizeof(likelySubtagsBuffer
),
739 if(U_FAILURE(*err
)) {
743 if (likelySubtags
!= NULL
) {
744 /* Always use the language tag from the
745 maximal string, since it may be more
746 specific than the one provided. */
747 return createTagStringWithAlternates(
764 * Finally, try just the language.
767 const char* likelySubtags
= NULL
;
781 if(U_FAILURE(*err
)) {
789 sizeof(likelySubtagsBuffer
),
791 if(U_FAILURE(*err
)) {
795 if (likelySubtags
!= NULL
) {
796 /* Always use the language tag from the
797 maximal string, since it may be more
798 specific than the one provided. */
799 return createTagStringWithAlternates(
815 return u_terminateChars(
823 if (!U_FAILURE(*err
)) {
824 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
830 #define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) \
831 { int32_t count = 0; \
833 for (i = 0; i < trailingLength; i++) { \
834 if (trailing[i] == '-' || trailing[i] == '_') { \
839 } else if (trailing[i] == '@') { \
841 } else if (count > 8) { \
850 _uloc_addLikelySubtags(const char* localeID
,
851 char* maximizedLocaleID
,
852 int32_t maximizedLocaleIDCapacity
,
855 char lang
[ULOC_LANG_CAPACITY
];
856 int32_t langLength
= sizeof(lang
);
857 char script
[ULOC_SCRIPT_CAPACITY
];
858 int32_t scriptLength
= sizeof(script
);
859 char region
[ULOC_COUNTRY_CAPACITY
];
860 int32_t regionLength
= sizeof(region
);
861 const char* trailing
= "";
862 int32_t trailingLength
= 0;
863 int32_t trailingIndex
= 0;
864 int32_t resultLength
= 0;
866 if(U_FAILURE(*err
)) {
869 else if (localeID
== NULL
||
870 maximizedLocaleID
== NULL
||
871 maximizedLocaleIDCapacity
<= 0) {
875 trailingIndex
= parseTagString(
884 if(U_FAILURE(*err
)) {
885 /* Overflow indicates an illegal argument error */
886 if (*err
== U_BUFFER_OVERFLOW_ERROR
) {
887 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
893 /* Find the length of the trailing portion. */
894 while (_isIDSeparator(localeID
[trailingIndex
])) {
897 trailing
= &localeID
[trailingIndex
];
898 trailingLength
= (int32_t)uprv_strlen(trailing
);
900 CHECK_TRAILING_VARIANT_SIZE(trailing
, trailingLength
);
903 createLikelySubtagsString(
913 maximizedLocaleIDCapacity
,
916 if (resultLength
== 0) {
917 const int32_t localIDLength
= (int32_t)uprv_strlen(localeID
);
920 * If we get here, we need to return localeID.
925 localIDLength
<= maximizedLocaleIDCapacity
?
926 localIDLength
: maximizedLocaleIDCapacity
);
931 maximizedLocaleIDCapacity
,
940 if (!U_FAILURE(*err
)) {
941 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
948 _uloc_minimizeSubtags(const char* localeID
,
949 char* minimizedLocaleID
,
950 int32_t minimizedLocaleIDCapacity
,
954 * ULOC_FULLNAME_CAPACITY will provide enough capacity
955 * that we can build a string that contains the language,
956 * script and region code without worrying about overrunning
957 * the user-supplied buffer.
959 char maximizedTagBuffer
[ULOC_FULLNAME_CAPACITY
];
960 int32_t maximizedTagBufferLength
= sizeof(maximizedTagBuffer
);
962 char lang
[ULOC_LANG_CAPACITY
];
963 int32_t langLength
= sizeof(lang
);
964 char script
[ULOC_SCRIPT_CAPACITY
];
965 int32_t scriptLength
= sizeof(script
);
966 char region
[ULOC_COUNTRY_CAPACITY
];
967 int32_t regionLength
= sizeof(region
);
968 const char* trailing
= "";
969 int32_t trailingLength
= 0;
970 int32_t trailingIndex
= 0;
972 if(U_FAILURE(*err
)) {
975 else if (localeID
== NULL
||
976 minimizedLocaleID
== NULL
||
977 minimizedLocaleIDCapacity
<= 0) {
991 if(U_FAILURE(*err
)) {
993 /* Overflow indicates an illegal argument error */
994 if (*err
== U_BUFFER_OVERFLOW_ERROR
) {
995 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
1001 /* Find the spot where the variants or the keywords begin, if any. */
1002 while (_isIDSeparator(localeID
[trailingIndex
])) {
1005 trailing
= &localeID
[trailingIndex
];
1006 trailingLength
= (int32_t)uprv_strlen(trailing
);
1008 CHECK_TRAILING_VARIANT_SIZE(trailing
, trailingLength
);
1020 maximizedTagBufferLength
,
1022 if(U_FAILURE(*err
)) {
1027 * First, we need to get the maximization
1028 * from AddLikelySubtags.
1030 maximizedTagBufferLength
=
1031 uloc_addLikelySubtags(
1034 maximizedTagBufferLength
,
1037 if(U_FAILURE(*err
)) {
1042 * Start first with just the language.
1045 char tagBuffer
[ULOC_FULLNAME_CAPACITY
];
1047 const int32_t tagBufferLength
=
1048 createLikelySubtagsString(
1061 if(U_FAILURE(*err
)) {
1064 else if (uprv_strnicmp(
1067 tagBufferLength
) == 0) {
1069 return createTagString(
1079 minimizedLocaleIDCapacity
,
1085 * Next, try the language and region.
1087 if (regionLength
> 0) {
1089 char tagBuffer
[ULOC_FULLNAME_CAPACITY
];
1091 const int32_t tagBufferLength
=
1092 createLikelySubtagsString(
1105 if(U_FAILURE(*err
)) {
1108 else if (uprv_strnicmp(
1111 tagBufferLength
) == 0) {
1113 return createTagString(
1123 minimizedLocaleIDCapacity
,
1129 * Finally, try the language and script. This is our last chance,
1130 * since trying with all three subtags would only yield the
1131 * maximal version that we already have.
1133 if (scriptLength
> 0 && regionLength
> 0) {
1134 char tagBuffer
[ULOC_FULLNAME_CAPACITY
];
1136 const int32_t tagBufferLength
=
1137 createLikelySubtagsString(
1150 if(U_FAILURE(*err
)) {
1153 else if (uprv_strnicmp(
1156 tagBufferLength
) == 0) {
1158 return createTagString(
1168 minimizedLocaleIDCapacity
,
1175 * If we got here, return the locale ID parameter.
1177 const int32_t localeIDLength
= (int32_t)uprv_strlen(localeID
);
1182 localeIDLength
<= minimizedLocaleIDCapacity
?
1183 localeIDLength
: minimizedLocaleIDCapacity
);
1185 return u_terminateChars(
1187 minimizedLocaleIDCapacity
,
1194 if (!U_FAILURE(*err
)) {
1195 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
1204 do_canonicalize(const char* localeID
,
1206 int32_t bufferCapacity
,
1215 if (*err
== U_STRING_NOT_TERMINATED_WARNING
||
1216 *err
== U_BUFFER_OVERFLOW_ERROR
) {
1217 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
1221 else if (U_FAILURE(*err
)) {
1230 U_DRAFT
int32_t U_EXPORT2
1231 uloc_addLikelySubtags(const char* localeID
,
1232 char* maximizedLocaleID
,
1233 int32_t maximizedLocaleIDCapacity
,
1236 char localeBuffer
[ULOC_FULLNAME_CAPACITY
];
1238 if (!do_canonicalize(
1241 sizeof(localeBuffer
),
1246 return _uloc_addLikelySubtags(
1249 maximizedLocaleIDCapacity
,
1254 U_DRAFT
int32_t U_EXPORT2
1255 uloc_minimizeSubtags(const char* localeID
,
1256 char* minimizedLocaleID
,
1257 int32_t minimizedLocaleIDCapacity
,
1260 char localeBuffer
[ULOC_FULLNAME_CAPACITY
];
1262 if (!do_canonicalize(
1265 sizeof(localeBuffer
),
1270 return _uloc_minimizeSubtags(
1273 minimizedLocaleIDCapacity
,