2 *******************************************************************************
3 * Copyright (C) 1996-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucol_res.cpp
8 * tab size: 8 (not used)
12 * This file contains dependencies that the collation run-time doesn't normally
13 * need. This mainly contains resource bundle usage and collation meta information
15 * Modification history
17 * 1996-1999 various members of ICU team maintained C API for collation framework
18 * 02/16/2001 synwee Added internal method getPrevSpecialCE
19 * 03/01/2001 synwee Added maxexpansion functionality.
20 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
21 * 12/08/2004 grhoten Split part of ucol.cpp into ucol_res.cpp
24 #include "unicode/utypes.h"
26 #if !UCONFIG_NO_COLLATION
27 #include "unicode/uloc.h"
28 #include "unicode/coll.h"
29 #include "unicode/tblcoll.h"
30 #include "unicode/caniter.h"
31 #include "unicode/uscript.h"
32 #include "unicode/ustring.h"
52 static void ucol_setReorderCodesFromParser(UCollator
*coll
, UColTokenParser
*parser
, UErrorCode
*status
);
54 // static UCA. There is only one. Collators don't use it.
55 // It is referenced only in ucol_initUCA and ucol_cleanup
56 static UCollator
* _staticUCA
= NULL
;
57 // static pointer to udata memory. Inited in ucol_initUCA
58 // used for cleanup in ucol_cleanup
59 static UDataMemory
* UCA_DATA_MEM
= NULL
;
62 static UBool U_CALLCONV
63 ucol_res_cleanup(void)
66 udata_close(UCA_DATA_MEM
);
70 ucol_close(_staticUCA
);
76 static UBool U_CALLCONV
77 isAcceptableUCA(void * /*context*/,
78 const char * /*type*/, const char * /*name*/,
79 const UDataInfo
*pInfo
){
80 /* context, type & name are intentionally not used */
81 if( pInfo
->size
>=20 &&
82 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
83 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
84 pInfo
->dataFormat
[0]==UCA_DATA_FORMAT_0
&& /* dataFormat="UCol" */
85 pInfo
->dataFormat
[1]==UCA_DATA_FORMAT_1
&&
86 pInfo
->dataFormat
[2]==UCA_DATA_FORMAT_2
&&
87 pInfo
->dataFormat
[3]==UCA_DATA_FORMAT_3
&&
88 pInfo
->formatVersion
[0]==UCA_FORMAT_VERSION_0
&&
89 pInfo
->formatVersion
[1]>=UCA_FORMAT_VERSION_1
// &&
90 //pInfo->formatVersion[1]==UCA_FORMAT_VERSION_1 &&
91 //pInfo->formatVersion[2]==UCA_FORMAT_VERSION_2 && // Too harsh
92 //pInfo->formatVersion[3]==UCA_FORMAT_VERSION_3 && // Too harsh
94 UVersionInfo UCDVersion
;
95 u_getUnicodeVersion(UCDVersion
);
96 return (UBool
)(pInfo
->dataVersion
[0]==UCDVersion
[0]
97 && pInfo
->dataVersion
[1]==UCDVersion
[1]);
98 //&& pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2]
99 //&& pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]);
// Returns the shared UCA collator, setting it up on first use.
// Steps visible here: bail out if *status already holds a failure; read
// (_staticUCA == NULL) into needsInit through UMTX_CHECK; open the UCA data
// item with udata_openChoice, validated by isAcceptableUCA; wrap the data in a
// collator with ucol_initCollator; initialize the implicit-CE constants;
// store the data memory in UCA_DATA_MEM if _staticUCA is still NULL; register
// ucol_res_cleanup as the cleanup hook for UCLN_I18N_UCOL_RES.
// NOTE(review): some statements of this function are not visible in this
// listing (e.g. the assignment of _staticUCA and the error paths) -- confirm
// against the complete source.
106 /* do not close UCA returned by ucol_initUCA! */
108 ucol_initUCA(UErrorCode
*status
) {
109 if(U_FAILURE(*status
)) {
113 UMTX_CHECK(NULL
, (_staticUCA
== NULL
), needsInit
);
// Open the UCA data item; isAcceptableUCA decides whether it is usable.
116 UDataMemory
*result
= udata_openChoice(U_ICUDATA_COLL
, UCA_DATA_TYPE
, UCA_DATA_NAME
, isAcceptableUCA
, NULL
, status
);
118 if(U_SUCCESS(*status
)){
// Build a candidate collator directly on top of the mapped data.
119 UCollator
*newUCA
= ucol_initCollator((const UCATableHeader
*)udata_getMemory(result
), NULL
, NULL
, status
);
120 if(U_SUCCESS(*status
)){
121 // Initialize variables for implicit generation
122 uprv_uca_initImplicitConstants(status
);
// Re-check: only publish if no UCA has been installed in the meantime.
125 if(_staticUCA
== NULL
) {
126 UCA_DATA_MEM
= result
;
133 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_RES
, ucol_res_cleanup
);
150 U_CAPI
void U_EXPORT2
157 /****************************************************************************/
158 /* Following are the open/close functions */
160 /****************************************************************************/
162 tryOpeningFromRules(UResourceBundle
*collElem
, UErrorCode
*status
) {
163 int32_t rulesLen
= 0;
164 const UChar
*rules
= ures_getStringByKey(collElem
, "Sequence", &rulesLen
, status
);
165 return ucol_openRules(rules
, rulesLen
, UCOL_DEFAULT
, UCOL_DEFAULT
, NULL
, status
);
// Opens a collator for locale `loc` straight from the collation resource
// bundles (ucol_open calls this when the service path is unavailable or
// yields nothing).
// Visible flow:
//  1. Get the UCA via ucol_initUCA; return 0 if that fails.
//  2. Open the locale bundle `b` and its "collations" table.
//  3. Pick the collation type: the "collation" keyword of `loc`, or, when it
//     is absent or "default", the bundle's "default" entry.
//  4. Look up that type with fallback, giving `collElem`.
//  5. U_MISSING_RESOURCE_ERROR: fall back to the UCA image
//     (U_USING_DEFAULT_WARNING, hasRealData = FALSE, bundles reopened on root).
//  6. Otherwise load "%%CollationBin" when present and version-compatible,
//     else build from the "Sequence" rules; then apply "%%ReorderCodes".
//  7. Attach rules, UCA rules and the requested/actual/valid locale strings.
//  8. Close the resource bundles.
// NOTE(review): parts of this function (second parameter, several error
// branches, the return statements) are not visible in this listing.
172 ucol_open_internal(const char *loc
,
175 UErrorCode intStatus
= U_ZERO_ERROR
;
176 const UCollator
* UCA
= ucol_initUCA(status
);
179 if(U_FAILURE(*status
)) return 0;
183 UCollator
*result
= NULL
;
184 UResourceBundle
*b
= ures_open(U_ICUDATA_COLL
, loc
, status
);
186 /* we try to find stuff from keyword */
187 UResourceBundle
*collations
= ures_getByKey(b
, "collations", NULL
, status
);
188 UResourceBundle
*collElem
= NULL
;
190 // if there is a keyword, we pick it up and try to get elements
191 if(!uloc_getKeywordValue(loc
, "collation", keyBuffer
, 256, status
) ||
192 !uprv_strcmp(keyBuffer
,"default")) { /* Treat 'zz@collation=default' as 'zz'. */
193 // no keyword. we try to find the default setting, which will give us the keyword value
194 intStatus
= U_ZERO_ERROR
;
195 // finding default value does not affect collation fallback status
196 UResourceBundle
*defaultColl
= ures_getByKeyWithFallback(collations
, "default", NULL
, &intStatus
);
197 if(U_SUCCESS(intStatus
)) {
198 int32_t defaultKeyLen
= 0;
199 const UChar
*defaultKey
= ures_getString(defaultColl
, &defaultKeyLen
, &intStatus
);
// NOTE(review): no visible check that defaultKeyLen fits into keyBuffer
// (a capacity of 256 is passed to uloc_getKeywordValue above) -- confirm.
200 u_UCharsToChars(defaultKey
, keyBuffer
, defaultKeyLen
);
201 keyBuffer
[defaultKeyLen
] = 0;
203 *status
= U_INTERNAL_PROGRAM_ERROR
;
206 ures_close(defaultColl
);
// The third argument reuses the `collations` object as the fill-in for the
// result, so `collations` must not be used or closed separately afterwards.
208 collElem
= ures_getByKeyWithFallback(collations
, keyBuffer
, collations
, status
);
209 collations
= NULL
; // We just reused the collations object as collElem.
211 UResourceBundle
*binary
= NULL
;
212 UResourceBundle
*reorderRes
= NULL
;
214 if(*status
== U_MISSING_RESOURCE_ERROR
) { /* We didn't find the tailoring data, we fallback to the UCA */
215 *status
= U_USING_DEFAULT_WARNING
;
216 result
= ucol_initCollator(UCA
->image
, result
, UCA
, status
);
217 if (U_FAILURE(*status
)) {
220 // if we use UCA, real locale is root
222 b
= ures_open(U_ICUDATA_COLL
, "", status
);
223 ures_close(collElem
);
224 collElem
= ures_open(U_ICUDATA_COLL
, "", status
);
225 if(U_FAILURE(*status
)) {
228 result
->hasRealData
= FALSE
;
229 } else if(U_SUCCESS(*status
)) {
// intStatus keeps a missing binary image from leaking into *status.
230 intStatus
= U_ZERO_ERROR
;
232 binary
= ures_getByKey(collElem
, "%%CollationBin", NULL
, &intStatus
);
234 if(intStatus
== U_MISSING_RESOURCE_ERROR
) { /* we didn't find the binary image, we should use the rules */
236 result
= tryOpeningFromRules(collElem
, status
);
237 if(U_FAILURE(*status
)) {
240 } else if(U_SUCCESS(intStatus
)) { /* otherwise, we'll pick a collation data that exists */
242 const uint8_t *inData
= ures_getBinary(binary
, &len
, status
);
243 if(U_FAILURE(*status
)) {
// The binary image is only usable if it was built against the same UCA and
// UCD versions and by the same builder version as the loaded UCA.
246 UCATableHeader
*colData
= (UCATableHeader
*)inData
;
247 if(uprv_memcmp(colData
->UCAVersion
, UCA
->image
->UCAVersion
, sizeof(UVersionInfo
)) != 0 ||
248 uprv_memcmp(colData
->UCDVersion
, UCA
->image
->UCDVersion
, sizeof(UVersionInfo
)) != 0 ||
249 colData
->version
[0] != UCOL_BUILDER_VERSION
)
251 *status
= U_DIFFERENT_UCA_VERSION
;
252 result
= tryOpeningFromRules(collElem
, status
);
254 if(U_FAILURE(*status
)){
// An image larger than the padded header plus option set carries real
// tailoring data; otherwise only the options are taken from it and the
// UCA image supplies the tables.
257 if((uint32_t)len
> (paddedsize(sizeof(UCATableHeader
)) + paddedsize(sizeof(UColOptionSet
)))) {
258 result
= ucol_initCollator((const UCATableHeader
*)inData
, result
, UCA
, status
);
259 if(U_FAILURE(*status
)){
262 result
->hasRealData
= TRUE
;
264 result
= ucol_initCollator(UCA
->image
, result
, UCA
, status
);
265 ucol_setOptionsFromHeader(result
, (UColOptionSet
*)(inData
+((const UCATableHeader
*)inData
)->options
), status
);
266 if(U_FAILURE(*status
)){
269 result
->hasRealData
= FALSE
;
// The image memory belongs to the resource bundle, not to the collator.
271 result
->freeImageOnClose
= FALSE
;
273 reorderRes
= ures_getByKey(collElem
, "%%ReorderCodes", NULL
, &intStatus
);
274 if (U_SUCCESS(intStatus
)) {
275 int32_t reorderCodesLen
= 0;
276 const int32_t* reorderCodes
= ures_getIntVector(reorderRes
, &reorderCodesLen
, status
);
277 ucol_setReorderCodes(result
, reorderCodes
, reorderCodesLen
, status
);
278 if (U_FAILURE(*status
)) {
284 } else { // !U_SUCCESS(intStatus)
285 if(U_SUCCESS(*status
)) {
286 *status
= intStatus
; // propagate underlying error
// The rule string is borrowed from the resource bundle, hence
// freeRulesOnClose = FALSE.
290 intStatus
= U_ZERO_ERROR
;
291 result
->rules
= ures_getStringByKey(collElem
, "Sequence", &result
->rulesLength
, &intStatus
);
292 result
->freeRulesOnClose
= FALSE
;
293 } else { /* There is another error, and we're just gonna clean up */
297 intStatus
= U_ZERO_ERROR
;
298 result
->ucaRules
= ures_getStringByKey(b
,"UCARules",NULL
,&intStatus
);
// Each locale string is duplicated with uprv_strdup and checked for
// allocation failure.
// NOTE(review): requestedLocale is filled from the bundle's
// ULOC_ACTUAL_LOCALE rather than from the incoming `loc` -- confirm intent.
301 loc
= ures_getLocaleByType(b
, ULOC_ACTUAL_LOCALE
, status
);
303 result
->requestedLocale
= uprv_strdup(loc
);
305 if (result
->requestedLocale
== NULL
) {
306 *status
= U_MEMORY_ALLOCATION_ERROR
;
309 loc
= ures_getLocaleByType(collElem
, ULOC_ACTUAL_LOCALE
, status
);
310 result
->actualLocale
= uprv_strdup(loc
);
312 if (result
->actualLocale
== NULL
) {
313 *status
= U_MEMORY_ALLOCATION_ERROR
;
316 loc
= ures_getLocaleByType(b
, ULOC_ACTUAL_LOCALE
, status
);
317 result
->validLocale
= uprv_strdup(loc
);
319 if (result
->validLocale
== NULL
) {
320 *status
= U_MEMORY_ALLOCATION_ERROR
;
// Two groups of ures_close calls follow.
// NOTE(review): presumably one is the normal exit and one the error-cleanup
// path; the statements separating them are not visible -- confirm.
325 ures_close(collElem
);
327 ures_close(reorderRes
);
332 ures_close(collElem
);
334 ures_close(reorderRes
);
340 ucol_open(const char *loc
,
345 UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN
);
346 UTRACE_DATA1(UTRACE_INFO
, "locale = \"%s\"", loc
);
347 UCollator
*result
= NULL
;
349 #if !UCONFIG_NO_SERVICE
350 result
= Collator::createUCollator(loc
, status
);
354 result
= ucol_open_internal(loc
, status
);
356 UTRACE_EXIT_PTR_STATUS(result
, *status
);
// Builds a collator from a rule string; `importFunc` is handed to the rule
// tokenizer together with `context`.
// Visible flow:
//  - validate arguments: status must be non-NULL and not a failure; rules must
//    be non-NULL and rulesLength >= -1 (-1 means NUL-terminated, measured with
//    u_strlen); an unsupported normalizationMode is U_ILLEGAL_ARGUMENT_ERROR;
//  - obtain the UCA and tokenize the rules;
//  - if the rules produced tokens (or a removeSet), assemble a tailoring table,
//    stamp it with builder/UCD/UCA versions and wrap it in a collator that
//    owns the image;
//  - otherwise (options only) create the collator on the UCA image and give it
//    a heap copy of the parsed options plus the parsed reorder codes;
//  - on success keep a private NUL-terminated copy of the rule text, clear the
//    locale fields, build the permutation table and apply strength and
//    normalization mode;
//  - finally close the token list.
// NOTE(review): some parameters, declarations and error branches are not
// visible in this listing.
362 ucol_openRulesForImport( const UChar
*rules
,
364 UColAttributeValue normalizationMode
,
365 UCollationStrength strength
,
366 UParseError
*parseError
,
367 GetCollationRulesFunction importFunc
,
372 UColAttributeValue norm
;
375 if(status
== NULL
|| U_FAILURE(*status
)){
379 if(rules
== NULL
|| rulesLength
< -1) {
380 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// A length of -1 means the rule string is NUL-terminated.
384 if(rulesLength
== -1) {
385 rulesLength
= u_strlen(rules
);
388 if(parseError
== NULL
){
// Accepted normalization modes are copied into `norm`; anything else is an
// illegal argument.
392 switch(normalizationMode
) {
396 norm
= normalizationMode
;
399 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
403 UCollator
*result
= NULL
;
404 UCATableHeader
*table
= NULL
;
405 UCollator
*UCA
= ucol_initUCA(status
);
407 if(U_FAILURE(*status
)){
411 ucol_tok_initTokenList(&src
, rules
, rulesLength
, UCA
, importFunc
, context
, status
);
412 ucol_tok_assembleTokenList(&src
,parseError
, status
);
414 if(U_FAILURE(*status
)) {
415 /* if status is U_ILLEGAL_ARGUMENT_ERROR, src->current points at the offending option */
416 /* if status is U_INVALID_FORMAT_ERROR, src->current points after the problematic part of the rules */
417 /* so something might be done here... or on lower level */
// NOTE(review): the stderr diagnostics below are presumably compiled only in
// debug builds; the surrounding preprocessor lines are not visible -- confirm.
419 if(*status
== U_ILLEGAL_ARGUMENT_ERROR
) {
420 fprintf(stderr
, "bad option starting at offset %i\n", (int)(src
.current
-src
.source
));
422 fprintf(stderr
, "invalid rule just before offset %i\n", (int)(src
.current
-src
.source
));
428 if(src
.resultLen
> 0 || src
.removeSet
!= NULL
) { /* we have a set of rules, let's make something of it */
429 /* also, if we wanted to remove some contractions, we should make a tailoring */
430 table
= ucol_assembleTailoringTable(&src
, status
);
431 if(U_SUCCESS(*status
)) {
// Stamp the freshly built table with builder, UCD and UCA versions.
433 table
->version
[0] = UCOL_BUILDER_VERSION
;
434 // no tailoring information on this level
435 table
->version
[1] = table
->version
[2] = table
->version
[3] = 0;
437 u_getUnicodeVersion(table
->UCDVersion
);
439 uprv_memcpy(table
->UCAVersion
, UCA
->image
->UCAVersion
, sizeof(UVersionInfo
));
440 result
= ucol_initCollator(table
, 0, UCA
, status
);
441 if (U_FAILURE(*status
)) {
// The collator owns the assembled table and frees it on close.
444 result
->hasRealData
= TRUE
;
445 result
->freeImageOnClose
= TRUE
;
447 } else { /* no rules, but no error either */
448 // must be only options
449 // We will init the collator from UCA
450 result
= ucol_initCollator(UCA
->image
, 0, UCA
, status
);
451 // Check for null result
452 if (U_FAILURE(*status
)) {
455 // And set only the options
456 UColOptionSet
*opts
= (UColOptionSet
*)uprv_malloc(sizeof(UColOptionSet
));
459 *status
= U_MEMORY_ALLOCATION_ERROR
;
// The option set is a heap copy owned by the collator (freeOptionsOnClose);
// the image still belongs to the UCA.
462 uprv_memcpy(opts
, src
.opts
, sizeof(UColOptionSet
));
463 ucol_setOptionsFromHeader(result
, opts
, status
);
464 ucol_setReorderCodesFromParser(result
, &src
, status
);
465 result
->freeOptionsOnClose
= TRUE
;
466 result
->hasRealData
= FALSE
;
467 result
->freeImageOnClose
= FALSE
;
470 if(U_SUCCESS(*status
)) {
472 result
->dataVersion
[0] = UCOL_BUILDER_VERSION
;
// Keep a private, NUL-terminated copy of the rule text in the collator.
473 if(rulesLength
> 0) {
474 newRules
= (UChar
*)uprv_malloc((rulesLength
+1)*U_SIZEOF_UCHAR
);
476 if (newRules
== NULL
) {
477 *status
= U_MEMORY_ALLOCATION_ERROR
;
480 uprv_memcpy(newRules
, rules
, rulesLength
*U_SIZEOF_UCHAR
);
481 newRules
[rulesLength
]=0;
482 result
->rules
= newRules
;
483 result
->rulesLength
= rulesLength
;
484 result
->freeRulesOnClose
= TRUE
;
// A rule-built collator has no UCA rules string and no locale information.
486 result
->ucaRules
= NULL
;
487 result
->actualLocale
= NULL
;
488 result
->validLocale
= NULL
;
489 result
->requestedLocale
= NULL
;
490 ucol_buildPermutationTable(result
, status
);
491 ucol_setAttribute(result
, UCOL_STRENGTH
, strength
, status
);
492 ucol_setAttribute(result
, UCOL_NORMALIZATION_MODE
, norm
, status
);
505 ucol_tok_closeTokenList(&src
);
510 U_CAPI UCollator
* U_EXPORT2
511 ucol_openRules( const UChar
*rules
,
513 UColAttributeValue normalizationMode
,
514 UCollationStrength strength
,
515 UParseError
*parseError
,
518 return ucol_openRulesForImport(rules
,
523 ucol_tok_getRulesFromBundle
,
528 U_CAPI
int32_t U_EXPORT2
529 ucol_getRulesEx(const UCollator
*coll
, UColRuleOption delta
, UChar
*buffer
, int32_t bufferLen
) {
530 UErrorCode status
= U_ZERO_ERROR
;
533 const UChar
* ucaRules
= 0;
534 const UChar
*rules
= ucol_getRules(coll
, &len
);
535 if(delta
== UCOL_FULL_RULES
) {
536 /* take the UCA rules and append real rules at the end */
537 /* UCA rules will be probably coming from the root RB */
538 ucaRules
= coll
->ucaRules
;
540 UCAlen
= u_strlen(ucaRules
);
543 ucaRules = ures_getStringByKey(coll->rb,"UCARules",&UCAlen,&status);
544 UResourceBundle* cresb = ures_getByKeyWithFallback(coll->rb, "collations", NULL, &status);
545 UResourceBundle* uca = ures_getByKeyWithFallback(cresb, "UCA", NULL, &status);
546 ucaRules = ures_getStringByKey(uca,"Sequence",&UCAlen,&status);
551 if(U_FAILURE(status
)) {
554 if(buffer
!=0 && bufferLen
>0){
557 u_memcpy(buffer
, ucaRules
, uprv_min(UCAlen
, bufferLen
));
559 if(len
> 0 && bufferLen
> UCAlen
) {
560 u_memcpy(buffer
+UCAlen
, rules
, uprv_min(len
, bufferLen
-UCAlen
));
563 return u_terminateUChars(buffer
, bufferLen
, len
+UCAlen
, &status
);
566 static const UChar _NUL
= 0;
568 U_CAPI
const UChar
* U_EXPORT2
569 ucol_getRules( const UCollator
*coll
,
572 if(coll
->rules
!= NULL
) {
573 *length
= coll
->rulesLength
;
// Decides whether two collators are equal.
// Visible checks, cheapest first:
//  1. identical pointers;
//  2. every attribute in [0, UCOL_ATTRIBUTE_COUNT) has the same value;
//  3. same reorder-code count and the same codes in the same order;
//  4. identical rule strings;
//  5. otherwise both rule strings are tokenized and the token lists compared
//     reset by reset, element by element, including expansions.
// Both token lists are closed at the end.
// NOTE(review): the statements that record or return the verdict are not
// visible in this listing.
582 U_CAPI UBool U_EXPORT2
583 ucol_equals(const UCollator
*source
, const UCollator
*target
) {
584 UErrorCode status
= U_ZERO_ERROR
;
585 // if pointers are equal, collators are equal
586 if(source
== target
) {
589 int32_t i
= 0, j
= 0;
590 // if any of attributes are different, collators are not equal
591 for(i
= 0; i
< UCOL_ATTRIBUTE_COUNT
; i
++) {
592 if(ucol_getAttribute(source
, (UColAttribute
)i
, &status
) != ucol_getAttribute(target
, (UColAttribute
)i
, &status
) || U_FAILURE(status
)) {
// Reorder codes must match in count and, position by position, in value.
596 if (source
->reorderCodesLength
!= target
->reorderCodesLength
){
599 for (i
= 0; i
< source
->reorderCodesLength
; i
++) {
600 if(source
->reorderCodes
[i
] != target
->reorderCodes
[i
]) {
605 int32_t sourceRulesLen
= 0, targetRulesLen
= 0;
606 const UChar
*sourceRules
= ucol_getRules(source
, &sourceRulesLen
);
607 const UChar
*targetRules
= ucol_getRules(target
, &targetRulesLen
);
609 if(sourceRulesLen
== targetRulesLen
&& u_strncmp(sourceRules
, targetRules
, sourceRulesLen
) == 0) {
610 // all the attributes are equal and the rules are equal - collators are equal
613 // hard part, need to construct tree from rules and see if they yield the same tailoring
615 UParseError parseError
;
616 UColTokenParser sourceParser
, targetParser
;
617 int32_t sourceListLen
= 0, targetListLen
= 0;
618 ucol_tok_initTokenList(&sourceParser
, sourceRules
, sourceRulesLen
, source
->UCA
, ucol_tok_getRulesFromBundle
, NULL
, &status
);
619 ucol_tok_initTokenList(&targetParser
, targetRules
, targetRulesLen
, target
->UCA
, ucol_tok_getRulesFromBundle
, NULL
, &status
);
620 sourceListLen
= ucol_tok_assembleTokenList(&sourceParser
, &parseError
, &status
);
621 targetListLen
= ucol_tok_assembleTokenList(&targetParser
, &parseError
, &status
);
623 if(sourceListLen
!= targetListLen
) {
624 // different number of resets
627 UColToken
*sourceReset
= NULL
, *targetReset
= NULL
;
628 UChar
*sourceResetString
= NULL
, *targetResetString
= NULL
;
629 int32_t sourceStringLen
= 0, targetStringLen
= 0;
// For every reset in the source list, look for a reset with the same string
// in the target list. A token's `source` / `expansion` field is used here as
// a packed value: the low 24 bits are an offset into the parser's rule text
// and the high 8 bits are the length.
630 for(i
= 0; i
< sourceListLen
; i
++) {
631 sourceReset
= sourceParser
.lh
[i
].reset
;
632 sourceResetString
= sourceParser
.source
+(sourceReset
->source
& 0xFFFFFF);
633 sourceStringLen
= sourceReset
->source
>> 24;
// The target list is walked with sourceListLen as the bound.
// NOTE(review): presumably safe because the two lengths were compared
// above; confirm that a mismatch never reaches this loop.
634 for(j
= 0; j
< sourceListLen
; j
++) {
635 targetReset
= targetParser
.lh
[j
].reset
;
636 targetResetString
= targetParser
.source
+(targetReset
->source
& 0xFFFFFF);
637 targetStringLen
= targetReset
->source
>> 24;
638 if(sourceStringLen
== targetStringLen
&& (u_strncmp(sourceResetString
, targetResetString
, sourceStringLen
) == 0)) {
// Matching reset anchors: walk both tailored lists in lock step; the
// sourceReset / targetReset variables are reused as list cursors.
639 sourceReset
= sourceParser
.lh
[i
].first
;
640 targetReset
= targetParser
.lh
[j
].first
;
641 while(sourceReset
!= NULL
&& targetReset
!= NULL
) {
642 sourceResetString
= sourceParser
.source
+(sourceReset
->source
& 0xFFFFFF);
643 sourceStringLen
= sourceReset
->source
>> 24;
644 targetResetString
= targetParser
.source
+(targetReset
->source
& 0xFFFFFF);
645 targetStringLen
= targetReset
->source
>> 24;
646 if(sourceStringLen
!= targetStringLen
|| (u_strncmp(sourceResetString
, targetResetString
, sourceStringLen
) != 0)) {
650 // probably also need to check the expansions
651 if(sourceReset
->expansion
) {
652 if(!targetReset
->expansion
) {
656 // compare expansions
657 sourceResetString
= sourceParser
.source
+(sourceReset
->expansion
& 0xFFFFFF);
658 sourceStringLen
= sourceReset
->expansion
>> 24;
659 targetResetString
= targetParser
.source
+(targetReset
->expansion
& 0xFFFFFF);
660 targetStringLen
= targetReset
->expansion
>> 24;
661 if(sourceStringLen
!= targetStringLen
|| (u_strncmp(sourceResetString
, targetResetString
, sourceStringLen
) != 0)) {
// Source token has no expansion: the target token must not have one either.
667 if(targetReset
->expansion
) {
672 sourceReset
= sourceReset
->next
;
673 targetReset
= targetReset
->next
;
// After the loop at least one cursor is NULL, so the cursors differ only if
// one list still has elements left.
675 if(sourceReset
!= targetReset
) { // at least one is not NULL
676 // there are more tailored elements in one list
685 // couldn't find the reset anchor, so the collators are not equal
686 if(j
== sourceListLen
) {
694 ucol_tok_closeTokenList(&sourceParser
);
695 ucol_tok_closeTokenList(&targetParser
);
700 U_CAPI
int32_t U_EXPORT2
701 ucol_getDisplayName( const char *objLoc
,
704 int32_t resultLength
,
709 if(U_FAILURE(*status
)) return -1;
711 if(!(result
==NULL
&& resultLength
==0)) {
712 // NULL destination for pure preflighting: empty dummy string
713 // otherwise, alias the destination buffer
714 dst
.setTo(result
, 0, resultLength
);
716 Collator::getDisplayName(Locale(objLoc
), Locale(dispLoc
), dst
);
717 return dst
.extract(result
, resultLength
, *status
);
720 U_CAPI
const char* U_EXPORT2
721 ucol_getAvailable(int32_t index
)
724 const Locale
*loc
= Collator::getAvailableLocales(count
);
725 if (loc
!= NULL
&& index
< count
) {
726 return loc
[index
].getName();
731 U_CAPI
int32_t U_EXPORT2
732 ucol_countAvailable()
735 Collator::getAvailableLocales(count
);
739 #if !UCONFIG_NO_SERVICE
740 U_CAPI UEnumeration
* U_EXPORT2
741 ucol_openAvailableLocales(UErrorCode
*status
) {
744 // This is a wrapper over Collator::getAvailableLocales()
745 if (U_FAILURE(*status
)) {
748 StringEnumeration
*s
= Collator::getAvailableLocales();
750 *status
= U_MEMORY_ALLOCATION_ERROR
;
753 return uenum_openFromStringEnumeration(s
, status
);
// Note: KEYWORDS[0] != RESOURCE_NAME - alan
// The resource table is called "collations" (plural) while the locale
// keyword is "collation" (singular).
static const char RESOURCE_NAME[] = "collations";

// Locale keywords understood by the collation service.
static const char* const KEYWORDS[] = { "collation" };

// Number of entries in KEYWORDS.
#define KEYWORD_COUNT (sizeof(KEYWORDS)/sizeof(KEYWORDS[0]))
765 U_CAPI UEnumeration
* U_EXPORT2
766 ucol_getKeywords(UErrorCode
*status
) {
767 UEnumeration
*result
= NULL
;
768 if (U_SUCCESS(*status
)) {
769 return uenum_openCharStringsEnumeration(KEYWORDS
, KEYWORD_COUNT
, status
);
774 U_CAPI UEnumeration
* U_EXPORT2
775 ucol_getKeywordValues(const char *keyword
, UErrorCode
*status
) {
776 if (U_FAILURE(*status
)) {
779 // hard-coded to accept exactly one collation keyword
780 // modify if additional collation keyword is added later
781 if (keyword
==NULL
|| uprv_strcmp(keyword
, KEYWORDS
[0])!=0)
783 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
786 return ures_getKeywordValues(U_ICUDATA_COLL
, RESOURCE_NAME
, status
);
// Template UEnumeration copied (with memcpy) into every enumeration returned
// by ucol_getKeywordValuesForLocale; that function then points `context` at
// its result list. The callbacks visible here are the ulist keyword-value
// iterator functions for close, count, next and reset.
// NOTE(review): this is a positional initializer and some of its entries are
// not visible in this listing -- check the order against the UEnumeration
// struct definition before editing.
789 static const UEnumeration defaultKeywordValues
= {
792 ulist_close_keyword_values_iterator
,
793 ulist_count_keyword_values
,
795 ulist_next_keyword_value
,
796 ulist_reset_keyword_values_iterator
// Builds an enumeration of the collation types available for `locale`.
// The key and commonlyUsed parameters are unnamed and therefore unused.
// Visible flow:
//  - reduce `locale` to its base name;
//  - create two lists: `values` (temporary) and `results` (which becomes the
//    context of the returned enumeration);
//  - walk from the locale's collation bundle up through its parents until the
//    root (""), iterating the "collations" table of each bundle: the string of
//    the first "default" entry found goes to the front of `results`, every
//    other key is appended to `values`;
//  - append to `results` each entry of `values` it does not contain yet;
//  - delete `values` and reset `results` for iteration.
// NOTE(review): several branches, including the cleanup on failure and the
// return statements, are not visible in this listing.
801 U_CAPI UEnumeration
* U_EXPORT2
802 ucol_getKeywordValuesForLocale(const char* /*key*/, const char* locale
,
803 UBool
/*commonlyUsed*/, UErrorCode
* status
) {
804 /* Get the locale base name. */
805 char localeBuffer
[ULOC_FULLNAME_CAPACITY
] = "";
806 uloc_getBaseName(locale
, localeBuffer
, sizeof(localeBuffer
), status
);
808 /* Create the 2 lists
809 * -values is the temp location for the keyword values
810 * -results hold the actual list used by the UEnumeration object
812 UList
*values
= ulist_createEmptyList(status
);
813 UList
*results
= ulist_createEmptyList(status
);
814 UEnumeration
*en
= (UEnumeration
*)uprv_malloc(sizeof(UEnumeration
));
815 if (U_FAILURE(*status
) || en
== NULL
) {
817 *status
= U_MEMORY_ALLOCATION_ERROR
;
821 ulist_deleteList(values
);
822 ulist_deleteList(results
);
// Start from the shared template and attach the result list as context.
826 memcpy(en
, &defaultKeywordValues
, sizeof(UEnumeration
));
827 en
->context
= results
;
829 /* Open the resource bundle for collation with the given locale. */
830 UResourceBundle bundle
, collations
, collres
, defres
;
831 ures_initStackObject(&bundle
);
832 ures_initStackObject(&collations
);
833 ures_initStackObject(&collres
);
834 ures_initStackObject(&defres
);
836 ures_openFillIn(&bundle
, U_ICUDATA_COLL
, localeBuffer
, status
);
// One pass per locale in the parent chain; the loop is left at the root.
838 while (U_SUCCESS(*status
)) {
839 ures_getByKey(&bundle
, RESOURCE_NAME
, &collations
, status
);
840 ures_resetIterator(&collations
);
841 while (U_SUCCESS(*status
) && ures_hasNext(&collations
)) {
842 ures_getNextResource(&collations
, &collres
, status
);
843 const char *key
= ures_getKey(&collres
);
844 /* If the key is default, get the string and store it in results list only
845 * if results list is empty.
847 if (uprv_strcmp(key
, "default") == 0) {
848 if (ulist_getListSize(results
) == 0) {
// NOTE(review): no check of this allocation is visible before defcoll is
// written to -- confirm one exists in the complete source.
849 char *defcoll
= (char *)uprv_malloc(sizeof(char) * ULOC_KEYWORDS_CAPACITY
);
850 int32_t defcollLength
= ULOC_KEYWORDS_CAPACITY
;
852 ures_getNextResource(&collres
, &defres
, status
);
853 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
854 /* optimize - use the utf-8 string */
855 ures_getUTF8String(&defres
, defcoll
, &defcollLength
, TRUE
, status
);
// Other charset families: fetch the UChar string and convert it, after
// checking that it fits with its terminator.
858 const UChar
* defString
= ures_getString(&defres
, &defcollLength
, status
);
859 if(U_SUCCESS(*status
)) {
860 if(defcollLength
+1 > ULOC_KEYWORDS_CAPACITY
) {
861 *status
= U_BUFFER_OVERFLOW_ERROR
;
863 u_UCharsToChars(defString
, defcoll
, defcollLength
+1);
// The default is added at the front with TRUE as third argument, while the
// resource keys below are added with FALSE.
// NOTE(review): that flag presumably tells the list whether to free the
// item -- confirm in ulist.
869 ulist_addItemBeginList(results
, defcoll
, TRUE
, status
);
872 ulist_addItemEndList(values
, key
, FALSE
, status
);
876 /* If the locale is "" this is root so exit. */
877 if (uprv_strlen(localeBuffer
) == 0) {
880 /* Get the parent locale and open a new resource bundle. */
881 uloc_getParent(localeBuffer
, localeBuffer
, sizeof(localeBuffer
), status
);
882 ures_openFillIn(&bundle
, U_ICUDATA_COLL
, localeBuffer
, status
);
886 ures_close(&collres
);
887 ures_close(&collations
);
// Merge: append every collected key that `results` does not hold yet.
890 if (U_SUCCESS(*status
)) {
892 ulist_resetList(values
);
893 while ((value
= (char *)ulist_getNext(values
)) != NULL
) {
894 if (!ulist_containsString(results
, value
, (int32_t)uprv_strlen(value
))) {
895 ulist_addItemEndList(results
, value
, FALSE
, status
);
896 if (U_FAILURE(*status
)) {
903 ulist_deleteList(values
);
905 if (U_FAILURE(*status
)){
909 ulist_resetList(results
);
915 U_CAPI
int32_t U_EXPORT2
916 ucol_getFunctionalEquivalent(char* result
, int32_t resultCapacity
,
917 const char* keyword
, const char* locale
,
918 UBool
* isAvailable
, UErrorCode
* status
)
920 // N.B.: Resource name is "collations" but keyword is "collation"
921 return ures_getFunctionalEquivalent(result
, resultCapacity
, U_ICUDATA_COLL
,
922 "collations", keyword
, locale
,
923 isAvailable
, TRUE
, status
);
926 /* returns the locale name the collation data comes from */
927 U_CAPI
const char * U_EXPORT2
928 ucol_getLocale(const UCollator
*coll
, ULocDataLocaleType type
, UErrorCode
*status
) {
929 return ucol_getLocaleByType(coll
, type
, status
);
932 U_CAPI
const char * U_EXPORT2
933 ucol_getLocaleByType(const UCollator
*coll
, ULocDataLocaleType type
, UErrorCode
*status
) {
934 const char *result
= NULL
;
935 if(status
== NULL
|| U_FAILURE(*status
)) {
938 UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE
);
939 UTRACE_DATA1(UTRACE_INFO
, "coll=%p", coll
);
942 case ULOC_ACTUAL_LOCALE
:
943 result
= coll
->actualLocale
;
945 case ULOC_VALID_LOCALE
:
946 result
= coll
->validLocale
;
948 case ULOC_REQUESTED_LOCALE
:
949 result
= coll
->requestedLocale
;
952 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
954 UTRACE_DATA1(UTRACE_INFO
, "result = %s", result
);
955 UTRACE_EXIT_STATUS(*status
);
959 U_CFUNC
void U_EXPORT2
960 ucol_setReqValidLocales(UCollator
*coll
, char *requestedLocaleToAdopt
, char *validLocaleToAdopt
, char *actualLocaleToAdopt
)
963 if (coll
->validLocale
) {
964 uprv_free(coll
->validLocale
);
966 coll
->validLocale
= validLocaleToAdopt
;
967 if (coll
->requestedLocale
) { // should always have
968 uprv_free(coll
->requestedLocale
);
970 coll
->requestedLocale
= requestedLocaleToAdopt
;
971 if (coll
->actualLocale
) {
972 uprv_free(coll
->actualLocale
);
974 coll
->actualLocale
= actualLocaleToAdopt
;
978 U_CAPI USet
* U_EXPORT2
979 ucol_getTailoredSet(const UCollator
*coll
, UErrorCode
*status
)
983 if(status
== NULL
|| U_FAILURE(*status
)) {
986 if(coll
== NULL
|| coll
->UCA
== NULL
) {
987 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
990 UParseError parseError
;
992 int32_t rulesLen
= 0;
993 const UChar
*rules
= ucol_getRules(coll
, &rulesLen
);
994 UBool startOfRules
= TRUE
;
995 // we internally use the C++ class, for the following reasons:
996 // 1. we need to utilize canonical iterator, which is a C++ only class
997 // 2. canonical iterator returns UnicodeStrings - USet cannot take them
998 // 3. USet is internally really UnicodeSet, C is just a wrapper
999 UnicodeSet
*tailored
= new UnicodeSet();
1000 UnicodeString pattern
;
1001 UnicodeString empty
;
1002 CanonicalIterator
it(empty
, *status
);
1005 // The idea is to tokenize the rule set. For each non-reset token,
1006 // we add all the canonicaly equivalent FCD sequences
1007 ucol_tok_initTokenList(&src
, rules
, rulesLen
, coll
->UCA
, ucol_tok_getRulesFromBundle
, NULL
, status
);
1008 while (ucol_tok_parseNextToken(&src
, startOfRules
, &parseError
, status
) != NULL
) {
1009 startOfRules
= FALSE
;
1010 if(src
.parsedToken
.strength
!= UCOL_TOK_RESET
) {
1011 const UChar
*stuff
= src
.source
+(src
.parsedToken
.charsOffset
);
1012 it
.setSource(UnicodeString(stuff
, src
.parsedToken
.charsLen
), *status
);
1013 pattern
= it
.next();
1014 while(!pattern
.isBogus()) {
1015 if(Normalizer::quickCheck(pattern
, UNORM_FCD
, *status
) != UNORM_NO
) {
1016 tailored
->add(pattern
);
1018 pattern
= it
.next();
1022 ucol_tok_closeTokenList(&src
);
1023 return (USet
*)tailored
;
1027 * Collation Reordering
1030 static void ucol_setReorderCodesFromParser(UCollator
*coll
, UColTokenParser
*parser
, UErrorCode
*status
) {
1031 if (U_FAILURE(*status
)) {
1035 coll
->reorderCodesLength
= 0;
1036 if (coll
->reorderCodes
!= NULL
) {
1037 uprv_free(coll
->reorderCodes
);
1040 if (parser
->reorderCodesLength
== 0 || parser
->reorderCodes
== NULL
) {
1044 coll
->reorderCodesLength
= parser
->reorderCodesLength
;
1045 coll
->reorderCodes
= (int32_t*) uprv_malloc(coll
->reorderCodesLength
* sizeof(int32_t));
1046 uprv_memcpy(coll
->reorderCodes
, parser
->reorderCodes
, coll
->reorderCodesLength
* sizeof(int32_t));
1049 static int ucol_getLeadBytesForReorderCode(UCollator
*coll
, int reorderCode
, uint16_t* returnLeadBytes
, int returnCapacity
) {
1050 uint16_t reorderCodeIndexLength
= *((uint16_t*) ((uint8_t *)coll
->UCA
->image
+ coll
->UCA
->image
->scriptToLeadByte
));
1051 uint16_t* reorderCodeIndex
= (uint16_t*) ((uint8_t *)coll
->UCA
->image
+ coll
->UCA
->image
->scriptToLeadByte
+ 2 *sizeof(uint16_t));
1053 // TODO - replace with a binary search
1054 // reorder code index is 2 uint16_t's - reorder code + offset
1055 for (int i
= 0; i
< reorderCodeIndexLength
; i
++) {
1056 if (reorderCode
== reorderCodeIndex
[i
*2]) {
1057 uint16_t dataOffset
= reorderCodeIndex
[(i
*2) + 1];
1058 if ((dataOffset
& 0x8000) == 0x8000) {
1059 // offset isn't offset but instead is a single data element
1060 if (returnCapacity
>= 1) {
1061 returnLeadBytes
[0] = dataOffset
& ~0x8000;
1066 uint16_t* dataOffsetBase
= (uint16_t*) ((uint8_t *)reorderCodeIndex
+ reorderCodeIndexLength
* (2 * sizeof(uint16_t)));
1067 uint16_t leadByteCount
= *(dataOffsetBase
+ dataOffset
);
1068 leadByteCount
= leadByteCount
> returnCapacity
? returnCapacity
: leadByteCount
;
1069 uprv_memcpy(returnLeadBytes
, dataOffsetBase
+ dataOffset
+ 1, leadByteCount
* sizeof(uint16_t));
1070 return leadByteCount
;
1076 static int ucol_getReorderCodesForLeadByte(UCollator
*coll
, int leadByte
, int16_t* returnReorderCodes
, int returnCapacity
) {
1077 int leadByteIndexLength
= *((uint16_t*) ((uint8_t *)coll
->UCA
->image
+ coll
->UCA
->image
->leadByteToScript
));
1078 uint16_t* leadByteIndex
= (uint16_t*) ((uint8_t *)coll
->UCA
->image
+ coll
->UCA
->image
->leadByteToScript
+ 2 *sizeof(uint16_t));
1079 if (leadByte
>= leadByteIndexLength
) {
1083 if ((leadByteIndex
[leadByte
] & 0x8000) == 0x8000) {
1084 // offset isn't offset but instead is a single data element
1085 if (returnCapacity
>= 1) {
1086 returnReorderCodes
[0] = leadByteIndex
[leadByte
] & ~0x8000;
1091 uint16_t* dataOffsetBase
= (uint16_t*) ((uint8_t *)leadByteIndex
+ leadByteIndexLength
* (2 * sizeof(uint16_t)));
1092 uint16_t reorderCodeCount
= *(dataOffsetBase
+ leadByteIndex
[leadByte
]);
1093 reorderCodeCount
= reorderCodeCount
> returnCapacity
? returnCapacity
: reorderCodeCount
;
1094 uprv_memcpy(returnReorderCodes
, dataOffsetBase
+ leadByteIndex
[leadByte
] + 1, reorderCodeCount
* sizeof(uint16_t));
1095 return reorderCodeCount
;
1098 // Marker for reorder code slots that are to be ignored. Its value is one
// past UCOL_REORDER_CODE_LIMIT; ucol_buildPermutationTable stores it into
// its internal reorder code list and compares list entries against it.
1099 static const int32_t UCOL_REORDER_CODE_IGNORE
= UCOL_REORDER_CODE_LIMIT
+ 1;
// Builds coll->leadBytePermutationTable (256 uint8_t entries, indexed by lead
// byte) from the reorder codes stored on the collator.
//
// Outline, as far as this listing shows:
//  - On a failing status, a NULL collator or an empty reorder code list, any
//    existing table is freed and reorderCodesLength is reset to 0.
//  - The table is allocated on demand; allocation failures set
//    U_MEMORY_ALLOCATION_ERROR.
//  - An internal code list is built: first the codes in
//    [UCOL_REORDER_CODE_FIRST, UCOL_REORDER_CODE_LIMIT), then the collator's
//    own codes. A code from that first range that also appears in the
//    collator's list is replaced by UCOL_REORDER_CODE_IGNORE in the prefix.
//  - Bytes below toBottom or above toTop map to themselves.
//  - Codes are placed from the bottom until USCRIPT_UNKNOWN is met, after
//    which the list is walked from its end and codes are placed from the top.
//    A second USCRIPT_UNKNOWN, or a lead byte whose slot is already filled,
//    sets U_ILLEGAL_ARGUMENT_ERROR and discards the table.
//  - Slots still empty afterwards receive the lead bytes not yet handed out.
//  - Finally latinOneRegenTable is set and ucol_updateInternalState is called.
//
// NOTE(review): this listing omits some original lines (see the gaps in the
// embedded numbering, e.g. the declaration of toTop), so return/continue
// statements and counter updates are not all visible here.
1101 void ucol_buildPermutationTable(UCollator
*coll
, UErrorCode
*status
) {
1102 uint16_t leadBytesSize
= 256;
1103 uint16_t leadBytes
[256];
// NOTE(review): coll is dereferenced in this initializer, before the
// coll == NULL test further down -- confirm callers never pass NULL.
1104 int32_t internalReorderCodesLength
= coll
->reorderCodesLength
+ (UCOL_REORDER_CODE_LIMIT
- UCOL_REORDER_CODE_FIRST
);
1105 int32_t* internalReorderCodes
;
1107 // The lowest byte that hasn't been assigned a mapping
1108 int toBottom
= 0x03;
1109 // The highest byte that hasn't been assigned a mapping - don't include the special or trailing
1112 // are we filling from the bottom?
1113 bool fromTheBottom
= true;
1114 int32_t reorderCodesIndex
= -1;
1116 // lead bytes that have already been assigned to the permutation table
1117 bool newLeadByteUsed
[256];
1118 // permutation table slots that have already been filled
1119 bool permutationSlotFilled
[256];
// Nothing to build: drop any existing table and clear the code count.
1122 if(U_FAILURE(*status
) || coll
== NULL
|| coll
->reorderCodesLength
== 0) {
1124 if (coll
->leadBytePermutationTable
!= NULL
) {
1125 uprv_free(coll
->leadBytePermutationTable
);
1126 coll
->leadBytePermutationTable
= NULL
;
1128 coll
->reorderCodesLength
= 0;
// Allocate the 256-entry table if the collator does not have one yet.
1133 if (coll
->leadBytePermutationTable
== NULL
) {
1134 coll
->leadBytePermutationTable
= (uint8_t*)uprv_malloc(256*sizeof(uint8_t));
1135 if (coll
->leadBytePermutationTable
== NULL
) {
1136 *status
= U_MEMORY_ALLOCATION_ERROR
;
1141 // prefill the reordering codes with the leading entries
1142 internalReorderCodes
= (int32_t*)uprv_malloc(internalReorderCodesLength
* sizeof(int32_t));
1143 if (internalReorderCodes
== NULL
) {
1144 *status
= U_MEMORY_ALLOCATION_ERROR
;
1145 if (coll
->leadBytePermutationTable
!= NULL
) {
1146 uprv_free(coll
->leadBytePermutationTable
);
1147 coll
->leadBytePermutationTable
= NULL
;
// Prefix of the internal list: the codes FIRST .. LIMIT-1 in ascending order.
1152 for (uint32_t codeIndex
= 0; codeIndex
< (UCOL_REORDER_CODE_LIMIT
- UCOL_REORDER_CODE_FIRST
); codeIndex
++) {
1153 internalReorderCodes
[codeIndex
] = UCOL_REORDER_CODE_FIRST
+ codeIndex
;
// Append the collator's codes after the prefix; a code that falls inside the
// prefix range has its prefix slot overwritten with the IGNORE marker.
1155 for (int32_t codeIndex
= 0; codeIndex
< coll
->reorderCodesLength
; codeIndex
++) {
1156 uint32_t reorderCodesCode
= coll
->reorderCodes
[codeIndex
];
1157 internalReorderCodes
[codeIndex
+ (UCOL_REORDER_CODE_LIMIT
- UCOL_REORDER_CODE_FIRST
)] = reorderCodesCode
;
1158 if (reorderCodesCode
>= UCOL_REORDER_CODE_FIRST
&& reorderCodesCode
< UCOL_REORDER_CODE_LIMIT
) {
1159 internalReorderCodes
[reorderCodesCode
- UCOL_REORDER_CODE_FIRST
] = UCOL_REORDER_CODE_IGNORE
;
// Bytes outside [toBottom, toTop] keep their value and count as used; every
// other slot starts out empty.
1163 for (int i
= 0; i
< 256; i
++) {
1164 if (i
< toBottom
|| i
> toTop
) {
1165 permutationSlotFilled
[i
] = true;
1166 newLeadByteUsed
[i
] = true;
1167 coll
->leadBytePermutationTable
[i
] = i
;
1169 permutationSlotFilled
[i
] = false;
1170 newLeadByteUsed
[i
] = false;
1171 coll
->leadBytePermutationTable
[i
] = 0;
1175 /* Start from the front of the list and place each script we encounter at the
1176 * earliest possible location in the permutation table. If we encounter
1177 * UNKNOWN, start processing from the back, and place each script in the last
1178 * possible location. At each step, we also need to make sure that any scripts
1179 * that need to not be moved are copied to their same location in the final table.
1181 for (int reorderCodesCount
= 0; reorderCodesCount
< internalReorderCodesLength
; reorderCodesCount
++) {
1182 reorderCodesIndex
+= fromTheBottom
? 1 : -1;
1183 int32_t next
= internalReorderCodes
[reorderCodesIndex
];
1184 if (next
== UCOL_REORDER_CODE_IGNORE
) {
1187 if (next
== USCRIPT_UNKNOWN
) {
1188 if (fromTheBottom
== false) {
1189 // double turnaround
1190 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1191 if (coll
->leadBytePermutationTable
!= NULL
) {
1192 uprv_free(coll
->leadBytePermutationTable
);
1193 coll
->leadBytePermutationTable
= NULL
;
1195 coll
->reorderCodesLength
= 0;
1196 if (internalReorderCodes
!= NULL
) {
1197 uprv_free(internalReorderCodes
);
1201 fromTheBottom
= false;
1202 reorderCodesIndex
= internalReorderCodesLength
;
1206 uint16_t leadByteCount
= ucol_getLeadBytesForReorderCode(coll
, next
, leadBytes
, leadBytesSize
);
1207 if (fromTheBottom
) {
1208 for (int leadByteIndex
= 0; leadByteIndex
< leadByteCount
; leadByteIndex
++) {
1209 // don't place a lead byte twice in the permutation table
1210 if (permutationSlotFilled
[leadBytes
[leadByteIndex
]]) {
1211 // lead byte already used
1212 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1213 if (coll
->leadBytePermutationTable
!= NULL
) {
1214 uprv_free(coll
->leadBytePermutationTable
);
1215 coll
->leadBytePermutationTable
= NULL
;
1217 coll
->reorderCodesLength
= 0;
1218 if (internalReorderCodes
!= NULL
) {
1219 uprv_free(internalReorderCodes
);
1224 coll
->leadBytePermutationTable
[leadBytes
[leadByteIndex
]] = toBottom
;
1225 newLeadByteUsed
[toBottom
] = true;
1226 permutationSlotFilled
[leadBytes
[leadByteIndex
]] = true;
1230 for (int leadByteIndex
= leadByteCount
- 1; leadByteIndex
>= 0; leadByteIndex
--) {
1231 // don't place a lead byte twice in the permutation table
1232 if (permutationSlotFilled
[leadBytes
[leadByteIndex
]]) {
1233 // lead byte already used
1234 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1235 if (coll
->leadBytePermutationTable
!= NULL
) {
1236 uprv_free(coll
->leadBytePermutationTable
);
1237 coll
->leadBytePermutationTable
= NULL
;
1239 coll
->reorderCodesLength
= 0;
1240 if (internalReorderCodes
!= NULL
) {
1241 uprv_free(internalReorderCodes
);
1246 coll
->leadBytePermutationTable
[leadBytes
[leadByteIndex
]] = toTop
;
1247 newLeadByteUsed
[toTop
] = true;
1248 permutationSlotFilled
[leadBytes
[leadByteIndex
]] = true;
1254 #ifdef REORDER_DEBUG
1255 fprintf(stdout
, "\n@@@@ Partial Script Reordering Table\n");
1256 for (int i
= 0; i
< 256; i
++) {
1257 fprintf(stdout
, "\t%02x = %02x\n", i
, coll
->leadBytePermutationTable
[i
]);
1259 fprintf(stdout
, "\n@@@@ Lead Byte Used Table\n");
1260 for (int i
= 0; i
< 256; i
++) {
1261 fprintf(stdout
, "\t%02x = %02x\n", i
, newLeadByteUsed
[i
]);
1263 fprintf(stdout
, "\n@@@@ Permutation Slot Filled Table\n");
1264 for (int i
= 0; i
< 256; i
++) {
1265 fprintf(stdout
, "\t%02x = %02x\n", i
, permutationSlotFilled
[i
]);
1269 /* Copy everything that's left over */
// Each slot that is still empty receives the next lead byte value that has
// not been handed out yet.
// NOTE(review): the search below stops at reorderCode == 256; if that can
// happen, the stores after it index past the 256-entry arrays -- confirm it
// is unreachable.
1270 int reorderCode
= 0;
1271 for (int i
= 0; i
< 256; i
++) {
1272 if (!permutationSlotFilled
[i
]) {
1273 while (reorderCode
< 256 && newLeadByteUsed
[reorderCode
]) {
1276 coll
->leadBytePermutationTable
[i
] = reorderCode
;
1277 permutationSlotFilled
[i
] = true;
1278 newLeadByteUsed
[reorderCode
] = true;
1282 #ifdef REORDER_DEBUG
1283 fprintf(stdout
, "\n@@@@ Script Reordering Table\n");
1284 for (int i
= 0; i
< 256; i
++) {
1285 fprintf(stdout
, "\t%02x = %02x\n", i
, coll
->leadBytePermutationTable
[i
]);
// The temporary code list is no longer needed once the table is complete.
1289 if (internalReorderCodes
!= NULL
) {
1290 uprv_free(internalReorderCodes
);
1293 // force a regen of the latin one table since it is affected by the script reordering
1294 coll
->latinOneRegenTable
= TRUE
;
1295 ucol_updateInternalState(coll
, status
);
1298 #endif /* #if !UCONFIG_NO_COLLATION */