2 *******************************************************************************
3 * Copyright (C) 2004-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucol_sit.cpp
8 * tab size: 8 (not used)
11 * Modification history
13 * 03/12/2004 weiv Creation
16 #include "unicode/ustring.h"
25 #if !UCONFIG_NO_COLLATION
28 UCOL_SIT_LANGUAGE
= 0,
37 UCOL_SIT_NUMERIC_COLLATION
,
38 UCOL_SIT_ALTERNATE_HANDLING
,
39 UCOL_SIT_NORMALIZATION_MODE
,
40 UCOL_SIT_FRENCH_COLLATION
,
41 UCOL_SIT_HIRAGANA_QUATERNARY
,
42 UCOL_SIT_VARIABLE_TOP
,
43 UCOL_SIT_VARIABLE_TOP_VALUE
,
47 /* list of locales for packing of a collator to an integer.
48 * This list corresponds to ICU 3.0. If more collation bearing
49 * locales are added in the future, this won't be a simple array
50 * but a mapping allowing forward and reverse lookup would have to
51 * be established. Currently, the mapping is from locale name to
54 static const char* const locales
[] = {
55 /* 00 - 09 */ "ar", "be", "bg", "ca", "cs", "da", "de", "de__PHONEBOOK", "el", "en",
56 /* 10 - 19 */ "en_BE", "eo", "es", "es__TRADITIONAL", "et", "fa", "fa_AF", "fi", "fo", "fr",
57 /* 20 - 29 */ "gu", "he", "hi", "hi__DIRECT", "hr", "hu", "is", "it", "ja", "kk",
58 /* 30 - 39 */ "kl", "kn", "ko", "lt", "lv", "mk", "mr", "mt", "nb", "nn",
59 /* 40 - 49 */ "om", "pa", "pl", "ps", "ro", "root", "ru", "sh", "sk", "sl",
60 /* 50 - 59 */ "sq", "sr", "sv", "ta", "te", "th", "tr", "uk", "vi", "zh",
61 /* 60 - 64 */ "zh_HK", "zh_MO", "zh_TW", "zh_TW_STROKE", "zh__PINYIN"
64 static const char* const keywords
[] = {
71 /* 06 */ "traditional"
/*
 * Option starter characters.
 * Every element of a collator short string begins with one of these
 * letters; the options[] parsing table is keyed on the same values.
 */
static const char alternateHArg     = 'A';  /* alternate handling          */
static const char variableTopValArg = 'B';  /* variable top, given by value */
static const char caseFirstArg      = 'C';  /* case first                  */
static const char numericCollArg    = 'D';  /* numeric collation           */
static const char caseLevelArg      = 'E';  /* case level                  */
static const char frenchCollArg     = 'F';  /* french collation            */
static const char hiraganaQArg      = 'H';  /* hiragana quaternary         */
static const char keywordArg        = 'K';  /* collation keyword           */
static const char languageArg       = 'L';  /* language                    */
static const char normArg           = 'N';  /* normalization mode          */
static const char regionArg         = 'R';  /* region                      */
static const char strengthArg       = 'S';  /* strength                    */
static const char variableTopArg    = 'T';  /* variable top, given as text */
static const char variantArg        = 'V';  /* variant                     */
static const char RFC3066Arg        = 'X';  /* rfc3066bis locale name      */
static const char scriptArg         = 'Z';  /* script                      */
/* Text placed between the base locale name and a collation keyword value. */
static const char collationKeyword[] = "@collation=";

/*
 * Buffer geometry shared by the short-string code:
 *   locElementCount    - number of locale element slots in CollatorSpec
 *   locElementCapacity - size of one locale element slot (and of the
 *                        variable top string buffer)
 *   loc3066Capacity    - size of the assembled locale name buffer
 *   internalBufferSize - size of the scratch buffers used by the API functions
 */
static const int32_t locElementCount    = 5;
static const int32_t locElementCapacity = 32;
static const int32_t loc3066Capacity    = 256;
static const int32_t internalBufferSize = 512;
100 /* structure containing specification of a collator. Initialized
101 * from a short string. Also used to construct a short string from a
104 struct CollatorSpec
{
105 char locElements
[locElementCount
][locElementCapacity
];
106 char locale
[loc3066Capacity
];
107 UColAttributeValue options
[UCOL_ATTRIBUTE_COUNT
];
108 uint32_t variableTopValue
;
109 UChar variableTopString
[locElementCapacity
];
110 int32_t variableTopStringLen
;
111 UBool variableTopSet
;
115 } entries
[UCOL_SIT_ITEMS_COUNT
];
119 /* structure for converting between character attribute
120 * representation and real collation attribute value.
122 struct AttributeConversion
{
124 UColAttributeValue value
;
127 static const AttributeConversion conversions
[12] = {
128 { '1', UCOL_PRIMARY
},
129 { '2', UCOL_SECONDARY
},
130 { '3', UCOL_TERTIARY
},
131 { '4', UCOL_QUATERNARY
},
132 { 'D', UCOL_DEFAULT
},
133 { 'I', UCOL_IDENTICAL
},
134 { 'L', UCOL_LOWER_FIRST
},
135 { 'N', UCOL_NON_IGNORABLE
},
137 { 'S', UCOL_SHIFTED
},
138 { 'U', UCOL_UPPER_FIRST
},
144 ucol_sit_attributeValueToLetter(UColAttributeValue value
, UErrorCode
*status
) {
146 for(i
= 0; i
< sizeof(conversions
)/sizeof(conversions
[0]); i
++) {
147 if(conversions
[i
].value
== value
) {
148 return conversions
[i
].letter
;
151 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
155 static UColAttributeValue
156 ucol_sit_letterToAttributeValue(char letter
, UErrorCode
*status
) {
158 for(i
= 0; i
< sizeof(conversions
)/sizeof(conversions
[0]); i
++) {
159 if(conversions
[i
].letter
== letter
) {
160 return conversions
[i
].value
;
163 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
167 /* function prototype for functions used to parse a short string */
169 typedef const char* U_CALLCONV
170 ActionFunction(CollatorSpec
*spec
, uint32_t value1
, const char* string
,
175 static const char* U_CALLCONV
176 _processLocaleElement(CollatorSpec
*spec
, uint32_t value
, const char* string
,
181 if(value
== 0 || value
== 4) {
182 spec
->locElements
[value
][len
++] = uprv_tolower(*string
);
184 spec
->locElements
[value
][len
++] = *string
;
186 } while(*(++string
) != '_' && *string
&& len
< locElementCapacity
);
187 if(len
>= locElementCapacity
) {
188 *status
= U_BUFFER_OVERFLOW_ERROR
;
191 // don't skip the underscore at the end
197 static const char* U_CALLCONV
198 _processRFC3066Locale(CollatorSpec
*spec
, uint32_t, const char* string
,
201 char terminator
= *string
;
203 const char *end
= uprv_strchr(string
+1, terminator
);
204 if(end
== NULL
|| end
- string
>= loc3066Capacity
) {
205 *status
= U_BUFFER_OVERFLOW_ERROR
;
208 uprv_strncpy(spec
->locale
, string
, end
-string
);
216 static const char* U_CALLCONV
217 _processCollatorOption(CollatorSpec
*spec
, uint32_t option
, const char* string
,
220 spec
->options
[option
] = ucol_sit_letterToAttributeValue(*string
, status
);
221 if((*(++string
) != '_' && *string
) || U_FAILURE(*status
)) {
222 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
230 readHexCodeUnit(const char **string
, UErrorCode
*status
)
235 int32_t noDigits
= 0;
236 while((c
= **string
) != 0 && noDigits
< 4) {
237 if( c
>= '0' && c
<= '9') {
239 } else if ( c
>= 'a' && c
<= 'f') {
240 value
= c
- 'a' + 10;
241 } else if ( c
>= 'A' && c
<= 'F') {
242 value
= c
- 'A' + 10;
244 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
247 result
= (result
<< 4) | (UChar
)value
;
251 // if the string was terminated before we read 4 digits, set an error
253 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
259 static const char* U_CALLCONV
260 _processVariableTop(CollatorSpec
*spec
, uint32_t value1
, const char* string
, UErrorCode
*status
)
265 while(U_SUCCESS(*status
) && i
< locElementCapacity
&& *string
!= 0 && *string
!= '_') {
266 spec
->variableTopString
[i
++] = readHexCodeUnit(&string
, status
);
268 spec
->variableTopStringLen
= i
;
269 if(i
== locElementCapacity
&& (*string
!= 0 || *string
!= '_')) {
270 *status
= U_BUFFER_OVERFLOW_ERROR
;
273 spec
->variableTopValue
= readHexCodeUnit(&string
, status
);
275 if(U_SUCCESS(*status
)) {
276 spec
->variableTopSet
= TRUE
;
283 /* Table for parsing short strings */
284 struct ShortStringOptions
{
286 ActionFunction
*action
;
290 static const ShortStringOptions options
[UCOL_SIT_ITEMS_COUNT
] =
292 /* 10 ALTERNATE_HANDLING */ {alternateHArg
, _processCollatorOption
, UCOL_ALTERNATE_HANDLING
}, // alternate N, S, D
293 /* 15 VARIABLE_TOP_VALUE */ {variableTopValArg
, _processVariableTop
, 1 },
294 /* 08 CASE_FIRST */ {caseFirstArg
, _processCollatorOption
, UCOL_CASE_FIRST
}, // case first L, U, X, D
295 /* 09 NUMERIC_COLLATION */ {numericCollArg
, _processCollatorOption
, UCOL_NUMERIC_COLLATION
}, // codan O, X, D
296 /* 07 CASE_LEVEL */ {caseLevelArg
, _processCollatorOption
, UCOL_CASE_LEVEL
}, // case level O, X, D
297 /* 12 FRENCH_COLLATION */ {frenchCollArg
, _processCollatorOption
, UCOL_FRENCH_COLLATION
}, // french O, X, D
298 /* 13 HIRAGANA_QUATERNARY] */ {hiraganaQArg
, _processCollatorOption
, UCOL_HIRAGANA_QUATERNARY_MODE
}, // hiragana O, X, D
299 /* 04 KEYWORD */ {keywordArg
, _processLocaleElement
, 4 }, // keyword
300 /* 00 LANGUAGE */ {languageArg
, _processLocaleElement
, 0 }, // language
301 /* 11 NORMALIZATION_MODE */ {normArg
, _processCollatorOption
, UCOL_NORMALIZATION_MODE
}, // norm O, X, D
302 /* 02 REGION */ {regionArg
, _processLocaleElement
, 2 }, // region
303 /* 06 STRENGTH */ {strengthArg
, _processCollatorOption
, UCOL_STRENGTH
}, // strength 1, 2, 3, 4, I, D
304 /* 14 VARIABLE_TOP */ {variableTopArg
, _processVariableTop
, 0 },
305 /* 03 VARIANT */ {variantArg
, _processLocaleElement
, 3 }, // variant
306 /* 05 RFC3066BIS */ {RFC3066Arg
, _processRFC3066Locale
, 0 }, // rfc3066bis locale name
307 /* 01 SCRIPT */ {scriptArg
, _processLocaleElement
, 1 } // script
312 const char* ucol_sit_readOption(const char *start
, CollatorSpec
*spec
,
317 for(i
= 0; i
< UCOL_SIT_ITEMS_COUNT
; i
++) {
318 if(*start
== options
[i
].optionStart
) {
319 spec
->entries
[i
].start
= start
;
320 const char* end
= options
[i
].action(spec
, options
[i
].attr
, start
+1, status
);
321 spec
->entries
[i
].len
= end
- start
;
325 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
330 void ucol_sit_initCollatorSpecs(CollatorSpec
*spec
)
333 uprv_memset(spec
, 0, sizeof(CollatorSpec
));
334 // set collation options to default
336 for(i
= 0; i
< UCOL_ATTRIBUTE_COUNT
; i
++) {
337 spec
->options
[i
] = UCOL_DEFAULT
;
342 ucol_sit_readSpecs(CollatorSpec
*s
, const char *string
,
343 UParseError
*parseError
, UErrorCode
*status
)
345 const char *definition
= string
;
346 while(U_SUCCESS(*status
) && *string
) {
347 string
= ucol_sit_readOption(string
, s
, status
);
349 while(*string
&& *string
== '_') {
353 if(U_FAILURE(*status
)) {
354 parseError
->offset
= string
- definition
;
360 int32_t ucol_sit_dumpSpecs(CollatorSpec
*s
, char *destination
, int32_t capacity
, UErrorCode
*status
)
362 int32_t i
= 0, j
= 0;
365 if(U_SUCCESS(*status
)) {
366 for(i
= 0; i
< UCOL_SIT_ITEMS_COUNT
; i
++) {
367 if(s
->entries
[i
].start
) {
370 uprv_strcat(destination
, "_");
374 optName
= *(s
->entries
[i
].start
);
375 if(optName
== languageArg
|| optName
== regionArg
|| optName
== variantArg
|| optName
== keywordArg
) {
376 for(j
= 0; j
< s
->entries
[i
].len
; j
++) {
377 if(len
+ j
< capacity
) {
378 destination
[len
+j
] = uprv_toupper(*(s
->entries
[i
].start
+j
));
381 len
+= s
->entries
[i
].len
;
383 len
+= s
->entries
[i
].len
;
385 uprv_strncat(destination
,s
->entries
[i
].start
, s
->entries
[i
].len
);
397 ucol_sit_calculateWholeLocale(CollatorSpec
*s
) {
398 // put the locale together, unless we have a done
400 if(s
->locale
[0] == 0) {
401 // first the language
402 uprv_strcat(s
->locale
, s
->locElements
[0]);
403 // then the script, if present
404 if(*(s
->locElements
[1])) {
405 uprv_strcat(s
->locale
, "_");
406 uprv_strcat(s
->locale
, s
->locElements
[1]);
408 // then the region, if present
409 if(*(s
->locElements
[2])) {
410 uprv_strcat(s
->locale
, "_");
411 uprv_strcat(s
->locale
, s
->locElements
[2]);
412 } else if(*(s
->locElements
[3])) { // if there is a variant, we need an underscore
413 uprv_strcat(s
->locale
, "_");
415 // add variant, if there
416 if(*(s
->locElements
[3])) {
417 uprv_strcat(s
->locale
, "_");
418 uprv_strcat(s
->locale
, s
->locElements
[3]);
421 // if there is a collation keyword, add that too
422 if(*(s
->locElements
[4])) {
423 uprv_strcat(s
->locale
, collationKeyword
);
424 uprv_strcat(s
->locale
, s
->locElements
[4]);
430 U_CAPI
void U_EXPORT2
431 ucol_prepareShortStringOpen( const char *definition
,
433 UParseError
*parseError
,
436 if(U_FAILURE(*status
)) return;
438 UParseError internalParseError
;
441 parseError
= &internalParseError
;
443 parseError
->line
= 0;
444 parseError
->offset
= 0;
445 parseError
->preContext
[0] = 0;
446 parseError
->postContext
[0] = 0;
449 // first we want to pick stuff out of short string.
450 // we'll end up with an UCA version, locale and a bunch of
453 // analyse the string in order to get everything we need.
454 const char *string
= definition
;
456 ucol_sit_initCollatorSpecs(&s
);
457 string
= ucol_sit_readSpecs(&s
, definition
, parseError
, status
);
458 ucol_sit_calculateWholeLocale(&s
);
460 char buffer
[internalBufferSize
];
461 uprv_memset(buffer
, 0, internalBufferSize
);
462 uloc_canonicalize(s
.locale
, buffer
, internalBufferSize
, status
);
464 UResourceBundle
*b
= ures_open(U_ICUDATA_COLL
, buffer
, status
);
465 /* we try to find stuff from keyword */
466 UResourceBundle
*collations
= ures_getByKey(b
, "collations", NULL
, status
);
467 UResourceBundle
*collElem
= NULL
;
469 // if there is a keyword, we pick it up and try to get elements
470 if(!uloc_getKeywordValue(buffer
, "collation", keyBuffer
, 256, status
)) {
471 // no keyword. we try to find the default setting, which will give us the keyword value
472 UResourceBundle
*defaultColl
= ures_getByKeyWithFallback(collations
, "default", NULL
, status
);
473 if(U_SUCCESS(*status
)) {
474 int32_t defaultKeyLen
= 0;
475 const UChar
*defaultKey
= ures_getString(defaultColl
, &defaultKeyLen
, status
);
476 u_UCharsToChars(defaultKey
, keyBuffer
, defaultKeyLen
);
477 keyBuffer
[defaultKeyLen
] = 0;
479 *status
= U_INTERNAL_PROGRAM_ERROR
;
482 ures_close(defaultColl
);
484 collElem
= ures_getByKeyWithFallback(collations
, keyBuffer
, collElem
, status
);
485 ures_close(collElem
);
486 ures_close(collations
);
491 U_CAPI UCollator
* U_EXPORT2
492 ucol_openFromShortString( const char *definition
,
494 UParseError
*parseError
,
497 UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN_FROM_SHORT_STRING
);
498 UTRACE_DATA1(UTRACE_INFO
, "short string = \"%s\"", definition
);
500 if(U_FAILURE(*status
)) return 0;
502 UParseError internalParseError
;
505 parseError
= &internalParseError
;
507 parseError
->line
= 0;
508 parseError
->offset
= 0;
509 parseError
->preContext
[0] = 0;
510 parseError
->postContext
[0] = 0;
513 // first we want to pick stuff out of short string.
514 // we'll end up with an UCA version, locale and a bunch of
517 // analyse the string in order to get everything we need.
518 const char *string
= definition
;
520 ucol_sit_initCollatorSpecs(&s
);
521 string
= ucol_sit_readSpecs(&s
, definition
, parseError
, status
);
522 ucol_sit_calculateWholeLocale(&s
);
524 char buffer
[internalBufferSize
];
525 uprv_memset(buffer
, 0, internalBufferSize
);
526 uloc_canonicalize(s
.locale
, buffer
, internalBufferSize
, status
);
528 UCollator
*result
= ucol_open(buffer
, status
);
531 for(i
= 0; i
< UCOL_ATTRIBUTE_COUNT
; i
++) {
532 if(s
.options
[i
] != UCOL_DEFAULT
) {
533 if(forceDefaults
|| ucol_getAttribute(result
, (UColAttribute
)i
, status
) != s
.options
[i
]) {
534 ucol_setAttribute(result
, (UColAttribute
)i
, s
.options
[i
], status
);
537 if(U_FAILURE(*status
)) {
538 parseError
->offset
= string
- definition
;
545 if(s
.variableTopSet
) {
546 if(s
.variableTopString
[0]) {
547 ucol_setVariableTop(result
, s
.variableTopString
, s
.variableTopStringLen
, status
);
548 } else { // we set by value, using 'B'
549 ucol_restoreVariableTop(result
, s
.variableTopValue
, status
);
554 if(U_FAILURE(*status
)) { // here it can only be a bogus value
559 UTRACE_EXIT_PTR_STATUS(result
, *status
);
564 static void appendShortStringElement(const char *src
, int32_t len
, char *result
, int32_t *resultSize
, int32_t capacity
, char arg
)
568 if(*resultSize
< capacity
) {
569 uprv_strcat(result
, "_");
573 *resultSize
+= len
+ 1;
574 if(*resultSize
< capacity
) {
575 uprv_strncat(result
, &arg
, 1);
576 uprv_strncat(result
, src
, len
);
581 U_CAPI
int32_t U_EXPORT2
582 ucol_getShortDefinitionString(const UCollator
*coll
,
588 if(U_FAILURE(*status
)) return 0;
589 char buffer
[internalBufferSize
];
590 uprv_memset(buffer
, 0, internalBufferSize
*sizeof(char));
591 int32_t resultSize
= 0;
592 char tempbuff
[internalBufferSize
];
593 char locBuff
[internalBufferSize
];
594 uprv_memset(buffer
, 0, internalBufferSize
*sizeof(char));
595 int32_t elementSize
= 0;
596 UBool isAvailable
= 0;
598 ucol_sit_initCollatorSpecs(&s
);
601 locale
= ucol_getLocale(coll
, ULOC_VALID_LOCALE
, status
);
603 elementSize
= ucol_getFunctionalEquivalent(locBuff
, internalBufferSize
, "collation", locale
, &isAvailable
, status
);
606 // we should probably canonicalize here...
607 elementSize
= uloc_getLanguage(locBuff
, tempbuff
, internalBufferSize
, status
);
608 appendShortStringElement(tempbuff
, elementSize
, buffer
, &resultSize
, capacity
, languageArg
);
609 elementSize
= uloc_getCountry(locBuff
, tempbuff
, internalBufferSize
, status
);
610 appendShortStringElement(tempbuff
, elementSize
, buffer
, &resultSize
, capacity
, regionArg
);
611 elementSize
= uloc_getScript(locBuff
, tempbuff
, internalBufferSize
, status
);
612 appendShortStringElement(tempbuff
, elementSize
, buffer
, &resultSize
, capacity
, scriptArg
);
613 elementSize
= uloc_getVariant(locBuff
, tempbuff
, internalBufferSize
, status
);
614 appendShortStringElement(tempbuff
, elementSize
, buffer
, &resultSize
, capacity
, variantArg
);
615 elementSize
= uloc_getKeywordValue(locBuff
, "collation", tempbuff
, internalBufferSize
, status
);
616 appendShortStringElement(tempbuff
, elementSize
, buffer
, &resultSize
, capacity
, keywordArg
);
620 UColAttributeValue attribute
= UCOL_DEFAULT
;
621 for(i
= 0; i
< UCOL_SIT_ITEMS_COUNT
; i
++) {
622 if(options
[i
].action
== _processCollatorOption
) {
623 attribute
= ucol_getAttributeOrDefault(coll
, (UColAttribute
)options
[i
].attr
, status
);
624 if(attribute
!= UCOL_DEFAULT
) {
625 char letter
= ucol_sit_attributeValueToLetter(attribute
, status
);
626 appendShortStringElement(&letter
, 1,
627 buffer
, &resultSize
, capacity
, options
[i
].optionStart
);
631 if(coll
->variableTopValueisDefault
== FALSE
) {
632 //s.variableTopValue = ucol_getVariableTop(coll, status);
633 elementSize
= T_CString_integerToString(tempbuff
, coll
->variableTopValue
, 16);
634 appendShortStringElement(tempbuff
, elementSize
, buffer
, &resultSize
, capacity
, variableTopValArg
);
637 UParseError parseError
;
638 return ucol_normalizeShortDefinitionString(buffer
, dst
, capacity
, &parseError
, status
);
641 U_CAPI
int32_t U_EXPORT2
642 ucol_normalizeShortDefinitionString(const char *definition
,
645 UParseError
*parseError
,
649 if(U_FAILURE(*status
)) {
654 uprv_memset(destination
, 0, capacity
*sizeof(char));
664 ucol_sit_initCollatorSpecs(&s
);
665 ucol_sit_readSpecs(&s
, definition
, parseError
, status
);
666 return ucol_sit_dumpSpecs(&s
, destination
, capacity
, status
);
669 // structure for packing the bits of the attributes in the
670 // identifier number.
671 // locale is packed separately
676 UColAttribute attribute
;
677 UColAttributeValue values
[6];
680 static const bitPacking attributesToBits
[UCOL_ATTRIBUTE_COUNT
] = {
681 /* french */ { frenchCollArg
, 29, 2, UCOL_FRENCH_COLLATION
, { UCOL_DEFAULT
, UCOL_OFF
, UCOL_ON
}},
682 /* alternate */ { alternateHArg
, 27, 2, UCOL_ALTERNATE_HANDLING
, { UCOL_DEFAULT
, UCOL_NON_IGNORABLE
, UCOL_SHIFTED
}},
683 /* case first */ { caseFirstArg
, 25, 2, UCOL_CASE_FIRST
, { UCOL_DEFAULT
, UCOL_OFF
, UCOL_LOWER_FIRST
, UCOL_UPPER_FIRST
}},
684 /* case level */ { caseLevelArg
, 23, 2, UCOL_CASE_LEVEL
, { UCOL_DEFAULT
, UCOL_OFF
, UCOL_ON
}},
685 /* normalization */ { normArg
, 21, 2, UCOL_NORMALIZATION_MODE
, { UCOL_DEFAULT
, UCOL_OFF
, UCOL_ON
}},
686 /* strength */ { strengthArg
, 18, 3, UCOL_STRENGTH
, { UCOL_DEFAULT
, UCOL_PRIMARY
, UCOL_SECONDARY
, UCOL_TERTIARY
, UCOL_QUATERNARY
, UCOL_IDENTICAL
}},
687 /* hiragana */ { hiraganaQArg
, 16, 2, UCOL_HIRAGANA_QUATERNARY_MODE
, { UCOL_DEFAULT
, UCOL_OFF
, UCOL_ON
}},
688 /* numeric coll */ { numericCollArg
, 14, 2, UCOL_NUMERIC_COLLATION
, { UCOL_DEFAULT
, UCOL_OFF
, UCOL_ON
}}
/*
 * Position of the locale information inside a collator identifier.
 * An index into keywords[] occupies keywordWidth bits starting at bit
 * keywordShift; an index into locales[] occupies localeWidth bits starting
 * at bit localeShift.
 */
static const uint32_t keywordShift = 9;
static const uint32_t keywordWidth = 5;

static const uint32_t localeShift  = 0;
static const uint32_t localeWidth  = 7;
697 static uint32_t ucol_sit_putLocaleInIdentifier(uint32_t result
, const char* locale
, UErrorCode
* status
) {
698 char buffer
[internalBufferSize
], keywordBuffer
[internalBufferSize
],
699 baseName
[internalBufferSize
], localeBuffer
[internalBufferSize
];
700 int32_t len
= 0, keywordLen
= 0,
701 baseNameLen
= 0, localeLen
= 0;
703 UBool isAvailable
= FALSE
;
705 len
= uloc_canonicalize(locale
, buffer
, internalBufferSize
, status
);
706 localeLen
= ucol_getFunctionalEquivalent(localeBuffer
, internalBufferSize
, "collation", buffer
, &isAvailable
, status
);
707 keywordLen
= uloc_getKeywordValue(buffer
, "collation", keywordBuffer
, internalBufferSize
, status
);
708 baseNameLen
= uloc_getBaseName(buffer
, baseName
, internalBufferSize
, status
);
710 /*Binary search for the map entry for normal cases */
713 uint32_t high
= sizeof(locales
)/sizeof(locales
[0]);
719 while (high
> low
) /*binary search*/{
721 mid
= (high
+low
) >> 1; /*Finds median*/
724 return UCOL_SIT_COLLATOR_NOT_ENCODABLE
; // we didn't find it
726 compVal
= uprv_strcmp(baseName
, locales
[mid
]);
730 else if (compVal
> 0){
733 else /*we found it*/{
739 result
|= (mid
& ((1 << localeWidth
) - 1)) << localeShift
;
743 for(i
= 1; i
< sizeof(keywords
)/sizeof(keywords
[0]); i
++) {
744 if(uprv_strcmp(keywords
[i
], keywordBuffer
) == 0) {
745 result
|= (i
& ((1 << keywordWidth
) - 1)) << keywordShift
;
753 U_CAPI
uint32_t U_EXPORT2
754 ucol_collatorToIdentifier(const UCollator
*coll
,
759 uint32_t i
= 0, j
= 0;
760 UColAttributeValue attrValue
= UCOL_DEFAULT
;
762 // if variable top is not default, we need to use strings
763 if(coll
->variableTopValueisDefault
!= TRUE
) {
764 return UCOL_SIT_COLLATOR_NOT_ENCODABLE
;
768 locale
= ucol_getLocale(coll
, ULOC_VALID_LOCALE
, status
);
771 result
= ucol_sit_putLocaleInIdentifier(result
, locale
, status
);
773 for(i
= 0; i
< sizeof(attributesToBits
)/sizeof(attributesToBits
[0]); i
++) {
774 attrValue
= ucol_getAttributeOrDefault(coll
, attributesToBits
[i
].attribute
, status
);
776 while(attributesToBits
[i
].values
[j
] != attrValue
) {
779 result
|= (j
& ((1 << attributesToBits
[i
].width
) - 1)) << attributesToBits
[i
].offset
;
785 U_CAPI UCollator
* U_EXPORT2
786 ucol_openFromIdentifier(uint32_t identifier
,
791 int32_t value
= 0, keyword
= 0;
792 char locale
[internalBufferSize
];
794 value
= (identifier
>> localeShift
) & ((1 << localeWidth
) - 1);
795 keyword
= (identifier
>> keywordShift
) & ((1 << keywordWidth
) - 1);
797 uprv_strcpy(locale
, locales
[value
]);
800 uprv_strcat(locale
, collationKeyword
);
801 uprv_strcat(locale
, keywords
[keyword
]);
804 UColAttributeValue attrValue
= UCOL_DEFAULT
;
806 UCollator
*result
= ucol_open(locale
, status
);
808 // variable top is not set in the identifier, so we can easily skip that on
810 for(i
= 0; i
< sizeof(attributesToBits
)/sizeof(attributesToBits
[0]); i
++) {
811 value
= (identifier
>> attributesToBits
[i
].offset
) & ((1 << attributesToBits
[i
].width
) - 1);
812 attrValue
= attributesToBits
[i
].values
[value
];
813 // the collator is all default, so we will set only the values that will differ from
814 // the default values.
815 if(attrValue
!= UCOL_DEFAULT
) {
817 ucol_getAttribute(result
, attributesToBits
[i
].attribute
, status
) != attrValue
) {
818 ucol_setAttribute(result
, attributesToBits
[i
].attribute
, attrValue
, status
);
826 U_CAPI
int32_t U_EXPORT2
827 ucol_identifierToShortString(uint32_t identifier
,
833 int32_t locIndex
= (identifier
>> localeShift
) & ((1 << localeWidth
) - 1);
834 int32_t keywordIndex
= (identifier
>> keywordShift
) & ((1 << keywordWidth
) - 1);
836 ucol_sit_initCollatorSpecs(&s
);
837 uprv_strcpy(s
.locale
, locales
[locIndex
]);
839 uprv_strcat(s
.locale
, collationKeyword
);
840 uprv_strcat(s
.locale
, keywords
[keywordIndex
]);
842 UCollator
*coll
= ucol_openFromIdentifier(identifier
, forceDefaults
, status
);
843 int32_t resultLen
= ucol_getShortDefinitionString(coll
, s
.locale
, buffer
, capacity
, status
);
848 // TODO: Crumy, crumy, crumy... Very hard to currently go algorithmically from
849 // identifier to short string. Do rethink
850 if(forceDefaults
== FALSE
) {
851 UCollator
*coll
= ucol_openFromIdentifier(identifier
, FALSE
, status
);
852 int32_t resultLen
= ucol_getShortDefinitionString(coll
, s
.locale
, buffer
, capacity
, status
);
855 } else { // forceDefaults == TRUE
857 UColAttributeValue value
;
859 for(i
= 0; i
< sizeof(attributesToBits
)/sizeof(attributesToBits
[0]); i
++) {
860 value
= attributesToBits
[i
].values
[(identifier
>> attributesToBits
[i
].offset
) & ((1 << attributesToBits
[i
].width
) - 1)];
861 if(value
!= UCOL_DEFAULT
) {
862 uprv_strcat(buffer
, "_");
863 uprv_strncat(buffer
, &attributesToBits
[i
].letter
, 1);
864 letter
= ucol_sit_attributeValueToLetter(value
, status
);
865 uprv_strncat(buffer
, &letter
, 1);
868 return ucol_sit_dumpSpecs(&s
, buffer
, capacity
, status
);
873 U_CAPI
uint32_t U_EXPORT2
874 ucol_shortStringToIdentifier(const char *definition
,
878 UParseError parseError
;
881 uint32_t i
= 0, j
= 0;
882 ucol_sit_initCollatorSpecs(&s
);
884 ucol_sit_readSpecs(&s
, definition
, &parseError
, status
);
885 ucol_sit_calculateWholeLocale(&s
);
887 char locBuffer
[internalBufferSize
];
888 UBool isAvailable
= FALSE
;
889 UColAttributeValue attrValue
= UCOL_DEFAULT
;
891 ucol_getFunctionalEquivalent(locBuffer
, internalBufferSize
, "collation", s
.locale
, &isAvailable
, status
);
893 if(forceDefaults
== FALSE
) {
894 UCollator
*coll
= ucol_openFromShortString(definition
, FALSE
, &parseError
, status
);
895 result
= ucol_collatorToIdentifier(coll
, locBuffer
, status
);
897 } else { // forceDefaults == TRUE
898 result
= ucol_sit_putLocaleInIdentifier(result
, locBuffer
, status
);
900 for(i
= 0; i
< sizeof(attributesToBits
)/sizeof(attributesToBits
[0]); i
++) {
901 attrValue
= s
.options
[i
];
903 while(attributesToBits
[i
].values
[j
] != attrValue
) {
906 result
|= (j
& ((1 << attributesToBits
[i
].width
) - 1)) << attributesToBits
[i
].offset
;
914 U_CAPI UColAttributeValue U_EXPORT2
915 ucol_getAttributeOrDefault(const UCollator
*coll
, UColAttribute attr
, UErrorCode
*status
)
917 if(U_FAILURE(*status
) || coll
== NULL
) {
921 case UCOL_NUMERIC_COLLATION
:
922 return coll
->numericCollationisDefault
?UCOL_DEFAULT
:coll
->numericCollation
;
923 case UCOL_HIRAGANA_QUATERNARY_MODE
:
924 return coll
->hiraganaQisDefault
?UCOL_DEFAULT
:coll
->hiraganaQ
;
925 case UCOL_FRENCH_COLLATION
: /* attribute for direction of secondary weights*/
926 return coll
->frenchCollationisDefault
?UCOL_DEFAULT
:coll
->frenchCollation
;
927 case UCOL_ALTERNATE_HANDLING
: /* attribute for handling variable elements*/
928 return coll
->alternateHandlingisDefault
?UCOL_DEFAULT
:coll
->alternateHandling
;
929 case UCOL_CASE_FIRST
: /* who goes first, lower case or uppercase */
930 return coll
->caseFirstisDefault
?UCOL_DEFAULT
:coll
->caseFirst
;
931 case UCOL_CASE_LEVEL
: /* do we have an extra case level */
932 return coll
->caseLevelisDefault
?UCOL_DEFAULT
:coll
->caseLevel
;
933 case UCOL_NORMALIZATION_MODE
: /* attribute for normalization */
934 return coll
->normalizationModeisDefault
?UCOL_DEFAULT
:coll
->normalizationMode
;
935 case UCOL_STRENGTH
: /* attribute for strength */
936 return coll
->strengthisDefault
?UCOL_DEFAULT
:coll
->strength
;
937 case UCOL_ATTRIBUTE_COUNT
:
939 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
947 const UCollator
*coll
;
950 USet
*removedContractions
;
958 addSpecial(contContext
*context
, UChar
*buffer
, int32_t bufLen
,
959 uint32_t CE
, int32_t leftIndex
, int32_t rightIndex
, UErrorCode
*status
)
961 const UCollator
*coll
= context
->coll
;
962 USet
*contractions
= context
->conts
;
963 USet
*expansions
= context
->expansions
;
964 UBool addPrefixes
= context
->addPrefixes
;
966 const UChar
*UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
967 uint32_t newCE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
968 // we might have a contraction that ends from previous level
969 if(newCE
!= UCOL_NOT_FOUND
) {
970 if(isSpecial(CE
) && getCETag(CE
) == CONTRACTION_TAG
&& isSpecial(newCE
) && getCETag(newCE
) == SPEC_PROC_TAG
&& addPrefixes
) {
971 addSpecial(context
, buffer
, bufLen
, newCE
, leftIndex
, rightIndex
, status
);
973 if(contractions
&& rightIndex
-leftIndex
> 1) {
974 uset_addString(contractions
, buffer
+leftIndex
, rightIndex
-leftIndex
);
975 if(expansions
&& isSpecial(CE
) && getCETag(CE
) == EXPANSION_TAG
) {
976 uset_addString(expansions
, buffer
+leftIndex
, rightIndex
-leftIndex
);
982 // check whether we're doing contraction or prefix
983 if(getCETag(CE
) == SPEC_PROC_TAG
&& addPrefixes
) {
985 *status
= U_INTERNAL_PROGRAM_ERROR
;
989 while(*UCharOffset
!= 0xFFFF) {
990 newCE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
991 buffer
[leftIndex
] = *UCharOffset
;
992 if(isSpecial(newCE
) && (getCETag(newCE
) == CONTRACTION_TAG
|| getCETag(newCE
) == SPEC_PROC_TAG
)) {
993 addSpecial(context
, buffer
, bufLen
, newCE
, leftIndex
, rightIndex
, status
);
996 uset_addString(contractions
, buffer
+leftIndex
, rightIndex
-leftIndex
);
998 if(expansions
&& isSpecial(newCE
) && getCETag(newCE
) == EXPANSION_TAG
) {
999 uset_addString(expansions
, buffer
+leftIndex
, rightIndex
-leftIndex
);
1004 } else if(getCETag(CE
) == CONTRACTION_TAG
) {
1005 if(rightIndex
== bufLen
-1) {
1006 *status
= U_INTERNAL_PROGRAM_ERROR
;
1009 while(*UCharOffset
!= 0xFFFF) {
1010 newCE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
1011 buffer
[rightIndex
] = *UCharOffset
;
1012 if(isSpecial(newCE
) && (getCETag(newCE
) == CONTRACTION_TAG
|| getCETag(newCE
) == SPEC_PROC_TAG
)) {
1013 addSpecial(context
, buffer
, bufLen
, newCE
, leftIndex
, rightIndex
+1, status
);
1016 uset_addString(contractions
, buffer
+leftIndex
, rightIndex
+1-leftIndex
);
1018 if(expansions
&& isSpecial(newCE
) && getCETag(newCE
) == EXPANSION_TAG
) {
1019 uset_addString(expansions
, buffer
+leftIndex
, rightIndex
+1-leftIndex
);
1029 static UBool U_CALLCONV
1030 _processSpecials(const void *context
, UChar32 start
, UChar32 limit
, uint32_t CE
)
1032 UErrorCode
*status
= ((contContext
*)context
)->status
;
1033 USet
*expansions
= ((contContext
*)context
)->expansions
;
1034 USet
*removed
= ((contContext
*)context
)->removedContractions
;
1035 UBool addPrefixes
= ((contContext
*)context
)->addPrefixes
;
1036 UChar contraction
[internalBufferSize
];
1038 if(((getCETag(CE
) == SPEC_PROC_TAG
&& addPrefixes
) || getCETag(CE
) == CONTRACTION_TAG
)) {
1039 while(start
< limit
&& U_SUCCESS(*status
)) {
1040 // if there are suppressed contractions, we don't
1041 // want to add them.
1042 if(removed
&& uset_contains(removed
, start
)) {
1046 // we start our contraction from middle, since we don't know if it
1047 // will grow toward right or left
1048 contraction
[internalBufferSize
/2] = (UChar
)start
;
1049 addSpecial(((contContext
*)context
), contraction
, internalBufferSize
, CE
, internalBufferSize
/2, internalBufferSize
/2+1, status
);
1052 } else if(expansions
&& getCETag(CE
) == EXPANSION_TAG
) {
1053 while(start
< limit
&& U_SUCCESS(*status
)) {
1054 uset_add(expansions
, start
++);
1058 if(U_FAILURE(*status
)) {
1070 * Get a set containing the contractions defined by the collator. The set includes
1071 * both the UCA contractions and the contractions defined by the collator
1072 * @param coll collator
1073 * @param conts the set to hold the result
1074 * @param status to hold the error code
1075 * @return the size of the contraction set
1079 U_CAPI
int32_t U_EXPORT2
1080 ucol_getContractions( const UCollator
*coll
,
1084 ucol_getContractionsAndExpansions(coll
, contractions
, NULL
, FALSE
, status
);
1085 return uset_getItemCount(contractions
);
1089 * Get a set containing the expansions defined by the collator. The set includes
1090 * both the UCA expansions and the expansions defined by the tailoring
1091 * @param coll collator
1092 * @param conts the set to hold the result
1093 * @param addPrefixes add the prefix contextual elements to contractions
1094 * @param status to hold the error code
1098 U_CAPI
void U_EXPORT2
1099 ucol_getContractionsAndExpansions( const UCollator
*coll
,
1105 if(U_FAILURE(*status
)) {
1109 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1114 uset_clear(contractions
);
1117 uset_clear(expansions
);
1119 int32_t rulesLen
= 0;
1120 const UChar
* rules
= ucol_getRules(coll
, &rulesLen
);
1121 UColTokenParser src
;
1122 ucol_tok_initTokenList(&src
, rules
, rulesLen
, coll
->UCA
, status
);
1124 contContext c
= { NULL
, contractions
, expansions
, src
.removeSet
, addPrefixes
, status
};
1126 // Add the UCA contractions
1128 utrie_enum(&coll
->UCA
->mapping
, NULL
, _processSpecials
, &c
);
1130 // This is collator specific. Add contractions from a collator
1132 c
.removedContractions
= NULL
;
1133 utrie_enum(&coll
->mapping
, NULL
, _processSpecials
, &c
);
1134 ucol_tok_closeTokenList(&src
);
1137 U_CAPI
int32_t U_EXPORT2
1138 ucol_getUnsafeSet( const UCollator
*coll
,
1142 UChar buffer
[internalBufferSize
];
1147 // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
1148 static const UChar cccpattern
[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
1149 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };
1151 // add chars that fail the fcd check
1152 uset_applyPattern(unsafe
, cccpattern
, 24, USET_IGNORE_SPACE
, status
);
1154 // add Thai/Lao prevowels
1155 uset_addRange(unsafe
, 0xe40, 0xe44);
1156 uset_addRange(unsafe
, 0xec0, 0xec4);
1157 // add lead/trail surrogates
1158 uset_addRange(unsafe
, 0xd800, 0xdfff);
1160 USet
*contractions
= uset_open(0,0);
1162 int32_t i
= 0, j
= 0;
1163 int32_t contsSize
= ucol_getContractions(coll
, contractions
, status
);
1165 // Contraction set consists only of strings
1166 // to get unsafe code points, we need to
1167 // break the strings apart and add them to the unsafe set
1168 for(i
= 0; i
< contsSize
; i
++) {
1169 len
= uset_getItem(contractions
, i
, NULL
, NULL
, buffer
, internalBufferSize
, status
);
1173 U16_NEXT(buffer
, j
, len
, c
);
1175 uset_add(unsafe
, c
);
1181 uset_close(contractions
);
1183 return uset_size(unsafe
);