2 *******************************************************************************
3 * Copyright (C) 2004-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucol_sit.cpp
8 * tab size: 8 (not used)
11 * Modification history
13 * 03/12/2004 weiv Creation
16 #include "unicode/ustring.h"
17 #include "unicode/udata.h"
25 #include "unicode/coll.h"
31 #if !UCONFIG_NO_COLLATION
34 UCOL_SIT_LANGUAGE
= 0,
39 UCOL_SIT_PROVIDER
= 5,
40 UCOL_SIT_LOCELEMENT_MAX
= UCOL_SIT_PROVIDER
, /* the last element that's part of LocElements */
46 UCOL_SIT_NUMERIC_COLLATION
,
47 UCOL_SIT_ALTERNATE_HANDLING
,
48 UCOL_SIT_NORMALIZATION_MODE
,
49 UCOL_SIT_FRENCH_COLLATION
,
50 UCOL_SIT_HIRAGANA_QUATERNARY
,
51 UCOL_SIT_VARIABLE_TOP
,
52 UCOL_SIT_VARIABLE_TOP_VALUE
,
56 /* option starters chars. */
57 static const char alternateHArg
= 'A';
58 static const char variableTopValArg
= 'B';
59 static const char caseFirstArg
= 'C';
60 static const char numericCollArg
= 'D';
61 static const char caseLevelArg
= 'E';
62 static const char frenchCollArg
= 'F';
63 static const char hiraganaQArg
= 'H';
64 static const char keywordArg
= 'K';
65 static const char languageArg
= 'L';
66 static const char normArg
= 'N';
67 static const char providerArg
= 'P';
68 static const char regionArg
= 'R';
69 static const char strengthArg
= 'S';
70 static const char variableTopArg
= 'T';
71 static const char variantArg
= 'V';
72 static const char RFC3066Arg
= 'X';
73 static const char scriptArg
= 'Z';
75 static const char collationKeyword
[] = "@collation=";
76 static const char providerKeyword
[] = "@sp=";
79 static const int32_t locElementCount
= UCOL_SIT_LOCELEMENT_MAX
+1;
80 static const int32_t locElementCapacity
= 32;
81 static const int32_t loc3066Capacity
= 256;
82 static const int32_t locProviderCapacity
= 10;
83 static const int32_t internalBufferSize
= 512;
85 /* structure containing specification of a collator. Initialized
86 * from a short string. Also used to construct a short string from a
90 char locElements
[locElementCount
][locElementCapacity
];
91 char locale
[loc3066Capacity
];
92 char provider
[locProviderCapacity
];
93 UColAttributeValue options
[UCOL_ATTRIBUTE_COUNT
];
94 uint32_t variableTopValue
;
95 UChar variableTopString
[locElementCapacity
];
96 int32_t variableTopStringLen
;
101 } entries
[UCOL_SIT_ITEMS_COUNT
];
105 /* structure for converting between character attribute
106 * representation and real collation attribute value.
108 struct AttributeConversion
{
110 UColAttributeValue value
;
113 static const AttributeConversion conversions
[12] = {
114 { '1', UCOL_PRIMARY
},
115 { '2', UCOL_SECONDARY
},
116 { '3', UCOL_TERTIARY
},
117 { '4', UCOL_QUATERNARY
},
118 { 'D', UCOL_DEFAULT
},
119 { 'I', UCOL_IDENTICAL
},
120 { 'L', UCOL_LOWER_FIRST
},
121 { 'N', UCOL_NON_IGNORABLE
},
123 { 'S', UCOL_SHIFTED
},
124 { 'U', UCOL_UPPER_FIRST
},
130 ucol_sit_attributeValueToLetter(UColAttributeValue value
, UErrorCode
*status
) {
132 for(i
= 0; i
< sizeof(conversions
)/sizeof(conversions
[0]); i
++) {
133 if(conversions
[i
].value
== value
) {
134 return conversions
[i
].letter
;
137 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
138 #ifdef UCOL_TRACE_SIT
139 fprintf(stderr
, "%s:%d: unknown UColAttributeValue %d: %s\n", __FILE__
, __LINE__
, value
, u_errorName(*status
));
144 static UColAttributeValue
145 ucol_sit_letterToAttributeValue(char letter
, UErrorCode
*status
) {
147 for(i
= 0; i
< sizeof(conversions
)/sizeof(conversions
[0]); i
++) {
148 if(conversions
[i
].letter
== letter
) {
149 return conversions
[i
].value
;
152 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
153 #ifdef UCOL_TRACE_SIT
154 fprintf(stderr
, "%s:%d: unknown letter %c: %s\n", __FILE__
, __LINE__
, letter
, u_errorName(*status
));
159 /* function prototype for functions used to parse a short string */
161 typedef const char* U_CALLCONV
162 ActionFunction(CollatorSpec
*spec
, uint32_t value1
, const char* string
,
167 static const char* U_CALLCONV
168 _processLocaleElement(CollatorSpec
*spec
, uint32_t value
, const char* string
,
173 if(value
== UCOL_SIT_LANGUAGE
|| value
== UCOL_SIT_KEYWORD
|| value
== UCOL_SIT_PROVIDER
) {
174 spec
->locElements
[value
][len
++] = uprv_tolower(*string
);
176 spec
->locElements
[value
][len
++] = *string
;
178 } while(*(++string
) != '_' && *string
&& len
< locElementCapacity
);
179 if(len
>= locElementCapacity
) {
180 *status
= U_BUFFER_OVERFLOW_ERROR
;
183 // don't skip the underscore at the end
189 static const char* U_CALLCONV
190 _processRFC3066Locale(CollatorSpec
*spec
, uint32_t, const char* string
,
193 char terminator
= *string
;
195 const char *end
= uprv_strchr(string
+1, terminator
);
196 if(end
== NULL
|| end
- string
>= loc3066Capacity
) {
197 *status
= U_BUFFER_OVERFLOW_ERROR
;
200 uprv_strncpy(spec
->locale
, string
, end
-string
);
208 static const char* U_CALLCONV
209 _processCollatorOption(CollatorSpec
*spec
, uint32_t option
, const char* string
,
212 spec
->options
[option
] = ucol_sit_letterToAttributeValue(*string
, status
);
213 if((*(++string
) != '_' && *string
) || U_FAILURE(*status
)) {
214 #ifdef UCOL_TRACE_SIT
215 fprintf(stderr
, "%s:%d: unknown collator option at '%s': %s\n", __FILE__
, __LINE__
, string
, u_errorName(*status
));
217 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
225 readHexCodeUnit(const char **string
, UErrorCode
*status
)
230 int32_t noDigits
= 0;
231 while((c
= **string
) != 0 && noDigits
< 4) {
232 if( c
>= '0' && c
<= '9') {
234 } else if ( c
>= 'a' && c
<= 'f') {
235 value
= c
- 'a' + 10;
236 } else if ( c
>= 'A' && c
<= 'F') {
237 value
= c
- 'A' + 10;
239 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
240 #ifdef UCOL_TRACE_SIT
241 fprintf(stderr
, "%s:%d: Bad hex char at '%s': %s\n", __FILE__
, __LINE__
, *string
, u_errorName(*status
));
245 result
= (result
<< 4) | (UChar
)value
;
249 // if the string was terminated before we read 4 digits, set an error
251 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
252 #ifdef UCOL_TRACE_SIT
253 fprintf(stderr
, "%s:%d: Short (only %d digits, wanted 4) at '%s': %s\n", __FILE__
, __LINE__
, noDigits
,*string
, u_errorName(*status
));
260 static const char* U_CALLCONV
261 _processVariableTop(CollatorSpec
*spec
, uint32_t value1
, const char* string
, UErrorCode
*status
)
266 while(U_SUCCESS(*status
) && i
< locElementCapacity
&& *string
!= 0 && *string
!= '_') {
267 spec
->variableTopString
[i
++] = readHexCodeUnit(&string
, status
);
269 spec
->variableTopStringLen
= i
;
270 if(i
== locElementCapacity
&& *string
!= 0 && *string
!= '_') {
271 *status
= U_BUFFER_OVERFLOW_ERROR
;
274 spec
->variableTopValue
= readHexCodeUnit(&string
, status
);
276 if(U_SUCCESS(*status
)) {
277 spec
->variableTopSet
= TRUE
;
284 /* Table for parsing short strings */
285 struct ShortStringOptions
{
287 ActionFunction
*action
;
291 static const ShortStringOptions options
[UCOL_SIT_ITEMS_COUNT
] =
293 /* 10 ALTERNATE_HANDLING */ {alternateHArg
, _processCollatorOption
, UCOL_ALTERNATE_HANDLING
}, // alternate N, S, D
294 /* 15 VARIABLE_TOP_VALUE */ {variableTopValArg
, _processVariableTop
, 1 },
295 /* 08 CASE_FIRST */ {caseFirstArg
, _processCollatorOption
, UCOL_CASE_FIRST
}, // case first L, U, X, D
296 /* 09 NUMERIC_COLLATION */ {numericCollArg
, _processCollatorOption
, UCOL_NUMERIC_COLLATION
}, // codan O, X, D
297 /* 07 CASE_LEVEL */ {caseLevelArg
, _processCollatorOption
, UCOL_CASE_LEVEL
}, // case level O, X, D
298 /* 12 FRENCH_COLLATION */ {frenchCollArg
, _processCollatorOption
, UCOL_FRENCH_COLLATION
}, // french O, X, D
299 /* 13 HIRAGANA_QUATERNARY] */ {hiraganaQArg
, _processCollatorOption
, UCOL_HIRAGANA_QUATERNARY_MODE
}, // hiragana O, X, D
300 /* 04 KEYWORD */ {keywordArg
, _processLocaleElement
, UCOL_SIT_KEYWORD
}, // keyword
301 /* 00 LANGUAGE */ {languageArg
, _processLocaleElement
, UCOL_SIT_LANGUAGE
}, // language
302 /* 11 NORMALIZATION_MODE */ {normArg
, _processCollatorOption
, UCOL_NORMALIZATION_MODE
}, // norm O, X, D
303 /* 02 REGION */ {regionArg
, _processLocaleElement
, UCOL_SIT_REGION
}, // region
304 /* 06 STRENGTH */ {strengthArg
, _processCollatorOption
, UCOL_STRENGTH
}, // strength 1, 2, 3, 4, I, D
305 /* 14 VARIABLE_TOP */ {variableTopArg
, _processVariableTop
, 0 },
306 /* 03 VARIANT */ {variantArg
, _processLocaleElement
, UCOL_SIT_VARIANT
}, // variant
307 /* 05 RFC3066BIS */ {RFC3066Arg
, _processRFC3066Locale
, 0 }, // rfc3066bis locale name
308 /* 01 SCRIPT */ {scriptArg
, _processLocaleElement
, UCOL_SIT_SCRIPT
}, // script
309 /* PROVIDER */ {providerArg
, _processLocaleElement
, UCOL_SIT_PROVIDER
}
314 const char* ucol_sit_readOption(const char *start
, CollatorSpec
*spec
,
319 for(i
= 0; i
< UCOL_SIT_ITEMS_COUNT
; i
++) {
320 if(*start
== options
[i
].optionStart
) {
321 spec
->entries
[i
].start
= start
;
322 const char* end
= options
[i
].action(spec
, options
[i
].attr
, start
+1, status
);
323 spec
->entries
[i
].len
= (int32_t)(end
- start
);
327 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
328 #ifdef UCOL_TRACE_SIT
329 fprintf(stderr
, "%s:%d: Unknown option at '%s': %s\n", __FILE__
, __LINE__
, start
, u_errorName(*status
));
335 void ucol_sit_initCollatorSpecs(CollatorSpec
*spec
)
338 uprv_memset(spec
, 0, sizeof(CollatorSpec
));
339 // set collation options to default
341 for(i
= 0; i
< UCOL_ATTRIBUTE_COUNT
; i
++) {
342 spec
->options
[i
] = UCOL_DEFAULT
;
347 ucol_sit_readSpecs(CollatorSpec
*s
, const char *string
,
348 UParseError
*parseError
, UErrorCode
*status
)
350 const char *definition
= string
;
351 while(U_SUCCESS(*status
) && *string
) {
352 string
= ucol_sit_readOption(string
, s
, status
);
354 while(*string
&& *string
== '_') {
358 if(U_FAILURE(*status
)) {
359 parseError
->offset
= (int32_t)(string
- definition
);
365 int32_t ucol_sit_dumpSpecs(CollatorSpec
*s
, char *destination
, int32_t capacity
, UErrorCode
*status
)
367 int32_t i
= 0, j
= 0;
370 if(U_SUCCESS(*status
)) {
371 for(i
= 0; i
< UCOL_SIT_ITEMS_COUNT
; i
++) {
372 if(s
->entries
[i
].start
) {
375 uprv_strcat(destination
, "_");
379 optName
= *(s
->entries
[i
].start
);
380 if(optName
== languageArg
|| optName
== regionArg
|| optName
== variantArg
|| optName
== keywordArg
) {
381 for(j
= 0; j
< s
->entries
[i
].len
; j
++) {
382 if(len
+ j
< capacity
) {
383 destination
[len
+j
] = uprv_toupper(*(s
->entries
[i
].start
+j
));
386 len
+= s
->entries
[i
].len
;
388 len
+= s
->entries
[i
].len
;
390 uprv_strncat(destination
,s
->entries
[i
].start
, s
->entries
[i
].len
);
402 ucol_sit_calculateWholeLocale(CollatorSpec
*s
) {
403 // put the locale together, unless we have a done
405 if(s
->locale
[0] == 0) {
406 // first the language
407 uprv_strcat(s
->locale
, s
->locElements
[UCOL_SIT_LANGUAGE
]);
408 // then the script, if present
409 if(*(s
->locElements
[UCOL_SIT_SCRIPT
])) {
410 uprv_strcat(s
->locale
, "_");
411 uprv_strcat(s
->locale
, s
->locElements
[UCOL_SIT_SCRIPT
]);
413 // then the region, if present
414 if(*(s
->locElements
[UCOL_SIT_REGION
])) {
415 uprv_strcat(s
->locale
, "_");
416 uprv_strcat(s
->locale
, s
->locElements
[UCOL_SIT_REGION
]);
417 } else if(*(s
->locElements
[UCOL_SIT_VARIANT
])) { // if there is a variant, we need an underscore
418 uprv_strcat(s
->locale
, "_");
420 // add variant, if there
421 if(*(s
->locElements
[UCOL_SIT_VARIANT
])) {
422 uprv_strcat(s
->locale
, "_");
423 uprv_strcat(s
->locale
, s
->locElements
[UCOL_SIT_VARIANT
]);
426 // if there is a collation keyword, add that too
427 if(*(s
->locElements
[UCOL_SIT_KEYWORD
])) {
428 uprv_strcat(s
->locale
, collationKeyword
);
429 uprv_strcat(s
->locale
, s
->locElements
[UCOL_SIT_KEYWORD
]);
432 // if there is a provider keyword, add that too
433 if(*(s
->locElements
[UCOL_SIT_PROVIDER
])) {
434 uprv_strcat(s
->locale
, providerKeyword
);
435 uprv_strcat(s
->locale
, s
->locElements
[UCOL_SIT_PROVIDER
]);
441 U_CAPI
void U_EXPORT2
442 ucol_prepareShortStringOpen( const char *definition
,
444 UParseError
*parseError
,
447 if(U_FAILURE(*status
)) return;
449 UParseError internalParseError
;
452 parseError
= &internalParseError
;
454 parseError
->line
= 0;
455 parseError
->offset
= 0;
456 parseError
->preContext
[0] = 0;
457 parseError
->postContext
[0] = 0;
460 // first we want to pick stuff out of short string.
461 // we'll end up with an UCA version, locale and a bunch of
464 // analyse the string in order to get everything we need.
466 ucol_sit_initCollatorSpecs(&s
);
467 ucol_sit_readSpecs(&s
, definition
, parseError
, status
);
468 ucol_sit_calculateWholeLocale(&s
);
470 char buffer
[internalBufferSize
];
471 uprv_memset(buffer
, 0, internalBufferSize
);
472 uloc_canonicalize(s
.locale
, buffer
, internalBufferSize
, status
);
474 UResourceBundle
*b
= ures_open(U_ICUDATA_COLL
, buffer
, status
);
475 /* we try to find stuff from keyword */
476 UResourceBundle
*collations
= ures_getByKey(b
, "collations", NULL
, status
);
477 UResourceBundle
*collElem
= NULL
;
479 // if there is a keyword, we pick it up and try to get elements
480 if(!uloc_getKeywordValue(buffer
, "collation", keyBuffer
, 256, status
)) {
481 // no keyword. we try to find the default setting, which will give us the keyword value
482 UResourceBundle
*defaultColl
= ures_getByKeyWithFallback(collations
, "default", NULL
, status
);
483 if(U_SUCCESS(*status
)) {
484 int32_t defaultKeyLen
= 0;
485 const UChar
*defaultKey
= ures_getString(defaultColl
, &defaultKeyLen
, status
);
486 u_UCharsToChars(defaultKey
, keyBuffer
, defaultKeyLen
);
487 keyBuffer
[defaultKeyLen
] = 0;
489 *status
= U_INTERNAL_PROGRAM_ERROR
;
492 ures_close(defaultColl
);
494 collElem
= ures_getByKeyWithFallback(collations
, keyBuffer
, collElem
, status
);
495 ures_close(collElem
);
496 ures_close(collations
);
501 U_CAPI UCollator
* U_EXPORT2
502 ucol_openFromShortString( const char *definition
,
504 UParseError
*parseError
,
507 UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN_FROM_SHORT_STRING
);
508 UTRACE_DATA1(UTRACE_INFO
, "short string = \"%s\"", definition
);
510 if(U_FAILURE(*status
)) return 0;
512 UParseError internalParseError
;
515 parseError
= &internalParseError
;
517 parseError
->line
= 0;
518 parseError
->offset
= 0;
519 parseError
->preContext
[0] = 0;
520 parseError
->postContext
[0] = 0;
523 // first we want to pick stuff out of short string.
524 // we'll end up with an UCA version, locale and a bunch of
527 // analyse the string in order to get everything we need.
528 const char *string
= definition
;
530 ucol_sit_initCollatorSpecs(&s
);
531 string
= ucol_sit_readSpecs(&s
, definition
, parseError
, status
);
532 ucol_sit_calculateWholeLocale(&s
);
534 char buffer
[internalBufferSize
];
535 uprv_memset(buffer
, 0, internalBufferSize
);
536 uloc_canonicalize(s
.locale
, buffer
, internalBufferSize
, status
);
538 UCollator
*result
= ucol_open(buffer
, status
);
541 for(i
= 0; i
< UCOL_ATTRIBUTE_COUNT
; i
++) {
542 if(s
.options
[i
] != UCOL_DEFAULT
) {
543 if(forceDefaults
|| ucol_getAttribute(result
, (UColAttribute
)i
, status
) != s
.options
[i
]) {
544 ucol_setAttribute(result
, (UColAttribute
)i
, s
.options
[i
], status
);
547 if(U_FAILURE(*status
)) {
548 parseError
->offset
= (int32_t)(string
- definition
);
555 if(s
.variableTopSet
) {
556 if(s
.variableTopString
[0]) {
557 ucol_setVariableTop(result
, s
.variableTopString
, s
.variableTopStringLen
, status
);
558 } else { // we set by value, using 'B'
559 ucol_restoreVariableTop(result
, s
.variableTopValue
, status
);
564 if(U_FAILURE(*status
)) { // here it can only be a bogus value
569 UTRACE_EXIT_PTR_STATUS(result
, *status
);
574 static void appendShortStringElement(const char *src
, int32_t len
, char *result
, int32_t *resultSize
, int32_t capacity
, char arg
)
578 if(*resultSize
< capacity
) {
579 uprv_strcat(result
, "_");
583 *resultSize
+= len
+ 1;
584 if(*resultSize
< capacity
) {
585 uprv_strncat(result
, &arg
, 1);
586 uprv_strncat(result
, src
, len
);
591 U_CAPI
int32_t U_EXPORT2
592 ucol_getShortDefinitionString(const UCollator
*coll
,
598 if(U_FAILURE(*status
)) return 0;
599 if(coll
->delegate
!= NULL
) {
600 return ((icu::Collator
*)coll
->delegate
)->internalGetShortDefinitionString(locale
,dst
,capacity
,*status
);
602 char buffer
[internalBufferSize
];
603 uprv_memset(buffer
, 0, internalBufferSize
*sizeof(char));
604 int32_t resultSize
= 0;
605 char tempbuff
[internalBufferSize
];
606 char locBuff
[internalBufferSize
];
607 uprv_memset(buffer
, 0, internalBufferSize
*sizeof(char));
608 int32_t elementSize
= 0;
609 UBool isAvailable
= 0;
611 ucol_sit_initCollatorSpecs(&s
);
614 locale
= ucol_getLocaleByType(coll
, ULOC_VALID_LOCALE
, status
);
616 elementSize
= ucol_getFunctionalEquivalent(locBuff
, internalBufferSize
, "collation", locale
, &isAvailable
, status
);
619 // we should probably canonicalize here...
620 elementSize
= uloc_getLanguage(locBuff
, tempbuff
, internalBufferSize
, status
);
621 appendShortStringElement(tempbuff
, elementSize
, buffer
, &resultSize
, /*capacity*/internalBufferSize
, languageArg
);
622 elementSize
= uloc_getCountry(locBuff
, tempbuff
, internalBufferSize
, status
);
623 appendShortStringElement(tempbuff
, elementSize
, buffer
, &resultSize
, /*capacity*/internalBufferSize
, regionArg
);
624 elementSize
= uloc_getScript(locBuff
, tempbuff
, internalBufferSize
, status
);
625 appendShortStringElement(tempbuff
, elementSize
, buffer
, &resultSize
, /*capacity*/internalBufferSize
, scriptArg
);
626 elementSize
= uloc_getVariant(locBuff
, tempbuff
, internalBufferSize
, status
);
627 appendShortStringElement(tempbuff
, elementSize
, buffer
, &resultSize
, /*capacity*/internalBufferSize
, variantArg
);
628 elementSize
= uloc_getKeywordValue(locBuff
, "collation", tempbuff
, internalBufferSize
, status
);
629 appendShortStringElement(tempbuff
, elementSize
, buffer
, &resultSize
, /*capacity*/internalBufferSize
, keywordArg
);
633 UColAttributeValue attribute
= UCOL_DEFAULT
;
634 for(i
= 0; i
< UCOL_SIT_ITEMS_COUNT
; i
++) {
635 if(options
[i
].action
== _processCollatorOption
) {
636 attribute
= ucol_getAttributeOrDefault(coll
, (UColAttribute
)options
[i
].attr
, status
);
637 if(attribute
!= UCOL_DEFAULT
) {
638 char letter
= ucol_sit_attributeValueToLetter(attribute
, status
);
639 appendShortStringElement(&letter
, 1,
640 buffer
, &resultSize
, /*capacity*/internalBufferSize
, options
[i
].optionStart
);
644 if(coll
->variableTopValueisDefault
== FALSE
) {
645 //s.variableTopValue = ucol_getVariableTop(coll, status);
646 elementSize
= T_CString_integerToString(tempbuff
, coll
->variableTopValue
, 16);
647 appendShortStringElement(tempbuff
, elementSize
, buffer
, &resultSize
, capacity
, variableTopValArg
);
650 UParseError parseError
;
651 return ucol_normalizeShortDefinitionString(buffer
, dst
, capacity
, &parseError
, status
);
654 U_CAPI
int32_t U_EXPORT2
655 ucol_normalizeShortDefinitionString(const char *definition
,
658 UParseError
*parseError
,
662 if(U_FAILURE(*status
)) {
667 uprv_memset(destination
, 0, capacity
*sizeof(char));
677 ucol_sit_initCollatorSpecs(&s
);
678 ucol_sit_readSpecs(&s
, definition
, parseError
, status
);
679 return ucol_sit_dumpSpecs(&s
, destination
, capacity
, status
);
682 U_CAPI UColAttributeValue U_EXPORT2
683 ucol_getAttributeOrDefault(const UCollator
*coll
, UColAttribute attr
, UErrorCode
*status
)
685 if(U_FAILURE(*status
) || coll
== NULL
) {
689 case UCOL_NUMERIC_COLLATION
:
690 return coll
->numericCollationisDefault
?UCOL_DEFAULT
:coll
->numericCollation
;
691 case UCOL_HIRAGANA_QUATERNARY_MODE
:
692 return coll
->hiraganaQisDefault
?UCOL_DEFAULT
:coll
->hiraganaQ
;
693 case UCOL_FRENCH_COLLATION
: /* attribute for direction of secondary weights*/
694 return coll
->frenchCollationisDefault
?UCOL_DEFAULT
:coll
->frenchCollation
;
695 case UCOL_ALTERNATE_HANDLING
: /* attribute for handling variable elements*/
696 return coll
->alternateHandlingisDefault
?UCOL_DEFAULT
:coll
->alternateHandling
;
697 case UCOL_CASE_FIRST
: /* who goes first, lower case or uppercase */
698 return coll
->caseFirstisDefault
?UCOL_DEFAULT
:coll
->caseFirst
;
699 case UCOL_CASE_LEVEL
: /* do we have an extra case level */
700 return coll
->caseLevelisDefault
?UCOL_DEFAULT
:coll
->caseLevel
;
701 case UCOL_NORMALIZATION_MODE
: /* attribute for normalization */
702 return coll
->normalizationModeisDefault
?UCOL_DEFAULT
:coll
->normalizationMode
;
703 case UCOL_STRENGTH
: /* attribute for strength */
704 return coll
->strengthisDefault
?UCOL_DEFAULT
:coll
->strength
;
705 case UCOL_ATTRIBUTE_COUNT
:
707 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
708 #ifdef UCOL_TRACE_SIT
709 fprintf(stderr
, "%s:%d: Unknown attr value '%d': %s\n", __FILE__
, __LINE__
, (int)attr
, u_errorName(*status
));
718 const UCollator
*coll
;
721 USet
*removedContractions
;
729 addSpecial(contContext
*context
, UChar
*buffer
, int32_t bufLen
,
730 uint32_t CE
, int32_t leftIndex
, int32_t rightIndex
, UErrorCode
*status
)
732 const UCollator
*coll
= context
->coll
;
733 USet
*contractions
= context
->conts
;
734 USet
*expansions
= context
->expansions
;
735 UBool addPrefixes
= context
->addPrefixes
;
737 const UChar
*UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
738 uint32_t newCE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
739 // we might have a contraction that ends from previous level
740 if(newCE
!= UCOL_NOT_FOUND
) {
741 if(isSpecial(CE
) && getCETag(CE
) == CONTRACTION_TAG
&& isSpecial(newCE
) && getCETag(newCE
) == SPEC_PROC_TAG
&& addPrefixes
) {
742 addSpecial(context
, buffer
, bufLen
, newCE
, leftIndex
, rightIndex
, status
);
744 if(contractions
&& rightIndex
-leftIndex
> 1) {
745 uset_addString(contractions
, buffer
+leftIndex
, rightIndex
-leftIndex
);
746 if(expansions
&& isSpecial(CE
) && getCETag(CE
) == EXPANSION_TAG
) {
747 uset_addString(expansions
, buffer
+leftIndex
, rightIndex
-leftIndex
);
753 // check whether we're doing contraction or prefix
754 if(getCETag(CE
) == SPEC_PROC_TAG
&& addPrefixes
) {
756 *status
= U_INTERNAL_PROGRAM_ERROR
;
760 while(*UCharOffset
!= 0xFFFF) {
761 newCE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
762 buffer
[leftIndex
] = *UCharOffset
;
763 if(isSpecial(newCE
) && (getCETag(newCE
) == CONTRACTION_TAG
|| getCETag(newCE
) == SPEC_PROC_TAG
)) {
764 addSpecial(context
, buffer
, bufLen
, newCE
, leftIndex
, rightIndex
, status
);
767 uset_addString(contractions
, buffer
+leftIndex
, rightIndex
-leftIndex
);
769 if(expansions
&& isSpecial(newCE
) && getCETag(newCE
) == EXPANSION_TAG
) {
770 uset_addString(expansions
, buffer
+leftIndex
, rightIndex
-leftIndex
);
775 } else if(getCETag(CE
) == CONTRACTION_TAG
) {
776 if(rightIndex
== bufLen
-1) {
777 *status
= U_INTERNAL_PROGRAM_ERROR
;
780 while(*UCharOffset
!= 0xFFFF) {
781 newCE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
782 buffer
[rightIndex
] = *UCharOffset
;
783 if(isSpecial(newCE
) && (getCETag(newCE
) == CONTRACTION_TAG
|| getCETag(newCE
) == SPEC_PROC_TAG
)) {
784 addSpecial(context
, buffer
, bufLen
, newCE
, leftIndex
, rightIndex
+1, status
);
787 uset_addString(contractions
, buffer
+leftIndex
, rightIndex
+1-leftIndex
);
789 if(expansions
&& isSpecial(newCE
) && getCETag(newCE
) == EXPANSION_TAG
) {
790 uset_addString(expansions
, buffer
+leftIndex
, rightIndex
+1-leftIndex
);
800 static UBool U_CALLCONV
801 _processSpecials(const void *context
, UChar32 start
, UChar32 limit
, uint32_t CE
)
803 UErrorCode
*status
= ((contContext
*)context
)->status
;
804 USet
*expansions
= ((contContext
*)context
)->expansions
;
805 USet
*removed
= ((contContext
*)context
)->removedContractions
;
806 UBool addPrefixes
= ((contContext
*)context
)->addPrefixes
;
807 UChar contraction
[internalBufferSize
];
809 if(((getCETag(CE
) == SPEC_PROC_TAG
&& addPrefixes
) || getCETag(CE
) == CONTRACTION_TAG
)) {
810 while(start
< limit
&& U_SUCCESS(*status
)) {
811 // if there are suppressed contractions, we don't
813 if(removed
&& uset_contains(removed
, start
)) {
817 // we start our contraction from middle, since we don't know if it
818 // will grow toward right or left
819 contraction
[internalBufferSize
/2] = (UChar
)start
;
820 addSpecial(((contContext
*)context
), contraction
, internalBufferSize
, CE
, internalBufferSize
/2, internalBufferSize
/2+1, status
);
823 } else if(expansions
&& getCETag(CE
) == EXPANSION_TAG
) {
824 while(start
< limit
&& U_SUCCESS(*status
)) {
825 uset_add(expansions
, start
++);
829 if(U_FAILURE(*status
)) {
841 * Get a set containing the contractions defined by the collator. The set includes
842 * both the UCA contractions and the contractions defined by the collator
843 * @param coll collator
844 * @param conts the set to hold the result
845 * @param status to hold the error code
846 * @return the size of the contraction set
848 U_CAPI
int32_t U_EXPORT2
849 ucol_getContractions( const UCollator
*coll
,
853 ucol_getContractionsAndExpansions(coll
, contractions
, NULL
, FALSE
, status
);
854 return uset_getItemCount(contractions
);
858 * Get a set containing the expansions defined by the collator. The set includes
859 * both the UCA expansions and the expansions defined by the tailoring
860 * @param coll collator
861 * @param conts the set to hold the result
862 * @param addPrefixes add the prefix contextual elements to contractions
863 * @param status to hold the error code
867 U_CAPI
void U_EXPORT2
868 ucol_getContractionsAndExpansions( const UCollator
*coll
,
874 if(U_FAILURE(*status
)) {
878 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
883 uset_clear(contractions
);
886 uset_clear(expansions
);
888 int32_t rulesLen
= 0;
889 const UChar
* rules
= ucol_getRules(coll
, &rulesLen
);
891 ucol_tok_initTokenList(&src
, rules
, rulesLen
, coll
->UCA
,
892 ucol_tok_getRulesFromBundle
, NULL
, status
);
894 contContext c
= { NULL
, contractions
, expansions
, src
.removeSet
, addPrefixes
, status
};
896 // Add the UCA contractions
898 utrie_enum(&coll
->UCA
->mapping
, NULL
, _processSpecials
, &c
);
900 // This is collator specific. Add contractions from a collator
902 c
.removedContractions
= NULL
;
903 utrie_enum(&coll
->mapping
, NULL
, _processSpecials
, &c
);
904 ucol_tok_closeTokenList(&src
);
907 U_CAPI
int32_t U_EXPORT2
908 ucol_getUnsafeSet( const UCollator
*coll
,
912 UChar buffer
[internalBufferSize
];
917 // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
918 static const UChar cccpattern
[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
919 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };
921 // add chars that fail the fcd check
922 uset_applyPattern(unsafe
, cccpattern
, 24, USET_IGNORE_SPACE
, status
);
924 // add Thai/Lao prevowels
925 uset_addRange(unsafe
, 0xe40, 0xe44);
926 uset_addRange(unsafe
, 0xec0, 0xec4);
927 // add lead/trail surrogates
928 uset_addRange(unsafe
, 0xd800, 0xdfff);
930 USet
*contractions
= uset_open(0,0);
932 int32_t i
= 0, j
= 0;
933 int32_t contsSize
= ucol_getContractions(coll
, contractions
, status
);
935 // Contraction set consists only of strings
936 // to get unsafe code points, we need to
937 // break the strings apart and add them to the unsafe set
938 for(i
= 0; i
< contsSize
; i
++) {
939 len
= uset_getItem(contractions
, i
, NULL
, NULL
, buffer
, internalBufferSize
, status
);
943 U16_NEXT(buffer
, j
, len
, c
);
951 uset_close(contractions
);
953 return uset_size(unsafe
);