// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*   Copyright (C) 2004-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  ucol_sit.cpp
*   tab size:   8 (not used)
*
* Modification history
* 03/12/2004  weiv      Creation
*/
18 #include "unicode/ustring.h"
19 #include "unicode/udata.h"
20 #include "unicode/utf16.h"
26 #include "unicode/coll.h"
32 #if !UCONFIG_NO_COLLATION
34 #include "unicode/tblcoll.h"
37 UCOL_SIT_LANGUAGE
= 0,
42 UCOL_SIT_PROVIDER
= 5,
43 UCOL_SIT_LOCELEMENT_MAX
= UCOL_SIT_PROVIDER
, /* the last element that's part of LocElements */
49 UCOL_SIT_NUMERIC_COLLATION
,
50 UCOL_SIT_ALTERNATE_HANDLING
,
51 UCOL_SIT_NORMALIZATION_MODE
,
52 UCOL_SIT_FRENCH_COLLATION
,
53 UCOL_SIT_HIRAGANA_QUATERNARY
,
54 UCOL_SIT_VARIABLE_TOP
,
55 UCOL_SIT_VARIABLE_TOP_VALUE
,
/*
 * Option starter chars: the single character that introduces each option
 * in a short definition string. The options table maps every starter to
 * the function that parses the text following it.
 */
static const char alternateHArg     = 'A';
static const char variableTopValArg = 'B';
static const char caseFirstArg      = 'C';
static const char numericCollArg    = 'D';
static const char caseLevelArg      = 'E';
static const char frenchCollArg     = 'F';
static const char hiraganaQArg      = 'H';
static const char keywordArg        = 'K';
static const char languageArg       = 'L';
static const char normArg           = 'N';
static const char providerArg       = 'P';
static const char regionArg         = 'R';
static const char strengthArg       = 'S';
static const char variableTopArg    = 'T';
static const char variantArg        = 'V';
static const char RFC3066Arg        = 'X';
static const char scriptArg         = 'Z';
/*
 * Keyword prefixes appended to the locale ID when the whole locale is
 * assembled from its parsed elements: the collation keyword value and the
 * provider value are each written directly after their prefix.
 */
static const char collationKeyword[] = "@collation=";
static const char providerKeyword[]  = "@sp=";
82 static const int32_t locElementCount
= UCOL_SIT_LOCELEMENT_MAX
+1;
83 static const int32_t locElementCapacity
= 32;
84 static const int32_t loc3066Capacity
= 256;
85 static const int32_t locProviderCapacity
= 10;
86 static const int32_t internalBufferSize
= 512;
88 /* structure containing specification of a collator. Initialized
89 * from a short string. Also used to construct a short string from a
93 char locElements
[locElementCount
][locElementCapacity
];
94 char locale
[loc3066Capacity
];
95 char provider
[locProviderCapacity
];
96 UColAttributeValue options
[UCOL_ATTRIBUTE_COUNT
];
97 uint32_t variableTopValue
;
98 UChar variableTopString
[locElementCapacity
];
99 int32_t variableTopStringLen
;
100 UBool variableTopSet
;
104 } entries
[UCOL_SIT_ITEMS_COUNT
];
108 /* structure for converting between character attribute
109 * representation and real collation attribute value.
111 struct AttributeConversion
{
113 UColAttributeValue value
;
116 static const AttributeConversion conversions
[12] = {
117 { '1', UCOL_PRIMARY
},
118 { '2', UCOL_SECONDARY
},
119 { '3', UCOL_TERTIARY
},
120 { '4', UCOL_QUATERNARY
},
121 { 'D', UCOL_DEFAULT
},
122 { 'I', UCOL_IDENTICAL
},
123 { 'L', UCOL_LOWER_FIRST
},
124 { 'N', UCOL_NON_IGNORABLE
},
126 { 'S', UCOL_SHIFTED
},
127 { 'U', UCOL_UPPER_FIRST
},
132 static UColAttributeValue
133 ucol_sit_letterToAttributeValue(char letter
, UErrorCode
*status
) {
135 for(i
= 0; i
< UPRV_LENGTHOF(conversions
); i
++) {
136 if(conversions
[i
].letter
== letter
) {
137 return conversions
[i
].value
;
140 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
141 #ifdef UCOL_TRACE_SIT
142 fprintf(stderr
, "%s:%d: unknown letter %c: %s\n", __FILE__
, __LINE__
, letter
, u_errorName(*status
));
147 /* function prototype for functions used to parse a short string */
149 typedef const char* U_CALLCONV
150 ActionFunction(CollatorSpec
*spec
, uint32_t value1
, const char* string
,
155 static const char* U_CALLCONV
156 _processLocaleElement(CollatorSpec
*spec
, uint32_t value
, const char* string
,
161 if(value
== UCOL_SIT_LANGUAGE
|| value
== UCOL_SIT_KEYWORD
|| value
== UCOL_SIT_PROVIDER
) {
162 spec
->locElements
[value
][len
++] = uprv_tolower(*string
);
164 spec
->locElements
[value
][len
++] = *string
;
166 } while(*(++string
) != '_' && *string
&& len
< locElementCapacity
);
167 if(len
>= locElementCapacity
) {
168 *status
= U_BUFFER_OVERFLOW_ERROR
;
171 // don't skip the underscore at the end
177 static const char* U_CALLCONV
178 _processRFC3066Locale(CollatorSpec
*spec
, uint32_t, const char* string
,
181 char terminator
= *string
;
183 const char *end
= uprv_strchr(string
+1, terminator
);
184 if(end
== NULL
|| end
- string
>= loc3066Capacity
) {
185 *status
= U_BUFFER_OVERFLOW_ERROR
;
188 uprv_strncpy(spec
->locale
, string
, end
-string
);
196 static const char* U_CALLCONV
197 _processCollatorOption(CollatorSpec
*spec
, uint32_t option
, const char* string
,
200 spec
->options
[option
] = ucol_sit_letterToAttributeValue(*string
, status
);
201 if((*(++string
) != '_' && *string
) || U_FAILURE(*status
)) {
202 #ifdef UCOL_TRACE_SIT
203 fprintf(stderr
, "%s:%d: unknown collator option at '%s': %s\n", __FILE__
, __LINE__
, string
, u_errorName(*status
));
205 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
213 readHexCodeUnit(const char **string
, UErrorCode
*status
)
218 int32_t noDigits
= 0;
219 while((c
= **string
) != 0 && noDigits
< 4) {
220 if( c
>= '0' && c
<= '9') {
222 } else if ( c
>= 'a' && c
<= 'f') {
223 value
= c
- 'a' + 10;
224 } else if ( c
>= 'A' && c
<= 'F') {
225 value
= c
- 'A' + 10;
227 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
228 #ifdef UCOL_TRACE_SIT
229 fprintf(stderr
, "%s:%d: Bad hex char at '%s': %s\n", __FILE__
, __LINE__
, *string
, u_errorName(*status
));
233 result
= (result
<< 4) | (UChar
)value
;
237 // if the string was terminated before we read 4 digits, set an error
239 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
240 #ifdef UCOL_TRACE_SIT
241 fprintf(stderr
, "%s:%d: Short (only %d digits, wanted 4) at '%s': %s\n", __FILE__
, __LINE__
, noDigits
,*string
, u_errorName(*status
));
248 static const char* U_CALLCONV
249 _processVariableTop(CollatorSpec
*spec
, uint32_t value1
, const char* string
, UErrorCode
*status
)
254 while(U_SUCCESS(*status
) && i
< locElementCapacity
&& *string
!= 0 && *string
!= '_') {
255 spec
->variableTopString
[i
++] = readHexCodeUnit(&string
, status
);
257 spec
->variableTopStringLen
= i
;
258 if(i
== locElementCapacity
&& *string
!= 0 && *string
!= '_') {
259 *status
= U_BUFFER_OVERFLOW_ERROR
;
262 spec
->variableTopValue
= readHexCodeUnit(&string
, status
);
264 if(U_SUCCESS(*status
)) {
265 spec
->variableTopSet
= TRUE
;
272 /* Table for parsing short strings */
273 struct ShortStringOptions
{
275 ActionFunction
*action
;
279 static const ShortStringOptions options
[UCOL_SIT_ITEMS_COUNT
] =
281 /* 10 ALTERNATE_HANDLING */ {alternateHArg
, _processCollatorOption
, UCOL_ALTERNATE_HANDLING
}, // alternate N, S, D
282 /* 15 VARIABLE_TOP_VALUE */ {variableTopValArg
, _processVariableTop
, 1 },
283 /* 08 CASE_FIRST */ {caseFirstArg
, _processCollatorOption
, UCOL_CASE_FIRST
}, // case first L, U, X, D
284 /* 09 NUMERIC_COLLATION */ {numericCollArg
, _processCollatorOption
, UCOL_NUMERIC_COLLATION
}, // codan O, X, D
285 /* 07 CASE_LEVEL */ {caseLevelArg
, _processCollatorOption
, UCOL_CASE_LEVEL
}, // case level O, X, D
286 /* 12 FRENCH_COLLATION */ {frenchCollArg
, _processCollatorOption
, UCOL_FRENCH_COLLATION
}, // french O, X, D
287 /* 13 HIRAGANA_QUATERNARY] */ {hiraganaQArg
, _processCollatorOption
, UCOL_HIRAGANA_QUATERNARY_MODE
}, // hiragana O, X, D
288 /* 04 KEYWORD */ {keywordArg
, _processLocaleElement
, UCOL_SIT_KEYWORD
}, // keyword
289 /* 00 LANGUAGE */ {languageArg
, _processLocaleElement
, UCOL_SIT_LANGUAGE
}, // language
290 /* 11 NORMALIZATION_MODE */ {normArg
, _processCollatorOption
, UCOL_NORMALIZATION_MODE
}, // norm O, X, D
291 /* 02 REGION */ {regionArg
, _processLocaleElement
, UCOL_SIT_REGION
}, // region
292 /* 06 STRENGTH */ {strengthArg
, _processCollatorOption
, UCOL_STRENGTH
}, // strength 1, 2, 3, 4, I, D
293 /* 14 VARIABLE_TOP */ {variableTopArg
, _processVariableTop
, 0 },
294 /* 03 VARIANT */ {variantArg
, _processLocaleElement
, UCOL_SIT_VARIANT
}, // variant
295 /* 05 RFC3066BIS */ {RFC3066Arg
, _processRFC3066Locale
, 0 }, // rfc3066bis locale name
296 /* 01 SCRIPT */ {scriptArg
, _processLocaleElement
, UCOL_SIT_SCRIPT
}, // script
297 /* PROVIDER */ {providerArg
, _processLocaleElement
, UCOL_SIT_PROVIDER
}
302 const char* ucol_sit_readOption(const char *start
, CollatorSpec
*spec
,
307 for(i
= 0; i
< UCOL_SIT_ITEMS_COUNT
; i
++) {
308 if(*start
== options
[i
].optionStart
) {
309 spec
->entries
[i
].start
= start
;
310 const char* end
= options
[i
].action(spec
, options
[i
].attr
, start
+1, status
);
311 spec
->entries
[i
].len
= (int32_t)(end
- start
);
315 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
316 #ifdef UCOL_TRACE_SIT
317 fprintf(stderr
, "%s:%d: Unknown option at '%s': %s\n", __FILE__
, __LINE__
, start
, u_errorName(*status
));
323 void ucol_sit_initCollatorSpecs(CollatorSpec
*spec
)
326 uprv_memset(spec
, 0, sizeof(CollatorSpec
));
327 // set collation options to default
329 for(i
= 0; i
< UCOL_ATTRIBUTE_COUNT
; i
++) {
330 spec
->options
[i
] = UCOL_DEFAULT
;
335 ucol_sit_readSpecs(CollatorSpec
*s
, const char *string
,
336 UParseError
*parseError
, UErrorCode
*status
)
338 const char *definition
= string
;
339 while(U_SUCCESS(*status
) && *string
) {
340 string
= ucol_sit_readOption(string
, s
, status
);
342 while(*string
&& *string
== '_') {
346 if(U_FAILURE(*status
)) {
347 parseError
->offset
= (int32_t)(string
- definition
);
353 int32_t ucol_sit_dumpSpecs(CollatorSpec
*s
, char *destination
, int32_t capacity
, UErrorCode
*status
)
355 int32_t i
= 0, j
= 0;
358 if(U_SUCCESS(*status
)) {
359 for(i
= 0; i
< UCOL_SIT_ITEMS_COUNT
; i
++) {
360 if(s
->entries
[i
].start
) {
363 uprv_strcat(destination
, "_");
367 optName
= *(s
->entries
[i
].start
);
368 if(optName
== languageArg
|| optName
== regionArg
|| optName
== variantArg
|| optName
== keywordArg
) {
369 for(j
= 0; j
< s
->entries
[i
].len
; j
++) {
370 if(len
+ j
< capacity
) {
371 destination
[len
+j
] = uprv_toupper(*(s
->entries
[i
].start
+j
));
374 len
+= s
->entries
[i
].len
;
376 len
+= s
->entries
[i
].len
;
378 uprv_strncat(destination
,s
->entries
[i
].start
, s
->entries
[i
].len
);
390 ucol_sit_calculateWholeLocale(CollatorSpec
*s
) {
391 // put the locale together, unless we have a done
393 if(s
->locale
[0] == 0) {
394 // first the language
395 uprv_strcat(s
->locale
, s
->locElements
[UCOL_SIT_LANGUAGE
]);
396 // then the script, if present
397 if(*(s
->locElements
[UCOL_SIT_SCRIPT
])) {
398 uprv_strcat(s
->locale
, "_");
399 uprv_strcat(s
->locale
, s
->locElements
[UCOL_SIT_SCRIPT
]);
401 // then the region, if present
402 if(*(s
->locElements
[UCOL_SIT_REGION
])) {
403 uprv_strcat(s
->locale
, "_");
404 uprv_strcat(s
->locale
, s
->locElements
[UCOL_SIT_REGION
]);
405 } else if(*(s
->locElements
[UCOL_SIT_VARIANT
])) { // if there is a variant, we need an underscore
406 uprv_strcat(s
->locale
, "_");
408 // add variant, if there
409 if(*(s
->locElements
[UCOL_SIT_VARIANT
])) {
410 uprv_strcat(s
->locale
, "_");
411 uprv_strcat(s
->locale
, s
->locElements
[UCOL_SIT_VARIANT
]);
414 // if there is a collation keyword, add that too
415 if(*(s
->locElements
[UCOL_SIT_KEYWORD
])) {
416 uprv_strcat(s
->locale
, collationKeyword
);
417 uprv_strcat(s
->locale
, s
->locElements
[UCOL_SIT_KEYWORD
]);
420 // if there is a provider keyword, add that too
421 if(*(s
->locElements
[UCOL_SIT_PROVIDER
])) {
422 uprv_strcat(s
->locale
, providerKeyword
);
423 uprv_strcat(s
->locale
, s
->locElements
[UCOL_SIT_PROVIDER
]);
429 U_CAPI
void U_EXPORT2
430 ucol_prepareShortStringOpen( const char *definition
,
432 UParseError
*parseError
,
435 if(U_FAILURE(*status
)) return;
437 UParseError internalParseError
;
440 parseError
= &internalParseError
;
442 parseError
->line
= 0;
443 parseError
->offset
= 0;
444 parseError
->preContext
[0] = 0;
445 parseError
->postContext
[0] = 0;
448 // first we want to pick stuff out of short string.
449 // we'll end up with an UCA version, locale and a bunch of
452 // analyse the string in order to get everything we need.
454 ucol_sit_initCollatorSpecs(&s
);
455 ucol_sit_readSpecs(&s
, definition
, parseError
, status
);
456 ucol_sit_calculateWholeLocale(&s
);
458 char buffer
[internalBufferSize
];
459 uprv_memset(buffer
, 0, internalBufferSize
);
460 uloc_canonicalize(s
.locale
, buffer
, internalBufferSize
, status
);
462 UResourceBundle
*b
= ures_open(U_ICUDATA_COLL
, buffer
, status
);
463 /* we try to find stuff from keyword */
464 UResourceBundle
*collations
= ures_getByKey(b
, "collations", NULL
, status
);
465 UResourceBundle
*collElem
= NULL
;
467 // if there is a keyword, we pick it up and try to get elements
468 int32_t keyLen
= uloc_getKeywordValue(buffer
, "collation", keyBuffer
, sizeof(keyBuffer
), status
);
469 // Treat too long a value as no keyword.
470 if(keyLen
>= (int32_t)sizeof(keyBuffer
)) {
472 *status
= U_ZERO_ERROR
;
476 // we try to find the default setting, which will give us the keyword value
477 UResourceBundle
*defaultColl
= ures_getByKeyWithFallback(collations
, "default", NULL
, status
);
478 if(U_SUCCESS(*status
)) {
479 int32_t defaultKeyLen
= 0;
480 const UChar
*defaultKey
= ures_getString(defaultColl
, &defaultKeyLen
, status
);
481 u_UCharsToChars(defaultKey
, keyBuffer
, defaultKeyLen
);
482 keyBuffer
[defaultKeyLen
] = 0;
484 *status
= U_INTERNAL_PROGRAM_ERROR
;
487 ures_close(defaultColl
);
489 collElem
= ures_getByKeyWithFallback(collations
, keyBuffer
, collElem
, status
);
490 ures_close(collElem
);
491 ures_close(collations
);
496 U_CAPI UCollator
* U_EXPORT2
497 ucol_openFromShortString( const char *definition
,
499 UParseError
*parseError
,
502 UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN_FROM_SHORT_STRING
);
503 UTRACE_DATA1(UTRACE_INFO
, "short string = \"%s\"", definition
);
505 if(U_FAILURE(*status
)) return 0;
507 UParseError internalParseError
;
510 parseError
= &internalParseError
;
512 parseError
->line
= 0;
513 parseError
->offset
= 0;
514 parseError
->preContext
[0] = 0;
515 parseError
->postContext
[0] = 0;
518 // first we want to pick stuff out of short string.
519 // we'll end up with an UCA version, locale and a bunch of
522 // analyse the string in order to get everything we need.
523 const char *string
= definition
;
525 ucol_sit_initCollatorSpecs(&s
);
526 string
= ucol_sit_readSpecs(&s
, definition
, parseError
, status
);
527 ucol_sit_calculateWholeLocale(&s
);
529 char buffer
[internalBufferSize
];
530 uprv_memset(buffer
, 0, internalBufferSize
);
531 uloc_canonicalize(s
.locale
, buffer
, internalBufferSize
, status
);
533 UCollator
*result
= ucol_open(buffer
, status
);
536 for(i
= 0; i
< UCOL_ATTRIBUTE_COUNT
; i
++) {
537 if(s
.options
[i
] != UCOL_DEFAULT
) {
538 if(forceDefaults
|| ucol_getAttribute(result
, (UColAttribute
)i
, status
) != s
.options
[i
]) {
539 ucol_setAttribute(result
, (UColAttribute
)i
, s
.options
[i
], status
);
542 if(U_FAILURE(*status
)) {
543 parseError
->offset
= (int32_t)(string
- definition
);
550 if(s
.variableTopSet
) {
551 if(s
.variableTopString
[0]) {
552 ucol_setVariableTop(result
, s
.variableTopString
, s
.variableTopStringLen
, status
);
553 } else { // we set by value, using 'B'
554 ucol_restoreVariableTop(result
, s
.variableTopValue
, status
);
559 if(U_FAILURE(*status
)) { // here it can only be a bogus value
564 UTRACE_EXIT_PTR_STATUS(result
, *status
);
569 U_CAPI
int32_t U_EXPORT2
570 ucol_getShortDefinitionString(const UCollator
*coll
,
576 if(U_FAILURE(*status
)) return 0;
578 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
581 return ((icu::Collator
*)coll
)->internalGetShortDefinitionString(locale
,dst
,capacity
,*status
);
584 U_CAPI
int32_t U_EXPORT2
585 ucol_normalizeShortDefinitionString(const char *definition
,
588 UParseError
*parseError
,
592 if(U_FAILURE(*status
)) {
597 uprv_memset(destination
, 0, capacity
*sizeof(char));
607 ucol_sit_initCollatorSpecs(&s
);
608 ucol_sit_readSpecs(&s
, definition
, parseError
, status
);
609 return ucol_sit_dumpSpecs(&s
, destination
, capacity
, status
);
613 * Get a set containing the contractions defined by the collator. The set includes
614 * both the UCA contractions and the contractions defined by the collator
615 * @param coll collator
616 * @param conts the set to hold the result
617 * @param status to hold the error code
618 * @return the size of the contraction set
620 U_CAPI
int32_t U_EXPORT2
621 ucol_getContractions( const UCollator
*coll
,
625 ucol_getContractionsAndExpansions(coll
, contractions
, NULL
, FALSE
, status
);
626 return uset_getItemCount(contractions
);
630 * Get a set containing the expansions defined by the collator. The set includes
631 * both the UCA expansions and the expansions defined by the tailoring
632 * @param coll collator
633 * @param conts the set to hold the result
634 * @param addPrefixes add the prefix contextual elements to contractions
635 * @param status to hold the error code
639 U_CAPI
void U_EXPORT2
640 ucol_getContractionsAndExpansions( const UCollator
*coll
,
646 if(U_FAILURE(*status
)) {
650 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
653 const icu::RuleBasedCollator
*rbc
= icu::RuleBasedCollator::rbcFromUCollator(coll
);
655 *status
= U_UNSUPPORTED_ERROR
;
658 rbc
->internalGetContractionsAndExpansions(
659 icu::UnicodeSet::fromUSet(contractions
),
660 icu::UnicodeSet::fromUSet(expansions
),
661 addPrefixes
, *status
);