]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/ucol_sit.cpp
ICU-400.40.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_sit.cpp
1 /*
2 *******************************************************************************
3 * Copyright (C) 2004-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucol_sit.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * Modification history
12 * Date Name Comments
13 * 03/12/2004 weiv Creation
14 */
15
16 #include "unicode/ustring.h"
17 #include "unicode/udata.h"
18
19 #include "utracimp.h"
20 #include "ucol_imp.h"
21 #include "ucol_tok.h"
22 #include "unormimp.h"
23 #include "cmemory.h"
24 #include "cstring.h"
25 #include "uresimp.h"
26
27 #if !UCONFIG_NO_COLLATION
28
/* Identifiers for the individual items that may appear in a collator
 * short definition string. UCOL_SIT_ITEMS_COUNT is the number of items
 * and is used to size the per-item tables in this file
 * (CollatorSpec::entries and the options[] parsing table).
 */
enum OptionsList {
    UCOL_SIT_LANGUAGE = 0,
    UCOL_SIT_SCRIPT,
    UCOL_SIT_REGION,
    UCOL_SIT_VARIANT,
    UCOL_SIT_KEYWORD,
    UCOL_SIT_BCP47,
    UCOL_SIT_STRENGTH,
    UCOL_SIT_CASE_LEVEL,
    UCOL_SIT_CASE_FIRST,
    UCOL_SIT_NUMERIC_COLLATION,
    UCOL_SIT_ALTERNATE_HANDLING,
    UCOL_SIT_NORMALIZATION_MODE,
    UCOL_SIT_FRENCH_COLLATION,
    UCOL_SIT_HIRAGANA_QUATERNARY,
    UCOL_SIT_VARIABLE_TOP,
    UCOL_SIT_VARIABLE_TOP_VALUE,
    UCOL_SIT_ITEMS_COUNT
};
48
/* Option starter characters: the single letter that introduces each
 * item of a short definition string. */
static const char alternateHArg = 'A';
static const char variableTopValArg = 'B';
static const char caseFirstArg = 'C';
static const char numericCollArg = 'D';
static const char caseLevelArg = 'E';
static const char frenchCollArg = 'F';
static const char hiraganaQArg = 'H';
static const char keywordArg = 'K';
static const char languageArg = 'L';
static const char normArg = 'N';
static const char regionArg = 'R';
static const char strengthArg = 'S';
static const char variableTopArg = 'T';
static const char variantArg = 'V';
static const char RFC3066Arg = 'X';
static const char scriptArg = 'Z';

/* Text inserted in front of the keyword value when the locale ID is
 * assembled from a 'K' item. */
static const char collationKeyword[] = "@collation=";

/* Number of locale sub-elements kept in CollatorSpec::locElements. */
static const int32_t locElementCount = 5;
/* Capacity of one locale sub-element (chars) and of the variable top
 * string (UChars). */
static const int32_t locElementCapacity = 32;
/* Capacity of the assembled locale ID in CollatorSpec::locale. */
static const int32_t loc3066Capacity = 256;
/* Size of the temporary stack buffers used by the functions below. */
static const int32_t internalBufferSize = 512;
73
/* Structure containing the specification of a collator. It is filled in
 * while parsing a short definition string and is also used to write a
 * normalized short definition string back out.
 */
struct CollatorSpec {
    /* locale sub-elements, indexed 0..4: language, script, region,
     * variant, collation keyword (see the options[] table) */
    char locElements[locElementCount][locElementCapacity];
    /* complete locale ID: either given directly by an 'X' item or
     * assembled from locElements */
    char locale[loc3066Capacity];
    /* requested attribute values, indexed by UColAttribute;
     * UCOL_DEFAULT means "not specified" */
    UColAttributeValue options[UCOL_ATTRIBUTE_COUNT];
    /* variable top given as a hex value ('B' item) */
    uint32_t variableTopValue;
    /* variable top given as a string of hex code units ('T' item) */
    UChar variableTopString[locElementCapacity];
    /* number of code units stored in variableTopString */
    int32_t variableTopStringLen;
    /* TRUE once a 'B' or 'T' item has been parsed successfully */
    UBool variableTopSet;
    /* where each item was found in the parsed string; indexed by the
     * item's position in the options[] table, start == NULL if absent */
    struct {
        const char *start;
        int32_t len;
    } entries[UCOL_SIT_ITEMS_COUNT];
};
91
92
/* Structure for converting between the one-character representation of
 * an attribute value used in short definition strings and the real
 * collation attribute value.
 */
struct AttributeConversion {
    char letter;
    UColAttributeValue value;
};

/* Letter <-> value table searched linearly by
 * ucol_sit_attributeValueToLetter() and ucol_sit_letterToAttributeValue(). */
static const AttributeConversion conversions[12] = {
    { '1', UCOL_PRIMARY },
    { '2', UCOL_SECONDARY },
    { '3', UCOL_TERTIARY },
    { '4', UCOL_QUATERNARY },
    { 'D', UCOL_DEFAULT },
    { 'I', UCOL_IDENTICAL },
    { 'L', UCOL_LOWER_FIRST },
    { 'N', UCOL_NON_IGNORABLE },
    { 'O', UCOL_ON },
    { 'S', UCOL_SHIFTED },
    { 'U', UCOL_UPPER_FIRST },
    { 'X', UCOL_OFF }
};
115
116
117 static char
118 ucol_sit_attributeValueToLetter(UColAttributeValue value, UErrorCode *status) {
119 uint32_t i = 0;
120 for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) {
121 if(conversions[i].value == value) {
122 return conversions[i].letter;
123 }
124 }
125 *status = U_ILLEGAL_ARGUMENT_ERROR;
126 return 0;
127 }
128
129 static UColAttributeValue
130 ucol_sit_letterToAttributeValue(char letter, UErrorCode *status) {
131 uint32_t i = 0;
132 for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) {
133 if(conversions[i].letter == letter) {
134 return conversions[i].value;
135 }
136 }
137 *status = U_ILLEGAL_ARGUMENT_ERROR;
138 return UCOL_DEFAULT;
139 }
140
/* Function prototype for the handlers used to parse one item of a short
 * string. A handler receives the spec being filled in, the per-item
 * argument from the options[] table (value1), a pointer to the first
 * character after the option letter, and the error code. It returns a
 * pointer to where parsing should continue.
 */
U_CDECL_BEGIN
typedef const char* U_CALLCONV
ActionFunction(CollatorSpec *spec, uint32_t value1, const char* string,
               UErrorCode *status);
U_CDECL_END
147
148 U_CDECL_BEGIN
149 static const char* U_CALLCONV
150 _processLocaleElement(CollatorSpec *spec, uint32_t value, const char* string,
151 UErrorCode *status)
152 {
153 int32_t len = 0;
154 do {
155 if(value == 0 || value == 4) {
156 spec->locElements[value][len++] = uprv_tolower(*string);
157 } else {
158 spec->locElements[value][len++] = *string;
159 }
160 } while(*(++string) != '_' && *string && len < locElementCapacity);
161 if(len >= locElementCapacity) {
162 *status = U_BUFFER_OVERFLOW_ERROR;
163 return string;
164 }
165 // don't skip the underscore at the end
166 return string;
167 }
168 U_CDECL_END
169
170 U_CDECL_BEGIN
171 static const char* U_CALLCONV
172 _processRFC3066Locale(CollatorSpec *spec, uint32_t, const char* string,
173 UErrorCode *status)
174 {
175 char terminator = *string;
176 string++;
177 const char *end = uprv_strchr(string+1, terminator);
178 if(end == NULL || end - string >= loc3066Capacity) {
179 *status = U_BUFFER_OVERFLOW_ERROR;
180 return string;
181 } else {
182 uprv_strncpy(spec->locale, string, end-string);
183 return end+1;
184 }
185 }
186
187 U_CDECL_END
188
189 U_CDECL_BEGIN
190 static const char* U_CALLCONV
191 _processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string,
192 UErrorCode *status)
193 {
194 spec->options[option] = ucol_sit_letterToAttributeValue(*string, status);
195 if((*(++string) != '_' && *string) || U_FAILURE(*status)) {
196 *status = U_ILLEGAL_ARGUMENT_ERROR;
197 }
198 return string;
199 }
200 U_CDECL_END
201
202
203 static UChar
204 readHexCodeUnit(const char **string, UErrorCode *status)
205 {
206 UChar result = 0;
207 int32_t value = 0;
208 char c;
209 int32_t noDigits = 0;
210 while((c = **string) != 0 && noDigits < 4) {
211 if( c >= '0' && c <= '9') {
212 value = c - '0';
213 } else if ( c >= 'a' && c <= 'f') {
214 value = c - 'a' + 10;
215 } else if ( c >= 'A' && c <= 'F') {
216 value = c - 'A' + 10;
217 } else {
218 *status = U_ILLEGAL_ARGUMENT_ERROR;
219 return 0;
220 }
221 result = (result << 4) | (UChar)value;
222 noDigits++;
223 (*string)++;
224 }
225 // if the string was terminated before we read 4 digits, set an error
226 if(noDigits < 4) {
227 *status = U_ILLEGAL_ARGUMENT_ERROR;
228 }
229 return result;
230 }
231
232 U_CDECL_BEGIN
233 static const char* U_CALLCONV
234 _processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UErrorCode *status)
235 {
236 // get four digits
237 int32_t i = 0;
238 if(!value1) {
239 while(U_SUCCESS(*status) && i < locElementCapacity && *string != 0 && *string != '_') {
240 spec->variableTopString[i++] = readHexCodeUnit(&string, status);
241 }
242 spec->variableTopStringLen = i;
243 if(i == locElementCapacity && (*string != 0 || *string != '_')) {
244 *status = U_BUFFER_OVERFLOW_ERROR;
245 }
246 } else {
247 spec->variableTopValue = readHexCodeUnit(&string, status);
248 }
249 if(U_SUCCESS(*status)) {
250 spec->variableTopSet = TRUE;
251 }
252 return string;
253 }
254 U_CDECL_END
255
256
/* One row of the table used for parsing short strings. */
struct ShortStringOptions {
    char optionStart;       /* letter that introduces the item */
    ActionFunction *action; /* handler that parses the item's value */
    uint32_t attr;          /* argument passed to the handler as value1 */
};
263
/* Parsing table, one row per item, ordered by option letter. The row
 * index is also the index into CollatorSpec::entries, so
 * ucol_sit_dumpSpecs() writes the items out in this order. The number in
 * the leading comment of each row is the item's OptionsList value.
 */
static const ShortStringOptions options[UCOL_SIT_ITEMS_COUNT] =
{
/* 10 ALTERNATE_HANDLING */   {alternateHArg,     _processCollatorOption, UCOL_ALTERNATE_HANDLING }, // alternate  N, S, D
/* 15 VARIABLE_TOP_VALUE */   {variableTopValArg, _processVariableTop,    1 },
/* 08 CASE_FIRST */           {caseFirstArg,      _processCollatorOption, UCOL_CASE_FIRST }, // case first L, U, X, D
/* 09 NUMERIC_COLLATION */    {numericCollArg,    _processCollatorOption, UCOL_NUMERIC_COLLATION }, // numeric collation O, X, D
/* 07 CASE_LEVEL */           {caseLevelArg,      _processCollatorOption, UCOL_CASE_LEVEL }, // case level O, X, D
/* 12 FRENCH_COLLATION */     {frenchCollArg,     _processCollatorOption, UCOL_FRENCH_COLLATION }, // french     O, X, D
/* 13 HIRAGANA_QUATERNARY */  {hiraganaQArg,      _processCollatorOption, UCOL_HIRAGANA_QUATERNARY_MODE }, // hiragana   O, X, D
/* 04 KEYWORD */              {keywordArg,        _processLocaleElement,  4 }, // keyword
/* 00 LANGUAGE */             {languageArg,       _processLocaleElement,  0 }, // language
/* 11 NORMALIZATION_MODE */   {normArg,           _processCollatorOption, UCOL_NORMALIZATION_MODE }, // norm       O, X, D
/* 02 REGION */               {regionArg,         _processLocaleElement,  2 }, // region
/* 06 STRENGTH */             {strengthArg,       _processCollatorOption, UCOL_STRENGTH }, // strength   1, 2, 3, 4, I, D
/* 14 VARIABLE_TOP */         {variableTopArg,    _processVariableTop,    0 },
/* 03 VARIANT */              {variantArg,        _processLocaleElement,  3 }, // variant
/* 05 RFC3066BIS */           {RFC3066Arg,        _processRFC3066Locale,  0 }, // rfc3066bis locale name
/* 01 SCRIPT */               {scriptArg,         _processLocaleElement,  1 }  // script
};
283
284
285 static
286 const char* ucol_sit_readOption(const char *start, CollatorSpec *spec,
287 UErrorCode *status)
288 {
289 int32_t i = 0;
290
291 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) {
292 if(*start == options[i].optionStart) {
293 spec->entries[i].start = start;
294 const char* end = options[i].action(spec, options[i].attr, start+1, status);
295 spec->entries[i].len = end - start;
296 return end;
297 }
298 }
299 *status = U_ILLEGAL_ARGUMENT_ERROR;
300 return start;
301 }
302
303 static
304 void ucol_sit_initCollatorSpecs(CollatorSpec *spec)
305 {
306 // reset everything
307 uprv_memset(spec, 0, sizeof(CollatorSpec));
308 // set collation options to default
309 int32_t i = 0;
310 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
311 spec->options[i] = UCOL_DEFAULT;
312 }
313 }
314
315 static const char*
316 ucol_sit_readSpecs(CollatorSpec *s, const char *string,
317 UParseError *parseError, UErrorCode *status)
318 {
319 const char *definition = string;
320 while(U_SUCCESS(*status) && *string) {
321 string = ucol_sit_readOption(string, s, status);
322 // advance over '_'
323 while(*string && *string == '_') {
324 string++;
325 }
326 }
327 if(U_FAILURE(*status)) {
328 parseError->offset = string - definition;
329 }
330 return string;
331 }
332
333 static
334 int32_t ucol_sit_dumpSpecs(CollatorSpec *s, char *destination, int32_t capacity, UErrorCode *status)
335 {
336 int32_t i = 0, j = 0;
337 int32_t len = 0;
338 char optName;
339 if(U_SUCCESS(*status)) {
340 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) {
341 if(s->entries[i].start) {
342 if(len) {
343 if(len < capacity) {
344 uprv_strcat(destination, "_");
345 }
346 len++;
347 }
348 optName = *(s->entries[i].start);
349 if(optName == languageArg || optName == regionArg || optName == variantArg || optName == keywordArg) {
350 for(j = 0; j < s->entries[i].len; j++) {
351 if(len + j < capacity) {
352 destination[len+j] = uprv_toupper(*(s->entries[i].start+j));
353 }
354 }
355 len += s->entries[i].len;
356 } else {
357 len += s->entries[i].len;
358 if(len < capacity) {
359 uprv_strncat(destination,s->entries[i].start, s->entries[i].len);
360 }
361 }
362 }
363 }
364 return len;
365 } else {
366 return 0;
367 }
368 }
369
370 static void
371 ucol_sit_calculateWholeLocale(CollatorSpec *s) {
372 // put the locale together, unless we have a done
373 // locale
374 if(s->locale[0] == 0) {
375 // first the language
376 uprv_strcat(s->locale, s->locElements[0]);
377 // then the script, if present
378 if(*(s->locElements[1])) {
379 uprv_strcat(s->locale, "_");
380 uprv_strcat(s->locale, s->locElements[1]);
381 }
382 // then the region, if present
383 if(*(s->locElements[2])) {
384 uprv_strcat(s->locale, "_");
385 uprv_strcat(s->locale, s->locElements[2]);
386 } else if(*(s->locElements[3])) { // if there is a variant, we need an underscore
387 uprv_strcat(s->locale, "_");
388 }
389 // add variant, if there
390 if(*(s->locElements[3])) {
391 uprv_strcat(s->locale, "_");
392 uprv_strcat(s->locale, s->locElements[3]);
393 }
394
395 // if there is a collation keyword, add that too
396 if(*(s->locElements[4])) {
397 uprv_strcat(s->locale, collationKeyword);
398 uprv_strcat(s->locale, s->locElements[4]);
399 }
400 }
401 }
402
403
404 U_CAPI void U_EXPORT2
405 ucol_prepareShortStringOpen( const char *definition,
406 UBool,
407 UParseError *parseError,
408 UErrorCode *status)
409 {
410 if(U_FAILURE(*status)) return;
411
412 UParseError internalParseError;
413
414 if(!parseError) {
415 parseError = &internalParseError;
416 }
417 parseError->line = 0;
418 parseError->offset = 0;
419 parseError->preContext[0] = 0;
420 parseError->postContext[0] = 0;
421
422
423 // first we want to pick stuff out of short string.
424 // we'll end up with an UCA version, locale and a bunch of
425 // settings
426
427 // analyse the string in order to get everything we need.
428 CollatorSpec s;
429 ucol_sit_initCollatorSpecs(&s);
430 ucol_sit_readSpecs(&s, definition, parseError, status);
431 ucol_sit_calculateWholeLocale(&s);
432
433 char buffer[internalBufferSize];
434 uprv_memset(buffer, 0, internalBufferSize);
435 uloc_canonicalize(s.locale, buffer, internalBufferSize, status);
436
437 UResourceBundle *b = ures_open(U_ICUDATA_COLL, buffer, status);
438 /* we try to find stuff from keyword */
439 UResourceBundle *collations = ures_getByKey(b, "collations", NULL, status);
440 UResourceBundle *collElem = NULL;
441 char keyBuffer[256];
442 // if there is a keyword, we pick it up and try to get elements
443 if(!uloc_getKeywordValue(buffer, "collation", keyBuffer, 256, status)) {
444 // no keyword. we try to find the default setting, which will give us the keyword value
445 UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status);
446 if(U_SUCCESS(*status)) {
447 int32_t defaultKeyLen = 0;
448 const UChar *defaultKey = ures_getString(defaultColl, &defaultKeyLen, status);
449 u_UCharsToChars(defaultKey, keyBuffer, defaultKeyLen);
450 keyBuffer[defaultKeyLen] = 0;
451 } else {
452 *status = U_INTERNAL_PROGRAM_ERROR;
453 return;
454 }
455 ures_close(defaultColl);
456 }
457 collElem = ures_getByKeyWithFallback(collations, keyBuffer, collElem, status);
458 ures_close(collElem);
459 ures_close(collations);
460 ures_close(b);
461 }
462
463
464 U_CAPI UCollator* U_EXPORT2
465 ucol_openFromShortString( const char *definition,
466 UBool forceDefaults,
467 UParseError *parseError,
468 UErrorCode *status)
469 {
470 UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN_FROM_SHORT_STRING);
471 UTRACE_DATA1(UTRACE_INFO, "short string = \"%s\"", definition);
472
473 if(U_FAILURE(*status)) return 0;
474
475 UParseError internalParseError;
476
477 if(!parseError) {
478 parseError = &internalParseError;
479 }
480 parseError->line = 0;
481 parseError->offset = 0;
482 parseError->preContext[0] = 0;
483 parseError->postContext[0] = 0;
484
485
486 // first we want to pick stuff out of short string.
487 // we'll end up with an UCA version, locale and a bunch of
488 // settings
489
490 // analyse the string in order to get everything we need.
491 const char *string = definition;
492 CollatorSpec s;
493 ucol_sit_initCollatorSpecs(&s);
494 string = ucol_sit_readSpecs(&s, definition, parseError, status);
495 ucol_sit_calculateWholeLocale(&s);
496
497 char buffer[internalBufferSize];
498 uprv_memset(buffer, 0, internalBufferSize);
499 uloc_canonicalize(s.locale, buffer, internalBufferSize, status);
500
501 UCollator *result = ucol_open(buffer, status);
502 int32_t i = 0;
503
504 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
505 if(s.options[i] != UCOL_DEFAULT) {
506 if(forceDefaults || ucol_getAttribute(result, (UColAttribute)i, status) != s.options[i]) {
507 ucol_setAttribute(result, (UColAttribute)i, s.options[i], status);
508 }
509
510 if(U_FAILURE(*status)) {
511 parseError->offset = string - definition;
512 ucol_close(result);
513 return NULL;
514 }
515
516 }
517 }
518 if(s.variableTopSet) {
519 if(s.variableTopString[0]) {
520 ucol_setVariableTop(result, s.variableTopString, s.variableTopStringLen, status);
521 } else { // we set by value, using 'B'
522 ucol_restoreVariableTop(result, s.variableTopValue, status);
523 }
524 }
525
526
527 if(U_FAILURE(*status)) { // here it can only be a bogus value
528 ucol_close(result);
529 result = NULL;
530 }
531
532 UTRACE_EXIT_PTR_STATUS(result, *status);
533 return result;
534 }
535
536
537 static void appendShortStringElement(const char *src, int32_t len, char *result, int32_t *resultSize, int32_t capacity, char arg)
538 {
539 if(len) {
540 if(*resultSize) {
541 if(*resultSize < capacity) {
542 uprv_strcat(result, "_");
543 }
544 (*resultSize)++;
545 }
546 *resultSize += len + 1;
547 if(*resultSize < capacity) {
548 uprv_strncat(result, &arg, 1);
549 uprv_strncat(result, src, len);
550 }
551 }
552 }
553
554 U_CAPI int32_t U_EXPORT2
555 ucol_getShortDefinitionString(const UCollator *coll,
556 const char *locale,
557 char *dst,
558 int32_t capacity,
559 UErrorCode *status)
560 {
561 if(U_FAILURE(*status)) return 0;
562 char buffer[internalBufferSize];
563 uprv_memset(buffer, 0, internalBufferSize*sizeof(char));
564 int32_t resultSize = 0;
565 char tempbuff[internalBufferSize];
566 char locBuff[internalBufferSize];
567 uprv_memset(buffer, 0, internalBufferSize*sizeof(char));
568 int32_t elementSize = 0;
569 UBool isAvailable = 0;
570 CollatorSpec s;
571 ucol_sit_initCollatorSpecs(&s);
572
573 if(!locale) {
574 locale = ucol_getLocale(coll, ULOC_VALID_LOCALE, status);
575 }
576 elementSize = ucol_getFunctionalEquivalent(locBuff, internalBufferSize, "collation", locale, &isAvailable, status);
577
578 if(elementSize) {
579 // we should probably canonicalize here...
580 elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status);
581 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, languageArg);
582 elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status);
583 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, regionArg);
584 elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status);
585 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, scriptArg);
586 elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status);
587 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variantArg);
588 elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status);
589 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, keywordArg);
590 }
591
592 int32_t i = 0;
593 UColAttributeValue attribute = UCOL_DEFAULT;
594 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) {
595 if(options[i].action == _processCollatorOption) {
596 attribute = ucol_getAttributeOrDefault(coll, (UColAttribute)options[i].attr, status);
597 if(attribute != UCOL_DEFAULT) {
598 char letter = ucol_sit_attributeValueToLetter(attribute, status);
599 appendShortStringElement(&letter, 1,
600 buffer, &resultSize, capacity, options[i].optionStart);
601 }
602 }
603 }
604 if(coll->variableTopValueisDefault == FALSE) {
605 //s.variableTopValue = ucol_getVariableTop(coll, status);
606 elementSize = T_CString_integerToString(tempbuff, coll->variableTopValue, 16);
607 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variableTopValArg);
608 }
609
610 UParseError parseError;
611 return ucol_normalizeShortDefinitionString(buffer, dst, capacity, &parseError, status);
612 }
613
614 U_CAPI int32_t U_EXPORT2
615 ucol_normalizeShortDefinitionString(const char *definition,
616 char *destination,
617 int32_t capacity,
618 UParseError *parseError,
619 UErrorCode *status)
620 {
621
622 if(U_FAILURE(*status)) {
623 return 0;
624 }
625
626 if(destination) {
627 uprv_memset(destination, 0, capacity*sizeof(char));
628 }
629
630 UParseError pe;
631 if(!parseError) {
632 parseError = &pe;
633 }
634
635 // validate
636 CollatorSpec s;
637 ucol_sit_initCollatorSpecs(&s);
638 ucol_sit_readSpecs(&s, definition, parseError, status);
639 return ucol_sit_dumpSpecs(&s, destination, capacity, status);
640 }
641
642 U_CAPI UColAttributeValue U_EXPORT2
643 ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode *status)
644 {
645 if(U_FAILURE(*status) || coll == NULL) {
646 return UCOL_DEFAULT;
647 }
648 switch(attr) {
649 case UCOL_NUMERIC_COLLATION:
650 return coll->numericCollationisDefault?UCOL_DEFAULT:coll->numericCollation;
651 case UCOL_HIRAGANA_QUATERNARY_MODE:
652 return coll->hiraganaQisDefault?UCOL_DEFAULT:coll->hiraganaQ;
653 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
654 return coll->frenchCollationisDefault?UCOL_DEFAULT:coll->frenchCollation;
655 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
656 return coll->alternateHandlingisDefault?UCOL_DEFAULT:coll->alternateHandling;
657 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
658 return coll->caseFirstisDefault?UCOL_DEFAULT:coll->caseFirst;
659 case UCOL_CASE_LEVEL: /* do we have an extra case level */
660 return coll->caseLevelisDefault?UCOL_DEFAULT:coll->caseLevel;
661 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
662 return coll->normalizationModeisDefault?UCOL_DEFAULT:coll->normalizationMode;
663 case UCOL_STRENGTH: /* attribute for strength */
664 return coll->strengthisDefault?UCOL_DEFAULT:coll->strength;
665 case UCOL_ATTRIBUTE_COUNT:
666 default:
667 *status = U_ILLEGAL_ARGUMENT_ERROR;
668 break;
669 }
670 return UCOL_DEFAULT;
671 }
672
673
/* Context handed through utrie_enum() to _processSpecials() and
 * addSpecial() while contractions and expansions are collected. */
struct contContext {
    const UCollator *coll;      /* collator whose tables are being walked */
    USet *conts;                /* receives contraction strings; may be NULL */
    USet *expansions;           /* receives expansions; may be NULL */
    USet *removedContractions;  /* code points to skip; may be NULL */
    UBool addPrefixes;          /* also follow SPEC_PROC_TAG (prefix) entries */
    UErrorCode *status;         /* error code shared with the caller */
};
682
683
684
/* Recursively walks the contraction table entry that CE refers to and
 * adds the strings found to the sets in context.
 *
 * buffer holds the string built so far in [leftIndex, rightIndex).
 * Entries with SPEC_PROC_TAG extend it to the left (buffer[leftIndex-1]),
 * entries with CONTRACTION_TAG extend it to the right
 * (buffer[rightIndex]). U_INTERNAL_PROGRAM_ERROR is set when the string
 * would grow past either end of the buffer.
 */
static void
addSpecial(contContext *context, UChar *buffer, int32_t bufLen,
               uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status)
{
    const UCollator *coll = context->coll;
    USet *contractions = context->conts;
    USet *expansions = context->expansions;
    UBool addPrefixes = context->addPrefixes;

    // start of this entry's code unit list; the CE for a code unit sits at
    // the same offset in coll->contractionCEs as the unit in contractionIndex
    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
    uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
    // we might have a contraction that ends from previous level
    if(newCE != UCOL_NOT_FOUND) {
        if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) {
            addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
        }
        // a single code unit is not reported as a contraction
        if(contractions && rightIndex-leftIndex > 1) {
            uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
            if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) {
                uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
            }
        }
    }

    // move on to the first code unit of the list
    UCharOffset++;
    // check whether we're doing contraction or prefix
    if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) {
        if(leftIndex == 0) {
            // no room left in front of the string
            *status = U_INTERNAL_PROGRAM_ERROR;
            return;
        }
        --leftIndex;
        // the list is terminated by 0xFFFF
        while(*UCharOffset != 0xFFFF) {
            newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
            buffer[leftIndex] = *UCharOffset;
            if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
                addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
            } else {
                if(contractions) {
                    uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
                }
                if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
                    uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
                }
            }
            UCharOffset++;
        }
    } else if(getCETag(CE) == CONTRACTION_TAG) {
        if(rightIndex == bufLen-1) {
            // no room left behind the string
            *status = U_INTERNAL_PROGRAM_ERROR;
            return;
        }
        // the list is terminated by 0xFFFF
        while(*UCharOffset != 0xFFFF) {
            newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
            buffer[rightIndex] = *UCharOffset;
            if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
                addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status);
            } else {
                if(contractions) {
                    uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex);
                }
                if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
                    uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex);
                }
            }
            UCharOffset++;
        }
    }

}
755
756 U_CDECL_BEGIN
757 static UBool U_CALLCONV
758 _processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE)
759 {
760 UErrorCode *status = ((contContext *)context)->status;
761 USet *expansions = ((contContext *)context)->expansions;
762 USet *removed = ((contContext *)context)->removedContractions;
763 UBool addPrefixes = ((contContext *)context)->addPrefixes;
764 UChar contraction[internalBufferSize];
765 if(isSpecial(CE)) {
766 if(((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) {
767 while(start < limit && U_SUCCESS(*status)) {
768 // if there are suppressed contractions, we don't
769 // want to add them.
770 if(removed && uset_contains(removed, start)) {
771 start++;
772 continue;
773 }
774 // we start our contraction from middle, since we don't know if it
775 // will grow toward right or left
776 contraction[internalBufferSize/2] = (UChar)start;
777 addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status);
778 start++;
779 }
780 } else if(expansions && getCETag(CE) == EXPANSION_TAG) {
781 while(start < limit && U_SUCCESS(*status)) {
782 uset_add(expansions, start++);
783 }
784 }
785 }
786 if(U_FAILURE(*status)) {
787 return FALSE;
788 } else {
789 return TRUE;
790 }
791 }
792
793 U_CDECL_END
794
795
796
797 /**
798 * Get a set containing the contractions defined by the collator. The set includes
799 * both the UCA contractions and the contractions defined by the collator
800 * @param coll collator
801 * @param conts the set to hold the result
802 * @param status to hold the error code
803 * @return the size of the contraction set
804 *
805 * @draft ICU 3.0
806 */
807 U_CAPI int32_t U_EXPORT2
808 ucol_getContractions( const UCollator *coll,
809 USet *contractions,
810 UErrorCode *status)
811 {
812 ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
813 return uset_getItemCount(contractions);
814 }
815
816 /**
817 * Get a set containing the expansions defined by the collator. The set includes
818 * both the UCA expansions and the expansions defined by the tailoring
819 * @param coll collator
820 * @param conts the set to hold the result
821 * @param addPrefixes add the prefix contextual elements to contractions
822 * @param status to hold the error code
823 *
824 * @draft ICU 3.4
825 */
826 U_CAPI void U_EXPORT2
827 ucol_getContractionsAndExpansions( const UCollator *coll,
828 USet *contractions,
829 USet *expansions,
830 UBool addPrefixes,
831 UErrorCode *status)
832 {
833 if(U_FAILURE(*status)) {
834 return;
835 }
836 if(coll == NULL) {
837 *status = U_ILLEGAL_ARGUMENT_ERROR;
838 return;
839 }
840
841 if(contractions) {
842 uset_clear(contractions);
843 }
844 if(expansions) {
845 uset_clear(expansions);
846 }
847 int32_t rulesLen = 0;
848 const UChar* rules = ucol_getRules(coll, &rulesLen);
849 UColTokenParser src;
850 ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status);
851
852 contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status };
853
854 // Add the UCA contractions
855 c.coll = coll->UCA;
856 utrie_enum(&coll->UCA->mapping, NULL, _processSpecials, &c);
857
858 // This is collator specific. Add contractions from a collator
859 c.coll = coll;
860 c.removedContractions = NULL;
861 utrie_enum(&coll->mapping, NULL, _processSpecials, &c);
862 ucol_tok_closeTokenList(&src);
863 }
864
865 U_CAPI int32_t U_EXPORT2
866 ucol_getUnsafeSet( const UCollator *coll,
867 USet *unsafe,
868 UErrorCode *status)
869 {
870 UChar buffer[internalBufferSize];
871 int32_t len = 0;
872
873 uset_clear(unsafe);
874
875 // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
876 static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
877 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };
878
879 // add chars that fail the fcd check
880 uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);
881
882 // add Thai/Lao prevowels
883 uset_addRange(unsafe, 0xe40, 0xe44);
884 uset_addRange(unsafe, 0xec0, 0xec4);
885 // add lead/trail surrogates
886 uset_addRange(unsafe, 0xd800, 0xdfff);
887
888 USet *contractions = uset_open(0,0);
889
890 int32_t i = 0, j = 0;
891 int32_t contsSize = ucol_getContractions(coll, contractions, status);
892 UChar32 c = 0;
893 // Contraction set consists only of strings
894 // to get unsafe code points, we need to
895 // break the strings apart and add them to the unsafe set
896 for(i = 0; i < contsSize; i++) {
897 len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);
898 if(len > 0) {
899 j = 0;
900 while(j < len) {
901 U16_NEXT(buffer, j, len, c);
902 if(j < len) {
903 uset_add(unsafe, c);
904 }
905 }
906 }
907 }
908
909 uset_close(contractions);
910
911 return uset_size(unsafe);
912 }
913 #endif