]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/ucol.cpp
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / i18n / ucol.cpp
1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2003, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucol.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * Modification history
12 * Date Name Comments
13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
17 */
18
19 #include "unicode/utypes.h"
20 #include "uassert.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/uloc.h"
25 #include "unicode/coll.h"
26 #include "unicode/tblcoll.h"
27 #include "unicode/coleitr.h"
28 #include "unicode/unorm.h"
29 #include "unicode/udata.h"
30 #include "unicode/uchar.h"
31 #include "unicode/caniter.h"
32
33 #include "ucol_bld.h"
34 #include "ucol_imp.h"
35 #include "ucol_tok.h"
36 #include "ucol_elm.h"
37 #include "bocsu.h"
38
39 #include "unormimp.h"
40 #include "unorm_it.h"
41 #include "uresimp.h"
42 #include "umutex.h"
43 #include "uhash.h"
44 #include "ucln_in.h"
45 #include "cstring.h"
46
47 #ifdef UCOL_DEBUG
48 #include <stdio.h>
49 #endif
50
51 U_NAMESPACE_USE
52
53 /* added by synwee for trie manipulation*/
54 #define STAGE_1_SHIFT_ 10
55 #define STAGE_2_SHIFT_ 4
56 #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
57 #define STAGE_3_MASK_ 0xF
58 #define LAST_BYTE_MASK_ 0xFF
59 #define SECOND_LAST_BYTE_SHIFT_ 8
60
61 #define ZERO_CC_LIMIT_ 0xC0
62
63 static UCollator* UCA = NULL;
64 static UCAConstants *UCAconsts = NULL;
65 static UDataMemory* UCA_DATA_MEM = NULL;
66
67
68 U_CDECL_BEGIN
69 static UBool U_CALLCONV
70 isAcceptableUCA(void * /*context*/,
71 const char * /*type*/, const char * /*name*/,
72 const UDataInfo *pInfo){
73 /* context, type & name are intentionally not used */
74 if( pInfo->size>=20 &&
75 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
76 pInfo->charsetFamily==U_CHARSET_FAMILY &&
77 pInfo->dataFormat[0]==ucaDataInfo.dataFormat[0] && /* dataFormat="UCol" */
78 pInfo->dataFormat[1]==ucaDataInfo.dataFormat[1] &&
79 pInfo->dataFormat[2]==ucaDataInfo.dataFormat[2] &&
80 pInfo->dataFormat[3]==ucaDataInfo.dataFormat[3] &&
81 pInfo->formatVersion[0]==ucaDataInfo.formatVersion[0] &&
82 pInfo->formatVersion[1]>=ucaDataInfo.formatVersion[1]// &&
83 //pInfo->formatVersion[1]==ucaDataInfo.formatVersion[1] &&
84 //pInfo->formatVersion[2]==ucaDataInfo.formatVersion[2] && // Too harsh
85 //pInfo->formatVersion[3]==ucaDataInfo.formatVersion[3] && // Too harsh
86 ) {
87 UVersionInfo UCDVersion;
88 u_getUnicodeVersion(UCDVersion);
89 if(pInfo->dataVersion[0]==UCDVersion[0] &&
90 pInfo->dataVersion[1]==UCDVersion[1]) { // &&
91 //pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2] &&
92 //pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]) {
93 return TRUE;
94 } else {
95 return FALSE;
96 }
97 } else {
98 return FALSE;
99 }
100 }
101
102
103 static int32_t U_CALLCONV
104 _getFoldingOffset(uint32_t data) {
105 return (int32_t)(data&0xFFFFFF);
106 }
107
108 U_CDECL_END
109
110 static
111 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
112 int32_t sourceLen, collIterate *s) {
113 (s)->string = (s)->pos = (UChar *)(sourceString);
114 (s)->origFlags = 0;
115 (s)->flags = 0;
116 if (sourceLen >= 0) {
117 s->flags |= UCOL_ITER_HASLEN;
118 (s)->endp = (UChar *)sourceString+sourceLen;
119 }
120 else {
121 /* change to enable easier checking for end of string for fcdpositon */
122 (s)->endp = NULL;
123 }
124 (s)->CEpos = (s)->toReturn = (s)->CEs;
125 (s)->writableBuffer = (s)->stackWritableBuffer;
126 (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
127 (s)->coll = (collator);
128 (s)->fcdPosition = 0;
129 if(collator->normalizationMode == UCOL_ON) {
130 (s)->flags |= UCOL_ITER_NORM;
131 }
132 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
133 (s)->flags |= UCOL_HIRAGANA_Q;
134 }
135 (s)->iterator = NULL;
136 //(s)->iteratorIndex = 0;
137 }
138
139 U_CAPI void U_EXPORT2
140 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
141 int32_t sourceLen, collIterate *s){
142 /* Out-of-line version for use from other files. */
143 IInit_collIterate(collator, sourceString, sourceLen, s);
144 }
145
146
147 /**
148 * Backup the state of the collIterate struct data
149 * @param data collIterate to backup
150 * @param backup storage
151 */
152 static
153 inline void backupState(const collIterate *data, collIterateState *backup)
154 {
155 backup->fcdPosition = data->fcdPosition;
156 backup->flags = data->flags;
157 backup->origFlags = data->origFlags;
158 backup->pos = data->pos;
159 backup->bufferaddress = data->writableBuffer;
160 backup->buffersize = data->writableBufSize;
161 if(data->iterator != NULL) {
162 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
163 backup->iteratorIndex = data->iterator->getState(data->iterator);
164 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
165 backup->iteratorMove = 0;
166 if(backup->iteratorIndex == UITER_NO_STATE) {
167 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
168 backup->iteratorMove++;
169 data->iterator->move(data->iterator, -1, UITER_CURRENT);
170 }
171 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
172 }
173 }
174 }
175
176 /**
177 * Loads the state into the collIterate struct data
178 * @param data collIterate to backup
179 * @param backup storage
180 * @param forwards boolean to indicate if forwards iteration is used,
181 * false indicates backwards iteration
182 */
183 static
184 inline void loadState(collIterate *data, const collIterateState *backup,
185 UBool forwards)
186 {
187 UErrorCode status = U_ZERO_ERROR;
188 data->flags = backup->flags;
189 data->origFlags = backup->origFlags;
190 if(data->iterator != NULL) {
191 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
192 data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
193 if(backup->iteratorMove != 0) {
194 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
195 }
196 }
197 data->pos = backup->pos;
198 if ((data->flags & UCOL_ITER_INNORMBUF) &&
199 data->writableBuffer != backup->bufferaddress) {
200 /*
201 this is when a new buffer has been reallocated and we'll have to
202 calculate the new position.
203 note the new buffer has to contain the contents of the old buffer.
204 */
205 if (forwards) {
206 data->pos = data->writableBuffer +
207 (data->pos - backup->bufferaddress);
208 }
209 else {
210 /* backwards direction */
211 uint32_t temp = backup->buffersize -
212 (data->pos - backup->bufferaddress);
213 data->pos = data->writableBuffer + (data->writableBufSize - temp);
214 }
215 }
216 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
217 /*
218 this is alittle tricky.
219 if we are initially not in the normalization buffer, even if we
220 normalize in the later stage, the data in the buffer will be
221 ignored, since we skip back up to the data string.
222 however if we are already in the normalization buffer, any
223 further normalization will pull data into the normalization
224 buffer and modify the fcdPosition.
225 since we are keeping the data in the buffer for use, the
226 fcdPosition can not be reverted back.
227 arrgghh....
228 */
229 data->fcdPosition = backup->fcdPosition;
230 }
231 }
232
233
234 /*
235 * collIter_eos()
236 * Checks for a collIterate being positioned at the end of
237 * its source string.
238 *
239 */
240 static
241 inline UBool collIter_eos(collIterate *s) {
242 if(s->flags & UCOL_USE_ITERATOR) {
243 return !(s->iterator->hasNext(s->iterator));
244 }
245 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
246 // Null terminated string, but not at null, so not at end.
247 // Whether in main or normalization buffer doesn't matter.
248 return FALSE;
249 }
250
251 // String with length. Can't be in normalization buffer, which is always
252 // null termintated.
253 if (s->flags & UCOL_ITER_HASLEN) {
254 return (s->pos == s->endp);
255 }
256
257 // We are at a null termination, could be either normalization buffer or main string.
258 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
259 // At null at end of main string.
260 return TRUE;
261 }
262
263 // At null at end of normalization buffer. Need to check whether there there are
264 // any characters left in the main buffer.
265 if(s->origFlags & UCOL_USE_ITERATOR) {
266 return !(s->iterator->hasNext(s->iterator));
267 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
268 // Null terminated main string. fcdPosition is the 'return' position into main buf.
269 return (*s->fcdPosition == 0);
270 }
271 else {
272 // Main string with an end pointer.
273 return s->fcdPosition == s->endp;
274 }
275 }
276
277 /*
278 * collIter_bos()
279 * Checks for a collIterate being positioned at the start of
280 * its source string.
281 *
282 */
283 static
284 inline UBool collIter_bos(collIterate *source) {
285 // if we're going backwards, we need to know whether there is more in the
286 // iterator, even if we are in the side buffer
287 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
288 return !source->iterator->hasPrevious(source->iterator);
289 }
290 if (source->pos <= source->string ||
291 ((source->flags & UCOL_ITER_INNORMBUF) &&
292 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
293 return TRUE;
294 }
295 return FALSE;
296 }
297
298 static
299 inline UBool collIter_SimpleBos(collIterate *source) {
300 // if we're going backwards, we need to know whether there is more in the
301 // iterator, even if we are in the side buffer
302 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
303 return !source->iterator->hasPrevious(source->iterator);
304 }
305 if (source->pos == source->string) {
306 return TRUE;
307 }
308 return FALSE;
309 }
310 //return (data->pos == data->string) ||
311
312
313 /**
314 * Checks and free writable buffer if it is not the original stack buffer
315 * in collIterate. This function does not reassign the writable buffer.
316 * @param data collIterate struct to determine and free the writable buffer
317 */
318 static
319 inline void freeHeapWritableBuffer(collIterate *data)
320 {
321 if (data->writableBuffer != data->stackWritableBuffer) {
322 uprv_free(data->writableBuffer);
323 }
324 }
325
326
327 /****************************************************************************/
328 /* Following are the open/close functions */
329 /* */
330 /****************************************************************************/
331 static UCollator*
332 tryOpeningFromRules(UResourceBundle *collElem, UErrorCode *status) {
333 int32_t rulesLen = 0;
334 const UChar *rules = ures_getStringByKey(collElem, "Sequence", &rulesLen, status);
335 return ucol_openRules(rules, rulesLen, UCOL_DEFAULT, UCOL_DEFAULT, NULL, status);
336
337 }
338
339
340 U_CAPI UCollator*
341 ucol_open(const char *loc,
342 UErrorCode *status)
343 {
344 UCollator *result = NULL;
345 if (status && U_SUCCESS(*status)) {
346 result = Collator::createUCollator(loc, status);
347 if (result) {
348 return result;
349 }
350 }
351 return ucol_open_internal(loc, status);
352 }
353
354 // API in ucol_imp.h
355
356 U_CFUNC UCollator*
357 ucol_open_internal(const char *loc,
358 UErrorCode *status)
359 {
360 ucol_initUCA(status);
361
362 /* New version */
363 if(U_FAILURE(*status)) return 0;
364
365 UCollator *result = NULL;
366 UResourceBundle *b = ures_open(NULL, loc, status);
367 UResourceBundle *collElem = ures_getByKey(b, "CollationElements", NULL, status);
368 UResourceBundle *binary = NULL;
369 UErrorCode binaryStatus = U_ZERO_ERROR;
370
371 if(*status == U_MISSING_RESOURCE_ERROR) { /* We didn't find the tailoring data, we fallback to the UCA */
372 *status = U_USING_DEFAULT_WARNING;
373 result = ucol_initCollator(UCA->image, result, status);
374 // if we use UCA, real locale is root
375 result->rb = ures_open(NULL, "", status);
376 result->elements = ures_open(NULL, "", status);
377 if(U_FAILURE(*status)) {
378 goto clean;
379 }
380 ures_close(b);
381 result->hasRealData = FALSE;
382 } else if(U_SUCCESS(*status)) {
383 binary = ures_getByKey(collElem, "%%CollationBin", NULL, &binaryStatus);
384
385 if(binaryStatus == U_MISSING_RESOURCE_ERROR) { /* we didn't find the binary image, we should use the rules */
386 binary = NULL;
387 result = tryOpeningFromRules(collElem, status);
388 if(U_FAILURE(*status)) {
389 goto clean;
390 }
391 } else if(U_SUCCESS(*status)) { /* otherwise, we'll pick a collation data that exists */
392 int32_t len = 0;
393 const uint8_t *inData = ures_getBinary(binary, &len, status);
394 UCATableHeader *colData = (UCATableHeader *)inData;
395 if(uprv_memcmp(colData->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
396 uprv_memcmp(colData->UCDVersion, UCA->image->UCDVersion, sizeof(UVersionInfo)) != 0 ||
397 colData->version[0] != UCOL_BUILDER_VERSION) {
398 *status = U_DIFFERENT_UCA_VERSION;
399 result = tryOpeningFromRules(collElem, status);
400 } else {
401 if(U_FAILURE(*status)){
402 goto clean;
403 }
404 if((uint32_t)len > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
405 result = ucol_initCollator((const UCATableHeader *)inData, result, status);
406 if(U_FAILURE(*status)){
407 goto clean;
408 }
409 result->hasRealData = TRUE;
410 } else {
411 result = ucol_initCollator(UCA->image, result, status);
412 ucol_setOptionsFromHeader(result, (UColOptionSet *)(inData+((const UCATableHeader *)inData)->options), status);
413 if(U_FAILURE(*status)){
414 goto clean;
415 }
416 result->hasRealData = FALSE;
417 }
418 result->freeImageOnClose = FALSE;
419 }
420 }
421 result->rb = b;
422 result->elements = collElem;
423 } else { /* There is another error, and we're just gonna clean up */
424 clean:
425 ures_close(b);
426 ures_close(collElem);
427 ures_close(binary);
428 return NULL;
429 }
430
431 result->validLocale = NULL; // default is to use rb info
432
433 if(loc == NULL) {
434 loc = ures_getLocale(result->rb, status);
435 }
436 result->requestedLocale = (char *)uprv_malloc((uprv_strlen(loc)+1)*sizeof(char));
437 /* test for NULL */
438 if (result->requestedLocale == NULL) {
439 *status = U_MEMORY_ALLOCATION_ERROR;
440 ures_close(b); // ??? appears needed
441 ures_close(collElem);
442 ures_close(binary); // ??? appears needed
443 return NULL;
444 }
445 uprv_strcpy(result->requestedLocale, loc);
446
447 ures_close(binary);
448 return result;
449 }
450
451 U_CAPI void U_EXPORT2
452 ucol_setReqValidLocales(UCollator *coll, char *requestedLocaleToAdopt, char *validLocaleToAdopt)
453 {
454 if (coll) {
455 if (coll->validLocale) {
456 uprv_free(coll->validLocale);
457 }
458 coll->validLocale = validLocaleToAdopt;
459 if (coll->requestedLocale) { // should always have
460 uprv_free(coll->requestedLocale);
461 }
462 coll->requestedLocale = requestedLocaleToAdopt;
463 }
464 }
465
466 U_CAPI void U_EXPORT2
467 ucol_close(UCollator *coll)
468 {
469 if(coll != NULL) {
470 // these are always owned by each UCollator struct,
471 // so we always free them
472 if(coll->validLocale != NULL) {
473 uprv_free(coll->validLocale);
474 }
475 if(coll->requestedLocale != NULL) {
476 uprv_free(coll->requestedLocale);
477 }
478
479 /* Here, it would be advisable to close: */
480 /* - UData for UCA (unless we stuff it in the root resb */
481 /* Again, do we need additional housekeeping... HMMM! */
482 if(coll->freeOnClose == FALSE){
483 return; /* for safeClone, if freeOnClose is FALSE,
484 don't free the other instance data */
485 }
486 if(coll->freeOptionsOnClose != FALSE) {
487 if(coll->options != NULL) {
488 uprv_free(coll->options);
489 }
490 }
491 if(coll->mapping != NULL) {
492 /*ucmpe32_close(coll->mapping);*/
493 uprv_free(coll->mapping);
494 }
495 if(coll->rules != NULL && coll->freeRulesOnClose) {
496 uprv_free((UChar *)coll->rules);
497 }
498 if(coll->rb != NULL) { /* pointing to read-only memory */
499 ures_close(coll->rb);
500 }
501 if(coll->freeImageOnClose == TRUE) {
502 uprv_free((UCATableHeader *)coll->image);
503 }
504 if(coll->elements != NULL) {
505 ures_close(coll->elements);
506 }
507 if(coll->latinOneCEs != NULL) {
508 uprv_free(coll->latinOneCEs);
509 }
510 uprv_free(coll);
511 }
512 }
513
514 U_CAPI UCollator* U_EXPORT2
515 ucol_openRules( const UChar *rules,
516 int32_t rulesLength,
517 UColAttributeValue normalizationMode,
518 UCollationStrength strength,
519 UParseError *parseError,
520 UErrorCode *status)
521 {
522 uint32_t listLen = 0;
523 UColTokenParser src;
524 UColAttributeValue norm;
525 UParseError tErr;
526
527 if(status == NULL || U_FAILURE(*status)){
528 return 0;
529 }
530
531 if(rulesLength < -1 || (rules == NULL && rulesLength != 0)) {
532 *status = U_ILLEGAL_ARGUMENT_ERROR;
533 return 0;
534 }
535
536 if(rulesLength == -1) {
537 rulesLength = u_strlen(rules);
538 }
539
540 if(parseError == NULL){
541 parseError = &tErr;
542 }
543
544 switch(normalizationMode) {
545 case UCOL_OFF:
546 case UCOL_ON:
547 case UCOL_DEFAULT:
548 norm = normalizationMode;
549 break;
550 default:
551 *status = U_ILLEGAL_ARGUMENT_ERROR;
552 return 0;
553 }
554
555 ucol_initUCA(status);
556
557 if(U_FAILURE(*status)){
558 return NULL;
559 }
560
561 ucol_tok_initTokenList(&src, rules, rulesLength, UCA, status);
562 listLen = ucol_tok_assembleTokenList(&src,parseError, status);
563
564 if(U_FAILURE(*status)) {
565 /* if status is U_ILLEGAL_ARGUMENT_ERROR, src->current points at the offending option */
566 /* if status is U_INVALID_FORMAT_ERROR, src->current points after the problematic part of the rules */
567 /* so something might be done here... or on lower level */
568 #ifdef UCOL_DEBUG
569 if(*status == U_ILLEGAL_ARGUMENT_ERROR) {
570 fprintf(stderr, "bad option starting at offset %i\n", src.current-src.source);
571 } else {
572 fprintf(stderr, "invalid rule just before offset %i\n", src.current-src.source);
573 }
574 #endif
575 ucol_tok_closeTokenList(&src);
576 return NULL;
577 }
578 UCollator *result = NULL;
579 UCATableHeader *table = NULL;
580
581 if(src.resultLen > 0 || src.removeSet != NULL) { /* we have a set of rules, let's make something of it */
582 /* also, if we wanted to remove some contractions, we should make a tailoring */
583 table = ucol_assembleTailoringTable(&src, status);
584 if(U_SUCCESS(*status)) {
585 // builder version
586 table->version[0] = UCOL_BUILDER_VERSION;
587 // no tailoring information on this level
588 table->version[1] = table->version[2] = table->version[3] = 0;
589 // set UCD version
590 u_getUnicodeVersion(table->UCDVersion);
591 // set UCA version
592 uprv_memcpy(table->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo));
593 result = ucol_initCollator(table,0,status);
594 result->hasRealData = TRUE;
595 result->freeImageOnClose = TRUE;
596 }
597 } else { /* no rules, but no error either */
598 // must be only options
599 // We will init the collator from UCA
600 result = ucol_initCollator(UCA->image,0,status);
601 // And set only the options
602 UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
603 /* test for NULL */
604 if (opts == NULL) {
605 *status = U_MEMORY_ALLOCATION_ERROR;
606 goto cleanup;
607 }
608 uprv_memcpy(opts, src.opts, sizeof(UColOptionSet));
609 ucol_setOptionsFromHeader(result, opts, status);
610 result->freeOptionsOnClose = TRUE;
611 result->hasRealData = FALSE;
612 result->freeImageOnClose = FALSE;
613 }
614
615 if(U_SUCCESS(*status)) {
616 UChar *newRules;
617 result->dataInfo.dataVersion[0] = UCOL_BUILDER_VERSION;
618 if(rulesLength > 0) {
619 newRules = (UChar *)uprv_malloc((rulesLength+1)*U_SIZEOF_UCHAR);
620 /* test for NULL */
621 if (newRules == NULL) {
622 *status = U_MEMORY_ALLOCATION_ERROR;
623 goto cleanup;
624 }
625 uprv_memcpy(newRules, rules, rulesLength*U_SIZEOF_UCHAR);
626 newRules[rulesLength]=0;
627 result->rules = newRules;
628 result->rulesLength = rulesLength;
629 result->freeRulesOnClose = TRUE;
630 }
631 result->rb = NULL;
632 result->elements = NULL;
633 result->validLocale = NULL;
634 result->requestedLocale = NULL;
635 ucol_setAttribute(result, UCOL_STRENGTH, strength, status);
636 ucol_setAttribute(result, UCOL_NORMALIZATION_MODE, norm, status);
637 } else {
638 cleanup:
639 if(result != NULL) {
640 ucol_close(result);
641 } else {
642 if(table != NULL) {
643 uprv_free(table);
644 }
645 }
646 result = NULL;
647 }
648
649 ucol_tok_closeTokenList(&src);
650
651 return result;
652 }
653
654 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
655 /* you should be able to get the binary chunk to write out... Doesn't look very full now */
656 U_CAPI uint8_t* U_EXPORT2
657 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
658 {
659 uint8_t *result = NULL;
660 if(U_FAILURE(*status)) {
661 return NULL;
662 }
663 if(coll->hasRealData == TRUE) {
664 *length = coll->image->size;
665 result = (uint8_t *)uprv_malloc(*length);
666 /* test for NULL */
667 if (result == NULL) {
668 *status = U_MEMORY_ALLOCATION_ERROR;
669 return NULL;
670 }
671 uprv_memcpy(result, coll->image, *length);
672 } else {
673 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
674 result = (uint8_t *)uprv_malloc(*length);
675 /* test for NULL */
676 if (result == NULL) {
677 *status = U_MEMORY_ALLOCATION_ERROR;
678 return NULL;
679 }
680 uprv_memcpy(result, UCA->image, sizeof(UCATableHeader));
681 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
682 }
683 return result;
684 }
685
686 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
687 if(U_FAILURE(*status)) {
688 return;
689 }
690 result->caseFirst = (UColAttributeValue)opts->caseFirst;
691 result->caseLevel = (UColAttributeValue)opts->caseLevel;
692 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
693 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
694 result->strength = (UColAttributeValue)opts->strength;
695 result->variableTopValue = opts->variableTopValue;
696 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
697 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
698 result->numericCollation = (UColAttributeValue)opts->numericCollation;
699
700 result->caseFirstisDefault = TRUE;
701 result->caseLevelisDefault = TRUE;
702 result->frenchCollationisDefault = TRUE;
703 result->normalizationModeisDefault = TRUE;
704 result->strengthisDefault = TRUE;
705 result->variableTopValueisDefault = TRUE;
706 result->hiraganaQisDefault = TRUE;
707 result->numericCollationisDefault = TRUE;
708
709 ucol_updateInternalState(result, status);
710
711 result->options = opts;
712 }
713
#if 0
// Compiled out: doesn't look like anybody is using this.
// Counterpart of ucol_setOptionsFromHeader(): copies the current attribute
// values of a collator into an option set.
void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
    if(U_SUCCESS(*status)) {
        opts->strength = result->strength;
        opts->normalizationMode = result->normalizationMode;
        opts->frenchCollation = result->frenchCollation;
        opts->alternateHandling = result->alternateHandling;
        opts->variableTopValue = result->variableTopValue;
        opts->caseFirst = result->caseFirst;
        opts->caseLevel = result->caseLevel;
        opts->hiraganaQ = result->hiraganaQ;
        opts->numericCollation = result->numericCollation;
    }
}
#endif
731
732 static const uint16_t *fcdTrieIndex=NULL;
733
734
735 /**
736 * Approximate determination if a character is at a contraction end.
737 * Guaranteed to be TRUE if a character is at the end of a contraction,
738 * otherwise it is not deterministic.
739 * @param c character to be determined
740 * @param coll collator
741 */
742 static
743 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
744 if (UTF_IS_TRAIL(c)) {
745 return TRUE;
746 }
747
748 if (c < coll->minContrEndCP) {
749 return FALSE;
750 }
751
752 int32_t hash = c;
753 uint8_t htbyte;
754 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
755 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
756 }
757 htbyte = coll->contrEndCP[hash>>3];
758 return (((htbyte >> (hash & 7)) & 1) == 1);
759 }
760
761
762
763 /*
764 * i_getCombiningClass()
765 * A fast, at least partly inline version of u_getCombiningClass()
766 * This is a candidate for further optimization. Used heavily
767 * in contraction processing.
768 */
769 static
770 inline uint8_t i_getCombiningClass(UChar c, const UCollator *coll) {
771 uint8_t sCC = 0;
772 if (c >= 0x300 && ucol_unsafeCP(c, coll)) {
773 sCC = u_getCombiningClass(c);
774 }
775 return sCC;
776 }
777
778
779 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UErrorCode *status) {
780 UChar c;
781 UCollator *result = fillIn;
782 if(U_FAILURE(*status) || image == NULL) {
783 return NULL;
784 }
785
786 if(result == NULL) {
787 result = (UCollator *)uprv_malloc(sizeof(UCollator));
788 if(result == NULL) {
789 *status = U_MEMORY_ALLOCATION_ERROR;
790 return result;
791 }
792 result->freeOnClose = TRUE;
793 } else {
794 result->freeOnClose = FALSE;
795 }
796
797 result->image = image;
798 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
799 /*CompactEIntArray *newUCAmapping = ucmpe32_openFromData(&mapping, status);*/
800 UTrie *newUCAmapping = (UTrie *)uprv_malloc(sizeof(UTrie));
801 if(newUCAmapping != NULL) {
802 utrie_unserialize(newUCAmapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
803 } else {
804 *status = U_MEMORY_ALLOCATION_ERROR;
805 if(result->freeOnClose == TRUE) {
806 uprv_free(result);
807 result = NULL;
808 }
809 return result;
810 }
811 if(U_SUCCESS(*status)) {
812 result->mapping = newUCAmapping;
813 } else {
814 if(result->freeOnClose == TRUE) {
815 uprv_free(result);
816 result = NULL;
817 }
818 uprv_free(newUCAmapping);
819 return result;
820 }
821
822 /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
823 result->latinOneMapping = UTRIE_GET32_LATIN1(result->mapping);
824 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
825 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
826 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
827
828 result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options);
829 result->freeOptionsOnClose = FALSE;
830
831 /* set attributes */
832 result->caseFirst = (UColAttributeValue)result->options->caseFirst;
833 result->caseLevel = (UColAttributeValue)result->options->caseLevel;
834 result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
835 result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
836 result->strength = (UColAttributeValue)result->options->strength;
837 result->variableTopValue = result->options->variableTopValue;
838 result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
839 result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
840 result->numericCollation = (UColAttributeValue)result->options->numericCollation;
841
842 result->caseFirstisDefault = TRUE;
843 result->caseLevelisDefault = TRUE;
844 result->frenchCollationisDefault = TRUE;
845 result->normalizationModeisDefault = TRUE;
846 result->strengthisDefault = TRUE;
847 result->variableTopValueisDefault = TRUE;
848 result->alternateHandlingisDefault = TRUE;
849 result->hiraganaQisDefault = TRUE;
850 result->numericCollationisDefault = TRUE;
851
852 result->scriptOrder = NULL;
853
854 result->rules = NULL;
855 result->rulesLength = 0;
856
857 /* get the version info from UCATableHeader and populate the Collator struct*/
858 result->dataInfo.dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
859 result->dataInfo.dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
860
861 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
862 result->minUnsafeCP = 0;
863 for (c=0; c<0x300; c++) { // Find the smallest unsafe char.
864 if (ucol_unsafeCP(c, result)) break;
865 }
866 result->minUnsafeCP = c;
867
868 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
869 result->minContrEndCP = 0;
870 for (c=0; c<0x300; c++) { // Find the Contraction-ending char.
871 if (ucol_contractionEndCP(c, result)) break;
872 }
873 result->minContrEndCP = c;
874
875 /* max expansion tables */
876 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
877 result->image->endExpansionCE);
878 result->lastEndExpansionCE = result->endExpansionCE +
879 result->image->endExpansionCECount - 1;
880 result->expansionCESize = (uint8_t*)result->image +
881 result->image->expansionCESize;
882
883 if (fcdTrieIndex == NULL) {
884 fcdTrieIndex = unorm_getFCDTrie(status);
885 }
886
887 //result->errorCode = *status;
888
889 result->latinOneCEs = NULL;
890
891 result->latinOneRegenTable = FALSE;
892 result->latinOneFailed = FALSE;
893
894 ucol_updateInternalState(result, status);
895
896
897 return result;
898 }
899
900 U_CFUNC UBool
901 ucol_cleanup(void)
902 {
903 if (UCA_DATA_MEM) {
904 udata_close(UCA_DATA_MEM);
905 UCA_DATA_MEM = NULL;
906 }
907 if (UCA) {
908 ucol_close(UCA);
909 UCA = NULL;
910 }
911 return TRUE;
912 }
913
914 /* Following is a port of Mark's code for new treatment of implicits.
915 * It is positioned here, since ucol_initUCA needs to initialize the
916 * variables below according to the data in the fractional UCA.
917 */
918
919 /**
920 * Function used to:
921 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
922 * b) bump any non-CJK characters by 110000 (NON_CJK_OFFSET).
923 * The relevant blocks are:
924 * A: 4E00..9FFF; CJK Unified Ideographs
925 * F900..FAFF; CJK Compatibility Ideographs
926 * B: 3400..4DBF; CJK Unified Ideographs Extension A
927 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
928 * As long as
929 * no new B characters are allocated between 4E00 and FAFF, and
930 * no new A characters are outside of this range,
931 * (very high probability) this simple code will work.
932 * The reordered blocks are:
933 * Block1 is CJK
934 * Block2 is CJK_COMPAT_USED
935 * Block3 is CJK_A
936 * Any other CJK gets its normal code point
937 * Any non-CJK gets +110000 (NON_CJK_OFFSET)
938 * When we reorder Block1, we make sure that it is at the very start,
939 * so that it will use a 3-byte form.
940 */
941
942 // CONSTANTS
943 static const uint32_t
944 NON_CJK_OFFSET = 0x110000,
945 BYTES_TO_AVOID = 3,
946 OTHER_COUNT = 256 - BYTES_TO_AVOID,
947 LAST_COUNT = OTHER_COUNT / 2,
948 LAST_COUNT2 = OTHER_COUNT / 21, // room for intervening, without expanding to 5 bytes
949 IMPLICIT_3BYTE_COUNT = 1;
950
951 // These depend on initUCA, and are initialized at that time
952 static uint32_t
953 IMPLICIT_BASE_BYTE = 0,
954 IMPLICIT_LIMIT_BYTE = 0, // leave room for 1 3-byte and 2 4-byte forms
955
956 IMPLICIT_4BYTE_BOUNDARY = 0,
957 LAST_MULTIPLIER = 0,
958 LAST2_MULTIPLIER = 0,
959 IMPLICIT_BASE_3BYTE = 0,
960 IMPLICIT_BASE_4BYTE = 0;
961
962 static const UChar32
963 CJK_BASE = 0x4E00,
964 CJK_LIMIT = 0x9FFF+1,
965 CJK_COMPAT_USED_BASE = 0xFA0E,
966 CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
967 CJK_A_BASE = 0x3400,
968 CJK_A_LIMIT = 0x4DBF+1,
969 CJK_B_BASE = 0x20000,
970 CJK_B_LIMIT = 0x2A6DF+1;
971
972 static inline UChar32 swapCJK(UChar32 cp) {
973
974 if (cp >= CJK_BASE) {
975 if (cp < CJK_LIMIT) return cp - CJK_BASE;
976
977 if (cp < CJK_COMPAT_USED_BASE) return cp + NON_CJK_OFFSET;
978
979 if (cp < CJK_COMPAT_USED_LIMIT) return cp - CJK_COMPAT_USED_BASE
980 + (CJK_LIMIT - CJK_BASE);
981 if (cp < CJK_B_BASE) return cp + NON_CJK_OFFSET;
982
983 if (cp < CJK_B_LIMIT) return cp; // non-BMP-CJK
984
985 return cp + NON_CJK_OFFSET; // non-CJK
986 }
987 if (cp < CJK_A_BASE) return cp + NON_CJK_OFFSET;
988
989 if (cp < CJK_A_LIMIT) return cp - CJK_A_BASE
990 + (CJK_LIMIT - CJK_BASE)
991 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
992 return cp + NON_CJK_OFFSET; // non-CJK
993 }
994
995
996 // GET IMPLICIT PRIMARY WEIGHTS
997 // Return value is left justified primary key
998
999 static inline uint32_t getImplicitPrimary(UChar32 cp) {
1000
1001 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1002
1003 cp = swapCJK(cp);
1004
1005 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1006
1007 // we now have a range of numbers from 0 to 21FFFF.
1008
1009 // we must skip all 00, 01, 02 bytes, so most bytes have 253 values
1010 // we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
1011 // we shift so that HAN all has the same first primary, for compression.
1012 // for the 4 byte case, we make the gap as large as we can fit.
1013 // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
1014 // Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
1015
1016 int32_t last0 = cp - IMPLICIT_4BYTE_BOUNDARY;
1017 if (last0 < 0) {
1018 int32_t last1 = cp / LAST_COUNT;
1019 last0 = cp % LAST_COUNT;
1020
1021 int32_t last2 = last1 / OTHER_COUNT;
1022 last1 %= OTHER_COUNT;
1023 /*
1024 if (DEBUG || last2 > 0xFF-BYTES_TO_AVOID) System.out.println("3B: " + Utility.hex(cp) + " => "
1025 + Utility.hex(last2) + ", "
1026 + Utility.hex(last1) + ", "
1027 + Utility.hex(last0) + ", "
1028 );
1029 */
1030
1031 return IMPLICIT_BASE_3BYTE + (last2 << 24) + (last1 << 16) + ((last0*LAST_MULTIPLIER) << 8);
1032 } else {
1033 int32_t last1 = last0 / LAST_COUNT2;
1034 last0 %= LAST_COUNT2;
1035
1036 int32_t last2 = last1 / OTHER_COUNT;
1037 last1 %= OTHER_COUNT;
1038
1039 int32_t last3 = last2 / OTHER_COUNT;
1040 last2 %= OTHER_COUNT;
1041
1042 /*
1043 if (DEBUG || last3 > 0xFF-BYTES_TO_AVOID) System.out.println("4B: " + Utility.hex(cp) + " => "
1044 + Utility.hex(last3) + ", "
1045 + Utility.hex(last2) + ", "
1046 + Utility.hex(last1) + ", "
1047 + Utility.hex(last0 * LAST2_MULTIPLIER) + ", "
1048 );
1049 */
1050
1051 return IMPLICIT_BASE_4BYTE + (last3 << 24) + (last2 << 16) + (last1 << 8) + (last0 * LAST2_MULTIPLIER);
1052 }
1053 }
1054
1055 /* this function is either called from initUCA or from genUCA before
1056 * doing canonical closure for the UCA.
1057 */
1058 U_CAPI void U_EXPORT2
1059 uprv_uca_initImplicitConstants(uint32_t baseByte)
1060 {
1061 IMPLICIT_BASE_BYTE = baseByte;
1062 IMPLICIT_LIMIT_BYTE = IMPLICIT_BASE_BYTE + 4; // leave room for 1 3-byte and 2 4-byte forms
1063
1064 IMPLICIT_4BYTE_BOUNDARY = IMPLICIT_3BYTE_COUNT * OTHER_COUNT * LAST_COUNT;
1065 LAST_MULTIPLIER = OTHER_COUNT / LAST_COUNT;
1066 LAST2_MULTIPLIER = OTHER_COUNT / LAST_COUNT2;
1067 IMPLICIT_BASE_3BYTE = (IMPLICIT_BASE_BYTE << 24) + 0x030300;
1068 IMPLICIT_BASE_4BYTE = ((IMPLICIT_BASE_BYTE + IMPLICIT_3BYTE_COUNT) << 24) + 0x030303;
1069 }
1070
1071 /* do not close UCA returned by ucol_initUCA! */
1072 UCollator *
1073 ucol_initUCA(UErrorCode *status) {
1074 if(U_FAILURE(*status)) {
1075 return NULL;
1076 }
1077 umtx_lock(NULL);
1078 UBool f = (UCA == NULL);
1079 umtx_unlock(NULL);
1080
1081 if(f) {
1082 UCollator *newUCA = NULL;
1083 UDataMemory *result = udata_openChoice(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, isAcceptableUCA, NULL, status);
1084
1085 if(U_FAILURE(*status)) {
1086 if (result) {
1087 udata_close(result);
1088 }
1089 uprv_free(newUCA);
1090 }
1091
1092 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1093 newUCA = ucol_initCollator((const UCATableHeader *)udata_getMemory(result), newUCA, status);
1094 if(U_SUCCESS(*status)){
1095 newUCA->rb = NULL;
1096 newUCA->elements = NULL;
1097 newUCA->validLocale = NULL;
1098 newUCA->requestedLocale = NULL;
1099 newUCA->hasRealData = FALSE; // real data lives in .dat file...
1100 newUCA->freeImageOnClose = FALSE;
1101 umtx_lock(NULL);
1102 if(UCA == NULL) {
1103 UCA = newUCA;
1104 UCA_DATA_MEM = result;
1105 result = NULL;
1106 newUCA = NULL;
1107 }
1108 umtx_unlock(NULL);
1109
1110 if(newUCA != NULL) {
1111 udata_close(result);
1112 uprv_free(newUCA);
1113 }
1114 else {
1115 ucln_i18n_registerCleanup();
1116 }
1117 // Initalize variables for implicit generation
1118 UCAconsts = (UCAConstants *)((uint8_t *)UCA->image + UCA->image->UCAConsts);
1119 uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN);
1120 UCA->mapping->getFoldingOffset = _getFoldingOffset;
1121 }else{
1122 udata_close(result);
1123 uprv_free(newUCA);
1124 UCA= NULL;
1125 }
1126 }
1127 }
1128 return UCA;
1129 }
1130
1131
1132 /* collIterNormalize Incremental Normalization happens here. */
1133 /* pick up the range of chars identifed by FCD, */
1134 /* normalize it into the collIterate's writable buffer, */
1135 /* switch the collIterate's state to use the writable buffer. */
1136 /* */
1137 static
1138 void collIterNormalize(collIterate *collationSource)
1139 {
1140 UErrorCode status = U_ZERO_ERROR;
1141
1142 int32_t normLen;
1143 UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */
1144 UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */
1145
1146 normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1147 srcP, (int32_t)(endP - srcP),
1148 FALSE, 0,
1149 &status);
1150 if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
1151 // reallocate and terminate
1152 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1153 &collationSource->writableBuffer,
1154 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1155 0)
1156 ) {
1157 #ifdef UCOL_DEBUG
1158 fprintf(stderr, "collIterNormalize(), out of memory\n");
1159 #endif
1160 return;
1161 }
1162 status = U_ZERO_ERROR;
1163 normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1164 srcP, (int32_t)(endP - srcP),
1165 FALSE, 0,
1166 &status);
1167 }
1168 if (U_FAILURE(status)) {
1169 #ifdef UCOL_DEBUG
1170 fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status));
1171 #endif
1172 return;
1173 }
1174
1175 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1176 collationSource->flags |= UCOL_ITER_ALLOCATED;
1177 }
1178 collationSource->pos = collationSource->writableBuffer;
1179 collationSource->origFlags = collationSource->flags;
1180 collationSource->flags |= UCOL_ITER_INNORMBUF;
1181 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1182 }
1183
1184
1185 // This function takes the iterator and extracts normalized stuff up to the next boundary
1186 // It is similar in the end results to the collIterNormalize, but for the cases when we
1187 // use an iterator
1188 static
1189 inline void normalizeIterator(collIterate *collationSource) {
1190 UErrorCode status = U_ZERO_ERROR;
1191 UBool wasNormalized = FALSE;
1192 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1193 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1194 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1195 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1196 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1197 // reallocate and terminate
1198 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1199 &collationSource->writableBuffer,
1200 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1201 0)
1202 ) {
1203 #ifdef UCOL_DEBUG
1204 fprintf(stderr, "normalizeIterator(), out of memory\n");
1205 #endif
1206 return;
1207 }
1208 status = U_ZERO_ERROR;
1209 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1210 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1211 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1212 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1213 }
1214 // Terminate the buffer - we already checked that it is big enough
1215 collationSource->writableBuffer[normLen] = 0;
1216 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1217 collationSource->flags |= UCOL_ITER_ALLOCATED;
1218 }
1219 collationSource->pos = collationSource->writableBuffer;
1220 collationSource->origFlags = collationSource->flags;
1221 collationSource->flags |= UCOL_ITER_INNORMBUF;
1222 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1223 }
1224
1225
1226 /* Incremental FCD check and normalize */
1227 /* Called from getNextCE when normalization state is suspect. */
1228 /* When entering, the state is known to be this: */
1229 /* o We are working in the main buffer of the collIterate, not the side */
1230 /* writable buffer. When in the side buffer, normalization mode is always off, */
1231 /* so we won't get here. */
1232 /* o The leading combining class from the current character is 0 or */
1233 /* the trailing combining class of the previous char was zero. */
1234 /* True because the previous call to this function will have always exited */
1235 /* that way, and we get called for every char where cc might be non-zero. */
/* */
/* The current character is the one at pos-1 (pos has already been advanced). */
/* On exit, collationSource->fcdPosition points just past the characters that */
/* were examined and accepted: the current character plus any following */
/* characters whose leading combining class is non-zero. */
/* Returns TRUE if some character in that run has a leading combining class */
/* lower than the trailing combining class of the character before it, */
/* i.e. the caller should normalize the run; FALSE otherwise. */
1236 static
1237 inline UBool collIterFCD(collIterate *collationSource) {
1238 UChar c, c2;
1239 const UChar *srcP, *endP;
1240 uint8_t leadingCC;
1241 uint8_t prevTrailingCC = 0;
1242 uint16_t fcd;
1243 UBool needNormalize = FALSE;
1244
1245 srcP = collationSource->pos-1;
1246
// endP == NULL means there is no explicit end pointer to compare against.
1247 if (collationSource->flags & UCOL_ITER_HASLEN) {
1248 endP = collationSource->endp;
1249 } else {
1250 endP = NULL;
1251 }
1252
1253 // Get the trailing combining class of the current character. If it's zero,
1254 // we are OK.
1255 c = *srcP++;
1256 /* trie access */
1257 fcd = unorm_getFCD16(fcdTrieIndex, c);
1258 if (fcd != 0) {
// A lead surrogate needs its trail surrogate to complete the lookup;
// an unpaired lead surrogate is treated as having no combining class.
1259 if (UTF_IS_FIRST_SURROGATE(c)) {
1260 if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) {
1261 ++srcP;
1262 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1263 } else {
1264 fcd = 0;
1265 }
1266 }
1267
// The low byte of the fcd value is used as the trailing combining class.
1268 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1269
1270 if (prevTrailingCC != 0) {
1271 // The current char has a non-zero trailing CC. Scan forward until we find
1272 // a char with a leading cc of zero.
1273 while (endP == NULL || srcP != endP)
1274 {
1275 const UChar *savedSrcP = srcP;
1276
1277 c = *srcP++;
1278 /* trie access */
1279 fcd = unorm_getFCD16(fcdTrieIndex, c);
1280 if (fcd != 0 && UTF_IS_FIRST_SURROGATE(c)) {
1281 if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) {
1282 ++srcP;
1283 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1284 } else {
1285 fcd = 0;
1286 }
1287 }
// The high byte of the fcd value is used as the leading combining class.
1288 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1289 if (leadingCC == 0) {
1290 srcP = savedSrcP; // Hit char that is not part of combining sequence.
1291 // back up over it. (Could be surrogate pair!)
1292 break;
1293 }
1294
// Combining classes out of order: remember it, but keep scanning so that
// fcdPosition still ends up after the whole combining run.
1295 if (leadingCC < prevTrailingCC) {
1296 needNormalize = TRUE;
1297 }
1298
1299 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1300 }
1301 }
1302 }
1303
1304 collationSource->fcdPosition = (UChar *)srcP;
1305
1306 return needNormalize;
1307 }
1308
1309 /****************************************************************************/
1310 /* Following are the CE retrieval functions */
1311 /* */
1312 /****************************************************************************/
1313
1314 /* there should be a macro version of this function in the header file */
1315 /* This is the first function that tries to fetch a collation element */
1316 /* If it's not successful or it encounters a more difficult situation */
1317 /* some more sophisticated and slower functions are invoked */
/* */
/* Order of work: */
/* 1. CEs still pending in the collIterate's CE buffer are returned first. */
/* 2. Otherwise the next UChar is fetched from the string, the side buffer */
/* or the character iterator, running the incremental FCD check and */
/* normalization when UCOL_ITER_NORM is set. */
/* 3. The CE is looked up in coll (latinOneMapping for ch <= 0xFF, the */
/* mapping trie otherwise); special CEs are resolved through */
/* ucol_prv_getSpecialCE, and for ch > 0xFF a UCOL_NOT_FOUND result */
/* falls back to the UCA collator. */
/* */
/* coll: collator whose tables are consulted first */
/* collationSource: iteration state; advanced by this call */
/* status: error code handed on to ucol_prv_getSpecialCE */
/* Returns the next CE, or UCOL_NO_MORE_CES when the input is exhausted. */
1318 static
1319 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1320 uint32_t order = 0;
1321 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
1322 order = *(collationSource->toReturn++); /* if so, return them */
1323 if(collationSource->CEpos == collationSource->toReturn) {
/* the pending CEs are used up: rewind both pointers to the buffer start */
1324 collationSource->CEpos = collationSource->toReturn = collationSource->CEs;
1325 }
1326 return order;
1327 }
1328
1329 UChar ch = 0;
1330
1331 for (;;) /* Loop handles case when incremental normalize switches */
1332 { /* to or from the side buffer / original string, and we */
1333 /* need to start again to get the next character. */
1334
1335 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1336 {
1337 // The source string is null terminated and we're not working from the side buffer,
1338 // and we're not normalizing. This is the fast path.
1339 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1340 ch = *collationSource->pos++;
1341 if (ch != 0) {
1342 break;
1343 }
1344 else {
1345 return UCOL_NO_MORE_CES;
1346 }
1347 }
1348
1349 if (collationSource->flags & UCOL_ITER_HASLEN) {
1350 // Normal path for strings when length is specified.
1351 // (We can't be in side buffer because it is always null terminated.)
1352 if (collationSource->pos >= collationSource->endp) {
1353 // Ran off of the end of the main source string. We're done.
1354 return UCOL_NO_MORE_CES;
1355 }
1356 ch = *collationSource->pos++;
1357 }
1358 else if(collationSource->flags & UCOL_USE_ITERATOR) {
1359 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1360 if(iterCh == U_SENTINEL) {
1361 return UCOL_NO_MORE_CES;
1362 }
1363 ch = (UChar)iterCh;
1364 }
1365 else
1366 {
1367 // Null terminated string.
1368 ch = *collationSource->pos++;
1369 if (ch == 0) {
1370 // Ran off end of buffer.
1371 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1372 // Ran off end of main string. backing up one character.
1373 collationSource->pos--;
1374 return UCOL_NO_MORE_CES;
1375 }
1376 else
1377 {
1378 // Hit null in the normalize side buffer.
1379 // Usually this means the end of the normalized data,
1380 // except for one odd case: a null followed by combining chars,
1381 // which is the case if we are at the start of the buffer.
1382 if (collationSource->pos == collationSource->writableBuffer+1) {
1383 break;
1384 }
1385
1386 // Null marked end of side buffer.
1387 // Revert to the main string and
1388 // loop back to top to try again to get a character.
1389 collationSource->pos = collationSource->fcdPosition;
1390 collationSource->flags = collationSource->origFlags;
1391 continue;
1392 }
1393 }
1394 }
1395
// Track whether the character just read falls in the Hiragana ranges tested below.
1396 if(collationSource->flags&UCOL_HIRAGANA_Q) {
1397 if((ch>=0x3040 && ch<=0x3094) || ch == 0x309d || ch == 0x309e) {
1398 collationSource->flags |= UCOL_WAS_HIRAGANA;
1399 } else {
1400 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1401 }
1402 }
1403
1404 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1405 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1406 if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1407 break;
1408 }
1409
1410 if (collationSource->fcdPosition >= collationSource->pos) {
1411 // An earlier FCD check has already covered the current character.
1412 // We can go ahead and process this char.
1413 break;
1414 }
1415
1416 if (ch < ZERO_CC_LIMIT_ ) {
1417 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1418 break;
1419 }
1420
1421 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1422 // We need to peek at the next character in order to tell if we are FCD
1423 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1424 // We are at the last char of source string.
1425 // It is always OK for FCD check.
1426 break;
1427 }
1428
1429 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1430 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1431 break;
1432 }
1433 }
1434
1435
1436 // Need a more complete FCD check and possible normalization.
1437 if (collIterFCD(collationSource)) {
1438 collIterNormalize(collationSource);
1439 }
1440 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1441 // No normalization was needed. Go ahead and process the char we already had.
1442 break;
1443 }
1444
1445 // Some normalization happened. Next loop iteration will pick up a char
1446 // from the normalization buffer.
1447
1448 } // end for (;;)
1449
1450
1451 if (ch <= 0xFF) {
1452 /* For latin-1 characters we never need to fall back to the UCA table */
1453 /* because all of the UCA data is replicated in the latinOneMapping array */
1454 order = coll->latinOneMapping[ch];
1455 if (order > UCOL_NOT_FOUND) {
1456 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1457 }
1458 }
1459 else
1460 {
1461 order = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
1462 if(order > UCOL_NOT_FOUND) { /* if a CE is special */
1463 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
1464 }
1465 if(order == UCOL_NOT_FOUND) { /* We couldn't find a good CE in the tailoring */
1466 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1467 order = UTRIE_GET32_FROM_LEAD(UCA->mapping, ch);
1468
1469 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1470 order = ucol_prv_getSpecialCE(UCA, ch, order, collationSource, status);
1471 }
1472 }
1473 }
1474 return order; /* return the CE */
1475 }
1476
1477 /* ucol_getNextCE, out-of-line version for use from other files. */
1478 U_CAPI uint32_t U_EXPORT2
1479 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1480 return ucol_IGetNextCE(coll, collationSource, status);
1481 }
1482
1483
1484 /**
1485 * Incremental previous normalization happens here. Pick up the range of chars
1486 * identifed by FCD, normalize it into the collIterate's writable buffer,
1487 * switch the collIterate's state to use the writable buffer.
1488 * @param data collation iterator data
1489 */
1490 static
1491 void collPrevIterNormalize(collIterate *data)
1492 {
1493 UErrorCode status = U_ZERO_ERROR;
1494 UChar *pEnd = data->pos; /* End normalize + 1 */
1495 UChar *pStart;
1496 uint32_t normLen;
1497 UChar *pStartNorm;
1498
1499 /* Start normalize */
1500 if (data->fcdPosition == NULL) {
1501 pStart = data->string;
1502 }
1503 else {
1504 pStart = data->fcdPosition + 1;
1505 }
1506
1507 normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
1508 data->writableBuffer, 0, &status);
1509
1510 if (data->writableBufSize <= normLen) {
1511 freeHeapWritableBuffer(data);
1512 data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) *
1513 sizeof(UChar));
1514 if(data->writableBuffer == NULL) { // something is wrong here, return
1515 return;
1516 }
1517 data->flags |= UCOL_ITER_ALLOCATED;
1518 /* to handle the zero termination */
1519 data->writableBufSize = normLen + 1;
1520 }
1521 status = U_ZERO_ERROR;
1522 /*
1523 this puts the null termination infront of the normalized string instead
1524 of the end
1525 */
1526 pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
1527 *(pStartNorm - 1) = 0;
1528 unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
1529 normLen, &status);
1530
1531 data->pos = data->writableBuffer + data->writableBufSize;
1532 data->origFlags = data->flags;
1533 data->flags |= UCOL_ITER_INNORMBUF;
1534 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1535 }
1536
1537
1538 /**
1539 * Incremental FCD check for previous iteration and normalize. Called from
1540 * getPrevCE when normalization state is suspect.
1541 * When entering, the state is known to be this:
1542 * o We are working in the main buffer of the collIterate, not the side
1543 * writable buffer. When in the side buffer, normalization mode is always
1544 * off, so we won't get here.
1545 * o The leading combining class from the current character is 0 or the
1546 * trailing combining class of the previous char was zero.
1547 * True because the previous call to this function will have always exited
1548 * that way, and we get called for every char where cc might be non-zero.
* On exit data->fcdPosition is NULL if the backward scan reached the start
* of the string while still inside a combining sequence; otherwise it points
* at the last character examined (for a surrogate pair, at its lead
* surrogate).
1549 * @param data collation iterate struct
1550 * @return normalization status, TRUE for normalization to be done, FALSE
1551 * otherwise
1552 */
1553 static
1554 inline UBool collPrevIterFCD(collIterate *data)
1555 {
1556 const UChar *src, *start;
1557 UChar c, c2;
1558 uint8_t leadingCC;
1559 uint8_t trailingCC = 0;
1560 uint16_t fcd;
1561 UBool result = FALSE;
1562
1563 start = data->string;
/* src is pre-decremented below, so the first character read is *data->pos */
1564 src = data->pos + 1;
1565
1566 /* Get the leading combining class of the current character. */
1567 c = *--src;
1568 if (!UTF_IS_SURROGATE(c)) {
1569 fcd = unorm_getFCD16(fcdTrieIndex, c);
1570 } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) {
/* trail surrogate preceded by a lead surrogate: step back and look up the pair */
1571 --src;
1572 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1573 if (fcd != 0) {
1574 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1575 }
1576 } else /* unpaired surrogate */ {
1577 fcd = 0;
1578 }
1579
/* the high byte of the fcd value is used as the leading combining class */
1580 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1581
1582 if (leadingCC != 0) {
1583 /*
1584 The current char has a non-zero leading combining class.
1585 Scan backward until we find a char with a trailing cc of zero.
1586 */
1587 for (;;)
1588 {
1589 if (start == src) {
/* reached the start of the string without finding such a char */
1590 data->fcdPosition = NULL;
1591 return result;
1592 }
1593
1594 c = *--src;
1595 if (!UTF_IS_SURROGATE(c)) {
1596 fcd = unorm_getFCD16(fcdTrieIndex, c);
1597 } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) {
1598 --src;
1599 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1600 if (fcd != 0) {
1601 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1602 }
1603 } else /* unpaired surrogate */ {
1604 fcd = 0;
1605 }
1606
/* the low byte of the fcd value is used as the trailing combining class */
1607 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1608
1609 if (trailingCC == 0) {
1610 break;
1611 }
1612
/* combining classes out of order: normalization is needed, but keep
scanning so that fcdPosition marks the start of the whole run */
1613 if (leadingCC < trailingCC) {
1614 result = TRUE;
1615 }
1616
1617 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1618 }
1619 }
1620
1621 data->fcdPosition = (UChar *)src;
1622
1623 return result;
1624 }
1625
1626 /** gets a character from the string at a given offset
1627 * Handles both normal and iterative cases.
1628 * No error checking - caller beware!
1629 */
1630 inline static
1631 UChar peekCharacter(collIterate *source, int32_t offset) {
1632 if(source->pos != NULL) {
1633 return *(source->pos + offset);
1634 } else if(source->iterator != NULL) {
1635 if(offset != 0) {
1636 source->iterator->move(source->iterator, offset, UITER_CURRENT);
1637 UChar toReturn = (UChar)source->iterator->next(source->iterator);
1638 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1639 return toReturn;
1640 } else {
1641 return (UChar)source->iterator->current(source->iterator);
1642 }
1643 } else {
1644 return (UChar)U_SENTINEL;
1645 }
1646 }
1647
1648 /**
1649 * Determines if we are at the start of the data string in the backwards
1650 * collation iterator
1651 * @param data collation iterator
1652 * @return TRUE if we are at the start
1653 */
1654 static
1655 inline UBool isAtStartPrevIterate(collIterate *data) {
1656 if(data->pos == NULL && data->iterator != NULL) {
1657 return !data->iterator->hasPrevious(data->iterator);
1658 }
1659 //return (collIter_bos(data)) ||
1660 return (data->pos == data->string) ||
1661 ((data->flags & UCOL_ITER_INNORMBUF) &&
1662 *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1663 }
1664
1665 /**
1666 * Inline function that gets a simple CE.
1667 * So what it does is that it will first check the expansion buffer. If the
1668 * expansion buffer is not empty, i.e. the return pointer is past the start
1669 * of the CE buffer, we step the return pointer back by one and return the
1670 * collation element found there.
1671 * For more complicated CEs it resorts to ucol_prv_getSpecialPrevCE.
* Otherwise the previous UChar is fetched from the string, the side buffer or
* the character iterator (running the backward FCD check and normalization
* when UCOL_ITER_NORM is set) and looked up in coll, falling back to the UCA
* collator when coll yields UCOL_NOT_FOUND for a character above 0xFF.
1672 * @param coll collator data
1673 * @param data collation iterator struct
1674 * @param status error status
* @return the previous CE, or UCOL_NO_MORE_CES when the start of the input
* has been reached
1675 */
1676 static
1677 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1678 UErrorCode *status)
1679 {
1680 uint32_t result = UCOL_NULLORDER;
1681 if (data->toReturn > data->CEs) {
1682 data->toReturn --;
1683 result = *(data->toReturn);
1684 if (data->CEs == data->toReturn) {
/* buffer fully consumed: reset the fill pointer as well */
1685 data->CEpos = data->toReturn;
1686 }
1687 }
1688 else {
1689 UChar ch = 0;
1690 /*
1691 Loop handles case when incremental normalize switches to or from the
1692 side buffer / original string, and we need to start again to get the
1693 next character.
1694 */
1695 for (;;) {
1696 if (data->flags & UCOL_ITER_HASLEN) {
1697 /*
1698 Normal path for strings when length is specified.
1699 Not in side buffer because it is always null terminated.
1700 */
1701 if (data->pos <= data->string) {
1702 /* End of the main source string */
1703 return UCOL_NO_MORE_CES;
1704 }
1705 data->pos --;
1706 ch = *data->pos;
1707 }
1708 // we are using an iterator to go back. Pray for us!
1709 else if (data->flags & UCOL_USE_ITERATOR) {
1710 UChar32 iterCh = data->iterator->previous(data->iterator);
1711 if(iterCh == U_SENTINEL) {
1712 return UCOL_NO_MORE_CES;
1713 } else {
1714 ch = (UChar)iterCh;
1715 }
1716 }
1717 else {
1718 data->pos --;
1719 ch = *data->pos;
1720 /* we are in the side buffer. */
1721 if (ch == 0) {
1722 /*
1723 At the start of the normalize side buffer.
1724 Go back to string.
1725 Because pointer points to the last accessed character,
1726 hence we have to increment it by one here.
1727 */
1728 if (data->fcdPosition == NULL) {
1729 data->pos = data->string;
1730 return UCOL_NO_MORE_CES;
1731 }
1732 else {
1733 data->pos = data->fcdPosition + 1;
1734 }
1735 data->flags = data->origFlags;
1736 continue;
1737 }
1738 }
1739
/*
NOTE(review): the range tested here (0x3040..0x309f) is wider than the one
tested in ucol_IGetNextCE (0x3040..0x3094, 0x309d, 0x309e) - confirm
whether the two directions are meant to differ.
*/
1740 if(data->flags&UCOL_HIRAGANA_Q) {
1741 if(ch>=0x3040 && ch<=0x309f) {
1742 data->flags |= UCOL_WAS_HIRAGANA;
1743 } else {
1744 data->flags &= ~UCOL_WAS_HIRAGANA;
1745 }
1746 }
1747
1748 /*
1749 * got a character to determine if there's fcd and/or normalization
1750 * stuff to do.
1751 * if the current character is not fcd.
1752 * if current character is at the start of the string
1753 * Trailing combining class == 0.
1754 * Note if pos is in the writablebuffer, norm is always 0
1755 */
1756 if (ch < ZERO_CC_LIMIT_ ||
1757 // this should propel us out of the loop in the iterator case
1758 (data->flags & UCOL_ITER_NORM) == 0 ||
1759 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
1760 || data->string == data->pos) {
1761 break;
1762 }
1763
1764 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1765 /* if next character is FCD */
1766 if (data->pos == data->string) {
1767 /* First char of string is always OK for FCD check */
1768 break;
1769 }
1770
1771 /* Not first char of string, do the FCD fast test */
1772 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
1773 break;
1774 }
1775 }
1776
1777 /* Need a more complete FCD check and possible normalization. */
1778 if (collPrevIterFCD(data)) {
1779 collPrevIterNormalize(data);
1780 }
1781
1782 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
1783 /* No normalization. Go ahead and process the char. */
1784 break;
1785 }
1786
1787 /*
1788 Some normalization happened.
1789 Next loop picks up a char from the normalization buffer.
1790 */
1791 }
1792
1793 /* attempt to handle contractions, after removal of the backwards
1794 contraction
1795 */
1796 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
1797 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
1798 }
1799 else {
1800 if (ch <= 0xFF) {
/* as coded, there is no fallback to the UCA table on this branch */
1801 result = coll->latinOneMapping[ch];
1802 if (result > UCOL_NOT_FOUND) {
1803 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
1804 }
1805 }
1806 else {
1807 // TODO: fix me for THAI - I reference *(data->pos-1)
1808 if ((data->flags & UCOL_ITER_INNORMBUF) == 0 &&
1809 /*UCOL_ISTHAIBASECONSONANT(ch) &&*/ // This is from the old specs - we now rearrange unconditionally
1810 data->pos > data->string &&
1811 UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1)))
1812 //UCOL_ISTHAIPREVOWEL(*(data->pos -1)))
1813 {
1814 result = UCOL_THAI;
1815 }
1816 else {
1817 /*result = ucmpe32_get(coll->mapping, ch);*/
1818 result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
1819 }
1820 if (result > UCOL_NOT_FOUND) {
1821 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
1822 }
1823 if (result == UCOL_NOT_FOUND) {
/* nothing usable in coll: retry with the UCA collator */
1824 if (!isAtStartPrevIterate(data) &&
1825 ucol_contractionEndCP(ch, data->coll)) {
1826 result = UCOL_CONTRACTION;
1827 }
1828 else {
1829 /*result = ucmpe32_get(UCA->mapping, ch);*/
1830 result = UTRIE_GET32_FROM_LEAD(UCA->mapping, ch);
1831 }
1832
1833 if (result > UCOL_NOT_FOUND) {
1834 result = ucol_prv_getSpecialPrevCE(UCA, ch, result, data, status);
1835 }
1836 }
1837 }
1838 }
1839 }
1840 return result;
1841 }
1842
1843
1844 /* ucol_getPrevCE, out-of-line version for use from other files. */
1845 U_CAPI uint32_t U_EXPORT2
1846 ucol_getPrevCE(const UCollator *coll, collIterate *data,
1847 UErrorCode *status) {
1848 return ucol_IGetPrevCE(coll, data, status);
1849 }
1850
1851
1852 /* this should be connected to special Jamo handling */
1853 U_CAPI uint32_t U_EXPORT2
1854 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
1855 collIterate colIt;
1856 uint32_t order;
1857 IInit_collIterate(coll, &u, 1, &colIt);
1858 order = ucol_IGetNextCE(coll, &colIt, status);
1859 /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
1860 return order;
1861 }
1862
1863 /**
1864 * Inserts the argument character into the end of the buffer pushing back the
1865 * null terminator.
1866 * @param data collIterate struct data
1867 * @param pNull pointer to the null termination
1868 * @param ch character to be appended
1869 * @return the position of the new addition
1870 */
1871 static
1872 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
1873 {
1874 uint32_t size = data->writableBufSize;
1875 UChar *newbuffer;
1876 const uint32_t incsize = 5;
1877
1878 if ((data->writableBuffer + size) > (pNull + 1)) {
1879 *pNull = ch;
1880 *(pNull + 1) = 0;
1881 return pNull;
1882 }
1883
1884 /*
1885 buffer will always be null terminated at the end.
1886 giving extra space since it is likely that more characters will be added.
1887 */
1888 size += incsize;
1889 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
1890 if(newbuffer != NULL) { // something wrong, but no status
1891 uprv_memcpy(newbuffer, data->writableBuffer,
1892 data->writableBufSize * sizeof(UChar));
1893
1894 freeHeapWritableBuffer(data);
1895 data->writableBufSize = size;
1896 data->writableBuffer = newbuffer;
1897
1898 newbuffer = newbuffer + data->writableBufSize;
1899 *newbuffer = ch;
1900 *(newbuffer + 1) = 0;
1901 }
1902 return newbuffer;
1903 }
1904
1905 /**
1906 * Inserts the argument string into the end of the buffer pushing back the
1907 * null terminator.
1908 * @param data collIterate struct data
1909 * @param pNull pointer to the null termination
1910 * @param string to be appended
1911 * @param length of the string to be appended
1912 * @return the position of the new addition
1913 */
1914 static
1915 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
1916 int32_t length)
1917 {
1918 uint32_t size = pNull - data->writableBuffer;
1919 UChar *newbuffer;
1920
1921 if (data->writableBuffer + data->writableBufSize > pNull + length + 1) {
1922 uprv_memcpy(pNull, str, length * sizeof(UChar));
1923 *(pNull + length) = 0;
1924 return pNull;
1925 }
1926
1927 /*
1928 buffer will always be null terminated at the end.
1929 giving extra space since it is likely that more characters will be added.
1930 */
1931 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1));
1932 if(newbuffer != NULL) {
1933 uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar));
1934 uprv_memcpy(newbuffer + size, str, length * sizeof(UChar));
1935
1936 freeHeapWritableBuffer(data);
1937 data->writableBufSize = size + length + 1;
1938 data->writableBuffer = newbuffer;
1939 }
1940
1941 return newbuffer;
1942 }
1943
1944 /**
1945 * Special normalization function for contraction in the forwards iterator.
1946 * This normalization sequence will place the current character at source->pos
1947 * and its following normalized sequence into the buffer.
1948 * The fcd position, pos will be changed.
1949 * pos will now point to positions in the buffer.
1950 * Flags will be changed accordingly.
1951 * @param data collation iterator data
1952 */
1953 static
1954 inline void normalizeNextContraction(collIterate *data)
1955 {
1956 UChar *buffer = data->writableBuffer;
1957 uint32_t buffersize = data->writableBufSize;
1958 uint32_t strsize;
1959 UErrorCode status = U_ZERO_ERROR;
1960 /* because the pointer points to the next character */
1961 UChar *pStart = data->pos - 1;
1962 UChar *pEnd;
1963 uint32_t normLen;
1964 UChar *pStartNorm;
1965
1966 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
1967 *data->writableBuffer = *(pStart - 1);
1968 strsize = 1;
1969 }
1970 else {
1971 strsize = u_strlen(data->writableBuffer);
1972 }
1973
1974 pEnd = data->fcdPosition;
1975
1976 normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
1977 &status);
1978
1979 if (buffersize <= normLen + strsize) {
1980 uint32_t size = strsize + normLen + 1;
1981 UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
1982 if(temp != NULL) {
1983 uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
1984 freeHeapWritableBuffer(data);
1985 data->writableBuffer = temp;
1986 data->writableBufSize = size;
1987 data->flags |= UCOL_ITER_ALLOCATED;
1988 }
1989 }
1990
1991 status = U_ZERO_ERROR;
1992 pStartNorm = buffer + strsize;
1993 /* null-termination will be added here */
1994 unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
1995 normLen + 1, &status);
1996
1997 data->pos = data->writableBuffer + strsize;
1998 data->origFlags = data->flags;
1999 data->flags |= UCOL_ITER_INNORMBUF;
2000 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2001 }
2002
2003 /**
2004 * Contraction character management function that returns the next character
2005 * for the forwards iterator.
2006 * Does nothing if the next character is in buffer and not the first character
2007 * in it.
2008 * Else it checks next character in data string to see if it is normalizable.
2009 * If it is not, the character is simply copied into the buffer, else
2010 * the whole normalized substring is copied into the buffer, including the
2011 * current character.
2012 * @param data collation element iterator data
2013 * @return next character
2014 */
2015 static
2016 inline UChar getNextNormalizedChar(collIterate *data)
2017 {
2018 UChar nextch;
2019 UChar ch;
2020 // Here we need to add the iterator code. One problem is the way
2021 // end of string is handled. If we just return next char, it could
2022 // be the sentinel. Most of the cases already check for this, but we
2023 // need to be sure.
2024 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2025 /* if no normalization and not in buffer. */
2026 if(data->flags & UCOL_USE_ITERATOR) {
2027 return (UChar)data->iterator->next(data->iterator);
2028 } else {
2029 return *(data->pos ++);
2030 }
2031 }
2032
2033 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2034 //normalizeIterator(data);
2035 //}
2036
2037 UChar *pEndWritableBuffer = NULL;
2038 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2039 if ((innormbuf && *data->pos != 0) ||
2040 (data->fcdPosition != NULL && !innormbuf &&
2041 data->pos < data->fcdPosition)) {
2042 /*
2043 if next character is in normalized buffer, no further normalization
2044 is required
2045 */
2046 return *(data->pos ++);
2047 }
2048
2049 if (data->flags & UCOL_ITER_HASLEN) {
2050 /* in data string */
2051 if (data->pos + 1 == data->endp) {
2052 return *(data->pos ++);
2053 }
2054 }
2055 else {
2056 if (innormbuf) {
2057 // inside the normalization buffer, but at the end
2058 // (since we encountered zero). This means, in the
2059 // case we're using char iterator, that we need to
2060 // do another round of normalization.
2061 //if(data->origFlags & UCOL_USE_ITERATOR) {
2062 // we need to restore original flags,
2063 // otherwise, we'll lose them
2064 //data->flags = data->origFlags;
2065 //normalizeIterator(data);
2066 //return *(data->pos++);
2067 //} else {
2068 /*
2069 in writable buffer, at this point fcdPosition can not be
2070 pointing to the end of the data string. see contracting tag.
2071 */
2072 if(data->fcdPosition) {
2073 if (*(data->fcdPosition + 1) == 0 ||
2074 data->fcdPosition + 1 == data->endp) {
2075 /* at the end of the string, dump it into the normalizer */
2076 data->pos = insertBufferEnd(data, data->pos,
2077 *(data->fcdPosition)) + 1;
2078 return *(data->fcdPosition ++);
2079 }
2080 pEndWritableBuffer = data->pos;
2081 data->pos = data->fcdPosition;
2082 } else if(data->origFlags & UCOL_USE_ITERATOR) {
2083 // if we are here, we're using a normalizing iterator.
2084 // we should just continue further.
2085 data->flags = data->origFlags;
2086 data->pos = NULL;
2087 return (UChar)data->iterator->next(data->iterator);
2088 }
2089 //}
2090 }
2091 else {
2092 if (*(data->pos + 1) == 0) {
2093 return *(data->pos ++);
2094 }
2095 }
2096 }
2097
2098 ch = *data->pos ++;
2099 nextch = *data->pos;
2100
2101 /*
2102 * if the current character is not fcd.
2103 * Trailing combining class == 0.
2104 */
2105 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2106 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2107 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2108 /*
2109 Need a more complete FCD check and possible normalization.
2110 normalize substring will be appended to buffer
2111 */
2112 if (collIterFCD(data)) {
2113 normalizeNextContraction(data);
2114 return *(data->pos ++);
2115 }
2116 else if (innormbuf) {
2117 /* fcdposition shifted even when there's no normalization, if we
2118 don't input the rest into this, we'll get the wrong position when
2119 we reach the end of the writableBuffer */
2120 int32_t length = data->fcdPosition - data->pos + 1;
2121 data->pos = insertBufferEnd(data, pEndWritableBuffer,
2122 data->pos - 1, length);
2123 return *(data->pos ++);
2124 }
2125 }
2126
2127 if (innormbuf) {
2128 /*
2129 no normalization is to be done hence only one character will be
2130 appended to the buffer.
2131 */
2132 data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1;
2133 }
2134
2135 /* points back to the pos in string */
2136 return ch;
2137 }
2138
2139 static
2140 inline void goBackOne(collIterate *data) {
2141 # if 0
2142 // somehow, it looks like we need to keep iterator synced up
2143 // at all times, as above.
2144 if(data->pos) {
2145 data->pos--;
2146 }
2147 if(data->iterator) {
2148 data->iterator->previous(data->iterator);
2149 }
2150 #endif
2151 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
2152 data->iterator->previous(data->iterator);
2153 }
2154 if(data->pos) {
2155 data->pos --;
2156 }
2157 }
2158
2159
2160 /**
2161 * Function to copy the buffer into writableBuffer and sets the fcd position to
2162 * the correct position
2163 * @param source data string source
2164 * @param buffer character buffer
2165 * @param tempdb current position in buffer that has been used up
2166 */
2167 static
2168 inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer,
2169 UChar *tempdb)
2170 {
2171 /* okay confusing part here. to ensure that the skipped characters are
2172 considered later, we need to place it in the appropriate position in the
2173 normalization buffer and reassign the pos pointer. simple case if pos
2174 reside in string, simply copy to normalization buffer and
2175 fcdposition = pos, pos = start of normalization buffer. if pos in
2176 normalization buffer, we'll insert the copy infront of pos and point pos
2177 to the start of the normalization buffer. why am i doing these copies?
2178 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2179 not require any changes, which be really painful. */
2180 uint32_t length = u_strlen(buffer);;
2181 if (source->flags & UCOL_ITER_INNORMBUF) {
2182 u_strcpy(tempdb, source->pos);
2183 }
2184 else {
2185 source->fcdPosition = source->pos;
2186 source->origFlags = source->flags;
2187 source->flags |= UCOL_ITER_INNORMBUF;
2188 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2189 }
2190
2191 if (length >= source->writableBufSize) {
2192 freeHeapWritableBuffer(source);
2193 source->writableBuffer =
2194 (UChar *)uprv_malloc((length + 1) * sizeof(UChar));
2195 if(source->writableBuffer == NULL) {
2196 return;
2197 }
2198 source->writableBufSize = length;
2199 }
2200
2201 u_strcpy(source->writableBuffer, buffer);
2202 source->pos = source->writableBuffer;
2203 }
2204
2205 /**
2206 * Function to get the discontiguos collation element within the source.
2207 * Note this function will set the position to the appropriate places.
2208 * @param coll current collator used
2209 * @param source data string source
2210 * @param constart index to the start character in the contraction table
2211 * @return discontiguos collation element offset
2212 */
2213 static
2214 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2215 const UChar *constart)
2216 {
2217 /* source->pos currently points to the second combining character after
2218 the start character */
2219 UChar *temppos = source->pos;
2220 UChar buffer[4*UCOL_MAX_BUFFER];
2221 UChar *tempdb = buffer;
2222 const UChar *tempconstart = constart;
2223 uint8_t tempflags = source->flags;
2224 UBool multicontraction = FALSE;
2225 UChar *tempbufferpos = 0;
2226 collIterateState discState;
2227
2228 backupState(source, &discState);
2229
2230 //*tempdb = *(source->pos - 1);
2231 *tempdb = peekCharacter(source, -1);
2232 tempdb ++;
2233 while (TRUE) {
2234 UChar *UCharOffset;
2235 UChar schar,
2236 tchar;
2237 uint32_t result;
2238
2239 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2240 || (peekCharacter(source, 0) == 0 &&
2241 //|| (*source->pos == 0 &&
2242 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2243 source->fcdPosition == NULL ||
2244 source->fcdPosition == source->endp ||
2245 *(source->fcdPosition) == 0 ||
2246 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2247 /* end of string in null terminated string or stopped by a
2248 null character, note fcd does not always point to a base
2249 character after the discontiguos change */
2250 u_getCombiningClass(peekCharacter(source, 0)) == 0) {
2251 //u_getCombiningClass(*(source->pos)) == 0) {
2252 //constart = (UChar *)coll->image + getContractOffset(CE);
2253 if (multicontraction) {
2254 *tempbufferpos = 0;
2255 source->pos = temppos - 1;
2256 setDiscontiguosAttribute(source, buffer, tempdb);
2257 return *(coll->contractionCEs +
2258 (tempconstart - coll->contractionIndex));
2259 }
2260 constart = tempconstart;
2261 break;
2262 }
2263
2264 UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2265 schar = getNextNormalizedChar(source);
2266
2267 while (schar > (tchar = *UCharOffset)) {
2268 UCharOffset++;
2269 }
2270
2271 if (schar != tchar) {
2272 /* not the correct codepoint. we stuff the current codepoint into
2273 the discontiguos buffer and try the next character */
2274 *tempdb = schar;
2275 tempdb ++;
2276 continue;
2277 }
2278 else {
2279 if (u_getCombiningClass(schar) ==
2280 u_getCombiningClass(peekCharacter(source, -2))) {
2281 //u_getCombiningClass(*(source->pos - 2))) {
2282 *tempdb = schar;
2283 tempdb ++;
2284 continue;
2285 }
2286 result = *(coll->contractionCEs +
2287 (UCharOffset - coll->contractionIndex));
2288 }
2289 *tempdb = 0;
2290
2291 if (result == UCOL_NOT_FOUND) {
2292 break;
2293 } else if (isContraction(result)) {
2294 /* this is a multi-contraction*/
2295 tempconstart = (UChar *)coll->image + getContractOffset(result);
2296 if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2297 != UCOL_NOT_FOUND) {
2298 multicontraction = TRUE;
2299 temppos = source->pos + 1;
2300 tempbufferpos = buffer + u_strlen(buffer);
2301 }
2302 } else {
2303 setDiscontiguosAttribute(source, buffer, tempdb);
2304 return result;
2305 }
2306 }
2307
2308 /* no problems simply reverting just like that,
2309 if we are in string before getting into this function, points back to
2310 string hence no problem.
2311 if we are in normalization buffer before getting into this function,
2312 since we'll never use another normalization within this function, we
2313 know that fcdposition points to a base character. the normalization buffer
2314 never change, hence this revert works. */
2315 loadState(source, &discState, TRUE);
2316 goBackOne(source);
2317
2318 //source->pos = temppos - 1;
2319 source->flags = tempflags;
2320 return *(coll->contractionCEs + (constart - coll->contractionIndex));
2321 }
2322
2323 static
2324 inline UBool isNonChar(UChar32 cp) {
2325 if ((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF)) {
2326 return TRUE;
2327 }
2328 return FALSE;
2329 }
2330
2331 /* now uses Mark's getImplicitPrimary code */
2332 static
2333 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2334 if(isNonChar(cp)) {
2335 return 0;
2336 }
2337 uint32_t r = getImplicitPrimary(cp);
2338 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2339 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2340 }
2341
2342 /**
2343 * Inserts the argument character into the front of the buffer replacing the
2344 * front null terminator.
2345 * @param data collation element iterator data
2346 * @param pNull pointer to the null terminator
2347 * @param ch character to be appended
2348 * @return positon of added character
2349 */
2350 static
2351 inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
2352 {
2353 uint32_t size = data->writableBufSize;
2354 UChar *end;
2355 UChar *newbuffer;
2356 const uint32_t incsize = 5;
2357
2358 if (pNull > data->writableBuffer + 1) {
2359 *pNull = ch;
2360 *(pNull - 1) = 0;
2361 return pNull;
2362 }
2363
2364 /*
2365 buffer will always be null terminated infront.
2366 giving extra space since it is likely that more characters will be added.
2367 */
2368 size += incsize;
2369 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2370 if(newbuffer == NULL) {
2371 return NULL;
2372 }
2373 end = newbuffer + incsize;
2374 uprv_memcpy(end, data->writableBuffer,
2375 data->writableBufSize * sizeof(UChar));
2376 *end = ch;
2377 *(end - 1) = 0;
2378
2379 freeHeapWritableBuffer(data);
2380
2381 data->writableBufSize = size;
2382 data->writableBuffer = newbuffer;
2383 return end;
2384 }
2385
2386 /**
2387 * Special normalization function for contraction in the previous iterator.
2388 * This normalization sequence will place the current character at source->pos
2389 * and its following normalized sequence into the buffer.
2390 * The fcd position, pos will be changed.
2391 * pos will now point to positions in the buffer.
2392 * Flags will be changed accordingly.
2393 * @param data collation iterator data
2394 */
2395 static
2396 inline void normalizePrevContraction(collIterate *data)
2397 {
2398 UChar *buffer = data->writableBuffer;
2399 uint32_t buffersize = data->writableBufSize;
2400 uint32_t nulltermsize;
2401 UErrorCode status = U_ZERO_ERROR;
2402 UChar *pEnd = data->pos + 1; /* End normalize + 1 */
2403 UChar *pStart;
2404 uint32_t normLen;
2405 UChar *pStartNorm;
2406
2407 if (data->flags & UCOL_ITER_HASLEN) {
2408 /*
2409 normalization buffer not used yet, we'll pull down the next
2410 character into the end of the buffer
2411 */
2412 *(buffer + (buffersize - 1)) = *(data->pos + 1);
2413 nulltermsize = buffersize - 1;
2414 }
2415 else {
2416 nulltermsize = buffersize;
2417 UChar *temp = buffer + (nulltermsize - 1);
2418 while (*(temp --) != 0) {
2419 nulltermsize --;
2420 }
2421 }
2422
2423 /* Start normalize */
2424 if (data->fcdPosition == NULL) {
2425 pStart = data->string;
2426 }
2427 else {
2428 pStart = data->fcdPosition + 1;
2429 }
2430
2431 normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2432 &status);
2433
2434 if (nulltermsize <= normLen) {
2435 uint32_t size = buffersize - nulltermsize + normLen + 1;
2436 UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2437 if(temp != NULL) {
2438 nulltermsize = normLen + 1;
2439 uprv_memcpy(temp + normLen, buffer,
2440 sizeof(UChar) * (buffersize - nulltermsize));
2441 freeHeapWritableBuffer(data);
2442 data->writableBuffer = temp;
2443 data->writableBufSize = size;
2444 }
2445 }
2446
2447 status = U_ZERO_ERROR;
2448 /*
2449 this puts the null termination infront of the normalized string instead
2450 of the end
2451 */
2452 pStartNorm = buffer + (nulltermsize - normLen);
2453 *(pStartNorm - 1) = 0;
2454 unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
2455 &status);
2456
2457 data->pos = data->writableBuffer + nulltermsize;
2458 data->origFlags = data->flags;
2459 data->flags |= UCOL_ITER_INNORMBUF;
2460 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2461 }
2462
2463 /**
2464 * Contraction character management function that returns the previous character
2465 * for the backwards iterator.
2466 * Does nothing if the previous character is in buffer and not the first
2467 * character in it.
2468 * Else it checks previous character in data string to see if it is
2469 * normalizable.
2470 * If it is not, the character is simply copied into the buffer, else
2471 * the whole normalized substring is copied into the buffer, including the
2472 * current character.
2473 * @param data collation element iterator data
2474 * @return previous character
2475 */
2476 static
2477 inline UChar getPrevNormalizedChar(collIterate *data)
2478 {
2479 UChar prevch;
2480 UChar ch;
2481 UChar *start;
2482 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2483 UChar *pNull = NULL;
2484 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2485 (innormbuf && *(data->pos - 1) != 0)) {
2486 /*
2487 if no normalization.
2488 if previous character is in normalized buffer, no further normalization
2489 is required
2490 */
2491 if(data->flags & UCOL_USE_ITERATOR) {
2492 data->iterator->move(data->iterator, -1, UITER_CURRENT);
2493 return (UChar)data->iterator->next(data->iterator);
2494 } else {
2495 return *(data->pos - 1);
2496 }
2497 }
2498
2499 start = data->pos;
2500 if (data->flags & UCOL_ITER_HASLEN) {
2501 /* in data string */
2502 if ((start - 1) == data->string) {
2503 return *(start - 1);
2504 }
2505 start --;
2506 ch = *start;
2507 prevch = *(start - 1);
2508 }
2509 else {
2510 /*
2511 in writable buffer, at this point fcdPosition can not be NULL.
2512 see contracting tag.
2513 */
2514 if (data->fcdPosition == data->string) {
2515 /* at the start of the string, just dump it into the normalizer */
2516 insertBufferFront(data, data->pos - 1, *(data->fcdPosition));
2517 data->fcdPosition = NULL;
2518 return *(data->pos - 1);
2519 }
2520 pNull = data->pos - 1;
2521 start = data->fcdPosition;
2522 ch = *start;
2523 prevch = *(start - 1);
2524 }
2525 /*
2526 * if the current character is not fcd.
2527 * Trailing combining class == 0.
2528 */
2529 if (data->fcdPosition > start &&
2530 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2531 {
2532 /*
2533 Need a more complete FCD check and possible normalization.
2534 normalize substring will be appended to buffer
2535 */
2536 UChar *backuppos = data->pos;
2537 data->pos = start;
2538 if (collPrevIterFCD(data)) {
2539 normalizePrevContraction(data);
2540 return *(data->pos - 1);
2541 }
2542 data->pos = backuppos;
2543 data->fcdPosition ++;
2544 }
2545
2546 if (innormbuf) {
2547 /*
2548 no normalization is to be done hence only one character will be
2549 appended to the buffer.
2550 */
2551 insertBufferFront(data, pNull, ch);
2552 data->fcdPosition --;
2553 }
2554
2555 return ch;
2556 }
2557
2558 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2559 /* It is called by getNextCE */
2560
2561 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2562 collIterateState entryState;
2563 backupState(source, &entryState);
2564 UChar32 cp = ch;
2565
2566 for (;;) {
2567 // This loop will repeat only in the case of contractions, and only when a contraction
2568 // is found and the first CE resulting from that contraction is itself a special
2569 // (an expansion, for example.) All other special CE types are fully handled the
2570 // first time through, and the loop exits.
2571
2572 const uint32_t *CEOffset = NULL;
2573 switch(getCETag(CE)) {
2574 case NOT_FOUND_TAG:
2575 /* This one is not found, and we'll let somebody else bother about it... no more games */
2576 return CE;
2577 case SURROGATE_TAG:
2578 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
2579 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
2580 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
2581 /* we return 0 (completely ignorable - per UCA specification */
2582 {
2583 UChar trail;
2584 collIterateState state;
2585 backupState(source, &state);
2586 if (collIter_eos(source) || !(UTF16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
2587 // we chould have stepped one char forward and it might have turned that it
2588 // was not a trail surrogate. In that case, we have to backup.
2589 loadState(source, &state, TRUE);
2590 return 0;
2591 } else {
2592 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
2593 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, CE&0xFFFFFF, trail);
2594 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
2595 // We need to backup
2596 loadState(source, &state, TRUE);
2597 return CE;
2598 }
2599 // calculate the supplementary code point value, if surrogate was not tailored
2600 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
2601 }
2602 }
2603 break;
2604 case THAI_TAG:
2605 /* Thai/Lao reordering */
2606 if (((source->flags) & UCOL_ITER_INNORMBUF) /* Already Swapped || */
2607 || (source->iterator && !source->iterator->hasNext(source->iterator))
2608 || (source->pos && source->endp == source->pos) /* At end of string. No swap possible || */
2609 /*|| UCOL_ISTHAIBASECONSONANT(*(source->pos)) == 0*/) /* next char not Thai base cons.*/ // This is from the old specs - we now rearrange unconditionally
2610 {
2611 // Treat Thai as a length one expansion */
2612 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2613 CE = *CEOffset++;
2614 }
2615 else
2616 {
2617 // Move the prevowel and the following base Consonant into the normalization buffer
2618 // with their order swapped
2619
2620 source->writableBuffer[0] = peekCharacter(source, 0);
2621 source->writableBuffer[1] = peekCharacter(source, -1);
2622 source->writableBuffer[2] = 0;
2623
2624 if(source->pos) {
2625 source->fcdPosition = source->pos+1; // Indicate where to continue in main input string
2626 // after exhausting the writableBuffer
2627 } else if(source->iterator) {
2628 source->iterator->next(source->iterator);
2629 }
2630 source->pos = source->writableBuffer;
2631 source->origFlags = source->flags;
2632 source->flags |= UCOL_ITER_INNORMBUF;
2633 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2634
2635 CE = UCOL_IGNORABLE;
2636 }
2637 break;
2638 case SPEC_PROC_TAG:
2639 {
2640 // Special processing is getting a CE that is preceded by a certain prefix
2641 // Currently this is only needed for optimizing Japanese length and iteration marks.
2642 // When we encouter a special processing tag, we go backwards and try to see if
2643 // we have a match.
2644 // Contraction tables are used - so the whole process is not unlike contraction.
2645 // prefix data is stored backwards in the table.
2646 const UChar *UCharOffset;
2647 UChar schar, tchar;
2648 collIterateState prefixState;
2649 backupState(source, &prefixState);
2650 loadState(source, &entryState, TRUE);
2651 goBackOne(source); // We want to look at the point where we entered - actually one
2652 // before that...
2653
2654 for(;;) {
2655 // This loop will run once per source string character, for as long as we
2656 // are matching a potential contraction sequence
2657
2658 // First we position ourselves at the begining of contraction sequence
2659 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2660 if (collIter_bos(source)) {
2661 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2662 break;
2663 }
2664 schar = getPrevNormalizedChar(source);
2665 goBackOne(source);
2666
2667 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2668 UCharOffset++;
2669 }
2670
2671 if (schar == tchar) {
2672 // Found the source string char in the table.
2673 // Pick up the corresponding CE from the table.
2674 CE = *(coll->contractionCEs +
2675 (UCharOffset - coll->contractionIndex));
2676 }
2677 else
2678 {
2679 // if there is a completely ignorable code point in the middle of
2680 // a prefix, we need to act as if it's not there
2681 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
2682 // lone surrogates cannot be set to zero as it would break other processing
2683 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
2684 // it's easy for BMP code points
2685 if(isZeroCE == 0) {
2686 continue;
2687 } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
2688 // for supplementary code points, we have to check the next one
2689 // situations where we are going to ignore
2690 // 1. beginning of the string: schar is a lone surrogate
2691 // 2. schar is a lone surrogate
2692 // 3. schar is a trail surrogate in a valid surrogate sequence
2693 // that is explicitly set to zero.
2694 if (!collIter_bos(source)) {
2695 UChar lead;
2696 if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
2697 isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
2698 if(getCETag(isZeroCE) == SURROGATE_TAG) {
2699 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
2700 if(finalCE == 0) {
2701 // this is a real, assigned completely ignorable code point
2702 goBackOne(source);
2703 continue;
2704 }
2705 }
2706 } else {
2707 // lone surrogate, completely ignorable
2708 continue;
2709 }
2710 } else {
2711 // lone surrogate at the beggining, completely ignorable
2712 continue;
2713 }
2714 }
2715 // Source string char was not in the table.
2716 // We have not found the prefix.
2717 CE = *(coll->contractionCEs +
2718 (ContractionStart - coll->contractionIndex));
2719 }
2720
2721 if(!isPrefix(CE)) {
2722 // The source string char was in the contraction table, and the corresponding
2723 // CE is not a prefix CE. We found the prefix, break
2724 // out of loop, this CE will end up being returned. This is the normal
2725 // way out of prefix handling when the source actually contained
2726 // the prefix.
2727 break;
2728 }
2729 }
2730 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2731 loadState(source, &prefixState, TRUE);
2732 if(source->origFlags & UCOL_USE_ITERATOR) {
2733 source->flags = source->origFlags;
2734 }
2735 } else { // prefix search was a failure, we have to backup all the way to the start
2736 loadState(source, &entryState, TRUE);
2737 }
2738 break;
2739 }
2740 case CONTRACTION_TAG:
2741 {
2742 /* This should handle contractions */
2743 collIterateState state;
2744 backupState(source, &state);
2745 uint32_t firstCE = UCOL_NOT_FOUND;
2746 const UChar *UCharOffset;
2747 UChar schar, tchar;
2748
2749 for (;;) {
2750 /* This loop will run once per source string character, for as long as we */
2751 /* are matching a potential contraction sequence */
2752
2753 /* First we position ourselves at the begining of contraction sequence */
2754 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2755
2756 if (collIter_eos(source)) {
2757 // Ran off the end of the source string.
2758 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2759 // So we'll pick whatever we have at the point...
2760 if (CE == UCOL_NOT_FOUND) {
2761 // back up the source over all the chars we scanned going into this contraction.
2762 CE = firstCE;
2763 loadState(source, &state, TRUE);
2764 if(source->origFlags & UCOL_USE_ITERATOR) {
2765 source->flags = source->origFlags;
2766 }
2767 }
2768 break;
2769 }
2770
2771 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2772 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2773
2774 schar = getNextNormalizedChar(source);
2775 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2776 UCharOffset++;
2777 }
2778
2779 if (schar == tchar) {
2780 // Found the source string char in the contraction table.
2781 // Pick up the corresponding CE from the table.
2782 CE = *(coll->contractionCEs +
2783 (UCharOffset - coll->contractionIndex));
2784 }
2785 else
2786 {
2787 // if there is a completely ignorable code point in the middle of
2788 // contraction, we need to act as if it's not there
2789 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
2790 // it's easy for BMP code points
2791 if(isZeroCE == 0) {
2792 continue;
2793 } else if(UTF_IS_LEAD(schar)) {
2794 if(!collIter_eos(source)) {
2795 backupState(source, &state);
2796 UChar trail = getNextNormalizedChar(source);
2797 if(UTF_IS_TRAIL(trail)) { // do stuff with trail
2798 if(getCETag(isZeroCE) == SURROGATE_TAG) {
2799 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, trail);
2800 if(finalCE == 0) {
2801 continue;
2802 }
2803 }
2804 } else {
2805 // broken surrogate sequence, thus completely ignorable
2806 loadState(source, &state, TRUE);
2807 continue;
2808 }
2809 loadState(source, &state, TRUE);
2810 } else { // no more characters, so broken surrogate pair...
2811 // this contraction will ultimately fail, but not because of us
2812 continue;
2813 }
2814 } // else if(UTF_IS_LEAD(schar))
2815
2816 // Source string char was not in contraction table.
2817 // Unless we have a discontiguous contraction, we have finished
2818 // with this contraction.
2819 uint8_t sCC;
2820 if (schar < 0x300 ||
2821 maxCC == 0 ||
2822 (sCC = i_getCombiningClass(schar, coll)) == 0 ||
2823 sCC>maxCC ||
2824 (allSame != 0 && sCC == maxCC) ||
2825 collIter_eos(source)) {
2826 // Contraction can not be discontiguous.
2827 goBackOne(source); // back up the source string by one,
2828 // because the character we just looked at was
2829 // not part of the contraction. */
2830 CE = *(coll->contractionCEs +
2831 (ContractionStart - coll->contractionIndex));
2832 } else {
2833 //
2834 // Contraction is possibly discontiguous.
2835 // Scan more of source string looking for a match
2836 //
2837 UChar tempchar;
2838 /* find the next character if schar is not a base character
2839 and we are not yet at the end of the string */
2840 tempchar = getNextNormalizedChar(source);
2841 goBackOne(source);
2842 if (i_getCombiningClass(tempchar, coll) == 0) {
2843 goBackOne(source);
2844 /* Spit out the last char of the string, wasn't tasty enough */
2845 CE = *(coll->contractionCEs +
2846 (ContractionStart - coll->contractionIndex));
2847 } else {
2848 CE = getDiscontiguous(coll, source, ContractionStart);
2849 }
2850 }
2851 } // else after if(schar == tchar)
2852
2853 if(CE == UCOL_NOT_FOUND) {
2854 /* The Source string did not match the contraction that we were checking. */
2855 /* Back up the source position to undo the effects of having partially */
2856 /* scanned through what ultimately proved to not be a contraction. */
2857 loadState(source, &state, TRUE);
2858 CE = firstCE;
2859 if(source->origFlags & UCOL_USE_ITERATOR) {
2860 source->flags = source->origFlags;
2861 }
2862 break;
2863 }
2864
2865 if(!isContraction(CE)) {
2866 // The source string char was in the contraction table, and the corresponding
2867 // CE is not a contraction CE. We completed the contraction, break
2868 // out of loop, this CE will end up being returned. This is the normal
2869 // way out of contraction handling when the source actually contained
2870 // the contraction.
2871 break;
2872 }
2873
2874
2875 // The source string char was in the contraction table, and the corresponding
2876 // CE is IS a contraction CE. We will continue looping to check the source
2877 // string for the remaining chars in the contraction.
2878 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2879 if(tempCE != UCOL_NOT_FOUND) {
2880 // We have scanned a a section of source string for which there is a
2881 // CE from the contraction table. Remember the CE and scan position, so
2882 // that we can return to this point if further scanning fails to
2883 // match a longer contraction sequence.
2884 firstCE = tempCE;
2885
2886 goBackOne(source);
2887 backupState(source, &state);
2888 getNextNormalizedChar(source);
2889
2890 // Another way to do this is:
2891 //collIterateState tempState;
2892 //backupState(source, &tempState);
2893 //goBackOne(source);
2894 //backupState(source, &state);
2895 //loadState(source, &tempState, TRUE);
2896
2897 // The problem is that for incomplete contractions we have to remember the previous
2898 // position. Before, the only thing I needed to do was state.pos--;
2899 // After iterator introduction and especially after introduction of normalizing
2900 // iterators, it became much more difficult to decrease the saved state.
2901 // I'm not yet sure which of the two methods above is faster.
2902 }
2903 } // for(;;)
2904 break;
2905 } // case CONTRACTION_TAG:
2906 case LONG_PRIMARY_TAG:
2907 {
2908 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2909 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2910 return CE;
2911 }
2912 case EXPANSION_TAG:
2913 {
2914 /* This should handle expansion. */
2915 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2916 /* I have to decide where continuations are going to be dealt with */
2917 uint32_t size;
2918 uint32_t i; /* general counter */
2919 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2920 size = getExpansionCount(CE);
2921 CE = *CEOffset++;
2922 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2923 for(i = 1; i<size; i++) {
2924 *(source->CEpos++) = *CEOffset++;
2925 }
2926 } else { /* else, we do */
2927 while(*CEOffset != 0) {
2928 *(source->CEpos++) = *CEOffset++;
2929 }
2930 }
2931 return CE;
2932 }
2933 case DIGIT_TAG:
2934 {
2935 /*
2936 We do a check to see if we want to collate digits as numbers; if so we generate
2937 a custom collation key. Otherwise we pull out the value stored in the expansion table.
2938 */
2939 uint32_t size;
2940 uint32_t i; /* general counter */
2941
2942 if (coll->numericCollation == UCOL_ON){
2943 UChar32 char32 = 0;
2944
2945 uint32_t digIndx = 0;
2946 uint32_t endIndex = 0;
2947 uint32_t trailingZeroIndex = 0;
2948
2949 uint32_t primWeight = 0;
2950
2951 uint32_t digVal = 0;
2952 uint8_t collateVal = 0;
2953
2954 UBool nonZeroValReached = false;
2955
2956 uint8_t *numTempBuf;
2957 uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
2958 uint32_t numTempBufSize = UCOL_MAX_BUFFER;
2959
2960 numTempBuf = stackNumTempBuf;
2961 /*
2962 We parse the source string until we hit a char that's NOT a digit.
2963 Use this u_charDigitValue. This might be slow because we have to
2964 handle surrogates...
2965 */
2966
2967 if (U16_IS_LEAD(ch)){
2968 if (!collIter_eos(source))
2969 char32 = U16_GET_SUPPLEMENTARY(ch, getNextNormalizedChar(source));
2970 else
2971 char32 = ch;
2972 }
2973 else
2974 char32 = ch;
2975 digVal = u_charDigitValue(char32);
2976
2977 /*
2978 We pad a zero in front of the first element anyways. This takes
2979 care of the (probably) most common case where people are sorting things followed
2980 by a single digit
2981 */
2982 digIndx++;
2983 for(;;){
2984 // Make sure we have enough space.
2985 if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
2986 {
2987 numTempBufSize *= 2;
2988 if (numTempBuf == stackNumTempBuf){
2989 numTempBuf = (uint8_t *)malloc(sizeof(uint8_t) * numTempBufSize);
2990 memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
2991 }else
2992 realloc(numTempBuf, numTempBufSize);
2993 }
2994
2995 // Skipping over leading zeroes.
2996 if (digVal != 0 || nonZeroValReached){
2997 if (digVal != 0 && !nonZeroValReached)
2998 nonZeroValReached = true;
2999
3000 /*
3001 We parse the digit string into base 100 numbers (this fits into a byte).
3002 We only add to the buffer in twos, thus if we are parsing an odd character,
3003 that serves as the 'tens' digit while the if we are parsing an even one, that
3004 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3005 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3006 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3007 than all the other bytes.
3008 */
3009
3010 if (digIndx % 2 == 1){
3011 collateVal += (uint8_t)digVal;
3012
3013 // We don't enter the low-order-digit case unless we've already seen
3014 // the high order, or for the first digit, which is always non-zero.
3015 if (collateVal != 0)
3016 trailingZeroIndex = 0;
3017
3018 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3019 collateVal = 0;
3020 }
3021 else{
3022 // We drop the collation value into the buffer so if we need to do
3023 // a "front patch" we don't have to check to see if we're hitting the
3024 // last element.
3025 collateVal = (uint8_t)(digVal * 10);
3026
3027 // Check for trailing zeroes.
3028 if (collateVal == 0)
3029 {
3030 if (!trailingZeroIndex)
3031 trailingZeroIndex = (digIndx/2) + 2;
3032 }
3033 else
3034 trailingZeroIndex = 0;
3035
3036 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3037 }
3038 digIndx++;
3039 }
3040
3041 // Get next character.
3042 if (!collIter_eos(source)){
3043 ch = getNextNormalizedChar(source);
3044 if (U16_IS_LEAD(ch)){
3045 if (!collIter_eos(source))
3046 char32 = U16_GET_SUPPLEMENTARY(ch, getNextNormalizedChar(source));
3047 }
3048 else
3049 char32 = ch;
3050
3051 if ((digVal = u_charDigitValue(char32)) == -1){
3052 // Resetting position to point to the next unprocessed char. We
3053 // overshot it when doing our test/set for numbers.
3054 goBackOne(source);
3055 if (char32 > 0xFFFF) // For surrogates.
3056 goBackOne(source);
3057 break;
3058 }
3059 }else
3060 break;
3061 }
3062
3063 if (nonZeroValReached == false){
3064 digIndx = 2;
3065 numTempBuf[2] = 6;
3066 }
3067
3068 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3069 if (digIndx % 2 != 0){
3070 /*
3071 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3072 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3073 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3074 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3075 */
3076
3077 for(i = 2; i < endIndex; i++){
3078 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3079 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3080 }
3081 --digIndx;
3082 }
3083
3084 // Subtract one off of the last byte.
3085 numTempBuf[endIndex-1] -= 1;
3086
3087 /*
3088 We want to skip over the first two slots in the buffer. The first slot
3089 is reserved for the header byte 0x1B. The second slot is for the
3090 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3091 */
3092 numTempBuf[0] = 0x1B;
3093 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3094
3095 // Now transfer the collation key to our collIterate struct.
3096 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3097 size = ((endIndex+1) & ~1)/2;
3098 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3099 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3100 UCOL_BYTE_COMMON; // Tertiary weight.
3101 i = 2; // Reset the index into the buffer.
3102 while(i < endIndex)
3103 {
3104 primWeight = numTempBuf[i++] << 8;
3105 if ( i < endIndex)
3106 primWeight |= numTempBuf[i++];
3107 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3108 }
3109
3110 if (numTempBuf != stackNumTempBuf)
3111 free(numTempBuf);
3112 }
3113 else{
3114 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3115 size = getExpansionCount(CE);
3116 CE = *CEOffset++;
3117 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3118 for(i = 1; i<size; i++) {
3119 *(source->CEpos++) = *CEOffset++;
3120 }
3121 } else { /* else, we do */
3122 while(*CEOffset != 0) {
3123 *(source->CEpos++) = *CEOffset++;
3124 }
3125 }
3126 }
3127 return CE;
3128 }
3129 /* various implicits optimization */
3130 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3131 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3132 //return getImplicit(cp, source, 0x04000000);
3133 return getImplicit(cp, source);
3134 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3135 /* UCA is filled with these. Tailorings are NOT_FOUND */
3136 //return getImplicit(cp, source, 0);
3137 return getImplicit(cp, source);
3138 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3139 return 0; /* broken surrogate sequence */
3140 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3141 UChar nextChar;
3142 if( source->flags & UCOL_USE_ITERATOR) {
3143 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3144 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3145 source->iterator->next(source->iterator);
3146 return getImplicit(cp, source);
3147 } else {
3148 return 0;
3149 }
3150 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3151 U_IS_TRAIL((nextChar=*source->pos))) {
3152 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3153 source->pos++;
3154 return getImplicit(cp, source);
3155 } else {
3156 return 0; /* completely ignorable */
3157 }
3158 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3159 {
3160 const uint32_t
3161 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3162 //const uint32_t LCount = 19;
3163 const uint32_t VCount = 21;
3164 const uint32_t TCount = 28;
3165 //const uint32_t NCount = VCount * TCount; // 588
3166 //const uint32_t SCount = LCount * NCount; // 11172
3167 uint32_t L = ch - SBase;
3168
3169 // divide into pieces
3170
3171 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3172 L /= TCount;
3173 uint32_t V = L % VCount;
3174 L /= VCount;
3175
3176 // offset them
3177
3178 L += LBase;
3179 V += VBase;
3180 T += TBase;
3181
3182 // return the first CE, but first put the rest into the expansion buffer
3183 if (!source->coll->image->jamoSpecial) { // FAST PATH
3184
3185 /**(source->CEpos++) = ucmpe32_get(UCA->mapping, V);*/
3186 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
3187 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V);
3188 if (T != TBase) {
3189 /**(source->CEpos++) = ucmpe32_get(UCA->mapping, T);*/
3190 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
3191 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T);
3192 }
3193
3194 /*return ucmpe32_get(UCA->mapping, L);*/ // return first one
3195 /*return UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
3196 return UTRIE_GET32_FROM_LEAD(coll->mapping, L);
3197
3198 } else { // Jamo is Special
3199 // Since Hanguls pass the FCD check, it is
3200 // guaranteed that we won't be in
3201 // the normalization buffer if something like this happens
3202 // However, if we are using a uchar iterator and normalization
3203 // is ON, the Hangul that lead us here is going to be in that
3204 // normalization buffer. Here we want to restore the uchar
3205 // iterator state and pull out of the normalization buffer
3206 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3207 source->flags = source->origFlags; // restore the iterator
3208 source->pos = NULL;
3209 }
3210 // Move Jamos into normalization buffer
3211 source->writableBuffer[0] = (UChar)L;
3212 source->writableBuffer[1] = (UChar)V;
3213 if (T != TBase) {
3214 source->writableBuffer[2] = (UChar)T;
3215 source->writableBuffer[3] = 0;
3216 } else {
3217 source->writableBuffer[2] = 0;
3218 }
3219
3220 source->fcdPosition = source->pos; // Indicate where to continue in main input string
3221 // after exhausting the writableBuffer
3222 source->pos = source->writableBuffer;
3223 source->origFlags = source->flags;
3224 source->flags |= UCOL_ITER_INNORMBUF;
3225 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3226
3227 return(UCOL_IGNORABLE);
3228 }
3229 }
3230 case CHARSET_TAG:
3231 /* not yet implemented */
3232 /* probably after 1.8 */
3233 return UCOL_NOT_FOUND;
3234 default:
3235 *status = U_INTERNAL_PROGRAM_ERROR;
3236 CE=0;
3237 break;
3238 }
3239 if (CE <= UCOL_NOT_FOUND) break;
3240 }
3241 return CE;
3242 }
3243
3244
3245 /* now uses Mark's getImplicitPrimary code */
3246 static
3247 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3248 if(isNonChar(cp)) {
3249 return 0;
3250 }
3251
3252 uint32_t r = getImplicitPrimary(cp);
3253
3254 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3255 collationSource->toReturn = collationSource->CEpos;
3256 return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3257 }
3258
3259 /**
3260 * This function handles the special CEs like contractions, expansions,
3261 * surrogates, Thai.
3262 * It is called by both getPrevCE
3263 */
3264 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3265 collIterate *source,
3266 UErrorCode *status)
3267 {
3268 const uint32_t *CEOffset = NULL;
3269 UChar *UCharOffset = NULL;
3270 UChar schar;
3271 const UChar *constart = NULL;
3272 uint32_t size;
3273 UChar buffer[UCOL_MAX_BUFFER];
3274 uint32_t *endCEBuffer;
3275 UChar *strbuffer;
3276 int32_t noChars = 0;
3277
3278 for(;;)
3279 {
3280 /* the only ces that loops are thai and contractions */
3281 switch (getCETag(CE))
3282 {
3283 case NOT_FOUND_TAG: /* this tag always returns */
3284 return CE;
3285 case SURROGATE_TAG: /* This is a surrogate pair */
3286 /* essentialy an engaged lead surrogate. */
3287 /* if you have encountered it here, it means that a */
3288 /* broken sequence was encountered and this is an error */
3289 return 0;
3290 case THAI_TAG:
3291 if ((source->flags & UCOL_ITER_INNORMBUF) || /* Already Swapped || */
3292 source->string == source->pos || /* At start of string.|| */
3293 /* previous char not Thai prevowel */
3294 /*UCOL_ISTHAIBASECONSONANT(*(source->pos)) == FALSE ||*/ // This is from the old specs - we now rearrange unconditionally
3295 UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1)) == FALSE)
3296 //UCOL_ISTHAIPREVOWEL(*(source->pos - 1)) == FALSE)
3297 {
3298 /* Treat Thai as a length one expansion */
3299 /* find the offset to expansion table */
3300 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE);
3301 CE = *CEOffset ++;
3302 }
3303 else
3304 {
3305 /*
3306 Move the prevowel and the following base Consonant into the
3307 normalization buffer with their order swapped
3308 */
3309 UChar *tempbuffer = source->writableBuffer +
3310 (source->writableBufSize - 1);
3311 *(tempbuffer - 2) = 0;
3312 *(tempbuffer - 1) = peekCharacter(source, 0);
3313 *(tempbuffer) = peekCharacter(source, -1);
3314
3315 /*
3316 Indicate where to continue in main input string after exhausting
3317 the writableBuffer
3318 */
3319 if (source->pos - 1 == source->string) {
3320 source->fcdPosition = NULL;
3321 } else {
3322 source->fcdPosition = source->pos-2;
3323 }
3324
3325 source->pos = tempbuffer;
3326 source->origFlags = source->flags;
3327 source->flags |= UCOL_ITER_INNORMBUF;
3328 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3329
3330 //CE = UCOL_IGNORABLE;
3331 return(UCOL_IGNORABLE);
3332 }
3333 break;
3334 case SPEC_PROC_TAG:
3335 {
3336 // Special processing is getting a CE that is preceded by a certain prefix
3337 // Currently this is only needed for optimizing Japanese length and iteration marks.
3338 // When we encouter a special processing tag, we go backwards and try to see if
3339 // we have a match.
3340 // Contraction tables are used - so the whole process is not unlike contraction.
3341 // prefix data is stored backwards in the table.
3342 const UChar *UCharOffset;
3343 UChar schar, tchar;
3344 collIterateState prefixState;
3345 backupState(source, &prefixState);
3346 for(;;) {
3347 // This loop will run once per source string character, for as long as we
3348 // are matching a potential contraction sequence
3349
3350 // First we position ourselves at the begining of contraction sequence
3351 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3352
3353 if (collIter_bos(source)) {
3354 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3355 break;
3356 }
3357 schar = getPrevNormalizedChar(source);
3358 goBackOne(source);
3359
3360 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3361 UCharOffset++;
3362 }
3363
3364 if (schar == tchar) {
3365 // Found the source string char in the table.
3366 // Pick up the corresponding CE from the table.
3367 CE = *(coll->contractionCEs +
3368 (UCharOffset - coll->contractionIndex));
3369 }
3370 else
3371 {
3372 // if there is a completely ignorable code point in the middle of
3373 // a prefix, we need to act as if it's not there
3374 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3375 // lone surrogates cannot be set to zero as it would break other processing
3376 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
3377 // it's easy for BMP code points
3378 if(isZeroCE == 0) {
3379 continue;
3380 } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
3381 // for supplementary code points, we have to check the next one
3382 // situations where we are going to ignore
3383 // 1. beginning of the string: schar is a lone surrogate
3384 // 2. schar is a lone surrogate
3385 // 3. schar is a trail surrogate in a valid surrogate sequence
3386 // that is explicitly set to zero.
3387 if (!collIter_bos(source)) {
3388 UChar lead;
3389 if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
3390 isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
3391 if(getCETag(isZeroCE) == SURROGATE_TAG) {
3392 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
3393 if(finalCE == 0) {
3394 // this is a real, assigned completely ignorable code point
3395 goBackOne(source);
3396 continue;
3397 }
3398 }
3399 } else {
3400 // lone surrogate, completely ignorable
3401 continue;
3402 }
3403 } else {
3404 // lone surrogate at the beggining, completely ignorable
3405 continue;
3406 }
3407 }
3408 // Source string char was not in the table.
3409 // We have not found the prefix.
3410 CE = *(coll->contractionCEs +
3411 (ContractionStart - coll->contractionIndex));
3412 }
3413
3414 if(!isPrefix(CE)) {
3415 // The source string char was in the contraction table, and the corresponding
3416 // CE is not a prefix CE. We found the prefix, break
3417 // out of loop, this CE will end up being returned. This is the normal
3418 // way out of prefix handling when the source actually contained
3419 // the prefix.
3420 break;
3421 }
3422 }
3423 loadState(source, &prefixState, TRUE);
3424 break;
3425 }
3426
3427 case CONTRACTION_TAG:
3428 /* to ensure that the backwards and forwards iteration matches, we
3429 take the current region of most possible match and pass it through
3430 the forward iteration. this will ensure that the obstinate problem of
3431 overlapping contractions will not occur.
3432 */
3433 schar = peekCharacter(source, 0);
3434 constart = (UChar *)coll->image + getContractOffset(CE);
3435 if (isAtStartPrevIterate(source)
3436 /* commented away contraction end checks after adding the checks
3437 in getPrevCE */) {
3438 /* start of string or this is not the end of any contraction */
3439 CE = *(coll->contractionCEs +
3440 (constart - coll->contractionIndex));
3441 break;
3442 }
3443 strbuffer = buffer;
3444 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3445 *(UCharOffset --) = 0;
3446 noChars = 0;
3447 // have to swap thai characters
3448 while (ucol_unsafeCP(schar, coll) || UCOL_ISTHAIBASECONSONANT(schar)) {
3449 *(UCharOffset) = schar;
3450 noChars++;
3451 UCharOffset --;
3452 schar = getPrevNormalizedChar(source);
3453 goBackOne(source);
3454 // TODO: when we exhaust the contraction buffer,
3455 // it needs to get reallocated. The problem is
3456 // that the size depends on the string which is
3457 // not iterated over. However, since we're travelling
3458 // backwards, we already had to set the iterator at
3459 // the end - so we might as well know where we are?
3460 if (UCharOffset + 1 == buffer) {
3461 /* we have exhausted the buffer */
3462 int32_t newsize = 0;
3463 if(source->pos) { // actually dealing with a position
3464 newsize = source->pos - source->string + 1;
3465 } else { // iterator
3466 newsize = 4 * UCOL_MAX_BUFFER;
3467 }
3468 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3469 (newsize + UCOL_MAX_BUFFER));
3470 /* test for NULL */
3471 if (strbuffer == NULL) {
3472 *status = U_MEMORY_ALLOCATION_ERROR;
3473 return UCOL_NO_MORE_CES;
3474 }
3475 UCharOffset = strbuffer + newsize;
3476 uprv_memcpy(UCharOffset, buffer,
3477 UCOL_MAX_BUFFER * sizeof(UChar));
3478 UCharOffset --;
3479 }
3480 if ((source->pos && (source->pos == source->string ||
3481 ((source->flags & UCOL_ITER_INNORMBUF) &&
3482 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3483 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3484 break;
3485 }
3486 }
3487 /* adds the initial base character to the string */
3488 *(UCharOffset) = schar;
3489 noChars++;
3490
3491 /* a new collIterate is used to simply things, since using the current
3492 collIterate will mean that the forward and backwards iteration will
3493 share and change the same buffers. we don't want to get into that. */
3494 collIterate temp;
3495 //IInit_collIterate(coll, UCharOffset, -1, &temp);
3496 IInit_collIterate(coll, UCharOffset, noChars, &temp);
3497 temp.flags &= ~UCOL_ITER_NORM;
3498
3499 CE = ucol_IGetNextCE(coll, &temp, status);
3500 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3501 while (CE != UCOL_NO_MORE_CES) {
3502 *(source->CEpos ++) = CE;
3503 if (source->CEpos == endCEBuffer) {
3504 /* ran out of CE space, bail.
3505 there's no guarantee of the right character position after
3506 this bail*/
3507 *status = U_BUFFER_OVERFLOW_ERROR;
3508 source->CEpos = source->CEs;
3509 freeHeapWritableBuffer(&temp);
3510 if (strbuffer != buffer) {
3511 uprv_free(strbuffer);
3512 }
3513 return UCOL_NULLORDER;
3514 }
3515 CE = ucol_IGetNextCE(coll, &temp, status);
3516 }
3517 freeHeapWritableBuffer(&temp);
3518 if (strbuffer != buffer) {
3519 uprv_free(strbuffer);
3520 }
3521 source->toReturn = source->CEpos - 1;
3522 if (source->toReturn == source->CEs) {
3523 source->CEpos = source->CEs;
3524 }
3525 return *(source->toReturn);
3526 case LONG_PRIMARY_TAG:
3527 {
3528 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3529 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3530 source->toReturn = source->CEpos - 1;
3531 return *(source->toReturn);
3532 }
3533 case EXPANSION_TAG: /* this tag always returns */
3534 /*
3535 This should handle expansion.
3536 NOTE: we can encounter both continuations and expansions in an expansion!
3537 I have to decide where continuations are going to be dealt with
3538 */
3539 /* find the offset to expansion table */
3540 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3541 size = getExpansionCount(CE);
3542 if (size != 0) {
3543 /*
3544 if there are less than 16 elements in expansion, we don't terminate
3545 */
3546 uint32_t count;
3547 for (count = 0; count < size; count++) {
3548 *(source->CEpos ++) = *CEOffset++;
3549 }
3550 }
3551 else {
3552 /* else, we do */
3553 while (*CEOffset != 0) {
3554 *(source->CEpos ++) = *CEOffset ++;
3555 }
3556 }
3557 source->toReturn = source->CEpos - 1;
3558 // in case of one element expansion, we
3559 // want to immediately return CEpos
3560 if(source->toReturn == source->CEs) {
3561 source->CEpos = source->CEs;
3562 }
3563 return *(source->toReturn);
3564 case DIGIT_TAG:
3565 {
3566 /*
3567 We do a check to see if we want to collate digits as numbers; if so we generate
3568 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3569 */
3570 uint32_t size;
3571 uint32_t i; /* general counter */
3572
3573 if (coll->numericCollation == UCOL_ON){
3574 UChar32 char32 = 0;
3575
3576 uint32_t digIndx = 0;
3577 uint32_t endIndex = 0;
3578 uint32_t leadingZeroIndex = 0;
3579 uint32_t trailingZeroCount = 0;
3580
3581 uint32_t primWeight = 0;
3582
3583 uint32_t digVal = 0;
3584 uint8_t collateVal = 0;
3585
3586 UBool nonZeroValReached = false;
3587
3588 uint8_t *numTempBuf;
3589 uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
3590 uint32_t numTempBufSize = UCOL_MAX_BUFFER;
3591
3592 numTempBuf = stackNumTempBuf;
3593 /*
3594 We parse the source string until we hit a char that's NOT a digit.
3595 Use this u_charDigitValue. This might be slow because we have to
3596 handle surrogates...
3597 */
3598
3599 if (U16_IS_TRAIL (ch)){
3600 if (!collIter_bos(source)){
3601 char32 = U16_GET_SUPPLEMENTARY(getPrevNormalizedChar(source),ch);
3602 goBackOne(source);
3603 }
3604 else
3605 char32 = ch;
3606 }
3607 else
3608 char32 = ch;
3609 digVal = u_charDigitValue(char32);
3610
3611 for(;;){
3612 // Make sure we have enough space.
3613 if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
3614 {
3615 numTempBufSize *= 2;
3616 if (numTempBuf == stackNumTempBuf){
3617 numTempBuf = (uint8_t *)malloc(sizeof(uint8_t) * numTempBufSize);
3618 memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
3619 }else
3620 realloc(numTempBuf, numTempBufSize);
3621 }
3622
3623 // Skip over trailing zeroes, and keep a count of them.
3624 if (digVal != 0)
3625 nonZeroValReached = true;
3626 if (nonZeroValReached){
3627 /*
3628 We parse the digit string into base 100 numbers (this fits into a byte).
3629 We only add to the buffer in twos, thus if we are parsing an odd character,
3630 that serves as the 'tens' digit while the if we are parsing an even one, that
3631 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3632 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3633 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3634 than all the other bytes.
3635
3636 Since we're doing in this reverse we want to put the first digit encountered into the
3637 ones place and the second digit encountered into the tens place.
3638 */
3639
3640 if ((digIndx + trailingZeroCount) % 2 == 1){
3641 // High-order digit case (tens place)
3642 collateVal += digVal * 10;
3643
3644 // We cannot set leadingZeroIndex unless it has been set for the
3645 // low-order digit. Therefore, all we can do for the high-order
3646 // digit is turn it off, never on.
3647 // The only time we will have a high digit without a low is for
3648 // the very first non-zero digit, so no zero check is necessary.
3649 if (collateVal != 0)
3650 leadingZeroIndex = 0;
3651
3652 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3653 collateVal = 0;
3654 }
3655 else{
3656 // Low-order digit case (ones place)
3657 collateVal = digVal;
3658
3659 // Check for leading zeroes.
3660 if (collateVal == 0)
3661 {
3662 if (!leadingZeroIndex)
3663 leadingZeroIndex = (digIndx/2) + 2;
3664 }
3665 else
3666 leadingZeroIndex = 0;
3667
3668 // No need to write to buffer; the case of a last odd digit
3669 // is handled below.
3670 }
3671 ++digIndx;
3672 }
3673 else
3674 ++trailingZeroCount;
3675
3676 if (!collIter_bos(source)){
3677 ch = getPrevNormalizedChar(source);
3678 goBackOne(source);
3679 if (U16_IS_TRAIL(ch)){
3680 if (!collIter_bos(source))
3681 {
3682 char32 = U16_GET_SUPPLEMENTARY(getPrevNormalizedChar(source),ch);
3683 goBackOne(source);
3684 }
3685 }
3686 else
3687 char32 = ch;
3688
3689 if ((digVal = u_charDigitValue(char32)) == -1){
3690 // Don't need to "reverse" the goBackOne call,
3691 // as this points to the next position to process..
3692 if (char32 > 0xFFFF) // For surrogates.
3693 getNextNormalizedChar(source);
3694 break;
3695 }
3696 }else
3697 break;
3698 }
3699
3700 if (nonZeroValReached == false){
3701 digIndx = 2;
3702 trailingZeroCount = 0;
3703 numTempBuf[2] = 6;
3704 }
3705
3706 if ((digIndx + trailingZeroCount) % 2 != 0){
3707 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3708 digIndx += 1;
3709 }
3710
3711 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3712
3713 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3714 numTempBuf[2] -= 1;
3715
3716 /*
3717 We want to skip over the first two slots in the buffer. The first slot
3718 is reserved for the header byte 0x1B. The second slot is for the
3719 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3720 The exponent must be adjusted by the number of leading zeroes, and the number of
3721 trailing zeroes.
3722 */
3723 numTempBuf[0] = 0x1B;
3724 uint32_t exponent = (digIndx+trailingZeroCount)/2;
3725 if (leadingZeroIndex)
3726 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3727 numTempBuf[1] = 0x80 + (exponent & 0x7F);
3728
3729 // Now transfer the collation key to our collIterate struct.
3730 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3731 //size = ((endIndex+1) & ~1)/2;
3732 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3733 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3734 UCOL_BYTE_COMMON; // Tertiary weight.
3735 i = endIndex - 1; // Reset the index into the buffer.
3736 while(i >= 2)
3737 {
3738 primWeight = numTempBuf[i--] << 8;
3739 if ( i >= 2)
3740 primWeight |= numTempBuf[i--];
3741 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3742 }
3743 if (numTempBuf != stackNumTempBuf)
3744 free(numTempBuf);
3745
3746 source->toReturn = source->CEpos -1;
3747 return *(source->toReturn);
3748 }
3749 else{
3750 /* find the offset to expansion table */
3751 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3752 size = getExpansionCount(CE);
3753 if (size != 0) {
3754 /*
3755 if there are less than 16 elements in expansion, we don't terminate
3756 */
3757 uint32_t count;
3758 for (count = 0; count < size; count++) {
3759 *(source->CEpos ++) = *CEOffset++;
3760 }
3761 }
3762 else {
3763 /* else, we do */
3764 while (*CEOffset != 0) {
3765 *(source->CEpos ++) = *CEOffset ++;
3766 }
3767 }
3768 source->toReturn = source->CEpos - 1;
3769 // in case of one element expansion, we
3770 // want to immediately return CEpos
3771 if(source->toReturn == source->CEs) {
3772 source->CEpos = source->CEs;
3773 }
3774 return *(source->toReturn);
3775 }
3776 }
3777 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3778 {
3779 const uint32_t
3780 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3781 //const uint32_t LCount = 19;
3782 const uint32_t VCount = 21;
3783 const uint32_t TCount = 28;
3784 //const uint32_t NCount = VCount * TCount; /* 588 */
3785 //const uint32_t SCount = LCount * NCount; /* 11172 */
3786
3787 uint32_t L = ch - SBase;
3788 /*
3789 divide into pieces.
3790 we do it in this order since some compilers can do % and / in one
3791 operation
3792 */
3793 uint32_t T = L % TCount;
3794 L /= TCount;
3795 uint32_t V = L % VCount;
3796 L /= VCount;
3797
3798 /* offset them */
3799 L += LBase;
3800 V += VBase;
3801 T += TBase;
3802
3803 /*
3804 return the first CE, but first put the rest into the expansion buffer
3805 */
3806 if (!source->coll->image->jamoSpecial)
3807 {
3808 /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, L);*/
3809 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
3810 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, L);
3811 /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, V);*/
3812 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
3813 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V);
3814 if (T != TBase)
3815 /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, T);*/
3816 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
3817 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T);
3818
3819 source->toReturn = source->CEpos - 1;
3820 return *(source->toReturn);
3821 } else {
3822 // Since Hanguls pass the FCD check, it is
3823 // guaranteed that we won't be in
3824 // the normalization buffer if something like this happens
3825 // Move Jamos into normalization buffer
3826 /*
3827 Move the Jamos into the
3828 normalization buffer
3829 */
3830 UChar *tempbuffer = source->writableBuffer +
3831 (source->writableBufSize - 1);
3832 *(tempbuffer) = 0;
3833 if (T != TBase) {
3834 *(tempbuffer - 1) = (UChar)T;
3835 *(tempbuffer - 2) = (UChar)V;
3836 *(tempbuffer - 3) = (UChar)L;
3837 *(tempbuffer - 4) = 0;
3838 } else {
3839 *(tempbuffer - 1) = (UChar)V;
3840 *(tempbuffer - 2) = (UChar)L;
3841 *(tempbuffer - 3) = 0;
3842 }
3843
3844 /*
3845 Indicate where to continue in main input string after exhausting
3846 the writableBuffer
3847 */
3848 if (source->pos == source->string) {
3849 source->fcdPosition = NULL;
3850 } else {
3851 source->fcdPosition = source->pos-1;
3852 }
3853
3854 source->pos = tempbuffer;
3855 source->origFlags = source->flags;
3856 source->flags |= UCOL_ITER_INNORMBUF;
3857 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3858
3859 return(UCOL_IGNORABLE);
3860 }
3861 }
3862 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3863 return 0; /* broken surrogate sequence */
3864 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3865 {
3866 UChar32 cp = 0;
3867 UChar prevChar;
3868 UChar *prev;
3869 if (isAtStartPrevIterate(source)) {
3870 /* we are at the start of the string, wrong place to be at */
3871 return 0;
3872 }
3873 if (source->pos != source->writableBuffer) {
3874 prev = source->pos - 1;
3875 } else {
3876 prev = source->fcdPosition;
3877 }
3878 prevChar = *prev;
3879
3880 /* Handles Han and Supplementary characters here.*/
3881 if (UTF_IS_FIRST_SURROGATE(prevChar)) {
3882 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3883 source->pos = prev;
3884 } else {
3885 return 0; /* completely ignorable */
3886 }
3887 return getPrevImplicit(cp, source);
3888 }
3889 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
3890 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3891 return getPrevImplicit(ch, source);
3892 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3893 return getPrevImplicit(ch, source);
3894 /* UCA is filled with these. Tailorings are NOT_FOUND */
3895 /* not yet implemented */
3896 case CHARSET_TAG: /* this tag always returns */
3897 /* probably after 1.8 */
3898 return UCOL_NOT_FOUND;
3899 default: /* this tag always returns */
3900 *status = U_INTERNAL_PROGRAM_ERROR;
3901 CE=0;
3902 break;
3903 }
3904 if (CE <= UCOL_NOT_FOUND) {
3905 break;
3906 }
3907 }
3908 return CE;
3909 }
3910
3911 /* This should really be a macro */
3912 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
3913 /* anyway */
3914 static
3915 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
3916 #ifdef UCOL_DEBUG
3917 fprintf(stderr, ".");
3918 #endif
3919 uint8_t *newStart = NULL;
3920 uint32_t offset = *secondaries-secStart;
3921
3922 if(secStart==second) {
3923 newStart=(uint8_t*)uprv_malloc(newSize);
3924 if(newStart==NULL) {
3925 *status = U_MEMORY_ALLOCATION_ERROR;
3926 return NULL;
3927 }
3928 uprv_memcpy(newStart, secStart, *secondaries-secStart);
3929 } else {
3930 newStart=(uint8_t*)uprv_realloc(secStart, newSize);
3931 if(newStart==NULL) {
3932 *status = U_MEMORY_ALLOCATION_ERROR;
3933 return NULL;
3934 }
3935 }
3936 *secondaries=newStart+offset;
3937 *secSize=newSize;
3938 return newStart;
3939 }
3940
3941
3942 /* This should really be a macro */
3943 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
3944 /* secondaries in French */
3945 /*
3946 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
3947 uint8_t temp;
3948 while(start<end) {
3949 temp = *start;
3950 *start++ = *end;
3951 *end-- = temp;
3952 }
3953 }
3954 */
3955
/*
 * Reverses, in place, the elements between the pointers start and end (both inclusive).
 * TYPE is the element type. start and end must be modifiable pointer lvalues: the macro
 * advances start and retreats end until they meet, so both are changed on exit.
 * The body is wrapped in do { } while(0) so that an invocation followed by ';' is a
 * single statement and stays correct inside an unbraced if/else.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) do { \
    TYPE tempA; \
    while((start)<(end)) { \
        tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
} while(0)
3964
3965 /****************************************************************************/
3966 /* Following are the sortkey generation functions */
3967 /* */
3968 /****************************************************************************/
3969
3970 /**
3971 * Merge two sort keys.
3972 * This is useful, for example, to combine sort keys from first and last names
3973 * to sort such pairs.
3974 * Merged sort keys consider on each collation level the first part first entirely,
3975 * then the second one.
3976 * It is possible to merge multiple sort keys by consecutively merging
3977 * another one with the intermediate result.
3978 *
3979 * The length of the merge result is the sum of the lengths of the input sort keys
3980 * minus 1.
3981 *
3982 * @param src1 the first sort key
3983 * @param src1Length the length of the first sort key, including the zero byte at the end;
3984 * can be -1 if the function is to find the length
3985 * @param src2 the second sort key
3986 * @param src2Length the length of the second sort key, including the zero byte at the end;
3987 * can be -1 if the function is to find the length
3988 * @param dest the buffer where the merged sort key is written,
3989 * can be NULL if destCapacity==0
3990 * @param destCapacity the number of bytes in the dest buffer
3991 * @return the length of the merged sort key, src1Length+src2Length-1;
3992 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
3993 * in which cases the contents of dest is undefined
3994 *
3995 * @draft
3996 */
3997 U_CAPI int32_t U_EXPORT2
3998 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
3999 const uint8_t *src2, int32_t src2Length,
4000 uint8_t *dest, int32_t destCapacity) {
4001 int32_t destLength;
4002 uint8_t b;
4003
4004 /* check arguments */
4005 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4006 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4007 destCapacity<0 || (destCapacity>0 && dest==NULL)
4008 ) {
4009 /* error, attempt to write a zero byte and return 0 */
4010 if(dest!=NULL && destCapacity>0) {
4011 *dest=0;
4012 }
4013 return 0;
4014 }
4015
4016 /* check lengths and capacity */
4017 if(src1Length<0) {
4018 src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4019 }
4020 if(src2Length<0) {
4021 src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4022 }
4023
4024 destLength=src1Length+src2Length-1;
4025 if(destLength>destCapacity) {
4026 /* the merged sort key does not fit into the destination */
4027 return destLength;
4028 }
4029
4030 /* merge the sort keys with the same number of levels */
4031 while(*src1!=0 && *src2!=0) { /* while both have another level */
4032 /* copy level from src1 not including 00 or 01 */
4033 while((b=*src1)>=2) {
4034 ++src1;
4035 *dest++=b;
4036 }
4037
4038 /* add a 02 merge separator */
4039 *dest++=2;
4040
4041 /* copy level from src2 not including 00 or 01 */
4042 while((b=*src2)>=2) {
4043 ++src2;
4044 *dest++=b;
4045 }
4046
4047 /* if both sort keys have another level, then add a 01 level separator and continue */
4048 if(*src1==1 && *src2==1) {
4049 ++src1;
4050 ++src2;
4051 *dest++=1;
4052 }
4053 }
4054
4055 /*
4056 * here, at least one sort key is finished now, but the other one
4057 * might have some contents left from containing more levels;
4058 * that contents is just appended to the result
4059 */
4060 if(*src1!=0) {
4061 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4062 src2=src1;
4063 }
4064 /* append src2, "the other, unfinished sort key" */
4065 uprv_strcpy((char *)dest, (const char *)src2);
4066
4067 /* trust that neither sort key contained illegally embedded zero bytes */
4068 return destLength;
4069 }
4070
4071 /* sortkey API */
4072 U_CAPI int32_t U_EXPORT2
4073 ucol_getSortKey(const UCollator *coll,
4074 const UChar *source,
4075 int32_t sourceLength,
4076 uint8_t *result,
4077 int32_t resultLength)
4078 {
4079 UErrorCode status = U_ZERO_ERROR;
4080
4081 if(source == NULL) {
4082 // this is actually an error situation, but we would need to
4083 // have an error code to return it. Until we introduce a new
4084 // API, it stays like this
4085 return 0;
4086 }
4087 /* this uses the function pointer that is set in updateinternalstate */
4088 /* currently, there are two funcs: */
4089 /*ucol_calcSortKey(...);*/
4090 /*ucol_calcSortKeySimpleTertiary(...);*/
4091
4092 int32_t keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4093 //((UCollator *)coll)->errorCode = status; /*semantically const */
4094 return keySize;
4095 }
4096
4097 /* this function is called by the C++ API for sortkey generation */
4098 U_CFUNC int32_t
4099 ucol_getSortKeyWithAllocation(const UCollator *coll,
4100 const UChar *source, int32_t sourceLength,
4101 uint8_t **pResult,
4102 UErrorCode *pErrorCode) {
4103 *pResult = 0;
4104 return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4105 }
4106
4107 #define UCOL_FSEC_BUF_SIZE 256
4108
4109 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
4110 /* or if we run out of space while making a sortkey and want to return ASAP */
4111 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4112 UErrorCode status = U_ZERO_ERROR;
4113 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4114 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4115 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4116 UBool compareIdent = (strength == UCOL_IDENTICAL);
4117 UBool doCase = (coll->caseLevel == UCOL_ON);
4118 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4119 //UBool qShifted = shifted && (compareQuad == 0);
4120 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4121 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4122 uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4123 uint8_t *fSecs = fSecsBuff;
4124 uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4125 uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4126
4127 uint32_t variableTopValue = coll->variableTopValue;
4128 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4129 if(doHiragana) {
4130 UCOL_COMMON_BOT4++;
4131 /* allocate one more space for hiragana */
4132 }
4133 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4134
4135 uint32_t order = UCOL_NO_MORE_CES;
4136 uint8_t primary1 = 0;
4137 uint8_t primary2 = 0;
4138 uint8_t secondary = 0;
4139 uint8_t tertiary = 0;
4140 int32_t caseShift = 0;
4141 uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4142
4143 uint8_t caseSwitch = coll->caseSwitch;
4144 uint8_t tertiaryMask = coll->tertiaryMask;
4145 uint8_t tertiaryCommon = coll->tertiaryCommon;
4146
4147 UBool wasShifted = FALSE;
4148 UBool notIsContinuation = FALSE;
4149 uint8_t leadPrimary = 0;
4150
4151
4152 for(;;) {
4153 order = ucol_IGetNextCE(coll, s, &status);
4154 if(order == UCOL_NO_MORE_CES) {
4155 break;
4156 }
4157
4158 if(order == 0) {
4159 continue;
4160 }
4161
4162 notIsContinuation = !isContinuation(order);
4163
4164
4165 if(notIsContinuation) {
4166 tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4167 } else {
4168 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4169 }
4170 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4171 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4172 primary1 = (uint8_t)(order >> 8);
4173
4174
4175 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4176 || (!notIsContinuation && wasShifted))
4177 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4178 /* and other ignorables should be removed if following a shifted code point */
4179 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4180 /* we should just completely ignore it */
4181 continue;
4182 }
4183 if(compareQuad == 0) {
4184 if(c4 > 0) {
4185 currentSize += (c2/UCOL_BOT_COUNT4)+1;
4186 c4 = 0;
4187 }
4188 currentSize++;
4189 if(primary2 != 0) {
4190 currentSize++;
4191 }
4192 }
4193 wasShifted = TRUE;
4194 } else {
4195 wasShifted = FALSE;
4196 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4197 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4198 /* calculate sortkey size */
4199 if(primary1 != UCOL_IGNORABLE) {
4200 if(notIsContinuation) {
4201 if(leadPrimary == primary1) {
4202 currentSize++;
4203 } else {
4204 if(leadPrimary != 0) {
4205 currentSize++;
4206 }
4207 if(primary2 == UCOL_IGNORABLE) {
4208 /* one byter, not compressed */
4209 currentSize++;
4210 leadPrimary = 0;
4211 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4212 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4213 (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4214 /* not compressible */
4215 leadPrimary = 0;
4216 currentSize+=2;
4217 } else { /* compress */
4218 leadPrimary = primary1;
4219 currentSize+=2;
4220 }
4221 }
4222 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4223 currentSize++;
4224 if(primary2 != UCOL_IGNORABLE) {
4225 currentSize++;
4226 }
4227 }
4228 }
4229
4230 if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4231 if(!isFrenchSec){
4232 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4233 c2++;
4234 } else {
4235 if(c2 > 0) {
4236 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4237 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4238 } else {
4239 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4240 }
4241 c2 = 0;
4242 }
4243 currentSize++;
4244 }
4245 } else {
4246 fSecs[fSecsLen++] = secondary;
4247 if(fSecsLen == fSecsMaxLen) {
4248 if(fSecs == fSecsBuff) {
4249 fSecs = (uint8_t *)uprv_malloc(2*fSecsLen);
4250 } else {
4251 fSecs = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4252 }
4253 if(fSecs == NULL) {
4254 status = U_MEMORY_ALLOCATION_ERROR;
4255 return -1;
4256 }
4257 fSecsMaxLen *= 2;
4258 }
4259 if(notIsContinuation) {
4260 if (frenchStartPtr != NULL) {
4261 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4262 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4263 frenchStartPtr = NULL;
4264 }
4265 } else {
4266 if (frenchStartPtr == NULL) {
4267 frenchStartPtr = fSecs+fSecsLen-2;
4268 }
4269 frenchEndPtr = fSecs+fSecsLen-1;
4270 }
4271 }
4272 }
4273
4274 if(doCase) {
4275 if (caseShift == 0) {
4276 currentSize++;
4277 caseShift = UCOL_CASE_SHIFT_START;
4278 }
4279 if((tertiary&0x3F) > 0 && notIsContinuation) {
4280 caseShift--;
4281 if((tertiary &0xC0) != 0) {
4282 if (caseShift == 0) {
4283 currentSize++;
4284 caseShift = UCOL_CASE_SHIFT_START;
4285 }
4286 caseShift--;
4287 }
4288 }
4289 } else {
4290 if(notIsContinuation) {
4291 tertiary ^= caseSwitch;
4292 }
4293 }
4294
4295 tertiary &= tertiaryMask;
4296 if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4297 if (tertiary == tertiaryCommon && notIsContinuation) {
4298 c3++;
4299 } else {
4300 if(c3 > 0) {
4301 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4302 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4303 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4304 } else {
4305 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4306 }
4307 c3 = 0;
4308 }
4309 currentSize++;
4310 }
4311 }
4312
4313 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4314 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4315 if(c4>0) { // Close this part
4316 currentSize += (c4/UCOL_BOT_COUNT4)+1;
4317 c4 = 0;
4318 }
4319 currentSize++; // Add the Hiragana
4320 } else { // This wasn't Hiragana, so we can continue adding stuff
4321 c4++;
4322 }
4323 }
4324
4325 }
4326 }
4327
4328 if(!isFrenchSec){
4329 if(c2 > 0) {
4330 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4331 }
4332 } else {
4333 uint32_t i = 0;
4334 if(frenchStartPtr != NULL) {
4335 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4336 }
4337 for(i = 0; i<fSecsLen; i++) {
4338 secondary = *(fSecs+fSecsLen-i-1);
4339 /* This is compression code. */
4340 if (secondary == UCOL_COMMON2) {
4341 ++c2;
4342 } else {
4343 if(c2 > 0) {
4344 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4345 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4346 } else {
4347 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4348 }
4349 c2 = 0;
4350 }
4351 currentSize++;
4352 }
4353 }
4354 if(c2 > 0) {
4355 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4356 }
4357 if(fSecs != fSecsBuff) {
4358 uprv_free(fSecs);
4359 }
4360 }
4361
4362 if(c3 > 0) {
4363 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4364 }
4365
4366 if(c4 > 0 && compareQuad == 0) {
4367 currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4368 }
4369
4370 if(compareIdent) {
4371 currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4372 }
4373 return currentSize;
4374
4375 }
4376
4377 static
4378 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4379 if (caseShift == 0) {
4380 *(*cases)++ = UCOL_CASE_BYTE_START;
4381 caseShift = UCOL_CASE_SHIFT_START;
4382 }
4383 }
4384
// Counts one more value in size and, only while primaries is still below limit, stores the
// value and advances primaries. size therefore tells how many values were requested,
// even when some of them did not fit.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    ++size;
    if(primaries >= limit) {
        return;
    }
    *primaries = value;
    ++primaries;
}
4394
4395 // Packs the secondary buffer when processing French locale. Adds the terminator.
4396 static
4397 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4398 uint8_t secondary;
4399 int32_t count2 = 0;
4400 uint32_t i = 0, size = 0;
4401 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4402 addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4403 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4404 if(frenchStartPtr != NULL) {
4405 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4406 }
4407 for(i = 0; i<*secsize; i++) {
4408 secondary = *(secondaries-i-1);
4409 /* This is compression code. */
4410 if (secondary == UCOL_COMMON2) {
4411 ++count2;
4412 } else {
4413 if (count2 > 0) {
4414 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4415 while (count2 > UCOL_TOP_COUNT2) {
4416 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4417 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4418 }
4419 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4420 } else {
4421 while (count2 > UCOL_BOT_COUNT2) {
4422 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4423 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4424 }
4425 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4426 }
4427 count2 = 0;
4428 }
4429 addWithIncrement(primaries, primEnd, size, secondary);
4430 }
4431 }
4432 if (count2 > 0) {
4433 while (count2 > UCOL_BOT_COUNT2) {
4434 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4435 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4436 }
4437 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4438 }
4439 *secsize = size;
4440 return primaries;
4441 }
4442
4443 /* This is the sortkey work horse function */
4444 U_CFUNC int32_t U_CALLCONV
4445 ucol_calcSortKey(const UCollator *coll,
4446 const UChar *source,
4447 int32_t sourceLength,
4448 uint8_t **result,
4449 uint32_t resultLength,
4450 UBool allocateSKBuffer,
4451 UErrorCode *status)
4452 {
4453 uint32_t i = 0; /* general purpose counter */
4454
4455 /* Stack allocated buffers for buffers we use */
4456 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4457
4458 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4459
4460 if(U_FAILURE(*status)) {
4461 return 0;
4462 }
4463
4464 if(primaries == NULL && allocateSKBuffer == TRUE) {
4465 primaries = *result = prim;
4466 resultLength = UCOL_PRIMARY_MAX_BUFFER;
4467 }
4468
4469 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4470 caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4471
4472 uint32_t sortKeySize = 1; /* it is always \0 terminated */
4473
4474 UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
4475 UChar *normSource = normBuffer;
4476 int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
4477
4478 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4479
4480 UColAttributeValue strength = coll->strength;
4481
4482 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4483 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4484 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4485 UBool compareIdent = (strength == UCOL_IDENTICAL);
4486 UBool doCase = (coll->caseLevel == UCOL_ON);
4487 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4488 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4489 //UBool qShifted = shifted && (compareQuad == 0);
4490 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4491 const uint8_t *scriptOrder = coll->scriptOrder;
4492
4493 uint32_t variableTopValue = coll->variableTopValue;
4494 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4495 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4496 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4497 uint8_t UCOL_HIRAGANA_QUAD = 0;
4498 if(doHiragana) {
4499 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4500 /* allocate one more space for hiragana, value for hiragana */
4501 }
4502 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4503
4504 /* support for special features like caselevel and funky secondaries */
4505 uint8_t *frenchStartPtr = NULL;
4506 uint8_t *frenchEndPtr = NULL;
4507 uint32_t caseShift = 0;
4508
4509 sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4510
4511 /* If we need to normalize, we'll do it all at once at the beginning! */
4512 UNormalizationMode normMode;
4513 if(compareIdent) {
4514 normMode = UNORM_NFD;
4515 } else if(coll->normalizationMode != UCOL_OFF) {
4516 normMode = UNORM_FCD;
4517 } else {
4518 normMode = UNORM_NONE;
4519 }
4520
4521 if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) {
4522 len = unorm_internalNormalize(normSource, normSourceLen,
4523 source, len,
4524 normMode, FALSE,
4525 status);
4526 if(*status == U_BUFFER_OVERFLOW_ERROR) {
4527 normSourceLen = len;
4528 normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
4529 if(normSource == NULL) {
4530 *status = U_MEMORY_ALLOCATION_ERROR;
4531 return 0;
4532 }
4533 *status = U_ZERO_ERROR;
4534 len = unorm_internalNormalize(normSource, normSourceLen,
4535 source, len,
4536 normMode, FALSE,
4537 status);
4538 }
4539
4540 if(U_FAILURE(*status)) {
4541 return 0;
4542 }
4543 source = normSource;
4544 }
4545
4546 collIterate s;
4547 IInit_collIterate(coll, (UChar *)source, len, &s);
4548 if(source == normSource) {
4549 s.flags &= ~UCOL_ITER_NORM;
4550 }
4551
4552 if(resultLength == 0 || primaries == NULL) {
4553 int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4554 if(normSource != normBuffer) {
4555 uprv_free(normSource);
4556 }
4557 return keyLen;
4558 }
4559 uint8_t *primarySafeEnd = primaries + resultLength - 2;
4560
4561 uint32_t minBufferSize = UCOL_MAX_BUFFER;
4562
4563 uint8_t *primStart = primaries;
4564 uint8_t *secStart = secondaries;
4565 uint8_t *terStart = tertiaries;
4566 uint8_t *caseStart = cases;
4567 uint8_t *quadStart = quads;
4568
4569 uint32_t order = 0;
4570
4571 uint8_t primary1 = 0;
4572 uint8_t primary2 = 0;
4573 uint8_t secondary = 0;
4574 uint8_t tertiary = 0;
4575 uint8_t caseSwitch = coll->caseSwitch;
4576 uint8_t tertiaryMask = coll->tertiaryMask;
4577 int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
4578 uint8_t tertiaryTop = coll->tertiaryTop;
4579 uint8_t tertiaryBottom = coll->tertiaryBottom;
4580 uint8_t tertiaryCommon = coll->tertiaryCommon;
4581 uint8_t caseBits = 0;
4582
4583 UBool finished = FALSE;
4584 UBool wasShifted = FALSE;
4585 UBool notIsContinuation = FALSE;
4586
4587 uint32_t prevBuffSize = 0;
4588
4589 uint32_t count2 = 0, count3 = 0, count4 = 0;
4590 uint8_t leadPrimary = 0;
4591
4592 for(;;) {
4593 for(i=prevBuffSize; i<minBufferSize; ++i) {
4594
4595 order = ucol_IGetNextCE(coll, &s, status);
4596 if(order == UCOL_NO_MORE_CES) {
4597 finished = TRUE;
4598 break;
4599 }
4600
4601 if(order == 0) {
4602 continue;
4603 }
4604
4605 notIsContinuation = !isContinuation(order);
4606
4607 if(notIsContinuation) {
4608 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4609 } else {
4610 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4611 }
4612
4613 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4614 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4615 primary1 = (uint8_t)(order >> 8);
4616
4617 if(notIsContinuation) {
4618 if(scriptOrder != NULL) {
4619 primary1 = scriptOrder[primary1];
4620 }
4621 }
4622
4623 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4624 || (!notIsContinuation && wasShifted))
4625 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4626 /* and other ignorables should be removed if following a shifted code point */
4627 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4628 /* we should just completely ignore it */
4629 continue;
4630 }
4631 if(compareQuad == 0) {
4632 if(count4 > 0) {
4633 while (count4 > UCOL_BOT_COUNT4) {
4634 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4635 count4 -= UCOL_BOT_COUNT4;
4636 }
4637 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4638 count4 = 0;
4639 }
4640 /* We are dealing with a variable and we're treating them as shifted */
4641 /* This is a shifted ignorable */
4642 if(primary1 != 0) { /* we need to check this since we could be in continuation */
4643 *quads++ = primary1;
4644 }
4645 if(primary2 != 0) {
4646 *quads++ = primary2;
4647 }
4648 }
4649 wasShifted = TRUE;
4650 } else {
4651 wasShifted = FALSE;
4652 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4653 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4654 /* regular and simple sortkey calc */
4655 if(primary1 != UCOL_IGNORABLE) {
4656 if(notIsContinuation) {
4657 if(leadPrimary == primary1) {
4658 *primaries++ = primary2;
4659 } else {
4660 if(leadPrimary != 0) {
4661 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4662 }
4663 if(primary2 == UCOL_IGNORABLE) {
4664 /* one byter, not compressed */
4665 *primaries++ = primary1;
4666 leadPrimary = 0;
4667 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4668 (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4669 /* not compressible */
4670 leadPrimary = 0;
4671 *primaries++ = primary1;
4672 *primaries++ = primary2;
4673 } else { /* compress */
4674 *primaries++ = leadPrimary = primary1;
4675 *primaries++ = primary2;
4676 }
4677 }
4678 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4679 *primaries++ = primary1;
4680 if(primary2 != UCOL_IGNORABLE) {
4681 *primaries++ = primary2; /* second part */
4682 }
4683 }
4684 }
4685
4686 if(secondary > compareSec) {
4687 if(!isFrenchSec) {
4688 /* This is compression code. */
4689 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4690 ++count2;
4691 } else {
4692 if (count2 > 0) {
4693 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4694 while (count2 > UCOL_TOP_COUNT2) {
4695 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4696 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4697 }
4698 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
4699 } else {
4700 while (count2 > UCOL_BOT_COUNT2) {
4701 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4702 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4703 }
4704 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4705 }
4706 count2 = 0;
4707 }
4708 *secondaries++ = secondary;
4709 }
4710 } else {
4711 *secondaries++ = secondary;
4712 /* Do the special handling for French secondaries */
4713 /* We need to get continuation elements and do intermediate restore */
4714 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4715 if(notIsContinuation) {
4716 if (frenchStartPtr != NULL) {
4717 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4718 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4719 frenchStartPtr = NULL;
4720 }
4721 } else {
4722 if (frenchStartPtr == NULL) {
4723 frenchStartPtr = secondaries - 2;
4724 }
4725 frenchEndPtr = secondaries-1;
4726 }
4727 }
4728 }
4729
4730 if(doCase) {
4731 doCaseShift(&cases, caseShift);
4732 if(notIsContinuation) {
4733 caseBits = (uint8_t)(tertiary & 0xC0);
4734
4735 if(tertiary != 0) {
4736 if(coll->caseFirst == UCOL_UPPER_FIRST) {
4737 if((caseBits & 0xC0) == 0) {
4738 *(cases-1) |= 1 << (--caseShift);
4739 } else {
4740 *(cases-1) |= 0 << (--caseShift);
4741 /* second bit */
4742 doCaseShift(&cases, caseShift);
4743 *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
4744 }
4745 } else {
4746 if((caseBits & 0xC0) == 0) {
4747 *(cases-1) |= 0 << (--caseShift);
4748 } else {
4749 *(cases-1) |= 1 << (--caseShift);
4750 /* second bit */
4751 doCaseShift(&cases, caseShift);
4752 *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
4753 }
4754 }
4755 }
4756
4757 }
4758 } else {
4759 if(notIsContinuation) {
4760 tertiary ^= caseSwitch;
4761 }
4762 }
4763
4764 tertiary &= tertiaryMask;
4765 if(tertiary > compareTer) {
4766 /* This is compression code. */
4767 /* sequence size check is included in the if clause */
4768 if (tertiary == tertiaryCommon && notIsContinuation) {
4769 ++count3;
4770 } else {
4771 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4772 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4773 tertiary += tertiaryAddition;
4774 }
4775 if (count3 > 0) {
4776 if ((tertiary > tertiaryCommon)) {
4777 while (count3 > coll->tertiaryTopCount) {
4778 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4779 count3 -= (uint32_t)coll->tertiaryTopCount;
4780 }
4781 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
4782 } else {
4783 while (count3 > coll->tertiaryBottomCount) {
4784 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4785 count3 -= (uint32_t)coll->tertiaryBottomCount;
4786 }
4787 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4788 }
4789 count3 = 0;
4790 }
4791 *tertiaries++ = tertiary;
4792 }
4793 }
4794
4795 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4796 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4797 if(count4>0) { // Close this part
4798 while (count4 > UCOL_BOT_COUNT4) {
4799 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4800 count4 -= UCOL_BOT_COUNT4;
4801 }
4802 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4803 count4 = 0;
4804 }
4805 *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
4806 } else { // This wasn't Hiragana, so we can continue adding stuff
4807 count4++;
4808 }
4809 }
4810 }
4811
4812 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
4813 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
4814 IInit_collIterate(coll, (UChar *)source, len, &s);
4815 if(source == normSource) {
4816 s.flags &= ~UCOL_ITER_NORM;
4817 }
4818 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4819 *status = U_BUFFER_OVERFLOW_ERROR;
4820 finished = TRUE;
4821 break;
4822 } else { /* It's much nicer if we can actually reallocate */
4823 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart);
4824 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
4825 if(U_SUCCESS(*status)) {
4826 *result = primStart;
4827 primarySafeEnd = primStart + resultLength - 2;
4828 } else {
4829 IInit_collIterate(coll, (UChar *)source, len, &s);
4830 if(source == normSource) {
4831 s.flags &= ~UCOL_ITER_NORM;
4832 }
4833 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4834 finished = TRUE;
4835 break;
4836 }
4837 }
4838 }
4839 }
4840 if(finished) {
4841 break;
4842 } else {
4843 prevBuffSize = minBufferSize;
4844 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
4845 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
4846 caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
4847 quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
4848 minBufferSize *= 2;
4849 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
4850 IInit_collIterate(coll, (UChar *)source, len, &s);
4851 if(source == normSource) {
4852 s.flags &= ~UCOL_ITER_NORM;
4853 }
4854 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4855 break;
4856 }
4857 }
4858 }
4859
4860 /* Here, we are generally done with processing */
4861 /* bailing out would not be too productive */
4862
4863 if(U_SUCCESS(*status)) {
4864 sortKeySize += (primaries - primStart);
4865 /* we have done all the CE's, now let's put them together to form a key */
4866 if(compareSec == 0) {
4867 if (count2 > 0) {
4868 while (count2 > UCOL_BOT_COUNT2) {
4869 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4870 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4871 }
4872 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4873 }
4874 uint32_t secsize = secondaries-secStart;
4875 if(!isFrenchSec) { // Regular situation, we know the length of secondaries
4876 sortKeySize += secsize;
4877 if(sortKeySize <= resultLength) {
4878 *(primaries++) = UCOL_LEVELTERMINATOR;
4879 uprv_memcpy(primaries, secStart, secsize);
4880 primaries += secsize;
4881 } else {
4882 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
4883 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4884 if(U_SUCCESS(*status)) {
4885 *result = primStart;
4886 *(primaries++) = UCOL_LEVELTERMINATOR;
4887 uprv_memcpy(primaries, secStart, secsize);
4888 primaries += secsize;
4889 }
4890 } else {
4891 *status = U_BUFFER_OVERFLOW_ERROR;
4892 }
4893 }
4894 } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
4895 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
4896 sortKeySize += secsize;
4897 if(sortKeySize <= resultLength) { // if we managed to pack fine
4898 primaries = newPrim; // update the primary pointer
4899 } else { // overflow, need to reallocate and redo
4900 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
4901 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4902 if(U_SUCCESS(*status)) {
4903 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
4904 }
4905 } else {
4906 *status = U_BUFFER_OVERFLOW_ERROR;
4907 }
4908 }
4909 }
4910 }
4911
4912 if(doCase) {
4913 uint32_t casesize = cases - caseStart;
4914 sortKeySize += casesize;
4915 if(sortKeySize <= resultLength) {
4916 *(primaries++) = UCOL_LEVELTERMINATOR;
4917 uprv_memcpy(primaries, caseStart, casesize);
4918 primaries += casesize;
4919 } else {
4920 if(allocateSKBuffer == TRUE) {
4921 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4922 if(U_SUCCESS(*status)) {
4923 *result = primStart;
4924 *(primaries++) = UCOL_LEVELTERMINATOR;
4925 uprv_memcpy(primaries, caseStart, casesize);
4926 }
4927 } else {
4928 *status = U_BUFFER_OVERFLOW_ERROR;
4929 }
4930 }
4931 }
4932
4933 if(compareTer == 0) {
4934 if (count3 > 0) {
4935 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
4936 while (count3 >= coll->tertiaryTopCount) {
4937 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4938 count3 -= (uint32_t)coll->tertiaryTopCount;
4939 }
4940 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
4941 } else {
4942 while (count3 > coll->tertiaryBottomCount) {
4943 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4944 count3 -= (uint32_t)coll->tertiaryBottomCount;
4945 }
4946 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4947 }
4948 }
4949 uint32_t tersize = tertiaries - terStart;
4950 sortKeySize += tersize;
4951 if(sortKeySize <= resultLength) {
4952 *(primaries++) = UCOL_LEVELTERMINATOR;
4953 uprv_memcpy(primaries, terStart, tersize);
4954 primaries += tersize;
4955 } else {
4956 if(allocateSKBuffer == TRUE) {
4957 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4958 if(U_SUCCESS(*status)) {
4959 *result = primStart;
4960 *(primaries++) = UCOL_LEVELTERMINATOR;
4961 uprv_memcpy(primaries, terStart, tersize);
4962 }
4963 } else {
4964 *status = U_BUFFER_OVERFLOW_ERROR;
4965 }
4966 }
4967
4968 if(compareQuad == 0/*qShifted == TRUE*/) {
4969 if(count4 > 0) {
4970 while (count4 > UCOL_BOT_COUNT4) {
4971 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4972 count4 -= UCOL_BOT_COUNT4;
4973 }
4974 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4975 }
4976 uint32_t quadsize = quads - quadStart;
4977 sortKeySize += quadsize;
4978 if(sortKeySize <= resultLength) {
4979 *(primaries++) = UCOL_LEVELTERMINATOR;
4980 uprv_memcpy(primaries, quadStart, quadsize);
4981 primaries += quadsize;
4982 } else {
4983 if(allocateSKBuffer == TRUE) {
4984 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4985 if(U_SUCCESS(*status)) {
4986 *result = primStart;
4987 *(primaries++) = UCOL_LEVELTERMINATOR;
4988 uprv_memcpy(primaries, quadStart, quadsize);
4989 }
4990 } else {
4991 *status = U_BUFFER_OVERFLOW_ERROR;
4992 }
4993 }
4994 }
4995
4996 if(compareIdent) {
4997 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
4998 if(sortKeySize <= resultLength) {
4999 *(primaries++) = UCOL_LEVELTERMINATOR;
5000 primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
5001 } else {
5002 if(allocateSKBuffer == TRUE) {
5003 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
5004 if(U_SUCCESS(*status)) {
5005 *result = primStart;
5006 *(primaries++) = UCOL_LEVELTERMINATOR;
5007 u_writeIdenticalLevelRun(s.string, len, primaries);
5008 }
5009 } else {
5010 *status = U_BUFFER_OVERFLOW_ERROR;
5011 }
5012 }
5013 }
5014 }
5015 *(primaries++) = '\0';
5016 }
5017
5018 if(terStart != tert) {
5019 uprv_free(terStart);
5020 uprv_free(secStart);
5021 uprv_free(caseStart);
5022 uprv_free(quadStart);
5023 }
5024
5025 if(normSource != normBuffer) {
5026 uprv_free(normSource);
5027 }
5028
5029 if(allocateSKBuffer == TRUE) {
5030 *result = (uint8_t*)uprv_malloc(sortKeySize);
5031 /* test for NULL */
5032 if (*result == NULL) {
5033 *status = U_MEMORY_ALLOCATION_ERROR;
5034 return sortKeySize;
5035 }
5036 uprv_memcpy(*result, primStart, sortKeySize);
5037 if(primStart != prim) {
5038 uprv_free(primStart);
5039 }
5040 }
5041
5042 return sortKeySize;
5043 }
5044
5045
5046 U_CFUNC int32_t U_CALLCONV
5047 ucol_calcSortKeySimpleTertiary(const UCollator *coll,
5048 const UChar *source,
5049 int32_t sourceLength,
5050 uint8_t **result,
5051 uint32_t resultLength,
5052 UBool allocateSKBuffer,
5053 UErrorCode *status)
5054 {
5055 U_ALIGN_CODE(16);
5056 uint32_t i = 0; /* general purpose counter */
5057
5058 /* Stack allocated buffers for buffers we use */
5059 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5060
5061 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5062
5063 if(U_FAILURE(*status)) {
5064 return 0;
5065 }
5066
5067 if(primaries == NULL && allocateSKBuffer == TRUE) {
5068 primaries = *result = prim;
5069 resultLength = UCOL_PRIMARY_MAX_BUFFER;
5070 }
5071
5072 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5073
5074 uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5075
5076 UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
5077 UChar *normSource = normBuffer;
5078 int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
5079
5080 int32_t len = sourceLength;
5081
5082 /* If we need to normalize, we'll do it all at once at the beginning! */
5083 if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
5084 len = unorm_internalNormalize(normSource, normSourceLen,
5085 source, len,
5086 UNORM_FCD, FALSE,
5087 status);
5088 if(*status == U_BUFFER_OVERFLOW_ERROR) {
5089 normSourceLen = len;
5090 normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
5091 if(normSource == NULL) {
5092 *status = U_MEMORY_ALLOCATION_ERROR;
5093 return 0;
5094 }
5095 *status = U_ZERO_ERROR;
5096 len = unorm_internalNormalize(normSource, normSourceLen,
5097 source, len,
5098 UNORM_FCD, FALSE,
5099 status);
5100 }
5101
5102 if(U_FAILURE(*status)) {
5103 return 0;
5104 }
5105 source = normSource;
5106 }
5107
5108 collIterate s;
5109 IInit_collIterate(coll, (UChar *)source, len, &s);
5110 if(source == normSource) {
5111 s.flags &= ~UCOL_ITER_NORM;
5112 }
5113
5114 if(resultLength == 0 || primaries == NULL) {
5115 int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5116 if(normSource != normBuffer) {
5117 uprv_free(normSource);
5118 }
5119 return t;
5120 }
5121
5122 uint8_t *primarySafeEnd = primaries + resultLength - 2;
5123
5124 uint32_t minBufferSize = UCOL_MAX_BUFFER;
5125
5126 uint8_t *primStart = primaries;
5127 uint8_t *secStart = secondaries;
5128 uint8_t *terStart = tertiaries;
5129
5130 uint32_t order = 0;
5131
5132 uint8_t primary1 = 0;
5133 uint8_t primary2 = 0;
5134 uint8_t secondary = 0;
5135 uint8_t tertiary = 0;
5136 uint8_t caseSwitch = coll->caseSwitch;
5137 uint8_t tertiaryMask = coll->tertiaryMask;
5138 int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
5139 uint8_t tertiaryTop = coll->tertiaryTop;
5140 uint8_t tertiaryBottom = coll->tertiaryBottom;
5141 uint8_t tertiaryCommon = coll->tertiaryCommon;
5142
5143 uint32_t prevBuffSize = 0;
5144
5145 UBool finished = FALSE;
5146 UBool notIsContinuation = FALSE;
5147
5148 uint32_t count2 = 0, count3 = 0;
5149 uint8_t leadPrimary = 0;
5150
5151 for(;;) {
5152 for(i=prevBuffSize; i<minBufferSize; ++i) {
5153
5154 order = ucol_IGetNextCE(coll, &s, status);
5155
5156 if(order == 0) {
5157 continue;
5158 }
5159
5160 if(order == UCOL_NO_MORE_CES) {
5161 finished = TRUE;
5162 break;
5163 }
5164
5165 notIsContinuation = !isContinuation(order);
5166
5167 if(notIsContinuation) {
5168 tertiary = (uint8_t)((order & tertiaryMask));
5169 } else {
5170 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5171 }
5172 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5173 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5174 primary1 = (uint8_t)(order >> 8);
5175
5176 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5177 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5178 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5179 /* regular and simple sortkey calc */
5180 if(primary1 != UCOL_IGNORABLE) {
5181 if(notIsContinuation) {
5182 if(leadPrimary == primary1) {
5183 *primaries++ = primary2;
5184 } else {
5185 if(leadPrimary != 0) {
5186 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5187 }
5188 if(primary2 == UCOL_IGNORABLE) {
5189 /* one byter, not compressed */
5190 *primaries++ = primary1;
5191 leadPrimary = 0;
5192 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5193 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5194 (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5195 /* not compressible */
5196 leadPrimary = 0;
5197 *primaries++ = primary1;
5198 *primaries++ = primary2;
5199 } else { /* compress */
5200 *primaries++ = leadPrimary = primary1;
5201 *primaries++ = primary2;
5202 }
5203 }
5204 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5205 *primaries++ = primary1;
5206 if(primary2 != UCOL_IGNORABLE) {
5207 *primaries++ = primary2; /* second part */
5208 }
5209 }
5210 }
5211
5212 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5213 /* This is compression code. */
5214 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5215 ++count2;
5216 } else {
5217 if (count2 > 0) {
5218 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5219 while (count2 > UCOL_TOP_COUNT2) {
5220 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5221 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5222 }
5223 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5224 } else {
5225 while (count2 > UCOL_BOT_COUNT2) {
5226 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5227 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5228 }
5229 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5230 }
5231 count2 = 0;
5232 }
5233 *secondaries++ = secondary;
5234 }
5235 }
5236
5237 if(notIsContinuation) {
5238 tertiary ^= caseSwitch;
5239 }
5240
5241 if(tertiary > 0) {
5242 /* This is compression code. */
5243 /* sequence size check is included in the if clause */
5244 if (tertiary == tertiaryCommon && notIsContinuation) {
5245 ++count3;
5246 } else {
5247 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5248 tertiary += tertiaryAddition;
5249 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5250 tertiary -= tertiaryAddition;
5251 }
5252 if (count3 > 0) {
5253 if ((tertiary > tertiaryCommon)) {
5254 while (count3 > coll->tertiaryTopCount) {
5255 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5256 count3 -= (uint32_t)coll->tertiaryTopCount;
5257 }
5258 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5259 } else {
5260 while (count3 > coll->tertiaryBottomCount) {
5261 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5262 count3 -= (uint32_t)coll->tertiaryBottomCount;
5263 }
5264 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5265 }
5266 count3 = 0;
5267 }
5268 *tertiaries++ = tertiary;
5269 }
5270 }
5271
5272 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5273 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5274 IInit_collIterate(coll, (UChar *)source, len, &s);
5275 if(source == normSource) {
5276 s.flags &= ~UCOL_ITER_NORM;
5277 }
5278 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5279 *status = U_BUFFER_OVERFLOW_ERROR;
5280 finished = TRUE;
5281 break;
5282 } else { /* It's much nicer if we can actually reallocate */
5283 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart);
5284 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5285 if(U_SUCCESS(*status)) {
5286 *result = primStart;
5287 primarySafeEnd = primStart + resultLength - 2;
5288 } else {
5289 IInit_collIterate(coll, (UChar *)source, len, &s);
5290 if(source == normSource) {
5291 s.flags &= ~UCOL_ITER_NORM;
5292 }
5293 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5294 finished = TRUE;
5295 break;
5296 }
5297 }
5298 }
5299 }
5300 if(finished) {
5301 break;
5302 } else {
5303 prevBuffSize = minBufferSize;
5304 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5305 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5306 minBufferSize *= 2;
5307 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5308 IInit_collIterate(coll, (UChar *)source, len, &s);
5309 if(source == normSource) {
5310 s.flags &= ~UCOL_ITER_NORM;
5311 }
5312 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5313 break;
5314 }
5315 }
5316 }
5317
5318 if(U_SUCCESS(*status)) {
5319 sortKeySize += (primaries - primStart);
5320 /* we have done all the CE's, now let's put them together to form a key */
5321 if (count2 > 0) {
5322 while (count2 > UCOL_BOT_COUNT2) {
5323 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5324 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5325 }
5326 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5327 }
5328 uint32_t secsize = secondaries-secStart;
5329 sortKeySize += secsize;
5330 if(sortKeySize <= resultLength) {
5331 *(primaries++) = UCOL_LEVELTERMINATOR;
5332 uprv_memcpy(primaries, secStart, secsize);
5333 primaries += secsize;
5334 } else {
5335 if(allocateSKBuffer == TRUE) {
5336 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5337 if(U_SUCCESS(*status)) {
5338 *(primaries++) = UCOL_LEVELTERMINATOR;
5339 *result = primStart;
5340 uprv_memcpy(primaries, secStart, secsize);
5341 }
5342 } else {
5343 *status = U_BUFFER_OVERFLOW_ERROR;
5344 }
5345 }
5346
5347 if (count3 > 0) {
5348 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5349 while (count3 >= coll->tertiaryTopCount) {
5350 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5351 count3 -= (uint32_t)coll->tertiaryTopCount;
5352 }
5353 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5354 } else {
5355 while (count3 > coll->tertiaryBottomCount) {
5356 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5357 count3 -= (uint32_t)coll->tertiaryBottomCount;
5358 }
5359 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5360 }
5361 }
5362 uint32_t tersize = tertiaries - terStart;
5363 sortKeySize += tersize;
5364 if(sortKeySize <= resultLength) {
5365 *(primaries++) = UCOL_LEVELTERMINATOR;
5366 uprv_memcpy(primaries, terStart, tersize);
5367 primaries += tersize;
5368 } else {
5369 if(allocateSKBuffer == TRUE) {
5370 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5371 if(U_SUCCESS(*status)) {
5372 *result = primStart;
5373 *(primaries++) = UCOL_LEVELTERMINATOR;
5374 uprv_memcpy(primaries, terStart, tersize);
5375 }
5376 } else {
5377 *status = U_MEMORY_ALLOCATION_ERROR;
5378 }
5379 }
5380
5381 *(primaries++) = '\0';
5382 }
5383
5384 if(terStart != tert) {
5385 uprv_free(terStart);
5386 uprv_free(secStart);
5387 }
5388
5389 if(normSource != normBuffer) {
5390 uprv_free(normSource);
5391 }
5392
5393 if(allocateSKBuffer == TRUE) {
5394 *result = (uint8_t*)uprv_malloc(sortKeySize);
5395 /* test for NULL */
5396 if (*result == NULL) {
5397 *status = U_MEMORY_ALLOCATION_ERROR;
5398 return sortKeySize;
5399 }
5400 uprv_memcpy(*result, primStart, sortKeySize);
5401 if(primStart != prim) {
5402 uprv_free(primStart);
5403 }
5404 }
5405
5406 return sortKeySize;
5407 }
5408
5409 static inline
5410 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5411 UBool notIsContinuation = !isContinuation(CE);
5412 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5413 if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5414 || (!notIsContinuation && *wasShifted))
5415 || (*wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
5416 // The stuff below should probably be in the sortkey code... maybe not...
5417 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5418 /* we should just completely ignore it */
5419 *wasShifted = TRUE;
5420 //continue;
5421 }
5422 //*wasShifted = TRUE;
5423 return TRUE;
5424 } else {
5425 *wasShifted = FALSE;
5426 return FALSE;
5427 }
5428 }
5429 static inline
5430 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5431 if(level < maxLevel) {
5432 dest[i++] = UCOL_LEVELTERMINATOR;
5433 } else {
5434 dest[i++] = 0;
5435 }
5436 }
5437
/**
 * Level identifiers used by partial sort key generation.
 * The values have to fit into the three-bit level field of the state word.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /** extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /** level for the end of sort key. Will just produce zeros */
    UCOL_PSK_NULL       = 7,
    /** one past the last level identifier */
    UCOL_PSK_LIMIT      = 8
};
5450
/**
 * Layout of the collation state word kept in state[1].
 * To read a field, shift the word right by its *_SHIFT value and
 * AND the result with the matching *_MASK value.
 */
enum {
    /** level identifier: one of the UCOL_PSK_* level values, three bits */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK  = 0x7,

    /**
     * number of bytes of primary or quaternary already written. Can be only
     * 0 or 1, since we get up to two bytes from primary or quaternary.
     * This field is also used to denote that the French secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK  = 0x1,

    /** was the last value shifted: 0 or 1 (Boolean) */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK  = 0x1,

    /**
     * how many French bytes have we already written, up to 4 bytes.
     * When we do French we need to reverse secondary values. However, continuations
     * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK  = 0x3,

    /** used elements field, ten bits */
    UCOL_PSK_USED_ELEMENTS_SHIFT = 7,
    UCOL_PSK_USED_ELEMENTS_MASK  = 0x3FF,

    /** iterator skip field, fifteen bits */
    UCOL_PSK_ITER_SKIP_SHIFT = 17,
    UCOL_PSK_ITER_SKIP_MASK  = 0x7FFF
};
5476
5477
5478 /** main sortkey part procedure. On the first call,
5479 * you should pass in a collator, an iterator, empty state
5480 * state[0] == state[1] == 0, a buffer to hold results
5481 * number of bytes you need and an error code pointer.
5482 * Make sure your buffer is big enough to hold the wanted
5483 * number of sortkey bytes. I don't check.
5484 * The only meaningful status you can get back is
5485 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5486 * have been dealt a raw deal and that you probably won't
5487 * be able to use partial sortkey generation for this
5488 * particular combination of string and collator. This
5489 * is highly unlikely, but you should still check the error code.
5490 * Any other status means that you're not in a sane situation
5491 * anymore. After the first call, preserve state values and
5492 * use them on subsequent calls to obtain more bytes of a sortkey.
5493 * Use until the number of bytes written is smaller than the requested
5494 * number of bytes. Generated sortkey is not compatible with the
5495 * one generated by ucol_getSortKey, as we don't do any compression.
5496 * However, levels are still terminated by a 1 (one) and the sortkey
5497 * is terminated by a 0 (zero). Identical level is the same as in the
5498 * regular sortkey - internal bocu-1 implementation is used.
5499 * For curious, although you cannot do much about this, here is
5500 * the structure of state words.
5501 * state[0] - iterator state. Depends on the iterator implementation,
5502 * but allows the iterator to continue where it stopped in
5503 * the last iteration.
5504 * state[1] - collation processing state. Here is the distribution
5505 * of the bits:
5506 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5507 * quaternary, quin (we don't use this one), identical and
5508 * null (producing only zeroes - first one to terminate the
5509 * sortkey and subsequent to fill the buffer).
5510 * 3 - byte count. Number of bytes written on the primary level.
5511 * 4 - was shifted. Whether the previous iteration finished in the
5512 * shifted state.
5513 * 5, 6 - French continuation bytes written. See the comment in the enum
5514 * 7..16 - Used elements. Number of CEs that were already used from the
5515 * expansion buffer or number of bytes from a bocu sequence on
5516 * the identical level.
5517 * 17..31 - iterator skip. Number of move operations iterator needs to
5518 * skip from the current state in order to continue. This is used
5519 * only if normalization is turned on, since the normalizing iterator
5520 * can return undefined state, which means that it's in the middle
5521 * of normalizing sequence.
5522 */
5523 U_CAPI int32_t U_EXPORT2
5524 ucol_nextSortKeyPart(const UCollator *coll,
5525 UCharIterator *iter,
5526 uint32_t state[2],
5527 uint8_t *dest, int32_t count,
5528 UErrorCode *status) {
5529 /* error checking */
5530 if(status==NULL || U_FAILURE(*status)) {
5531 return 0;
5532 }
5533 if( coll==NULL || iter==NULL ||
5534 state==NULL ||
5535 count<0 || (count>0 && dest==NULL)
5536 ) {
5537 *status=U_ILLEGAL_ARGUMENT_ERROR;
5538 }
5539
5540
5541 if(count==0) {
5542 /* nothing to do */
5543 return 0;
5544 }
5545
5546 /** Setting up situation according to the state we got from the previous iteration */
5547 // The state of the iterator from the previous invocation
5548 uint32_t iterState = state[0];
5549 // Has the last iteration ended in the shifted state
5550 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5551 // What is the current level of the sortkey?
5552 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5553 // Have we written only one byte from a two byte primary in the previous iteration?
5554 // Also on secondary level - have we finished with the French secondary?
5555 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5556 // number of bytes in the continuation buffer for French
5557 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5558 // Skip the CEs that we got from an extraction
5559 // and delivered in the previous call
5560 int32_t usedElements = (state[1] >> UCOL_PSK_USED_ELEMENTS_SHIFT) & UCOL_PSK_USED_ELEMENTS_MASK;
5561 // Number of times to skip because the iterator returned
5562 // UITER_NO_STATE when it was stopped in the last iteration, so we had to save the
5563 // last valid state.
5564 int32_t iterSkips = (state[1] >> UCOL_PSK_ITER_SKIP_SHIFT) & UCOL_PSK_ITER_SKIP_MASK;
5565
5566 /** values that depend on the collator attributes */
5567 // strength of the collator.
5568 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5569 // maximal level of the partial sortkey. Need to take whether case level is done
5570 int32_t maxLevel = 0;
5571 if(strength < UCOL_TERTIARY) {
5572 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5573 maxLevel = UCOL_PSK_CASE;
5574 } else {
5575 maxLevel = strength;
5576 }
5577 } else {
5578 if(strength == UCOL_TERTIARY) {
5579 maxLevel = UCOL_PSK_TERTIARY;
5580 } else if(strength == UCOL_QUATERNARY) {
5581 maxLevel = UCOL_PSK_QUATERNARY;
5582 } else { // identical
5583 maxLevel = UCOL_IDENTICAL;
5584 }
5585 }
5586 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5587 uint8_t UCOL_HIRAGANA_QUAD =
5588 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5589 // Boundary value that decides whether a CE is shifted or not
5590 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5591 // Are we doing French collation?
5592 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5593
5594 /** initializing the collation state */
5595 UBool notIsContinuation = FALSE;
5596 uint32_t CE = UCOL_NO_MORE_CES;
5597
5598 collIterate s;
5599 IInit_collIterate(coll, NULL, -1, &s);
5600 s.iterator = iter;
5601 s.flags |= UCOL_USE_ITERATOR;
5602 // This variable tells us whether we have produced some other levels in this iteration
5603 // before we moved to the identical level. In that case, we need to switch the
5604 // type of the iterator.
5605 UBool doingIdenticalFromStart = FALSE;
5606 // Normalizing iterator
5607 // The division for the array length may truncate the array size to
5608 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5609 // for all platforms anyway.
5610 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5611 UNormIterator *normIter = NULL;
5612 // If the normalization is turned on for the collator and we are below identical level
5613 // we will use a FCD normalizing iterator
5614 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5615 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5616 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5617 s.flags &= ~UCOL_ITER_NORM;
5618 if(U_FAILURE(*status)) {
5619 return 0;
5620 }
5621 } else if(level == UCOL_PSK_IDENTICAL) {
5622 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5623 // will be updating the state - and this cannot be done on an ordinary iterator.
5624 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5625 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5626 s.flags &= ~UCOL_ITER_NORM;
5627 if(U_FAILURE(*status)) {
5628 return 0;
5629 }
5630 doingIdenticalFromStart = TRUE;
5631 }
5632
5633 // This is the tentative new state of the iterator. The problem
5634 // is that the iterator might return an undefined state, in
5635 // which case we should save the last valid state and increase
5636 // the iterator skip value.
5637 uint32_t newState = 0;
5638
5639 // First, we set the iterator to the last valid position
5640 // from the last iteration. This was saved in state[0].
5641 if(iterState == 0) {
5642 /* initial state */
5643 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5644 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5645 } else {
5646 s.iterator->move(s.iterator, 0, UITER_START);
5647 }
5648 } else {
5649 /* reset to previous state */
5650 s.iterator->setState(s.iterator, iterState, status);
5651 if(U_FAILURE(*status)) {
5652 return 0;
5653 }
5654 }
5655
5656 // Then, we may have to move more, if the normalizing iterator
5657 // was going through a normalizing sequence.
5658 if(iterSkips) {
5659 // if we are on secondary level AND we do French, we need to go backward instead of forward
5660 if(level == UCOL_PSK_SECONDARY && doingFrench) {
5661 s.iterator->move(s.iterator, -iterSkips, UITER_CURRENT);
5662 } else {
5663 s.iterator->move(s.iterator, iterSkips, UITER_CURRENT);
5664 }
5665 }
5666
5667
5668 // Number of expansion CEs that were already consumed in the
5669 // previous iteration for the last code point processed. We
5670 // want to clean out the expansion buffer, so that we can
5671 // get correct CEs. This value is persistent over iterations,
5672 // since we can have several iterations on the one expansion
5673 // buffer.
5674 int32_t consumedExpansionCEs = usedElements;
5675 // Number of bytes already written from a bocsu sequence. Since
5676 // the longest bocsu sequence is 4 bytes long, this can be up to 3. It
5677 // shares the state field with the consumedExpansionCEs value, since
5678 // they cannot simultaneously appear on the same level
5679 int32_t bocsuBytesUsed = 0;
5680 // Clean out the expansion buffer unless we are on
5681 // identical level. In that case we use this field
5682 // to store the number of bytes already written
5683 // from the previous bocsu sequence.
5684 if(level < UCOL_PSK_IDENTICAL && usedElements != 0) {
5685 while(usedElements-->0) {
5686 // If we're doing French and we are on the secondary level,
5687 // we go backwards.
5688 if(level == UCOL_PSK_SECONDARY && doingFrench) {
5689 CE = ucol_IGetPrevCE(coll, &s, status);
5690 } else {
5691 CE = ucol_IGetNextCE(coll, &s, status);
5692 }
5693 if(CE==UCOL_NO_MORE_CES) {
5694 /* should not happen */
5695 *status=U_INTERNAL_PROGRAM_ERROR;
5696 return 0;
5697 }
5698 }
5699 } else {
5700 bocsuBytesUsed = usedElements;
5701 }
5702
5703 // This variable prevents the adjusting of iterator
5704 // skip variable when we are the first time on a
5705 // level. I hope there is a better way to do it, but
5706 // I could not think of it.
5707 UBool firstTimeOnLevel = TRUE;
5708 // French secondary needs to know whether the iterator state of zero came from previous level OR
5709 // from a new invocation...
5710 UBool wasDoingPrimary = FALSE;
5711 // Case level is kind of goofy. This variable tells us that
5712 // we are still not done with the case level.
5713 UBool dontAdvanceIteratorBecauseWeNeedALevelTerminator = FALSE;
5714 // destination buffer byte counter. When this guy
5715 // gets to count, we're done with the iteration
5716 int32_t i = 0;
5717 // used to count the zero bytes written after we
5718 // have finished with the sort key
5719 int32_t j = 0;
5720
5721
5722 // Hm.... I think we're ready to plunge in. Basic story is as following:
5723 // we have a fall through case based on level. This is used for initial
5724 // positioning on iteration start. Every level processor contains a
5725 // for(;;) which will be broken when we exhaust all the CEs. Other
5726 // way to exit is a goto saveState, which happens when we have filled
5727 // out our buffer.
5728 switch(level) {
5729 case UCOL_PSK_PRIMARY:
5730 wasDoingPrimary = TRUE;
5731 for(;;) {
5732 if(i==count) {
5733 goto saveState;
5734 }
5735 // We should save the state only if we
5736 // are sure that we are done with the
5737 // previous iterator state
5738 if(consumedExpansionCEs == 0 && byteCountOrFrenchDone == 0) {
5739 newState = s.iterator->getState(s.iterator);
5740 if(newState != UITER_NO_STATE) {
5741 iterState = newState;
5742 iterSkips = 0;
5743 } else {
5744 if(!firstTimeOnLevel && !byteCountOrFrenchDone) {
5745 iterSkips++;
5746 }
5747 }
5748 }
5749 firstTimeOnLevel = FALSE;
5750 CE = ucol_IGetNextCE(coll, &s, status);
5751 if(CE==UCOL_NO_MORE_CES) {
5752 // Add the level separator
5753 terminatePSKLevel(level, maxLevel, i, dest);
5754 byteCountOrFrenchDone=0;
5755 // Restart the iteration and move to the
5756 // second level
5757 s.iterator->move(s.iterator, 0, UITER_START);
5758 level = UCOL_PSK_SECONDARY;
5759 break;
5760 }
5761 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5762 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5763 if(CE != 0) {
5764 if(byteCountOrFrenchDone == 0) {
5765 // get the second byte of primary
5766 dest[i++]=(uint8_t)(CE >> 8);
5767 } else {
5768 byteCountOrFrenchDone = 0;
5769 }
5770 if((CE &=0xff)!=0) {
5771 if(i==count) {
5772 /* overflow */
5773 byteCountOrFrenchDone=1;
5774 goto saveState;
5775 }
5776 dest[i++]=(uint8_t)CE;
5777 }
5778 }
5779 }
5780 if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
5781 // s.pos != NULL means there is a normalization buffer in effect
5782 // in iterative case, this means that we are doing Thai (maybe discontiguous)
5783 consumedExpansionCEs++;
5784 } else {
5785 consumedExpansionCEs = 0;
5786 }
5787 if(s.pos && *s.pos == 0) {
5788 // maybe it is the end of Thai - we have to have
5789 // an extra skip
5790 iterSkips++;
5791 }
5792 }
5793 /* fall through to next level */
5794 case UCOL_PSK_SECONDARY:
5795 if(strength >= UCOL_SECONDARY) {
5796 if(!doingFrench) {
5797 for(;;) {
5798 if(i == count) {
5799 goto saveState;
5800 }
5801 // We should save the state only if we
5802 // are sure that we are done with the
5803 // previous iterator state
5804 if(consumedExpansionCEs == 0) {
5805 newState = s.iterator->getState(s.iterator);
5806 if(newState != UITER_NO_STATE) {
5807 iterState = newState;
5808 iterSkips = 0;
5809 } else {
5810 if(!firstTimeOnLevel) {
5811 iterSkips++;
5812 }
5813 }
5814 }
5815 firstTimeOnLevel = FALSE;
5816 CE = ucol_IGetNextCE(coll, &s, status);
5817 if(CE==UCOL_NO_MORE_CES) {
5818 // Add the level separator
5819 terminatePSKLevel(level, maxLevel, i, dest);
5820 byteCountOrFrenchDone=0;
5821 // Restart the iteration and move to the
5822 // next level (case level)
5823 s.iterator->move(s.iterator, 0, UITER_START);
5824 level = UCOL_PSK_CASE;
5825 break;
5826 }
5827 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5828 CE >>= 8; /* get secondary */
5829 if(CE != 0) {
5830 dest[i++]=(uint8_t)CE;
5831 }
5832 }
5833 if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
5834 consumedExpansionCEs++;
5835 } else {
5836 consumedExpansionCEs = 0;
5837 }
5838 if(s.pos && *s.pos == 0) {
5839 iterSkips++;
5840 }
5841 }
5842 } else { // French secondary processing
5843 uint8_t frenchBuff[UCOL_MAX_BUFFER];
5844 int32_t frenchIndex = 0;
5845 // Here we are going backwards.
5846 // If the iterator is at the beginning, it should be
5847 // moved to the end.
5848 if(wasDoingPrimary) {
5849 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5850 }
5851 for(;;) {
5852 if(i == count) {
5853 goto saveState;
5854 }
5855 if(consumedExpansionCEs == 0) {
5856 newState = s.iterator->getState(s.iterator);
5857 if(newState != UITER_NO_STATE) {
5858 iterState = newState;
5859 iterSkips = 0;
5860 } else {
5861 if(!firstTimeOnLevel) {
5862 iterSkips++;
5863 }
5864 }
5865 }
5866 firstTimeOnLevel = FALSE;
5867 CE = ucol_IGetPrevCE(coll, &s, status);
5868 if(CE==UCOL_NO_MORE_CES) {
5869 // Add the level separator
5870 terminatePSKLevel(level, maxLevel, i, dest);
5871 byteCountOrFrenchDone=0;
5872 // Restart the iteration and move to the next level
5873 s.iterator->move(s.iterator, 0, UITER_START);
5874 level = UCOL_PSK_CASE;
5875 break;
5876 }
5877 if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5878 // reverse when we get a first non-continuation CE.
5879 CE >>= 8;
5880 frenchBuff[frenchIndex++] = (uint8_t)CE;
5881 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5882 CE >>= 8; /* get secondary */
5883 if(!frenchIndex) {
5884 if(CE != 0) {
5885 dest[i++]=(uint8_t)CE;
5886 }
5887 } else {
5888 frenchBuff[frenchIndex++] = (uint8_t)CE;
5889 frenchIndex -= usedFrench;
5890 usedFrench = 0;
5891 while(i < count && frenchIndex) {
5892 dest[i++] = frenchBuff[--frenchIndex];
5893 usedFrench++;
5894 }
5895 }
5896 }
5897 if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
5898 consumedExpansionCEs++;
5899 } else {
5900 consumedExpansionCEs = 0;
5901 }
5902 if(s.pos && *s.pos == 0) {
5903 iterSkips++;
5904 }
5905 }
5906 }
5907 } else {
5908 level = UCOL_PSK_CASE;
5909 }
5910 /* fall through to next level */
5911 case UCOL_PSK_CASE:
5912 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5913 uint32_t caseShift = UCOL_CASE_SHIFT_START;
5914 uint8_t caseByte = UCOL_CASE_BYTE_START;
5915 uint8_t caseBits = 0;
5916
5917 for(;;) {
5918 if(i == count) {
5919 goto saveState;
5920 }
5921 // We should save the state only if we
5922 // are sure that we are done with the
5923 // previous iterator state
5924 if(consumedExpansionCEs == 0) {
5925 newState = s.iterator->getState(s.iterator);
5926 if(newState != UITER_NO_STATE) {
5927 iterState = newState;
5928 iterSkips = 0;
5929 } else {
5930 if(!firstTimeOnLevel) {
5931 iterSkips++;
5932 }
5933 }
5934 }
5935 firstTimeOnLevel = FALSE;
5936 CE = ucol_IGetNextCE(coll, &s, status);
5937 if(CE==UCOL_NO_MORE_CES) {
5938 // On the case level we might have an unfinished
5939 // case byte. Add one if it's started.
5940 if(caseShift != UCOL_CASE_SHIFT_START) {
5941 dest[i++] = caseByte;
5942 }
5943 // This is kind of tricky - situation where
5944 // we need to keep the iterator in the old
5945 // state, but don't need to bring anything
5946 // to the next invocation
5947 if(i < count) {
5948 // Add the level separator
5949 terminatePSKLevel(level, maxLevel, i, dest);
5950 // Restart the iteration and move to the
5951 // next level
5952 s.iterator->move(s.iterator, 0, UITER_START);
5953 level = UCOL_PSK_TERTIARY;
5954 } else {
5955 dontAdvanceIteratorBecauseWeNeedALevelTerminator = TRUE;
5956 }
5957 break;
5958 }
5959
5960 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5961 if(!isContinuation(CE)) {
5962 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5963 caseBits = (uint8_t)(CE & 0xC0);
5964 // this copies the case level logic from the
5965 // sort key generation code
5966 if(CE != 0) {
5967 if(coll->caseFirst == UCOL_UPPER_FIRST) {
5968 if((caseBits & 0xC0) == 0) {
5969 caseByte |= 1 << (--caseShift);
5970 } else {
5971 caseByte |= 0 << (--caseShift);
5972 /* second bit */
5973 if(caseShift == 0) {
5974 dest[i++] = caseByte;
5975 caseShift = UCOL_CASE_SHIFT_START;
5976 caseByte = UCOL_CASE_BYTE_START;
5977 }
5978 caseByte |= ((caseBits>>6)&1) << (--caseShift);
5979 }
5980 } else {
5981 if((caseBits & 0xC0) == 0) {
5982 caseByte |= 0 << (--caseShift);
5983 } else {
5984 caseByte |= 1 << (--caseShift);
5985 /* second bit */
5986 if(caseShift == 0) {
5987 dest[i++] = caseByte;
5988 caseShift = UCOL_CASE_SHIFT_START;
5989 caseByte = UCOL_CASE_BYTE_START;
5990 }
5991 caseByte |= ((caseBits>>7)&1) << (--caseShift);
5992 }
5993 }
5994 }
5995
5996 }
5997 }
5998 // Not sure this is correct for the case level - revisit
5999 if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6000 consumedExpansionCEs++;
6001 } else {
6002 consumedExpansionCEs = 0;
6003 }
6004 if(s.pos && *s.pos == 0) {
6005 iterSkips++;
6006 }
6007 }
6008 } else {
6009 level = UCOL_PSK_TERTIARY;
6010 }
6011 /* fall through to next level */
6012 case UCOL_PSK_TERTIARY:
6013 if(strength >= UCOL_TERTIARY) {
6014 for(;;) {
6015 if(i == count) {
6016 goto saveState;
6017 }
6018 // We should save the state only if we
6019 // are sure that we are done with the
6020 // previous iterator state
6021 if(consumedExpansionCEs == 0) {
6022 newState = s.iterator->getState(s.iterator);
6023 if(newState != UITER_NO_STATE) {
6024 iterState = newState;
6025 iterSkips = 0;
6026 } else {
6027 if(!firstTimeOnLevel) {
6028 iterSkips++;
6029 }
6030 }
6031 }
6032 firstTimeOnLevel = FALSE;
6033 CE = ucol_IGetNextCE(coll, &s, status);
6034 if(CE==UCOL_NO_MORE_CES) {
6035 // Add the level separator
6036 terminatePSKLevel(level, maxLevel, i, dest);
6037 byteCountOrFrenchDone=0;
6038 // Restart the iteration and move to the
6039 // next level (quaternary)
6040 s.iterator->move(s.iterator, 0, UITER_START);
6041 level = UCOL_PSK_QUATERNARY;
6042 break;
6043 }
6044 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6045 notIsContinuation = !isContinuation(CE);
6046
6047 if(notIsContinuation) {
6048 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6049 CE ^= coll->caseSwitch;
6050 CE &= coll->tertiaryMask;
6051 } else {
6052 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6053 }
6054
6055 if(CE != 0) {
6056 dest[i++]=(uint8_t)CE;
6057 }
6058 }
6059 if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6060 consumedExpansionCEs++;
6061 } else {
6062 consumedExpansionCEs = 0;
6063 }
6064 if(s.pos && *s.pos == 0) {
6065 iterSkips++;
6066 }
6067 }
6068 } else {
6069 // if we're not doing tertiary
6070 // skip to the end
6071 level = UCOL_PSK_NULL;
6072 }
6073 /* fall through to next level */
6074 case UCOL_PSK_QUATERNARY:
6075 if(strength >= UCOL_QUATERNARY) {
6076 for(;;) {
6077 if(i == count) {
6078 goto saveState;
6079 }
6080 // We should save the state only if we
6081 // are sure that we are done with the
6082 // previous iterator state
6083 if(consumedExpansionCEs == 0) {
6084 newState = s.iterator->getState(s.iterator);
6085 if(newState != UITER_NO_STATE) {
6086 iterState = newState;
6087 iterSkips = 0;
6088 } else {
6089 if(!firstTimeOnLevel) {
6090 iterSkips++;
6091 }
6092 }
6093 }
6094 firstTimeOnLevel = FALSE;
6095 CE = ucol_IGetNextCE(coll, &s, status);
6096 if(CE==UCOL_NO_MORE_CES) {
6097 // Add the level separator
6098 terminatePSKLevel(level, maxLevel, i, dest);
6099 //dest[i++] = UCOL_LEVELTERMINATOR;
6100 byteCountOrFrenchDone=0;
6101 // Restart the iteration and move to the
6102 // next level
6103 s.iterator->move(s.iterator, 0, UITER_START);
6104 level = UCOL_PSK_QUIN;
6105 break;
6106 }
6107 if(isShiftedCE(CE, LVT, &wasShifted)) {
6108 CE >>= 16; /* get primary */
6109 if(CE != 0) {
6110 if(byteCountOrFrenchDone == 0) {
6111 dest[i++]=(uint8_t)(CE >> 8);
6112 } else {
6113 byteCountOrFrenchDone = 0;
6114 }
6115 if((CE &=0xff)!=0) {
6116 if(i==count) {
6117 /* overflow */
6118 byteCountOrFrenchDone=1;
6119 goto saveState;
6120 }
6121 dest[i++]=(uint8_t)CE;
6122 }
6123 }
6124 } else {
6125 notIsContinuation = !isContinuation(CE);
6126 if(notIsContinuation) {
6127 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6128 dest[i++] = UCOL_HIRAGANA_QUAD;
6129 } else {
6130 dest[i++] = 0xFF;
6131 }
6132 }
6133 }
6134 if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6135 consumedExpansionCEs++;
6136 } else {
6137 consumedExpansionCEs = 0;
6138 }
6139 if(s.pos && *s.pos == 0) {
6140 iterSkips++;
6141 }
6142 }
6143 } else {
6144 // if we're not doing quaternary
6145 // skip to the end
6146 level = UCOL_PSK_NULL;
6147 }
6148 /* fall through to next level */
6149 case UCOL_PSK_QUIN:
6150 level = UCOL_PSK_IDENTICAL;
6151 /* fall through to next level */
6152 case UCOL_PSK_IDENTICAL:
6153 if(strength >= UCOL_IDENTICAL) {
6154 UChar32 first, second;
6155 int32_t bocsuBytesWritten = 0;
6156 // We always need to do identical on
6157 // the NFD form of the string.
6158 if(normIter == NULL) {
6159 // we arrived from the level below and
6160 // normalization was not turned on.
6161 // therefore, we need to make a fresh NFD iterator
6162 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6163 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6164 } else if(!doingIdenticalFromStart) {
6165 // there is an iterator, but we did some other levels.
6166 // therefore, we have a FCD iterator - need to make
6167 // a NFD one.
6168 // normIter being at the beginning does not guarantee
6169 // that the underlying iterator is at the beginning
6170 iter->move(iter, 0, UITER_START);
6171 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6172 }
6173 // At this point we have a NFD iterator that is positioned
6174 // in the right place
6175 if(U_FAILURE(*status)) {
6176 return 0;
6177 }
6178 first = uiter_previous32(s.iterator);
6179 // maybe we're at the start of the string
6180 if(first == U_SENTINEL) {
6181 first = 0;
6182 } else {
6183 uiter_next32(s.iterator);
6184 }
6185
6186 j = 0;
6187 for(;;) {
6188 if(i == count) {
6189 if(j+1 < bocsuBytesWritten) {
6190 bocsuBytesUsed = j+1;
6191 }
6192 goto saveState;
6193 }
6194
6195 // On identical level, we will always save
6196 // the state if we reach this point, since
6197 // we don't depend on getNextCE for content
6198 // all the content is in our buffer and we
6199 // already either stored the full buffer OR
6200 // otherwise we won't arrive here.
6201 newState = s.iterator->getState(s.iterator);
6202 if(newState != UITER_NO_STATE) {
6203 iterState = newState;
6204 iterSkips = 0;
6205 } else {
6206 iterSkips++;
6207 }
6208
6209 uint8_t buff[4];
6210 second = uiter_next32(s.iterator);
6211
6212 // end condition for identical level
6213 if(second == U_SENTINEL) {
6214 terminatePSKLevel(level, maxLevel, i, dest);
6215 level = UCOL_PSK_NULL;
6216 break;
6217 }
6218 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6219 first = second;
6220
6221 j = 0;
6222 if(bocsuBytesUsed != 0) {
6223 while(bocsuBytesUsed-->0) {
6224 j++;
6225 }
6226 }
6227
6228 while(i < count && j < bocsuBytesWritten) {
6229 dest[i++] = buff[j++];
6230 }
6231 }
6232
6233 } else {
6234 level = UCOL_PSK_NULL;
6235 }
6236 /* fall through to next level */
6237 case UCOL_PSK_NULL:
6238 j = i;
6239 while(j<count) {
6240 dest[j++]=0;
6241 }
6242 break;
6243 default:
6244 *status = U_INTERNAL_PROGRAM_ERROR;
6245 return 0;
6246 }
6247
6248 saveState:
6249 // Now we need to return stuff. First we want to see whether we have
6250 // done everything for the current state of iterator.
6251 if(consumedExpansionCEs || byteCountOrFrenchDone
6252 || dontAdvanceIteratorBecauseWeNeedALevelTerminator) {
6253 // Any of above mean that the previous transaction
6254 // wasn't finished and that we should store the
6255 // previous iterator state.
6256 state[0] = iterState;
6257 } else {
6258 // The transaction is complete. We will continue in
6259 // next iteration.
6260 if((newState = s.iterator->getState(s.iterator))!= UITER_NO_STATE) {
6261 state[0] = s.iterator->getState(s.iterator);
6262 iterSkips = 0;
6263 } else {
6264 state[0] = iterState;
6265 iterSkips++;
6266 }
6267 }
6268 // Store the number of elements processed. On CE levels, this is
6269 // the number of expansion CEs processed. On identical level, this
6270 // is the number of bocsu bytes written.
6271 if(level < UCOL_PSK_IDENTICAL) {
6272 if((consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK) != consumedExpansionCEs) {
6273 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6274 }
6275 state[1] = (consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK) << UCOL_PSK_USED_ELEMENTS_SHIFT;
6276 } else {
6277 if((bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK) != bocsuBytesUsed) {
6278 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6279 }
6280 state[1] = (bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK) << UCOL_PSK_USED_ELEMENTS_SHIFT;
6281 }
6282
6283 // Next we put in the level of comparison
6284 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6285
6286 // If we are doing French, we need to store whether we have just finished the French level
6287 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6288 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6289 } else {
6290 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6291 }
6292
6293 // Was the latest CE shifted
6294 if(wasShifted) {
6295 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6296 }
6297 // Check for iterSkips overflow
6298 if((iterSkips & UCOL_PSK_ITER_SKIP_MASK) != iterSkips) {
6299 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6300 }
6301 // Store iterSkips
6302 state[1] |= ((iterSkips & UCOL_PSK_ITER_SKIP_MASK) << UCOL_PSK_ITER_SKIP_SHIFT);
6303
6304 // Check for French overflow
6305 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6306 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6307 }
6308 // Store number of bytes written in the French secondary continuation sequence
6309 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6310
6311
6312 // If we have used normalizing iterator, get rid of it
6313 if(normIter != NULL) {
6314 unorm_closeIter(normIter);
6315 }
6316
6317 // Return number of meaningful sortkey bytes.
6318 return i;
6319 }
6320
6321 /**
6322 * Produce a bound for a given sortkey and a number of levels.
6323 */
6324 U_CAPI int32_t U_EXPORT2
6325 ucol_getBound(const uint8_t *source,
6326 int32_t sourceLength,
6327 UColBoundMode boundType,
6328 uint32_t noOfLevels,
6329 uint8_t *result,
6330 int32_t resultLength,
6331 UErrorCode *status) {
6332 // consistency checks
6333 if(status == NULL || U_FAILURE(*status)) {
6334 return 0;
6335 }
6336 if(source == NULL) {
6337 *status = U_ILLEGAL_ARGUMENT_ERROR;
6338 return 0;
6339 }
6340
6341 int32_t sourceIndex = 0;
6342 // Scan the string until we skip enough of the key OR reach the end of the key
6343 do {
6344 sourceIndex++;
6345 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6346 noOfLevels--;
6347 }
6348 } while (noOfLevels > 0
6349 && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6350
6351 if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6352 && noOfLevels > 0) {
6353 *status = U_SORT_KEY_TOO_SHORT_WARNING;
6354 }
6355
6356
6357 // READ ME: this code assumes that the values for boundType
6358 // enum will not changes. They are set so that the enum value
6359 // corresponds to the number of extra bytes each bound type
6360 // needs.
6361 if(result != NULL && resultLength >= sourceIndex+boundType) {
6362 uprv_memcpy(result, source, sourceIndex);
6363 switch(boundType) {
6364 // Lower bound just gets terminated. No extra bytes
6365 case UCOL_BOUND_LOWER: // = 0
6366 break;
6367 // Upper bound needs one extra byte
6368 case UCOL_BOUND_UPPER: // = 1
6369 result[sourceIndex++] = 2;
6370 break;
6371 // Upper long bound needs two extra bytes
6372 case UCOL_BOUND_UPPER_LONG: // = 2
6373 result[sourceIndex++] = 0xFF;
6374 result[sourceIndex++] = 0xFF;
6375 break;
6376 default:
6377 *status = U_ILLEGAL_ARGUMENT_ERROR;
6378 return 0;
6379 }
6380 result[sourceIndex++] = 0;
6381
6382 return sourceIndex;
6383 } else {
6384 return sourceIndex+boundType+1;
6385 }
6386 }
6387
6388 static
6389 inline void uprv_appendByteToHexString(char *dst, uint8_t val) {
6390 uint32_t len = (uint32_t)uprv_strlen(dst);
6391 *(dst+len) = T_CString_itosOffset((val >> 4));
6392 *(dst+len+1) = T_CString_itosOffset((val & 0xF));
6393 *(dst+len+2) = 0;
6394 }
6395
6396 /* this function makes a string with representation of a sortkey */
6397 U_CAPI char* U_EXPORT2 ucol_sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len) {
6398 int32_t strength = UCOL_PRIMARY;
6399 uint32_t res_size = 0;
6400 UBool doneCase = FALSE;
6401
6402 char *current = buffer;
6403 const uint8_t *currentSk = sortkey;
6404
6405 uprv_strcpy(current, "[");
6406
6407 while(strength <= UCOL_QUATERNARY && strength <= coll->strength) {
6408 if(strength > UCOL_PRIMARY) {
6409 strcat(current, " . ");
6410 }
6411 while(*currentSk != 0x01 && *currentSk != 0x00) { /* print a level */
6412 uprv_appendByteToHexString(current, *currentSk++);
6413 uprv_strcat(current, " ");
6414 }
6415 if(coll->caseLevel == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) {
6416 doneCase = TRUE;
6417 } else if(coll->caseLevel == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) {
6418 strength ++;
6419 }
6420 uprv_appendByteToHexString(current, *currentSk++); /* This should print '01' */
6421 if(strength == UCOL_QUATERNARY && coll->alternateHandling == UCOL_NON_IGNORABLE) {
6422 break;
6423 }
6424 }
6425
6426 if(coll->strength == UCOL_IDENTICAL) {
6427 uprv_strcat(current, " . ");
6428 while(*currentSk != 0) {
6429 uprv_appendByteToHexString(current, *currentSk++);
6430 uprv_strcat(current, " ");
6431 }
6432
6433 uprv_appendByteToHexString(current, *currentSk++);
6434 }
6435 uprv_strcat(current, "]");
6436
6437 if(res_size > *len) {
6438 return NULL;
6439 }
6440
6441 return buffer;
6442 }
6443
6444
6445 /****************************************************************************/
6446 /* Following are the functions that deal with the properties of a collator */
6447 /* there are new APIs and some compatibility APIs */
6448 /****************************************************************************/
6449
6450 static inline void
6451 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6452 int32_t *primShift, int32_t *secShift, int32_t *terShift) {
6453 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6454 UBool reverseSecondary = FALSE;
6455 if(!isContinuation(CE)) {
6456 tertiary = (uint8_t)((CE & coll->tertiaryMask));
6457 tertiary ^= coll->caseSwitch;
6458 reverseSecondary = TRUE;
6459 } else {
6460 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6461 tertiary &= UCOL_REMOVE_CASE;
6462 reverseSecondary = FALSE;
6463 }
6464
6465 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6466 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6467 primary1 = (uint8_t)(CE >> 8);
6468
6469 if(primary1 != 0) {
6470 coll->latinOneCEs[ch] |= (primary1 << *primShift);
6471 *primShift -= 8;
6472 }
6473 if(primary2 != 0) {
6474 if(*primShift < 0) {
6475 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6476 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6477 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6478 return;
6479 }
6480 coll->latinOneCEs[ch] |= (primary2 << *primShift);
6481 *primShift -= 8;
6482 }
6483 if(secondary != 0) {
6484 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6485 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6486 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6487 } else { // normal case
6488 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6489 }
6490 *secShift -= 8;
6491 }
6492 if(tertiary != 0) {
6493 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6494 *terShift -= 8;
6495 }
6496 }
6497
6498 static inline UBool
6499 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6500 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6501 if(newTable == NULL) {
6502 *status = U_MEMORY_ALLOCATION_ERROR;
6503 coll->latinOneFailed = TRUE;
6504 return FALSE;
6505 }
6506 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6507 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6508 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6509 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6510 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6511 coll->latinOneTableLen = size;
6512 uprv_free(coll->latinOneCEs);
6513 coll->latinOneCEs = newTable;
6514 return TRUE;
6515 }
6516
6517 static UBool
6518 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6519 UBool result = TRUE;
6520 if(coll->latinOneCEs == NULL) {
6521 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6522 if(coll->latinOneCEs == NULL) {
6523 *status = U_MEMORY_ALLOCATION_ERROR;
6524 return FALSE;
6525 }
6526 coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6527 }
6528 UChar ch = 0;
6529 UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6530 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6531
6532 int32_t primShift = 24, secShift = 24, terShift = 24;
6533 uint32_t CE = 0;
6534 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6535
6536 // TODO: make safe if you get more than you wanted...
6537 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6538 primShift = 24; secShift = 24; terShift = 24;
6539 if(ch < 0x100) {
6540 CE = coll->latinOneMapping[ch];
6541 } else {
6542 CE = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
6543 if(CE == UCOL_NOT_FOUND) {
6544 CE = UTRIE_GET32_FROM_LEAD(UCA->mapping, ch);
6545 }
6546 }
6547 if(CE < UCOL_NOT_FOUND) {
6548 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6549 } else {
6550 switch (getCETag(CE)) {
6551 case EXPANSION_TAG:
6552 ucol_setText(it, &ch, 1, status);
6553 while((CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6554 if(primShift < 0 || secShift < 0 || terShift < 0) {
6555 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6556 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6557 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6558 break;
6559 }
6560 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6561 }
6562 break;
6563 case CONTRACTION_TAG:
6564 // here is the trick
6565 // F2 is contraction. We do something very similar to contractions
6566 // but have two indices, one in the real contraction table and the
6567 // other to where we stuffed things. This hopes that we don't have
6568 // many contractions (this should work for latin-1 tables).
6569 {
6570 if((CE & 0x00FFF000) != 0) {
6571 *status = U_UNSUPPORTED_ERROR;
6572 return FALSE;
6573 }
6574
6575 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6576
6577 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6578
6579 coll->latinOneCEs[ch] = CE;
6580 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6581 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6582
6583 // We're going to jump into contraction table, pick the elements
6584 // and use them
6585 do {
6586 CE = *(coll->contractionCEs +
6587 (UCharOffset - coll->contractionIndex));
6588 if(getCETag(CE) == EXPANSION_TAG) {
6589 uint32_t size;
6590 uint32_t i; /* general counter */
6591 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6592 size = getExpansionCount(CE);
6593 //CE = *CEOffset++;
6594 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6595 for(i = 0; i<size; i++) {
6596 if(primShift < 0 || secShift < 0 || terShift < 0) {
6597 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6598 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6599 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6600 break;
6601 }
6602 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6603 }
6604 } else { /* else, we do */
6605 while(*CEOffset != 0) {
6606 if(primShift < 0 || secShift < 0 || terShift < 0) {
6607 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6608 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6609 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6610 break;
6611 }
6612 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6613 }
6614 }
6615 contractionOffset++;
6616 } else if(CE < UCOL_NOT_FOUND) {
6617 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6618 } else {
6619 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6620 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6621 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6622 contractionOffset++;
6623 }
6624 UCharOffset++;
6625 primShift = 24; secShift = 24; terShift = 24;
6626 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6627 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6628 return FALSE;
6629 }
6630 }
6631 } while(*UCharOffset != 0xFFFF);
6632 }
6633 break;
6634 default:
6635 coll->latinOneFailed = TRUE;
6636 result = FALSE;
6637 break;
6638 }
6639 }
6640 }
6641 ucol_closeElements(it);
6642 // compact table
6643 if(contractionOffset < coll->latinOneTableLen) {
6644 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6645 return FALSE;
6646 }
6647 }
6648 return result;
6649 }
6650
6651 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6652 if(U_SUCCESS(*status)) {
6653 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6654 coll->caseSwitch = UCOL_CASE_SWITCH;
6655 } else {
6656 coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6657 }
6658
6659 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6660 coll->tertiaryMask = UCOL_REMOVE_CASE;
6661 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6662 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_OFF;
6663 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6664 coll->tertiaryBottom = UCOL_COMMON_BOT3;
6665 } else {
6666 coll->tertiaryMask = UCOL_KEEP_CASE;
6667 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6668 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6669 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6670 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6671 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6672 } else {
6673 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6674 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6675 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6676 }
6677 }
6678
6679 /* Set the compression values */
6680 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
6681 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6682 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6683
6684 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6685 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) {
6686 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6687 } else {
6688 coll->sortKeyGen = ucol_calcSortKey;
6689 }
6690 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY
6691 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) {
6692 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6693 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6694 //fprintf(stderr, "F");
6695 coll->latinOneUse = TRUE;
6696 } else {
6697 coll->latinOneUse = FALSE;
6698 }
6699 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6700 coll->latinOneUse = TRUE;
6701 }
6702 } else {
6703 coll->latinOneUse = FALSE;
6704 }
6705 }
6706
6707 }
6708
6709 U_CAPI uint32_t U_EXPORT2
6710 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6711 if(U_FAILURE(*status) || coll == NULL) {
6712 return 0;
6713 }
6714 if(len == -1) {
6715 len = u_strlen(varTop);
6716 }
6717 if(len == 0) {
6718 *status = U_ILLEGAL_ARGUMENT_ERROR;
6719 return 0;
6720 }
6721
6722 collIterate s;
6723 IInit_collIterate(coll, varTop, len, &s);
6724
6725 uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6726
6727 /* here we check if we have consumed all characters */
6728 /* you can put in either one character or a contraction */
6729 /* you shouldn't put more... */
6730 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6731 *status = U_CE_NOT_FOUND_ERROR;
6732 return 0;
6733 }
6734
6735 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6736
6737 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6738 *status = U_PRIMARY_TOO_LONG_ERROR;
6739 return 0;
6740 }
6741
6742 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6743
6744 return CE & UCOL_PRIMARYMASK;
6745 }
6746
6747 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6748 if(U_FAILURE(*status) || coll == NULL) {
6749 return 0;
6750 }
6751 return coll->variableTopValue<<16;
6752 }
6753
6754 U_CAPI void U_EXPORT2
6755 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6756 if(U_FAILURE(*status) || coll == NULL) {
6757 return;
6758 }
6759 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6760 }
6761 /* Attribute setter API */
6762 U_CAPI void U_EXPORT2
6763 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6764 if(U_FAILURE(*status) || coll == NULL) {
6765 return;
6766 }
6767 UColAttributeValue oldFrench = coll->frenchCollation;
6768 UColAttributeValue oldCaseFirst = coll->caseFirst;
6769 switch(attr) {
6770 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6771 if(value == UCOL_ON) {
6772 coll->numericCollation = UCOL_ON;
6773 coll->numericCollationisDefault = FALSE;
6774 } else if (value == UCOL_OFF) {
6775 coll->numericCollation = UCOL_OFF;
6776 coll->numericCollationisDefault = FALSE;
6777 } else if (value == UCOL_DEFAULT) {
6778 coll->numericCollationisDefault = TRUE;
6779 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6780 } else {
6781 *status = U_ILLEGAL_ARGUMENT_ERROR;
6782 }
6783 break;
6784 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6785 if(value == UCOL_ON) {
6786 coll->hiraganaQ = UCOL_ON;
6787 coll->hiraganaQisDefault = FALSE;
6788 } else if (value == UCOL_OFF) {
6789 coll->hiraganaQ = UCOL_OFF;
6790 coll->hiraganaQisDefault = FALSE;
6791 } else if (value == UCOL_DEFAULT) {
6792 coll->hiraganaQisDefault = TRUE;
6793 coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
6794 } else {
6795 *status = U_ILLEGAL_ARGUMENT_ERROR;
6796 }
6797 break;
6798 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6799 if(value == UCOL_ON) {
6800 coll->frenchCollation = UCOL_ON;
6801 coll->frenchCollationisDefault = FALSE;
6802 } else if (value == UCOL_OFF) {
6803 coll->frenchCollation = UCOL_OFF;
6804 coll->frenchCollationisDefault = FALSE;
6805 } else if (value == UCOL_DEFAULT) {
6806 coll->frenchCollationisDefault = TRUE;
6807 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6808 } else {
6809 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6810 }
6811 break;
6812 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6813 if(value == UCOL_SHIFTED) {
6814 coll->alternateHandling = UCOL_SHIFTED;
6815 coll->alternateHandlingisDefault = FALSE;
6816 } else if (value == UCOL_NON_IGNORABLE) {
6817 coll->alternateHandling = UCOL_NON_IGNORABLE;
6818 coll->alternateHandlingisDefault = FALSE;
6819 } else if (value == UCOL_DEFAULT) {
6820 coll->alternateHandlingisDefault = TRUE;
6821 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6822 } else {
6823 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6824 }
6825 break;
6826 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6827 if(value == UCOL_LOWER_FIRST) {
6828 coll->caseFirst = UCOL_LOWER_FIRST;
6829 coll->caseFirstisDefault = FALSE;
6830 } else if (value == UCOL_UPPER_FIRST) {
6831 coll->caseFirst = UCOL_UPPER_FIRST;
6832 coll->caseFirstisDefault = FALSE;
6833 } else if (value == UCOL_OFF) {
6834 coll->caseFirst = UCOL_OFF;
6835 coll->caseFirstisDefault = FALSE;
6836 } else if (value == UCOL_DEFAULT) {
6837 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6838 coll->caseFirstisDefault = TRUE;
6839 } else {
6840 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6841 }
6842 break;
6843 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6844 if(value == UCOL_ON) {
6845 coll->caseLevel = UCOL_ON;
6846 coll->caseLevelisDefault = FALSE;
6847 } else if (value == UCOL_OFF) {
6848 coll->caseLevel = UCOL_OFF;
6849 coll->caseLevelisDefault = FALSE;
6850 } else if (value == UCOL_DEFAULT) {
6851 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6852 coll->caseLevelisDefault = TRUE;
6853 } else {
6854 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6855 }
6856 break;
6857 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6858 if(value == UCOL_ON) {
6859 coll->normalizationMode = UCOL_ON;
6860 coll->normalizationModeisDefault = FALSE;
6861 } else if (value == UCOL_OFF) {
6862 coll->normalizationMode = UCOL_OFF;
6863 coll->normalizationModeisDefault = FALSE;
6864 } else if (value == UCOL_DEFAULT) {
6865 coll->normalizationModeisDefault = TRUE;
6866 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6867 } else {
6868 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6869 }
6870 break;
6871 case UCOL_STRENGTH: /* attribute for strength */
6872 if (value == UCOL_DEFAULT) {
6873 coll->strengthisDefault = TRUE;
6874 coll->strength = (UColAttributeValue)coll->options->strength;
6875 } else if (value <= UCOL_IDENTICAL) {
6876 coll->strengthisDefault = FALSE;
6877 coll->strength = value;
6878 } else {
6879 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6880 }
6881 break;
6882 case UCOL_ATTRIBUTE_COUNT:
6883 default:
6884 *status = U_ILLEGAL_ARGUMENT_ERROR;
6885 break;
6886 }
6887 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6888 coll->latinOneRegenTable = TRUE;
6889 } else {
6890 coll->latinOneRegenTable = FALSE;
6891 }
6892 ucol_updateInternalState(coll, status);
6893 }
6894
6895 U_CAPI UColAttributeValue U_EXPORT2
6896 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6897 if(U_FAILURE(*status) || coll == NULL) {
6898 return UCOL_DEFAULT;
6899 }
6900 switch(attr) {
6901 case UCOL_NUMERIC_COLLATION:
6902 return coll->numericCollation;
6903 case UCOL_HIRAGANA_QUATERNARY_MODE:
6904 return coll->hiraganaQ;
6905 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6906 return coll->frenchCollation;
6907 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6908 return coll->alternateHandling;
6909 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6910 return coll->caseFirst;
6911 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6912 return coll->caseLevel;
6913 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6914 return coll->normalizationMode;
6915 case UCOL_STRENGTH: /* attribute for strength */
6916 return coll->strength;
6917 case UCOL_ATTRIBUTE_COUNT:
6918 default:
6919 *status = U_ILLEGAL_ARGUMENT_ERROR;
6920 break;
6921 }
6922 return UCOL_DEFAULT;
6923 }
6924
6925 U_CAPI void U_EXPORT2
6926 ucol_setStrength( UCollator *coll,
6927 UCollationStrength strength)
6928 {
6929 UErrorCode status = U_ZERO_ERROR;
6930 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6931 }
6932
6933 U_CAPI UCollationStrength U_EXPORT2
6934 ucol_getStrength(const UCollator *coll)
6935 {
6936 UErrorCode status = U_ZERO_ERROR;
6937 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6938 }
6939
6940 /****************************************************************************/
6941 /* Following are misc functions */
6942 /* there are new APIs and some compatibility APIs */
6943 /****************************************************************************/
6944
6945 U_CAPI UCollator* U_EXPORT2
6946 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
6947 {
6948 UCollator * localCollator;
6949 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
6950 char *stackBufferChars = (char *)stackBuffer;
6951
6952 if (status == NULL || U_FAILURE(*status)){
6953 return 0;
6954 }
6955 if ((stackBuffer && !pBufferSize) || !coll){
6956 *status = U_ILLEGAL_ARGUMENT_ERROR;
6957 return 0;
6958 }
6959 /* Pointers on 64-bit platforms need to be aligned
6960 * on a 64-bit boundry in memory.
6961 */
6962 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
6963 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
6964 *pBufferSize -= offsetUp;
6965 stackBufferChars += offsetUp;
6966 }
6967 stackBuffer = (void *)stackBufferChars;
6968
6969 if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
6970 *pBufferSize = bufferSizeNeeded;
6971 return 0;
6972 }
6973 if (!stackBuffer || *pBufferSize < bufferSizeNeeded) {
6974 /* allocate one here...*/
6975 int32_t length;
6976 const UChar * rules = ucol_getRules(coll, &length);
6977
6978 localCollator = ucol_openRules(rules,
6979 length,
6980 ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status),
6981 ucol_getStrength(coll),
6982 NULL,
6983 status);
6984 if (U_SUCCESS(*status))
6985 {
6986 *status = U_SAFECLONE_ALLOCATED_WARNING;
6987 }
6988 } else {
6989 localCollator = (UCollator *)stackBuffer;
6990 memcpy(localCollator, coll, sizeof(UCollator));
6991 localCollator->freeOnClose = FALSE;
6992 localCollator->requestedLocale = NULL; // zero copies of pointers
6993 localCollator->validLocale = NULL;
6994 }
6995 return localCollator;
6996 }
6997
6998 U_CAPI int32_t U_EXPORT2
6999 ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) {
7000 UErrorCode status = U_ZERO_ERROR;
7001 int32_t len = 0;
7002 int32_t UCAlen = 0;
7003 const UChar* ucaRules = 0;
7004 const UChar *rules = ucol_getRules(coll, &len);
7005 if(delta == UCOL_FULL_RULES) {
7006 /* take the UCA rules and append real rules at the end */
7007 /* UCA rules will be probably coming from the root RB */
7008 ucaRules = ures_getStringByKey(coll->rb,"%%UCARULES",&UCAlen,&status);
7009 }
7010 if(U_FAILURE(status)) {
7011 return 0;
7012 }
7013 if(buffer!=0 && bufferLen>0){
7014 *buffer=0;
7015 if(UCAlen > 0) {
7016 u_memcpy(buffer, ucaRules, uprv_min(UCAlen, bufferLen));
7017 }
7018 if(len > 0 && bufferLen > UCAlen) {
7019 u_memcpy(buffer+UCAlen, rules, uprv_min(len, bufferLen-UCAlen));
7020 }
7021 }
7022 return u_terminateUChars(buffer, bufferLen, len+UCAlen, &status);
7023 }
7024
7025 static const UChar _NUL = 0;
7026
7027 U_CAPI const UChar* U_EXPORT2
7028 ucol_getRules( const UCollator *coll,
7029 int32_t *length)
7030 {
7031 if(coll->rules != NULL) {
7032 *length = coll->rulesLength;
7033 return coll->rules;
7034 } else {
7035 UErrorCode status = U_ZERO_ERROR;
7036 if(coll->rb != NULL) {
7037 UResourceBundle *collElem = ures_getByKey(coll->rb, "CollationElements", NULL, &status);
7038 if(U_SUCCESS(status)) {
7039 /*Semantic const */
7040 ((UCollator *)coll)->rules = ures_getStringByKey(collElem, "Sequence", length, &status);
7041 ((UCollator *)coll)->rulesLength = *length;
7042 ((UCollator *)coll)->freeRulesOnClose = FALSE;
7043 ures_close(collElem);
7044 return coll->rules;
7045 }
7046 }
7047 *length = 0;
7048 return &_NUL;
7049 }
7050 }
7051
7052 U_CAPI int32_t U_EXPORT2
7053 ucol_getDisplayName( const char *objLoc,
7054 const char *dispLoc,
7055 UChar *result,
7056 int32_t resultLength,
7057 UErrorCode *status)
7058 {
7059
7060 if(U_FAILURE(*status)) return -1;
7061 UnicodeString dst;
7062 if(!(result==NULL && resultLength==0)) {
7063 // NULL destination for pure preflighting: empty dummy string
7064 // otherwise, alias the destination buffer
7065 dst.setTo(result, 0, resultLength);
7066 }
7067 Collator::getDisplayName(Locale(objLoc), Locale(dispLoc), dst);
7068 return dst.extract(result, resultLength, *status);
7069 }
7070
7071 U_CAPI const char* U_EXPORT2
7072 ucol_getAvailable(int32_t index)
7073 {
7074 return uloc_getAvailable(index);
7075 }
7076
7077 U_CAPI int32_t U_EXPORT2
7078 ucol_countAvailable()
7079 {
7080 return uloc_countAvailable();
7081 }
7082
7083 U_CAPI void U_EXPORT2
7084 ucol_getVersion(const UCollator* coll,
7085 UVersionInfo versionInfo)
7086 {
7087 /* RunTime version */
7088 uint8_t rtVersion = UCOL_RUNTIME_VERSION;
7089 /* Builder version*/
7090 uint8_t bdVersion = coll->image->version[0];
7091
7092 /* Charset Version. Need to get the version from cnv files
7093 * makeconv should populate cnv files with version and
7094 * an api has to be provided in ucnv.h to obtain this version
7095 */
7096 uint8_t csVersion = 0;
7097
7098 /* combine the version info */
7099 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
7100
7101 /* Tailoring rules */
7102 versionInfo[0] = (uint8_t)(cmbVersion>>8);
7103 versionInfo[1] = (uint8_t)cmbVersion;
7104 versionInfo[2] = coll->image->version[1];
7105 versionInfo[3] = UCA->image->UCAVersion[0];
7106 }
7107
7108
7109 /* This internal API checks whether a character is tailored or not */
7110 U_CAPI UBool U_EXPORT2
7111 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
7112 uint32_t CE = UCOL_NOT_FOUND;
7113 const UChar *ContractionStart = NULL;
7114 if(U_SUCCESS(*status) && coll != NULL) {
7115 if(coll == UCA) {
7116 return FALSE;
7117 } else if(u < 0x100) { /* latin-1 */
7118 CE = coll->latinOneMapping[u];
7119 if(CE == UCA->latinOneMapping[u]) {
7120 return FALSE;
7121 }
7122 } else { /* regular */
7123 /*CE = ucmpe32_get(coll->mapping, u);*/
7124 CE = UTRIE_GET32_FROM_LEAD(coll->mapping, u);
7125
7126 }
7127
7128 if(isContraction(CE)) {
7129 ContractionStart = (UChar *)coll->image+getContractOffset(CE);
7130 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
7131 }
7132
7133 if(CE == UCOL_NOT_FOUND) {
7134 return FALSE;
7135 } else {
7136 return TRUE;
7137 }
7138 } else {
7139 return FALSE;
7140 }
7141 }
7142
7143
7144 /****************************************************************************/
7145 /* Following are the string compare functions */
7146 /* */
7147 /****************************************************************************/
7148
7149
7150 /* ucol_checkIdent internal function. Does byte level string compare. */
7151 /* Used by strcoll if strength == identical and strings */
7152 /* are otherwise equal. Moved out-of-line because this */
7153 /* is a rare case. */
7154 /* */
7155 /* Comparison must be done on NFD normalized strings. */
7156 /* FCD is not good enough. */
7157 /* */
7158 /* TODO: make an incremental NFD Comparison function, which could */
7159 /* be of general use */
7160
7161 static
7162 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
7163 {
7164
7165 // TODO: When we have an UChar iterator, we need to access the whole string. One
7166 // useful modification would be a UChar iterator extract API, since reset next next...
7167 // is not optimal.
7168 // TODO: Handle long strings. Do the same in compareUsingSortKeys.
7169
7170 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
7171 // of same type, but that doesn't really mean that it will stay that way.
7172
7173 // The division for the array length may truncate the array size to
7174 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7175 // for all platforms anyway.
7176 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7177 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7178 //UChar sStackBuf[256], tStackBuf[256];
7179 //int32_t sBufSize = 256, tBufSize = 256;
7180 int32_t comparison;
7181 int32_t sLen = 0;
7182 UChar *sBuf = NULL;
7183 int32_t tLen = 0;
7184 UChar *tBuf = NULL;
7185 UBool freeSBuf = FALSE, freeTBuf = FALSE;
7186
7187 if (sColl->flags & UCOL_USE_ITERATOR) {
7188 UNormIterator *sNIt = NULL, *tNIt = NULL;
7189 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
7190 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
7191 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7192 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7193 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
7194 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
7195 comparison = u_strCompareIter(sIt, tIt, TRUE);
7196 unorm_closeIter(sNIt);
7197 unorm_closeIter(tNIt);
7198 } else {
7199 sLen = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1;
7200 sBuf = sColl->string;
7201 tLen = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1;
7202 tBuf = tColl->string;
7203
7204 if (normalize) {
7205 *status = U_ZERO_ERROR;
7206 if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) {
7207 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
7208 sBuf, sLen,
7209 FALSE, 0,
7210 status);
7211 if(*status == U_BUFFER_OVERFLOW_ERROR) {
7212 if(!u_growBufferFromStatic(sColl->stackWritableBuffer,
7213 &sColl->writableBuffer,
7214 (int32_t *)&sColl->writableBufSize, sLen,
7215 0)
7216 ) {
7217 *status = U_MEMORY_ALLOCATION_ERROR;
7218 return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7219 }
7220 *status = U_ZERO_ERROR;
7221 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
7222 sBuf, sLen,
7223 FALSE, 0,
7224 status);
7225 }
7226 if(freeSBuf) {
7227 uprv_free(sBuf);
7228 freeSBuf = FALSE;
7229 }
7230 sBuf = sColl->writableBuffer;
7231 if (sBuf != sColl->stackWritableBuffer) {
7232 sColl->flags |= UCOL_ITER_ALLOCATED;
7233 }
7234 }
7235
7236 *status = U_ZERO_ERROR;
7237 if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) {
7238 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7239 tBuf, tLen,
7240 FALSE, 0,
7241 status);
7242 if(*status == U_BUFFER_OVERFLOW_ERROR) {
7243 if(!u_growBufferFromStatic(tColl->stackWritableBuffer,
7244 &tColl->writableBuffer,
7245 (int32_t *)&tColl->writableBufSize, tLen,
7246 0)
7247 ) {
7248 *status = U_MEMORY_ALLOCATION_ERROR;
7249 return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7250 }
7251 *status = U_ZERO_ERROR;
7252 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7253 tBuf, tLen,
7254 FALSE, 0,
7255 status);
7256 }
7257 if(freeTBuf) {
7258 uprv_free(tBuf);
7259 freeTBuf = FALSE;
7260 }
7261 tBuf = tColl->writableBuffer;
7262 if (tBuf != tColl->stackWritableBuffer) {
7263 tColl->flags |= UCOL_ITER_ALLOCATED;
7264 }
7265 }
7266 }
7267
7268 if (sLen == -1 && tLen == -1) {
7269 comparison = u_strcmpCodePointOrder(sBuf, tBuf);
7270 } else {
7271 if (sLen == -1) {
7272 sLen = u_strlen(sBuf);
7273 }
7274 if (tLen == -1) {
7275 tLen = u_strlen(tBuf);
7276 }
7277 comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen));
7278 if (comparison == 0) {
7279 comparison = sLen - tLen;
7280 }
7281 }
7282 }
7283
7284 if (comparison < 0) {
7285 return UCOL_LESS;
7286 } else if (comparison == 0) {
7287 return UCOL_EQUAL;
7288 } else /* comparison > 0 */ {
7289 return UCOL_GREATER;
7290 }
7291 }
7292
7293 /* CEBuf - A struct and some inline functions to handle the saving */
7294 /* of CEs in a buffer within ucol_strcoll */
7295
/* Number of CEs that fit into the buffer's built-in array */
#define UCOL_CEBUF_SIZE 512

/* Growable buffer of CEs that starts out using its own embedded array */
typedef struct ucol_CEBuf {
    uint32_t    *buf;                          /* start of the storage in use */
    uint32_t    *endp;                         /* one past the last usable slot */
    uint32_t    *pos;                          /* next slot to be written */
    uint32_t     localArray[UCOL_CEBUF_SIZE];  /* initial storage */
} ucol_CEBuf;


/* Points the buffer at its embedded array and makes it empty */
static
inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
    b->buf  = b->localArray;
    b->pos  = b->localArray;
    b->endp = b->localArray + UCOL_CEBUF_SIZE;
}
7311 static
7312 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci) {
7313 uint32_t oldSize;
7314 uint32_t newSize;
7315 uint32_t *newBuf;
7316
7317 ci->flags |= UCOL_ITER_ALLOCATED;
7318 oldSize = b->pos - b->buf;
7319 newSize = oldSize * 2;
7320 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7321 if(newBuf != NULL) {
7322 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7323 if (b->buf != b->localArray) {
7324 uprv_free(b->buf);
7325 }
7326 b->buf = newBuf;
7327 b->endp = b->buf + newSize;
7328 b->pos = b->buf + oldSize;
7329 }
7330 }
7331
7332 static
7333 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci) {
7334 if (b->pos == b->endp) {
7335 ucol_CEBuf_Expand(b, ci);
7336 }
7337 *(b)->pos++ = ce;
7338 };
7339
7340 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7341 /* It is used when compare gets in trouble and needs to bail out */
7342 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7343 collIterate *tColl)
7344 {
7345 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7346 uint8_t *sourceKeyP = sourceKey;
7347 uint8_t *targetKeyP = targetKey;
7348 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7349 const UCollator *coll = sColl->coll;
7350 UChar *source = NULL;
7351 UChar *target = NULL;
7352 UChar sStackBuf[256], tStackBuf[256];
7353 int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7354 int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7355
7356 // TODO: Handle long strings. Do the same in ucol_checkIdent.
7357 if(sColl->flags & UCOL_USE_ITERATOR) {
7358 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7359 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7360 source = sStackBuf;
7361 UChar *sBufp = source;
7362 target = tStackBuf;
7363 UChar *tBufp = target;
7364 while(sColl->iterator->hasNext(sColl->iterator)) {
7365 *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator);
7366 }
7367 while(tColl->iterator->hasNext(tColl->iterator)) {
7368 *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator);
7369 }
7370 sourceLength = sBufp - source;
7371 targetLength = tBufp - target;
7372 } else { // no iterators
7373 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7374 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7375 source = sColl->string;
7376 target = tColl->string;
7377 }
7378
7379
7380
7381 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7382 if(sourceKeyLen > UCOL_MAX_BUFFER) {
7383 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7384 if(sourceKeyP != NULL) {
7385 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7386 }
7387 }
7388
7389 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7390 if(targetKeyLen > UCOL_MAX_BUFFER) {
7391 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7392 if(targetKeyP != NULL) {
7393 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7394 }
7395 }
7396
7397 int32_t result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7398
7399 if(sourceKeyP != sourceKey) {
7400 uprv_free(sourceKeyP);
7401 }
7402
7403 if(targetKeyP != targetKey) {
7404 uprv_free(targetKeyP);
7405 }
7406
7407 if(result<0) {
7408 return UCOL_LESS;
7409 } else if(result>0) {
7410 return UCOL_GREATER;
7411 } else {
7412 return UCOL_EQUAL;
7413 }
7414 }
7415
7416
7417 static inline UCollationResult
7418 ucol_strcollRegular( collIterate *sColl, collIterate *tColl,
7419 // const UCollator *coll,
7420 // const UChar *source,
7421 // int32_t sourceLength,
7422 // const UChar *target,
7423 // int32_t targetLength,
7424 UErrorCode *status)
7425 {
7426 U_ALIGN_CODE(16);
7427
7428 const UCollator *coll = sColl->coll;
7429
7430
7431 // setting up the collator parameters
7432 UColAttributeValue strength = coll->strength;
7433 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);
7434
7435 UBool checkSecTer = initialCheckSecTer;
7436 UBool checkTertiary = (strength >= UCOL_TERTIARY);
7437 UBool checkQuad = (strength >= UCOL_QUATERNARY);
7438 UBool checkIdent = (strength == UCOL_IDENTICAL);
7439 UBool checkCase = (coll->caseLevel == UCOL_ON);
7440 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7441 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7442 UBool qShifted = shifted && checkQuad;
7443 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7444
7445 if(doHiragana && shifted) {
7446 return (ucol_compareUsingSortKeys(sColl, tColl));
7447 }
7448 uint8_t caseSwitch = coll->caseSwitch;
7449 uint8_t tertiaryMask = coll->tertiaryMask;
7450
7451 // This is the lowest primary value that will not be ignored if shifted
7452 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7453
7454 UCollationResult result = UCOL_EQUAL;
7455 UCollationResult hirResult = UCOL_EQUAL;
7456
7457 // Preparing the CE buffers. They will be filled during the primary phase
7458 ucol_CEBuf sCEs;
7459 ucol_CEBuf tCEs;
7460 UCOL_INIT_CEBUF(&sCEs);
7461 UCOL_INIT_CEBUF(&tCEs);
7462
7463 uint32_t secS = 0, secT = 0;
7464 uint32_t sOrder=0, tOrder=0;
7465
7466 // Non shifted primary processing is quite simple
7467 if(!shifted) {
7468 for(;;) {
7469
7470 // We fetch CEs until we hit a non ignorable primary or end.
7471 do {
7472 // We get the next CE
7473 sOrder = ucol_IGetNextCE(coll, sColl, status);
7474 // Stuff it in the buffer
7475 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7476 // And keep just the primary part.
7477 sOrder &= UCOL_PRIMARYMASK;
7478 } while(sOrder == 0);
7479
7480 // see the comments on the above block
7481 do {
7482 tOrder = ucol_IGetNextCE(coll, tColl, status);
7483 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7484 tOrder &= UCOL_PRIMARYMASK;
7485 } while(tOrder == 0);
7486
7487 // if both primaries are the same
7488 if(sOrder == tOrder) {
7489 // and there are no more CEs, we advance to the next level
7490 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7491 break;
7492 }
7493 if(doHiragana && hirResult == UCOL_EQUAL) {
7494 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7495 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7496 ? UCOL_LESS:UCOL_GREATER;
7497 }
7498 }
7499 } else {
7500 // if two primaries are different, we are done
7501 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER;
7502 goto commonReturn;
7503 }
7504 } // no primary difference... do the rest from the buffers
7505 } else { // shifted - do a slightly more complicated processing :)
7506 for(;;) {
7507 UBool sInShifted = FALSE;
7508 UBool tInShifted = FALSE;
7509 // This version of code can be refactored. However, it seems easier to understand this way.
7510 // Source loop. Same as the target loop.
7511 for(;;) {
7512 sOrder = ucol_IGetNextCE(coll, sColl, status);
7513 if(sOrder == UCOL_NO_MORE_CES) {
7514 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7515 break;
7516 } else if(sOrder == 0
7517 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7518 /* UCA amendment - ignore ignorables that follow shifted code points */
7519 continue;
7520 } else if(isContinuation(sOrder)) {
7521 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7522 if(sInShifted) {
7523 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7524 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7525 continue;
7526 } else {
7527 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7528 break;
7529 }
7530 } else { /* Just lower level values */
7531 if(sInShifted) {
7532 continue;
7533 } else {
7534 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7535 continue;
7536 }
7537 }
7538 } else { /* regular */
7539 if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7540 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7541 break;
7542 } else {
7543 if((sOrder & UCOL_PRIMARYMASK) > 0) {
7544 sInShifted = TRUE;
7545 sOrder &= UCOL_PRIMARYMASK;
7546 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7547 continue;
7548 } else {
7549 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7550 sInShifted = FALSE;
7551 continue;
7552 }
7553 }
7554 }
7555 }
7556 sOrder &= UCOL_PRIMARYMASK;
7557 sInShifted = FALSE;
7558
7559 for(;;) {
7560 tOrder = ucol_IGetNextCE(coll, tColl, status);
7561 if(tOrder == UCOL_NO_MORE_CES) {
7562 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7563 break;
7564 } else if(tOrder == 0
7565 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7566 /* UCA amendment - ignore ignorables that follow shifted code points */
7567 continue;
7568 } else if(isContinuation(tOrder)) {
7569 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7570 if(tInShifted) {
7571 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7572 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7573 continue;
7574 } else {
7575 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7576 break;
7577 }
7578 } else { /* Just lower level values */
7579 if(tInShifted) {
7580 continue;
7581 } else {
7582 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7583 continue;
7584 }
7585 }
7586 } else { /* regular */
7587 if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7588 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7589 break;
7590 } else {
7591 if((tOrder & UCOL_PRIMARYMASK) > 0) {
7592 tInShifted = TRUE;
7593 tOrder &= UCOL_PRIMARYMASK;
7594 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7595 continue;
7596 } else {
7597 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7598 tInShifted = FALSE;
7599 continue;
7600 }
7601 }
7602 }
7603 }
7604 tOrder &= UCOL_PRIMARYMASK;
7605 tInShifted = FALSE;
7606
7607 if(sOrder == tOrder) {
7608 /*
7609 if(doHiragana && hirResult == UCOL_EQUAL) {
7610 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7611 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7612 ? UCOL_LESS:UCOL_GREATER;
7613 }
7614 }
7615 */
7616 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7617 break;
7618 } else {
7619 sOrder = 0; tOrder = 0;
7620 continue;
7621 }
7622 } else {
7623 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7624 goto commonReturn;
7625 }
7626 } /* no primary difference... do the rest from the buffers */
7627 }
7628
7629 /* now, we're gonna reexamine collected CEs */
7630 uint32_t *sCE;
7631 uint32_t *tCE;
7632
7633 /* This is the secondary level of comparison */
7634 if(checkSecTer) {
7635 if(!isFrenchSec) { /* normal */
7636 sCE = sCEs.buf;
7637 tCE = tCEs.buf;
7638 for(;;) {
7639 while (secS == 0) {
7640 secS = *(sCE++) & UCOL_SECONDARYMASK;
7641 }
7642
7643 while(secT == 0) {
7644 secT = *(tCE++) & UCOL_SECONDARYMASK;
7645 }
7646
7647 if(secS == secT) {
7648 if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7649 break;
7650 } else {
7651 secS = 0; secT = 0;
7652 continue;
7653 }
7654 } else {
7655 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7656 goto commonReturn;
7657 }
7658 }
7659 } else { /* do the French */
7660 uint32_t *sCESave = NULL;
7661 uint32_t *tCESave = NULL;
7662 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7663 tCE = tCEs.pos-2;
7664 for(;;) {
7665 while (secS == 0 && sCE >= sCEs.buf) {
7666 if(sCESave == 0) {
7667 secS = *(sCE--);
7668 if(isContinuation(secS)) {
7669 while(isContinuation(secS = *(sCE--)));
7670 /* after this, secS has the start of continuation, and sCEs points before that */
7671 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7672 sCE+=2; /* need to point to the first continuation CP */
7673 /* However, now you can just continue doing stuff */
7674 }
7675 } else {
7676 secS = *(sCE++);
7677 if(!isContinuation(secS)) { /* This means we have finished with this cont */
7678 sCE = sCESave; /* reset the pointer to before continuation */
7679 sCESave = 0;
7680 continue;
7681 }
7682 }
7683 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7684 }
7685
7686 while(secT == 0 && tCE >= tCEs.buf) {
7687 if(tCESave == 0) {
7688 secT = *(tCE--);
7689 if(isContinuation(secT)) {
7690 while(isContinuation(secT = *(tCE--)));
7691 /* after this, secT has the start of continuation, and tCE points before that */
7692 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7693 tCE+=2; /* need to point to the first continuation CP */
7694 /* However, now you can just continue doing stuff */
7695 }
7696 } else {
7697 secT = *(tCE++);
7698 if(!isContinuation(secT)) { /* This means we have finished with this cont */
7699 tCE = tCESave; /* reset the pointer to before continuation */
7700 tCESave = 0;
7701 continue;
7702 }
7703 }
7704 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7705 }
7706
7707 if(secS == secT) {
7708 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7709 break;
7710 } else {
7711 secS = 0; secT = 0;
7712 continue;
7713 }
7714 } else {
7715 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7716 goto commonReturn;
7717 }
7718 }
7719 }
7720 }
7721
7722 /* doing the case bit */
7723 if(checkCase) {
7724 sCE = sCEs.buf;
7725 tCE = tCEs.buf;
7726 for(;;) {
7727 while((secS & UCOL_REMOVE_CASE) == 0) {
7728 if(!isContinuation(*sCE++)) {
7729 secS =*(sCE-1) & UCOL_TERT_CASE_MASK;
7730 secS ^= caseSwitch;
7731 } else {
7732 secS = 0;
7733 }
7734 }
7735
7736 while((secT & UCOL_REMOVE_CASE) == 0) {
7737 if(!isContinuation(*tCE++)) {
7738 secT = *(tCE-1) & UCOL_TERT_CASE_MASK;
7739 secT ^= caseSwitch;
7740 } else {
7741 secT = 0;
7742 }
7743 }
7744
7745 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7746 result = UCOL_LESS;
7747 goto commonReturn;
7748 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7749 result = UCOL_GREATER;
7750 goto commonReturn;
7751 }
7752
7753 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7754 break;
7755 } else {
7756 secS = 0;
7757 secT = 0;
7758 }
7759 }
7760 }
7761
7762 /* Tertiary level */
7763 if(checkTertiary) {
7764 secS = 0;
7765 secT = 0;
7766 sCE = sCEs.buf;
7767 tCE = tCEs.buf;
7768 for(;;) {
7769 while((secS & UCOL_REMOVE_CASE) == 0) {
7770 secS = *(sCE++) & tertiaryMask;
7771 if(!isContinuation(secS)) {
7772 secS ^= caseSwitch;
7773 } else {
7774 secS &= UCOL_REMOVE_CASE;
7775 }
7776 }
7777
7778 while((secT & UCOL_REMOVE_CASE) == 0) {
7779 secT = *(tCE++) & tertiaryMask;
7780 if(!isContinuation(secT)) {
7781 secT ^= caseSwitch;
7782 } else {
7783 secT &= UCOL_REMOVE_CASE;
7784 }
7785 }
7786
7787 if(secS == secT) {
7788 if((secS & UCOL_REMOVE_CASE) == 1) {
7789 break;
7790 } else {
7791 secS = 0; secT = 0;
7792 continue;
7793 }
7794 } else {
7795 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7796 goto commonReturn;
7797 }
7798 }
7799 }
7800
7801
7802 if(qShifted /*checkQuad*/) {
7803 UBool sInShifted = TRUE;
7804 UBool tInShifted = TRUE;
7805 secS = 0;
7806 secT = 0;
7807 sCE = sCEs.buf;
7808 tCE = tCEs.buf;
7809 for(;;) {
7810 while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
7811 secS = *(sCE++);
7812 if(isContinuation(secS)) {
7813 if(!sInShifted) {
7814 continue;
7815 }
7816 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7817 secS = UCOL_PRIMARYMASK;
7818 sInShifted = FALSE;
7819 } else {
7820 sInShifted = TRUE;
7821 }
7822 }
7823 secS &= UCOL_PRIMARYMASK;
7824
7825
7826 while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
7827 secT = *(tCE++);
7828 if(isContinuation(secT)) {
7829 if(!tInShifted) {
7830 continue;
7831 }
7832 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7833 secT = UCOL_PRIMARYMASK;
7834 tInShifted = FALSE;
7835 } else {
7836 tInShifted = TRUE;
7837 }
7838 }
7839 secT &= UCOL_PRIMARYMASK;
7840
7841 if(secS == secT) {
7842 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7843 break;
7844 } else {
7845 secS = 0; secT = 0;
7846 continue;
7847 }
7848 } else {
7849 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7850 goto commonReturn;
7851 }
7852 }
7853 } else if(doHiragana && hirResult != UCOL_EQUAL) {
7854 // If we're fine on quaternaries, we might be different
7855 // on Hiragana. This, however, might fail us in shifted.
7856 result = hirResult;
7857 goto commonReturn;
7858 }
7859
7860 /* For IDENTICAL comparisons, we use a bitwise character comparison */
7861 /* as a tiebreaker if all else is equal. */
7862 /* Getting here should be quite rare - strings are not identical - */
7863 /* that is checked first, but compared == through all other checks. */
7864 if(checkIdent)
7865 {
7866 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7867 result = ucol_checkIdent(sColl, tColl, TRUE, status);
7868 }
7869
7870 commonReturn:
7871 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7872 freeHeapWritableBuffer(sColl);
7873 freeHeapWritableBuffer(tColl);
7874
7875 if (sCEs.buf != sCEs.localArray ) {
7876 uprv_free(sCEs.buf);
7877 }
7878 if (tCEs.buf != tCEs.localArray ) {
7879 uprv_free(tCEs.buf);
7880 }
7881 }
7882
7883 return result;
7884 }
7885
7886
7887 static inline uint32_t
7888 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7889 uint32_t CE, const UChar *s, int32_t *index, int32_t len) {
7890 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7891 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7892 int32_t offset = 1;
7893 UChar schar = 0, tchar = 0;
7894
7895 for(;;) {
7896 if(len == -1) {
7897 if(s[*index] == 0) { // end of string
7898 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7899 } else {
7900 schar = s[*index];
7901 }
7902 } else {
7903 if(*index == len) {
7904 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7905 } else {
7906 schar = s[*index];
7907 }
7908 }
7909
7910 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7911 offset++;
7912 }
7913
7914 if (schar == tchar) {
7915 (*index)++;
7916 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7917 }
7918 else
7919 {
7920 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7921 return UCOL_BAIL_OUT_CE;
7922 }
7923 // skip completely ignorables
7924 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
7925 if(isZeroCE == 0) { // we have to ignore completely ignorables
7926 (*index)++;
7927 continue;
7928 }
7929
7930 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7931 }
7932 }
7933 }
7934
7935
7936 /**
7937 * This is a fast strcoll, geared towards text in Latin-1.
7938 * It supports contractions of size two, French secondaries
7939 * and case switching. You can use it with strengths primary
7940 * to tertiary. It does not support shifted and case level.
7941 * It relies on the table build by setupLatin1Table. If it
7942 * doesn't understand something, it will go to the regular
7943 * strcoll.
7944 */
7945 static inline UCollationResult
7946 ucol_strcollUseLatin1( const UCollator *coll,
7947 const UChar *source,
7948 int32_t sLen,
7949 const UChar *target,
7950 int32_t tLen,
7951 UErrorCode *status)
7952 {
7953 U_ALIGN_CODE(16);
7954 int32_t strength = coll->strength;
7955
7956 int32_t sIndex = 0, tIndex = 0;
7957 UChar sChar = 0, tChar = 0;
7958 uint32_t sOrder=0, tOrder=0;
7959
7960 UBool endOfSource = FALSE, endOfTarget = FALSE;
7961
7962 uint32_t *elements = coll->latinOneCEs;
7963
7964 UBool haveContractions = FALSE; // if we have contractions in our string
7965 // we cannot do French secondary
7966
7967 // Do the primary level
7968 for(;;) {
7969 while(sOrder==0) { // this loop skips primary ignorables
7970 // sOrder=getNextlatinOneCE(source);
7971 if(sLen==-1) { // handling zero terminated strings
7972 sChar=source[sIndex++];
7973 if(sChar==0) {
7974 endOfSource = TRUE;
7975 break;
7976 }
7977 } else { // handling strings with known length
7978 if(sIndex==sLen) {
7979 endOfSource = TRUE;
7980 break;
7981 }
7982 sChar=source[sIndex++];
7983 }
7984 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7985 //fprintf(stderr, "R");
7986 goto returnRegular;
7987 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7988 }
7989 sOrder = elements[sChar];
7990 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
7991 // specials can basically be either contractions or bail-out signs. If we get anything
7992 // else, we'll bail out anywasy
7993 if(getCETag(sOrder) == CONTRACTION_TAG) {
7994 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
7995 haveContractions = TRUE; // if there are contractions, we cannot do French secondary
7996 // However, if there are contractions in the table, but we always use just one char,
7997 // we might be able to do French. This should be checked out.
7998 }
7999 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8000 //fprintf(stderr, "S");
8001 goto returnRegular;
8002 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8003 }
8004 }
8005 }
8006
8007 while(tOrder==0) { // this loop skips primary ignorables
8008 // tOrder=getNextlatinOneCE(target);
8009 if(tLen==-1) { // handling zero terminated strings
8010 tChar=target[tIndex++];
8011 if(tChar==0) {
8012 if(endOfSource) { // this is different than source loop,
8013 // as we already know that source loop is done here,
8014 // so we can either finish the primary loop if both
8015 // strings are done or anounce the result if only
8016 // target is done. Same below.
8017 goto endOfPrimLoop;
8018 } else {
8019 return UCOL_GREATER;
8020 }
8021 }
8022 } else { // handling strings with known length
8023 if(tIndex==tLen) {
8024 if(endOfSource) {
8025 goto endOfPrimLoop;
8026 } else {
8027 return UCOL_GREATER;
8028 }
8029 }
8030 tChar=target[tIndex++];
8031 }
8032 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8033 //fprintf(stderr, "R");
8034 goto returnRegular;
8035 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8036 }
8037 tOrder = elements[tChar];
8038 if(tOrder >= UCOL_NOT_FOUND) {
8039 // Handling specials, see the comments for source
8040 if(getCETag(tOrder) == CONTRACTION_TAG) {
8041 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
8042 haveContractions = TRUE;
8043 }
8044 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8045 //fprintf(stderr, "S");
8046 goto returnRegular;
8047 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8048 }
8049 }
8050 }
8051 if(endOfSource) { // source is finished, but target is not, say the result.
8052 return UCOL_LESS;
8053 }
8054
8055 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8056 sOrder = 0; tOrder = 0;
8057 continue;
8058 } else {
8059 // compare current top bytes
8060 if(((sOrder^tOrder)&0xFF000000)!=0) {
8061 // top bytes differ, return difference
8062 if(sOrder < tOrder) {
8063 return UCOL_LESS;
8064 } else if(sOrder > tOrder) {
8065 return UCOL_GREATER;
8066 }
8067 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8068 // since we must return enum value
8069 }
8070
8071 // top bytes match, continue with following bytes
8072 sOrder<<=8;
8073 tOrder<<=8;
8074 }
8075 }
8076
8077 endOfPrimLoop:
8078 // after primary loop, we definitely know the sizes of strings,
8079 // so we set it and use simpler loop for secondaries and tertiaries
8080 sLen = sIndex; tLen = tIndex;
8081 if(strength >= UCOL_SECONDARY) {
8082 // adjust the table beggining
8083 elements += coll->latinOneTableLen;
8084 endOfSource = FALSE; endOfTarget = FALSE;
8085
8086 if(coll->frenchCollation == UCOL_OFF) { // non French
8087 // This loop is a simplified copy of primary loop
8088 // at this point we know that whole strings are latin-1, so we don't
8089 // check for that. We also know that we only have contractions as
8090 // specials.
8091 sIndex = 0; tIndex = 0;
8092 for(;;) {
8093 while(sOrder==0) {
8094 if(sIndex==sLen) {
8095 endOfSource = TRUE;
8096 break;
8097 }
8098 sChar=source[sIndex++];
8099 sOrder = elements[sChar];
8100 if(sOrder > UCOL_NOT_FOUND) {
8101 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
8102 }
8103 }
8104
8105 while(tOrder==0) {
8106 if(tIndex==tLen) {
8107 if(endOfSource) {
8108 goto endOfSecLoop;
8109 } else {
8110 return UCOL_GREATER;
8111 }
8112 }
8113 tChar=target[tIndex++];
8114 tOrder = elements[tChar];
8115 if(tOrder > UCOL_NOT_FOUND) {
8116 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
8117 }
8118 }
8119 if(endOfSource) {
8120 return UCOL_LESS;
8121 }
8122
8123 if(sOrder == tOrder) {
8124 sOrder = 0; tOrder = 0;
8125 continue;
8126 } else {
8127 // see primary loop for comments on this
8128 if(((sOrder^tOrder)&0xFF000000)!=0) {
8129 if(sOrder < tOrder) {
8130 return UCOL_LESS;
8131 } else if(sOrder > tOrder) {
8132 return UCOL_GREATER;
8133 }
8134 }
8135 sOrder<<=8;
8136 tOrder<<=8;
8137 }
8138 }
8139 } else { // French
8140 if(haveContractions) { // if we have contractions, we have to bail out
8141 // since we don't really know how to handle them here
8142 goto returnRegular;
8143 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8144 }
8145 // For French, we go backwards
8146 sIndex = sLen; tIndex = tLen;
8147 for(;;) {
8148 while(sOrder==0) {
8149 if(sIndex==0) {
8150 endOfSource = TRUE;
8151 break;
8152 }
8153 sChar=source[--sIndex];
8154 sOrder = elements[sChar];
8155 // don't even look for contractions
8156 }
8157
8158 while(tOrder==0) {
8159 if(tIndex==0) {
8160 if(endOfSource) {
8161 goto endOfSecLoop;
8162 } else {
8163 return UCOL_GREATER;
8164 }
8165 }
8166 tChar=target[--tIndex];
8167 tOrder = elements[tChar];
8168 // don't even look for contractions
8169 }
8170 if(endOfSource) {
8171 return UCOL_LESS;
8172 }
8173
8174 if(sOrder == tOrder) {
8175 sOrder = 0; tOrder = 0;
8176 continue;
8177 } else {
8178 // see the primary loop for comments
8179 if(((sOrder^tOrder)&0xFF000000)!=0) {
8180 if(sOrder < tOrder) {
8181 return UCOL_LESS;
8182 } else if(sOrder > tOrder) {
8183 return UCOL_GREATER;
8184 }
8185 }
8186 sOrder<<=8;
8187 tOrder<<=8;
8188 }
8189 }
8190 }
8191 }
8192
8193 endOfSecLoop:
8194 if(strength >= UCOL_TERTIARY) {
8195 // tertiary loop is the same as secondary (except no French)
8196 elements += coll->latinOneTableLen;
8197 sIndex = 0; tIndex = 0;
8198 endOfSource = FALSE; endOfTarget = FALSE;
8199 for(;;) {
8200 while(sOrder==0) {
8201 if(sIndex==sLen) {
8202 endOfSource = TRUE;
8203 break;
8204 }
8205 sChar=source[sIndex++];
8206 sOrder = elements[sChar];
8207 if(sOrder > UCOL_NOT_FOUND) {
8208 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8209 }
8210 }
8211 while(tOrder==0) {
8212 if(tIndex==tLen) {
8213 if(endOfSource) {
8214 return UCOL_EQUAL; // if both strings are at the end, they are equal
8215 } else {
8216 return UCOL_GREATER;
8217 }
8218 }
8219 tChar=target[tIndex++];
8220 tOrder = elements[tChar];
8221 if(tOrder > UCOL_NOT_FOUND) {
8222 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8223 }
8224 }
8225 if(endOfSource) {
8226 return UCOL_LESS;
8227 }
8228 if(sOrder == tOrder) {
8229 sOrder = 0; tOrder = 0;
8230 continue;
8231 } else {
8232 if(((sOrder^tOrder)&0xff000000)!=0) {
8233 if(sOrder < tOrder) {
8234 return UCOL_LESS;
8235 } else if(sOrder > tOrder) {
8236 return UCOL_GREATER;
8237 }
8238 }
8239 sOrder<<=8;
8240 tOrder<<=8;
8241 }
8242 }
8243 }
8244 return UCOL_EQUAL;
8245
8246 returnRegular:
8247 // Preparing the context objects for iterating over strings
8248 collIterate sColl, tColl;
8249
8250 IInit_collIterate(coll, source, sLen, &sColl);
8251 IInit_collIterate(coll, target, tLen, &tColl);
8252 return ucol_strcollRegular(&sColl, &tColl, status);
8253 }
8254
8255
8256 U_CAPI UCollationResult U_EXPORT2
8257 ucol_strcollIter( const UCollator *coll,
8258 UCharIterator *sIter,
8259 UCharIterator *tIter,
8260 UErrorCode *status) {
8261 if(!status || U_FAILURE(*status) || sIter == tIter) {
8262 return UCOL_EQUAL;
8263 }
8264 if(sIter == NULL || tIter == NULL || coll == NULL) {
8265 *status = U_ILLEGAL_ARGUMENT_ERROR;
8266 return UCOL_EQUAL;
8267 }
8268
8269 UCollationResult result = UCOL_EQUAL;
8270
8271 // Preparing the context objects for iterating over strings
8272 collIterate sColl, tColl;
8273 // The division for the array length may truncate the array size to
8274 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8275 // for all platforms anyway.
8276 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8277 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8278 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8279
8280 IInit_collIterate(coll, NULL, -1, &sColl);
8281 sColl.iterator = sIter;
8282 sColl.flags |= UCOL_USE_ITERATOR;
8283 IInit_collIterate(coll, NULL, -1, &tColl);
8284 tColl.flags |= UCOL_USE_ITERATOR;
8285 tColl.iterator = tIter;
8286
8287 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8288 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8289 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8290 sColl.flags &= ~UCOL_ITER_NORM;
8291
8292 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8293 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8294 tColl.flags &= ~UCOL_ITER_NORM;
8295 }
8296
8297 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8298
8299 while((sChar = sColl.iterator->next(sColl.iterator)) ==
8300 (tChar = tColl.iterator->next(tColl.iterator))) {
8301 if(sChar == U_SENTINEL) {
8302 result = UCOL_EQUAL;
8303 goto end_compare;
8304 }
8305 }
8306
8307 if(sChar == U_SENTINEL) {
8308 tChar = tColl.iterator->previous(tColl.iterator);
8309 }
8310
8311 if(tChar == U_SENTINEL) {
8312 sChar = sColl.iterator->previous(sColl.iterator);
8313 }
8314
8315 sChar = sColl.iterator->previous(sColl.iterator);
8316 tChar = tColl.iterator->previous(tColl.iterator);
8317
8318 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8319 {
8320 // We are stopped in the middle of a contraction.
8321 // Scan backwards through the == part of the string looking for the start of the contraction.
8322 // It doesn't matter which string we scan, since they are the same in this region.
8323 do
8324 {
8325 sChar = sColl.iterator->previous(sColl.iterator);
8326 tChar = tColl.iterator->previous(tColl.iterator);
8327 }
8328 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8329 }
8330
8331
8332 if(U_SUCCESS(*status)) {
8333 result = ucol_strcollRegular(&sColl, &tColl, status);
8334 }
8335
8336 end_compare:
8337 if(sNormIter || tNormIter) {
8338 unorm_closeIter(sNormIter);
8339 unorm_closeIter(tNormIter);
8340 }
8341
8342 return result;
8343 }
8344
8345
8346
8347 /* */
8348 /* ucol_strcoll Main public API string comparison function */
8349 /* */
8350 U_CAPI UCollationResult U_EXPORT2
8351 ucol_strcoll( const UCollator *coll,
8352 const UChar *source,
8353 int32_t sourceLength,
8354 const UChar *target,
8355 int32_t targetLength) {
8356 U_ALIGN_CODE(16);
8357 UErrorCode status = U_ZERO_ERROR;
8358 if(source == NULL || target == NULL) {
8359 // do not crash, but return. Should have
8360 // status argument to return error.
8361 return UCOL_EQUAL;
8362 }
8363 collIterate sColl, tColl;
8364
8365 /* Scan the strings. Find: */
8366 /* The length of any leading portion that is equal */
8367 /* Whether they are exactly equal. (in which case we just return) */
8368 const UChar *pSrc = source;
8369 const UChar *pTarg = target;
8370 int32_t equalLength;
8371
8372 if (sourceLength == -1 && targetLength == -1) {
8373 // Both strings are null terminated.
8374 // Check for them being the same string, and scan through
8375 // any leading equal portion.
8376 if (source==target) {
8377 return UCOL_EQUAL;
8378 }
8379
8380 for (;;) {
8381 if ( *pSrc != *pTarg || *pSrc == 0) {
8382 break;
8383 }
8384 pSrc++;
8385 pTarg++;
8386 }
8387 if (*pSrc == 0 && *pTarg == 0) {
8388 return UCOL_EQUAL;
8389 }
8390 equalLength = pSrc - source;
8391 }
8392 else
8393 {
8394 // One or both strings has an explicit length.
8395 /* check if source and target are same strings */
8396
8397 if (source==target && sourceLength==targetLength) {
8398 return UCOL_EQUAL;
8399 }
8400 const UChar *pSrcEnd = source + sourceLength;
8401 const UChar *pTargEnd = target + targetLength;
8402
8403
8404 // Scan while the strings are bitwise ==, or until one is exhausted.
8405 for (;;) {
8406 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8407 break;
8408 }
8409 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8410 break;
8411 }
8412 if (*pSrc != *pTarg) {
8413 break;
8414 }
8415 pSrc++;
8416 pTarg++;
8417 }
8418 equalLength = pSrc - source;
8419
8420 // If we made it all the way through both strings, we are done. They are ==
8421 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */
8422 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) { /* and also at end of dest string */
8423 return UCOL_EQUAL;
8424 }
8425 }
8426 if (equalLength > 0) {
8427 /* There is an identical portion at the beginning of the two strings. */
8428 /* If the identical portion ends within a contraction or a comibining */
8429 /* character sequence, back up to the start of that sequence. */
8430 pSrc = source + equalLength; /* point to the first differing chars */
8431 pTarg = target + equalLength;
8432 if (pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
8433 pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
8434 {
8435 // We are stopped in the middle of a contraction.
8436 // Scan backwards through the == part of the string looking for the start of the contraction.
8437 // It doesn't matter which string we scan, since they are the same in this region.
8438 do
8439 {
8440 equalLength--;
8441 pSrc--;
8442 }
8443 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8444 }
8445
8446 source += equalLength;
8447 target += equalLength;
8448 if (sourceLength > 0) {
8449 sourceLength -= equalLength;
8450 }
8451 if (targetLength > 0) {
8452 targetLength -= equalLength;
8453 }
8454 }
8455
8456 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8457 // Preparing the context objects for iterating over strings
8458 IInit_collIterate(coll, source, sourceLength, &sColl);
8459 IInit_collIterate(coll, target, targetLength, &tColl);
8460 return ucol_strcollRegular(&sColl, &tColl, &status);
8461 } else {
8462 return ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8463 }
8464 }
8465
8466 /* convenience function for comparing strings */
8467 U_CAPI UBool U_EXPORT2
8468 ucol_greater( const UCollator *coll,
8469 const UChar *source,
8470 int32_t sourceLength,
8471 const UChar *target,
8472 int32_t targetLength)
8473 {
8474 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8475 == UCOL_GREATER);
8476 }
8477
8478 /* convenience function for comparing strings */
8479 U_CAPI UBool U_EXPORT2
8480 ucol_greaterOrEqual( const UCollator *coll,
8481 const UChar *source,
8482 int32_t sourceLength,
8483 const UChar *target,
8484 int32_t targetLength)
8485 {
8486 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8487 != UCOL_LESS);
8488 }
8489
8490 /* convenience function for comparing strings */
8491 U_CAPI UBool U_EXPORT2
8492 ucol_equal( const UCollator *coll,
8493 const UChar *source,
8494 int32_t sourceLength,
8495 const UChar *target,
8496 int32_t targetLength)
8497 {
8498 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8499 == UCOL_EQUAL);
8500 }
8501
8502 /* returns the locale name the collation data comes from */
8503 U_CAPI const char * U_EXPORT2
8504 ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
8505 const char *result = NULL;
8506 if(status == NULL || U_FAILURE(*status)) {
8507 return NULL;
8508 }
8509 switch(type) {
8510 case ULOC_ACTUAL_LOCALE:
8511 // validLocale is set only if service registration has explicitly set the
8512 // requested and valid locales. if this is the case, the actual locale
8513 // is considered to be the valid locale.
8514 if (coll->validLocale != NULL) {
8515 result = coll->validLocale;
8516 } else if(coll->elements != NULL) {
8517 result = ures_getLocale(coll->elements, status);
8518 }
8519 break;
8520 case ULOC_VALID_LOCALE:
8521 if (coll->validLocale != NULL) {
8522 result = coll->validLocale;
8523 } else if(coll->rb != NULL) {
8524 result = ures_getLocale(coll->rb, status);
8525 }
8526 break;
8527 case ULOC_REQUESTED_LOCALE:
8528 result = coll->requestedLocale;
8529 break;
8530 default:
8531 *status = U_ILLEGAL_ARGUMENT_ERROR;
8532 }
8533 return result;
8534 }
8535
8536 U_CAPI USet * U_EXPORT2
8537 ucol_getTailoredSet(const UCollator *coll, UErrorCode *status)
8538 {
8539 if(status == NULL || U_FAILURE(*status)) {
8540 return NULL;
8541 }
8542 if(coll == NULL) {
8543 *status = U_ILLEGAL_ARGUMENT_ERROR;
8544 }
8545 UParseError parseError;
8546 UColTokenParser src;
8547 int32_t rulesLen = 0;
8548 const UChar *rules = ucol_getRules(coll, &rulesLen);
8549 const UChar *current = NULL;
8550 UBool startOfRules = TRUE;
8551 // we internally use the C++ class, for the following reasons:
8552 // 1. we need to utilize canonical iterator, which is a C++ only class
8553 // 2. canonical iterator returns UnicodeStrings - USet cannot take them
8554 // 3. USet is internally really UnicodeSet, C is just a wrapper
8555 UnicodeSet *tailored = new UnicodeSet();
8556 UnicodeString pattern;
8557 CanonicalIterator it("", *status);
8558
8559
8560 // The idea is to tokenize the rule set. For each non-reset token,
8561 // we add all the canonicaly equivalent FCD sequences
8562 ucol_tok_initTokenList(&src, rules, rulesLen, UCA, status);
8563 while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError, status)) != NULL) {
8564 startOfRules = FALSE;
8565 if(src.parsedToken.strength != UCOL_TOK_RESET) {
8566 const UChar *stuff = src.source+(src.parsedToken.charsOffset);
8567 it.setSource(UnicodeString(stuff, src.parsedToken.charsLen), *status);
8568 pattern = it.next();
8569 while(!pattern.isBogus()) {
8570 if(Normalizer::quickCheck(pattern, UNORM_FCD, *status) != UNORM_NO) {
8571 tailored->add(pattern);
8572 }
8573 pattern = it.next();
8574 }
8575 }
8576 }
8577 ucol_tok_closeTokenList(&src);
8578 return (USet *)tailored;
8579 }
8580
8581 U_CAPI UBool U_EXPORT2
8582 ucol_equals(const UCollator *source, const UCollator *target) {
8583 UErrorCode status = U_ZERO_ERROR;
8584 // if pointers are equal, collators are equal
8585 if(source == target) {
8586 return TRUE;
8587 }
8588 int32_t i = 0, j = 0;
8589 // if any of attributes are different, collators are not equal
8590 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
8591 if(ucol_getAttribute(source, (UColAttribute)i, &status) != ucol_getAttribute(target, (UColAttribute)i, &status) || U_FAILURE(status)) {
8592 return FALSE;
8593 }
8594 }
8595
8596 int32_t sourceRulesLen = 0, targetRulesLen = 0;
8597 const UChar *sourceRules = ucol_getRules(source, &sourceRulesLen);
8598 const UChar *targetRules = ucol_getRules(target, &targetRulesLen);
8599
8600 if(sourceRulesLen == targetRulesLen && u_strncmp(sourceRules, targetRules, sourceRulesLen) == 0) {
8601 // all the attributes are equal and the rules are equal - collators are equal
8602 return(TRUE);
8603 }
8604 // hard part, need to construct tree from rules and see if they yield the same tailoring
8605 UBool result = TRUE;
8606 UParseError parseError;
8607 UColTokenParser sourceParser, targetParser;
8608 int32_t sourceListLen = 0, targetListLen = 0;
8609 ucol_tok_initTokenList(&sourceParser, sourceRules, sourceRulesLen, UCA, &status);
8610 ucol_tok_initTokenList(&targetParser, targetRules, targetRulesLen, UCA, &status);
8611 sourceListLen = ucol_tok_assembleTokenList(&sourceParser, &parseError, &status);
8612 targetListLen = ucol_tok_assembleTokenList(&targetParser, &parseError, &status);
8613
8614 if(sourceListLen != targetListLen) {
8615 // different number of resets
8616 result = FALSE;
8617 } else {
8618 UColToken *sourceReset = NULL, *targetReset = NULL;
8619 UChar *sourceResetString = NULL, *targetResetString = NULL;
8620 int32_t sourceStringLen = 0, targetStringLen = 0;
8621 for(i = 0; i < sourceListLen; i++) {
8622 sourceReset = sourceParser.lh[i].reset;
8623 sourceResetString = sourceParser.source+(sourceReset->source & 0xFFFFFF);
8624 sourceStringLen = sourceReset->source >> 24;
8625 for(j = 0; j < sourceListLen; j++) {
8626 targetReset = targetParser.lh[j].reset;
8627 targetResetString = targetParser.source+(targetReset->source & 0xFFFFFF);
8628 targetStringLen = targetReset->source >> 24;
8629 if(sourceStringLen == targetStringLen && (u_strncmp(sourceResetString, targetResetString, sourceStringLen) == 0)) {
8630 sourceReset = sourceParser.lh[i].first;
8631 targetReset = targetParser.lh[j].first;
8632 while(sourceReset != NULL && targetReset != NULL) {
8633 sourceResetString = sourceParser.source+(sourceReset->source & 0xFFFFFF);
8634 sourceStringLen = sourceReset->source >> 24;
8635 targetResetString = targetParser.source+(targetReset->source & 0xFFFFFF);
8636 targetStringLen = targetReset->source >> 24;
8637 if(sourceStringLen != targetStringLen || (u_strncmp(sourceResetString, targetResetString, sourceStringLen) != 0)) {
8638 result = FALSE;
8639 goto returnResult;
8640 }
8641 // probably also need to check the expansions
8642 if(sourceReset->expansion) {
8643 if(!targetReset->expansion) {
8644 result = FALSE;
8645 goto returnResult;
8646 } else {
8647 // compare expansions
8648 sourceResetString = sourceParser.source+(sourceReset->expansion& 0xFFFFFF);
8649 sourceStringLen = sourceReset->expansion >> 24;
8650 targetResetString = targetParser.source+(targetReset->expansion & 0xFFFFFF);
8651 targetStringLen = targetReset->expansion >> 24;
8652 if(sourceStringLen != targetStringLen || (u_strncmp(sourceResetString, targetResetString, sourceStringLen) != 0)) {
8653 result = FALSE;
8654 goto returnResult;
8655 }
8656 }
8657 } else {
8658 if(targetReset->expansion) {
8659 result = FALSE;
8660 goto returnResult;
8661 }
8662 }
8663 sourceReset = sourceReset->next;
8664 targetReset = targetReset->next;
8665 }
8666 if(sourceReset != targetReset) { // at least one is not NULL
8667 // there are more tailored elements in one list
8668 result = FALSE;
8669 goto returnResult;
8670 }
8671
8672
8673 break;
8674 }
8675 }
8676 // couldn't find the reset anchor, so the collators are not equal
8677 if(j == sourceListLen) {
8678 result = FALSE;
8679 goto returnResult;
8680 }
8681 }
8682 }
8683
8684 returnResult:
8685 ucol_tok_closeTokenList(&sourceParser);
8686 ucol_tok_closeTokenList(&targetParser);
8687 return result;
8688
8689 }
8690 #endif /* #if !UCONFIG_NO_COLLATION */