2 *******************************************************************************
3 * Copyright (C) 1996-2003, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * tab size: 8 (not used)
11 * Modification history
13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
19 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_COLLATION
24 #include "unicode/uloc.h"
25 #include "unicode/coll.h"
26 #include "unicode/tblcoll.h"
27 #include "unicode/coleitr.h"
28 #include "unicode/unorm.h"
29 #include "unicode/udata.h"
30 #include "unicode/uchar.h"
31 #include "unicode/caniter.h"
/* Shift and mask constants. None of them is referenced in the part of this
 * file that is visible here. */
53 /* added by synwee for trie manipulation*/
54 #define STAGE_1_SHIFT_ 10
55 #define STAGE_2_SHIFT_ 4
56 #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
57 #define STAGE_3_MASK_ 0xF
58 #define LAST_BYTE_MASK_ 0xFF
59 #define SECOND_LAST_BYTE_SHIFT_ 8
61 #define ZERO_CC_LIMIT_ 0xC0
/* File-scope state for the shared UCA collator; all three start out NULL.
 * ucol_initUCA() tests UCA for NULL, stores the opened data memory in
 * UCA_DATA_MEM and points UCAconsts into UCA->image. */
63 static UCollator
* UCA
= NULL
;
64 static UCAConstants
*UCAconsts
= NULL
;
65 static UDataMemory
* UCA_DATA_MEM
= NULL
;
/*
 * Acceptance callback that ucol_initUCA() passes to udata_openChoice().
 * The visible checks require a header of at least 20 bytes, the platform's
 * endianness and charset family, the same four dataFormat bytes as
 * ucaDataInfo, an equal formatVersion[0] and a formatVersion[1] that is not
 * lower than ucaDataInfo's. The first two dataVersion bytes are then compared
 * with the version reported by u_getUnicodeVersion().
 * NOTE(review): the return statements are not visible in this excerpt.
 */
69 static UBool U_CALLCONV
70 isAcceptableUCA(void * /*context*/,
71 const char * /*type*/, const char * /*name*/,
72 const UDataInfo
*pInfo
){
73 /* context, type & name are intentionally not used */
74 if( pInfo
->size
>=20 &&
75 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
76 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
77 pInfo
->dataFormat
[0]==ucaDataInfo
.dataFormat
[0] && /* dataFormat="UCol" */
78 pInfo
->dataFormat
[1]==ucaDataInfo
.dataFormat
[1] &&
79 pInfo
->dataFormat
[2]==ucaDataInfo
.dataFormat
[2] &&
80 pInfo
->dataFormat
[3]==ucaDataInfo
.dataFormat
[3] &&
81 pInfo
->formatVersion
[0]==ucaDataInfo
.formatVersion
[0] &&
82 pInfo
->formatVersion
[1]>=ucaDataInfo
.formatVersion
[1]// &&
83 //pInfo->formatVersion[1]==ucaDataInfo.formatVersion[1] &&
84 //pInfo->formatVersion[2]==ucaDataInfo.formatVersion[2] && // Too harsh
85 //pInfo->formatVersion[3]==ucaDataInfo.formatVersion[3] && // Too harsh
87 UVersionInfo UCDVersion
;
88 u_getUnicodeVersion(UCDVersion
);
89 if(pInfo
->dataVersion
[0]==UCDVersion
[0] &&
90 pInfo
->dataVersion
[1]==UCDVersion
[1]) { // &&
91 //pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2] &&
92 //pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]) {
/* Returns the low 24 bits of a trie data word as the folding offset.
 * ucol_initUCA() installs this function as UCA->mapping->getFoldingOffset. */
103 static int32_t U_CALLCONV
104 _getFoldingOffset(uint32_t data
) {
105 return (int32_t)(data
&0xFFFFFF);
/*
 * Prepares collIterate *s for iterating over sourceString with collator.
 * Visible effects: string and pos point at the start of sourceString; when
 * sourceLen >= 0 the UCOL_ITER_HASLEN flag is set and endp is placed at
 * sourceString + sourceLen; the CE pointers are reset to the CEs array; the
 * writable buffer is reset to the stack buffer with UCOL_WRITABLE_BUFFER_SIZE;
 * fcdPosition becomes 0 and iterator becomes NULL. UCOL_ITER_NORM is set when
 * the collator's normalization mode is on, and UCOL_HIRAGANA_Q when hiraganaQ
 * is on and the strength is at least quaternary.
 * NOTE(review): the initial value of s->flags and the handling of a negative
 * sourceLen are on lines that are not visible here.
 */
111 inline void IInit_collIterate(const UCollator
*collator
, const UChar
*sourceString
,
112 int32_t sourceLen
, collIterate
*s
) {
113 (s
)->string
= (s
)->pos
= (UChar
*)(sourceString
);
116 if (sourceLen
>= 0) {
117 s
->flags
|= UCOL_ITER_HASLEN
;
118 (s
)->endp
= (UChar
*)sourceString
+sourceLen
;
121 /* change to enable easier checking for end of string for fcdPosition */
124 (s
)->CEpos
= (s
)->toReturn
= (s
)->CEs
;
125 (s
)->writableBuffer
= (s
)->stackWritableBuffer
;
126 (s
)->writableBufSize
= UCOL_WRITABLE_BUFFER_SIZE
;
127 (s
)->coll
= (collator
);
128 (s
)->fcdPosition
= 0;
129 if(collator
->normalizationMode
== UCOL_ON
) {
130 (s
)->flags
|= UCOL_ITER_NORM
;
132 if(collator
->hiraganaQ
== UCOL_ON
&& collator
->strength
>= UCOL_QUATERNARY
) {
133 (s
)->flags
|= UCOL_HIRAGANA_Q
;
135 (s
)->iterator
= NULL
;
136 //(s)->iteratorIndex = 0;
/* Exported wrapper that forwards all four arguments unchanged to
 * IInit_collIterate(), so that code outside this file can initialise a
 * collIterate. */
139 U_CAPI
void U_EXPORT2
140 uprv_init_collIterate(const UCollator
*collator
, const UChar
*sourceString
,
141 int32_t sourceLen
, collIterate
*s
){
142 /* Out-of-line version for use from other files. */
143 IInit_collIterate(collator
, sourceString
, sourceLen
, s
);
/*
 * Copies fcdPosition, flags, origFlags, pos and the writable buffer address
 * and size from data into backup. When data has a character iterator, the
 * iterator's state is stored as well; if the iterator cannot report a state
 * (UITER_NO_STATE) it is stepped backwards until it can, the number of steps
 * is kept in iteratorMove, and the iterator is then moved forward again by
 * that many steps.
 */
148 * Backup the state of the collIterate struct data
149 * @param data collIterate to backup
150 * @param backup storage
153 inline void backupState(const collIterate
*data
, collIterateState
*backup
)
155 backup
->fcdPosition
= data
->fcdPosition
;
156 backup
->flags
= data
->flags
;
157 backup
->origFlags
= data
->origFlags
;
158 backup
->pos
= data
->pos
;
159 backup
->bufferaddress
= data
->writableBuffer
;
160 backup
->buffersize
= data
->writableBufSize
;
161 if(data
->iterator
!= NULL
) {
162 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
163 backup
->iteratorIndex
= data
->iterator
->getState(data
->iterator
);
164 // now we try to fix up if we're using a normalizing iterator and we get UITER_NO_STATE
165 backup
->iteratorMove
= 0;
166 if(backup
->iteratorIndex
== UITER_NO_STATE
) {
// step back one position at a time until getState() succeeds
167 while((backup
->iteratorIndex
= data
->iterator
->getState(data
->iterator
)) == UITER_NO_STATE
) {
168 backup
->iteratorMove
++;
169 data
->iterator
->move(data
->iterator
, -1, UITER_CURRENT
);
// move forward again by the number of steps recorded in iteratorMove
171 data
->iterator
->move(data
->iterator
, backup
->iteratorMove
, UITER_CURRENT
);
/*
 * Restores a collIterate from a collIterateState written by backupState().
 * Visible steps: flags and origFlags are copied back; if an iterator is
 * present its state is set from iteratorIndex and, when iteratorMove is not
 * zero, the iterator is moved by that amount; pos is copied back. If the
 * iterator is inside the normalization buffer and that buffer now has a
 * different address than at backup time, pos is rebased onto the new buffer:
 * one computation keeps the offset from the buffer start, the other (marked
 * "backwards direction") keeps the distance from the buffer end. fcdPosition
 * is restored only when the iterator is not in the normalization buffer.
 * NOTE(review): the local status passed to setState() is not examined in the
 * visible lines, and the parameter selecting the direction is declared on a
 * line that is not visible here.
 */
177 * Loads the state into the collIterate struct data
178 * @param data collIterate to backup
179 * @param backup storage
180 * @param forwards boolean to indicate if forwards iteration is used,
181 * false indicates backwards iteration
184 inline void loadState(collIterate
*data
, const collIterateState
*backup
,
187 UErrorCode status
= U_ZERO_ERROR
;
188 data
->flags
= backup
->flags
;
189 data
->origFlags
= backup
->origFlags
;
190 if(data
->iterator
!= NULL
) {
191 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
192 data
->iterator
->setState(data
->iterator
, backup
->iteratorIndex
, &status
);
193 if(backup
->iteratorMove
!= 0) {
194 data
->iterator
->move(data
->iterator
, backup
->iteratorMove
, UITER_CURRENT
);
197 data
->pos
= backup
->pos
;
198 if ((data
->flags
& UCOL_ITER_INNORMBUF
) &&
199 data
->writableBuffer
!= backup
->bufferaddress
) {
201 this is when a new buffer has been reallocated and we'll have to
202 calculate the new position.
203 note the new buffer has to contain the contents of the old buffer.
206 data
->pos
= data
->writableBuffer
+
207 (data
->pos
- backup
->bufferaddress
);
210 /* backwards direction */
211 uint32_t temp
= backup
->buffersize
-
212 (data
->pos
- backup
->bufferaddress
);
213 data
->pos
= data
->writableBuffer
+ (data
->writableBufSize
- temp
);
216 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
218 this is alittle tricky.
219 if we are initially not in the normalization buffer, even if we
220 normalize in the later stage, the data in the buffer will be
221 ignored, since we skip back up to the data string.
222 however if we are already in the normalization buffer, any
223 further normalization will pull data into the normalization
224 buffer and modify the fcdPosition.
225 since we are keeping the data in the buffer for use, the
226 fcdPosition can not be reverted back.
229 data
->fcdPosition
= backup
->fcdPosition
;
/*
 * End-of-input test for a collIterate. Visible cases, in order:
 *  - iterator-backed input: the result is !hasNext();
 *  - null-terminated input whose current code unit is not 0;
 *  - input with an explicit length: the result is (pos == endp);
 *  - at a 0 code unit outside the normalization buffer;
 *  - at the 0 that ends the normalization buffer: the main input is checked
 *    through the iterator, through *fcdPosition, or by comparing fcdPosition
 *    with endp, depending on origFlags.
 * NOTE(review): the values returned by the second and fourth case are on
 * lines that are not visible here.
 */
236 * Checks for a collIterate being positioned at the end of
241 inline UBool
collIter_eos(collIterate
*s
) {
242 if(s
->flags
& UCOL_USE_ITERATOR
) {
243 return !(s
->iterator
->hasNext(s
->iterator
));
245 if ((s
->flags
& UCOL_ITER_HASLEN
) == 0 && *s
->pos
!= 0) {
246 // Null terminated string, but not at null, so not at end.
247 // Whether in main or normalization buffer doesn't matter.
251 // String with length. Can't be in normalization buffer, which is always
253 if (s
->flags
& UCOL_ITER_HASLEN
) {
254 return (s
->pos
== s
->endp
);
257 // We are at a null termination, could be either normalization buffer or main string.
258 if ((s
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
259 // At null at end of main string.
263 // At null at end of normalization buffer. Need to check whether there are
264 // any characters left in the main buffer.
265 if(s
->origFlags
& UCOL_USE_ITERATOR
) {
266 return !(s
->iterator
->hasNext(s
->iterator
));
267 } else if ((s
->origFlags
& UCOL_ITER_HASLEN
) == 0) {
268 // Null terminated main string. fcdPosition is the 'return' position into main buf.
269 return (*s
->fcdPosition
== 0);
272 // Main string with an end pointer.
273 return s
->fcdPosition
== s
->endp
;
/*
 * Start-of-input test used for backward iteration. If either flags or
 * origFlags says the input comes from an iterator, the result is
 * !hasPrevious(). Otherwise the visible condition holds when pos is at or
 * before the start of the string, or when the iterator is in the
 * normalization buffer, the code unit before pos is 0 and fcdPosition is NULL.
 * NOTE(review): the values returned for that condition are not visible here.
 */
279 * Checks for a collIterate being positioned at the start of
284 inline UBool
collIter_bos(collIterate
*source
) {
285 // if we're going backwards, we need to know whether there is more in the
286 // iterator, even if we are in the side buffer
287 if(source
->flags
& UCOL_USE_ITERATOR
|| source
->origFlags
& UCOL_USE_ITERATOR
) {
288 return !source
->iterator
->hasPrevious(source
->iterator
);
290 if (source
->pos
<= source
->string
||
291 ((source
->flags
& UCOL_ITER_INNORMBUF
) &&
292 *(source
->pos
- 1) == 0 && source
->fcdPosition
== NULL
)) {
/*
 * Simpler start-of-input test. Iterator-backed input is answered with
 * !hasPrevious(); otherwise only pos == string is compared, without the
 * normalization-buffer case that collIter_bos() also considers.
 * NOTE(review): the values returned for the pointer comparison are not
 * visible here.
 */
299 inline UBool
collIter_SimpleBos(collIterate
*source
) {
300 // if we're going backwards, we need to know whether there is more in the
301 // iterator, even if we are in the side buffer
302 if(source
->flags
& UCOL_USE_ITERATOR
|| source
->origFlags
& UCOL_USE_ITERATOR
) {
303 return !source
->iterator
->hasPrevious(source
->iterator
);
305 if (source
->pos
== source
->string
) {
310 //return (data->pos == data->string) ||
/* Frees data->writableBuffer with uprv_free() only when it is not the
 * embedded stackWritableBuffer. The pointer itself is left unchanged, so the
 * caller has to reassign it before the buffer is used again. */
314 * Checks and free writable buffer if it is not the original stack buffer
315 * in collIterate. This function does not reassign the writable buffer.
316 * @param data collIterate struct to determine and free the writable buffer
319 inline void freeHeapWritableBuffer(collIterate
*data
)
321 if (data
->writableBuffer
!= data
->stackWritableBuffer
) {
322 uprv_free(data
->writableBuffer
);
327 /****************************************************************************/
328 /* Following are the open/close functions */
330 /****************************************************************************/
/* Reads the "Sequence" string from the collElem resource and builds a
 * collator from it with ucol_openRules(), using UCOL_DEFAULT for both the
 * normalization mode and the strength and no parse-error output. Errors from
 * the resource lookup and from the build are reported through status. */
332 tryOpeningFromRules(UResourceBundle
*collElem
, UErrorCode
*status
) {
333 int32_t rulesLen
= 0;
334 const UChar
*rules
= ures_getStringByKey(collElem
, "Sequence", &rulesLen
, status
);
335 return ucol_openRules(rules
, rulesLen
, UCOL_DEFAULT
, UCOL_DEFAULT
, NULL
, status
);
/*
 * Public entry point for opening a collator for locale loc.
 * Two code paths are visible: one creates the collator through
 * Collator::createUCollator() when status is non-NULL and not a failure, the
 * other delegates to ucol_open_internal().
 * NOTE(review): the lines that choose between the two paths (presumably a
 * preprocessor condition) and the remaining return statement are not visible.
 */
341 ucol_open(const char *loc
,
344 UCollator
*result
= NULL
;
345 if (status
&& U_SUCCESS(*status
)) {
346 result
= Collator::createUCollator(loc
, status
);
351 return ucol_open_internal(loc
, status
);
/*
 * Builds a UCollator for locale loc from resource bundle data.
 * Visible flow:
 *  1. ucol_initUCA() is called and a failure returns 0.
 *  2. The bundle for loc and its "CollationElements" item are opened.
 *  3. U_MISSING_RESOURCE_ERROR is turned into U_USING_DEFAULT_WARNING and the
 *     collator is built on UCA->image, with rb and elements pointing at the
 *     root bundle and hasRealData set to FALSE.
 *  4. Otherwise the "%%CollationBin" item is looked up. If it is missing the
 *     collator is built from rules (tryOpeningFromRules). If it is present
 *     its UCAVersion, UCDVersion and builder version are compared with the
 *     UCA's; a mismatch sets U_DIFFERENT_UCA_VERSION and rebuilds from rules.
 *     A binary larger than the padded header plus option set initialises the
 *     collator directly (hasRealData TRUE); the other visible call builds on
 *     UCA->image and takes only the options from the binary (hasRealData
 *     FALSE). freeImageOnClose is set to FALSE.
 *  5. elements is stored, validLocale is cleared and requestedLocale receives
 *     a heap copy of the locale reported by ures_getLocale(result->rb).
 * NOTE(review): at original lines 373-376 and 405-409 result is dereferenced
 * right after ucol_initCollator() with no visible NULL check; confirm against
 * the lines missing from this excerpt. The cleanup code is also not visible.
 */
357 ucol_open_internal(const char *loc
,
360 ucol_initUCA(status
);
363 if(U_FAILURE(*status
)) return 0;
365 UCollator
*result
= NULL
;
366 UResourceBundle
*b
= ures_open(NULL
, loc
, status
);
367 UResourceBundle
*collElem
= ures_getByKey(b
, "CollationElements", NULL
, status
);
368 UResourceBundle
*binary
= NULL
;
369 UErrorCode binaryStatus
= U_ZERO_ERROR
;
371 if(*status
== U_MISSING_RESOURCE_ERROR
) { /* We didn't find the tailoring data, we fallback to the UCA */
372 *status
= U_USING_DEFAULT_WARNING
;
373 result
= ucol_initCollator(UCA
->image
, result
, status
);
374 // if we use UCA, real locale is root
375 result
->rb
= ures_open(NULL
, "", status
);
376 result
->elements
= ures_open(NULL
, "", status
);
377 if(U_FAILURE(*status
)) {
381 result
->hasRealData
= FALSE
;
382 } else if(U_SUCCESS(*status
)) {
383 binary
= ures_getByKey(collElem
, "%%CollationBin", NULL
, &binaryStatus
);
385 if(binaryStatus
== U_MISSING_RESOURCE_ERROR
) { /* we didn't find the binary image, we should use the rules */
387 result
= tryOpeningFromRules(collElem
, status
);
388 if(U_FAILURE(*status
)) {
391 } else if(U_SUCCESS(*status
)) { /* otherwise, we'll pick a collation data that exists */
393 const uint8_t *inData
= ures_getBinary(binary
, &len
, status
);
394 UCATableHeader
*colData
= (UCATableHeader
*)inData
;
395 if(uprv_memcmp(colData
->UCAVersion
, UCA
->image
->UCAVersion
, sizeof(UVersionInfo
)) != 0 ||
396 uprv_memcmp(colData
->UCDVersion
, UCA
->image
->UCDVersion
, sizeof(UVersionInfo
)) != 0 ||
397 colData
->version
[0] != UCOL_BUILDER_VERSION
) {
398 *status
= U_DIFFERENT_UCA_VERSION
;
399 result
= tryOpeningFromRules(collElem
, status
);
401 if(U_FAILURE(*status
)){
404 if((uint32_t)len
> (paddedsize(sizeof(UCATableHeader
)) + paddedsize(sizeof(UColOptionSet
)))) {
405 result
= ucol_initCollator((const UCATableHeader
*)inData
, result
, status
);
406 if(U_FAILURE(*status
)){
409 result
->hasRealData
= TRUE
;
411 result
= ucol_initCollator(UCA
->image
, result
, status
);
412 ucol_setOptionsFromHeader(result
, (UColOptionSet
*)(inData
+((const UCATableHeader
*)inData
)->options
), status
);
413 if(U_FAILURE(*status
)){
416 result
->hasRealData
= FALSE
;
418 result
->freeImageOnClose
= FALSE
;
422 result
->elements
= collElem
;
423 } else { /* There is another error, and we're just gonna clean up */
426 ures_close(collElem
);
431 result
->validLocale
= NULL
; // default is to use rb info
434 loc
= ures_getLocale(result
->rb
, status
);
436 result
->requestedLocale
= (char *)uprv_malloc((uprv_strlen(loc
)+1)*sizeof(char));
438 if (result
->requestedLocale
== NULL
) {
439 *status
= U_MEMORY_ALLOCATION_ERROR
;
440 ures_close(b
); // ??? appears needed
441 ures_close(collElem
);
442 ures_close(binary
); // ??? appears needed
445 uprv_strcpy(result
->requestedLocale
, loc
);
/*
 * Replaces the collator's validLocale and requestedLocale strings.
 * Any existing string is released with uprv_free() before the new pointer is
 * stored, so the collator takes ownership of both arguments (they are later
 * freed by ucol_close()). Callers must therefore pass memory that uprv_free()
 * can release, or NULL.
 */
451 U_CAPI
void U_EXPORT2
452 ucol_setReqValidLocales(UCollator
*coll
, char *requestedLocaleToAdopt
, char *validLocaleToAdopt
)
455 if (coll
->validLocale
) {
456 uprv_free(coll
->validLocale
);
458 coll
->validLocale
= validLocaleToAdopt
;
459 if (coll
->requestedLocale
) { // should always have
460 uprv_free(coll
->requestedLocale
);
462 coll
->requestedLocale
= requestedLocaleToAdopt
;
/*
 * Releases the resources held by a collator.
 * validLocale and requestedLocale are always freed. If freeOnClose is FALSE
 * (the safeClone case) the function returns at that point. Otherwise it
 * frees: options when freeOptionsOnClose is set, mapping, rules when
 * freeRulesOnClose is set, the rb bundle, the image when freeImageOnClose is
 * TRUE, the elements bundle and latinOneCEs.
 * NOTE(review): a NULL check on coll and the release of the UCollator struct
 * itself are not visible in this excerpt.
 */
466 U_CAPI
void U_EXPORT2
467 ucol_close(UCollator
*coll
)
470 // these are always owned by each UCollator struct,
471 // so we always free them
472 if(coll
->validLocale
!= NULL
) {
473 uprv_free(coll
->validLocale
);
475 if(coll
->requestedLocale
!= NULL
) {
476 uprv_free(coll
->requestedLocale
);
479 /* Here, it would be advisable to close: */
480 /* - UData for UCA (unless we stuff it in the root resb */
481 /* Again, do we need additional housekeeping... HMMM! */
482 if(coll
->freeOnClose
== FALSE
){
483 return; /* for safeClone, if freeOnClose is FALSE,
484 don't free the other instance data */
486 if(coll
->freeOptionsOnClose
!= FALSE
) {
487 if(coll
->options
!= NULL
) {
488 uprv_free(coll
->options
);
491 if(coll
->mapping
!= NULL
) {
492 /*ucmpe32_close(coll->mapping);*/
493 uprv_free(coll
->mapping
);
495 if(coll
->rules
!= NULL
&& coll
->freeRulesOnClose
) {
496 uprv_free((UChar
*)coll
->rules
);
498 if(coll
->rb
!= NULL
) { /* pointing to read-only memory */
499 ures_close(coll
->rb
);
501 if(coll
->freeImageOnClose
== TRUE
) {
502 uprv_free((UCATableHeader
*)coll
->image
);
504 if(coll
->elements
!= NULL
) {
505 ures_close(coll
->elements
);
507 if(coll
->latinOneCEs
!= NULL
) {
508 uprv_free(coll
->latinOneCEs
);
/*
 * Builds a collator from a rule string.
 * Visible flow:
 *  - status must be non-NULL and not a failure; rulesLength below -1, or a
 *    NULL rules pointer with a non-zero length, sets
 *    U_ILLEGAL_ARGUMENT_ERROR; a length of -1 is replaced by u_strlen(rules).
 *  - normalizationMode is validated by a switch and stored in norm; an
 *    unexpected value sets U_ILLEGAL_ARGUMENT_ERROR.
 *  - ucol_initUCA() is called, then the rules are tokenised with
 *    ucol_tok_initTokenList() / ucol_tok_assembleTokenList().
 *  - If the rules produced tokens (or a removal set) a tailoring table is
 *    assembled, stamped with the builder, UCD and UCA versions, and used to
 *    initialise the collator (hasRealData and freeImageOnClose TRUE).
 *    Otherwise the collator is built on UCA->image and receives a heap copy
 *    of the parsed option set (freeOptionsOnClose TRUE).
 *  - On success a NUL-terminated copy of the rules is attached, the locale
 *    fields are cleared and strength and normalization are applied with
 *    ucol_setAttribute().
 *  - The token list is closed before returning.
 *
 * Changes in this revision:
 *  - the two diagnostic fprintf() calls pass the offset as int, matching
 *    "%i"; the pointer difference src.current - src.source has a wider type
 *    on LP64 targets;
 *  - the tailoring branch only touches result when ucol_initCollator()
 *    returned a collator.
 * NOTE(review): the options-only branch still uses result after
 * ucol_initCollator() without a visible NULL check; the surrounding cleanup
 * lines are not visible here, so that path is left as is.
 */
514 U_CAPI UCollator
* U_EXPORT2
515 ucol_openRules( const UChar
*rules
,
517 UColAttributeValue normalizationMode
,
518 UCollationStrength strength
,
519 UParseError
*parseError
,
522 uint32_t listLen
= 0;
524 UColAttributeValue norm
;
527 if(status
== NULL
|| U_FAILURE(*status
)){
531 if(rulesLength
< -1 || (rules
== NULL
&& rulesLength
!= 0)) {
532 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
536 if(rulesLength
== -1) {
537 rulesLength
= u_strlen(rules
);
540 if(parseError
== NULL
){
544 switch(normalizationMode
) {
548 norm
= normalizationMode
;
551 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
555 ucol_initUCA(status
);
557 if(U_FAILURE(*status
)){
561 ucol_tok_initTokenList(&src
, rules
, rulesLength
, UCA
, status
);
562 listLen
= ucol_tok_assembleTokenList(&src
,parseError
, status
);
564 if(U_FAILURE(*status
)) {
565 /* if status is U_ILLEGAL_ARGUMENT_ERROR, src->current points at the offending option */
566 /* if status is U_INVALID_FORMAT_ERROR, src->current points after the problematic part of the rules */
567 /* so something might be done here... or on lower level */
569 if(*status
== U_ILLEGAL_ARGUMENT_ERROR
) {
570 fprintf(stderr
, "bad option starting at offset %i\n", (int)(src
.current
-src
.source
));
572 fprintf(stderr
, "invalid rule just before offset %i\n", (int)(src
.current
-src
.source
));
575 ucol_tok_closeTokenList(&src
);
578 UCollator
*result
= NULL
;
579 UCATableHeader
*table
= NULL
;
581 if(src
.resultLen
> 0 || src
.removeSet
!= NULL
) { /* we have a set of rules, let's make something of it */
582 /* also, if we wanted to remove some contractions, we should make a tailoring */
583 table
= ucol_assembleTailoringTable(&src
, status
);
584 if(U_SUCCESS(*status
)) {
586 table
->version
[0] = UCOL_BUILDER_VERSION
;
587 // no tailoring information on this level
588 table
->version
[1] = table
->version
[2] = table
->version
[3] = 0;
590 u_getUnicodeVersion(table
->UCDVersion
);
592 uprv_memcpy(table
->UCAVersion
, UCA
->image
->UCAVersion
, sizeof(UVersionInfo
));
593 result
= ucol_initCollator(table
,0,status
);
/* ucol_initCollator() may not hand back a collator; do not dereference then */
if(result != NULL) {
594 result
->hasRealData
= TRUE
;
595 result
->freeImageOnClose
= TRUE
;
}
597 } else { /* no rules, but no error either */
598 // must be only options
599 // We will init the collator from UCA
600 result
= ucol_initCollator(UCA
->image
,0,status
);
601 // And set only the options
602 UColOptionSet
*opts
= (UColOptionSet
*)uprv_malloc(sizeof(UColOptionSet
));
605 *status
= U_MEMORY_ALLOCATION_ERROR
;
608 uprv_memcpy(opts
, src
.opts
, sizeof(UColOptionSet
));
609 ucol_setOptionsFromHeader(result
, opts
, status
);
610 result
->freeOptionsOnClose
= TRUE
;
611 result
->hasRealData
= FALSE
;
612 result
->freeImageOnClose
= FALSE
;
615 if(U_SUCCESS(*status
)) {
617 result
->dataInfo
.dataVersion
[0] = UCOL_BUILDER_VERSION
;
618 if(rulesLength
> 0) {
619 newRules
= (UChar
*)uprv_malloc((rulesLength
+1)*U_SIZEOF_UCHAR
);
621 if (newRules
== NULL
) {
622 *status
= U_MEMORY_ALLOCATION_ERROR
;
625 uprv_memcpy(newRules
, rules
, rulesLength
*U_SIZEOF_UCHAR
);
626 newRules
[rulesLength
]=0;
627 result
->rules
= newRules
;
628 result
->rulesLength
= rulesLength
;
629 result
->freeRulesOnClose
= TRUE
;
632 result
->elements
= NULL
;
633 result
->validLocale
= NULL
;
634 result
->requestedLocale
= NULL
;
635 ucol_setAttribute(result
, UCOL_STRENGTH
, strength
, status
);
636 ucol_setAttribute(result
, UCOL_NORMALIZATION_MODE
, norm
, status
);
649 ucol_tok_closeTokenList(&src
);
/*
 * Returns a newly allocated copy of the collator's binary data and stores
 * its size in *length.
 * When hasRealData is TRUE the whole image (image->size bytes) is copied.
 * The other visible path allocates room for a padded UCATableHeader plus a
 * padded UColOptionSet, copies the header from UCA->image and places the
 * collator's own option set after the padded header.
 * An allocation failure sets U_MEMORY_ALLOCATION_ERROR.
 * NOTE(review): the return statements are not visible here; the buffer comes
 * from uprv_malloc(), so the caller presumably releases it with uprv_free().
 */
654 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
655 /* you should be able to get the binary chunk to write out... Doesn't look very full now */
656 U_CAPI
uint8_t* U_EXPORT2
657 ucol_cloneRuleData(const UCollator
*coll
, int32_t *length
, UErrorCode
*status
)
659 uint8_t *result
= NULL
;
660 if(U_FAILURE(*status
)) {
663 if(coll
->hasRealData
== TRUE
) {
664 *length
= coll
->image
->size
;
665 result
= (uint8_t *)uprv_malloc(*length
);
667 if (result
== NULL
) {
668 *status
= U_MEMORY_ALLOCATION_ERROR
;
671 uprv_memcpy(result
, coll
->image
, *length
);
673 *length
= (int32_t)(paddedsize(sizeof(UCATableHeader
))+paddedsize(sizeof(UColOptionSet
)));
674 result
= (uint8_t *)uprv_malloc(*length
);
676 if (result
== NULL
) {
677 *status
= U_MEMORY_ALLOCATION_ERROR
;
680 uprv_memcpy(result
, UCA
->image
, sizeof(UCATableHeader
));
681 uprv_memcpy(result
+paddedsize(sizeof(UCATableHeader
)), coll
->options
, sizeof(UColOptionSet
));
/*
 * Loads the nine collation attributes of result from the option set opts,
 * marks every one of them as "default", refreshes the derived state with
 * ucol_updateInternalState() and finally makes result->options point at opts.
 * Nothing is copied: opts must stay valid for as long as result uses it.
 * A failing *status on entry is checked before any field is touched.
 *
 * Change in this revision: alternateHandlingisDefault is now reset together
 * with the other eight flags. alternateHandling was already copied from opts,
 * and ucol_initCollator() resets all nine flags, so leaving this one out left
 * the value and its flag out of step.
 */
686 void ucol_setOptionsFromHeader(UCollator
* result
, UColOptionSet
* opts
, UErrorCode
*status
) {
687 if(U_FAILURE(*status
)) {
690 result
->caseFirst
= (UColAttributeValue
)opts
->caseFirst
;
691 result
->caseLevel
= (UColAttributeValue
)opts
->caseLevel
;
692 result
->frenchCollation
= (UColAttributeValue
)opts
->frenchCollation
;
693 result
->normalizationMode
= (UColAttributeValue
)opts
->normalizationMode
;
694 result
->strength
= (UColAttributeValue
)opts
->strength
;
695 result
->variableTopValue
= opts
->variableTopValue
;
696 result
->alternateHandling
= (UColAttributeValue
)opts
->alternateHandling
;
697 result
->hiraganaQ
= (UColAttributeValue
)opts
->hiraganaQ
;
698 result
->numericCollation
= (UColAttributeValue
)opts
->numericCollation
;
700 result
->caseFirstisDefault
= TRUE
;
701 result
->caseLevelisDefault
= TRUE
;
702 result
->frenchCollationisDefault
= TRUE
;
703 result
->normalizationModeisDefault
= TRUE
;
704 result
->strengthisDefault
= TRUE
;
705 result
->variableTopValueisDefault
= TRUE
;
/* keep the flag in step with the alternateHandling value copied above */
result->alternateHandlingisDefault = TRUE;
706 result
->hiraganaQisDefault
= TRUE
;
707 result
->numericCollationisDefault
= TRUE
;
709 ucol_updateInternalState(result
, status
);
711 result
->options
= opts
;
/*
 * Inverse of ucol_setOptionsFromHeader(): writes the collator's nine current
 * attribute values (caseFirst, caseLevel, frenchCollation, normalizationMode,
 * strength, variableTopValue, alternateHandling, hiraganaQ, numericCollation)
 * into opts. A failing *status on entry is checked before any field is
 * written. The collator itself is only read.
 */
715 // doesn't look like anybody is using this
716 void ucol_putOptionsToHeader(UCollator
* result
, UColOptionSet
* opts
, UErrorCode
*status
) {
717 if(U_FAILURE(*status
)) {
720 opts
->caseFirst
= result
->caseFirst
;
721 opts
->caseLevel
= result
->caseLevel
;
722 opts
->frenchCollation
= result
->frenchCollation
;
723 opts
->normalizationMode
= result
->normalizationMode
;
724 opts
->strength
= result
->strength
;
725 opts
->variableTopValue
= result
->variableTopValue
;
726 opts
->alternateHandling
= result
->alternateHandling
;
727 opts
->hiraganaQ
= result
->hiraganaQ
;
728 opts
->numericCollation
= result
->numericCollation
;
/* Cached pointer to the FCD trie index. It starts as NULL and is filled in
 * by ucol_initCollator() with unorm_getFCDTrie() the first time it is found
 * to be NULL.
 * NOTE(review): that assignment is not guarded by any lock in the visible
 * code. */
732 static const uint16_t *fcdTrieIndex
=NULL
;
/*
 * Bit-table lookup in coll->contrEndCP.
 * Visible steps: trail surrogates and code units below coll->minContrEndCP
 * are handled by early branches; a hash value that does not fit the table
 * (>= UCOL_UNSAFECP_TABLE_SIZE*8) is folded with UCOL_UNSAFECP_TABLE_MASK and
 * offset by 256; the result is the bit (hash & 7) of byte hash>>3.
 * NOTE(review): the declarations of hash and htbyte, the initial value of
 * hash and the results of the two early branches are not visible here.
 */
736 * Approximate determination if a character is at a contraction end.
737 * Guaranteed to be TRUE if a character is at the end of a contraction,
738 * otherwise it is not deterministic.
739 * @param c character to be determined
740 * @param coll collator
743 inline UBool
ucol_contractionEndCP(UChar c
, const UCollator
*coll
) {
744 if (UTF_IS_TRAIL(c
)) {
748 if (c
< coll
->minContrEndCP
) {
754 if (hash
>= UCOL_UNSAFECP_TABLE_SIZE
*8) {
755 hash
= (hash
& UCOL_UNSAFECP_TABLE_MASK
) + 256;
757 htbyte
= coll
->contrEndCP
[hash
>>3];
758 return (((htbyte
>> (hash
& 7)) & 1) == 1);
/*
 * u_getCombiningClass() is only called for code units that are at least
 * 0x300 and that ucol_unsafeCP() reports as unsafe for this collator; every
 * other code unit skips the lookup.
 * NOTE(review): the declaration of sCC, its value for skipped code units and
 * the return statement are not visible here.
 */
764 * i_getCombiningClass()
765 * A fast, at least partly inline version of u_getCombiningClass()
766 * This is a candidate for further optimization. Used heavily
767 * in contraction processing.
770 inline uint8_t i_getCombiningClass(UChar c
, const UCollator
*coll
) {
772 if (c
>= 0x300 && ucol_unsafeCP(c
, coll
)) {
773 sCC
= u_getCombiningClass(c
);
/*
 * Sets up a UCollator on top of a binary collation image.
 * Visible steps:
 *  - result starts as fillIn; a failing status or a NULL image is rejected.
 *    One path allocates the struct with uprv_malloc() and sets freeOnClose to
 *    TRUE, the other sets it to FALSE (the condition is not visible,
 *    presumably "fillIn is NULL").
 *  - A UTrie is allocated and unserialized from the image bytes between
 *    mappingPosition and endExpansionCE. An allocation failure sets
 *    U_MEMORY_ALLOCATION_ERROR; on an unserialize failure the trie is freed.
 *  - Table pointers (latinOneMapping, contractionCEs, contractionIndex,
 *    expansion, options, unsafeCP, contrEndCP, endExpansionCE,
 *    lastEndExpansionCE, expansionCESize) are computed as offsets into the
 *    image.
 *  - All nine attributes are loaded from the image's option set and flagged
 *    as default; rules, scriptOrder and latinOneCEs are cleared.
 *  - minUnsafeCP and minContrEndCP are found by scanning 0..0x2FF for the
 *    first code unit accepted by ucol_unsafeCP() / ucol_contractionEndCP().
 *    Each field is set to 0 before its scan, so the scan runs with the lowest
 *    possible threshold.
 *  - The FCD trie index is fetched once into fcdTrieIndex, and
 *    ucol_updateInternalState() derives the remaining state.
 * NOTE(review): the return statements and the declaration of c are not
 * visible in this excerpt.
 */
779 UCollator
* ucol_initCollator(const UCATableHeader
*image
, UCollator
*fillIn
, UErrorCode
*status
) {
781 UCollator
*result
= fillIn
;
782 if(U_FAILURE(*status
) || image
== NULL
) {
787 result
= (UCollator
*)uprv_malloc(sizeof(UCollator
));
789 *status
= U_MEMORY_ALLOCATION_ERROR
;
792 result
->freeOnClose
= TRUE
;
794 result
->freeOnClose
= FALSE
;
797 result
->image
= image
;
798 const uint8_t *mapping
= (uint8_t*)result
->image
+result
->image
->mappingPosition
;
799 /*CompactEIntArray *newUCAmapping = ucmpe32_openFromData(&mapping, status);*/
800 UTrie
*newUCAmapping
= (UTrie
*)uprv_malloc(sizeof(UTrie
));
801 if(newUCAmapping
!= NULL
) {
802 utrie_unserialize(newUCAmapping
, mapping
, result
->image
->endExpansionCE
- result
->image
->mappingPosition
, status
);
804 *status
= U_MEMORY_ALLOCATION_ERROR
;
805 if(result
->freeOnClose
== TRUE
) {
811 if(U_SUCCESS(*status
)) {
812 result
->mapping
= newUCAmapping
;
814 if(result
->freeOnClose
== TRUE
) {
818 uprv_free(newUCAmapping
);
822 /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
823 result
->latinOneMapping
= UTRIE_GET32_LATIN1(result
->mapping
);
824 result
->contractionCEs
= (uint32_t*)((uint8_t*)result
->image
+result
->image
->contractionCEs
);
825 result
->contractionIndex
= (UChar
*)((uint8_t*)result
->image
+result
->image
->contractionIndex
);
826 result
->expansion
= (uint32_t*)((uint8_t*)result
->image
+result
->image
->expansion
);
828 result
->options
= (UColOptionSet
*)((uint8_t*)result
->image
+result
->image
->options
);
829 result
->freeOptionsOnClose
= FALSE
;
832 result
->caseFirst
= (UColAttributeValue
)result
->options
->caseFirst
;
833 result
->caseLevel
= (UColAttributeValue
)result
->options
->caseLevel
;
834 result
->frenchCollation
= (UColAttributeValue
)result
->options
->frenchCollation
;
835 result
->normalizationMode
= (UColAttributeValue
)result
->options
->normalizationMode
;
836 result
->strength
= (UColAttributeValue
)result
->options
->strength
;
837 result
->variableTopValue
= result
->options
->variableTopValue
;
838 result
->alternateHandling
= (UColAttributeValue
)result
->options
->alternateHandling
;
839 result
->hiraganaQ
= (UColAttributeValue
)result
->options
->hiraganaQ
;
840 result
->numericCollation
= (UColAttributeValue
)result
->options
->numericCollation
;
842 result
->caseFirstisDefault
= TRUE
;
843 result
->caseLevelisDefault
= TRUE
;
844 result
->frenchCollationisDefault
= TRUE
;
845 result
->normalizationModeisDefault
= TRUE
;
846 result
->strengthisDefault
= TRUE
;
847 result
->variableTopValueisDefault
= TRUE
;
848 result
->alternateHandlingisDefault
= TRUE
;
849 result
->hiraganaQisDefault
= TRUE
;
850 result
->numericCollationisDefault
= TRUE
;
852 result
->scriptOrder
= NULL
;
854 result
->rules
= NULL
;
855 result
->rulesLength
= 0;
857 /* get the version info from UCATableHeader and populate the Collator struct*/
858 result
->dataInfo
.dataVersion
[0] = result
->image
->version
[0]; /* UCA Builder version*/
859 result
->dataInfo
.dataVersion
[1] = result
->image
->version
[1]; /* UCA Tailoring rules version*/
861 result
->unsafeCP
= (uint8_t *)result
->image
+ result
->image
->unsafeCP
;
862 result
->minUnsafeCP
= 0;
863 for (c
=0; c
<0x300; c
++) { // Find the smallest unsafe char.
864 if (ucol_unsafeCP(c
, result
)) break;
866 result
->minUnsafeCP
= c
;
868 result
->contrEndCP
= (uint8_t *)result
->image
+ result
->image
->contrEndCP
;
869 result
->minContrEndCP
= 0;
870 for (c
=0; c
<0x300; c
++) { // Find the Contraction-ending char.
871 if (ucol_contractionEndCP(c
, result
)) break;
873 result
->minContrEndCP
= c
;
875 /* max expansion tables */
876 result
->endExpansionCE
= (uint32_t*)((uint8_t*)result
->image
+
877 result
->image
->endExpansionCE
);
878 result
->lastEndExpansionCE
= result
->endExpansionCE
+
879 result
->image
->endExpansionCECount
- 1;
880 result
->expansionCESize
= (uint8_t*)result
->image
+
881 result
->image
->expansionCESize
;
883 if (fcdTrieIndex
== NULL
) {
884 fcdTrieIndex
= unorm_getFCDTrie(status
);
887 //result->errorCode = *status;
889 result
->latinOneCEs
= NULL
;
891 result
->latinOneRegenTable
= FALSE
;
892 result
->latinOneFailed
= FALSE
;
894 ucol_updateInternalState(result
, status
);
904 udata_close(UCA_DATA_MEM
);
914 /* Following is a port of Mark's code for new treatment of implicits.
915 * It is positioned here, since ucol_initUCA needs to initialize the
916 * variables below according to the data in the fractional UCA.
921 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
922 * b) bump any non-CJK characters by 10FFFF.
923 * The relevant blocks are:
924 * A: 4E00..9FFF; CJK Unified Ideographs
925 * F900..FAFF; CJK Compatibility Ideographs
926 * B: 3400..4DBF; CJK Unified Ideographs Extension A
927 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
929 * no new B characters are allocated between 4E00 and FAFF, and
930 * no new A characters are outside of this range,
931 * (very high probability) this simple code will work.
932 * The reordered blocks are:
934 * Block2 is CJK_COMPAT_USED
936 * Any other CJK gets its normal code point
937 * Any non-CJK gets +10FFFF
938 * When we reorder Block1, we make sure that it is at the very start,
939 * so that it will use a 3-byte form.
/*
 * Constants for generating implicit primary weights.
 *  - The first group is fixed: the offset added to non-CJK code points and
 *    the byte-value counts derived from BYTES_TO_AVOID.
 *  - The second group starts at 0 and is filled in by
 *    uprv_uca_initImplicitConstants().
 *  - The third group holds the CJK block boundaries used by swapCJK().
 * NOTE(review): BYTES_TO_AVOID, LAST_MULTIPLIER, CJK_BASE and CJK_A_BASE are
 * used in this file, but their definitions are on lines that are not visible
 * here.
 */
943 static const uint32_t
944 NON_CJK_OFFSET
= 0x110000,
946 OTHER_COUNT
= 256 - BYTES_TO_AVOID
,
947 LAST_COUNT
= OTHER_COUNT
/ 2,
948 LAST_COUNT2
= OTHER_COUNT
/ 21, // room for intervening, without expanding to 5 bytes
949 IMPLICIT_3BYTE_COUNT
= 1;
951 // These depend on initUCA, and are initialized at that time
953 IMPLICIT_BASE_BYTE
= 0,
954 IMPLICIT_LIMIT_BYTE
= 0, // leave room for 1 3-byte and 2 4-byte forms
956 IMPLICIT_4BYTE_BOUNDARY
= 0,
958 LAST2_MULTIPLIER
= 0,
959 IMPLICIT_BASE_3BYTE
= 0,
960 IMPLICIT_BASE_4BYTE
= 0;
964 CJK_LIMIT
= 0x9FFF+1,
965 CJK_COMPAT_USED_BASE
= 0xFA0E,
966 CJK_COMPAT_USED_LIMIT
= 0xFA2F+1,
968 CJK_A_LIMIT
= 0x4DBF+1,
969 CJK_B_BASE
= 0x20000,
970 CJK_B_LIMIT
= 0x2A6DF+1;
/*
 * Remaps a code point so that CJK ideographs sort first, in a fixed order.
 * Visible mapping:
 *  - [CJK_BASE, CJK_LIMIT)                         -> cp - CJK_BASE
 *  - [CJK_COMPAT_USED_BASE, CJK_COMPAT_USED_LIMIT) -> placed directly after
 *    the first block
 *  - [CJK_A_BASE, CJK_A_LIMIT)                     -> placed directly after
 *    the two blocks above
 *  - [CJK_B_BASE, CJK_B_LIMIT)                     -> unchanged
 *  - every other code point                        -> cp + NON_CJK_OFFSET
 */
972 static inline UChar32
swapCJK(UChar32 cp
) {
974 if (cp
>= CJK_BASE
) {
975 if (cp
< CJK_LIMIT
) return cp
- CJK_BASE
;
977 if (cp
< CJK_COMPAT_USED_BASE
) return cp
+ NON_CJK_OFFSET
;
979 if (cp
< CJK_COMPAT_USED_LIMIT
) return cp
- CJK_COMPAT_USED_BASE
980 + (CJK_LIMIT
- CJK_BASE
);
981 if (cp
< CJK_B_BASE
) return cp
+ NON_CJK_OFFSET
;
983 if (cp
< CJK_B_LIMIT
) return cp
; // non-BMP-CJK
985 return cp
+ NON_CJK_OFFSET
; // non-CJK
987 if (cp
< CJK_A_BASE
) return cp
+ NON_CJK_OFFSET
;
989 if (cp
< CJK_A_LIMIT
) return cp
- CJK_A_BASE
990 + (CJK_LIMIT
- CJK_BASE
)
991 + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
);
992 return cp
+ NON_CJK_OFFSET
; // non-CJK
/*
 * Computes a left-justified implicit primary weight for a code point.
 * Two result forms are visible:
 *  - 3-byte form: cp is split by LAST_COUNT and then OTHER_COUNT, and the
 *    result is IMPLICIT_BASE_3BYTE + (last2 << 24) + (last1 << 16)
 *    + ((last0 * LAST_MULTIPLIER) << 8).
 *  - 4-byte form: last0 = cp - IMPLICIT_4BYTE_BOUNDARY is split by
 *    LAST_COUNT2 and then twice by OTHER_COUNT, and the result is
 *    IMPLICIT_BASE_4BYTE + (last3 << 24) + (last2 << 16) + (last1 << 8)
 *    + (last0 * LAST2_MULTIPLIER).
 * The constants involved are set by uprv_uca_initImplicitConstants().
 * NOTE(review): the condition that selects the form and the CJK remapping of
 * cp mentioned in the comments below are on lines that are not visible here.
 */
996 // GET IMPLICIT PRIMARY WEIGHTS
997 // Return value is left justified primary key
999 static inline uint32_t getImplicitPrimary(UChar32 cp
) {
1001 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1005 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1007 // we now have a range of numbers from 0 to 21FFFF.
1009 // we must skip all 00, 01, 02 bytes, so most bytes have 253 values
1010 // we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
1011 // we shift so that HAN all has the same first primary, for compression.
1012 // for the 4 byte case, we make the gap as large as we can fit.
1013 // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
1014 // Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
1016 int32_t last0
= cp
- IMPLICIT_4BYTE_BOUNDARY
;
1018 int32_t last1
= cp
/ LAST_COUNT
;
1019 last0
= cp
% LAST_COUNT
;
1021 int32_t last2
= last1
/ OTHER_COUNT
;
1022 last1
%= OTHER_COUNT
;
1024 if (DEBUG || last2 > 0xFF-BYTES_TO_AVOID) System.out.println("3B: " + Utility.hex(cp) + " => "
1025 + Utility.hex(last2) + ", "
1026 + Utility.hex(last1) + ", "
1027 + Utility.hex(last0) + ", "
1031 return IMPLICIT_BASE_3BYTE
+ (last2
<< 24) + (last1
<< 16) + ((last0
*LAST_MULTIPLIER
) << 8);
1033 int32_t last1
= last0
/ LAST_COUNT2
;
1034 last0
%= LAST_COUNT2
;
1036 int32_t last2
= last1
/ OTHER_COUNT
;
1037 last1
%= OTHER_COUNT
;
1039 int32_t last3
= last2
/ OTHER_COUNT
;
1040 last2
%= OTHER_COUNT
;
1043 if (DEBUG || last3 > 0xFF-BYTES_TO_AVOID) System.out.println("4B: " + Utility.hex(cp) + " => "
1044 + Utility.hex(last3) + ", "
1045 + Utility.hex(last2) + ", "
1046 + Utility.hex(last1) + ", "
1047 + Utility.hex(last0 * LAST2_MULTIPLIER) + ", "
1051 return IMPLICIT_BASE_4BYTE
+ (last3
<< 24) + (last2
<< 16) + (last1
<< 8) + (last0
* LAST2_MULTIPLIER
);
/*
 * Derives the file-scope implicit-weight constants from baseByte, the lead
 * byte of the implicit primary range:
 *  - IMPLICIT_LIMIT_BYTE is baseByte + 4;
 *  - IMPLICIT_4BYTE_BOUNDARY is the number of code points that fit the
 *    3-byte form (IMPLICIT_3BYTE_COUNT * OTHER_COUNT * LAST_COUNT);
 *  - LAST_MULTIPLIER and LAST2_MULTIPLIER are the gaps between successive
 *    last-byte values in the 3-byte and 4-byte forms;
 *  - IMPLICIT_BASE_3BYTE and IMPLICIT_BASE_4BYTE are the smallest weights of
 *    the two forms.
 * ucol_initUCA() calls this with UCAconsts->UCA_PRIMARY_IMPLICIT_MIN.
 */
1055 /* this function is either called from initUCA or from genUCA before
1056 * doing canonical closure for the UCA.
1058 U_CAPI
void U_EXPORT2
1059 uprv_uca_initImplicitConstants(uint32_t baseByte
)
1061 IMPLICIT_BASE_BYTE
= baseByte
;
1062 IMPLICIT_LIMIT_BYTE
= IMPLICIT_BASE_BYTE
+ 4; // leave room for 1 3-byte and 2 4-byte forms
1064 IMPLICIT_4BYTE_BOUNDARY
= IMPLICIT_3BYTE_COUNT
* OTHER_COUNT
* LAST_COUNT
;
1065 LAST_MULTIPLIER
= OTHER_COUNT
/ LAST_COUNT
;
1066 LAST2_MULTIPLIER
= OTHER_COUNT
/ LAST_COUNT2
;
1067 IMPLICIT_BASE_3BYTE
= (IMPLICIT_BASE_BYTE
<< 24) + 0x030300;
1068 IMPLICIT_BASE_4BYTE
= ((IMPLICIT_BASE_BYTE
+ IMPLICIT_3BYTE_COUNT
) << 24) + 0x030303;
// ucol_initUCA(status): loads the UCA collation data and sets up the file-level
// UCA collator. Steps visible below:
//   1. Test the incoming status with U_FAILURE.
//   2. Record whether UCA is still NULL.
//   3. Open the data with udata_openChoice, validated by isAcceptableUCA.
//   4. Build a collator over the data image with ucol_initCollator.
//   5. On success, clear elements and the locale fields, and mark the collator as
//      not owning its data (hasRealData and freeImageOnClose are FALSE).
//   6. Store the data handle in UCA_DATA_MEM, register cleanup, set UCAconsts,
//      initialise the implicit-weight constants and install _getFoldingOffset.
// NOTE(review): the return type, the return statements, any locking, and the
// branches that decide which udata_close call runs are missing from this listing
// (the embedded line numbers skip). Confirm control flow against the full file.
1071 /* do not close UCA returned by ucol_initUCA! */
1073 ucol_initUCA(UErrorCode
*status
) {
1074 if(U_FAILURE(*status
)) {
1078 UBool f
= (UCA
== NULL
);
1082 UCollator
*newUCA
= NULL
;
1083 UDataMemory
*result
= udata_openChoice(NULL
, UCA_DATA_TYPE
, UCA_DATA_NAME
, isAcceptableUCA
, NULL
, status
);
1085 if(U_FAILURE(*status
)) {
1087 udata_close(result
);
1092 if(result
!= NULL
) { /* It looks like sometimes we can fail to find the data file */
1093 newUCA
= ucol_initCollator((const UCATableHeader
*)udata_getMemory(result
), newUCA
, status
);
1094 if(U_SUCCESS(*status
)){
1096 newUCA
->elements
= NULL
;
1097 newUCA
->validLocale
= NULL
;
1098 newUCA
->requestedLocale
= NULL
;
1099 newUCA
->hasRealData
= FALSE
; // real data lives in .dat file...
1100 newUCA
->freeImageOnClose
= FALSE
;
// Keep the data handle in a file-level static next to the collator.
1104 UCA_DATA_MEM
= result
;
// NOTE(review): result is closed here when newUCA is non-NULL; the lines that
// would explain when newUCA stays non-NULL are missing from this listing.
1110 if(newUCA
!= NULL
) {
1111 udata_close(result
);
1115 ucln_i18n_registerCleanup();
1117 // Initialize variables for implicit generation
// UCAconsts is located at byte offset image->UCAConsts from the start of the image.
1118 UCAconsts
= (UCAConstants
*)((uint8_t *)UCA
->image
+ UCA
->image
->UCAConsts
);
1119 uprv_uca_initImplicitConstants(UCAconsts
->UCA_PRIMARY_IMPLICIT_MIN
);
1120 UCA
->mapping
->getFoldingOffset
= _getFoldingOffset
;
1122 udata_close(result
);
// collIterNormalize(collationSource): decomposes the source range
// [pos - 1, fcdPosition) into the iterator writable buffer with unorm_decompose,
// then switches the iterator to read from that buffer.
// If the first attempt reports U_BUFFER_OVERFLOW_ERROR or
// U_STRING_NOT_TERMINATED_WARNING, the buffer is grown with
// u_growBufferFromStatic to hold normLen + 1 units and the call is repeated.
// Failures are reported on stderr.
// NOTE(review): the trailing arguments of both unorm_decompose calls, the
// declaration of normLen and the error-path returns are missing from this listing.
1132 /* collIterNormalize Incremental Normalization happens here. */
1133 /* pick up the range of chars identified by FCD, */
1134 /* normalize it into the collIterate's writable buffer, */
1135 /* switch the collIterate's state to use the writable buffer. */
1138 void collIterNormalize(collIterate
*collationSource
)
1140 UErrorCode status
= U_ZERO_ERROR
;
1143 UChar
*srcP
= collationSource
->pos
- 1; /* Start of chars to normalize */
1144 UChar
*endP
= collationSource
->fcdPosition
; /* End of region to normalize+1 */
1146 normLen
= unorm_decompose(collationSource
->writableBuffer
, (int32_t)collationSource
->writableBufSize
,
1147 srcP
, (int32_t)(endP
- srcP
),
// Buffer too small, or filled with no room for a terminator: grow it and retry.
1150 if(status
== U_BUFFER_OVERFLOW_ERROR
|| status
== U_STRING_NOT_TERMINATED_WARNING
) {
1151 // reallocate and terminate
1152 if(!u_growBufferFromStatic(collationSource
->stackWritableBuffer
,
1153 &collationSource
->writableBuffer
,
1154 (int32_t *)&collationSource
->writableBufSize
, normLen
+ 1,
1158 fprintf(stderr
, "collIterNormalize(), out of memory\n");
1162 status
= U_ZERO_ERROR
;
1163 normLen
= unorm_decompose(collationSource
->writableBuffer
, (int32_t)collationSource
->writableBufSize
,
1164 srcP
, (int32_t)(endP
- srcP
),
1168 if (U_FAILURE(status
)) {
1170 fprintf(stderr
, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status
));
// A buffer that is no longer the stack buffer is flagged as allocated.
1175 if(collationSource
->writableBuffer
!= collationSource
->stackWritableBuffer
) {
1176 collationSource
->flags
|= UCOL_ITER_ALLOCATED
;
// Read from the side buffer from now on; origFlags keeps the flags as they were before the switch.
1178 collationSource
->pos
= collationSource
->writableBuffer
;
1179 collationSource
->origFlags
= collationSource
->flags
;
1180 collationSource
->flags
|= UCOL_ITER_INNORMBUF
;
1181 collationSource
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
// normalizeIterator(collationSource): iterator-based counterpart of collIterNormalize.
// Saves the iterator state with getState, calls unorm_next in UNORM_FCD mode to fill
// the writable buffer, and if the result overflowed (or exactly filled) the buffer,
// grows it with u_growBufferFromStatic, restores the saved state with setState and
// calls unorm_next again. The buffer is then zero-terminated at normLen and the
// iterator is switched to read from it.
// NOTE(review): the error-path return after the out-of-memory message is missing
// from this listing.
1185 // This function takes the iterator and extracts normalized stuff up to the next boundary
1186 // It is similar in the end results to the collIterNormalize, but for the cases when we
1189 inline void normalizeIterator(collIterate
*collationSource
) {
1190 UErrorCode status
= U_ZERO_ERROR
;
1191 UBool wasNormalized
= FALSE
;
1192 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
// Remember where the iterator is so a retry can start from the same place.
1193 uint32_t iterIndex
= collationSource
->iterator
->getState(collationSource
->iterator
);
1194 int32_t normLen
= unorm_next(collationSource
->iterator
, collationSource
->writableBuffer
,
1195 (int32_t)collationSource
->writableBufSize
, UNORM_FCD
, 0, TRUE
, &wasNormalized
, &status
);
// normLen == writableBufSize also triggers a grow, so the terminator written below has room.
1196 if(status
== U_BUFFER_OVERFLOW_ERROR
|| normLen
== (int32_t)collationSource
->writableBufSize
) {
1197 // reallocate and terminate
1198 if(!u_growBufferFromStatic(collationSource
->stackWritableBuffer
,
1199 &collationSource
->writableBuffer
,
1200 (int32_t *)&collationSource
->writableBufSize
, normLen
+ 1,
1204 fprintf(stderr
, "normalizeIterator(), out of memory\n");
1208 status
= U_ZERO_ERROR
;
1209 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1210 collationSource
->iterator
->setState(collationSource
->iterator
, iterIndex
, &status
);
1211 normLen
= unorm_next(collationSource
->iterator
, collationSource
->writableBuffer
,
1212 (int32_t)collationSource
->writableBufSize
, UNORM_FCD
, 0, TRUE
, &wasNormalized
, &status
);
1214 // Terminate the buffer - we already checked that it is big enough
1215 collationSource
->writableBuffer
[normLen
] = 0;
1216 if(collationSource
->writableBuffer
!= collationSource
->stackWritableBuffer
) {
1217 collationSource
->flags
|= UCOL_ITER_ALLOCATED
;
// Read from the side buffer from now on; origFlags keeps the flags as they were before the switch.
1219 collationSource
->pos
= collationSource
->writableBuffer
;
1220 collationSource
->origFlags
= collationSource
->flags
;
1221 collationSource
->flags
|= UCOL_ITER_INNORMBUF
;
1222 collationSource
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
// collIterFCD(collationSource): forward FCD check starting at pos - 1.
// Looks up the 16-bit FCD value of the current character (combining a surrogate
// pair when a trail surrogate follows). The low byte is taken as the trailing
// combining class and the high byte as the leading combining class.
// When the trailing class is non-zero, the scan continues forward until a
// character with leading class zero is found or the end is reached; a leading
// class smaller than the previous trailing class sets needNormalize.
// On exit fcdPosition is set to the scan position and needNormalize is returned.
// NOTE(review): the statements that read c and advance srcP, the else branches
// and the loop exits are missing from this listing.
1226 /* Incremental FCD check and normalize */
1227 /* Called from getNextCE when normalization state is suspect. */
1228 /* When entering, the state is known to be this: */
1229 /* o We are working in the main buffer of the collIterate, not the side */
1230 /* writable buffer. When in the side buffer, normalization mode is always off, */
1231 /* so we won't get here. */
1232 /* o The leading combining class from the current character is 0 or */
1233 /* the trailing combining class of the previous char was zero. */
1234 /* True because the previous call to this function will have always exited */
1235 /* that way, and we get called for every char where cc might be non-zero. */
1237 inline UBool
collIterFCD(collIterate
*collationSource
) {
1239 const UChar
*srcP
, *endP
;
1241 uint8_t prevTrailingCC
= 0;
1243 UBool needNormalize
= FALSE
;
1245 srcP
= collationSource
->pos
-1;
// endP is only taken from endp when the string has an explicit length.
1247 if (collationSource
->flags
& UCOL_ITER_HASLEN
) {
1248 endP
= collationSource
->endp
;
1253 // Get the trailing combining class of the current character. If it's zero,
1257 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1259 if (UTF_IS_FIRST_SURROGATE(c
)) {
1260 if ((endP
== NULL
|| srcP
!= endP
) && UTF_IS_SECOND_SURROGATE(c2
=*srcP
)) {
1262 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c2
);
1268 prevTrailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1270 if (prevTrailingCC
!= 0) {
1271 // The current char has a non-zero trailing CC. Scan forward until we find
1272 // a char with a leading cc of zero.
1273 while (endP
== NULL
|| srcP
!= endP
)
// Saved so the scan can step back over a character that is not part of the sequence.
1275 const UChar
*savedSrcP
= srcP
;
1279 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1280 if (fcd
!= 0 && UTF_IS_FIRST_SURROGATE(c
)) {
1281 if ((endP
== NULL
|| srcP
!= endP
) && UTF_IS_SECOND_SURROGATE(c2
=*srcP
)) {
1283 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c2
);
1288 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1289 if (leadingCC
== 0) {
1290 srcP
= savedSrcP
; // Hit char that is not part of combining sequence.
1291 // back up over it. (Could be surrogate pair!)
// Combining classes out of order: the range needs normalizing.
1295 if (leadingCC
< prevTrailingCC
) {
1296 needNormalize
= TRUE
;
1299 prevTrailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1304 collationSource
->fcdPosition
= (UChar
*)srcP
;
1306 return needNormalize
;
// ucol_IGetNextCE(coll, collationSource, status): returns the next collation
// element in forward order. Order of work visible below:
//   1. If CEs from an earlier expansion are still buffered (CEpos > toReturn),
//      return the next one and reset the buffer pointers once it is drained.
//   2. Otherwise fetch the next code unit ch from the string, the UCharIterator
//      or the normalization side buffer, depending on the flags.
//   3. When UCOL_HIRAGANA_Q is set, set or clear UCOL_WAS_HIRAGANA from ch.
//   4. When UCOL_ITER_NORM is set, run the FCD quick checks and, if needed,
//      collIterFCD followed by collIterNormalize.
//   5. Look ch up in coll->latinOneMapping or the coll->mapping trie, resolve
//      special CEs through ucol_prv_getSpecialCE, and fall back to the UCA
//      mapping when the result is UCOL_NOT_FOUND.
// UCOL_NO_MORE_CES is returned at the end of input.
// NOTE(review): many lines are missing from this listing (declarations of order
// and ch, break and continue statements, else branches, closing braces), so the
// nesting of the conditions below cannot be confirmed from here.
1309 /****************************************************************************/
1310 /* Following are the CE retrieval functions */
1312 /****************************************************************************/
1314 /* there should be a macro version of this function in the header file */
1315 /* This is the first function that tries to fetch a collation element */
1316 /* If it's not successful or it encounters a more difficult situation */
1317 /* some more sophisticated and slower functions are invoked */
1319 inline uint32_t ucol_IGetNextCE(const UCollator
*coll
, collIterate
*collationSource
, UErrorCode
*status
) {
1321 if (collationSource
->CEpos
> collationSource
->toReturn
) { /* Are there any CEs from previous expansions? */
1322 order
= *(collationSource
->toReturn
++); /* if so, return them */
// Buffer drained: reset both pointers to the start of the CE array.
1323 if(collationSource
->CEpos
== collationSource
->toReturn
) {
1324 collationSource
->CEpos
= collationSource
->toReturn
= collationSource
->CEs
;
1331 for (;;) /* Loop handles case when incremental normalize switches */
1332 { /* to or from the side buffer / original string, and we */
1333 /* need to start again to get the next character. */
1335 if ((collationSource
->flags
& (UCOL_ITER_HASLEN
| UCOL_ITER_INNORMBUF
| UCOL_ITER_NORM
| UCOL_HIRAGANA_Q
| UCOL_USE_ITERATOR
)) == 0)
1337 // The source string is null terminated and we're not working from the side buffer,
1338 // and we're not normalizing. This is the fast path.
1339 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1340 ch
= *collationSource
->pos
++;
1345 return UCOL_NO_MORE_CES
;
1349 if (collationSource
->flags
& UCOL_ITER_HASLEN
) {
1350 // Normal path for strings when length is specified.
1351 // (We can't be in side buffer because it is always null terminated.)
1352 if (collationSource
->pos
>= collationSource
->endp
) {
1353 // Ran off of the end of the main source string. We're done.
1354 return UCOL_NO_MORE_CES
;
1356 ch
= *collationSource
->pos
++;
1358 else if(collationSource
->flags
& UCOL_USE_ITERATOR
) {
1359 UChar32 iterCh
= collationSource
->iterator
->next(collationSource
->iterator
);
1360 if(iterCh
== U_SENTINEL
) {
1361 return UCOL_NO_MORE_CES
;
1367 // Null terminated string.
1368 ch
= *collationSource
->pos
++;
1370 // Ran off end of buffer.
1371 if ((collationSource
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1372 // Ran off end of main string. backing up one character.
1373 collationSource
->pos
--;
1374 return UCOL_NO_MORE_CES
;
1378 // Hit null in the normalize side buffer.
1379 // Usually this means the end of the normalized data,
1380 // except for one odd case: a null followed by combining chars,
1381 // which is the case if we are at the start of the buffer.
1382 if (collationSource
->pos
== collationSource
->writableBuffer
+1) {
1386 // Null marked end of side buffer.
1387 // Revert to the main string and
1388 // loop back to top to try again to get a character.
1389 collationSource
->pos
= collationSource
->fcdPosition
;
1390 collationSource
->flags
= collationSource
->origFlags
;
// NOTE(review): ucol_IGetPrevCE in this file tests the wider range 0x3040..0x309f
// for the same flag; confirm whether the two are meant to differ.
1396 if(collationSource
->flags
&UCOL_HIRAGANA_Q
) {
1397 if((ch
>=0x3040 && ch
<=0x3094) || ch
== 0x309d || ch
== 0x309e) {
1398 collationSource
->flags
|= UCOL_WAS_HIRAGANA
;
1400 collationSource
->flags
&= ~UCOL_WAS_HIRAGANA
;
1404 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1405 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1406 if ((collationSource
->flags
& UCOL_ITER_NORM
) == 0) {
1410 if (collationSource
->fcdPosition
>= collationSource
->pos
) {
1411 // An earlier FCD check has already covered the current character.
1412 // We can go ahead and process this char.
1416 if (ch
< ZERO_CC_LIMIT_
) {
1417 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1421 if (ch
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1422 // We need to peek at the next character in order to tell if we are FCD
1423 if ((collationSource
->flags
& UCOL_ITER_HASLEN
) && collationSource
->pos
>= collationSource
->endp
) {
1424 // We are at the last char of source string.
1425 // It is always OK for FCD check.
1429 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1430 if (*collationSource
->pos
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1436 // Need a more complete FCD check and possible normalization.
1437 if (collIterFCD(collationSource
)) {
1438 collIterNormalize(collationSource
);
1440 if ((collationSource
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1441 // No normalization was needed. Go ahead and process the char we already had.
1445 // Some normalization happened. Next loop iteration will pick up a char
1446 // from the normalization buffer.
1452 /* For latin-1 characters we never need to fall back to the UCA table */
1453 /* because all of the UCA data is replicated in the latinOneMapping array */
1454 order
= coll
->latinOneMapping
[ch
];
1455 if (order
> UCOL_NOT_FOUND
) {
1456 order
= ucol_prv_getSpecialCE(coll
, ch
, order
, collationSource
, status
);
1461 order
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, ch
);
1462 if(order
> UCOL_NOT_FOUND
) { /* if a CE is special */
1463 order
= ucol_prv_getSpecialCE(coll
, ch
, order
, collationSource
, status
); /* and try to get the special CE */
1465 if(order
== UCOL_NOT_FOUND
) { /* We couldn't find a good CE in the tailoring */
1466 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1467 order
= UTRIE_GET32_FROM_LEAD(UCA
->mapping
, ch
);
1469 if(order
> UCOL_NOT_FOUND
) { /* UCA also gives us a special CE */
1470 order
= ucol_prv_getSpecialCE(UCA
, ch
, order
, collationSource
, status
);
1474 return order
; /* return the CE */
// Exported wrapper: forwards its three arguments unchanged to the inline
// ucol_IGetNextCE and returns its result.
1477 /* ucol_getNextCE, out-of-line version for use from other files. */
1478 U_CAPI
uint32_t U_EXPORT2
1479 ucol_getNextCE(const UCollator
*coll
, collIterate
*collationSource
, UErrorCode
*status
) {
1480 return ucol_IGetNextCE(coll
, collationSource
, status
);
// collPrevIterNormalize(data): backward-iteration counterpart of collIterNormalize.
// Normalizes (UNORM_NFD) the range from pStart through pEnd inclusive, where pEnd
// is data->pos and pStart is data->string when fcdPosition is NULL, otherwise
// fcdPosition + 1.
// The first unorm_normalize call passes a destination capacity of 0, so it only
// measures the result length. If the writable buffer is not larger than that
// length, a new one of normLen + 1 units is allocated with uprv_malloc.
// The result is written at the END of the buffer with a zero placed just before
// it, and pos is set to the end of the buffer so backward iteration reads it.
// NOTE(review): declarations of pStart, pStartNorm and normLen, the tail of the
// uprv_malloc and second unorm_normalize calls, and the return on allocation
// failure are missing from this listing.
1485 * Incremental previous normalization happens here. Pick up the range of chars
1486 * identifed by FCD, normalize it into the collIterate's writable buffer,
1487 * switch the collIterate's state to use the writable buffer.
1488 * @param data collation iterator data
1491 void collPrevIterNormalize(collIterate
*data
)
1493 UErrorCode status
= U_ZERO_ERROR
;
1494 UChar
*pEnd
= data
->pos
; /* End normalize + 1 */
1499 /* Start normalize */
1500 if (data
->fcdPosition
== NULL
) {
1501 pStart
= data
->string
;
1504 pStart
= data
->fcdPosition
+ 1;
// Measuring pass: destination capacity is 0.
1507 normLen
= unorm_normalize(pStart
, (pEnd
- pStart
) + 1, UNORM_NFD
, 0,
1508 data
->writableBuffer
, 0, &status
);
1510 if (data
->writableBufSize
<= normLen
) {
1511 freeHeapWritableBuffer(data
);
1512 data
->writableBuffer
= (UChar
*)uprv_malloc((normLen
+ 1) *
1514 if(data
->writableBuffer
== NULL
) { // something is wrong here, return
1517 data
->flags
|= UCOL_ITER_ALLOCATED
;
1518 /* to handle the zero termination */
1519 data
->writableBufSize
= normLen
+ 1;
1521 status
= U_ZERO_ERROR
;
1523 this puts the null termination infront of the normalized string instead
// Right-align the normalized text in the buffer and put the zero just before it.
1526 pStartNorm
= data
->writableBuffer
+ (data
->writableBufSize
- normLen
);
1527 *(pStartNorm
- 1) = 0;
1528 unorm_normalize(pStart
, (pEnd
- pStart
) + 1, UNORM_NFD
, 0, pStartNorm
,
// pos starts one past the last unit of the buffer.
1531 data
->pos
= data
->writableBuffer
+ data
->writableBufSize
;
1532 data
->origFlags
= data
->flags
;
1533 data
->flags
|= UCOL_ITER_INNORMBUF
;
1534 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
// collPrevIterFCD(data): backward FCD check, mirror image of collIterFCD.
// Starts at data->pos + 1 and works toward data->string. For each character the
// 16-bit FCD value is looked up: directly for non-surrogates, and through the
// lead surrogate plus unorm_getFCD16FromSurrogatePair when a trail surrogate is
// preceded by a lead surrogate. The high byte is used as the leading combining
// class and the low byte as the trailing combining class.
// While the leading class is non-zero the scan moves backward; fcdPosition is set
// to NULL at one point in that scan and to the final scan position at the end.
// NOTE(review): the statements that read c and move src, the unpaired-surrogate
// branches, the loop header, the assignments to result and the return statement
// are all missing from this listing.
1539 * Incremental FCD check for previous iteration and normalize. Called from
1540 * getPrevCE when normalization state is suspect.
1541 * When entering, the state is known to be this:
1542 * o We are working in the main buffer of the collIterate, not the side
1543 * writable buffer. When in the side buffer, normalization mode is always
1544 * off, so we won't get here.
1545 * o The leading combining class from the current character is 0 or the
1546 * trailing combining class of the previous char was zero.
1547 * True because the previous call to this function will have always exited
1548 * that way, and we get called for every char where cc might be non-zero.
1549 * @param data collation iterate struct
1550 * @return normalization status, TRUE for normalization to be done, FALSE
1554 inline UBool
collPrevIterFCD(collIterate
*data
)
1556 const UChar
*src
, *start
;
1559 uint8_t trailingCC
= 0;
1561 UBool result
= FALSE
;
1563 start
= data
->string
;
1564 src
= data
->pos
+ 1;
1566 /* Get the trailing combining class of the current character. */
1568 if (!UTF_IS_SURROGATE(c
)) {
1569 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1570 } else if (UTF_IS_SECOND_SURROGATE(c
) && start
< src
&& UTF_IS_FIRST_SURROGATE(c2
= *(src
- 1))) {
1572 fcd
= unorm_getFCD16(fcdTrieIndex
, c2
);
1574 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c
);
1576 } else /* unpaired surrogate */ {
1580 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1582 if (leadingCC
!= 0) {
1584 The current char has a non-zero leading combining class.
1585 Scan backward until we find a char with a trailing cc of zero.
1590 data
->fcdPosition
= NULL
;
1595 if (!UTF_IS_SURROGATE(c
)) {
1596 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1597 } else if (UTF_IS_SECOND_SURROGATE(c
) && start
< src
&& UTF_IS_FIRST_SURROGATE(c2
= *(src
- 1))) {
1599 fcd
= unorm_getFCD16(fcdTrieIndex
, c2
);
1601 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c
);
1603 } else /* unpaired surrogate */ {
1607 trailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1609 if (trailingCC
== 0) {
1613 if (leadingCC
< trailingCC
) {
1617 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1621 data
->fcdPosition
= (UChar
*)src
;
// peekCharacter(source, offset): reads one code unit relative to the current position.
//   - With a non-NULL pos pointer: returns *(pos + offset).
//   - Otherwise, with a non-NULL iterator: either moves by offset, reads with
//     next and moves back by -offset - 1, or returns the iterator current value.
//   - Otherwise: returns U_SENTINEL cast to UChar.
// No bounds checks are visible.
// NOTE(review): the condition choosing between the move/next/move path and the
// current path, and the return of toReturn, are missing from this listing.
1626 /** gets a character from the string at a given offset
1627 * Handles both normal and iterative cases.
1628 * No error checking - caller beware!
1631 UChar
peekCharacter(collIterate
*source
, int32_t offset
) {
1632 if(source
->pos
!= NULL
) {
1633 return *(source
->pos
+ offset
);
1634 } else if(source
->iterator
!= NULL
) {
1636 source
->iterator
->move(source
->iterator
, offset
, UITER_CURRENT
);
1637 UChar toReturn
= (UChar
)source
->iterator
->next(source
->iterator
);
// Undo both the move by offset and the single step taken by next.
1638 source
->iterator
->move(source
->iterator
, -offset
-1, UITER_CURRENT
);
1641 return (UChar
)source
->iterator
->current(source
->iterator
);
1644 return (UChar
)U_SENTINEL
;
// isAtStartPrevIterate(data): reports whether backward iteration has reached the start.
//   - Iterator mode (pos is NULL and iterator is not): the negation of hasPrevious.
//   - Pointer mode: TRUE when pos equals string, or when reading the normalization
//     buffer, the unit before pos is zero and fcdPosition is NULL.
1649 * Determines if we are at the start of the data string in the backwards
1650 * collation iterator
1651 * @param data collation iterator
1652 * @return TRUE if we are at the start
1655 inline UBool
isAtStartPrevIterate(collIterate
*data
) {
1656 if(data
->pos
== NULL
&& data
->iterator
!= NULL
) {
1657 return !data
->iterator
->hasPrevious(data
->iterator
);
1659 //return (collIter_bos(data)) ||
1660 return (data
->pos
== data
->string
) ||
1661 ((data
->flags
& UCOL_ITER_INNORMBUF
) &&
1662 *(data
->pos
- 1) == 0 && data
->fcdPosition
== NULL
);
// ucol_IGetPrevCE(coll, data, status): returns the previous collation element
// in backward order. Order of work visible below:
//   1. If buffered CEs remain (toReturn > CEs), take one from toReturn.
//   2. Otherwise fetch the previous code unit ch from the string, the
//      UCharIterator (previous) or the normalization side buffer; when the side
//      buffer is exhausted, pos and flags are restored from fcdPosition and origFlags.
//   3. When UCOL_HIRAGANA_Q is set, set or clear UCOL_WAS_HIRAGANA from ch.
//   4. When normalization is on and the quick tests fail, run collPrevIterFCD
//      and, if it asks for it, collPrevIterNormalize.
//   5. Resolve the CE: contraction-ending characters go through
//      ucol_prv_getSpecialPrevCE with UCOL_CONTRACTION; otherwise
//      latinOneMapping or the coll->mapping trie is consulted, with the UCA
//      mapping as fallback when the result is UCOL_NOT_FOUND.
// UCOL_NO_MORE_CES is returned at the start of input.
// NOTE(review): many lines are missing from this listing (the status parameter
// line, the declaration of ch, decrement of toReturn, loop header, break and
// continue statements, else branches, the final return), so the nesting of the
// conditions below cannot be confirmed from here.
1666 * Inline function that gets a simple CE.
1667 * So what it does is that it will first check the expansion buffer. If the
1668 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1669 * is different from the string pointer, we return the collation element at the
1670 * return pointer and decrement it.
1671 * For more complicated CEs it resorts to getComplicatedCE.
1672 * @param coll collator data
1673 * @param data collation iterator struct
1674 * @param status error status
1677 inline uint32_t ucol_IGetPrevCE(const UCollator
*coll
, collIterate
*data
,
1680 uint32_t result
= UCOL_NULLORDER
;
1681 if (data
->toReturn
> data
->CEs
) {
1683 result
= *(data
->toReturn
);
1684 if (data
->CEs
== data
->toReturn
) {
1685 data
->CEpos
= data
->toReturn
;
1691 Loop handles case when incremental normalize switches to or from the
1692 side buffer / original string, and we need to start again to get the
1696 if (data
->flags
& UCOL_ITER_HASLEN
) {
1698 Normal path for strings when length is specified.
1699 Not in side buffer because it is always null terminated.
1701 if (data
->pos
<= data
->string
) {
1702 /* End of the main source string */
1703 return UCOL_NO_MORE_CES
;
1708 // we are using an iterator to go back. Pray for us!
1709 else if (data
->flags
& UCOL_USE_ITERATOR
) {
1710 UChar32 iterCh
= data
->iterator
->previous(data
->iterator
);
1711 if(iterCh
== U_SENTINEL
) {
1712 return UCOL_NO_MORE_CES
;
1720 /* we are in the side buffer. */
1723 At the start of the normalize side buffer.
1725 Because pointer points to the last accessed character,
1726 hence we have to increment it by one here.
1728 if (data
->fcdPosition
== NULL
) {
1729 data
->pos
= data
->string
;
1730 return UCOL_NO_MORE_CES
;
1733 data
->pos
= data
->fcdPosition
+ 1;
1735 data
->flags
= data
->origFlags
;
// NOTE(review): ucol_IGetNextCE in this file tests 0x3040..0x3094 plus 0x309d and
// 0x309e for the same flag; confirm whether the two are meant to differ.
1740 if(data
->flags
&UCOL_HIRAGANA_Q
) {
1741 if(ch
>=0x3040 && ch
<=0x309f) {
1742 data
->flags
|= UCOL_WAS_HIRAGANA
;
1744 data
->flags
&= ~UCOL_WAS_HIRAGANA
;
1749 * got a character to determine if there's fcd and/or normalization
1751 * if the current character is not fcd.
1752 * if current character is at the start of the string
1753 * Trailing combining class == 0.
1754 * Note if pos is in the writablebuffer, norm is always 0
1756 if (ch
< ZERO_CC_LIMIT_
||
1757 // this should propel us out of the loop in the iterator case
1758 (data
->flags
& UCOL_ITER_NORM
) == 0 ||
1759 (data
->fcdPosition
!= NULL
&& data
->fcdPosition
<= data
->pos
)
1760 || data
->string
== data
->pos
) {
1764 if (ch
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1765 /* if next character is FCD */
1766 if (data
->pos
== data
->string
) {
1767 /* First char of string is always OK for FCD check */
1771 /* Not first char of string, do the FCD fast test */
1772 if (*(data
->pos
- 1) < NFC_ZERO_CC_BLOCK_LIMIT_
) {
1777 /* Need a more complete FCD check and possible normalization. */
1778 if (collPrevIterFCD(data
)) {
1779 collPrevIterNormalize(data
);
1782 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1783 /* No normalization. Go ahead and process the char. */
1788 Some normalization happened.
1789 Next loop picks up a char from the normalization buffer.
1793 /* attempt to handle contractions, after removal of the backwards
1796 if (ucol_contractionEndCP(ch
, coll
) && !isAtStartPrevIterate(data
)) {
1797 result
= ucol_prv_getSpecialPrevCE(coll
, ch
, UCOL_CONTRACTION
, data
, status
);
1801 result
= coll
->latinOneMapping
[ch
];
1802 if (result
> UCOL_NOT_FOUND
) {
1803 result
= ucol_prv_getSpecialPrevCE(coll
, ch
, result
, data
, status
);
1807 // TODO: fix me for THAI - I reference *(data->pos-1)
1808 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0 &&
1809 /*UCOL_ISTHAIBASECONSONANT(ch) &&*/ // This is from the old specs - we now rearrange unconditionally
1810 data
->pos
> data
->string
&&
1811 UCOL_ISTHAIPREVOWEL(peekCharacter(data
, -1)))
1812 //UCOL_ISTHAIPREVOWEL(*(data->pos -1)))
1817 /*result = ucmpe32_get(coll->mapping, ch);*/
1818 result
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, ch
);
1820 if (result
> UCOL_NOT_FOUND
) {
1821 result
= ucol_prv_getSpecialPrevCE(coll
, ch
, result
, data
, status
);
1823 if (result
== UCOL_NOT_FOUND
) {
// Note: this test uses data->coll, while the test further up uses the coll argument.
1824 if (!isAtStartPrevIterate(data
) &&
1825 ucol_contractionEndCP(ch
, data
->coll
)) {
1826 result
= UCOL_CONTRACTION
;
1829 /*result = ucmpe32_get(UCA->mapping, ch);*/
1830 result
= UTRIE_GET32_FROM_LEAD(UCA
->mapping
, ch
);
1833 if (result
> UCOL_NOT_FOUND
) {
1834 result
= ucol_prv_getSpecialPrevCE(UCA
, ch
, result
, data
, status
);
// Exported wrapper: forwards its three arguments unchanged to the inline
// ucol_IGetPrevCE and returns its result.
1844 /* ucol_getPrevCE, out-of-line version for use from other files. */
1845 U_CAPI
uint32_t U_EXPORT2
1846 ucol_getPrevCE(const UCollator
*coll
, collIterate
*data
,
1847 UErrorCode
*status
) {
1848 return ucol_IGetPrevCE(coll
, data
, status
);
// ucol_getFirstCE(coll, u, status): sets up a collIterate over the single code
// unit u (length 1) with IInit_collIterate and fetches one CE from it with
// ucol_IGetNextCE.
// NOTE(review): the declarations of colIt and order and the return statement are
// missing from this listing; presumably order is returned -- confirm.
1852 /* this should be connected to special Jamo handling */
1853 U_CAPI
uint32_t U_EXPORT2
1854 ucol_getFirstCE(const UCollator
*coll
, UChar u
, UErrorCode
*status
) {
1857 IInit_collIterate(coll
, &u
, 1, &colIt
);
1858 order
= ucol_IGetNextCE(coll
, &colIt
, status
);
1859 /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
// insertBufferEnd(data, pNull, ch): single-character overload. Appends ch at the
// zero terminator pNull of the writable buffer.
//   - If the buffer still has room past pNull + 1, the append happens in place
//     (the in-place statements are missing from this listing).
//   - Otherwise a new buffer of size units is allocated with uprv_malloc, the old
//     contents are copied, the old heap buffer is released with
//     freeHeapWritableBuffer, and the iterator is pointed at the new buffer.
// NOTE(review): lines are missing from this listing, including the in-place
// branch, any adjustment of size by incsize, the store of ch and the return.
1864 * Inserts the argument character into the end of the buffer pushing back the
1866 * @param data collIterate struct data
1867 * @param pNull pointer to the null termination
1868 * @param ch character to be appended
1869 * @return the position of the new addition
1872 inline UChar
* insertBufferEnd(collIterate
*data
, UChar
*pNull
, UChar ch
)
1874 uint32_t size
= data
->writableBufSize
;
1876 const uint32_t incsize
= 5;
1878 if ((data
->writableBuffer
+ size
) > (pNull
+ 1)) {
1885 buffer will always be null terminated at the end.
1886 giving extra space since it is likely that more characters will be added.
1889 newbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) * size
);
1890 if(newbuffer
!= NULL
) { // something wrong, but no status
// Only the old size worth of units is copied; writableBufSize is updated afterwards.
1891 uprv_memcpy(newbuffer
, data
->writableBuffer
,
1892 data
->writableBufSize
* sizeof(UChar
));
1894 freeHeapWritableBuffer(data
);
1895 data
->writableBufSize
= size
;
1896 data
->writableBuffer
= newbuffer
;
// NOTE(review): writableBufSize was set to size just above, and size units were
// requested from uprv_malloc. newbuffer + writableBufSize therefore points one
// past the new allocation, and the store to newbuffer + 1 below would land
// outside it, unless lines missing from this listing change that. Verify
// against the full file; pNull looks like the intended insertion point.
1898 newbuffer
= newbuffer
+ data
->writableBufSize
;
1900 *(newbuffer
+ 1) = 0;
// insertBufferEnd(data, pNull, str, length): string overload. Appends length
// units from str at the zero terminator pNull of the writable buffer.
//   - If the buffer has room for pNull + length + 1, the units are copied in
//     place and a zero is stored after them.
//   - Otherwise a new buffer of size + length + 1 units is allocated (size is the
//     distance from the buffer start to pNull), the old prefix and the new units
//     are copied in, the old heap buffer is released and the iterator is pointed
//     at the new buffer.
// NOTE(review): the length parameter line, the declaration of newbuffer, the
// termination of the new buffer and the return statements are missing from this
// listing.
1906 * Inserts the argument string into the end of the buffer pushing back the
1908 * @param data collIterate struct data
1909 * @param pNull pointer to the null termination
1910 * @param string to be appended
1911 * @param length of the string to be appended
1912 * @return the position of the new addition
1915 inline UChar
* insertBufferEnd(collIterate
*data
, UChar
*pNull
, UChar
*str
,
// Number of units already in use before the terminator.
1918 uint32_t size
= pNull
- data
->writableBuffer
;
1921 if (data
->writableBuffer
+ data
->writableBufSize
> pNull
+ length
+ 1) {
1922 uprv_memcpy(pNull
, str
, length
* sizeof(UChar
));
1923 *(pNull
+ length
) = 0;
1928 buffer will always be null terminated at the end.
1929 giving extra space since it is likely that more characters will be added.
1931 newbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) * (size
+ length
+ 1));
1932 if(newbuffer
!= NULL
) {
1933 uprv_memcpy(newbuffer
, data
->writableBuffer
, size
* sizeof(UChar
));
1934 uprv_memcpy(newbuffer
+ size
, str
, length
* sizeof(UChar
));
1936 freeHeapWritableBuffer(data
);
1937 data
->writableBufSize
= size
+ length
+ 1;
1938 data
->writableBuffer
= newbuffer
;
// normalizeNextContraction(data): normalizes (UNORM_NFD) the range
// [pos - 1, fcdPosition) and appends it to the text already in the writable
// buffer, then points pos at the appended part.
//   - When not yet reading the side buffer, the unit before pStart is copied to
//     the start of the writable buffer first.
//   - strsize is the current zero-terminated length of the writable buffer.
//   - The first unorm_normalize call passes capacity 0, so it only measures.
//   - If the buffer cannot hold strsize + normLen plus a terminator, a new one is
//     allocated and the first strsize units are copied over.
// NOTE(review): declarations of pEnd, pStartNorm, normLen and strsize, the else
// branches and the re-read of buffer after reallocation are missing from this
// listing.
1945 * Special normalization function for contraction in the forwards iterator.
1946 * This normalization sequence will place the current character at source->pos
1947 * and its following normalized sequence into the buffer.
1948 * The fcd position, pos will be changed.
1949 * pos will now point to positions in the buffer.
1950 * Flags will be changed accordingly.
1951 * @param data collation iterator data
1954 inline void normalizeNextContraction(collIterate
*data
)
1956 UChar
*buffer
= data
->writableBuffer
;
1957 uint32_t buffersize
= data
->writableBufSize
;
1959 UErrorCode status
= U_ZERO_ERROR
;
1960 /* because the pointer points to the next character */
1961 UChar
*pStart
= data
->pos
- 1;
1966 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1967 *data
->writableBuffer
= *(pStart
- 1);
1971 strsize
= u_strlen(data
->writableBuffer
);
1974 pEnd
= data
->fcdPosition
;
// Measuring pass: destination capacity is 0.
1976 normLen
= unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, buffer
, 0,
1979 if (buffersize
<= normLen
+ strsize
) {
1980 uint32_t size
= strsize
+ normLen
+ 1;
// NOTE(review): no NULL check on temp is visible before the copy below; a check
// may sit on a line missing from this listing -- verify.
1981 UChar
*temp
= (UChar
*)uprv_malloc(size
* sizeof(UChar
));
1983 uprv_memcpy(temp
, buffer
, sizeof(UChar
) * strsize
);
1984 freeHeapWritableBuffer(data
);
1985 data
->writableBuffer
= temp
;
1986 data
->writableBufSize
= size
;
1987 data
->flags
|= UCOL_ITER_ALLOCATED
;
1991 status
= U_ZERO_ERROR
;
1992 pStartNorm
= buffer
+ strsize
;
1993 /* null-termination will be added here */
1994 unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, pStartNorm
,
1995 normLen
+ 1, &status
);
// Continue reading at the newly appended normalized text.
1997 data
->pos
= data
->writableBuffer
+ strsize
;
1998 data
->origFlags
= data
->flags
;
1999 data
->flags
|= UCOL_ITER_INNORMBUF
;
2000 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
// getNextNormalizedChar(data): returns the next code unit for contraction
// matching in the forward iterator, normalizing on demand. Cases visible below:
//   - Not normalizing and not reading the side buffer: read straight from the
//     iterator (next) or from *pos++.
//   - Next unit already available (in the side buffer and not at its terminating
//     zero, or in the source before fcdPosition): return *pos++.
//   - At the last unit of a length-delimited or zero-terminated source: return it.
//   - At the end of the side buffer: either append the final source unit with
//     insertBufferEnd, or switch pos back to fcdPosition and remember the end of
//     the side buffer, or (iterator case) restore origFlags and read from the
//     iterator.
//   - Otherwise run collIterFCD when either the current or next unit is at or
//     above NFC_ZERO_CC_BLOCK_LIMIT_, then normalize with
//     normalizeNextContraction or append to the side buffer with insertBufferEnd.
// NOTE(review): many lines are missing from this listing (declarations of ch and
// nextch, else branches, the final return), so the nesting of the conditions
// below cannot be confirmed from here.
2004 * Contraction character management function that returns the next character
2005 * for the forwards iterator.
2006 * Does nothing if the next character is in buffer and not the first character
2008 * Else it checks next character in data string to see if it is normalizable.
2009 * If it is not, the character is simply copied into the buffer, else
2010 * the whole normalized substring is copied into the buffer, including the
2011 * current character.
2012 * @param data collation element iterator data
2013 * @return next character
2016 inline UChar
getNextNormalizedChar(collIterate
*data
)
2020 // Here we need to add the iterator code. One problem is the way
2021 // end of string is handled. If we just return next char, it could
2022 // be the sentinel. Most of the cases already check for this, but we
2024 if ((data
->flags
& (UCOL_ITER_NORM
| UCOL_ITER_INNORMBUF
)) == 0 ) {
2025 /* if no normalization and not in buffer. */
2026 if(data
->flags
& UCOL_USE_ITERATOR
) {
2027 return (UChar
)data
->iterator
->next(data
->iterator
);
2029 return *(data
->pos
++);
2033 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2034 //normalizeIterator(data);
2037 UChar
*pEndWritableBuffer
= NULL
;
2038 UBool innormbuf
= (UBool
)(data
->flags
& UCOL_ITER_INNORMBUF
);
2039 if ((innormbuf
&& *data
->pos
!= 0) ||
2040 (data
->fcdPosition
!= NULL
&& !innormbuf
&&
2041 data
->pos
< data
->fcdPosition
)) {
2043 if next character is in normalized buffer, no further normalization
2046 return *(data
->pos
++);
2049 if (data
->flags
& UCOL_ITER_HASLEN
) {
2050 /* in data string */
2051 if (data
->pos
+ 1 == data
->endp
) {
2052 return *(data
->pos
++);
2057 // inside the normalization buffer, but at the end
2058 // (since we encountered zero). This means, in the
2059 // case we're using char iterator, that we need to
2060 // do another round of normalization.
2061 //if(data->origFlags & UCOL_USE_ITERATOR) {
2062 // we need to restore original flags,
2063 // otherwise, we'll lose them
2064 //data->flags = data->origFlags;
2065 //normalizeIterator(data);
2066 //return *(data->pos++);
2069 in writable buffer, at this point fcdPosition can not be
2070 pointing to the end of the data string. see contracting tag.
2072 if(data
->fcdPosition
) {
2073 if (*(data
->fcdPosition
+ 1) == 0 ||
2074 data
->fcdPosition
+ 1 == data
->endp
) {
2075 /* at the end of the string, dump it into the normalizer */
2076 data
->pos
= insertBufferEnd(data
, data
->pos
,
2077 *(data
->fcdPosition
)) + 1;
2078 return *(data
->fcdPosition
++);
// Remember where the side buffer ended, then resume reading from the source at fcdPosition.
2080 pEndWritableBuffer
= data
->pos
;
2081 data
->pos
= data
->fcdPosition
;
2082 } else if(data
->origFlags
& UCOL_USE_ITERATOR
) {
2083 // if we are here, we're using a normalizing iterator.
2084 // we should just continue further.
2085 data
->flags
= data
->origFlags
;
2087 return (UChar
)data
->iterator
->next(data
->iterator
);
2092 if (*(data
->pos
+ 1) == 0) {
2093 return *(data
->pos
++);
2099 nextch
= *data
->pos
;
2102 * if the current character is not fcd.
2103 * Trailing combining class == 0.
2105 if ((data
->fcdPosition
== NULL
|| data
->fcdPosition
< data
->pos
) &&
2106 (nextch
>= NFC_ZERO_CC_BLOCK_LIMIT_
||
2107 ch
>= NFC_ZERO_CC_BLOCK_LIMIT_
)) {
2109 Need a more complete FCD check and possible normalization.
2110 normalize substring will be appended to buffer
2112 if (collIterFCD(data
)) {
2113 normalizeNextContraction(data
);
2114 return *(data
->pos
++);
2116 else if (innormbuf
) {
2117 /* fcdposition shifted even when there's no normalization, if we
2118 don't input the rest into this, we'll get the wrong position when
2119 we reach the end of the writableBuffer */
2120 int32_t length
= data
->fcdPosition
- data
->pos
+ 1;
2121 data
->pos
= insertBufferEnd(data
, pEndWritableBuffer
,
2122 data
->pos
- 1, length
);
2123 return *(data
->pos
++);
2129 no normalization is to be done hence only one character will be
2130 appended to the buffer.
2132 data
->pos
= insertBufferEnd(data
, pEndWritableBuffer
, ch
) + 1;
2135 /* points back to the pos in string */
// goBackOne(data): steps the collation iterator back by one position.
// Two calls to iterator->previous are visible: one guarded only by a non-NULL
// iterator, and one that also requires the UCOL_USE_ITERATOR flag.
// NOTE(review): the lines between and around the two guards are missing from
// this listing (probably preprocessor alternatives and a pos decrement), so it
// cannot be told here whether both calls are ever compiled together -- verify.
2140 inline void goBackOne(collIterate
*data
) {
2142 // somehow, it looks like we need to keep iterator synced up
2143 // at all times, as above.
2147 if(data
->iterator
) {
2148 data
->iterator
->previous(data
->iterator
);
2151 if(data
->iterator
&& (data
->flags
& UCOL_USE_ITERATOR
)) {
2152 data
->iterator
->previous(data
->iterator
);
2161 * Function to copy the buffer into writableBuffer and set the fcd position to
2162 * the correct position
2163 * @param source data string source
2164 * @param buffer character buffer
2165 * @param tempdb current position in buffer that has been used up
// setDiscontiguosAttribute: make the text collected in buffer the current
// input of the iterator by copying it into source->writableBuffer and
// pointing source->pos at the start of that buffer.
// @param source iterator state that is redirected to the writable buffer
// @param buffer null-terminated text to install (measured with u_strlen)
// @param tempdb write position inside buffer; used as the u_strcpy target
//               when the iterator is already in the normalization buffer
// NOTE(review): the tempdb parameter declaration and the opening brace
// (listing lines 2169-2170) are missing from this view, as are lines
// 2183-2184, 2189-2190, 2196-2197, 2199-2200 and everything after 2202.
// Confirm branch structure and the allocation-failure handling against the
// complete file.
2168 inline void setDiscontiguosAttribute(collIterate
*source
, UChar
*buffer
,
2171 /* okay confusing part here. to ensure that the skipped characters are
2172 considered later, we need to place it in the appropriate position in the
2173 normalization buffer and reassign the pos pointer. simple case if pos
2174 reside in string, simply copy to normalization buffer and
2175 fcdposition = pos, pos = start of normalization buffer. if pos in
2176 normalization buffer, we'll insert the copy infront of pos and point pos
2177 to the start of the normalization buffer. why am i doing these copies?
2178 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2179 not require any changes, which be really painful. */
// Length of the text to install, not counting the terminator.
2180 uint32_t length
= u_strlen(buffer
);;
// Already reading from the normalization buffer: copy the unread remainder
// at source->pos to tempdb.
2181 if (source
->flags
& UCOL_ITER_INNORMBUF
) {
2182 u_strcpy(tempdb
, source
->pos
);
// NOTE(review): presumably the else branch (lines 2183-2184 are missing).
// Remember the current position in fcdPosition, save the flags in origFlags,
// set UCOL_ITER_INNORMBUF and clear the NORM, HASLEN and USE_ITERATOR flags.
2185 source
->fcdPosition
= source
->pos
;
2186 source
->origFlags
= source
->flags
;
2187 source
->flags
|= UCOL_ITER_INNORMBUF
;
2188 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
// Replace the writable buffer when length reaches the recorded capacity.
2191 if (length
>= source
->writableBufSize
) {
2192 freeHeapWritableBuffer(source
);
2193 source
->writableBuffer
=
2194 (UChar
*)uprv_malloc((length
+ 1) * sizeof(UChar
));
// Allocation failure check; the handling itself is on lines missing from
// this view.
2195 if(source
->writableBuffer
== NULL
) {
// NOTE(review): records length as the capacity although length + 1 elements
// were requested above - confirm this is intended.
2198 source
->writableBufSize
= length
;
// Install the text and restart reading at the beginning of the buffer.
2201 u_strcpy(source
->writableBuffer
, buffer
);
2202 source
->pos
= source
->writableBuffer
;
2206 * Function to get the discontiguous collation element within the source.
2207 * Note this function will set the position to the appropriate places.
2208 * @param coll current collator used
2209 * @param source data string source
2210 * @param constart index to the start character in the contraction table
2211 * @return discontiguous collation element offset
// getDiscontiguous: tries to continue a contraction match across following
// characters, collecting characters that do not match into a local buffer
// so that they can be replayed later through setDiscontiguosAttribute.
// @param coll     collator supplying contractionIndex, contractionCEs and image
// @param source   iterator state; saved on entry with backupState and either
//                 redirected through setDiscontiguosAttribute or restored
//                 with loadState before returning
// @param constart start of the contraction table entry being matched
// @return the CE stored in coll->contractionCEs for the table position
//         reached (tempconstart on the early return, constart at the end)
// NOTE(review): many listing lines are missing from this view (for example
// 2232-2238, 2254, 2259, 2261-2263, 2268-2270, 2274-2278, 2282-2285,
// 2288-2290, 2292, 2301-2307), including the enclosing loop header, the
// declarations of UCharOffset, schar, tchar and result, and probably further
// return paths. Confirm control flow against the complete file.
2214 uint32_t getDiscontiguous(const UCollator
*coll
, collIterate
*source
,
2215 const UChar
*constart
)
2217 /* source->pos currently points to the second combining character after
2218 the start character */
2219 UChar
*temppos
= source
->pos
;
2220 UChar buffer
[4*UCOL_MAX_BUFFER
];
2221 UChar
*tempdb
= buffer
;
2222 const UChar
*tempconstart
= constart
;
2223 uint8_t tempflags
= source
->flags
;
2224 UBool multicontraction
= FALSE
;
2225 UChar
*tempbufferpos
= 0;
2226 collIterateState discState
;
// Save the iterator state so the scan can be undone at the end.
2228 backupState(source
, &discState
);
2230 //*tempdb = *(source->pos - 1);
// Seed the local buffer with the character just before the current position.
2231 *tempdb
= peekCharacter(source
, -1);
// Stop condition, true when any of the following holds:
//  - the string is length-delimited and pos has reached endp;
//  - the current character is 0 and either the iterator is not in the
//    normalization buffer, or fcdPosition is NULL, equals endp, points at 0,
//    or points at a character whose combining class is 0;
//  - the current character has combining class 0.
2239 if (((source
->flags
& UCOL_ITER_HASLEN
) && source
->pos
>= source
->endp
)
2240 || (peekCharacter(source
, 0) == 0 &&
2241 //|| (*source->pos == 0 &&
2242 ((source
->flags
& UCOL_ITER_INNORMBUF
) == 0 ||
2243 source
->fcdPosition
== NULL
||
2244 source
->fcdPosition
== source
->endp
||
2245 *(source
->fcdPosition
) == 0 ||
2246 u_getCombiningClass(*(source
->fcdPosition
)) == 0)) ||
2247 /* end of string in null terminated string or stopped by a
2248 null character, note fcd does not always point to a base
2249 character after the discontiguos change */
2250 u_getCombiningClass(peekCharacter(source
, 0)) == 0) {
2251 //u_getCombiningClass(*(source->pos)) == 0) {
2252 //constart = (UChar *)coll->image + getContractOffset(CE);
// multicontraction is set further down when a nested contraction is found:
// move pos back to the remembered position, install the collected text and
// return the CE stored for tempconstart.
2253 if (multicontraction
) {
2255 source
->pos
= temppos
- 1;
2256 setDiscontiguosAttribute(source
, buffer
, tempdb
);
2257 return *(coll
->contractionCEs
+
2258 (tempconstart
- coll
->contractionIndex
));
2260 constart
= tempconstart
;
// Table characters start one element past tempconstart. Fetch the next
// source character and compare it with the table entries; the body of the
// while loop is missing from this view.
2264 UCharOffset
= (UChar
*)(tempconstart
+ 1); /* skip the backward offset*/
2265 schar
= getNextNormalizedChar(source
);
2267 while (schar
> (tchar
= *UCharOffset
)) {
2271 if (schar
!= tchar
) {
2272 /* not the correct codepoint. we stuff the current codepoint into
2273 the discontiguos buffer and try the next character */
2279 if (u_getCombiningClass(schar
) ==
2280 u_getCombiningClass(peekCharacter(source
, -2))) {
2281 //u_getCombiningClass(*(source->pos - 2))) {
// Look up the CE stored for the table position reached.
2286 result
= *(coll
->contractionCEs
+
2287 (UCharOffset
- coll
->contractionIndex
));
2291 if (result
== UCOL_NOT_FOUND
) {
2293 } else if (isContraction(result
)) {
2294 /* this is a multi-contraction*/
2295 tempconstart
= (UChar
*)coll
->image
+ getContractOffset(result
);
2296 if (*(coll
->contractionCEs
+ (constart
- coll
->contractionIndex
))
2297 != UCOL_NOT_FOUND
) {
2298 multicontraction
= TRUE
;
2299 temppos
= source
->pos
+ 1;
2300 tempbufferpos
= buffer
+ u_strlen(buffer
);
2303 setDiscontiguosAttribute(source
, buffer
, tempdb
);
2308 /* no problems simply reverting just like that,
2309 if we are in string before getting into this function, points back to
2310 string hence no problem.
2311 if we are in normalization buffer before getting into this function,
2312 since we'll never use another normalization within this function, we
2313 know that fcdposition points to a base character. the normalization buffer
2314 never change, hence this revert works. */
// Restore the state saved on entry, put back the entry flags and return the
// CE stored for constart.
2315 loadState(source
, &discState
, TRUE
);
2318 //source->pos = temppos - 1;
2319 source
->flags
= tempflags
;
2320 return *(coll
->contractionCEs
+ (constart
- coll
->contractionIndex
));
// isNonChar: range test on a code point.
// The condition below is true when
//  - (cp & 0xFFFE) == 0xFFFE, i.e. bits 1-15 of cp are all set; the mask
//    ignores bit 0 and every bit above 15, so any value whose low 16 bits
//    are FFFE or FFFF matches, or
//  - cp lies in FDD0..FDEF, or
//  - cp lies in D800..DFFF.
// @param cp code point to test
// @return UBool. NOTE(review): the return statements (listing lines 2326 and
//         later) are missing from this view; presumably TRUE when the
//         condition holds - confirm against the complete file.
2324 inline UBool
isNonChar(UChar32 cp
) {
2325 if ((cp
& 0xFFFE) == 0xFFFE || (0xFDD0 <= cp
&& cp
<= 0xFDEF) || (0xD800 <= cp
&& cp
<= 0xDFFF)) {
2331 /* now uses Mark's getImplicitPrimary code */
// getImplicit: builds the collation elements for cp from the value returned
// by getImplicitPrimary.
// @param cp              code point to convert
// @param collationSource iterator state; one additional CE is appended
//                        through collationSource->CEpos
// @return r masked with UCOL_PRIMARYMASK, combined with 0x00000505
// NOTE(review): listing lines 2334-2336 are missing from this view, so any
// early exit taken before getImplicitPrimary is called cannot be seen here -
// confirm against the complete file.
2333 inline uint32_t getImplicit(UChar32 cp
, collIterate
*collationSource
) {
// r is the implicit primary value computed for cp.
2337 uint32_t r
= getImplicitPrimary(cp
);
// Queue a second CE: the low 16 bits of r moved into the upper half, with
// 0xC0 in the low byte (presumably the continuation marker - confirm).
2338 *(collationSource
->CEpos
++) = ((r
& 0x0000FFFF)<<16) | 0x000000C0;
// First CE: the primary part of r with 0x05 in each of the two low bytes.
2339 return (r
& UCOL_PRIMARYMASK
) | 0x00000505; // This was 'order'
2343 * Inserts the argument character into the front of the buffer replacing the
2344 * front null terminator.
2345 * @param data collation element iterator data
2346 * @param pNull pointer to the null terminator
2347 * @param ch character to be appended
2348 * @return position of added character
// insertBufferFront: places ch in front of the text held in the writable
// buffer, replacing the leading null terminator at pNull.
// @param data  collation iterator state owning writableBuffer
// @param pNull position of the leading null terminator in the buffer
// @param ch    character to insert
// @return UChar pointer; the return statements are on lines missing from
//         this view
// Visible flow: when pNull lies more than one element past the start of the
// buffer there is room in place (the statements of that branch are missing
// from this view). Otherwise a new buffer is allocated, the old contents are
// copied to offset incsize inside it, the old buffer is released and the new
// buffer and size are stored in data.
// NOTE(review): listing lines 2354-2355, 2359-2364, 2367-2368, 2371-2372,
// 2376-2378, 2380 and everything after 2382 are missing from this view. As
// shown, size still equals the old writableBufSize when uprv_malloc is
// called, while the copy writes writableBufSize elements starting incsize
// elements into the new buffer; an increase of size presumably sits on the
// missing lines 2367-2368 - confirm, since otherwise the copy would overrun
// the allocation.
2351 inline UChar
* insertBufferFront(collIterate
*data
, UChar
*pNull
, UChar ch
)
2353 uint32_t size
= data
->writableBufSize
;
// Number of extra elements reserved in front when the buffer is replaced.
2356 const uint32_t incsize
= 5;
// Room check: is there at least one free element in front of pNull?
2358 if (pNull
> data
->writableBuffer
+ 1) {
2365 buffer will always be null terminated infront.
2366 giving extra space since it is likely that more characters will be added.
2369 newbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) * size
);
// Allocation failure check; the handling itself is on lines missing from
// this view.
2370 if(newbuffer
== NULL
) {
// Old contents are copied behind the reserved front area.
2373 end
= newbuffer
+ incsize
;
2374 uprv_memcpy(end
, data
->writableBuffer
,
2375 data
->writableBufSize
* sizeof(UChar
));
// Release the old buffer and publish the new one.
2379 freeHeapWritableBuffer(data
);
2381 data
->writableBufSize
= size
;
2382 data
->writableBuffer
= newbuffer
;
2387 * Special normalization function for contraction in the previous iterator.
2388 * This normalization sequence will place the current character at source->pos
2389 * and its following normalized sequence into the buffer.
2390 * The fcd position, pos will be changed.
2391 * pos will now point to positions in the buffer.
2392 * Flags will be changed accordingly.
2393 * @param data collation iterator data
// normalizePrevContraction: normalizes the text between a start position
// and data->pos (inclusive) to NFD and stores the result at the END of the
// writable buffer, with a null terminator placed in front of it, for use by
// the backwards iterator.
// @param data collation iterator state; writableBuffer, writableBufSize,
//             pos, origFlags and flags are updated
// Visible flow:
//  1. pEnd is data->pos + 1.
//  2. With UCOL_ITER_HASLEN set, the character at data->pos + 1 is written
//     into the last buffer element and nulltermsize becomes buffersize - 1.
//     Otherwise nulltermsize starts at buffersize and the buffer is scanned
//     backwards for a 0 (loop body missing from this view).
//  3. pStart is data->string when fcdPosition is NULL, else fcdPosition + 1.
//  4. unorm_normalize is first called with capacity 0 to obtain normLen.
//  5. When nulltermsize <= normLen a larger buffer is allocated and stored
//     in data.
//  6. A 0 is written just before pStartNorm and unorm_normalize is called
//     again to write normLen units at pStartNorm.
//  7. data->pos is set to writableBuffer + nulltermsize, the flags are saved
//     in origFlags, UCOL_ITER_INNORMBUF is set and the NORM and HASLEN flags
//     are cleared.
// NOTE(review): several listing lines are missing from this view (among them
// 2397, 2403-2406, 2414-2415, 2419-2422, 2426-2427, 2432-2433, 2437,
// 2444-2446, 2455-2456), including the declarations of pStart, pStartNorm
// and normLen and the trailing arguments of both unorm_normalize calls.
// NOTE(review): in the growth branch nulltermsize is overwritten with
// normLen + 1 before it is used to size the uprv_memcpy, and the local
// buffer is not visibly reassigned to the new allocation before pStartNorm
// is computed from it. Confirm against the complete file whether the missing
// lines cover this.
2396 inline void normalizePrevContraction(collIterate
*data
)
2398 UChar
*buffer
= data
->writableBuffer
;
2399 uint32_t buffersize
= data
->writableBufSize
;
2400 uint32_t nulltermsize
;
2401 UErrorCode status
= U_ZERO_ERROR
;
2402 UChar
*pEnd
= data
->pos
+ 1; /* End normalize + 1 */
// Length-delimited input versus input already held in the writable buffer.
2407 if (data
->flags
& UCOL_ITER_HASLEN
) {
2409 normalization buffer not used yet, we'll pull down the next
2410 character into the end of the buffer
2412 *(buffer
+ (buffersize
- 1)) = *(data
->pos
+ 1);
2413 nulltermsize
= buffersize
- 1;
2416 nulltermsize
= buffersize
;
2417 UChar
*temp
= buffer
+ (nulltermsize
- 1);
2418 while (*(temp
--) != 0) {
2423 /* Start normalize */
2424 if (data
->fcdPosition
== NULL
) {
2425 pStart
= data
->string
;
2428 pStart
= data
->fcdPosition
+ 1;
2431 normLen
= unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, buffer
, 0,
2434 if (nulltermsize
<= normLen
) {
2435 uint32_t size
= buffersize
- nulltermsize
+ normLen
+ 1;
2436 UChar
*temp
= (UChar
*)uprv_malloc(size
* sizeof(UChar
));
2438 nulltermsize
= normLen
+ 1;
2439 uprv_memcpy(temp
+ normLen
, buffer
,
2440 sizeof(UChar
) * (buffersize
- nulltermsize
));
2441 freeHeapWritableBuffer(data
);
2442 data
->writableBuffer
= temp
;
2443 data
->writableBufSize
= size
;
2447 status
= U_ZERO_ERROR
;
2449 this puts the null termination infront of the normalized string instead
2452 pStartNorm
= buffer
+ (nulltermsize
- normLen
);
2453 *(pStartNorm
- 1) = 0;
2454 unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, pStartNorm
, normLen
,
2457 data
->pos
= data
->writableBuffer
+ nulltermsize
;
2458 data
->origFlags
= data
->flags
;
2459 data
->flags
|= UCOL_ITER_INNORMBUF
;
2460 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
2464 * Contraction character management function that returns the previous character
2465 * for the backwards iterator.
2466 * Does nothing if the previous character is in buffer and not the first
2468 * Else it checks previous character in data string to see if it is
2470 * If it is not, the character is simply copied into the buffer, else
2471 * the whole normalized substring is copied into the buffer, including the
2472 * current character.
2473 * @param data collation element iterator data
2474 * @return previous character
// getPrevNormalizedChar: returns the character before the current position
// for the backwards iterator, moving text into the writable buffer (through
// insertBufferFront or normalizePrevContraction) when needed.
// @param data collation iterator state
// @return the previous character
// Visible flow:
//  1. Fast path: when neither UCOL_ITER_NORM nor UCOL_ITER_INNORMBUF is set,
//     or when the iterator is in the normalization buffer and the element
//     before pos is not 0, the previous character is returned directly,
//     either by moving the UCharIterator back one step and calling next(),
//     or by reading *(pos - 1).
//  2. With UCOL_ITER_HASLEN set the previous character is read from the data
//     string; otherwise fcdPosition is used as the position in the string.
//     When fcdPosition equals data->string its character is inserted in
//     front of the buffer, fcdPosition is set to NULL and the character
//     before pos is returned.
//  3. When fcdPosition is past start and ch or prevch is at or above
//     NFC_ZERO_CC_BLOCK_LIMIT_, collPrevIterFCD decides whether
//     normalizePrevContraction is run; if it is, *(pos - 1) is returned.
//  4. Otherwise ch is inserted in front of the buffer at pNull and
//     fcdPosition is decremented.
// NOTE(review): many listing lines are missing from this view (for example
// 2478-2481, 2486, 2489-2490, 2494, 2496-2499, 2504-2506, 2508-2510, 2513,
// 2519, 2522, 2524-2525, 2528, 2531-2532, 2535, 2537, 2541, 2544-2547, 2550
// and everything after 2552), including the declarations and assignments of
// start, ch and prevch and the final return. Confirm against the complete
// file.
// NOTE(review): the value returned by insertBufferFront is not used at
// either call site visible here - confirm that pos stays valid if that
// function replaces the buffer.
2477 inline UChar
getPrevNormalizedChar(collIterate
*data
)
2482 UBool innormbuf
= (UBool
)(data
->flags
& UCOL_ITER_INNORMBUF
);
2483 UChar
*pNull
= NULL
;
// Fast path test, see step 1 in the header comment.
2484 if ((data
->flags
& (UCOL_ITER_NORM
| UCOL_ITER_INNORMBUF
)) == 0 ||
2485 (innormbuf
&& *(data
->pos
- 1) != 0)) {
2487 if no normalization.
2488 if previous character is in normalized buffer, no further normalization
// Iterator input: step back one position, then read forward again so the
// iterator ends where it started.
2491 if(data
->flags
& UCOL_USE_ITERATOR
) {
2492 data
->iterator
->move(data
->iterator
, -1, UITER_CURRENT
);
2493 return (UChar
)data
->iterator
->next(data
->iterator
);
2495 return *(data
->pos
- 1);
2500 if (data
->flags
& UCOL_ITER_HASLEN
) {
2501 /* in data string */
2502 if ((start
- 1) == data
->string
) {
2503 return *(start
- 1);
2507 prevch
= *(start
- 1);
2511 in writable buffer, at this point fcdPosition can not be NULL.
2512 see contracting tag.
2514 if (data
->fcdPosition
== data
->string
) {
2515 /* at the start of the string, just dump it into the normalizer */
2516 insertBufferFront(data
, data
->pos
- 1, *(data
->fcdPosition
));
2517 data
->fcdPosition
= NULL
;
2518 return *(data
->pos
- 1);
// pNull marks the leading terminator of the buffer; reading continues in
// the original string at fcdPosition.
2520 pNull
= data
->pos
- 1;
2521 start
= data
->fcdPosition
;
2523 prevch
= *(start
- 1);
2526 * if the current character is not fcd.
2527 * Trailing combining class == 0.
2529 if (data
->fcdPosition
> start
&&
2530 (ch
>= NFC_ZERO_CC_BLOCK_LIMIT_
|| prevch
>= NFC_ZERO_CC_BLOCK_LIMIT_
))
2533 Need a more complete FCD check and possible normalization.
2534 normalize substring will be appended to buffer
// pos is saved so it can be put back when no normalization is performed.
2536 UChar
*backuppos
= data
->pos
;
2538 if (collPrevIterFCD(data
)) {
2539 normalizePrevContraction(data
);
2540 return *(data
->pos
- 1);
2542 data
->pos
= backuppos
;
2543 data
->fcdPosition
++;
2548 no normalization is to be done hence only one character will be
2549 appended to the buffer.
2551 insertBufferFront(data
, pNull
, ch
);
2552 data
->fcdPosition
--;
2558 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2559 /* It is called by getNextCE */
2561 uint32_t ucol_prv_getSpecialCE(const UCollator
*coll
, UChar ch
, uint32_t CE
, collIterate
*source
, UErrorCode
*status
) {
2562 collIterateState entryState
;
2563 backupState(source
, &entryState
);
2567 // This loop will repeat only in the case of contractions, and only when a contraction
2568 // is found and the first CE resulting from that contraction is itself a special
2569 // (an expansion, for example.) All other special CE types are fully handled the
2570 // first time through, and the loop exits.
2572 const uint32_t *CEOffset
= NULL
;
2573 switch(getCETag(CE
)) {
2575 /* This one is not found, and we'll let somebody else bother about it... no more games */
2578 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
2579 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
2580 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
2581 /* we return 0 (completely ignorable - per UCA specification */
2584 collIterateState state
;
2585 backupState(source
, &state
);
2586 if (collIter_eos(source
) || !(UTF16_IS_TRAIL((trail
= getNextNormalizedChar(source
))))) {
2587 // we chould have stepped one char forward and it might have turned that it
2588 // was not a trail surrogate. In that case, we have to backup.
2589 loadState(source
, &state
, TRUE
);
2592 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
2593 CE
= UTRIE_GET32_FROM_OFFSET_TRAIL(coll
->mapping
, CE
&0xFFFFFF, trail
);
2594 if(CE
== UCOL_NOT_FOUND
) { // there are tailored surrogates in this block, but not this one.
2595 // We need to backup
2596 loadState(source
, &state
, TRUE
);
2599 // calculate the supplementary code point value, if surrogate was not tailored
2600 cp
= ((((uint32_t)ch
)<<10UL)+(trail
)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
2605 /* Thai/Lao reordering */
2606 if (((source
->flags
) & UCOL_ITER_INNORMBUF
) /* Already Swapped || */
2607 || (source
->iterator
&& !source
->iterator
->hasNext(source
->iterator
))
2608 || (source
->pos
&& source
->endp
== source
->pos
) /* At end of string. No swap possible || */
2609 /*|| UCOL_ISTHAIBASECONSONANT(*(source->pos)) == 0*/) /* next char not Thai base cons.*/ // This is from the old specs - we now rearrange unconditionally
2611 // Treat Thai as a length one expansion */
2612 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
2617 // Move the prevowel and the following base Consonant into the normalization buffer
2618 // with their order swapped
2620 source
->writableBuffer
[0] = peekCharacter(source
, 0);
2621 source
->writableBuffer
[1] = peekCharacter(source
, -1);
2622 source
->writableBuffer
[2] = 0;
2625 source
->fcdPosition
= source
->pos
+1; // Indicate where to continue in main input string
2626 // after exhausting the writableBuffer
2627 } else if(source
->iterator
) {
2628 source
->iterator
->next(source
->iterator
);
2630 source
->pos
= source
->writableBuffer
;
2631 source
->origFlags
= source
->flags
;
2632 source
->flags
|= UCOL_ITER_INNORMBUF
;
2633 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
2635 CE
= UCOL_IGNORABLE
;
2640 // Special processing is getting a CE that is preceded by a certain prefix
2641 // Currently this is only needed for optimizing Japanese length and iteration marks.
2642 // When we encouter a special processing tag, we go backwards and try to see if
2644 // Contraction tables are used - so the whole process is not unlike contraction.
2645 // prefix data is stored backwards in the table.
2646 const UChar
*UCharOffset
;
2648 collIterateState prefixState
;
2649 backupState(source
, &prefixState
);
2650 loadState(source
, &entryState
, TRUE
);
2651 goBackOne(source
); // We want to look at the point where we entered - actually one
2655 // This loop will run once per source string character, for as long as we
2656 // are matching a potential contraction sequence
2658 // First we position ourselves at the begining of contraction sequence
2659 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
2660 if (collIter_bos(source
)) {
2661 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
2664 schar
= getPrevNormalizedChar(source
);
2667 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2671 if (schar
== tchar
) {
2672 // Found the source string char in the table.
2673 // Pick up the corresponding CE from the table.
2674 CE
= *(coll
->contractionCEs
+
2675 (UCharOffset
- coll
->contractionIndex
));
2679 // if there is a completely ignorable code point in the middle of
2680 // a prefix, we need to act as if it's not there
2681 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
2682 // lone surrogates cannot be set to zero as it would break other processing
2683 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, schar
);
2684 // it's easy for BMP code points
2687 } else if(UTF_IS_TRAIL(schar
) || UTF_IS_LEAD(schar
)) {
2688 // for supplementary code points, we have to check the next one
2689 // situations where we are going to ignore
2690 // 1. beginning of the string: schar is a lone surrogate
2691 // 2. schar is a lone surrogate
2692 // 3. schar is a trail surrogate in a valid surrogate sequence
2693 // that is explicitly set to zero.
2694 if (!collIter_bos(source
)) {
2696 if(UTF_IS_LEAD(lead
= getPrevNormalizedChar(source
))) {
2697 isZeroCE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, lead
);
2698 if(getCETag(isZeroCE
) == SURROGATE_TAG
) {
2699 uint32_t finalCE
= UTRIE_GET32_FROM_OFFSET_TRAIL(coll
->mapping
, isZeroCE
&0xFFFFFF, schar
);
2701 // this is a real, assigned completely ignorable code point
2707 // lone surrogate, completely ignorable
2711 // lone surrogate at the beggining, completely ignorable
2715 // Source string char was not in the table.
2716 // We have not found the prefix.
2717 CE
= *(coll
->contractionCEs
+
2718 (ContractionStart
- coll
->contractionIndex
));
2722 // The source string char was in the contraction table, and the corresponding
2723 // CE is not a prefix CE. We found the prefix, break
2724 // out of loop, this CE will end up being returned. This is the normal
2725 // way out of prefix handling when the source actually contained
2730 if(CE
!= UCOL_NOT_FOUND
) { // we found something and we can merilly continue
2731 loadState(source
, &prefixState
, TRUE
);
2732 if(source
->origFlags
& UCOL_USE_ITERATOR
) {
2733 source
->flags
= source
->origFlags
;
2735 } else { // prefix search was a failure, we have to backup all the way to the start
2736 loadState(source
, &entryState
, TRUE
);
2740 case CONTRACTION_TAG
:
2742 /* This should handle contractions */
2743 collIterateState state
;
2744 backupState(source
, &state
);
2745 uint32_t firstCE
= UCOL_NOT_FOUND
;
2746 const UChar
*UCharOffset
;
2750 /* This loop will run once per source string character, for as long as we */
2751 /* are matching a potential contraction sequence */
2753 /* First we position ourselves at the begining of contraction sequence */
2754 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
2756 if (collIter_eos(source
)) {
2757 // Ran off the end of the source string.
2758 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
2759 // So we'll pick whatever we have at the point...
2760 if (CE
== UCOL_NOT_FOUND
) {
2761 // back up the source over all the chars we scanned going into this contraction.
2763 loadState(source
, &state
, TRUE
);
2764 if(source
->origFlags
& UCOL_USE_ITERATOR
) {
2765 source
->flags
= source
->origFlags
;
2771 uint8_t maxCC
= (uint8_t)(*(UCharOffset
)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2772 uint8_t allSame
= (uint8_t)(*(UCharOffset
++)>>8);
2774 schar
= getNextNormalizedChar(source
);
2775 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2779 if (schar
== tchar
) {
2780 // Found the source string char in the contraction table.
2781 // Pick up the corresponding CE from the table.
2782 CE
= *(coll
->contractionCEs
+
2783 (UCharOffset
- coll
->contractionIndex
));
2787 // if there is a completely ignorable code point in the middle of
2788 // contraction, we need to act as if it's not there
2789 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, schar
);
2790 // it's easy for BMP code points
2793 } else if(UTF_IS_LEAD(schar
)) {
2794 if(!collIter_eos(source
)) {
2795 backupState(source
, &state
);
2796 UChar trail
= getNextNormalizedChar(source
);
2797 if(UTF_IS_TRAIL(trail
)) { // do stuff with trail
2798 if(getCETag(isZeroCE
) == SURROGATE_TAG
) {
2799 uint32_t finalCE
= UTRIE_GET32_FROM_OFFSET_TRAIL(coll
->mapping
, isZeroCE
&0xFFFFFF, trail
);
2805 // broken surrogate sequence, thus completely ignorable
2806 loadState(source
, &state
, TRUE
);
2809 loadState(source
, &state
, TRUE
);
2810 } else { // no more characters, so broken surrogate pair...
2811 // this contraction will ultimately fail, but not because of us
2814 } // else if(UTF_IS_LEAD(schar))
2816 // Source string char was not in contraction table.
2817 // Unless we have a discontiguous contraction, we have finished
2818 // with this contraction.
2820 if (schar
< 0x300 ||
2822 (sCC
= i_getCombiningClass(schar
, coll
)) == 0 ||
2824 (allSame
!= 0 && sCC
== maxCC
) ||
2825 collIter_eos(source
)) {
2826 // Contraction can not be discontiguous.
2827 goBackOne(source
); // back up the source string by one,
2828 // because the character we just looked at was
2829 // not part of the contraction. */
2830 CE
= *(coll
->contractionCEs
+
2831 (ContractionStart
- coll
->contractionIndex
));
2834 // Contraction is possibly discontiguous.
2835 // Scan more of source string looking for a match
2838 /* find the next character if schar is not a base character
2839 and we are not yet at the end of the string */
2840 tempchar
= getNextNormalizedChar(source
);
2842 if (i_getCombiningClass(tempchar
, coll
) == 0) {
2844 /* Spit out the last char of the string, wasn't tasty enough */
2845 CE
= *(coll
->contractionCEs
+
2846 (ContractionStart
- coll
->contractionIndex
));
2848 CE
= getDiscontiguous(coll
, source
, ContractionStart
);
2851 } // else after if(schar == tchar)
2853 if(CE
== UCOL_NOT_FOUND
) {
2854 /* The Source string did not match the contraction that we were checking. */
2855 /* Back up the source position to undo the effects of having partially */
2856 /* scanned through what ultimately proved to not be a contraction. */
2857 loadState(source
, &state
, TRUE
);
2859 if(source
->origFlags
& UCOL_USE_ITERATOR
) {
2860 source
->flags
= source
->origFlags
;
2865 if(!isContraction(CE
)) {
2866 // The source string char was in the contraction table, and the corresponding
2867 // CE is not a contraction CE. We completed the contraction, break
2868 // out of loop, this CE will end up being returned. This is the normal
2869 // way out of contraction handling when the source actually contained
2875 // The source string char was in the contraction table, and the corresponding
2876 // CE is IS a contraction CE. We will continue looping to check the source
2877 // string for the remaining chars in the contraction.
2878 uint32_t tempCE
= *(coll
->contractionCEs
+ (ContractionStart
- coll
->contractionIndex
));
2879 if(tempCE
!= UCOL_NOT_FOUND
) {
2880 // We have scanned a a section of source string for which there is a
2881 // CE from the contraction table. Remember the CE and scan position, so
2882 // that we can return to this point if further scanning fails to
2883 // match a longer contraction sequence.
2887 backupState(source
, &state
);
2888 getNextNormalizedChar(source
);
2890 // Another way to do this is:
2891 //collIterateState tempState;
2892 //backupState(source, &tempState);
2893 //goBackOne(source);
2894 //backupState(source, &state);
2895 //loadState(source, &tempState, TRUE);
2897 // The problem is that for incomplete contractions we have to remember the previous
2898 // position. Before, the only thing I needed to do was state.pos--;
2899 // After iterator introduction and especially after introduction of normalizing
2900 // iterators, it became much more difficult to decrease the saved state.
2901 // I'm not yet sure which of the two methods above is faster.
2905 } // case CONTRACTION_TAG:
2906 case LONG_PRIMARY_TAG
:
2908 *(source
->CEpos
++) = ((CE
& 0xFF)<<24)|UCOL_CONTINUATION_MARKER
;
2909 CE
= ((CE
& 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON
;
2914 /* This should handle expansion. */
2915 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2916 /* I have to decide where continuations are going to be dealt with */
2918 uint32_t i
; /* general counter */
2919 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
2920 size
= getExpansionCount(CE
);
2922 if(size
!= 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2923 for(i
= 1; i
<size
; i
++) {
2924 *(source
->CEpos
++) = *CEOffset
++;
2926 } else { /* else, we do */
2927 while(*CEOffset
!= 0) {
2928 *(source
->CEpos
++) = *CEOffset
++;
2936 We do a check to see if we want to collate digits as numbers; if so we generate
2937 a custom collation key. Otherwise we pull out the value stored in the expansion table.
2940 uint32_t i
; /* general counter */
2942 if (coll
->numericCollation
== UCOL_ON
){
2945 uint32_t digIndx
= 0;
2946 uint32_t endIndex
= 0;
2947 uint32_t trailingZeroIndex
= 0;
2949 uint32_t primWeight
= 0;
2951 uint32_t digVal
= 0;
2952 uint8_t collateVal
= 0;
2954 UBool nonZeroValReached
= false;
2956 uint8_t *numTempBuf
;
2957 uint8_t stackNumTempBuf
[UCOL_MAX_BUFFER
]; // I just need a temporary place to store my generated CEs.
2958 uint32_t numTempBufSize
= UCOL_MAX_BUFFER
;
2960 numTempBuf
= stackNumTempBuf
;
2962 We parse the source string until we hit a char that's NOT a digit.
2963 Use this u_charDigitValue. This might be slow because we have to
2964 handle surrogates...
2967 if (U16_IS_LEAD(ch
)){
2968 if (!collIter_eos(source
))
2969 char32
= U16_GET_SUPPLEMENTARY(ch
, getNextNormalizedChar(source
));
2975 digVal
= u_charDigitValue(char32
);
2978 We pad a zero in front of the first element anyways. This takes
2979 care of the (probably) most common case where people are sorting things followed
2984 // Make sure we have enough space.
2985 if (digIndx
>= ((numTempBufSize
- 2) * 2) + 1)
2987 numTempBufSize
*= 2;
2988 if (numTempBuf
== stackNumTempBuf
){
2989 numTempBuf
= (uint8_t *)malloc(sizeof(uint8_t) * numTempBufSize
);
2990 memcpy(numTempBuf
, stackNumTempBuf
, UCOL_MAX_BUFFER
);
2992 realloc(numTempBuf
, numTempBufSize
);
2995 // Skipping over leading zeroes.
2996 if (digVal
!= 0 || nonZeroValReached
){
2997 if (digVal
!= 0 && !nonZeroValReached
)
2998 nonZeroValReached
= true;
3001 We parse the digit string into base 100 numbers (this fits into a byte).
3002 We only add to the buffer in twos, thus if we are parsing an odd character,
3003 that serves as the 'tens' digit while the if we are parsing an even one, that
3004 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3005 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3006 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3007 than all the other bytes.
3010 if (digIndx
% 2 == 1){
3011 collateVal
+= (uint8_t)digVal
;
3013 // We don't enter the low-order-digit case unless we've already seen
3014 // the high order, or for the first digit, which is always non-zero.
3015 if (collateVal
!= 0)
3016 trailingZeroIndex
= 0;
3018 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3022 // We drop the collation value into the buffer so if we need to do
3023 // a "front patch" we don't have to check to see if we're hitting the
3025 collateVal
= (uint8_t)(digVal
* 10);
3027 // Check for trailing zeroes.
3028 if (collateVal
== 0)
3030 if (!trailingZeroIndex
)
3031 trailingZeroIndex
= (digIndx
/2) + 2;
3034 trailingZeroIndex
= 0;
3036 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3041 // Get next character.
3042 if (!collIter_eos(source
)){
3043 ch
= getNextNormalizedChar(source
);
3044 if (U16_IS_LEAD(ch
)){
3045 if (!collIter_eos(source
))
3046 char32
= U16_GET_SUPPLEMENTARY(ch
, getNextNormalizedChar(source
));
3051 if ((digVal
= u_charDigitValue(char32
)) == -1){
3052 // Resetting position to point to the next unprocessed char. We
3053 // overshot it when doing our test/set for numbers.
3055 if (char32
> 0xFFFF) // For surrogates.
3063 if (nonZeroValReached
== false){
3068 endIndex
= trailingZeroIndex
? trailingZeroIndex
: ((digIndx
/2) + 2) ;
3069 if (digIndx
% 2 != 0){
3071 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3072 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3073 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3074 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3077 for(i
= 2; i
< endIndex
; i
++){
3078 numTempBuf
[i
] = (((((numTempBuf
[i
] - 6)/2) % 10) * 10) +
3079 (((numTempBuf
[i
+1])-6)/2) / 10) * 2 + 6;
3084 // Subtract one off of the last byte.
3085 numTempBuf
[endIndex
-1] -= 1;
3088 We want to skip over the first two slots in the buffer. The first slot
3089 is reserved for the header byte 0x1B. The second slot is for the
3090 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3092 numTempBuf
[0] = 0x1B;
3093 numTempBuf
[1] = (uint8_t)(0x80 + ((digIndx
/2) & 0x7F));
3095 // Now transfer the collation key to our collIterate struct.
3096 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3097 size
= ((endIndex
+1) & ~1)/2;
3098 CE
= (((numTempBuf
[0] << 8) | numTempBuf
[1]) << UCOL_PRIMARYORDERSHIFT
) | //Primary weight
3099 (UCOL_BYTE_COMMON
<< UCOL_SECONDARYORDERSHIFT
) | // Secondary weight
3100 UCOL_BYTE_COMMON
; // Tertiary weight.
3101 i
= 2; // Reset the index into the buffer.
3104 primWeight
= numTempBuf
[i
++] << 8;
3106 primWeight
|= numTempBuf
[i
++];
3107 *(source
->CEpos
++) = (primWeight
<< UCOL_PRIMARYORDERSHIFT
) | UCOL_CONTINUATION_MARKER
;
3110 if (numTempBuf
!= stackNumTempBuf
)
3114 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
3115 size
= getExpansionCount(CE
);
3117 if(size
!= 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3118 for(i
= 1; i
<size
; i
++) {
3119 *(source
->CEpos
++) = *CEOffset
++;
3121 } else { /* else, we do */
3122 while(*CEOffset
!= 0) {
3123 *(source
->CEpos
++) = *CEOffset
++;
3129 /* various implicits optimization */
3130 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3131 case CJK_IMPLICIT_TAG
: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3132 //return getImplicit(cp, source, 0x04000000);
3133 return getImplicit(cp
, source
);
3134 case IMPLICIT_TAG
: /* everything that is not defined otherwise */
3135 /* UCA is filled with these. Tailorings are NOT_FOUND */
3136 //return getImplicit(cp, source, 0);
3137 return getImplicit(cp
, source
);
3138 case TRAIL_SURROGATE_TAG
: /* DC00-DFFF*/
3139 return 0; /* broken surrogate sequence */
3140 case LEAD_SURROGATE_TAG
: /* D800-DBFF*/
3142 if( source
->flags
& UCOL_USE_ITERATOR
) {
3143 if(U_IS_TRAIL(nextChar
= (UChar
)source
->iterator
->current(source
->iterator
))) {
3144 cp
= U16_GET_SUPPLEMENTARY(ch
, nextChar
);
3145 source
->iterator
->next(source
->iterator
);
3146 return getImplicit(cp
, source
);
3150 } else if((((source
->flags
& UCOL_ITER_HASLEN
) == 0 ) || (source
->pos
<source
->endp
)) &&
3151 U_IS_TRAIL((nextChar
=*source
->pos
))) {
3152 cp
= U16_GET_SUPPLEMENTARY(ch
, nextChar
);
3154 return getImplicit(cp
, source
);
3156 return 0; /* completely ignorable */
3158 case HANGUL_SYLLABLE_TAG
: /* AC00-D7AF*/
3161 SBase
= 0xAC00, LBase
= 0x1100, VBase
= 0x1161, TBase
= 0x11A7;
3162 //const uint32_t LCount = 19;
3163 const uint32_t VCount
= 21;
3164 const uint32_t TCount
= 28;
3165 //const uint32_t NCount = VCount * TCount; // 588
3166 //const uint32_t SCount = LCount * NCount; // 11172
3167 uint32_t L
= ch
- SBase
;
3169 // divide into pieces
3171 uint32_t T
= L
% TCount
; // we do it in this order since some compilers can do % and / in one operation
3173 uint32_t V
= L
% VCount
;
3182 // return the first CE, but first put the rest into the expansion buffer
3183 if (!source
->coll
->image
->jamoSpecial
) { // FAST PATH
3185 /**(source->CEpos++) = ucmpe32_get(UCA->mapping, V);*/
3186 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
3187 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(coll
->mapping
, V
);
3189 /**(source->CEpos++) = ucmpe32_get(UCA->mapping, T);*/
3190 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
3191 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(coll
->mapping
, T
);
3194 /*return ucmpe32_get(UCA->mapping, L);*/ // return first one
3195 /*return UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
3196 return UTRIE_GET32_FROM_LEAD(coll
->mapping
, L
);
3198 } else { // Jamo is Special
3199 // Since Hanguls pass the FCD check, it is
3200 // guaranteed that we won't be in
3201 // the normalization buffer if something like this happens
3202 // However, if we are using a uchar iterator and normalization
3203 // is ON, the Hangul that lead us here is going to be in that
3204 // normalization buffer. Here we want to restore the uchar
3205 // iterator state and pull out of the normalization buffer
3206 if(source
->iterator
!= NULL
&& source
->flags
& UCOL_ITER_INNORMBUF
) {
3207 source
->flags
= source
->origFlags
; // restore the iterator
3210 // Move Jamos into normalization buffer
3211 source
->writableBuffer
[0] = (UChar
)L
;
3212 source
->writableBuffer
[1] = (UChar
)V
;
3214 source
->writableBuffer
[2] = (UChar
)T
;
3215 source
->writableBuffer
[3] = 0;
3217 source
->writableBuffer
[2] = 0;
3220 source
->fcdPosition
= source
->pos
; // Indicate where to continue in main input string
3221 // after exhausting the writableBuffer
3222 source
->pos
= source
->writableBuffer
;
3223 source
->origFlags
= source
->flags
;
3224 source
->flags
|= UCOL_ITER_INNORMBUF
;
3225 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
3227 return(UCOL_IGNORABLE
);
3231 /* not yet implemented */
3232 /* probably after 1.8 */
3233 return UCOL_NOT_FOUND
;
3235 *status
= U_INTERNAL_PROGRAM_ERROR
;
3239 if (CE
<= UCOL_NOT_FOUND
) break;
3245 /* now uses Mark's getImplicitPrimary code */
// Backward-iteration helper: produces the two implicit collation elements for code point cp.
// One CE is stored into the CE buffer of collationSource, the other is returned to the caller.
// NOTE(review): the embedded numbering jumps from 3247 to 3252, so some original lines
// (possibly an early-exit guard) are not visible in this excerpt - confirm against the full file.
3247 inline uint32_t getPrevImplicit(UChar32 cp
, collIterate
*collationSource
) {
// r is whatever getImplicitPrimary computes for cp; both CEs below are carved out of it.
3252 uint32_t r
= getImplicitPrimary(cp
);
// Stored CE: the bits of r selected by UCOL_PRIMARYMASK, with 0x0505 in the low half.
// presumably 0x05 is the common secondary and tertiary byte - TODO confirm.
3254 *(collationSource
->CEpos
++) = (r
& UCOL_PRIMARYMASK
) | 0x00000505;
// toReturn is set to the already advanced CEpos, i.e. one past the CE just stored.
3255 collationSource
->toReturn
= collationSource
->CEpos
;
// Returned CE: the low 16 bits of r moved to the top half, with 0xC0 in the low byte.
// presumably 0xC0 is the continuation marker - TODO confirm.
3256 return ((r
& 0x0000FFFF)<<16) | 0x000000C0;
3260 * This function handles the special CEs like contractions, expansions,
3262 * It is called by both getPrevCE
// Resolves a special (tagged) collation element while iterating BACKWARDS through the text.
//   coll   - collator whose image, mapping trie and contraction tables are consulted
//   ch     - the code unit whose table entry is CE
//   CE     - the tagged entry; getCETag(CE) selects the case handled below
//   source - iteration state; CEs are pushed through source->CEpos
//   status - written through below (its declaration line is not visible in this excerpt)
// Most cases store one or more CEs, point source->toReturn at the last one written and
// return *(source->toReturn); the Thai and Hangul cases may instead redirect iteration
// into source->writableBuffer and return UCOL_IGNORABLE.
// NOTE(review): the embedded line numbers have gaps, so braces, some declarations and
// several statements of the original function are missing from this view.
3264 uint32_t ucol_prv_getSpecialPrevCE(const UCollator
*coll
, UChar ch
, uint32_t CE
,
3265 collIterate
*source
,
3268 const uint32_t *CEOffset
= NULL
;
3269 UChar
*UCharOffset
= NULL
;
3271 const UChar
*constart
= NULL
;
3273 UChar buffer
[UCOL_MAX_BUFFER
];
3274 uint32_t *endCEBuffer
;
3276 int32_t noChars
= 0;
3280 /* the only ces that loops are thai and contractions */
3281 switch (getCETag(CE
))
3283 case NOT_FOUND_TAG
: /* this tag always returns */
3285 case SURROGATE_TAG
: /* This is a surrogate pair */
3286 /* essentially an engaged lead surrogate. */
3287 /* if you have encountered it here, it means that a */
3288 /* broken sequence was encountered and this is an error */
// Thai pre-vowel handling. The condition below is true when the iterator is already in
// the normalization buffer, is at the start of the string, or the previous character is
// not a Thai pre-vowel; the expansion-table lookup that follows belongs to that situation.
3291 if ((source
->flags
& UCOL_ITER_INNORMBUF
) || /* Already Swapped || */
3292 source
->string
== source
->pos
|| /* At start of string.|| */
3293 /* previous char not Thai prevowel */
3294 /*UCOL_ISTHAIBASECONSONANT(*(source->pos)) == FALSE ||*/ // This is from the old specs - we now rearrange unconditionally
3295 UCOL_ISTHAIPREVOWEL(peekCharacter(source
, -1)) == FALSE
)
3296 //UCOL_ISTHAIPREVOWEL(*(source->pos - 1)) == FALSE)
3298 /* Treat Thai as a length one expansion */
3299 /* find the offset to expansion table */
3300 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
);
3306 Move the prevowel and the following base Consonant into the
3307 normalization buffer with their order swapped
3309 UChar
*tempbuffer
= source
->writableBuffer
+
3310 (source
->writableBufSize
- 1);
// Layout at the tail of writableBuffer: [0][current char][previous char].
// tempbuffer addresses the last slot, which is where iteration is pointed further down.
3311 *(tempbuffer
- 2) = 0;
3312 *(tempbuffer
- 1) = peekCharacter(source
, 0);
3313 *(tempbuffer
) = peekCharacter(source
, -1);
3316 Indicate where to continue in main input string after exhausting
// When pos - 1 is the start of the string there is nothing left to resume from (NULL);
// otherwise the resume point is two code units before pos.
3319 if (source
->pos
- 1 == source
->string
) {
3320 source
->fcdPosition
= NULL
;
3322 source
->fcdPosition
= source
->pos
-2;
// Redirect iteration into the writable buffer: remember the current flags in origFlags,
// set UCOL_ITER_INNORMBUF, and clear UCOL_ITER_NORM and UCOL_ITER_HASLEN.
3325 source
->pos
= tempbuffer
;
3326 source
->origFlags
= source
->flags
;
3327 source
->flags
|= UCOL_ITER_INNORMBUF
;
3328 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
3330 //CE = UCOL_IGNORABLE;
3331 return(UCOL_IGNORABLE
);
3336 // Special processing is getting a CE that is preceded by a certain prefix
3337 // Currently this is only needed for optimizing Japanese length and iteration marks.
3338 // When we encounter a special processing tag, we go backwards and try to see if
3340 // Contraction tables are used - so the whole process is not unlike contraction.
3341 // prefix data is stored backwards in the table.
3342 const UChar
*UCharOffset
;
3344 collIterateState prefixState
;
// Snapshot the iterator here; loadState at the end of this case rewinds to this snapshot.
3345 backupState(source
, &prefixState
);
3347 // This loop will run once per source string character, for as long as we
3348 // are matching a potential contraction sequence
3350 // First we position ourselves at the beginning of contraction sequence
3351 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
3353 if (collIter_bos(source
)) {
3354 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
3357 schar
= getPrevNormalizedChar(source
);
3360 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3364 if (schar
== tchar
) {
3365 // Found the source string char in the table.
3366 // Pick up the corresponding CE from the table.
3367 CE
= *(coll
->contractionCEs
+
3368 (UCharOffset
- coll
->contractionIndex
));
3372 // if there is a completely ignorable code point in the middle of
3373 // a prefix, we need to act as if it's not there
3374 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3375 // lone surrogates cannot be set to zero as it would break other processing
3376 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, schar
);
3377 // it's easy for BMP code points
3380 } else if(UTF_IS_TRAIL(schar
) || UTF_IS_LEAD(schar
)) {
3381 // for supplementary code points, we have to check the next one
3382 // situations where we are going to ignore
3383 // 1. beginning of the string: schar is a lone surrogate
3384 // 2. schar is a lone surrogate
3385 // 3. schar is a trail surrogate in a valid surrogate sequence
3386 // that is explicitly set to zero.
3387 if (!collIter_bos(source
)) {
3389 if(UTF_IS_LEAD(lead
= getPrevNormalizedChar(source
))) {
3390 isZeroCE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, lead
);
3391 if(getCETag(isZeroCE
) == SURROGATE_TAG
) {
3392 uint32_t finalCE
= UTRIE_GET32_FROM_OFFSET_TRAIL(coll
->mapping
, isZeroCE
&0xFFFFFF, schar
);
3394 // this is a real, assigned completely ignorable code point
3400 // lone surrogate, completely ignorable
3404 // lone surrogate at the beginning, completely ignorable
3408 // Source string char was not in the table.
3409 // We have not found the prefix.
3410 CE
= *(coll
->contractionCEs
+
3411 (ContractionStart
- coll
->contractionIndex
));
3415 // The source string char was in the contraction table, and the corresponding
3416 // CE is not a prefix CE. We found the prefix, break
3417 // out of loop, this CE will end up being returned. This is the normal
3418 // way out of prefix handling when the source actually contained
3423 loadState(source
, &prefixState
, TRUE
);
3427 case CONTRACTION_TAG
:
3428 /* to ensure that the backwards and forwards iteration matches, we
3429 take the current region of most possible match and pass it through
3430 the forward iteration. this will ensure that the obstinate problem of
3431 overlapping contractions will not occur.
3433 schar
= peekCharacter(source
, 0);
3434 constart
= (UChar
*)coll
->image
+ getContractOffset(CE
);
3435 if (isAtStartPrevIterate(source
)
3436 /* commented away contraction end checks after adding the checks
3438 /* start of string or this is not the end of any contraction */
3439 CE
= *(coll
->contractionCEs
+
3440 (constart
- coll
->contractionIndex
));
// The candidate text is assembled back to front: the cursor starts at the last slot of
// the buffer, stores the terminating 0 there and then moves towards the front.
3444 UCharOffset
= strbuffer
+ (UCOL_MAX_BUFFER
- 1);
3445 *(UCharOffset
--) = 0;
3447 // have to swap thai characters
3448 while (ucol_unsafeCP(schar
, coll
) || UCOL_ISTHAIBASECONSONANT(schar
)) {
3449 *(UCharOffset
) = schar
;
3452 schar
= getPrevNormalizedChar(source
);
3454 // TODO: when we exhaust the contraction buffer,
3455 // it needs to get reallocated. The problem is
3456 // that the size depends on the string which is
3457 // not iterated over. However, since we're travelling
3458 // backwards, we already had to set the iterator at
3459 // the end - so we might as well know where we are?
// NOTE(review): this test compares the cursor against the stack array. Once strbuffer
// has been replaced by the heap block allocated below, the cursor no longer lies inside
// that array, so a second exhaustion would not be detected by this comparison - confirm
// that newsize + UCOL_MAX_BUFFER elements always suffice for the remaining input.
3460 if (UCharOffset
+ 1 == buffer
) {
3461 /* we have exhausted the buffer */
3462 int32_t newsize
= 0;
3463 if(source
->pos
) { // actually dealing with a position
3464 newsize
= source
->pos
- source
->string
+ 1;
3465 } else { // iterator
3466 newsize
= 4 * UCOL_MAX_BUFFER
;
3468 strbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) *
3469 (newsize
+ UCOL_MAX_BUFFER
));
3471 if (strbuffer
== NULL
) {
3472 *status
= U_MEMORY_ALLOCATION_ERROR
;
3473 return UCOL_NO_MORE_CES
;
// The whole stack array is copied to offset newsize of the new block, so the characters
// gathered so far end up at the tail and newsize elements stay free in front of them.
3475 UCharOffset
= strbuffer
+ newsize
;
3476 uprv_memcpy(UCharOffset
, buffer
,
3477 UCOL_MAX_BUFFER
* sizeof(UChar
));
3480 if ((source
->pos
&& (source
->pos
== source
->string
||
3481 ((source
->flags
& UCOL_ITER_INNORMBUF
) &&
3482 *(source
->pos
- 1) == 0 && source
->fcdPosition
== NULL
)))
3483 || (source
->iterator
&& !source
->iterator
->hasPrevious(source
->iterator
))) {
3487 /* adds the initial base character to the string */
3488 *(UCharOffset
) = schar
;
3491 /* a new collIterate is used to simplify things, since using the current
3492 collIterate will mean that the forward and backwards iteration will
3493 share and change the same buffers. we don't want to get into that. */
3495 //IInit_collIterate(coll, UCharOffset, -1, &temp);
3496 IInit_collIterate(coll
, UCharOffset
, noChars
, &temp
);
3497 temp
.flags
&= ~UCOL_ITER_NORM
;
3499 CE
= ucol_IGetNextCE(coll
, &temp
, status
);
// Every CE produced by the forward pass over temp is appended to the CE buffer of source;
// endCEBuffer marks the slot at which that buffer counts as full.
3500 endCEBuffer
= source
->CEs
+ UCOL_EXPAND_CE_BUFFER_SIZE
;
3501 while (CE
!= UCOL_NO_MORE_CES
) {
3502 *(source
->CEpos
++) = CE
;
3503 if (source
->CEpos
== endCEBuffer
) {
3504 /* ran out of CE space, bail.
3505 there's no guarantee of the right character position after
3507 *status
= U_BUFFER_OVERFLOW_ERROR
;
3508 source
->CEpos
= source
->CEs
;
3509 freeHeapWritableBuffer(&temp
);
3510 if (strbuffer
!= buffer
) {
3511 uprv_free(strbuffer
);
3513 return UCOL_NULLORDER
;
3515 CE
= ucol_IGetNextCE(coll
, &temp
, status
);
3517 freeHeapWritableBuffer(&temp
);
3518 if (strbuffer
!= buffer
) {
3519 uprv_free(strbuffer
);
// toReturn addresses the last CE written; when that is also the first slot of the CE
// buffer, CEpos is rewound to the start of the buffer.
3521 source
->toReturn
= source
->CEpos
- 1;
3522 if (source
->toReturn
== source
->CEs
) {
3523 source
->CEpos
= source
->CEs
;
3525 return *(source
->toReturn
);
3526 case LONG_PRIMARY_TAG
:
// A long-primary entry is unpacked into two CEs: the first takes bits 8..23 of CE
// shifted up by 8, with UCOL_BYTE_COMMON in each of the two low bytes; the second
// takes the low byte of CE in the top byte, combined with UCOL_CONTINUATION_MARKER.
3528 *(source
->CEpos
++) = ((CE
& 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON
;
3529 *(source
->CEpos
++) = ((CE
& 0xFF)<<24)|UCOL_CONTINUATION_MARKER
;
3530 source
->toReturn
= source
->CEpos
- 1;
3531 return *(source
->toReturn
);
3533 case EXPANSION_TAG
: /* this tag always returns */
3535 This should handle expansion.
3536 NOTE: we can encounter both continuations and expansions in an expansion!
3537 I have to decide where continuations are going to be dealt with
3539 /* find the offset to expansion table */
3540 CEOffset
= (uint32_t *)coll
->image
+ getExpansionOffset(CE
);
3541 size
= getExpansionCount(CE
);
3544 if there are less than 16 elements in expansion, we don't terminate
3547 for (count
= 0; count
< size
; count
++) {
3548 *(source
->CEpos
++) = *CEOffset
++;
3553 while (*CEOffset
!= 0) {
3554 *(source
->CEpos
++) = *CEOffset
++;
3557 source
->toReturn
= source
->CEpos
- 1;
3558 // in case of one element expansion, we
3559 // want to immediately return CEpos
3560 if(source
->toReturn
== source
->CEs
) {
3561 source
->CEpos
= source
->CEs
;
3563 return *(source
->toReturn
);
3567 We do a check to see if we want to collate digits as numbers; if so we generate
3568 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3571 uint32_t i
; /* general counter */
3573 if (coll
->numericCollation
== UCOL_ON
){
3576 uint32_t digIndx
= 0;
3577 uint32_t endIndex
= 0;
3578 uint32_t leadingZeroIndex
= 0;
3579 uint32_t trailingZeroCount
= 0;
3581 uint32_t primWeight
= 0;
3583 uint32_t digVal
= 0;
3584 uint8_t collateVal
= 0;
3586 UBool nonZeroValReached
= false;
3588 uint8_t *numTempBuf
;
3589 uint8_t stackNumTempBuf
[UCOL_MAX_BUFFER
]; // I just need a temporary place to store my generated CEs.
3590 uint32_t numTempBufSize
= UCOL_MAX_BUFFER
;
3592 numTempBuf
= stackNumTempBuf
;
3594 We parse the source string until we hit a char that's NOT a digit.
3595 Use this u_charDigitValue. This might be slow because we have to
3596 handle surrogates...
3599 if (U16_IS_TRAIL (ch
)){
3600 if (!collIter_bos(source
)){
3601 char32
= U16_GET_SUPPLEMENTARY(getPrevNormalizedChar(source
),ch
);
3609 digVal
= u_charDigitValue(char32
);
3612 // Make sure we have enough space.
3613 if (digIndx
>= ((numTempBufSize
- 2) * 2) + 1)
3615 numTempBufSize
*= 2;
3616 if (numTempBuf
== stackNumTempBuf
){
// First growth: switch from the stack array to a heap block and copy the old contents.
// NOTE(review): no NULL check on the malloc result is visible before the memcpy, and
// plain malloc is used here while the contraction path in this function uses uprv_malloc.
3617 numTempBuf
= (uint8_t *)malloc(sizeof(uint8_t) * numTempBufSize
);
3618 memcpy(numTempBuf
, stackNumTempBuf
, UCOL_MAX_BUFFER
);
// NOTE(review): as written here the value returned by realloc is discarded, so numTempBuf
// keeps its old value even though realloc may have moved or released that block; later
// writes through numTempBuf would then touch stale memory. The result should be captured
// and checked - confirm against the full file and fix.
3620 realloc(numTempBuf
, numTempBufSize
);
3623 // Skip over trailing zeroes, and keep a count of them.
3625 nonZeroValReached
= true;
3626 if (nonZeroValReached
){
3628 We parse the digit string into base 100 numbers (this fits into a byte).
3629 We only add to the buffer in twos, thus if we are parsing an odd character,
3630 that serves as the 'tens' digit while the if we are parsing an even one, that
3631 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3632 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3633 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3634 than all the other bytes.
3636 Since we're doing in this reverse we want to put the first digit encountered into the
3637 ones place and the second digit encountered into the tens place.
3640 if ((digIndx
+ trailingZeroCount
) % 2 == 1){
3641 // High-order digit case (tens place)
3642 collateVal
+= digVal
* 10;
3644 // We cannot set leadingZeroIndex unless it has been set for the
3645 // low-order digit. Therefore, all we can do for the high-order
3646 // digit is turn it off, never on.
3647 // The only time we will have a high digit without a low is for
3648 // the very first non-zero digit, so no zero check is necessary.
3649 if (collateVal
!= 0)
3650 leadingZeroIndex
= 0;
3652 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3656 // Low-order digit case (ones place)
3657 collateVal
= digVal
;
3659 // Check for leading zeroes.
3660 if (collateVal
== 0)
3662 if (!leadingZeroIndex
)
3663 leadingZeroIndex
= (digIndx
/2) + 2;
3666 leadingZeroIndex
= 0;
3668 // No need to write to buffer; the case of a last odd digit
3669 // is handled below.
3674 ++trailingZeroCount
;
3676 if (!collIter_bos(source
)){
3677 ch
= getPrevNormalizedChar(source
);
3679 if (U16_IS_TRAIL(ch
)){
3680 if (!collIter_bos(source
))
3682 char32
= U16_GET_SUPPLEMENTARY(getPrevNormalizedChar(source
),ch
);
3689 if ((digVal
= u_charDigitValue(char32
)) == -1){
3690 // Don't need to "reverse" the goBackOne call,
3691 // as this points to the next position to process..
3692 if (char32
> 0xFFFF) // For surrogates.
3693 getNextNormalizedChar(source
);
3700 if (nonZeroValReached
== false){
3702 trailingZeroCount
= 0;
3706 if ((digIndx
+ trailingZeroCount
) % 2 != 0){
3707 numTempBuf
[((digIndx
)/2) + 2] = collateVal
*2 + 6;
3711 endIndex
= leadingZeroIndex
? leadingZeroIndex
: ((digIndx
/2) + 2) ;
3713 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3717 We want to skip over the first two slots in the buffer. The first slot
3718 is reserved for the header byte 0x1B. The second slot is for the
3719 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3720 The exponent must be adjusted by the number of leading zeroes, and the number of
3723 numTempBuf
[0] = 0x1B;
3724 uint32_t exponent
= (digIndx
+trailingZeroCount
)/2;
3725 if (leadingZeroIndex
)
3726 exponent
-= ((digIndx
/2) + 2 - leadingZeroIndex
);
3727 numTempBuf
[1] = 0x80 + (exponent
& 0x7F);
3729 // Now transfer the collation key to our collIterate struct.
3730 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3731 //size = ((endIndex+1) & ~1)/2;
3732 *(source
->CEpos
++) = (((numTempBuf
[0] << 8) | numTempBuf
[1]) << UCOL_PRIMARYORDERSHIFT
) | //Primary weight
3733 (UCOL_BYTE_COMMON
<< UCOL_SECONDARYORDERSHIFT
) | // Secondary weight
3734 UCOL_BYTE_COMMON
; // Tertiary weight.
3735 i
= endIndex
- 1; // Reset the index into the buffer.
// The remaining buffer bytes are packed two per continuation CE while i counts DOWN
// from endIndex - 1 (the loop header and its bound check are not visible here).
3738 primWeight
= numTempBuf
[i
--] << 8;
3740 primWeight
|= numTempBuf
[i
--];
3741 *(source
->CEpos
++) = (primWeight
<< UCOL_PRIMARYORDERSHIFT
) | UCOL_CONTINUATION_MARKER
;
// presumably the heap copy of the digit buffer is released here; the statement guarded
// by this test is on a line that is not visible in this excerpt - confirm.
3743 if (numTempBuf
!= stackNumTempBuf
)
3746 source
->toReturn
= source
->CEpos
-1;
3747 return *(source
->toReturn
);
3750 /* find the offset to expansion table */
3751 CEOffset
= (uint32_t *)coll
->image
+ getExpansionOffset(CE
);
3752 size
= getExpansionCount(CE
);
3755 if there are less than 16 elements in expansion, we don't terminate
3758 for (count
= 0; count
< size
; count
++) {
3759 *(source
->CEpos
++) = *CEOffset
++;
3764 while (*CEOffset
!= 0) {
3765 *(source
->CEpos
++) = *CEOffset
++;
3768 source
->toReturn
= source
->CEpos
- 1;
3769 // in case of one element expansion, we
3770 // want to immediately return CEpos
3771 if(source
->toReturn
== source
->CEs
) {
3772 source
->CEpos
= source
->CEs
;
3774 return *(source
->toReturn
);
3777 case HANGUL_SYLLABLE_TAG
: /* AC00-D7AF*/
3780 SBase
= 0xAC00, LBase
= 0x1100, VBase
= 0x1161, TBase
= 0x11A7;
3781 //const uint32_t LCount = 19;
3782 const uint32_t VCount
= 21;
3783 const uint32_t TCount
= 28;
3784 //const uint32_t NCount = VCount * TCount; /* 588 */
3785 //const uint32_t SCount = LCount * NCount; /* 11172 */
3787 uint32_t L
= ch
- SBase
;
3790 we do it in this order since some compilers can do % and / in one
3793 uint32_t T
= L
% TCount
;
3795 uint32_t V
= L
% VCount
;
3804 return the first CE, but first put the rest into the expansion buffer
3806 if (!source
->coll
->image
->jamoSpecial
)
3808 /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, L);*/
3809 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
3810 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(coll
->mapping
, L
);
3811 /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, V);*/
3812 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
3813 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(coll
->mapping
, V
);
3815 /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, T);*/
3816 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
3817 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(coll
->mapping
, T
);
3819 source
->toReturn
= source
->CEpos
- 1;
3820 return *(source
->toReturn
);
3822 // Since Hanguls pass the FCD check, it is
3823 // guaranteed that we won't be in
3824 // the normalization buffer if something like this happens
3825 // Move Jamos into normalization buffer
3827 Move the Jamos into the
3828 normalization buffer
3830 UChar
*tempbuffer
= source
->writableBuffer
+
3831 (source
->writableBufSize
- 1);
// The Jamos are written just before the last slot of writableBuffer, preceded by a 0.
// First group below: 0, L, V, T. Second group: 0, L, V. The condition that selects
// between the two groups is on lines not visible in this excerpt.
3834 *(tempbuffer
- 1) = (UChar
)T
;
3835 *(tempbuffer
- 2) = (UChar
)V
;
3836 *(tempbuffer
- 3) = (UChar
)L
;
3837 *(tempbuffer
- 4) = 0;
3839 *(tempbuffer
- 1) = (UChar
)V
;
3840 *(tempbuffer
- 2) = (UChar
)L
;
3841 *(tempbuffer
- 3) = 0;
3845 Indicate where to continue in main input string after exhausting
// At the start of the string there is nothing to resume from (NULL); otherwise the
// resume point is one code unit before pos.
3848 if (source
->pos
== source
->string
) {
3849 source
->fcdPosition
= NULL
;
3851 source
->fcdPosition
= source
->pos
-1;
3854 source
->pos
= tempbuffer
;
3855 source
->origFlags
= source
->flags
;
3856 source
->flags
|= UCOL_ITER_INNORMBUF
;
3857 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
3859 return(UCOL_IGNORABLE
);
3862 case LEAD_SURROGATE_TAG
: /* D800-DBFF*/
3863 return 0; /* broken surrogate sequence */
3864 case TRAIL_SURROGATE_TAG
: /* DC00-DFFF*/
3869 if (isAtStartPrevIterate(source
)) {
3870 /* we are at the start of the string, wrong place to be at */
3873 if (source
->pos
!= source
->writableBuffer
) {
3874 prev
= source
->pos
- 1;
3876 prev
= source
->fcdPosition
;
3880 /* Handles Han and Supplementary characters here.*/
// When the preceding unit is a lead surrogate, combine it with the trail unit ch:
// (lead << 10) + trail minus the constant ((0xd800 << 10) + 0xdc00 - 0x10000).
3881 if (UTF_IS_FIRST_SURROGATE(prevChar
)) {
3882 cp
= ((((uint32_t)prevChar
)<<10UL)+(ch
)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3885 return 0; /* completely ignorable */
3887 return getPrevImplicit(cp
, source
);
3889 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
3890 case CJK_IMPLICIT_TAG
: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3891 return getPrevImplicit(ch
, source
);
3892 case IMPLICIT_TAG
: /* everything that is not defined otherwise */
3893 return getPrevImplicit(ch
, source
);
3894 /* UCA is filled with these. Tailorings are NOT_FOUND */
3895 /* not yet implemented */
3896 case CHARSET_TAG
: /* this tag always returns */
3897 /* probably after 1.8 */
3898 return UCOL_NOT_FOUND
;
3899 default: /* this tag always returns */
3900 *status
= U_INTERNAL_PROGRAM_ERROR
;
3904 if (CE
<= UCOL_NOT_FOUND
) {
3911 /* This should really be a macro */
3912 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
// Grows the byte buffer that starts at secStart to newSize bytes and re-bases the write
// cursor *secondaries onto the new storage.
//   secondaries - in/out: current write position inside the buffer
//   secStart    - current start of the buffer
//   second      - compared with secStart to choose between a fresh allocation and a realloc;
//                 presumably the caller-owned (stack) buffer that must not be reallocated - confirm
//   secSize     - not referenced in the visible lines (any update is on lines not shown)
//   newSize     - requested size in bytes
//   status      - set to U_MEMORY_ALLOCATION_ERROR when an allocation returns NULL
// NOTE(review): the return statements and the closing braces are not visible in this excerpt.
3915 uint8_t *reallocateBuffer(uint8_t **secondaries
, uint8_t *secStart
, uint8_t *second
, uint32_t *secSize
, uint32_t newSize
, UErrorCode
*status
) {
// NOTE(review): this looks like debug tracing to stderr; the embedded numbering skips the
// neighbouring lines, so it may be conditionally compiled - confirm.
3917 fprintf(stderr
, ".");
3919 uint8_t *newStart
= NULL
;
// Number of bytes already written; used to restore the cursor after the buffer moves.
3920 uint32_t offset
= *secondaries
-secStart
;
// secStart == second: allocate a new block and copy the bytes written so far into it.
3922 if(secStart
==second
) {
3923 newStart
=(uint8_t*)uprv_malloc(newSize
);
3924 if(newStart
==NULL
) {
3925 *status
= U_MEMORY_ALLOCATION_ERROR
;
3928 uprv_memcpy(newStart
, secStart
, *secondaries
-secStart
);
// Otherwise the existing block is resized in place or moved by uprv_realloc.
3930 newStart
=(uint8_t*)uprv_realloc(secStart
, newSize
);
3931 if(newStart
==NULL
) {
3932 *status
= U_MEMORY_ALLOCATION_ERROR
;
// Keep the cursor at the same distance from the start of the (possibly moved) buffer.
3936 *secondaries
=newStart
+offset
;
3942 /* This should really be a macro */
3943 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
3944 /* secondaries in French */
3946 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
// Statement-like macro taking (TYPE, start, end). The visible part loops while
// (start) < (end) and stores *(end) into *(start)++, so start is modified in place and
// the arguments are expanded textually more than once - pass plain pointer variables.
// NOTE(review): several continuation lines of the macro are not visible in this excerpt.
3956 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \
3958 while((start)<(end)) { \
3960 *(start)++ = *(end); \
3965 /****************************************************************************/
3966 /* Following are the sortkey generation functions */
3968 /****************************************************************************/
3971 * Merge two sort keys.
3972 * This is useful, for example, to combine sort keys from first and last names
3973 * to sort such pairs.
3974 * Merged sort keys consider on each collation level the first part first entirely,
3975 * then the second one.
3976 * It is possible to merge multiple sort keys by consecutively merging
3977 * another one with the intermediate result.
3979 * The length of the merge result is the sum of the lengths of the input sort keys
3982 * @param src1 the first sort key
3983 * @param src1Length the length of the first sort key, including the zero byte at the end;
3984 * can be -1 if the function is to find the length
3985 * @param src2 the second sort key
3986 * @param src2Length the length of the second sort key, including the zero byte at the end;
3987 * can be -1 if the function is to find the length
3988 * @param dest the buffer where the merged sort key is written,
3989 * can be NULL if destCapacity==0
3990 * @param destCapacity the number of bytes in the dest buffer
3991 * @return the length of the merged sort key, src1Length+src2Length-1;
3992 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
3993 * in which cases the contents of dest is undefined
// Merges two zero-terminated sort keys level by level into dest.
// Only part of the body is visible in this excerpt (the embedded line numbers have gaps);
// the byte-copy statements inside the loops and the return statements are not shown.
3997 U_CAPI
int32_t U_EXPORT2
3998 ucol_mergeSortkeys(const uint8_t *src1
, int32_t src1Length
,
3999 const uint8_t *src2
, int32_t src2Length
,
4000 uint8_t *dest
, int32_t destCapacity
) {
4004 /* check arguments */
// Each source must be non-NULL, must not have length 0, and when its length is positive
// the last byte must be the terminating zero.
// NOTE(review): the lower bound is written as < -2, so a length of -2 passes this check
// although the documentation of this function only describes -1 as the request to compute
// the length - confirm whether -2 is meant to be accepted.
4005 if( src1
==NULL
|| src1Length
<-2 || src1Length
==0 || (src1Length
>0 && src1
[src1Length
-1]!=0) ||
4006 src2
==NULL
|| src2Length
<-2 || src2Length
==0 || (src2Length
>0 && src2
[src2Length
-1]!=0) ||
4007 destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)
4009 /* error, attempt to write a zero byte and return 0 */
4010 if(dest
!=NULL
&& destCapacity
>0) {
4016 /* check lengths and capacity */
// Lengths computed here are strlen + 1, i.e. they include the terminating zero byte.
// The conditions guarding these two assignments are on lines not visible here.
4018 src1Length
=(int32_t)uprv_strlen((const char *)src1
)+1;
4021 src2Length
=(int32_t)uprv_strlen((const char *)src2
)+1;
// Result length is the sum of both lengths minus one.
4024 destLength
=src1Length
+src2Length
-1;
4025 if(destLength
>destCapacity
) {
4026 /* the merged sort key does not fit into the destination */
4030 /* merge the sort keys with the same number of levels */
4031 while(*src1
!=0 && *src2
!=0) { /* while both have another level */
4032 /* copy level from src1 not including 00 or 01 */
// Bytes with a value of 2 or more are level content; 00 and 01 end the inner loops.
4033 while((b
=*src1
)>=2) {
4038 /* add a 02 merge separator */
4041 /* copy level from src2 not including 00 or 01 */
4042 while((b
=*src2
)>=2) {
4047 /* if both sort keys have another level, then add a 01 level separator and continue */
4048 if(*src1
==1 && *src2
==1) {
4056 * here, at least one sort key is finished now, but the other one
4057 * might have some contents left from containing more levels;
4058 * that contents is just appended to the result
4061 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4064 /* append src2, "the other, unfinished sort key" */
// The remainder is appended with a C string copy, which stops at the first zero byte.
4065 uprv_strcpy((char *)dest
, (const char *)src2
);
4067 /* trust that neither sort key contained illegally embedded zero bytes */
// Public entry point that produces a sort key for source by delegating to the
// sortKeyGen function pointer stored in the collator.
// The declaration of the result parameter and the return statements are on lines that are
// not visible in this excerpt.
4072 U_CAPI
int32_t U_EXPORT2
4073 ucol_getSortKey(const UCollator
*coll
,
4074 const UChar
*source
,
4075 int32_t sourceLength
,
4077 int32_t resultLength
)
// Local status only: this API has no error-code parameter, so failures recorded here are
// not reported to the caller by the visible code.
4079 UErrorCode status
= U_ZERO_ERROR
;
4081 if(source
== NULL
) {
4082 // this is actually an error situation, but we would need to
4083 // have an error code to return it. Until we introduce a new
4084 // API, it stays like this
4087 /* this uses the function pointer that is set in updateinternalstate */
4088 /* currently, there are two funcs: */
4089 /*ucol_calcSortKey(...);*/
4090 /*ucol_calcSortKeySimpleTertiary(...);*/
// The address of result and its capacity are passed together with the flag FALSE;
// presumably that flag means the generator must not allocate a new buffer - confirm.
4092 int32_t keySize
= coll
->sortKeyGen(coll
, source
, sourceLength
, &result
, resultLength
, FALSE
, &status
);
4093 //((UCollator *)coll)->errorCode = status; /*semantically const */
4097 /* this function is called by the C++ API for sortkey generation */
// Variant of sort key generation that forwards to the same sortKeyGen function pointer
// with a capacity of 0 and the flag TRUE, and returns its result unchanged.
// presumably TRUE lets the generator allocate the buffer that it stores through pResult -
// confirm against the sortKeyGen implementations.
// The return type and the declaration of pResult are on lines not visible in this excerpt.
4099 ucol_getSortKeyWithAllocation(const UCollator
*coll
,
4100 const UChar
*source
, int32_t sourceLength
,
4102 UErrorCode
*pErrorCode
) {
4104 return coll
->sortKeyGen(coll
, source
, sourceLength
, pResult
, 0, TRUE
, pErrorCode
);
// Size in bytes of the stack buffer fSecsBuff in ucol_getSortKeySize, and the initial
// value of fSecsMaxLen there.
4107 #define UCOL_FSEC_BUF_SIZE 256
4109 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
4110 /* or if we run out of space while making a sortkey and want to return ASAP */
4111 int32_t ucol_getSortKeySize(const UCollator
*coll
, collIterate
*s
, int32_t currentSize
, UColAttributeValue strength
, int32_t len
) {
4112 UErrorCode status
= U_ZERO_ERROR
;
4113 uint8_t compareSec
= (uint8_t)((strength
>= UCOL_SECONDARY
)?0:0xFF);
4114 uint8_t compareTer
= (uint8_t)((strength
>= UCOL_TERTIARY
)?0:0xFF);
4115 uint8_t compareQuad
= (uint8_t)((strength
>= UCOL_QUATERNARY
)?0:0xFF);
4116 UBool compareIdent
= (strength
== UCOL_IDENTICAL
);
4117 UBool doCase
= (coll
->caseLevel
== UCOL_ON
);
4118 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
4119 //UBool qShifted = shifted && (compareQuad == 0);
4120 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && (compareQuad
== 0);
4121 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && (compareSec
== 0);
4122 uint8_t fSecsBuff
[UCOL_FSEC_BUF_SIZE
];
4123 uint8_t *fSecs
= fSecsBuff
;
4124 uint32_t fSecsLen
= 0, fSecsMaxLen
= UCOL_FSEC_BUF_SIZE
;
4125 uint8_t *frenchStartPtr
= NULL
, *frenchEndPtr
= NULL
;
4127 uint32_t variableTopValue
= coll
->variableTopValue
;
4128 uint8_t UCOL_COMMON_BOT4
= (uint8_t)((coll
->variableTopValue
>>8)+1);
4131 /* allocate one more space for hiragana */
4133 uint8_t UCOL_BOT_COUNT4
= (uint8_t)(0xFF - UCOL_COMMON_BOT4
);
4135 uint32_t order
= UCOL_NO_MORE_CES
;
4136 uint8_t primary1
= 0;
4137 uint8_t primary2
= 0;
4138 uint8_t secondary
= 0;
4139 uint8_t tertiary
= 0;
4140 int32_t caseShift
= 0;
4141 uint32_t c2
= 0, c3
= 0, c4
= 0; /* variables for compression */
4143 uint8_t caseSwitch
= coll
->caseSwitch
;
4144 uint8_t tertiaryMask
= coll
->tertiaryMask
;
4145 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
4147 UBool wasShifted
= FALSE
;
4148 UBool notIsContinuation
= FALSE
;
4149 uint8_t leadPrimary
= 0;
4153 order
= ucol_IGetNextCE(coll
, s
, &status
);
4154 if(order
== UCOL_NO_MORE_CES
) {
4162 notIsContinuation
= !isContinuation(order
);
4165 if(notIsContinuation
) {
4166 tertiary
= (uint8_t)((order
& UCOL_BYTE_SIZE_MASK
));
4168 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
4170 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4171 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4172 primary1
= (uint8_t)(order
>> 8);
4175 if(shifted
&& ((notIsContinuation
&& order
<= variableTopValue
&& primary1
> 0)
4176 || (!notIsContinuation
&& wasShifted
))
4177 || (wasShifted
&& primary1
== 0)) { /* amendment to the UCA says that primary ignorables */
4178 /* and other ignorables should be removed if following a shifted code point */
4179 if(primary1
== 0) { /* if we were shifted and we got an ignorable code point */
4180 /* we should just completely ignore it */
4183 if(compareQuad
== 0) {
4185 currentSize
+= (c2
/UCOL_BOT_COUNT4
)+1;
4196 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4197 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4198 /* calculate sortkey size */
4199 if(primary1
!= UCOL_IGNORABLE
) {
4200 if(notIsContinuation
) {
4201 if(leadPrimary
== primary1
) {
4204 if(leadPrimary
!= 0) {
4207 if(primary2
== UCOL_IGNORABLE
) {
4208 /* one byter, not compressed */
4211 } else if(primary1
<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
||
4212 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4213 (primary1
> (*UCAconsts
->UCA_LAST_NON_VARIABLE
>>24) && primary1
< (*UCAconsts
->UCA_FIRST_IMPLICIT
>>24))) {
4214 /* not compressible */
4217 } else { /* compress */
4218 leadPrimary
= primary1
;
4222 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4224 if(primary2
!= UCOL_IGNORABLE
) {
4230 if(secondary
> compareSec
) { /* I think that != 0 test should be != IGNORABLE */
4232 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
4236 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4237 currentSize
+= (c2
/(uint32_t)UCOL_TOP_COUNT2
)+1;
4239 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+1;
4246 fSecs
[fSecsLen
++] = secondary
;
4247 if(fSecsLen
== fSecsMaxLen
) {
4248 if(fSecs
== fSecsBuff
) {
4249 fSecs
= (uint8_t *)uprv_malloc(2*fSecsLen
);
4251 fSecs
= (uint8_t *)uprv_realloc(fSecs
, 2*fSecsLen
);
4254 status
= U_MEMORY_ALLOCATION_ERROR
;
4259 if(notIsContinuation
) {
4260 if (frenchStartPtr
!= NULL
) {
4261 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4262 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4263 frenchStartPtr
= NULL
;
4266 if (frenchStartPtr
== NULL
) {
4267 frenchStartPtr
= fSecs
+fSecsLen
-2;
4269 frenchEndPtr
= fSecs
+fSecsLen
-1;
4275 if (caseShift
== 0) {
4277 caseShift
= UCOL_CASE_SHIFT_START
;
4279 if((tertiary
&0x3F) > 0 && notIsContinuation
) {
4281 if((tertiary
&0xC0) != 0) {
4282 if (caseShift
== 0) {
4284 caseShift
= UCOL_CASE_SHIFT_START
;
4290 if(notIsContinuation
) {
4291 tertiary
^= caseSwitch
;
4295 tertiary
&= tertiaryMask
;
4296 if(tertiary
> compareTer
) { /* I think that != 0 test should be != IGNORABLE */
4297 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
4301 if((tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
)
4302 || (tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
)) {
4303 currentSize
+= (c3
/(uint32_t)coll
->tertiaryTopCount
)+1;
4305 currentSize
+= (c3
/(uint32_t)coll
->tertiaryBottomCount
)+1;
4313 if(/*qShifted*/(compareQuad
==0) && notIsContinuation
) {
4314 if(s
->flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
4315 if(c4
>0) { // Close this part
4316 currentSize
+= (c4
/UCOL_BOT_COUNT4
)+1;
4319 currentSize
++; // Add the Hiragana
4320 } else { // This wasn't Hiragana, so we can continue adding stuff
4330 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4334 if(frenchStartPtr
!= NULL
) {
4335 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4337 for(i
= 0; i
<fSecsLen
; i
++) {
4338 secondary
= *(fSecs
+fSecsLen
-i
-1);
4339 /* This is compression code. */
4340 if (secondary
== UCOL_COMMON2
) {
4344 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4345 currentSize
+= (c2
/(uint32_t)UCOL_TOP_COUNT2
)+((c2
%(uint32_t)UCOL_TOP_COUNT2
!= 0)?1:0);
4347 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4355 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4357 if(fSecs
!= fSecsBuff
) {
4363 currentSize
+= (c3
/(uint32_t)coll
->tertiaryBottomCount
) + ((c3
%(uint32_t)coll
->tertiaryBottomCount
!= 0)?1:0);
4366 if(c4
> 0 && compareQuad
== 0) {
4367 currentSize
+= (c4
/(uint32_t)UCOL_BOT_COUNT4
)+((c4
%(uint32_t)UCOL_BOT_COUNT4
!= 0)?1:0);
4371 currentSize
+= u_lengthOfIdenticalLevelRun(s
->string
, len
);
/*
 * Opens a new byte in the case-level buffer once the shift counter is used up.
 * When caseShift is 0, stores UCOL_CASE_BYTE_START at the current write
 * position (**cases), advances the write pointer past it and resets caseShift
 * to UCOL_CASE_SHIFT_START.
 * The callers in ucol_calcSortKey invoke this before OR-ing a case bit into
 * *(cases-1) at bit position --caseShift.
 *
 * cases     - in/out: address of the case buffer write pointer
 * caseShift - in/out: remaining shift count, passed by reference
 */
4378 inline void doCaseShift(uint8_t **cases
, uint32_t &caseShift
) {
4379 if (caseShift
== 0) {
4380 *(*cases
)++ = UCOL_CASE_BYTE_START
;
4381 caseShift
= UCOL_CASE_SHIFT_START
;
4385 // Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we
4386 // know how many values we wanted to add, even if we didn't add them all
/*
 * primaries - in/out: write pointer (passed by reference); advanced only when the byte is stored
 * limit     - the byte is stored only while primaries < limit
 * size      - in/out: count of attempted additions
 *             NOTE(review): the statement that increments size is not visible
 *             in this listing - confirm against the full source
 * value     - byte to append
 */
4388 inline void addWithIncrement(uint8_t *&primaries
, uint8_t *limit
, uint32_t &size
, const uint8_t value
) {
/* bounds check: never write at or beyond limit */
4390 if(primaries
< limit
) {
4391 *(primaries
)++ = value
;
4395 // Packs the secondary buffer when processing French locale. Adds the terminator.
/*
 * Appends UCOL_LEVELTERMINATOR and then the collected secondary bytes, read
 * from last to first, to the key being built at primaries. Every byte goes
 * through addWithIncrement, so nothing is stored at or beyond primEnd.
 *
 * primaries      - write position in the key buffer
 * primEnd        - write limit handed to addWithIncrement
 * secondaries    - one past the last collected secondary byte; the loop reads *(secondaries-i-1)
 * secsize        - in: number of collected secondary bytes (loop bound)
 *                  NOTE(review): callers add *secsize to the key size after the call,
 *                  so it is presumably updated to the packed size; that store is not
 *                  visible in this listing - confirm
 * frenchStartPtr - start of a still unreversed continuation run, or NULL
 * frenchEndPtr   - end of that run
 *
 * NOTE(review): the declarations of secondary and count2 and the return statement are
 * not visible in this listing; callers assign the return value to their primaries pointer.
 */
4397 inline uint8_t *packFrench(uint8_t *primaries
, uint8_t *primEnd
, uint8_t *secondaries
, uint32_t *secsize
, uint8_t *frenchStartPtr
, uint8_t *frenchEndPtr
) {
4400 uint32_t i
= 0, size
= 0;
4401 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4402 addWithIncrement(primaries
, primEnd
, i
, UCOL_LEVELTERMINATOR
);
4403 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4404 if(frenchStartPtr
!= NULL
) {
4405 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
/* walk the secondary buffer backwards; i is reused as the loop index */
4407 for(i
= 0; i
<*secsize
; i
++) {
4408 secondary
= *(secondaries
-i
-1);
4409 /* This is compression code. */
4410 if (secondary
== UCOL_COMMON2
) {
/*
 * A non-common secondary flushes the pending count2: one byte per full chunk
 * (UCOL_TOP_COUNT2 or UCOL_BOT_COUNT2, chosen by comparing secondary with
 * UCOL_COMMON2), then one byte encoding the remainder.
 */
4414 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4415 while (count2
> UCOL_TOP_COUNT2
) {
4416 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
));
4417 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
4419 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1)));
4421 while (count2
> UCOL_BOT_COUNT2
) {
4422 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
));
4423 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4425 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1)));
/* the secondary byte itself */
4429 addWithIncrement(primaries
, primEnd
, size
, secondary
);
/* flush of the trailing count2 with the bottom encoding */
4433 while (count2
> UCOL_BOT_COUNT2
) {
4434 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
));
4435 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4437 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1)));
/*
 * Builds the sort key of source under coll.
 * Primary bytes are written straight into the output buffer; secondary,
 * tertiary, case and quaternary bytes are collected in separate buffers
 * (initially the stack arrays second/tert/caseB/quad) and appended after all
 * collation elements have been read, each level preceded by
 * UCOL_LEVELTERMINATOR. The key ends with a 0 byte.
 *
 * coll             - collator; strength, caseLevel, frenchCollation, alternateHandling,
 *                    hiraganaQ, normalizationMode, scriptOrder, variableTopValue and the
 *                    tertiary* fields are read here
 * source           - text to process
 * sourceLength     - length of source; -1 means u_strlen(source) is used
 * result           - (parameter line absent from this listing) used as *result: output buffer,
 *                    replaced by a uprv_malloc'ed copy of the key when allocateSKBuffer is TRUE
 * resultLength     - capacity of *result; forced to UCOL_PRIMARY_MAX_BUFFER when the stack
 *                    array prim is substituted for a NULL *result
 * allocateSKBuffer - TRUE: the key buffer may be grown with reallocateBuffer;
 *                    FALSE: running out of room sets U_BUFFER_OVERFLOW_ERROR
 * status           - (parameter line absent from this listing) in/out error code
 *
 * NOTE(review): the return statements are not visible in this listing; sortKeySize
 * accumulates the key length throughout and is presumably what is returned - confirm.
 */
4443 /* This is the sortkey work horse function */
4444 U_CFUNC
int32_t U_CALLCONV
4445 ucol_calcSortKey(const UCollator
*coll
,
4446 const UChar
*source
,
4447 int32_t sourceLength
,
4449 uint32_t resultLength
,
4450 UBool allocateSKBuffer
,
4453 uint32_t i
= 0; /* general purpose counter */
4455 /* Stack allocated buffers for buffers we use */
4456 uint8_t prim
[UCOL_PRIMARY_MAX_BUFFER
], second
[UCOL_SECONDARY_MAX_BUFFER
], tert
[UCOL_TERTIARY_MAX_BUFFER
], caseB
[UCOL_CASE_MAX_BUFFER
], quad
[UCOL_QUAD_MAX_BUFFER
];
4458 uint8_t *primaries
= *result
, *secondaries
= second
, *tertiaries
= tert
, *cases
= caseB
, *quads
= quad
;
4460 if(U_FAILURE(*status
)) {
/* no caller buffer but allocation allowed: start out in the stack array prim */
4464 if(primaries
== NULL
&& allocateSKBuffer
== TRUE
) {
4465 primaries
= *result
= prim
;
4466 resultLength
= UCOL_PRIMARY_MAX_BUFFER
;
4469 uint32_t secSize
= UCOL_SECONDARY_MAX_BUFFER
, terSize
= UCOL_TERTIARY_MAX_BUFFER
,
4470 caseSize
= UCOL_CASE_MAX_BUFFER
, quadSize
= UCOL_QUAD_MAX_BUFFER
;
4472 uint32_t sortKeySize
= 1; /* it is always \0 terminated */
4474 UChar normBuffer
[UCOL_NORMALIZATION_MAX_BUFFER
];
4475 UChar
*normSource
= normBuffer
;
4476 int32_t normSourceLen
= UCOL_NORMALIZATION_MAX_BUFFER
;
4478 int32_t len
= (sourceLength
== -1 ? u_strlen(source
) : sourceLength
);
4480 UColAttributeValue strength
= coll
->strength
;
/*
 * compareSec/compareTer/compareQuad are 0 when the level takes part in the key and
 * 0xFF otherwise. A later test such as "secondary > compareSec" therefore checks
 * "non-zero weight" for an active level and can never succeed for an inactive one.
 */
4482 uint8_t compareSec
= (uint8_t)((strength
>= UCOL_SECONDARY
)?0:0xFF);
4483 uint8_t compareTer
= (uint8_t)((strength
>= UCOL_TERTIARY
)?0:0xFF);
4484 uint8_t compareQuad
= (uint8_t)((strength
>= UCOL_QUATERNARY
)?0:0xFF);
4485 UBool compareIdent
= (strength
== UCOL_IDENTICAL
);
4486 UBool doCase
= (coll
->caseLevel
== UCOL_ON
);
4487 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && (compareSec
== 0);
4488 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
4489 //UBool qShifted = shifted && (compareQuad == 0);
4490 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && (compareQuad
== 0);
4491 const uint8_t *scriptOrder
= coll
->scriptOrder
;
4493 uint32_t variableTopValue
= coll
->variableTopValue
;
4494 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4495 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4496 uint8_t UCOL_COMMON_BOT4
= (uint8_t)((coll
->variableTopValue
>>8)+1);
4497 uint8_t UCOL_HIRAGANA_QUAD
= 0;
4499 UCOL_HIRAGANA_QUAD
=UCOL_COMMON_BOT4
++;
4500 /* allocate one more space for hiragana, value for hiragana */
4502 uint8_t UCOL_BOT_COUNT4
= (uint8_t)(0xFF - UCOL_COMMON_BOT4
);
4504 /* support for special features like caselevel and funky secondaries */
4505 uint8_t *frenchStartPtr
= NULL
;
4506 uint8_t *frenchEndPtr
= NULL
;
4507 uint32_t caseShift
= 0;
/* one extra byte (the level separator) for every level that will be written */
4509 sortKeySize
+= ((compareSec
?0:1) + (compareTer
?0:1) + (doCase
?1:0) + /*(qShifted?1:0)*/(compareQuad
?0:1) + (compareIdent
?1:0));
4511 /* If we need to normalize, we'll do it all at once at the beginning! */
4512 UNormalizationMode normMode
;
4514 normMode
= UNORM_NFD
;
4515 } else if(coll
->normalizationMode
!= UCOL_OFF
) {
4516 normMode
= UNORM_FCD
;
4518 normMode
= UNORM_NONE
;
/* normalize only when the quick check does not already answer UNORM_YES */
4521 if(normMode
!= UNORM_NONE
&& UNORM_YES
!= unorm_quickCheck(source
, len
, normMode
, status
)) {
4522 len
= unorm_internalNormalize(normSource
, normSourceLen
,
/* stack buffer too small: allocate len UChars on the heap and normalize again */
4526 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
4527 normSourceLen
= len
;
4528 normSource
= (UChar
*)uprv_malloc(len
*U_SIZEOF_UCHAR
);
4529 if(normSource
== NULL
) {
4530 *status
= U_MEMORY_ALLOCATION_ERROR
;
4533 *status
= U_ZERO_ERROR
;
4534 len
= unorm_internalNormalize(normSource
, normSourceLen
,
4540 if(U_FAILURE(*status
)) {
4543 source
= normSource
;
4547 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
/* text was normalized above, so clear the iterator's UCOL_ITER_NORM flag */
4548 if(source
== normSource
) {
4549 s
.flags
&= ~UCOL_ITER_NORM
;
/* no usable output buffer: only the key size is computed (via ucol_getSortKeySize) */
4552 if(resultLength
== 0 || primaries
== NULL
) {
4553 int32_t keyLen
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
4554 if(normSource
!= normBuffer
) {
4555 uprv_free(normSource
);
/* the overflow test after each collation element compares primaries against this mark, two bytes short of the buffer end */
4559 uint8_t *primarySafeEnd
= primaries
+ resultLength
- 2;
4561 uint32_t minBufferSize
= UCOL_MAX_BUFFER
;
4563 uint8_t *primStart
= primaries
;
4564 uint8_t *secStart
= secondaries
;
4565 uint8_t *terStart
= tertiaries
;
4566 uint8_t *caseStart
= cases
;
4567 uint8_t *quadStart
= quads
;
4571 uint8_t primary1
= 0;
4572 uint8_t primary2
= 0;
4573 uint8_t secondary
= 0;
4574 uint8_t tertiary
= 0;
4575 uint8_t caseSwitch
= coll
->caseSwitch
;
4576 uint8_t tertiaryMask
= coll
->tertiaryMask
;
4577 int8_t tertiaryAddition
= (int8_t)coll
->tertiaryAddition
;
4578 uint8_t tertiaryTop
= coll
->tertiaryTop
;
4579 uint8_t tertiaryBottom
= coll
->tertiaryBottom
;
4580 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
4581 uint8_t caseBits
= 0;
4583 UBool finished
= FALSE
;
4584 UBool wasShifted
= FALSE
;
4585 UBool notIsContinuation
= FALSE
;
4587 uint32_t prevBuffSize
= 0;
4589 uint32_t count2
= 0, count3
= 0, count4
= 0;
4590 uint8_t leadPrimary
= 0;
/*
 * Main loop: at most (minBufferSize - prevBuffSize) collation elements are processed
 * per pass; the level buffers are enlarged after the loop (see the reallocateBuffer
 * calls at listing lines 4844-4847).
 */
4593 for(i
=prevBuffSize
; i
<minBufferSize
; ++i
) {
4595 order
= ucol_IGetNextCE(coll
, &s
, status
);
4596 if(order
== UCOL_NO_MORE_CES
) {
4605 notIsContinuation
= !isContinuation(order
);
/* split the 32-bit CE: low byte = tertiary, next byte = secondary, upper two bytes = primary2/primary1 */
4607 if(notIsContinuation
) {
4608 tertiary
= (uint8_t)(order
& UCOL_BYTE_SIZE_MASK
);
4610 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
4613 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4614 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4615 primary1
= (uint8_t)(order
>> 8);
/* optional remapping of the lead primary byte through coll->scriptOrder */
4617 if(notIsContinuation
) {
4618 if(scriptOrder
!= NULL
) {
4619 primary1
= scriptOrder
[primary1
];
/*
 * Shifted handling. Note that order has been shifted right by 16 bits above, so it
 * is the remaining primary value that is compared with variableTopValue here.
 */
4623 if(shifted
&& ((notIsContinuation
&& order
<= variableTopValue
&& primary1
> 0)
4624 || (!notIsContinuation
&& wasShifted
))
4625 || (wasShifted
&& primary1
== 0)) { /* amendment to the UCA says that primary ignorables */
4626 /* and other ignorables should be removed if following a shifted code point */
4627 if(primary1
== 0) { /* if we were shifted and we got an ignorable code point */
4628 /* we should just completely ignore it */
4631 if(compareQuad
== 0) {
/* flush the pending count4 before writing the shifted primary into the quaternary buffer */
4633 while (count4
> UCOL_BOT_COUNT4
) {
4634 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
4635 count4
-= UCOL_BOT_COUNT4
;
4637 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
4640 /* We are dealing with a variable and we're treating them as shifted */
4641 /* This is a shifted ignorable */
4642 if(primary1
!= 0) { /* we need to check this since we could be in continuation */
4643 *quads
++ = primary1
;
4646 *quads
++ = primary2
;
4652 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4653 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4654 /* regular and simple sortkey calc */
4655 if(primary1
!= UCOL_IGNORABLE
) {
4656 if(notIsContinuation
) {
/* primary compression: a lead byte equal to the remembered leadPrimary is not written again */
4657 if(leadPrimary
== primary1
) {
4658 *primaries
++ = primary2
;
/* lead byte changes: close the open compressed run with a MAX or MIN marker byte */
4660 if(leadPrimary
!= 0) {
4661 *primaries
++ = (uint8_t)((primary1
> leadPrimary
) ? UCOL_BYTE_UNSHIFTED_MAX
: UCOL_BYTE_UNSHIFTED_MIN
);
4663 if(primary2
== UCOL_IGNORABLE
) {
4664 /* one byter, not compressed */
4665 *primaries
++ = primary1
;
4667 } else if(primary1
<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
||
4668 (primary1
> (*UCAconsts
->UCA_LAST_NON_VARIABLE
>>24) && primary1
< (*UCAconsts
->UCA_FIRST_IMPLICIT
>>24))) {
4669 /* not compressible */
4671 *primaries
++ = primary1
;
4672 *primaries
++ = primary2
;
4673 } else { /* compress */
4674 *primaries
++ = leadPrimary
= primary1
;
4675 *primaries
++ = primary2
;
4678 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4679 *primaries
++ = primary1
;
4680 if(primary2
!= UCOL_IGNORABLE
) {
4681 *primaries
++ = primary2
; /* second part */
4686 if(secondary
> compareSec
) {
4688 /* This is compression code. */
4689 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
/* other secondary: flush the pending count2 with the top or bottom encoding, then write the byte */
4693 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4694 while (count2
> UCOL_TOP_COUNT2
) {
4695 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
);
4696 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
4698 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1));
4700 while (count2
> UCOL_BOT_COUNT2
) {
4701 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
4702 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4704 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
4708 *secondaries
++ = secondary
;
4711 *secondaries
++ = secondary
;
4712 /* Do the special handling for French secondaries */
4713 /* We need to get continuation elements and do intermediate restore */
4714 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4715 if(notIsContinuation
) {
4716 if (frenchStartPtr
!= NULL
) {
4717 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4718 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4719 frenchStartPtr
= NULL
;
4722 if (frenchStartPtr
== NULL
) {
4723 frenchStartPtr
= secondaries
- 2;
4725 frenchEndPtr
= secondaries
-1;
/* case level: doCaseShift opens a new case byte when needed, then one bit is OR-ed into *(cases-1) */
4731 doCaseShift(&cases
, caseShift
);
4732 if(notIsContinuation
) {
4733 caseBits
= (uint8_t)(tertiary
& 0xC0);
4736 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
4737 if((caseBits
& 0xC0) == 0) {
4738 *(cases
-1) |= 1 << (--caseShift
);
4740 *(cases
-1) |= 0 << (--caseShift
);
4742 doCaseShift(&cases
, caseShift
);
4743 *(cases
-1) |= ((caseBits
>>6)&1) << (--caseShift
);
4746 if((caseBits
& 0xC0) == 0) {
4747 *(cases
-1) |= 0 << (--caseShift
);
4749 *(cases
-1) |= 1 << (--caseShift
);
4751 doCaseShift(&cases
, caseShift
);
4752 *(cases
-1) |= ((caseBits
>>7)&1) << (--caseShift
);
4759 if(notIsContinuation
) {
4760 tertiary
^= caseSwitch
;
4764 tertiary
&= tertiaryMask
;
4765 if(tertiary
> compareTer
) {
4766 /* This is compression code. */
4767 /* sequence size check is included in the if clause */
4768 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
4771 if((tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
)
4772 || (tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
)) {
4773 tertiary
+= tertiaryAddition
;
/* flush the pending count3 (top or bottom encoding), then write the tertiary byte */
4776 if ((tertiary
> tertiaryCommon
)) {
4777 while (count3
> coll
->tertiaryTopCount
) {
4778 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
4779 count3
-= (uint32_t)coll
->tertiaryTopCount
;
4781 *tertiaries
++ = (uint8_t)(tertiaryTop
- (count3
-1));
4783 while (count3
> coll
->tertiaryBottomCount
) {
4784 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
4785 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
4787 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
4791 *tertiaries
++ = tertiary
;
4795 if(/*qShifted*/(compareQuad
==0) && notIsContinuation
) {
4796 if(s
.flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
4797 if(count4
>0) { // Close this part
4798 while (count4
> UCOL_BOT_COUNT4
) {
4799 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
4800 count4
-= UCOL_BOT_COUNT4
;
4802 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
4805 *quads
++ = UCOL_HIRAGANA_QUAD
; // Add the Hiragana
4806 } else { // This wasn't Hiragana, so we can continue adding stuff
/*
 * Overflow check for the primary (output) buffer after each collation element.
 * Without permission to reallocate, the iterator is restarted and only the
 * required size is computed; status becomes U_BUFFER_OVERFLOW_ERROR.
 */
4812 if(primaries
> primarySafeEnd
) { /* We have stepped over the primary buffer */
4813 if(allocateSKBuffer
== FALSE
) { /* need to save our butts if we cannot reallocate */
4814 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
4815 if(source
== normSource
) {
4816 s
.flags
&= ~UCOL_ITER_NORM
;
4818 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
4819 *status
= U_BUFFER_OVERFLOW_ERROR
;
4822 } else { /* It's much nicer if we can actually reallocate */
/* new capacity requested: twice the number of bytes produced so far on all levels plus sortKeySize */
4823 int32_t sks
= sortKeySize
+(primaries
- primStart
)+(secondaries
- secStart
)+(tertiaries
- terStart
)+(cases
-caseStart
)+(quads
-quadStart
);
4824 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sks
, status
);
4825 if(U_SUCCESS(*status
)) {
4826 *result
= primStart
;
4827 primarySafeEnd
= primStart
+ resultLength
- 2;
4829 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
4830 if(source
== normSource
) {
4831 s
.flags
&= ~UCOL_ITER_NORM
;
4833 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
/* after a pass of the loop: double all four level buffers together */
4843 prevBuffSize
= minBufferSize
;
4844 secStart
= reallocateBuffer(&secondaries
, secStart
, second
, &secSize
, 2*secSize
, status
);
4845 terStart
= reallocateBuffer(&tertiaries
, terStart
, tert
, &terSize
, 2*terSize
, status
);
4846 caseStart
= reallocateBuffer(&cases
, caseStart
, caseB
, &caseSize
, 2*caseSize
, status
);
4847 quadStart
= reallocateBuffer(&quads
, quadStart
, quad
, &quadSize
, 2*quadSize
, status
);
4849 if(U_FAILURE(*status
)) { // if we cannot reallocate buffers, we can at least give the sortkey size
4850 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
4851 if(source
== normSource
) {
4852 s
.flags
&= ~UCOL_ITER_NORM
;
4854 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
4860 /* Here, we are generally done with processing */
4861 /* bailing out would not be too productive */
/*
 * Assembly phase: each active level is flushed and appended behind the primaries.
 * sortKeySize is increased first and compared with resultLength to decide whether
 * the level fits or the output buffer must be enlarged.
 */
4863 if(U_SUCCESS(*status
)) {
4864 sortKeySize
+= (primaries
- primStart
);
4865 /* we have done all the CE's, now let's put them together to form a key */
4866 if(compareSec
== 0) {
4868 while (count2
> UCOL_BOT_COUNT2
) {
4869 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
4870 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4872 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
4874 uint32_t secsize
= secondaries
-secStart
;
4875 if(!isFrenchSec
) { // Regular situation, we know the length of secondaries
4876 sortKeySize
+= secsize
;
4877 if(sortKeySize
<= resultLength
) {
4878 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4879 uprv_memcpy(primaries
, secStart
, secsize
);
4880 primaries
+= secsize
;
4882 if(allocateSKBuffer
== TRUE
) { /* need to save our butts if we cannot reallocate */
4883 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
4884 if(U_SUCCESS(*status
)) {
4885 *result
= primStart
;
4886 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4887 uprv_memcpy(primaries
, secStart
, secsize
);
4888 primaries
+= secsize
;
4891 *status
= U_BUFFER_OVERFLOW_ERROR
;
4894 } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
4895 uint8_t *newPrim
= packFrench(primaries
, primStart
+resultLength
, secondaries
, &secsize
, frenchStartPtr
, frenchEndPtr
);
4896 sortKeySize
+= secsize
;
4897 if(sortKeySize
<= resultLength
) { // if we managed to pack fine
4898 primaries
= newPrim
; // update the primary pointer
4899 } else { // overflow, need to reallocate and redo
4900 if(allocateSKBuffer
== TRUE
) { /* need to save our butts if we cannot reallocate */
4901 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
/* NOTE(review): unlike the other reallocation branches, no "*result = primStart" is visible here - confirm against the full source */
4902 if(U_SUCCESS(*status
)) {
4903 primaries
= packFrench(primaries
, primStart
+resultLength
, secondaries
, &secsize
, frenchStartPtr
, frenchEndPtr
);
4906 *status
= U_BUFFER_OVERFLOW_ERROR
;
/* case level */
4913 uint32_t casesize
= cases
- caseStart
;
4914 sortKeySize
+= casesize
;
4915 if(sortKeySize
<= resultLength
) {
4916 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4917 uprv_memcpy(primaries
, caseStart
, casesize
);
4918 primaries
+= casesize
;
4920 if(allocateSKBuffer
== TRUE
) {
4921 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
4922 if(U_SUCCESS(*status
)) {
4923 *result
= primStart
;
4924 *(primaries
++) = UCOL_LEVELTERMINATOR
;
/* NOTE(review): no "primaries += casesize" is visible after this copy, unlike the secondary path (listing line 4888) - confirm against the full source */
4925 uprv_memcpy(primaries
, caseStart
, casesize
);
4928 *status
= U_BUFFER_OVERFLOW_ERROR
;
/* tertiary level: flush the trailing count3, then append */
4933 if(compareTer
== 0) {
4935 if (coll
->tertiaryCommon
!= UCOL_COMMON_BOT3
) {
4936 while (count3
>= coll
->tertiaryTopCount
) {
4937 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
4938 count3
-= (uint32_t)coll
->tertiaryTopCount
;
4940 *tertiaries
++ = (uint8_t)(tertiaryTop
- count3
);
4942 while (count3
> coll
->tertiaryBottomCount
) {
4943 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
4944 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
4946 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
4949 uint32_t tersize
= tertiaries
- terStart
;
4950 sortKeySize
+= tersize
;
4951 if(sortKeySize
<= resultLength
) {
4952 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4953 uprv_memcpy(primaries
, terStart
, tersize
);
4954 primaries
+= tersize
;
4956 if(allocateSKBuffer
== TRUE
) {
4957 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
4958 if(U_SUCCESS(*status
)) {
4959 *result
= primStart
;
4960 *(primaries
++) = UCOL_LEVELTERMINATOR
;
/* NOTE(review): no "primaries += tersize" is visible after this copy - confirm against the full source */
4961 uprv_memcpy(primaries
, terStart
, tersize
);
4964 *status
= U_BUFFER_OVERFLOW_ERROR
;
/* quaternary level: flush the trailing count4, then append */
4968 if(compareQuad
== 0/*qShifted == TRUE*/) {
4970 while (count4
> UCOL_BOT_COUNT4
) {
4971 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
4972 count4
-= UCOL_BOT_COUNT4
;
4974 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
4976 uint32_t quadsize
= quads
- quadStart
;
4977 sortKeySize
+= quadsize
;
4978 if(sortKeySize
<= resultLength
) {
4979 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4980 uprv_memcpy(primaries
, quadStart
, quadsize
);
4981 primaries
+= quadsize
;
4983 if(allocateSKBuffer
== TRUE
) {
4984 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
4985 if(U_SUCCESS(*status
)) {
4986 *result
= primStart
;
4987 *(primaries
++) = UCOL_LEVELTERMINATOR
;
/* NOTE(review): no "primaries += quadsize" is visible after this copy - confirm against the full source */
4988 uprv_memcpy(primaries
, quadStart
, quadsize
);
4991 *status
= U_BUFFER_OVERFLOW_ERROR
;
/* identical level: size from u_lengthOfIdenticalLevelRun, bytes from u_writeIdenticalLevelRun */
4997 sortKeySize
+= u_lengthOfIdenticalLevelRun(s
.string
, len
);
4998 if(sortKeySize
<= resultLength
) {
4999 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5000 primaries
+= u_writeIdenticalLevelRun(s
.string
, len
, primaries
);
5002 if(allocateSKBuffer
== TRUE
) {
/* NOTE(review): this branch requests sortKeySize rather than 2*sortKeySize, and the value returned by u_writeIdenticalLevelRun below is not added to primaries - confirm both are intended */
5003 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, sortKeySize
, status
);
5004 if(U_SUCCESS(*status
)) {
5005 *result
= primStart
;
5006 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5007 u_writeIdenticalLevelRun(s
.string
, len
, primaries
);
5010 *status
= U_BUFFER_OVERFLOW_ERROR
;
/* final 0 byte of the key */
5015 *(primaries
++) = '\0';
/*
 * Cleanup of the level buffers. All four are released on the terStart test alone;
 * NOTE(review): this relies on the four buffers always being reallocated together
 * (listing lines 4844-4847) - confirm no path enlarges only some of them.
 */
5018 if(terStart
!= tert
) {
5019 uprv_free(terStart
);
5020 uprv_free(secStart
);
5021 uprv_free(caseStart
);
5022 uprv_free(quadStart
);
5025 if(normSource
!= normBuffer
) {
5026 uprv_free(normSource
);
/*
 * Caller asked for allocation: hand back a heap copy of exactly sortKeySize bytes
 * and release the working buffer unless it is the stack array prim.
 */
5029 if(allocateSKBuffer
== TRUE
) {
5030 *result
= (uint8_t*)uprv_malloc(sortKeySize
);
5032 if (*result
== NULL
) {
5033 *status
= U_MEMORY_ALLOCATION_ERROR
;
5036 uprv_memcpy(*result
, primStart
, sortKeySize
);
5037 if(primStart
!= prim
) {
5038 uprv_free(primStart
);
5046 U_CFUNC
int32_t U_CALLCONV
5047 ucol_calcSortKeySimpleTertiary(const UCollator
*coll
,
5048 const UChar
*source
,
5049 int32_t sourceLength
,
5051 uint32_t resultLength
,
5052 UBool allocateSKBuffer
,
5056 uint32_t i
= 0; /* general purpose counter */
5058 /* Stack allocated buffers for buffers we use */
5059 uint8_t prim
[UCOL_PRIMARY_MAX_BUFFER
], second
[UCOL_SECONDARY_MAX_BUFFER
], tert
[UCOL_TERTIARY_MAX_BUFFER
];
5061 uint8_t *primaries
= *result
, *secondaries
= second
, *tertiaries
= tert
;
5063 if(U_FAILURE(*status
)) {
5067 if(primaries
== NULL
&& allocateSKBuffer
== TRUE
) {
5068 primaries
= *result
= prim
;
5069 resultLength
= UCOL_PRIMARY_MAX_BUFFER
;
5072 uint32_t secSize
= UCOL_SECONDARY_MAX_BUFFER
, terSize
= UCOL_TERTIARY_MAX_BUFFER
;
5074 uint32_t sortKeySize
= 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5076 UChar normBuffer
[UCOL_NORMALIZATION_MAX_BUFFER
];
5077 UChar
*normSource
= normBuffer
;
5078 int32_t normSourceLen
= UCOL_NORMALIZATION_MAX_BUFFER
;
5080 int32_t len
= sourceLength
;
5082 /* If we need to normalize, we'll do it all at once at the beginning! */
5083 if(coll
->normalizationMode
!= UCOL_OFF
&& UNORM_YES
!= unorm_quickCheck(source
, len
, UNORM_FCD
, status
)) {
5084 len
= unorm_internalNormalize(normSource
, normSourceLen
,
5088 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
5089 normSourceLen
= len
;
5090 normSource
= (UChar
*)uprv_malloc(len
*U_SIZEOF_UCHAR
);
5091 if(normSource
== NULL
) {
5092 *status
= U_MEMORY_ALLOCATION_ERROR
;
5095 *status
= U_ZERO_ERROR
;
5096 len
= unorm_internalNormalize(normSource
, normSourceLen
,
5102 if(U_FAILURE(*status
)) {
5105 source
= normSource
;
5109 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5110 if(source
== normSource
) {
5111 s
.flags
&= ~UCOL_ITER_NORM
;
5114 if(resultLength
== 0 || primaries
== NULL
) {
5115 int32_t t
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5116 if(normSource
!= normBuffer
) {
5117 uprv_free(normSource
);
5122 uint8_t *primarySafeEnd
= primaries
+ resultLength
- 2;
5124 uint32_t minBufferSize
= UCOL_MAX_BUFFER
;
5126 uint8_t *primStart
= primaries
;
5127 uint8_t *secStart
= secondaries
;
5128 uint8_t *terStart
= tertiaries
;
5132 uint8_t primary1
= 0;
5133 uint8_t primary2
= 0;
5134 uint8_t secondary
= 0;
5135 uint8_t tertiary
= 0;
5136 uint8_t caseSwitch
= coll
->caseSwitch
;
5137 uint8_t tertiaryMask
= coll
->tertiaryMask
;
5138 int8_t tertiaryAddition
= (int8_t)coll
->tertiaryAddition
;
5139 uint8_t tertiaryTop
= coll
->tertiaryTop
;
5140 uint8_t tertiaryBottom
= coll
->tertiaryBottom
;
5141 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
5143 uint32_t prevBuffSize
= 0;
5145 UBool finished
= FALSE
;
5146 UBool notIsContinuation
= FALSE
;
5148 uint32_t count2
= 0, count3
= 0;
5149 uint8_t leadPrimary
= 0;
5152 for(i
=prevBuffSize
; i
<minBufferSize
; ++i
) {
5154 order
= ucol_IGetNextCE(coll
, &s
, status
);
5160 if(order
== UCOL_NO_MORE_CES
) {
5165 notIsContinuation
= !isContinuation(order
);
5167 if(notIsContinuation
) {
5168 tertiary
= (uint8_t)((order
& tertiaryMask
));
5170 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
5172 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
5173 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
5174 primary1
= (uint8_t)(order
>> 8);
5176 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5177 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5178 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5179 /* regular and simple sortkey calc */
5180 if(primary1
!= UCOL_IGNORABLE
) {
5181 if(notIsContinuation
) {
5182 if(leadPrimary
== primary1
) {
5183 *primaries
++ = primary2
;
5185 if(leadPrimary
!= 0) {
5186 *primaries
++ = (uint8_t)((primary1
> leadPrimary
) ? UCOL_BYTE_UNSHIFTED_MAX
: UCOL_BYTE_UNSHIFTED_MIN
);
5188 if(primary2
== UCOL_IGNORABLE
) {
5189 /* one byter, not compressed */
5190 *primaries
++ = primary1
;
5192 } else if(primary1
<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
||
5193 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5194 (primary1
> (*UCAconsts
->UCA_LAST_NON_VARIABLE
>>24) && primary1
< (*UCAconsts
->UCA_FIRST_IMPLICIT
>>24))) {
5195 /* not compressible */
5197 *primaries
++ = primary1
;
5198 *primaries
++ = primary2
;
5199 } else { /* compress */
5200 *primaries
++ = leadPrimary
= primary1
;
5201 *primaries
++ = primary2
;
5204 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5205 *primaries
++ = primary1
;
5206 if(primary2
!= UCOL_IGNORABLE
) {
5207 *primaries
++ = primary2
; /* second part */
5212 if(secondary
> 0) { /* I think that != 0 test should be != IGNORABLE */
5213 /* This is compression code. */
5214 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
5218 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
5219 while (count2
> UCOL_TOP_COUNT2
) {
5220 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
);
5221 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
5223 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1));
5225 while (count2
> UCOL_BOT_COUNT2
) {
5226 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5227 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5229 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5233 *secondaries
++ = secondary
;
5237 if(notIsContinuation
) {
5238 tertiary
^= caseSwitch
;
5242 /* This is compression code. */
5243 /* sequence size check is included in the if clause */
5244 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
5247 if(tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
) {
5248 tertiary
+= tertiaryAddition
;
5249 } else if (tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
) {
5250 tertiary
-= tertiaryAddition
;
5253 if ((tertiary
> tertiaryCommon
)) {
5254 while (count3
> coll
->tertiaryTopCount
) {
5255 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5256 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5258 *tertiaries
++ = (uint8_t)(tertiaryTop
- (count3
-1));
5260 while (count3
> coll
->tertiaryBottomCount
) {
5261 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5262 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5264 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5268 *tertiaries
++ = tertiary
;
5272 if(primaries
> primarySafeEnd
) { /* We have stepped over the primary buffer */
5273 if(allocateSKBuffer
== FALSE
) { /* need to save our butts if we cannot reallocate */
5274 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5275 if(source
== normSource
) {
5276 s
.flags
&= ~UCOL_ITER_NORM
;
5278 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5279 *status
= U_BUFFER_OVERFLOW_ERROR
;
5282 } else { /* It's much nicer if we can actually reallocate */
5283 int32_t sks
= sortKeySize
+(primaries
- primStart
)+(secondaries
- secStart
)+(tertiaries
- terStart
);
5284 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sks
, status
);
5285 if(U_SUCCESS(*status
)) {
5286 *result
= primStart
;
5287 primarySafeEnd
= primStart
+ resultLength
- 2;
5289 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5290 if(source
== normSource
) {
5291 s
.flags
&= ~UCOL_ITER_NORM
;
5293 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5303 prevBuffSize
= minBufferSize
;
5304 secStart
= reallocateBuffer(&secondaries
, secStart
, second
, &secSize
, 2*secSize
, status
);
5305 terStart
= reallocateBuffer(&tertiaries
, terStart
, tert
, &terSize
, 2*terSize
, status
);
5307 if(U_FAILURE(*status
)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5308 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5309 if(source
== normSource
) {
5310 s
.flags
&= ~UCOL_ITER_NORM
;
5312 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5318 if(U_SUCCESS(*status
)) {
5319 sortKeySize
+= (primaries
- primStart
);
5320 /* we have done all the CE's, now let's put them together to form a key */
5322 while (count2
> UCOL_BOT_COUNT2
) {
5323 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5324 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5326 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5328 uint32_t secsize
= secondaries
-secStart
;
5329 sortKeySize
+= secsize
;
5330 if(sortKeySize
<= resultLength
) {
5331 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5332 uprv_memcpy(primaries
, secStart
, secsize
);
5333 primaries
+= secsize
;
5335 if(allocateSKBuffer
== TRUE
) {
5336 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5337 if(U_SUCCESS(*status
)) {
5338 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5339 *result
= primStart
;
5340 uprv_memcpy(primaries
, secStart
, secsize
);
5343 *status
= U_BUFFER_OVERFLOW_ERROR
;
5348 if (coll
->tertiaryCommon
!= UCOL_COMMON3_NORMAL
) {
5349 while (count3
>= coll
->tertiaryTopCount
) {
5350 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5351 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5353 *tertiaries
++ = (uint8_t)(tertiaryTop
- count3
);
5355 while (count3
> coll
->tertiaryBottomCount
) {
5356 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5357 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5359 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5362 uint32_t tersize
= tertiaries
- terStart
;
5363 sortKeySize
+= tersize
;
5364 if(sortKeySize
<= resultLength
) {
5365 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5366 uprv_memcpy(primaries
, terStart
, tersize
);
5367 primaries
+= tersize
;
5369 if(allocateSKBuffer
== TRUE
) {
5370 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5371 if(U_SUCCESS(*status
)) {
5372 *result
= primStart
;
5373 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5374 uprv_memcpy(primaries
, terStart
, tersize
);
5377 *status
= U_MEMORY_ALLOCATION_ERROR
;
5381 *(primaries
++) = '\0';
5384 if(terStart
!= tert
) {
5385 uprv_free(terStart
);
5386 uprv_free(secStart
);
5389 if(normSource
!= normBuffer
) {
5390 uprv_free(normSource
);
5393 if(allocateSKBuffer
== TRUE
) {
5394 *result
= (uint8_t*)uprv_malloc(sortKeySize
);
5396 if (*result
== NULL
) {
5397 *status
= U_MEMORY_ALLOCATION_ERROR
;
5400 uprv_memcpy(*result
, primStart
, sortKeySize
);
5401 if(primStart
!= prim
) {
5402 uprv_free(primStart
);
// Tests whether the collation element CE is to be handled as shifted with
// respect to the boundary value LVT, and maintains the in/out flag
// *wasShifted across successive CEs.
//   CE         - 32-bit collation element; its top byte (bits 24..31) is
//                read as the first primary byte.
//   LVT        - boundary compared against the upper 16 bits of CE; when it
//                is 0 only the last alternative of the test below can match.
//   wasShifted - in/out: whether the previously examined CE was shifted.
// NOTE(review): the return statements, the else branch header and the
// closing braces are not visible in this excerpt; presumably the matching
// branch returns TRUE and the other returns FALSE - confirm in the full file.
5410 UBool
isShiftedCE(uint32_t CE
, uint32_t LVT
, UBool
*wasShifted
) {
5411 UBool notIsContinuation
= !isContinuation(CE
);
// First primary byte of the CE.
5412 uint8_t primary1
= (uint8_t)((CE
>> 24) & 0xFF);
// The test has the shape  LVT && (A || B) || C  where
//   A: a non-continuation CE whose upper 16 bits are <= LVT and whose first
//      primary byte is non-zero,
//   B: a continuation CE that follows a shifted CE,
//   C: a CE with a zero first primary byte that follows a shifted CE.
5413 if(LVT
&& ((notIsContinuation
&& (CE
& 0xFFFF0000)<= LVT
&& primary1
> 0)
5414 || (!notIsContinuation
&& *wasShifted
))
5415 || (*wasShifted
&& primary1
== 0)) { /* amendment to the UCA says that primary ignorables */
5416 // The stuff below should probably be in the sortkey code... maybe not...
5417 if(primary1
!= 0) { /* if we were shifted and we got an ignorable code point */
5418 /* we should just completely ignore it */
5422 //*wasShifted = TRUE;
// Clears the flag; presumably this belongs to the not-shifted branch, whose
// opening line is not visible in this excerpt.
5425 *wasShifted
= FALSE
;
// Writes one UCOL_LEVELTERMINATOR byte into dest when the current level is
// below the highest level being produced; otherwise writes nothing.
//   level    - level that has just been finished.
//   maxLevel - highest level of the partial sort key; no terminator is
//              written once level reaches it.
//   i        - write index into dest, taken by reference and advanced by one
//              when a byte is written.
//   dest     - output buffer; no capacity check is made here, so the caller
//              has to guarantee that dest[i] is writable.
5430 void terminatePSKLevel(int32_t level
, int32_t maxLevel
, int32_t &i
, uint8_t *dest
) {
5431 if(level
< maxLevel
) {
5432 dest
[i
++] = UCOL_LEVELTERMINATOR
;
5438 /** enumeration of level identifiers for partial sort key generation */
// These values are what ucol_nextSortKeyPart keeps in the three-bit level
// field of state[1] (UCOL_PSK_LEVEL_SHIFT / UCOL_PSK_LEVEL_MASK).
// NOTE(review): the enum opener and the entry with value 2 are not visible
// in this excerpt; ucol_nextSortKeyPart refers to UCOL_PSK_CASE, which
// presumably occupies that slot - confirm in the full file.
5440 UCOL_PSK_PRIMARY
= 0,
5441 UCOL_PSK_SECONDARY
= 1,
5443 UCOL_PSK_TERTIARY
= 3,
5444 UCOL_PSK_QUATERNARY
= 4,
5445 UCOL_PSK_QUIN
= 5, /** This is an extra level, not used - but we have three bits to blow */
5446 UCOL_PSK_IDENTICAL
= 6,
5447 UCOL_PSK_NULL
= 7, /** level for the end of sort key. Will just produce zeros */
5451 /** collation state enum. *_SHIFT value is how much to shift right
5452 * to get the state piece to the right. *_MASK value should be
5453 * ANDed with the shifted state. This data is stored in state[1]
5457 UCOL_PSK_LEVEL_SHIFT
= 0, /** level identifier. Stores an enum value from above */
5458 UCOL_PSK_LEVEL_MASK
= 7, /** three bits */
5459 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
= 3, /** number of bytes of primary or quaternary already written */
5460 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
= 1,
5461 /** can be only 0 or 1, since we get up to two bytes from primary or quaternary
5462 * This field is also used to denote that the French secondary level is finished
5464 UCOL_PSK_WAS_SHIFTED_SHIFT
= 4,/** was the last value shifted */
5465 UCOL_PSK_WAS_SHIFTED_MASK
= 1, /** can be 0 or 1 (Boolean) */
5466 UCOL_PSK_USED_FRENCH_SHIFT
= 5,/** how many French bytes have we already written */
5467 UCOL_PSK_USED_FRENCH_MASK
= 3, /** up to 4 bytes. See comment just below */
5468 /** When we do French we need to reverse secondary values. However, continuations
5469 * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
// Ten-bit field starting at bit 7: number of expansion CEs already consumed,
// or number of bocu bytes already written when on the identical level.
5471 UCOL_PSK_USED_ELEMENTS_SHIFT
= 7,
5472 UCOL_PSK_USED_ELEMENTS_MASK
= 0x3FF,
// Fifteen-bit field starting at bit 17: number of iterator moves to repeat
// when resuming from the saved iterator state.
5473 UCOL_PSK_ITER_SKIP_SHIFT
= 17,
5474 UCOL_PSK_ITER_SKIP_MASK
= 0x7FFF
5478 /** main sortkey part procedure. On the first call,
5479 * you should pass in a collator, an iterator, empty state
5480 * state[0] == state[1] == 0, a buffer to hold results
5481 * number of bytes you need and an error code pointer.
5482 * Make sure your buffer is big enough to hold the wanted
5483 * number of sortkey bytes. I don't check.
5484 * The only meaningful status you can get back is
5485 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5486 * have been dealt a raw deal and that you probably won't
5487 * be able to use partial sortkey generation for this
5488 * particular combination of string and collator. This
5489 * is highly unlikely, but you should still check the error code.
5490 * Any other status means that you're not in a sane situation
5491 * anymore. After the first call, preserve state values and
5492 * use them on subsequent calls to obtain more bytes of a sortkey.
5493 * Use until the number of bytes written is smaller than the requested
5494 * number of bytes. Generated sortkey is not compatible with the
5495 * one generated by ucol_getSortKey, as we don't do any compression.
5496 * However, levels are still terminated by a 1 (one) and the sortkey
5497 * is terminated by a 0 (zero). Identical level is the same as in the
5498 * regular sortkey - internal bocu-1 implementation is used.
5499 * For the curious, although you cannot do much about this, here is
5500 * the structure of the state words.
5501 * state[0] - iterator state. Depends on the iterator implementation,
5502 * but allows the iterator to continue where it stopped in
5503 * the last iteration.
5504 * state[1] - collation processing state. Here is the distribution
5506 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5507 * quaternary, quin (we don't use this one), identical and
5508 * null (producing only zeroes - first one to terminate the
5509 * sortkey and subsequent to fill the buffer).
5510 * 3 - byte count. Number of bytes written on the primary level.
5511 * 4 - was shifted. Whether the previous iteration finished in the
5513 * 5, 6 - French continuation bytes written. See the comment in the enum
5514 * 7..16 - Used elements. Number of CEs that were already used from the
5515 * expansion buffer or number of bytes from a bocu sequence on
5516 * the identical level.
5517 * 17..31 - iterator skip. Number of move operations iterator needs to
5518 * skip from the current state in order to continue. This is used
5519 * only if normalization is turned on, since the normalizing iterator
5520 * can return undefined state, which means that it's in the middle
5521 * of normalizing sequence.
5523 U_CAPI
int32_t U_EXPORT2
5524 ucol_nextSortKeyPart(const UCollator
*coll
,
5525 UCharIterator
*iter
,
5527 uint8_t *dest
, int32_t count
,
5528 UErrorCode
*status
) {
5529 /* error checking */
5530 if(status
==NULL
|| U_FAILURE(*status
)) {
5533 if( coll
==NULL
|| iter
==NULL
||
5535 count
<0 || (count
>0 && dest
==NULL
)
5537 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
5546 /** Setting up situation according to the state we got from the previous iteration */
5547 // The state of the iterator from the previous invocation
5548 uint32_t iterState
= state
[0];
5549 // Has the last iteration ended in the shifted state
5550 UBool wasShifted
= ((state
[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT
) & UCOL_PSK_WAS_SHIFTED_MASK
)?TRUE
:FALSE
;
5551 // What is the current level of the sortkey?
5552 int32_t level
= (state
[1] >> UCOL_PSK_LEVEL_SHIFT
) & UCOL_PSK_LEVEL_MASK
;
5553 // Have we written only one byte from a two byte primary in the previous iteration?
5554 // Also on secondary level - have we finished with the French secondary?
5555 int32_t byteCountOrFrenchDone
= (state
[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
;
5556 // number of bytes in the continuation buffer for French
5557 int32_t usedFrench
= (state
[1] >> UCOL_PSK_USED_FRENCH_SHIFT
) & UCOL_PSK_USED_FRENCH_MASK
;
5558 // Skip the CEs that we got from an extraction
5559 // and delivered in the previous call
5560 int32_t usedElements
= (state
[1] >> UCOL_PSK_USED_ELEMENTS_SHIFT
) & UCOL_PSK_USED_ELEMENTS_MASK
;
5561 // Number of times to skip because the iterator returned
5562 // UITER_NO_STATE when it was stopped in the last iteration, so we had to save the
5563 // last valid state.
5564 int32_t iterSkips
= (state
[1] >> UCOL_PSK_ITER_SKIP_SHIFT
) & UCOL_PSK_ITER_SKIP_MASK
;
5566 /** values that depend on the collator attributes */
5567 // strength of the collator.
5568 int32_t strength
= ucol_getAttribute(coll
, UCOL_STRENGTH
, status
);
5569 // maximal level of the partial sortkey. Need to take whether case level is done
5570 int32_t maxLevel
= 0;
5571 if(strength
< UCOL_TERTIARY
) {
5572 if(ucol_getAttribute(coll
, UCOL_CASE_LEVEL
, status
) == UCOL_ON
) {
5573 maxLevel
= UCOL_PSK_CASE
;
5575 maxLevel
= strength
;
5578 if(strength
== UCOL_TERTIARY
) {
5579 maxLevel
= UCOL_PSK_TERTIARY
;
5580 } else if(strength
== UCOL_QUATERNARY
) {
5581 maxLevel
= UCOL_PSK_QUATERNARY
;
5582 } else { // identical
5583 maxLevel
= UCOL_IDENTICAL
;
5586 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5587 uint8_t UCOL_HIRAGANA_QUAD
=
5588 (ucol_getAttribute(coll
, UCOL_HIRAGANA_QUATERNARY_MODE
, status
) == UCOL_ON
)?0xFE:0xFF;
5589 // Boundary value that decides whether a CE is shifted or not
5590 uint32_t LVT
= (coll
->alternateHandling
== UCOL_SHIFTED
)?(coll
->variableTopValue
<<16):0;
5591 // Are we doing French collation?
5592 UBool doingFrench
= (ucol_getAttribute(coll
, UCOL_FRENCH_COLLATION
, status
) == UCOL_ON
);
5594 /** initializing the collation state */
5595 UBool notIsContinuation
= FALSE
;
5596 uint32_t CE
= UCOL_NO_MORE_CES
;
5599 IInit_collIterate(coll
, NULL
, -1, &s
);
5601 s
.flags
|= UCOL_USE_ITERATOR
;
5602 // This variable tells us whether we have produced some other levels in this iteration
5603 // before we moved to the identical level. In that case, we need to switch the
5604 // type of the iterator.
5605 UBool doingIdenticalFromStart
= FALSE
;
5606 // Normalizing iterator
5607 // The division for the array length may truncate the array size to
5608 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5609 // for all platforms anyway.
5610 UAlignedMemory stackNormIter
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
5611 UNormIterator
*normIter
= NULL
;
5612 // If the normalization is turned on for the collator and we are below identical level
5613 // we will use a FCD normalizing iterator
5614 if(ucol_getAttribute(coll
, UCOL_NORMALIZATION_MODE
, status
) == UCOL_ON
&& level
< UCOL_PSK_IDENTICAL
) {
5615 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
5616 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_FCD
, status
);
5617 s
.flags
&= ~UCOL_ITER_NORM
;
5618 if(U_FAILURE(*status
)) {
5621 } else if(level
== UCOL_PSK_IDENTICAL
) {
5622 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5623 // will be updating the state - and this cannot be done on an ordinary iterator.
5624 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
5625 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
5626 s
.flags
&= ~UCOL_ITER_NORM
;
5627 if(U_FAILURE(*status
)) {
5630 doingIdenticalFromStart
= TRUE
;
5633 // This is the tentative new state of the iterator. The problem
5634 // is that the iterator might return an undefined state, in
5635 // which case we should save the last valid state and increase
5636 // the iterator skip value.
5637 uint32_t newState
= 0;
5639 // First, we set the iterator to the last valid position
5640 // from the last iteration. This was saved in state[0].
5641 if(iterState
== 0) {
5643 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
&& !byteCountOrFrenchDone
) {
5644 s
.iterator
->move(s
.iterator
, 0, UITER_LIMIT
);
5646 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5649 /* reset to previous state */
5650 s
.iterator
->setState(s
.iterator
, iterState
, status
);
5651 if(U_FAILURE(*status
)) {
5656 // Then, we may have to move more, if the normalizing iterator
5657 // was going through a normalizing sequence.
5659 // if we are on secondary level AND we do French, we need to go backward instead of forward
5660 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
) {
5661 s
.iterator
->move(s
.iterator
, -iterSkips
, UITER_CURRENT
);
5663 s
.iterator
->move(s
.iterator
, iterSkips
, UITER_CURRENT
);
5668 // Number of expansion CEs that were already consumed in the
5669 // previous iteration for the last code point processed. We
5670 // want to clean out the expansion buffer, so that we can
5671 // get correct CEs. This value is persistent over iterations,
5672 // since we can have several iterations on the one expansion
5674 int32_t consumedExpansionCEs
= usedElements
;
5675 // Number of bytes already written from a bocsu sequence. Since
5676 // the longest bocsu sequence is 4 long, this can be up to 3. It
5677 // shares the state field with consumedExpansionCEs value, since
5678 // they cannot simultaneously appear on the same level
5679 int32_t bocsuBytesUsed
= 0;
5680 // Clean out the expansion buffer unless we are on
5681 // identical level. In that case we use this field
5682 // to store the number of bytes already written
5683 // from the previous bocsu sequence.
5684 if(level
< UCOL_PSK_IDENTICAL
&& usedElements
!= 0) {
5685 while(usedElements
-->0) {
5686 // If we're doing French and we are on the secondary level,
5688 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
) {
5689 CE
= ucol_IGetPrevCE(coll
, &s
, status
);
5691 CE
= ucol_IGetNextCE(coll
, &s
, status
);
5693 if(CE
==UCOL_NO_MORE_CES
) {
5694 /* should not happen */
5695 *status
=U_INTERNAL_PROGRAM_ERROR
;
5700 bocsuBytesUsed
= usedElements
;
5703 // This variable prevents the adjusting of iterator
5704 // skip variable when we are the first time on a
5705 // level. I hope there is a better way to do it, but
5706 // I could not think of it.
5707 UBool firstTimeOnLevel
= TRUE
;
5708 // French secondary needs to know whether the iterator state of zero came from previous level OR
5709 // from a new invocation...
5710 UBool wasDoingPrimary
= FALSE
;
5711 // Case level is kind of goofy. This variable tells us that
5712 // we are still not done with the case level.
5713 UBool dontAdvanceIteratorBecauseWeNeedALevelTerminator
= FALSE
;
5714 // destination buffer byte counter. When this guy
5715 // gets to count, we're done with the iteration
5717 // used to count the zero bytes written after we
5718 // have finished with the sort key
5722 // Hm.... I think we're ready to plunge in. Basic story is as following:
5723 // we have a fall through case based on level. This is used for initial
5724 // positioning on iteration start. Every level processor contains a
5725 // for(;;) which will be broken when we exhaust all the CEs. Other
5726 // way to exit is a goto saveState, which happens when we have filled
5729 case UCOL_PSK_PRIMARY
:
5730 wasDoingPrimary
= TRUE
;
5735 // We should save the state only if we
5736 // are sure that we are done with the
5737 // previous iterator state
5738 if(consumedExpansionCEs
== 0 && byteCountOrFrenchDone
== 0) {
5739 newState
= s
.iterator
->getState(s
.iterator
);
5740 if(newState
!= UITER_NO_STATE
) {
5741 iterState
= newState
;
5744 if(!firstTimeOnLevel
&& !byteCountOrFrenchDone
) {
5749 firstTimeOnLevel
= FALSE
;
5750 CE
= ucol_IGetNextCE(coll
, &s
, status
);
5751 if(CE
==UCOL_NO_MORE_CES
) {
5752 // Add the level separator
5753 terminatePSKLevel(level
, maxLevel
, i
, dest
);
5754 byteCountOrFrenchDone
=0;
5755 // Restart the iteration and move to the
5757 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5758 level
= UCOL_PSK_SECONDARY
;
5761 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
5762 CE
>>= UCOL_PRIMARYORDERSHIFT
; /* get primary */
5764 if(byteCountOrFrenchDone
== 0) {
5765 // get the second byte of primary
5766 dest
[i
++]=(uint8_t)(CE
>> 8);
5768 byteCountOrFrenchDone
= 0;
5770 if((CE
&=0xff)!=0) {
5773 byteCountOrFrenchDone
=1;
5776 dest
[i
++]=(uint8_t)CE
;
5780 if(s
.CEpos
- s
.toReturn
|| (s
.pos
&& *s
.pos
!= 0)) {
5781 // s.pos != NULL means there is a normalization buffer in effect
5782 // in iterative case, this means that we are doing Thai (maybe discontiguous)
5783 consumedExpansionCEs
++;
5785 consumedExpansionCEs
= 0;
5787 if(s
.pos
&& *s
.pos
== 0) {
5788 // maybe it is the end of Thai - we have to have
5793 /* fall through to next level */
5794 case UCOL_PSK_SECONDARY
:
5795 if(strength
>= UCOL_SECONDARY
) {
5801 // We should save the state only if we
5802 // are sure that we are done with the
5803 // previous iterator state
5804 if(consumedExpansionCEs
== 0) {
5805 newState
= s
.iterator
->getState(s
.iterator
);
5806 if(newState
!= UITER_NO_STATE
) {
5807 iterState
= newState
;
5810 if(!firstTimeOnLevel
) {
5815 firstTimeOnLevel
= FALSE
;
5816 CE
= ucol_IGetNextCE(coll
, &s
, status
);
5817 if(CE
==UCOL_NO_MORE_CES
) {
5818 // Add the level separator
5819 terminatePSKLevel(level
, maxLevel
, i
, dest
);
5820 byteCountOrFrenchDone
=0;
5821 // Restart the iteration and move to the
5823 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5824 level
= UCOL_PSK_CASE
;
5827 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
5828 CE
>>= 8; /* get secondary */
5830 dest
[i
++]=(uint8_t)CE
;
5833 if(s
.CEpos
- s
.toReturn
|| (s
.pos
&& *s
.pos
!= 0)) {
5834 consumedExpansionCEs
++;
5836 consumedExpansionCEs
= 0;
5838 if(s
.pos
&& *s
.pos
== 0) {
5842 } else { // French secondary processing
5843 uint8_t frenchBuff
[UCOL_MAX_BUFFER
];
5844 int32_t frenchIndex
= 0;
5845 // Here we are going backwards.
5846 // If the iterator is at the beginning, it should be
5848 if(wasDoingPrimary
) {
5849 s
.iterator
->move(s
.iterator
, 0, UITER_LIMIT
);
5855 if(consumedExpansionCEs
== 0) {
5856 newState
= s
.iterator
->getState(s
.iterator
);
5857 if(newState
!= UITER_NO_STATE
) {
5858 iterState
= newState
;
5861 if(!firstTimeOnLevel
) {
5866 firstTimeOnLevel
= FALSE
;
5867 CE
= ucol_IGetPrevCE(coll
, &s
, status
);
5868 if(CE
==UCOL_NO_MORE_CES
) {
5869 // Add the level separator
5870 terminatePSKLevel(level
, maxLevel
, i
, dest
);
5871 byteCountOrFrenchDone
=0;
5872 // Restart the iteration and move to the next level
5873 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5874 level
= UCOL_PSK_CASE
;
5877 if(isContinuation(CE
)) { // if it's a continuation, we want to save it and
5878 // reverse when we get a first non-continuation CE.
5880 frenchBuff
[frenchIndex
++] = (uint8_t)CE
;
5881 } else if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
5882 CE
>>= 8; /* get secondary */
5885 dest
[i
++]=(uint8_t)CE
;
5888 frenchBuff
[frenchIndex
++] = (uint8_t)CE
;
5889 frenchIndex
-= usedFrench
;
5891 while(i
< count
&& frenchIndex
) {
5892 dest
[i
++] = frenchBuff
[--frenchIndex
];
5897 if(s
.CEpos
- s
.toReturn
|| (s
.pos
&& *s
.pos
!= 0)) {
5898 consumedExpansionCEs
++;
5900 consumedExpansionCEs
= 0;
5902 if(s
.pos
&& *s
.pos
== 0) {
5908 level
= UCOL_PSK_CASE
;
5910 /* fall through to next level */
5912 if(ucol_getAttribute(coll
, UCOL_CASE_LEVEL
, status
) == UCOL_ON
) {
5913 uint32_t caseShift
= UCOL_CASE_SHIFT_START
;
5914 uint8_t caseByte
= UCOL_CASE_BYTE_START
;
5915 uint8_t caseBits
= 0;
5921 // We should save the state only if we
5922 // are sure that we are done with the
5923 // previous iterator state
5924 if(consumedExpansionCEs
== 0) {
5925 newState
= s
.iterator
->getState(s
.iterator
);
5926 if(newState
!= UITER_NO_STATE
) {
5927 iterState
= newState
;
5930 if(!firstTimeOnLevel
) {
5935 firstTimeOnLevel
= FALSE
;
5936 CE
= ucol_IGetNextCE(coll
, &s
, status
);
5937 if(CE
==UCOL_NO_MORE_CES
) {
5938 // On the case level we might have an unfinished
5939 // case byte. Add one if it's started.
5940 if(caseShift
!= UCOL_CASE_SHIFT_START
) {
5941 dest
[i
++] = caseByte
;
5943 // This is kind of tricky - situation where
5944 // we need to keep the iterator in the old
5945 // state, but don't need to bring anything
5946 // to the next invocation
5948 // Add the level separator
5949 terminatePSKLevel(level
, maxLevel
, i
, dest
);
5950 // Restart the iteration and move to the
5952 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5953 level
= UCOL_PSK_TERTIARY
;
5955 dontAdvanceIteratorBecauseWeNeedALevelTerminator
= TRUE
;
5960 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
5961 if(!isContinuation(CE
)) {
5962 CE
= (uint8_t)(CE
& UCOL_BYTE_SIZE_MASK
);
5963 caseBits
= (uint8_t)(CE
& 0xC0);
5964 // this copies the case level logic from the
5965 // sort key generation code
5967 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
5968 if((caseBits
& 0xC0) == 0) {
5969 caseByte
|= 1 << (--caseShift
);
5971 caseByte
|= 0 << (--caseShift
);
5973 if(caseShift
== 0) {
5974 dest
[i
++] = caseByte
;
5975 caseShift
= UCOL_CASE_SHIFT_START
;
5976 caseByte
= UCOL_CASE_BYTE_START
;
5978 caseByte
|= ((caseBits
>>6)&1) << (--caseShift
);
5981 if((caseBits
& 0xC0) == 0) {
5982 caseByte
|= 0 << (--caseShift
);
5984 caseByte
|= 1 << (--caseShift
);
5986 if(caseShift
== 0) {
5987 dest
[i
++] = caseByte
;
5988 caseShift
= UCOL_CASE_SHIFT_START
;
5989 caseByte
= UCOL_CASE_BYTE_START
;
5991 caseByte
|= ((caseBits
>>7)&1) << (--caseShift
);
5998 // Not sure this is correct for the case level - revisit
5999 if(s
.CEpos
- s
.toReturn
|| (s
.pos
&& *s
.pos
!= 0)) {
6000 consumedExpansionCEs
++;
6002 consumedExpansionCEs
= 0;
6004 if(s
.pos
&& *s
.pos
== 0) {
6009 level
= UCOL_PSK_TERTIARY
;
6011 /* fall through to next level */
6012 case UCOL_PSK_TERTIARY
:
6013 if(strength
>= UCOL_TERTIARY
) {
6018 // We should save the state only if we
6019 // are sure that we are done with the
6020 // previous iterator state
6021 if(consumedExpansionCEs
== 0) {
6022 newState
= s
.iterator
->getState(s
.iterator
);
6023 if(newState
!= UITER_NO_STATE
) {
6024 iterState
= newState
;
6027 if(!firstTimeOnLevel
) {
6032 firstTimeOnLevel
= FALSE
;
6033 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6034 if(CE
==UCOL_NO_MORE_CES
) {
6035 // Add the level separator
6036 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6037 byteCountOrFrenchDone
=0;
6038 // Restart the iteration and move to the
6040 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6041 level
= UCOL_PSK_QUATERNARY
;
6044 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6045 notIsContinuation
= !isContinuation(CE
);
6047 if(notIsContinuation
) {
6048 CE
= (uint8_t)(CE
& UCOL_BYTE_SIZE_MASK
);
6049 CE
^= coll
->caseSwitch
;
6050 CE
&= coll
->tertiaryMask
;
6052 CE
= (uint8_t)((CE
& UCOL_REMOVE_CONTINUATION
));
6056 dest
[i
++]=(uint8_t)CE
;
6059 if(s
.CEpos
- s
.toReturn
|| (s
.pos
&& *s
.pos
!= 0)) {
6060 consumedExpansionCEs
++;
6062 consumedExpansionCEs
= 0;
6064 if(s
.pos
&& *s
.pos
== 0) {
6069 // if we're not doing tertiary
6071 level
= UCOL_PSK_NULL
;
6073 /* fall through to next level */
6074 case UCOL_PSK_QUATERNARY
:
6075 if(strength
>= UCOL_QUATERNARY
) {
6080 // We should save the state only if we
6081 // are sure that we are done with the
6082 // previous iterator state
6083 if(consumedExpansionCEs
== 0) {
6084 newState
= s
.iterator
->getState(s
.iterator
);
6085 if(newState
!= UITER_NO_STATE
) {
6086 iterState
= newState
;
6089 if(!firstTimeOnLevel
) {
6094 firstTimeOnLevel
= FALSE
;
6095 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6096 if(CE
==UCOL_NO_MORE_CES
) {
6097 // Add the level separator
6098 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6099 //dest[i++] = UCOL_LEVELTERMINATOR;
6100 byteCountOrFrenchDone
=0;
6101 // Restart the iteration and move to the
6103 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6104 level
= UCOL_PSK_QUIN
;
6107 if(isShiftedCE(CE
, LVT
, &wasShifted
)) {
6108 CE
>>= 16; /* get primary */
6110 if(byteCountOrFrenchDone
== 0) {
6111 dest
[i
++]=(uint8_t)(CE
>> 8);
6113 byteCountOrFrenchDone
= 0;
6115 if((CE
&=0xff)!=0) {
6118 byteCountOrFrenchDone
=1;
6121 dest
[i
++]=(uint8_t)CE
;
6125 notIsContinuation
= !isContinuation(CE
);
6126 if(notIsContinuation
) {
6127 if(s
.flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
6128 dest
[i
++] = UCOL_HIRAGANA_QUAD
;
6134 if(s
.CEpos
- s
.toReturn
|| (s
.pos
&& *s
.pos
!= 0)) {
6135 consumedExpansionCEs
++;
6137 consumedExpansionCEs
= 0;
6139 if(s
.pos
&& *s
.pos
== 0) {
6144 // if we're not doing quaternary
6146 level
= UCOL_PSK_NULL
;
6148 /* fall through to next level */
6150 level
= UCOL_PSK_IDENTICAL
;
6151 /* fall through to next level */
6152 case UCOL_PSK_IDENTICAL
:
6153 if(strength
>= UCOL_IDENTICAL
) {
6154 UChar32 first
, second
;
6155 int32_t bocsuBytesWritten
= 0;
6156 // We always need to do identical on
6157 // the NFD form of the string.
6158 if(normIter
== NULL
) {
6159 // we arrived from the level below and
6160 // normalization was not turned on.
6161 // therefore, we need to make a fresh NFD iterator
6162 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
6163 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
6164 } else if(!doingIdenticalFromStart
) {
6165 // there is an iterator, but we did some other levels.
6166 // therefore, we have a FCD iterator - need to make
6168 // normIter being at the beginning does not guarantee
6169 // that the underlying iterator is at the beginning
6170 iter
->move(iter
, 0, UITER_START
);
6171 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
6173 // At this point we have a NFD iterator that is positioned
6174 // in the right place
6175 if(U_FAILURE(*status
)) {
6178 first
= uiter_previous32(s
.iterator
);
6179 // maybe we're at the start of the string
6180 if(first
== U_SENTINEL
) {
6183 uiter_next32(s
.iterator
);
6189 if(j
+1 < bocsuBytesWritten
) {
6190 bocsuBytesUsed
= j
+1;
6195 // On identical level, we will always save
6196 // the state if we reach this point, since
6197 // we don't depend on getNextCE for content
6198 // all the content is in our buffer and we
6199 // already either stored the full buffer OR
6200 // otherwise we won't arrive here.
6201 newState
= s
.iterator
->getState(s
.iterator
);
6202 if(newState
!= UITER_NO_STATE
) {
6203 iterState
= newState
;
6210 second
= uiter_next32(s
.iterator
);
6212 // end condition for identical level
6213 if(second
== U_SENTINEL
) {
6214 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6215 level
= UCOL_PSK_NULL
;
6218 bocsuBytesWritten
= u_writeIdenticalLevelRunTwoChars(first
, second
, buff
);
6222 if(bocsuBytesUsed
!= 0) {
6223 while(bocsuBytesUsed
-->0) {
6228 while(i
< count
&& j
< bocsuBytesWritten
) {
6229 dest
[i
++] = buff
[j
++];
6234 level
= UCOL_PSK_NULL
;
6236 /* fall through to next level */
6244 *status
= U_INTERNAL_PROGRAM_ERROR
;
6249 // Now we need to return stuff. First we want to see whether we have
6250 // done everything for the current state of iterator.
6251 if(consumedExpansionCEs
|| byteCountOrFrenchDone
6252 || dontAdvanceIteratorBecauseWeNeedALevelTerminator
) {
6253 // Any of above mean that the previous transaction
6254 // wasn't finished and that we should store the
6255 // previous iterator state.
6256 state
[0] = iterState
;
6258 // The transaction is complete. We will continue in
6260 if((newState
= s
.iterator
->getState(s
.iterator
))!= UITER_NO_STATE
) {
6261 state
[0] = s
.iterator
->getState(s
.iterator
);
6264 state
[0] = iterState
;
6268 // Store the number of elements processed. On CE levels, this is
6269 // the number of expansion CEs processed. On identical level, this
6270 // is the number of bocsu bytes written.
6271 if(level
< UCOL_PSK_IDENTICAL
) {
6272 if((consumedExpansionCEs
& UCOL_PSK_USED_ELEMENTS_MASK
) != consumedExpansionCEs
) {
6273 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6275 state
[1] = (consumedExpansionCEs
& UCOL_PSK_USED_ELEMENTS_MASK
) << UCOL_PSK_USED_ELEMENTS_SHIFT
;
6277 if((bocsuBytesUsed
& UCOL_PSK_USED_ELEMENTS_MASK
) != bocsuBytesUsed
) {
6278 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6280 state
[1] = (bocsuBytesUsed
& UCOL_PSK_USED_ELEMENTS_MASK
) << UCOL_PSK_USED_ELEMENTS_SHIFT
;
6283 // Next we put in the level of comparison
6284 state
[1] |= ((level
& UCOL_PSK_LEVEL_MASK
) << UCOL_PSK_LEVEL_SHIFT
);
6286 // If we are doing French, we need to store whether we have just finished the French level
6287 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
) {
6288 state
[1] |= (((state
[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
);
6290 state
[1] |= ((byteCountOrFrenchDone
& UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
);
6293 // Was the latest CE shifted
6295 state
[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT
;
6297 // Check for iterSkips overflow
6298 if((iterSkips
& UCOL_PSK_ITER_SKIP_MASK
) != iterSkips
) {
6299 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6302 state
[1] |= ((iterSkips
& UCOL_PSK_ITER_SKIP_MASK
) << UCOL_PSK_ITER_SKIP_SHIFT
);
6304 // Check for French overflow
6305 if((usedFrench
& UCOL_PSK_USED_FRENCH_MASK
) != usedFrench
) {
6306 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6308 // Store number of bytes written in the French secondary continuation sequence
6309 state
[1] |= ((usedFrench
& UCOL_PSK_USED_FRENCH_MASK
) << UCOL_PSK_USED_FRENCH_SHIFT
);
6312 // If we have used normalizing iterator, get rid of it
6313 if(normIter
!= NULL
) {
6314 unorm_closeIter(normIter
);
6317 // Return number of meaningful sortkey bytes.
6322 * Produce a bound for a given sortkey and a number of levels.
// Derives a bound key from the sort key in source: scans source for level
// terminators, copies the scanned prefix into result and appends the extra
// bytes selected by boundType followed by a terminating zero byte.
//   source       - sort key to derive the bound from; NULL is rejected with
//                  U_ILLEGAL_ARGUMENT_ERROR.
//   sourceLength - length of source, used as one of the scan limits.
//   boundType    - UCOL_BOUND_LOWER, UCOL_BOUND_UPPER or
//                  UCOL_BOUND_UPPER_LONG; its numeric value doubles as the
//                  number of extra bytes appended.
//   noOfLevels   - number of levels to keep from the key.
//   resultLength - capacity of the result buffer.
//   status       - error code; set to U_SORT_KEY_TOO_SHORT_WARNING when the
//                  key ends while noOfLevels is still positive.
// NOTE(review): the declaration of the result buffer parameter, the opening
// of the scan loop, the statement guarded by the level-terminator test, the
// switch header and several returns are not visible in this excerpt; the
// comments here describe the visible lines only.
6324 U_CAPI
int32_t U_EXPORT2
6325 ucol_getBound(const uint8_t *source
,
6326 int32_t sourceLength
,
6327 UColBoundMode boundType
,
6328 uint32_t noOfLevels
,
6330 int32_t resultLength
,
6331 UErrorCode
*status
) {
6332 // consistency checks
6333 if(status
== NULL
|| U_FAILURE(*status
)) {
6336 if(source
== NULL
) {
6337 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6341 int32_t sourceIndex
= 0;
6342 // Scan the string until we skip enough of the key OR reach the end of the key
6345 if(source
[sourceIndex
] == UCOL_LEVELTERMINATOR
) {
6348 } while (noOfLevels
> 0
// NOTE(review): the check right after this loop treats a zero byte OR
// sourceIndex == sourceLength as the end of the key. The matching continue
// condition would be non-zero byte AND sourceIndex < sourceLength, but the
// two tests are joined with || here, which looks like it lets the scan read
// beyond sourceLength - confirm and consider &&.
6349 && (source
[sourceIndex
] != 0 || sourceIndex
< sourceLength
));
6351 if((source
[sourceIndex
] == 0 || sourceIndex
== sourceLength
)
6352 && noOfLevels
> 0) {
6353 *status
= U_SORT_KEY_TOO_SHORT_WARNING
;
6357 // READ ME: this code assumes that the values for boundType
6358 // enum will not change. They are set so that the enum value
6359 // corresponds to the number of extra bytes each bound type needs.
// NOTE(review): the visible writes are sourceIndex copied bytes, boundType
// extra bytes and one terminating zero, and the last visible return reports
// sourceIndex+boundType+1. The capacity test below accepts
// resultLength == sourceIndex+boundType, which looks one byte short -
// confirm against the full function.
6361 if(result
!= NULL
&& resultLength
>= sourceIndex
+boundType
) {
6362 uprv_memcpy(result
, source
, sourceIndex
);
6364 // Lower bound just gets terminated. No extra bytes
6365 case UCOL_BOUND_LOWER
: // = 0
6367 // Upper bound needs one extra byte
6368 case UCOL_BOUND_UPPER
: // = 1
6369 result
[sourceIndex
++] = 2;
6371 // Upper long bound needs two extra bytes
6372 case UCOL_BOUND_UPPER_LONG
: // = 2
6373 result
[sourceIndex
++] = 0xFF;
6374 result
[sourceIndex
++] = 0xFF;
// Presumably the default case of the switch: an unknown boundType is
// reported as an illegal argument.
6377 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Terminate the produced bound key.
6380 result
[sourceIndex
++] = 0;
// Size reported when the bound was not written into result (presumably the
// else branch of the capacity test above).
6384 return sourceIndex
+boundType
+1;
6389 inline void uprv_appendByteToHexString(char *dst
, uint8_t val
) {
6390 uint32_t len
= (uint32_t)uprv_strlen(dst
);
6391 *(dst
+len
) = T_CString_itosOffset((val
>> 4));
6392 *(dst
+len
+1) = T_CString_itosOffset((val
& 0xF));
// ucol_sortKeyToString: writes a bracketed, human readable hex dump of sortkey
// into buffer. Output starts with "[" and ends with "]"; each key byte is
// appended as two hex digits followed by a space, and levels are separated
// by " . ". A level ends at a 0x01 or 0x00 byte. Levels are printed while
// strength <= UCOL_QUATERNARY and strength <= coll->strength; when
// coll->strength is UCOL_IDENTICAL the remaining bytes up to the zero byte are
// printed as one more level.
// NOTE(review): res_size is initialised to 0 and no visible statement updates
// it before the "res_size > *len" test, and nothing visible bounds the writes
// into buffer - presumably the caller must supply a large enough buffer; confirm.
// NOTE(review): one separator is appended with plain strcat while every other
// append uses uprv_strcat.
6396 /* this function makes a string with representation of a sortkey */
6397 U_CAPI
char* U_EXPORT2
ucol_sortKeyToString(const UCollator
*coll
, const uint8_t *sortkey
, char *buffer
, uint32_t *len
) {
6398 int32_t strength
= UCOL_PRIMARY
;
6399 uint32_t res_size
= 0;
6400 UBool doneCase
= FALSE
;
6402 char *current
= buffer
;
6403 const uint8_t *currentSk
= sortkey
;
6405 uprv_strcpy(current
, "[");
6407 while(strength
<= UCOL_QUATERNARY
&& strength
<= coll
->strength
) {
6408 if(strength
> UCOL_PRIMARY
) {
6409 strcat(current
, " . ");
6411 while(*currentSk
!= 0x01 && *currentSk
!= 0x00) { /* print a level */
6412 uprv_appendByteToHexString(current
, *currentSk
++);
6413 uprv_strcat(current
, " ");
6415 if(coll
->caseLevel
== UCOL_ON
&& strength
== UCOL_SECONDARY
&& doneCase
== FALSE
) {
6417 } else if(coll
->caseLevel
== UCOL_OFF
|| doneCase
== TRUE
|| strength
!= UCOL_SECONDARY
) {
6420 uprv_appendByteToHexString(current
, *currentSk
++); /* This should print '01' */
6421 if(strength
== UCOL_QUATERNARY
&& coll
->alternateHandling
== UCOL_NON_IGNORABLE
) {
6426 if(coll
->strength
== UCOL_IDENTICAL
) {
6427 uprv_strcat(current
, " . ");
6428 while(*currentSk
!= 0) {
6429 uprv_appendByteToHexString(current
, *currentSk
++);
6430 uprv_strcat(current
, " ");
6433 uprv_appendByteToHexString(current
, *currentSk
++);
6435 uprv_strcat(current
, "]");
6437 if(res_size
> *len
) {
6445 /****************************************************************************/
6446 /* Following are the functions that deal with the properties of a collator */
6447 /* there are new APIs and some compatibility APIs */
6448 /****************************************************************************/
// ucol_addLatinOneEntry: folds one collation element CE into the packed
// Latin-1 fast-path table entry of character ch.
// coll->latinOneCEs is used as three consecutive planes of latinOneTableLen
// entries: primaries at [ch], secondaries at [latinOneTableLen+ch] and
// tertiaries at [2*latinOneTableLen+ch].
// The CE is split into primary1 (top byte), primary2, secondary and a
// tertiary byte. For a non-continuation CE the tertiary is masked with
// coll->tertiaryMask and XORed with coll->caseSwitch; for a continuation it is
// masked with UCOL_REMOVE_CONTINUATION and UCOL_REMOVE_CASE.
// Each weight is ORed into its plane shifted by *primShift / *secShift /
// *terShift. If *primShift is already negative when primary2 is to be stored,
// all three planes of ch are set to UCOL_BAIL_OUT_CE.
// With French collation on and a non-continuation CE the secondary plane is
// first shifted right by 8 and the new secondary placed in the top byte, so
// secondaries end up in reverse order.
// NOTE(review): the statements that advance the shift counters are not visible
// in this copy - confirm against the full source.
6451 ucol_addLatinOneEntry(UCollator
*coll
, UChar ch
, uint32_t CE
,
6452 int32_t *primShift
, int32_t *secShift
, int32_t *terShift
) {
6453 uint8_t primary1
= 0, primary2
= 0, secondary
= 0, tertiary
= 0;
6454 UBool reverseSecondary
= FALSE
;
6455 if(!isContinuation(CE
)) {
6456 tertiary
= (uint8_t)((CE
& coll
->tertiaryMask
));
6457 tertiary
^= coll
->caseSwitch
;
6458 reverseSecondary
= TRUE
;
6460 tertiary
= (uint8_t)((CE
& UCOL_REMOVE_CONTINUATION
));
6461 tertiary
&= UCOL_REMOVE_CASE
;
6462 reverseSecondary
= FALSE
;
6465 secondary
= (uint8_t)((CE
>>= 8) & UCOL_BYTE_SIZE_MASK
);
6466 primary2
= (uint8_t)((CE
>>= 8) & UCOL_BYTE_SIZE_MASK
);
6467 primary1
= (uint8_t)(CE
>> 8);
6470 coll
->latinOneCEs
[ch
] |= (primary1
<< *primShift
);
6474 if(*primShift
< 0) {
6475 coll
->latinOneCEs
[ch
] = UCOL_BAIL_OUT_CE
;
6476 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6477 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6480 coll
->latinOneCEs
[ch
] |= (primary2
<< *primShift
);
6483 if(secondary
!= 0) {
6484 if(reverseSecondary
&& coll
->frenchCollation
== UCOL_ON
) { // reverse secondary
6485 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] >>= 8; // make space for secondary
6486 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] |= (secondary
<< 24);
6487 } else { // normal case
6488 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] |= (secondary
<< *secShift
);
6493 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] |= (tertiary
<< *terShift
);
6499 ucol_resizeLatinOneTable(UCollator
*coll
, int32_t size
, UErrorCode
*status
) {
6500 uint32_t *newTable
= (uint32_t *)uprv_malloc(size
*sizeof(uint32_t)*3);
6501 if(newTable
== NULL
) {
6502 *status
= U_MEMORY_ALLOCATION_ERROR
;
6503 coll
->latinOneFailed
= TRUE
;
6506 int32_t sizeToCopy
= ((size
<coll
->latinOneTableLen
)?size
:coll
->latinOneTableLen
)*sizeof(uint32_t);
6507 uprv_memset(newTable
, 0, size
*sizeof(uint32_t)*3);
6508 uprv_memcpy(newTable
, coll
->latinOneCEs
, sizeToCopy
);
6509 uprv_memcpy(newTable
+size
, coll
->latinOneCEs
+coll
->latinOneTableLen
, sizeToCopy
);
6510 uprv_memcpy(newTable
+2*size
, coll
->latinOneCEs
+2*coll
->latinOneTableLen
, sizeToCopy
);
6511 coll
->latinOneTableLen
= size
;
6512 uprv_free(coll
->latinOneCEs
);
6513 coll
->latinOneCEs
= newTable
;
// ucol_setUpLatinOne: (re)builds coll->latinOneCEs, the packed three-plane
// table used by the Latin-1 fast path.
// - If no table exists, one with 3*UCOL_LATINONETABLELEN entries is allocated
//   (U_MEMORY_ALLOCATION_ERROR on failure); the table is then zeroed.
// - For every ch in 0..UCOL_ENDOFLATINONERANGE the CE is read from the
//   collator (latinOneMapping or the trie), falling back to the UCA trie when
//   the trie lookup yields UCOL_NOT_FOUND.
// - A CE below UCOL_NOT_FOUND is added directly with ucol_addLatinOneEntry.
// - Special CEs are dispatched on getCETag(CE). One case feeds every CE
//   returned by the collation element iterator for ch into the entry,
//   writing UCOL_BAIL_OUT_CE into all three planes once any shift counter has
//   gone negative. NOTE(review): its case label is not visible in this copy.
// - CONTRACTION_TAG: bits 0x00FFF000 of the CE must be free (otherwise
//   U_UNSUPPORTED_ERROR); the current contractionOffset is stored there and
//   the CE is written to all three planes of ch. The contraction table is
//   then walked until 0xFFFF; each result (plain CE, expansion, or
//   UCOL_BAIL_OUT_CE for anything else) is stored at index contractionOffset,
//   which starts at UCOL_ENDOFLATINONERANGE+1. The table is doubled through
//   ucol_resizeLatinOneTable when contractionOffset reaches latinOneTableLen.
// - Finally the iterator is closed and the table shrunk to contractionOffset.
// Returns a UBool result initialised to TRUE; the caller treats a true
// return as "table can be used".
// NOTE(review): several statements (returns, breaks, case labels, some
// declarations) are not visible in this copy - confirm details against the
// full source.
6518 ucol_setUpLatinOne(UCollator
*coll
, UErrorCode
*status
) {
6519 UBool result
= TRUE
;
6520 if(coll
->latinOneCEs
== NULL
) {
6521 coll
->latinOneCEs
= (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN
*3);
6522 if(coll
->latinOneCEs
== NULL
) {
6523 *status
= U_MEMORY_ALLOCATION_ERROR
;
6526 coll
->latinOneTableLen
= UCOL_LATINONETABLELEN
;
6529 UCollationElements
*it
= ucol_openElements(coll
, &ch
, 1, status
);
6530 uprv_memset(coll
->latinOneCEs
, 0, sizeof(uint32_t)*coll
->latinOneTableLen
*3);
6532 int32_t primShift
= 24, secShift
= 24, terShift
= 24;
6534 int32_t contractionOffset
= UCOL_ENDOFLATINONERANGE
+1;
6536 // TODO: make safe if you get more than you wanted...
6537 for(ch
= 0; ch
<= UCOL_ENDOFLATINONERANGE
; ch
++) {
6538 primShift
= 24; secShift
= 24; terShift
= 24;
6540 CE
= coll
->latinOneMapping
[ch
];
6542 CE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, ch
);
6543 if(CE
== UCOL_NOT_FOUND
) {
6544 CE
= UTRIE_GET32_FROM_LEAD(UCA
->mapping
, ch
);
6547 if(CE
< UCOL_NOT_FOUND
) {
6548 ucol_addLatinOneEntry(coll
, ch
, CE
, &primShift
, &secShift
, &terShift
);
6550 switch (getCETag(CE
)) {
6552 ucol_setText(it
, &ch
, 1, status
);
6553 while((CE
= ucol_next(it
, status
)) != UCOL_NULLORDER
) {
6554 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
6555 coll
->latinOneCEs
[ch
] = UCOL_BAIL_OUT_CE
;
6556 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6557 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6560 ucol_addLatinOneEntry(coll
, ch
, CE
, &primShift
, &secShift
, &terShift
);
6563 case CONTRACTION_TAG
:
6564 // here is the trick
6565 // F2 is contraction. We do something very similar to contractions
6566 // but have two indices, one in the real contraction table and the
6567 // other to where we stuffed things. This hopes that we don't have
6568 // many contractions (this should work for latin-1 tables).
6570 if((CE
& 0x00FFF000) != 0) {
6571 *status
= U_UNSUPPORTED_ERROR
;
6575 const UChar
*UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
6577 CE
|= (contractionOffset
& 0xFFF) << 12; // insert the offset in latin-1 table
6579 coll
->latinOneCEs
[ch
] = CE
;
6580 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = CE
;
6581 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = CE
;
6583 // We're going to jump into contraction table, pick the elements
6586 CE
= *(coll
->contractionCEs
+
6587 (UCharOffset
- coll
->contractionIndex
));
6588 if(getCETag(CE
) == EXPANSION_TAG
) {
6590 uint32_t i
; /* general counter */
6591 uint32_t *CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
6592 size
= getExpansionCount(CE
);
6594 if(size
!= 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6595 for(i
= 0; i
<size
; i
++) {
6596 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
6597 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6598 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6599 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6602 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
, *CEOffset
++, &primShift
, &secShift
, &terShift
);
6604 } else { /* else, we do */
6605 while(*CEOffset
!= 0) {
6606 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
6607 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6608 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6609 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6612 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
, *CEOffset
++, &primShift
, &secShift
, &terShift
);
6615 contractionOffset
++;
6616 } else if(CE
< UCOL_NOT_FOUND
) {
6617 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
++, CE
, &primShift
, &secShift
, &terShift
);
6619 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6620 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6621 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6622 contractionOffset
++;
6625 primShift
= 24; secShift
= 24; terShift
= 24;
6626 if(contractionOffset
== coll
->latinOneTableLen
) { // we need to reallocate
6627 if(!ucol_resizeLatinOneTable(coll
, 2*coll
->latinOneTableLen
, status
)) {
6631 } while(*UCharOffset
!= 0xFFFF);
6635 coll
->latinOneFailed
= TRUE
;
6641 ucol_closeElements(it
);
6643 if(contractionOffset
< coll
->latinOneTableLen
) {
6644 if(!ucol_resizeLatinOneTable(coll
, contractionOffset
, status
)) {
6651 void ucol_updateInternalState(UCollator
*coll
, UErrorCode
*status
) {
6652 if(U_SUCCESS(*status
)) {
6653 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
6654 coll
->caseSwitch
= UCOL_CASE_SWITCH
;
6656 coll
->caseSwitch
= UCOL_NO_CASE_SWITCH
;
6659 if(coll
->caseLevel
== UCOL_ON
|| coll
->caseFirst
== UCOL_OFF
) {
6660 coll
->tertiaryMask
= UCOL_REMOVE_CASE
;
6661 coll
->tertiaryCommon
= UCOL_COMMON3_NORMAL
;
6662 coll
->tertiaryAddition
= UCOL_FLAG_BIT_MASK_CASE_SW_OFF
;
6663 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_OFF
;
6664 coll
->tertiaryBottom
= UCOL_COMMON_BOT3
;
6666 coll
->tertiaryMask
= UCOL_KEEP_CASE
;
6667 coll
->tertiaryAddition
= UCOL_FLAG_BIT_MASK_CASE_SW_ON
;
6668 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
6669 coll
->tertiaryCommon
= UCOL_COMMON3_UPPERFIRST
;
6670 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_UPPER
;
6671 coll
->tertiaryBottom
= UCOL_COMMON_BOTTOM3_CASE_SW_UPPER
;
6673 coll
->tertiaryCommon
= UCOL_COMMON3_NORMAL
;
6674 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_LOWER
;
6675 coll
->tertiaryBottom
= UCOL_COMMON_BOTTOM3_CASE_SW_LOWER
;
6679 /* Set the compression values */
6680 uint8_t tertiaryTotal
= (uint8_t)(coll
->tertiaryTop
- UCOL_COMMON_BOT3
-1);
6681 coll
->tertiaryTopCount
= (uint8_t)(UCOL_PROPORTION3
*tertiaryTotal
); /* we multilply double with int, but need only int */
6682 coll
->tertiaryBottomCount
= (uint8_t)(tertiaryTotal
- coll
->tertiaryTopCount
);
6684 if(coll
->caseLevel
== UCOL_OFF
&& coll
->strength
== UCOL_TERTIARY
6685 && coll
->frenchCollation
== UCOL_OFF
&& coll
->alternateHandling
== UCOL_NON_IGNORABLE
) {
6686 coll
->sortKeyGen
= ucol_calcSortKeySimpleTertiary
;
6688 coll
->sortKeyGen
= ucol_calcSortKey
;
6690 if(coll
->caseLevel
== UCOL_OFF
&& coll
->strength
<= UCOL_TERTIARY
6691 && coll
->alternateHandling
== UCOL_NON_IGNORABLE
&& !coll
->latinOneFailed
) {
6692 if(coll
->latinOneCEs
== NULL
|| coll
->latinOneRegenTable
) {
6693 if(ucol_setUpLatinOne(coll
, status
)) { // if we succeed in building latin1 table, we'll use it
6694 //fprintf(stderr, "F");
6695 coll
->latinOneUse
= TRUE
;
6697 coll
->latinOneUse
= FALSE
;
6699 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6700 coll
->latinOneUse
= TRUE
;
6703 coll
->latinOneUse
= FALSE
;
// ucol_setVariableTop: sets the variable top of coll from the string varTop.
// Nothing is done when *status is a failure or coll is NULL.
// The string is fed through a collIterate and its first CE is fetched.
// U_CE_NOT_FOUND_ERROR is set when the iterator has not consumed the whole
// string after that single CE (s.pos != s.endp) or when no CE was produced.
// U_PRIMARY_TOO_LONG_ERROR is set when the next CE is a continuation that
// carries primary bits. Otherwise the upper 16 bits of the CE's primary are
// stored in coll->variableTopValue and the masked primary is returned.
// NOTE(review): the conditions guarding the u_strlen call and the
// U_ILLEGAL_ARGUMENT_ERROR assignment are not visible in this copy -
// presumably "len == -1" and an empty string; confirm.
6709 U_CAPI
uint32_t U_EXPORT2
6710 ucol_setVariableTop(UCollator
*coll
, const UChar
*varTop
, int32_t len
, UErrorCode
*status
) {
6711 if(U_FAILURE(*status
) || coll
== NULL
) {
6715 len
= u_strlen(varTop
);
6718 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6723 IInit_collIterate(coll
, varTop
, len
, &s
);
6725 uint32_t CE
= ucol_IGetNextCE(coll
, &s
, status
);
6727 /* here we check if we have consumed all characters */
6728 /* you can put in either one character or a contraction */
6729 /* you shouldn't put more... */
6730 if(s
.pos
!= s
.endp
|| CE
== UCOL_NO_MORE_CES
) {
6731 *status
= U_CE_NOT_FOUND_ERROR
;
6735 uint32_t nextCE
= ucol_IGetNextCE(coll
, &s
, status
);
6737 if(isContinuation(nextCE
) && (nextCE
& UCOL_PRIMARYMASK
) != 0) {
6738 *status
= U_PRIMARY_TOO_LONG_ERROR
;
6742 coll
->variableTopValue
= (CE
& UCOL_PRIMARYMASK
)>>16;
6744 return CE
& UCOL_PRIMARYMASK
;
6747 U_CAPI
uint32_t U_EXPORT2
ucol_getVariableTop(const UCollator
*coll
, UErrorCode
*status
) {
6748 if(U_FAILURE(*status
) || coll
== NULL
) {
6751 return coll
->variableTopValue
<<16;
6754 U_CAPI
void U_EXPORT2
6755 ucol_restoreVariableTop(UCollator
*coll
, const uint32_t varTop
, UErrorCode
*status
) {
6756 if(U_FAILURE(*status
) || coll
== NULL
) {
6759 coll
->variableTopValue
= (varTop
& UCOL_PRIMARYMASK
)>>16;
6761 /* Attribute setter API */
6762 U_CAPI
void U_EXPORT2
6763 ucol_setAttribute(UCollator
*coll
, UColAttribute attr
, UColAttributeValue value
, UErrorCode
*status
) {
6764 if(U_FAILURE(*status
) || coll
== NULL
) {
6767 UColAttributeValue oldFrench
= coll
->frenchCollation
;
6768 UColAttributeValue oldCaseFirst
= coll
->caseFirst
;
6770 case UCOL_NUMERIC_COLLATION
: /* sort substrings of digits as numbers */
6771 if(value
== UCOL_ON
) {
6772 coll
->numericCollation
= UCOL_ON
;
6773 coll
->numericCollationisDefault
= FALSE
;
6774 } else if (value
== UCOL_OFF
) {
6775 coll
->numericCollation
= UCOL_OFF
;
6776 coll
->numericCollationisDefault
= FALSE
;
6777 } else if (value
== UCOL_DEFAULT
) {
6778 coll
->numericCollationisDefault
= TRUE
;
6779 coll
->numericCollation
= (UColAttributeValue
)coll
->options
->numericCollation
;
6781 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6784 case UCOL_HIRAGANA_QUATERNARY_MODE
: /* special quaternary values for Hiragana */
6785 if(value
== UCOL_ON
) {
6786 coll
->hiraganaQ
= UCOL_ON
;
6787 coll
->hiraganaQisDefault
= FALSE
;
6788 } else if (value
== UCOL_OFF
) {
6789 coll
->hiraganaQ
= UCOL_OFF
;
6790 coll
->hiraganaQisDefault
= FALSE
;
6791 } else if (value
== UCOL_DEFAULT
) {
6792 coll
->hiraganaQisDefault
= TRUE
;
6793 coll
->hiraganaQ
= (UColAttributeValue
)coll
->options
->hiraganaQ
;
6795 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6798 case UCOL_FRENCH_COLLATION
: /* attribute for direction of secondary weights*/
6799 if(value
== UCOL_ON
) {
6800 coll
->frenchCollation
= UCOL_ON
;
6801 coll
->frenchCollationisDefault
= FALSE
;
6802 } else if (value
== UCOL_OFF
) {
6803 coll
->frenchCollation
= UCOL_OFF
;
6804 coll
->frenchCollationisDefault
= FALSE
;
6805 } else if (value
== UCOL_DEFAULT
) {
6806 coll
->frenchCollationisDefault
= TRUE
;
6807 coll
->frenchCollation
= (UColAttributeValue
)coll
->options
->frenchCollation
;
6809 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6812 case UCOL_ALTERNATE_HANDLING
: /* attribute for handling variable elements*/
6813 if(value
== UCOL_SHIFTED
) {
6814 coll
->alternateHandling
= UCOL_SHIFTED
;
6815 coll
->alternateHandlingisDefault
= FALSE
;
6816 } else if (value
== UCOL_NON_IGNORABLE
) {
6817 coll
->alternateHandling
= UCOL_NON_IGNORABLE
;
6818 coll
->alternateHandlingisDefault
= FALSE
;
6819 } else if (value
== UCOL_DEFAULT
) {
6820 coll
->alternateHandlingisDefault
= TRUE
;
6821 coll
->alternateHandling
= (UColAttributeValue
)coll
->options
->alternateHandling
;
6823 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6826 case UCOL_CASE_FIRST
: /* who goes first, lower case or uppercase */
6827 if(value
== UCOL_LOWER_FIRST
) {
6828 coll
->caseFirst
= UCOL_LOWER_FIRST
;
6829 coll
->caseFirstisDefault
= FALSE
;
6830 } else if (value
== UCOL_UPPER_FIRST
) {
6831 coll
->caseFirst
= UCOL_UPPER_FIRST
;
6832 coll
->caseFirstisDefault
= FALSE
;
6833 } else if (value
== UCOL_OFF
) {
6834 coll
->caseFirst
= UCOL_OFF
;
6835 coll
->caseFirstisDefault
= FALSE
;
6836 } else if (value
== UCOL_DEFAULT
) {
6837 coll
->caseFirst
= (UColAttributeValue
)coll
->options
->caseFirst
;
6838 coll
->caseFirstisDefault
= TRUE
;
6840 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6843 case UCOL_CASE_LEVEL
: /* do we have an extra case level */
6844 if(value
== UCOL_ON
) {
6845 coll
->caseLevel
= UCOL_ON
;
6846 coll
->caseLevelisDefault
= FALSE
;
6847 } else if (value
== UCOL_OFF
) {
6848 coll
->caseLevel
= UCOL_OFF
;
6849 coll
->caseLevelisDefault
= FALSE
;
6850 } else if (value
== UCOL_DEFAULT
) {
6851 coll
->caseLevel
= (UColAttributeValue
)coll
->options
->caseLevel
;
6852 coll
->caseLevelisDefault
= TRUE
;
6854 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6857 case UCOL_NORMALIZATION_MODE
: /* attribute for normalization */
6858 if(value
== UCOL_ON
) {
6859 coll
->normalizationMode
= UCOL_ON
;
6860 coll
->normalizationModeisDefault
= FALSE
;
6861 } else if (value
== UCOL_OFF
) {
6862 coll
->normalizationMode
= UCOL_OFF
;
6863 coll
->normalizationModeisDefault
= FALSE
;
6864 } else if (value
== UCOL_DEFAULT
) {
6865 coll
->normalizationModeisDefault
= TRUE
;
6866 coll
->normalizationMode
= (UColAttributeValue
)coll
->options
->normalizationMode
;
6868 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6871 case UCOL_STRENGTH
: /* attribute for strength */
6872 if (value
== UCOL_DEFAULT
) {
6873 coll
->strengthisDefault
= TRUE
;
6874 coll
->strength
= (UColAttributeValue
)coll
->options
->strength
;
6875 } else if (value
<= UCOL_IDENTICAL
) {
6876 coll
->strengthisDefault
= FALSE
;
6877 coll
->strength
= value
;
6879 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6882 case UCOL_ATTRIBUTE_COUNT
:
6884 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6887 if(oldFrench
!= coll
->frenchCollation
|| oldCaseFirst
!= coll
->caseFirst
) {
6888 coll
->latinOneRegenTable
= TRUE
;
6890 coll
->latinOneRegenTable
= FALSE
;
6892 ucol_updateInternalState(coll
, status
);
6895 U_CAPI UColAttributeValue U_EXPORT2
6896 ucol_getAttribute(const UCollator
*coll
, UColAttribute attr
, UErrorCode
*status
) {
6897 if(U_FAILURE(*status
) || coll
== NULL
) {
6898 return UCOL_DEFAULT
;
6901 case UCOL_NUMERIC_COLLATION
:
6902 return coll
->numericCollation
;
6903 case UCOL_HIRAGANA_QUATERNARY_MODE
:
6904 return coll
->hiraganaQ
;
6905 case UCOL_FRENCH_COLLATION
: /* attribute for direction of secondary weights*/
6906 return coll
->frenchCollation
;
6907 case UCOL_ALTERNATE_HANDLING
: /* attribute for handling variable elements*/
6908 return coll
->alternateHandling
;
6909 case UCOL_CASE_FIRST
: /* who goes first, lower case or uppercase */
6910 return coll
->caseFirst
;
6911 case UCOL_CASE_LEVEL
: /* do we have an extra case level */
6912 return coll
->caseLevel
;
6913 case UCOL_NORMALIZATION_MODE
: /* attribute for normalization */
6914 return coll
->normalizationMode
;
6915 case UCOL_STRENGTH
: /* attribute for strength */
6916 return coll
->strength
;
6917 case UCOL_ATTRIBUTE_COUNT
:
6919 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6922 return UCOL_DEFAULT
;
6925 U_CAPI
void U_EXPORT2
6926 ucol_setStrength( UCollator
*coll
,
6927 UCollationStrength strength
)
6929 UErrorCode status
= U_ZERO_ERROR
;
6930 ucol_setAttribute(coll
, UCOL_STRENGTH
, strength
, &status
);
6933 U_CAPI UCollationStrength U_EXPORT2
6934 ucol_getStrength(const UCollator
*coll
)
6936 UErrorCode status
= U_ZERO_ERROR
;
6937 return ucol_getAttribute(coll
, UCOL_STRENGTH
, &status
);
6940 /****************************************************************************/
6941 /* Following are misc functions */
6942 /* there are new APIs and some compatibility APIs */
6943 /****************************************************************************/
// ucol_safeClone: produces a copy of coll, in caller supplied memory when
// that is large enough.
// - A NULL or failing status ends the call early; a stackBuffer without
//   pBufferSize, or a NULL coll, sets U_ILLEGAL_ARGUMENT_ERROR.
// - A misaligned stackBuffer is advanced to the next aligned address and
//   *pBufferSize is reduced by the same amount.
// - With a stackBuffer and *pBufferSize <= 0 the call is a preflight:
//   sizeof(UCollator) is written to *pBufferSize.
// - Without a buffer, or with one smaller than sizeof(UCollator), a new
//   collator is opened from the rules of coll with ucol_openRules and, on
//   success, *status becomes U_SAFECLONE_ALLOCATED_WARNING.
// - Otherwise the UCollator struct is copied with memcpy into the buffer;
//   freeOnClose is cleared and the requestedLocale / validLocale pointers of
//   the copy are set to NULL.
// NOTE(review): the memcpy is a shallow copy - every other pointer member of
// the clone still refers to data of the original collator; confirm the
// lifetime rules against the header documentation.
6945 U_CAPI UCollator
* U_EXPORT2
6946 ucol_safeClone(const UCollator
*coll
, void *stackBuffer
, int32_t * pBufferSize
, UErrorCode
*status
)
6948 UCollator
* localCollator
;
6949 int32_t bufferSizeNeeded
= (int32_t)sizeof(UCollator
);
6950 char *stackBufferChars
= (char *)stackBuffer
;
6952 if (status
== NULL
|| U_FAILURE(*status
)){
6955 if ((stackBuffer
&& !pBufferSize
) || !coll
){
6956 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6959 /* Pointers on 64-bit platforms need to be aligned
6960 * on a 64-bit boundry in memory.
6962 if (U_ALIGNMENT_OFFSET(stackBuffer
) != 0) {
6963 int32_t offsetUp
= (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars
);
6964 *pBufferSize
-= offsetUp
;
6965 stackBufferChars
+= offsetUp
;
6967 stackBuffer
= (void *)stackBufferChars
;
6969 if (stackBuffer
&& *pBufferSize
<= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
6970 *pBufferSize
= bufferSizeNeeded
;
6973 if (!stackBuffer
|| *pBufferSize
< bufferSizeNeeded
) {
6974 /* allocate one here...*/
6976 const UChar
* rules
= ucol_getRules(coll
, &length
);
6978 localCollator
= ucol_openRules(rules
,
6980 ucol_getAttribute(coll
, UCOL_NORMALIZATION_MODE
, status
),
6981 ucol_getStrength(coll
),
6984 if (U_SUCCESS(*status
))
6986 *status
= U_SAFECLONE_ALLOCATED_WARNING
;
6989 localCollator
= (UCollator
*)stackBuffer
;
6990 memcpy(localCollator
, coll
, sizeof(UCollator
));
6991 localCollator
->freeOnClose
= FALSE
;
6992 localCollator
->requestedLocale
= NULL
; // zero copies of pointers
6993 localCollator
->validLocale
= NULL
;
6995 return localCollator
;
// ucol_getRulesEx: copies the rules of coll into buffer.
// The tailoring rules come from ucol_getRules(). When delta is
// UCOL_FULL_RULES the "%%UCARULES" string of coll->rb is fetched as well and
// is placed in front of the tailoring rules.
// Copies are clipped to bufferLen: the UCA rules first, then, if
// bufferLen > UCAlen, as much of the tailoring rules as fits behind them.
// The result is terminated with u_terminateUChars(), whose return value
// (based on len+UCAlen) is returned.
// NOTE(review): the early exit taken when the resource lookup fails is not
// visible in this copy.
6998 U_CAPI
int32_t U_EXPORT2
6999 ucol_getRulesEx(const UCollator
*coll
, UColRuleOption delta
, UChar
*buffer
, int32_t bufferLen
) {
7000 UErrorCode status
= U_ZERO_ERROR
;
7003 const UChar
* ucaRules
= 0;
7004 const UChar
*rules
= ucol_getRules(coll
, &len
);
7005 if(delta
== UCOL_FULL_RULES
) {
7006 /* take the UCA rules and append real rules at the end */
7007 /* UCA rules will be probably coming from the root RB */
7008 ucaRules
= ures_getStringByKey(coll
->rb
,"%%UCARULES",&UCAlen
,&status
);
7010 if(U_FAILURE(status
)) {
7013 if(buffer
!=0 && bufferLen
>0){
7016 u_memcpy(buffer
, ucaRules
, uprv_min(UCAlen
, bufferLen
));
7018 if(len
> 0 && bufferLen
> UCAlen
) {
7019 u_memcpy(buffer
+UCAlen
, rules
, uprv_min(len
, bufferLen
-UCAlen
));
7022 return u_terminateUChars(buffer
, bufferLen
, len
+UCAlen
, &status
);
// _NUL is a single zero UChar, i.e. an empty, terminated UChar string.
// NOTE(review): presumably returned by ucol_getRules when no rules are
// available - the statement using it is not visible in this copy.
//
// ucol_getRules: returns the tailoring rules of coll and stores their length
// in *length.
// If coll->rules is already set its cached length is reported. Otherwise,
// when the collator has a resource bundle, the "Sequence" string of its
// "CollationElements" table is looked up and cached in the collator (rules,
// rulesLength, freeRulesOnClose = FALSE). The cache is written through a
// cast that removes the const qualifier of coll.
7025 static const UChar _NUL
= 0;
7027 U_CAPI
const UChar
* U_EXPORT2
7028 ucol_getRules( const UCollator
*coll
,
7031 if(coll
->rules
!= NULL
) {
7032 *length
= coll
->rulesLength
;
7035 UErrorCode status
= U_ZERO_ERROR
;
7036 if(coll
->rb
!= NULL
) {
7037 UResourceBundle
*collElem
= ures_getByKey(coll
->rb
, "CollationElements", NULL
, &status
);
7038 if(U_SUCCESS(status
)) {
7040 ((UCollator
*)coll
)->rules
= ures_getStringByKey(collElem
, "Sequence", length
, &status
);
7041 ((UCollator
*)coll
)->rulesLength
= *length
;
7042 ((UCollator
*)coll
)->freeRulesOnClose
= FALSE
;
7043 ures_close(collElem
);
// ucol_getDisplayName: C wrapper around Collator::getDisplayName().
// Returns -1 at once when *status is a failure. Unless the call is a pure
// preflight (result == NULL and resultLength == 0) the destination string dst
// is set up as an alias of the caller's buffer. The display name of objLoc,
// localized for dispLoc, is then produced into dst and extracted into
// result; the value returned by extract() is returned.
7052 U_CAPI
int32_t U_EXPORT2
7053 ucol_getDisplayName( const char *objLoc
,
7054 const char *dispLoc
,
7056 int32_t resultLength
,
7060 if(U_FAILURE(*status
)) return -1;
7062 if(!(result
==NULL
&& resultLength
==0)) {
7063 // NULL destination for pure preflighting: empty dummy string
7064 // otherwise, alias the destination buffer
7065 dst
.setTo(result
, 0, resultLength
);
7067 Collator::getDisplayName(Locale(objLoc
), Locale(dispLoc
), dst
);
7068 return dst
.extract(result
, resultLength
, *status
);
7071 U_CAPI
const char* U_EXPORT2
7072 ucol_getAvailable(int32_t index
)
7074 return uloc_getAvailable(index
);
7077 U_CAPI
int32_t U_EXPORT2
7078 ucol_countAvailable()
7080 return uloc_countAvailable();
7083 U_CAPI
void U_EXPORT2
7084 ucol_getVersion(const UCollator
* coll
,
7085 UVersionInfo versionInfo
)
7087 /* RunTime version */
7088 uint8_t rtVersion
= UCOL_RUNTIME_VERSION
;
7089 /* Builder version*/
7090 uint8_t bdVersion
= coll
->image
->version
[0];
7092 /* Charset Version. Need to get the version from cnv files
7093 * makeconv should populate cnv files with version and
7094 * an api has to be provided in ucnv.h to obtain this version
7096 uint8_t csVersion
= 0;
7098 /* combine the version info */
7099 uint16_t cmbVersion
= (uint16_t)((rtVersion
<<11) | (bdVersion
<<6) | (csVersion
));
7101 /* Tailoring rules */
7102 versionInfo
[0] = (uint8_t)(cmbVersion
>>8);
7103 versionInfo
[1] = (uint8_t)cmbVersion
;
7104 versionInfo
[2] = coll
->image
->version
[1];
7105 versionInfo
[3] = UCA
->image
->UCAVersion
[0];
// ucol_isTailored: decides whether code unit u has a mapping of its own in
// coll. Only evaluated when *status is a success and coll is not NULL.
// For u < 0x100 the CE is taken from coll->latinOneMapping and compared with
// the entry of the global UCA collator; otherwise the CE is read from the
// trie of coll with UTRIE_GET32_FROM_LEAD. If the CE is a contraction it is
// replaced by the CE stored for the start of that contraction. The final test
// is whether the CE equals UCOL_NOT_FOUND.
// NOTE(review): the first branch of the if/else chain and all return
// statements are not visible in this copy - confirm the returned values.
7109 /* This internal API checks whether a character is tailored or not */
7110 U_CAPI UBool U_EXPORT2
7111 ucol_isTailored(const UCollator
*coll
, const UChar u
, UErrorCode
*status
) {
7112 uint32_t CE
= UCOL_NOT_FOUND
;
7113 const UChar
*ContractionStart
= NULL
;
7114 if(U_SUCCESS(*status
) && coll
!= NULL
) {
7117 } else if(u
< 0x100) { /* latin-1 */
7118 CE
= coll
->latinOneMapping
[u
];
7119 if(CE
== UCA
->latinOneMapping
[u
]) {
7122 } else { /* regular */
7123 /*CE = ucmpe32_get(coll->mapping, u);*/
7124 CE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, u
);
7128 if(isContraction(CE
)) {
7129 ContractionStart
= (UChar
*)coll
->image
+getContractOffset(CE
);
7130 CE
= *(coll
->contractionCEs
+ (ContractionStart
- coll
->contractionIndex
));
7133 if(CE
== UCOL_NOT_FOUND
) {
7144 /****************************************************************************/
7145 /* Following are the string compare functions */
7147 /****************************************************************************/
7150 /* ucol_checkIdent internal function. Does byte level string compare. */
7151 /* Used by strcoll if strength == identical and strings */
7152 /* are otherwise equal. Moved out-of-line because this */
7153 /* is a rare case. */
7155 /* Comparison must be done on NFD normalized strings. */
7156 /* FCD is not good enough. */
7158 /* TODO: make an incremental NFD Comparison function, which could */
7159 /* be of general use */
// ucol_checkIdent: identical-level tie breaker; compares the two texts in
// code point order.
// - Iterator input (UCOL_USE_ITERATOR): both iterators are rewound to the
//   start, wrapped in stack based NFD normalizing iterators and compared with
//   u_strCompareIter(); the normalizing iterators are closed afterwards.
// - String input: for each side, unless unorm_quickCheck() reports the text
//   as already NFD, it is decomposed into the iterator's writableBuffer. On
//   U_BUFFER_OVERFLOW_ERROR the buffer is grown with u_growBufferFromStatic()
//   and the decomposition repeated; if growing fails *status is set to
//   U_MEMORY_ALLOCATION_ERROR and UCOL_LESS is returned. A buffer that is no
//   longer the stack buffer is flagged with UCOL_ITER_ALLOCATED.
//   Two strings of unknown length (-1) are compared with
//   u_strcmpCodePointOrder(); otherwise the common prefix is compared with
//   u_memcmpCodePointOrder() and a tie is decided by the length difference.
// - The sign of the comparison is mapped to a UCollationResult.
// *status is reset to U_ZERO_ERROR before each quick check.
// NOTE(review): the use of the `normalize` parameter and the trailing
// arguments of the unorm_decompose() calls are not visible in this copy.
7162 UCollationResult
ucol_checkIdent(collIterate
*sColl
, collIterate
*tColl
, UBool normalize
, UErrorCode
*status
)
7165 // TODO: When we have an UChar iterator, we need to access the whole string. One
7166 // useful modification would be a UChar iterator extract API, since reset next next...
7168 // TODO: Handle long strings. Do the same in compareUsingSortKeys.
7170 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
7171 // of same type, but that doesn't really mean that it will stay that way.
7173 // The division for the array length may truncate the array size to
7174 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7175 // for all platforms anyway.
7176 UAlignedMemory stackNormIter1
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
7177 UAlignedMemory stackNormIter2
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
7178 //UChar sStackBuf[256], tStackBuf[256];
7179 //int32_t sBufSize = 256, tBufSize = 256;
7185 UBool freeSBuf
= FALSE
, freeTBuf
= FALSE
;
7187 if (sColl
->flags
& UCOL_USE_ITERATOR
) {
7188 UNormIterator
*sNIt
= NULL
, *tNIt
= NULL
;
7189 sNIt
= unorm_openIter(stackNormIter1
, sizeof(stackNormIter1
), status
);
7190 tNIt
= unorm_openIter(stackNormIter2
, sizeof(stackNormIter2
), status
);
7191 sColl
->iterator
->move(sColl
->iterator
, 0, UITER_START
);
7192 tColl
->iterator
->move(tColl
->iterator
, 0, UITER_START
);
7193 UCharIterator
*sIt
= unorm_setIter(sNIt
, sColl
->iterator
, UNORM_NFD
, status
);
7194 UCharIterator
*tIt
= unorm_setIter(tNIt
, tColl
->iterator
, UNORM_NFD
, status
);
7195 comparison
= u_strCompareIter(sIt
, tIt
, TRUE
);
7196 unorm_closeIter(sNIt
);
7197 unorm_closeIter(tNIt
);
7199 sLen
= (sColl
->flags
& UCOL_ITER_HASLEN
) ? sColl
->endp
- sColl
->string
: -1;
7200 sBuf
= sColl
->string
;
7201 tLen
= (tColl
->flags
& UCOL_ITER_HASLEN
) ? tColl
->endp
- tColl
->string
: -1;
7202 tBuf
= tColl
->string
;
7205 *status
= U_ZERO_ERROR
;
7206 if (unorm_quickCheck(sBuf
, sLen
, UNORM_NFD
, status
) != UNORM_YES
) {
7207 sLen
= unorm_decompose(sColl
->writableBuffer
, (int32_t)sColl
->writableBufSize
,
7211 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
7212 if(!u_growBufferFromStatic(sColl
->stackWritableBuffer
,
7213 &sColl
->writableBuffer
,
7214 (int32_t *)&sColl
->writableBufSize
, sLen
,
7217 *status
= U_MEMORY_ALLOCATION_ERROR
;
7218 return UCOL_LESS
; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7220 *status
= U_ZERO_ERROR
;
7221 sLen
= unorm_decompose(sColl
->writableBuffer
, (int32_t)sColl
->writableBufSize
,
7230 sBuf
= sColl
->writableBuffer
;
7231 if (sBuf
!= sColl
->stackWritableBuffer
) {
7232 sColl
->flags
|= UCOL_ITER_ALLOCATED
;
7236 *status
= U_ZERO_ERROR
;
7237 if (unorm_quickCheck(tBuf
, tLen
, UNORM_NFD
, status
) != UNORM_YES
) {
7238 tLen
= unorm_decompose(tColl
->writableBuffer
, (int32_t)tColl
->writableBufSize
,
7242 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
7243 if(!u_growBufferFromStatic(tColl
->stackWritableBuffer
,
7244 &tColl
->writableBuffer
,
7245 (int32_t *)&tColl
->writableBufSize
, tLen
,
7248 *status
= U_MEMORY_ALLOCATION_ERROR
;
7249 return UCOL_LESS
; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7251 *status
= U_ZERO_ERROR
;
7252 tLen
= unorm_decompose(tColl
->writableBuffer
, (int32_t)tColl
->writableBufSize
,
7261 tBuf
= tColl
->writableBuffer
;
7262 if (tBuf
!= tColl
->stackWritableBuffer
) {
7263 tColl
->flags
|= UCOL_ITER_ALLOCATED
;
7268 if (sLen
== -1 && tLen
== -1) {
7269 comparison
= u_strcmpCodePointOrder(sBuf
, tBuf
);
7272 sLen
= u_strlen(sBuf
);
7275 tLen
= u_strlen(tBuf
);
7277 comparison
= u_memcmpCodePointOrder(sBuf
, tBuf
, uprv_min(sLen
, tLen
));
7278 if (comparison
== 0) {
7279 comparison
= sLen
- tLen
;
7284 if (comparison
< 0) {
7286 } else if (comparison
== 0) {
7288 } else /* comparison > 0 */ {
7289 return UCOL_GREATER
;
7293 /* CEBuf - A struct and some inline functions to handle the saving */
7294 /* of CEs in a buffer within ucol_strcoll */
// UCOL_CEBUF_SIZE is the number of uint32_t CEs held by the in-struct array
// localArray of a ucol_CEBuf.
// The helper functions that follow also use buf, pos and endp members.
// NOTE(review): their declarations are not visible in this copy.
7296 #define UCOL_CEBUF_SIZE 512
7297 typedef struct ucol_CEBuf
{
7301 uint32_t localArray
[UCOL_CEBUF_SIZE
];
7306 inline void UCOL_INIT_CEBUF(ucol_CEBuf
*b
) {
7307 (b
)->buf
= (b
)->pos
= (b
)->localArray
;
7308 (b
)->endp
= (b
)->buf
+ UCOL_CEBUF_SIZE
;
// ucol_CEBuf_Expand: grows the storage of a CE buffer to twice the number of
// CEs currently held (pos - buf). The iterator ci is flagged with
// UCOL_ITER_ALLOCATED before the allocation is attempted. On success the old
// contents are copied into the new block and endp / pos are re-based on
// b->buf.
// NOTE(review): nothing visible reports an allocation failure - when
// uprv_malloc returns NULL the buffer appears to stay full. The statements
// that release the old block and install the new one are not visible in this
// copy; confirm against the full source.
7312 void ucol_CEBuf_Expand(ucol_CEBuf
*b
, collIterate
*ci
) {
7317 ci
->flags
|= UCOL_ITER_ALLOCATED
;
7318 oldSize
= b
->pos
- b
->buf
;
7319 newSize
= oldSize
* 2;
7320 newBuf
= (uint32_t *)uprv_malloc(newSize
* sizeof(uint32_t));
7321 if(newBuf
!= NULL
) {
7322 uprv_memcpy(newBuf
, b
->buf
, oldSize
* sizeof(uint32_t));
7323 if (b
->buf
!= b
->localArray
) {
7327 b
->endp
= b
->buf
+ newSize
;
7328 b
->pos
= b
->buf
+ oldSize
;
// UCOL_CEBUF_PUT: adds the collation element ce to buffer b. When the write
// position has reached endp the buffer is first grown through
// ucol_CEBuf_Expand().
// NOTE(review): the statement that stores ce is not visible in this copy.
7333 inline void UCOL_CEBUF_PUT(ucol_CEBuf
*b
, uint32_t ce
, collIterate
*ci
) {
7334 if (b
->pos
== b
->endp
) {
7335 ucol_CEBuf_Expand(b
, ci
);
// ucol_compareUsingSortKeys: fallback comparison that builds the sort keys of
// both texts and compares the keys as byte strings with uprv_strcmp().
// - Iterator input: both iterators are rewound and drained, one UChar at a
//   time, through the sBufp / tBufp write pointers; the lengths are the
//   number of UChars copied.
// - String input: the texts are used in place; a length of -1 is used when
//   the iterator carries no length (UCOL_ITER_HASLEN not set).
// - Each key is first generated into a stack array of UCOL_MAX_BUFFER bytes.
//   If ucol_getSortKey() reports a larger size, a heap buffer of that size is
//   allocated and the key generated again. Heap buffers are released before
//   returning.
// NOTE(review): sStackBuf / tStackBuf hold 256 UChars and no bound check is
// visible in the copy loops (see the TODO about long strings). The result
// of uprv_malloc is only tested before the second ucol_getSortKey() call;
// whether a NULL key pointer can reach uprv_strcmp() is not visible in this
// copy - confirm against the full source.
7340 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7341 /* It is used when compare gets in trouble and needs to bail out */
7342 static UCollationResult
ucol_compareUsingSortKeys(collIterate
*sColl
,
7345 uint8_t sourceKey
[UCOL_MAX_BUFFER
], targetKey
[UCOL_MAX_BUFFER
];
7346 uint8_t *sourceKeyP
= sourceKey
;
7347 uint8_t *targetKeyP
= targetKey
;
7348 int32_t sourceKeyLen
= UCOL_MAX_BUFFER
, targetKeyLen
= UCOL_MAX_BUFFER
;
7349 const UCollator
*coll
= sColl
->coll
;
7350 UChar
*source
= NULL
;
7351 UChar
*target
= NULL
;
7352 UChar sStackBuf
[256], tStackBuf
[256];
7353 int32_t sourceLength
= (sColl
->flags
&UCOL_ITER_HASLEN
)?(sColl
->endp
-sColl
->string
):-1;
7354 int32_t targetLength
= (tColl
->flags
&UCOL_ITER_HASLEN
)?(tColl
->endp
-tColl
->string
):-1;
7356 // TODO: Handle long strings. Do the same in ucol_checkIdent.
7357 if(sColl
->flags
& UCOL_USE_ITERATOR
) {
7358 sColl
->iterator
->move(sColl
->iterator
, 0, UITER_START
);
7359 tColl
->iterator
->move(tColl
->iterator
, 0, UITER_START
);
7361 UChar
*sBufp
= source
;
7363 UChar
*tBufp
= target
;
7364 while(sColl
->iterator
->hasNext(sColl
->iterator
)) {
7365 *sBufp
++ = (UChar
)sColl
->iterator
->next(sColl
->iterator
);
7367 while(tColl
->iterator
->hasNext(tColl
->iterator
)) {
7368 *tBufp
++ = (UChar
)tColl
->iterator
->next(tColl
->iterator
);
7370 sourceLength
= sBufp
- source
;
7371 targetLength
= tBufp
- target
;
7372 } else { // no iterators
7373 sourceLength
= (sColl
->flags
&UCOL_ITER_HASLEN
)?(sColl
->endp
-sColl
->string
):-1;
7374 targetLength
= (tColl
->flags
&UCOL_ITER_HASLEN
)?(tColl
->endp
-tColl
->string
):-1;
7375 source
= sColl
->string
;
7376 target
= tColl
->string
;
7381 sourceKeyLen
= ucol_getSortKey(coll
, source
, sourceLength
, sourceKeyP
, sourceKeyLen
);
7382 if(sourceKeyLen
> UCOL_MAX_BUFFER
) {
7383 sourceKeyP
= (uint8_t*)uprv_malloc(sourceKeyLen
*sizeof(uint8_t));
7384 if(sourceKeyP
!= NULL
) {
7385 sourceKeyLen
= ucol_getSortKey(coll
, source
, sourceLength
, sourceKeyP
, sourceKeyLen
);
7389 targetKeyLen
= ucol_getSortKey(coll
, target
, targetLength
, targetKeyP
, targetKeyLen
);
7390 if(targetKeyLen
> UCOL_MAX_BUFFER
) {
7391 targetKeyP
= (uint8_t*)uprv_malloc(targetKeyLen
*sizeof(uint8_t));
7392 if(targetKeyP
!= NULL
) {
7393 targetKeyLen
= ucol_getSortKey(coll
, target
, targetLength
, targetKeyP
, targetKeyLen
);
7397 int32_t result
= uprv_strcmp((const char*)sourceKeyP
, (const char*)targetKeyP
);
7399 if(sourceKeyP
!= sourceKey
) {
7400 uprv_free(sourceKeyP
);
7403 if(targetKeyP
!= targetKey
) {
7404 uprv_free(targetKeyP
);
7409 } else if(result
>0) {
7410 return UCOL_GREATER
;
/*
 * ucol_strcollRegular: level-by-level comparison of two collIterate states.
 *
 * sColl, tColl - source and target iteration states; the collator is taken
 *                from sColl->coll.
 * A `status` value is forwarded to ucol_IGetNextCE() and ucol_checkIdent();
 * its parameter declaration is not visible in this listing.
 *
 * Outline, as far as the visible lines show:
 *  - Collator attributes (strength, caseLevel, frenchCollation,
 *    alternateHandling, hiraganaQ, caseSwitch, tertiaryMask,
 *    variableTopValue) are read into locals up front.
 *  - When both doHiragana and shifted are set, the work is delegated to
 *    ucol_compareUsingSortKeys().
 *  - Primary phase: CEs are fetched with ucol_IGetNextCE(), stored with
 *    UCOL_CEBUF_PUT into sCEs / tCEs, and compared on UCOL_PRIMARYMASK.
 *    There is one variant for the non-shifted case and one for shifted.
 *  - Secondary (normal or French), case, tertiary and quaternary phases
 *    re-read the buffered CEs through sCE / tCE.
 *  - ucol_checkIdent() supplies the final result for the identical level.
 *  - Cleanup calls freeHeapWritableBuffer() when either side has
 *    UCOL_ITER_ALLOCATED set and frees CE buffers that no longer point at
 *    their localArray.
 */
7417 static inline UCollationResult
7418 ucol_strcollRegular( collIterate
*sColl
, collIterate
*tColl
,
7419 // const UCollator *coll,
7420 // const UChar *source,
7421 // int32_t sourceLength,
7422 // const UChar *target,
7423 // int32_t targetLength,
7428 const UCollator
*coll
= sColl
->coll
;
7431 // setting up the collator parameters
7432 UColAttributeValue strength
= coll
->strength
;
7433 UBool initialCheckSecTer
= (strength
>= UCOL_SECONDARY
);
7435 UBool checkSecTer
= initialCheckSecTer
;
7436 UBool checkTertiary
= (strength
>= UCOL_TERTIARY
);
7437 UBool checkQuad
= (strength
>= UCOL_QUATERNARY
);
7438 UBool checkIdent
= (strength
== UCOL_IDENTICAL
);
7439 UBool checkCase
= (coll
->caseLevel
== UCOL_ON
);
7440 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && checkSecTer
;
7441 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
7442 UBool qShifted
= shifted
&& checkQuad
;
7443 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && checkQuad
;
// Hiragana quaternary combined with shifted handling is delegated to the
// sort-key based comparison.
7445 if(doHiragana
&& shifted
) {
7446 return (ucol_compareUsingSortKeys(sColl
, tColl
));
7448 uint8_t caseSwitch
= coll
->caseSwitch
;
7449 uint8_t tertiaryMask
= coll
->tertiaryMask
;
7451 // This is the lowest primary value that will not be ignored if shifted
7452 uint32_t LVT
= (shifted
)?(coll
->variableTopValue
<<16):0;
7454 UCollationResult result
= UCOL_EQUAL
;
7455 UCollationResult hirResult
= UCOL_EQUAL
;
7457 // Preparing the CE buffers. They will be filled during the primary phase
7460 UCOL_INIT_CEBUF(&sCEs
);
7461 UCOL_INIT_CEBUF(&tCEs
);
7463 uint32_t secS
= 0, secT
= 0;
7464 uint32_t sOrder
=0, tOrder
=0;
7466 // Non shifted primary processing is quite simple
7470 // We fetch CEs until we hit a non ignorable primary or end.
7472 // We get the next CE
7473 sOrder
= ucol_IGetNextCE(coll
, sColl
, status
);
7474 // Stuff it in the buffer
7475 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7476 // And keep just the primary part.
7477 sOrder
&= UCOL_PRIMARYMASK
;
7478 } while(sOrder
== 0);
7480 // see the comments on the above block
7482 tOrder
= ucol_IGetNextCE(coll
, tColl
, status
);
7483 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7484 tOrder
&= UCOL_PRIMARYMASK
;
7485 } while(tOrder
== 0);
7487 // if both primaries are the same
7488 if(sOrder
== tOrder
) {
7489 // and there are no more CEs, we advance to the next level
7490 if(sOrder
== UCOL_NO_MORE_CES_PRIMARY
) {
// Record the first Hiragana difference: the side whose UCOL_WAS_HIRAGANA
// flag is set compares as UCOL_LESS.
7493 if(doHiragana
&& hirResult
== UCOL_EQUAL
) {
7494 if((sColl
->flags
& UCOL_WAS_HIRAGANA
) != (tColl
->flags
& UCOL_WAS_HIRAGANA
)) {
7495 hirResult
= ((sColl
->flags
& UCOL_WAS_HIRAGANA
) > (tColl
->flags
& UCOL_WAS_HIRAGANA
))
7496 ? UCOL_LESS
:UCOL_GREATER
;
7500 // if two primaries are different, we are done
7501 result
= (sOrder
< tOrder
) ? UCOL_LESS
: UCOL_GREATER
;
7504 } // no primary difference... do the rest from the buffers
7505 } else { // shifted - do a slightly more complicated processing :)
7507 UBool sInShifted
= FALSE
;
7508 UBool tInShifted
= FALSE
;
7509 // This version of code can be refactored. However, it seems easier to understand this way.
7510 // Source loop. Same as the target loop.
7512 sOrder
= ucol_IGetNextCE(coll
, sColl
, status
);
7513 if(sOrder
== UCOL_NO_MORE_CES
) {
7514 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7516 } else if(sOrder
== 0
7517 || (sInShifted
&& (sOrder
& UCOL_PRIMARYMASK
) == 0)) {
7518 /* UCA amendment - ignore ignorables that follow shifted code points */
7520 } else if(isContinuation(sOrder
)) {
7521 if((sOrder
& UCOL_PRIMARYMASK
) > 0) { /* There is primary value */
7523 sOrder
= (sOrder
& UCOL_PRIMARYMASK
) | 0xC0; /* preserve interesting continuation */
7524 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7527 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7530 } else { /* Just lower level values */
7534 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7538 } else { /* regular */
// Primaries above LVT are stored unchanged; the other branches store either
// the primary-masked value or the CE as fetched.
7539 if((sOrder
& UCOL_PRIMARYMASK
) > LVT
) {
7540 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7543 if((sOrder
& UCOL_PRIMARYMASK
) > 0) {
7545 sOrder
&= UCOL_PRIMARYMASK
;
7546 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7549 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7556 sOrder
&= UCOL_PRIMARYMASK
;
// Target loop: mirrors the source loop above.
7560 tOrder
= ucol_IGetNextCE(coll
, tColl
, status
);
7561 if(tOrder
== UCOL_NO_MORE_CES
) {
7562 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7564 } else if(tOrder
== 0
7565 || (tInShifted
&& (tOrder
& UCOL_PRIMARYMASK
) == 0)) {
7566 /* UCA amendment - ignore ignorables that follow shifted code points */
7568 } else if(isContinuation(tOrder
)) {
7569 if((tOrder
& UCOL_PRIMARYMASK
) > 0) { /* There is primary value */
7571 tOrder
= (tOrder
& UCOL_PRIMARYMASK
) | 0xC0; /* preserve interesting continuation */
7572 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7575 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7578 } else { /* Just lower level values */
7582 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7586 } else { /* regular */
7587 if((tOrder
& UCOL_PRIMARYMASK
) > LVT
) {
7588 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7591 if((tOrder
& UCOL_PRIMARYMASK
) > 0) {
7593 tOrder
&= UCOL_PRIMARYMASK
;
7594 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7597 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7604 tOrder
&= UCOL_PRIMARYMASK
;
7607 if(sOrder
== tOrder
) {
// NOTE(review): the next four lines access sColl/tColl with '.', although
// both are pointers in this function. Presumably they sit inside a
// commented-out region whose delimiters are not visible here - confirm.
7609 if(doHiragana && hirResult == UCOL_EQUAL) {
7610 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7611 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7612 ? UCOL_LESS:UCOL_GREATER;
7616 if(sOrder
== UCOL_NO_MORE_CES_PRIMARY
) {
7619 sOrder
= 0; tOrder
= 0;
7623 result
= (sOrder
< tOrder
) ? UCOL_LESS
: UCOL_GREATER
;
7626 } /* no primary difference... do the rest from the buffers */
7629 /* now, we're gonna reexamine collected CEs */
7633 /* This is the secondary level of comparison */
7635 if(!isFrenchSec
) { /* normal */
7640 secS
= *(sCE
++) & UCOL_SECONDARYMASK
;
7644 secT
= *(tCE
++) & UCOL_SECONDARYMASK
;
7648 if(secS
== UCOL_NO_MORE_CES_SECONDARY
) {
7655 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7659 } else { /* do the French */
// French secondary: the buffered CEs are walked backwards, starting two
// entries before the write position; continuation runs are detected and
// replayed forwards using the saved pointer.
7660 uint32_t *sCESave
= NULL
;
7661 uint32_t *tCESave
= NULL
;
7662 sCE
= sCEs
.pos
-2; /* this could also be sCEs-- if needs to be optimized */
7665 while (secS
== 0 && sCE
>= sCEs
.buf
) {
7668 if(isContinuation(secS
)) {
7669 while(isContinuation(secS
= *(sCE
--)));
7670 /* after this, secS has the start of continuation, and sCE points before that */
7671 sCESave
= sCE
; /* we save it, so that we know where to come back AND that we need to go forward */
7672 sCE
+=2; /* need to point to the first continuation CP */
7673 /* However, now you can just continue doing stuff */
7677 if(!isContinuation(secS
)) { /* This means we have finished with this cont */
7678 sCE
= sCESave
; /* reset the pointer to before continuation */
7683 secS
&= UCOL_SECONDARYMASK
; /* remove the continuation bit */
7686 while(secT
== 0 && tCE
>= tCEs
.buf
) {
7689 if(isContinuation(secT
)) {
7690 while(isContinuation(secT
= *(tCE
--)));
7691 /* after this, secT has the start of continuation, and tCE points before that */
7692 tCESave
= tCE
; /* we save it, so that we know where to come back AND that we need to go forward */
7693 tCE
+=2; /* need to point to the first continuation CP */
7694 /* However, now you can just continue doing stuff */
7698 if(!isContinuation(secT
)) { /* This means we have finished with this cont */
7699 tCE
= tCESave
; /* reset the pointer to before continuation */
7704 secT
&= UCOL_SECONDARYMASK
; /* remove the continuation bit */
7708 if(secS
== UCOL_NO_MORE_CES_SECONDARY
|| (sCE
< sCEs
.buf
&& tCE
< tCEs
.buf
)) {
7715 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7722 /* doing the case bit */
7727 while((secS
& UCOL_REMOVE_CASE
) == 0) {
7728 if(!isContinuation(*sCE
++)) {
7729 secS
=*(sCE
-1) & UCOL_TERT_CASE_MASK
;
7736 while((secT
& UCOL_REMOVE_CASE
) == 0) {
7737 if(!isContinuation(*tCE
++)) {
7738 secT
= *(tCE
-1) & UCOL_TERT_CASE_MASK
;
7745 if((secS
& UCOL_CASE_BIT_MASK
) < (secT
& UCOL_CASE_BIT_MASK
)) {
7748 } else if((secS
& UCOL_CASE_BIT_MASK
) > (secT
& UCOL_CASE_BIT_MASK
)) {
7749 result
= UCOL_GREATER
;
7753 if((secS
& UCOL_REMOVE_CASE
) == UCOL_NO_MORE_CES_TERTIARY
|| (secT
& UCOL_REMOVE_CASE
) == UCOL_NO_MORE_CES_TERTIARY
) {
7762 /* Tertiary level */
7769 while((secS
& UCOL_REMOVE_CASE
) == 0) {
7770 secS
= *(sCE
++) & tertiaryMask
;
7771 if(!isContinuation(secS
)) {
7774 secS
&= UCOL_REMOVE_CASE
;
7778 while((secT
& UCOL_REMOVE_CASE
) == 0) {
7779 secT
= *(tCE
++) & tertiaryMask
;
7780 if(!isContinuation(secT
)) {
7783 secT
&= UCOL_REMOVE_CASE
;
7788 if((secS
& UCOL_REMOVE_CASE
) == 1) {
7795 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
// Quaternary level, only when shifted handling and quaternary strength are
// both active (qShifted).
7802 if(qShifted
/*checkQuad*/) {
7803 UBool sInShifted
= TRUE
;
7804 UBool tInShifted
= TRUE
;
7810 while(secS
== 0 && secS
!= UCOL_NO_MORE_CES
|| (isContinuation(secS
) && !sInShifted
)) {
7812 if(isContinuation(secS
)) {
7816 } else if(secS
> LVT
|| (secS
& UCOL_PRIMARYMASK
) == 0) { /* non continuation */
7817 secS
= UCOL_PRIMARYMASK
;
7823 secS
&= UCOL_PRIMARYMASK
;
7826 while(secT
== 0 && secT
!= UCOL_NO_MORE_CES
|| (isContinuation(secT
) && !tInShifted
)) {
7828 if(isContinuation(secT
)) {
7832 } else if(secT
> LVT
|| (secT
& UCOL_PRIMARYMASK
) == 0) {
7833 secT
= UCOL_PRIMARYMASK
;
7839 secT
&= UCOL_PRIMARYMASK
;
7842 if(secS
== UCOL_NO_MORE_CES_PRIMARY
) {
7849 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7853 } else if(doHiragana
&& hirResult
!= UCOL_EQUAL
) {
7854 // If we're fine on quaternaries, we might be different
7855 // on Hiragana. This, however, might fail us in shifted.
7860 /* For IDENTICAL comparisons, we use a bitwise character comparison */
7861 /* as a tiebreaker if all else is equal. */
7862 /* Getting here should be quite rare - strings are not identical - */
7863 /* that is checked first, but compared == through all other checks. */
7866 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7867 result
= ucol_checkIdent(sColl
, tColl
, TRUE
, status
);
// Cleanup: release writable buffers and any CE buffer that was moved off
// its localArray.
7871 if ((sColl
->flags
| tColl
->flags
) & UCOL_ITER_ALLOCATED
) {
7872 freeHeapWritableBuffer(sColl
);
7873 freeHeapWritableBuffer(tColl
);
7875 if (sCEs
.buf
!= sCEs
.localArray
) {
7876 uprv_free(sCEs
.buf
);
7878 if (tCEs
.buf
!= tCEs
.localArray
) {
7879 uprv_free(tCEs
.buf
);
/*
 * ucol_getLatinOneContraction: resolves a contraction CE against the
 * collator's latinOneCEs table.
 *
 * coll     - collator; image, latinOneCEs, latinOneTableLen and mapping are read.
 * strength - selects the table slice: every lookup indexes
 *            latinOneCEs[strength*latinOneTableLen + ...].
 * CE       - contraction CE. Its low 12 bits are passed to getContractOffset();
 *            bits 12..23 give latinOneOffset.
 * s        - text being examined.
 * index    - in/out position within s.
 * len      - text length (callers in this file pass -1 for NUL-terminated text).
 *
 * Returns a value from latinOneCEs, or UCOL_BAIL_OUT_CE when the next code
 * unit has any of the bits 0xFF00 set.
 */
7887 static inline uint32_t
7888 ucol_getLatinOneContraction(const UCollator
*coll
, int32_t strength
,
7889 uint32_t CE
, const UChar
*s
, int32_t *index
, int32_t len
) {
7890 const UChar
*UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
&0xFFF);
7891 int32_t latinOneOffset
= (CE
& 0x00FFF000) >> 12;
7893 UChar schar
= 0, tchar
= 0;
// End of text: the entry at latinOneOffset itself (no added offset) is returned.
7897 if(s
[*index
] == 0) { // end of string
7898 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
7904 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
7910 while(schar
> (tchar
= *(UCharOffset
+offset
))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
// Match: return the table entry at latinOneOffset + offset.
7914 if (schar
== tchar
) {
7916 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
+offset
]);
7920 if(schar
& 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7921 return UCOL_BAIL_OUT_CE
;
7923 // skip completely ignorables
7924 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, schar
);
7925 if(isZeroCE
== 0) { // we have to ignore completely ignorables
7930 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
7937 * This is a fast strcoll, geared towards text in Latin-1.
7938 * It supports contractions of size two, French secondaries
7939 * and case switching. You can use it with strengths primary
7940 * to tertiary. It does not support shifted and case level.
7941 * It relies on the table built by setupLatin1Table. If it
7942 * doesn't understand something, it will go to the regular
/*
 * ucol_strcollUseLatin1: table-driven comparison using coll->latinOneCEs.
 *
 * coll   - collator; strength, latinOneCEs, latinOneTableLen and
 *          frenchCollation are read.
 * source - source text.
 * target - target text.
 * The body also uses sLen, tLen and status; their parameter declarations are
 * not visible in this listing. A length of -1 is handled as NUL-terminated
 * text.
 *
 * The primary pass looks each code unit up in `elements`. Contraction
 * specials are resolved through ucol_getLatinOneContraction(). After that
 * pass sLen/tLen are set to the consumed indices. The secondary and tertiary
 * passes each advance `elements` by latinOneTableLen and rescan the text.
 * The French secondary pass scans backwards without contraction lookups.
 * The tail of the function initializes two collIterate objects and calls
 * ucol_strcollRegular().
 */
7945 static inline UCollationResult
7946 ucol_strcollUseLatin1( const UCollator
*coll
,
7947 const UChar
*source
,
7949 const UChar
*target
,
7954 int32_t strength
= coll
->strength
;
7956 int32_t sIndex
= 0, tIndex
= 0;
7957 UChar sChar
= 0, tChar
= 0;
7958 uint32_t sOrder
=0, tOrder
=0;
7960 UBool endOfSource
= FALSE
, endOfTarget
= FALSE
;
7962 uint32_t *elements
= coll
->latinOneCEs
;
7964 UBool haveContractions
= FALSE
; // if we have contractions in our string
7965 // we cannot do French secondary
7967 // Do the primary level
7969 while(sOrder
==0) { // this loop skips primary ignorables
7970 // sOrder=getNextlatinOneCE(source);
7971 if(sLen
==-1) { // handling zero terminated strings
7972 sChar
=source
[sIndex
++];
7977 } else { // handling strings with known length
7982 sChar
=source
[sIndex
++];
7984 if(sChar
&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7985 //fprintf(stderr, "R");
7987 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7989 sOrder
= elements
[sChar
];
7990 if(sOrder
>= UCOL_NOT_FOUND
) { // if we got a special
7991 // specials can basically be either contractions or bail-out signs. If we get anything
7992 // else, we'll bail out anyway
7993 if(getCETag(sOrder
) == CONTRACTION_TAG
) {
7994 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_PRIMARY
, sOrder
, source
, &sIndex
, sLen
);
7995 haveContractions
= TRUE
; // if there are contractions, we cannot do French secondary
7996 // However, if there are contractions in the table, but we always use just one char,
7997 // we might be able to do French. This should be checked out.
7999 if(sOrder
>= UCOL_NOT_FOUND
/*== UCOL_BAIL_OUT_CE*/) {
8000 //fprintf(stderr, "S");
8002 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8007 while(tOrder
==0) { // this loop skips primary ignorables
8008 // tOrder=getNextlatinOneCE(target);
8009 if(tLen
==-1) { // handling zero terminated strings
8010 tChar
=target
[tIndex
++];
8012 if(endOfSource
) { // this is different than source loop,
8013 // as we already know that source loop is done here,
8014 // so we can either finish the primary loop if both
8015 // strings are done or announce the result if only
8016 // target is done. Same below.
8019 return UCOL_GREATER
;
8022 } else { // handling strings with known length
8027 return UCOL_GREATER
;
8030 tChar
=target
[tIndex
++];
8032 if(tChar
&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8033 //fprintf(stderr, "R");
8035 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8037 tOrder
= elements
[tChar
];
8038 if(tOrder
>= UCOL_NOT_FOUND
) {
8039 // Handling specials, see the comments for source
8040 if(getCETag(tOrder
) == CONTRACTION_TAG
) {
8041 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_PRIMARY
, tOrder
, target
, &tIndex
, tLen
);
8042 haveContractions
= TRUE
;
8044 if(tOrder
>= UCOL_NOT_FOUND
/*== UCOL_BAIL_OUT_CE*/) {
8045 //fprintf(stderr, "S");
8047 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8051 if(endOfSource
) { // source is finished, but target is not, say the result.
8055 if(sOrder
== tOrder
) { // if we have same CEs, we continue the loop
8056 sOrder
= 0; tOrder
= 0;
8059 // compare current top bytes
8060 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
8061 // top bytes differ, return difference
8062 if(sOrder
< tOrder
) {
8064 } else if(sOrder
> tOrder
) {
8065 return UCOL_GREATER
;
8067 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8068 // since we must return enum value
8071 // top bytes match, continue with following bytes
8078 // after primary loop, we definitely know the sizes of strings,
8079 // so we set it and use simpler loop for secondaries and tertiaries
8080 sLen
= sIndex
; tLen
= tIndex
;
8081 if(strength
>= UCOL_SECONDARY
) {
8082 // adjust the table beginning
8083 elements
+= coll
->latinOneTableLen
;
8084 endOfSource
= FALSE
; endOfTarget
= FALSE
;
8086 if(coll
->frenchCollation
== UCOL_OFF
) { // non French
8087 // This loop is a simplified copy of primary loop
8088 // at this point we know that whole strings are latin-1, so we don't
8089 // check for that. We also know that we only have contractions as
8091 sIndex
= 0; tIndex
= 0;
8098 sChar
=source
[sIndex
++];
8099 sOrder
= elements
[sChar
];
8100 if(sOrder
> UCOL_NOT_FOUND
) {
8101 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_SECONDARY
, sOrder
, source
, &sIndex
, sLen
);
8110 return UCOL_GREATER
;
8113 tChar
=target
[tIndex
++];
8114 tOrder
= elements
[tChar
];
8115 if(tOrder
> UCOL_NOT_FOUND
) {
8116 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_SECONDARY
, tOrder
, target
, &tIndex
, tLen
);
8123 if(sOrder
== tOrder
) {
8124 sOrder
= 0; tOrder
= 0;
8127 // see primary loop for comments on this
8128 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
8129 if(sOrder
< tOrder
) {
8131 } else if(sOrder
> tOrder
) {
8132 return UCOL_GREATER
;
8140 if(haveContractions
) { // if we have contractions, we have to bail out
8141 // since we don't really know how to handle them here
8143 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8145 // For French, we go backwards
8146 sIndex
= sLen
; tIndex
= tLen
;
8153 sChar
=source
[--sIndex
];
8154 sOrder
= elements
[sChar
];
8155 // don't even look for contractions
8163 return UCOL_GREATER
;
8166 tChar
=target
[--tIndex
];
8167 tOrder
= elements
[tChar
];
8168 // don't even look for contractions
8174 if(sOrder
== tOrder
) {
8175 sOrder
= 0; tOrder
= 0;
8178 // see the primary loop for comments
8179 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
8180 if(sOrder
< tOrder
) {
8182 } else if(sOrder
> tOrder
) {
8183 return UCOL_GREATER
;
8194 if(strength
>= UCOL_TERTIARY
) {
8195 // tertiary loop is the same as secondary (except no French)
8196 elements
+= coll
->latinOneTableLen
;
8197 sIndex
= 0; tIndex
= 0;
8198 endOfSource
= FALSE
; endOfTarget
= FALSE
;
8205 sChar
=source
[sIndex
++];
8206 sOrder
= elements
[sChar
];
8207 if(sOrder
> UCOL_NOT_FOUND
) {
8208 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_TERTIARY
, sOrder
, source
, &sIndex
, sLen
);
8214 return UCOL_EQUAL
; // if both strings are at the end, they are equal
8216 return UCOL_GREATER
;
8219 tChar
=target
[tIndex
++];
8220 tOrder
= elements
[tChar
];
8221 if(tOrder
> UCOL_NOT_FOUND
) {
8222 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_TERTIARY
, tOrder
, target
, &tIndex
, tLen
);
8228 if(sOrder
== tOrder
) {
8229 sOrder
= 0; tOrder
= 0;
8232 if(((sOrder
^tOrder
)&0xff000000)!=0) {
8233 if(sOrder
< tOrder
) {
8235 } else if(sOrder
> tOrder
) {
8236 return UCOL_GREATER
;
// Fallback path: hand the same text to the general comparison.
8247 // Preparing the context objects for iterating over strings
8248 collIterate sColl
, tColl
;
8250 IInit_collIterate(coll
, source
, sLen
, &sColl
);
8251 IInit_collIterate(coll
, target
, tLen
, &tColl
);
8252 return ucol_strcollRegular(&sColl
, &tColl
, status
);
/*
 * ucol_strcollIter: compares the text delivered by two UCharIterators.
 *
 * coll   - collator to use.
 * sIter  - source iterator.
 * tIter  - target iterator.
 * status - in/out error code. The first guard fires when it is NULL, when it
 *          already holds a failure, or when sIter == tIter. It is set to
 *          U_ILLEGAL_ARGUMENT_ERROR when sIter, tIter or coll is NULL.
 *
 * Both collIterate objects are initialized with a NULL string and length -1,
 * then given the iterator and the UCOL_USE_ITERATOR flag. When the
 * UCOL_NORMALIZATION_MODE attribute is UCOL_ON, each iterator is wrapped via
 * unorm_openIter()/unorm_setIter() in UNORM_FCD mode and UCOL_ITER_NORM is
 * cleared. The identical leading portion is skipped, and the position is
 * backed up while ucol_unsafeCP() reports an unsafe code point.
 * ucol_strcollRegular() is called only if *status indicates success, and the
 * normalizing iterators are closed at the end.
 */
8256 U_CAPI UCollationResult U_EXPORT2
8257 ucol_strcollIter( const UCollator
*coll
,
8258 UCharIterator
*sIter
,
8259 UCharIterator
*tIter
,
8260 UErrorCode
*status
) {
8261 if(!status
|| U_FAILURE(*status
) || sIter
== tIter
) {
8264 if(sIter
== NULL
|| tIter
== NULL
|| coll
== NULL
) {
8265 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
8269 UCollationResult result
= UCOL_EQUAL
;
8271 // Preparing the context objects for iterating over strings
8272 collIterate sColl
, tColl
;
8273 // The division for the array length may truncate the array size to
8274 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8275 // for all platforms anyway.
8276 UAlignedMemory stackNormIter1
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
8277 UAlignedMemory stackNormIter2
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
8278 UNormIterator
*sNormIter
= NULL
, *tNormIter
= NULL
;
8280 IInit_collIterate(coll
, NULL
, -1, &sColl
);
8281 sColl
.iterator
= sIter
;
8282 sColl
.flags
|= UCOL_USE_ITERATOR
;
8283 IInit_collIterate(coll
, NULL
, -1, &tColl
);
8284 tColl
.flags
|= UCOL_USE_ITERATOR
;
8285 tColl
.iterator
= tIter
;
// Normalization on: replace each iterator with an FCD-normalizing wrapper
// built in the stack storage above.
// NOTE(review): no check of *status is visible between these calls and the
// first sColl.iterator->next() below - confirm unorm_setIter() cannot hand
// back an unusable iterator here.
8287 if(ucol_getAttribute(coll
, UCOL_NORMALIZATION_MODE
, status
) == UCOL_ON
) {
8288 sNormIter
= unorm_openIter(stackNormIter1
, sizeof(stackNormIter1
), status
);
8289 sColl
.iterator
= unorm_setIter(sNormIter
, sIter
, UNORM_FCD
, status
);
8290 sColl
.flags
&= ~UCOL_ITER_NORM
;
8292 tNormIter
= unorm_openIter(stackNormIter2
, sizeof(stackNormIter2
), status
);
8293 tColl
.iterator
= unorm_setIter(tNormIter
, tIter
, UNORM_FCD
, status
);
8294 tColl
.flags
&= ~UCOL_ITER_NORM
;
8297 UChar32 sChar
= U_SENTINEL
, tChar
= U_SENTINEL
;
// Advance both iterators in lock step while they deliver equal values.
// Reaching U_SENTINEL on the source side while still equal sets UCOL_EQUAL.
8299 while((sChar
= sColl
.iterator
->next(sColl
.iterator
)) ==
8300 (tChar
= tColl
.iterator
->next(tColl
.iterator
))) {
8301 if(sChar
== U_SENTINEL
) {
8302 result
= UCOL_EQUAL
;
// Step back from the first differing position on the side(s) that moved.
8307 if(sChar
== U_SENTINEL
) {
8308 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8311 if(tChar
== U_SENTINEL
) {
8312 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8315 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8316 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8318 if (ucol_unsafeCP((UChar
)sChar
, coll
) || ucol_unsafeCP((UChar
)tChar
, coll
))
8320 // We are stopped in the middle of a contraction.
8321 // Scan backwards through the == part of the string looking for the start of the contraction.
8322 // It doesn't matter which string we scan, since they are the same in this region.
8325 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8326 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8328 while (sChar
!= U_SENTINEL
&& ucol_unsafeCP((UChar
)sChar
, coll
));
8332 if(U_SUCCESS(*status
)) {
8333 result
= ucol_strcollRegular(&sColl
, &tColl
, status
);
8337 if(sNormIter
|| tNormIter
) {
8338 unorm_closeIter(sNormIter
);
8339 unorm_closeIter(tNormIter
);
/*
 * ucol_strcoll: public string comparison entry point.
 *
 * coll         - collator; latinOneUse selects the fast path.
 * source       - source text; a NULL pointer takes the early guard below.
 * sourceLength - source length, or -1 for NUL-terminated text.
 * target       - target text; a NULL pointer takes the early guard below.
 * targetLength - target length, or -1 for NUL-terminated text.
 *
 * The function has no status parameter; a local UErrorCode is created and
 * passed to the worker functions. The identical leading portion of the two
 * strings is measured (equalLength) and, after backing up over code points
 * that ucol_unsafeCP() reports as unsafe, skipped by advancing
 * source/target and reducing positive lengths. The remainder goes to
 * ucol_strcollRegular() when latinOneUse is off or when the first remaining
 * code unit of an explicit-length string has bits in 0xff00; otherwise it
 * goes to ucol_strcollUseLatin1().
 */
8348 /* ucol_strcoll Main public API string comparison function */
8350 U_CAPI UCollationResult U_EXPORT2
8351 ucol_strcoll( const UCollator
*coll
,
8352 const UChar
*source
,
8353 int32_t sourceLength
,
8354 const UChar
*target
,
8355 int32_t targetLength
) {
8357 UErrorCode status
= U_ZERO_ERROR
;
8358 if(source
== NULL
|| target
== NULL
) {
8359 // do not crash, but return. Should have
8360 // status argument to return error.
8363 collIterate sColl
, tColl
;
8365 /* Scan the strings. Find: */
8366 /* The length of any leading portion that is equal */
8367 /* Whether they are exactly equal. (in which case we just return) */
8368 const UChar
*pSrc
= source
;
8369 const UChar
*pTarg
= target
;
8370 int32_t equalLength
;
8372 if (sourceLength
== -1 && targetLength
== -1) {
8373 // Both strings are null terminated.
8374 // Check for them being the same string, and scan through
8375 // any leading equal portion.
8376 if (source
==target
) {
8381 if ( *pSrc
!= *pTarg
|| *pSrc
== 0) {
8387 if (*pSrc
== 0 && *pTarg
== 0) {
8390 equalLength
= pSrc
- source
;
8394 // One or both strings has an explicit length.
8395 /* check if source and target are same strings */
8397 if (source
==target
&& sourceLength
==targetLength
) {
// With a length of -1 these end pointers sit one element before the start;
// the `pSrcEnd<pSrc` / `pTargEnd<pTarg` tests below rely on that.
8400 const UChar
*pSrcEnd
= source
+ sourceLength
;
8401 const UChar
*pTargEnd
= target
+ targetLength
;
8404 // Scan while the strings are bitwise ==, or until one is exhausted.
8406 if (pSrc
== pSrcEnd
|| pTarg
== pTargEnd
) {
8409 if ((*pSrc
== 0 && sourceLength
== -1) || (*pTarg
== 0 && targetLength
== -1)) {
8412 if (*pSrc
!= *pTarg
) {
8418 equalLength
= pSrc
- source
;
8420 // If we made it all the way through both strings, we are done. They are ==
8421 if ((pSrc
==pSrcEnd
|| (pSrcEnd
<pSrc
&& *pSrc
==0)) && /* At end of src string, however it was specified. */
8422 (pTarg
==pTargEnd
|| (pTargEnd
<pTarg
&& *pTarg
==0))) { /* and also at end of dest string */
8426 if (equalLength
> 0) {
8427 /* There is an identical portion at the beginning of the two strings. */
8428 /* If the identical portion ends within a contraction or a combining */
8429 /* character sequence, back up to the start of that sequence. */
8430 pSrc
= source
+ equalLength
; /* point to the first differing chars */
8431 pTarg
= target
+ equalLength
;
// Operator precedence: (a && b) || (c && d). Each side is tested only if it
// is not at its explicit end.
8432 if (pSrc
!= source
+sourceLength
&& ucol_unsafeCP(*pSrc
, coll
) ||
8433 pTarg
!= target
+targetLength
&& ucol_unsafeCP(*pTarg
, coll
))
8435 // We are stopped in the middle of a contraction.
8436 // Scan backwards through the == part of the string looking for the start of the contraction.
8437 // It doesn't matter which string we scan, since they are the same in this region.
8443 while (equalLength
>0 && ucol_unsafeCP(*pSrc
, coll
));
// Skip the shared prefix; only positive (explicit) lengths are adjusted.
8446 source
+= equalLength
;
8447 target
+= equalLength
;
8448 if (sourceLength
> 0) {
8449 sourceLength
-= equalLength
;
8451 if (targetLength
> 0) {
8452 targetLength
-= equalLength
;
8456 if(!coll
->latinOneUse
|| (sourceLength
> 0 && *source
&0xff00) || (targetLength
> 0 && *target
&0xff00)) {
8457 // Preparing the context objects for iterating over strings
8458 IInit_collIterate(coll
, source
, sourceLength
, &sColl
);
8459 IInit_collIterate(coll
, target
, targetLength
, &tColl
);
8460 return ucol_strcollRegular(&sColl
, &tColl
, &status
);
8462 return ucol_strcollUseLatin1(coll
, source
, sourceLength
, target
, targetLength
, &status
);
/*
 * ucol_greater: boolean wrapper around ucol_strcoll().
 *
 * coll, source, sourceLength, target, targetLength are forwarded unchanged
 * to ucol_strcoll().
 * NOTE(review): the value the result is compared against is not visible in
 * this listing; presumably UCOL_GREATER - confirm.
 */
8466 /* convenience function for comparing strings */
8467 U_CAPI UBool U_EXPORT2
8468 ucol_greater( const UCollator
*coll
,
8469 const UChar
*source
,
8470 int32_t sourceLength
,
8471 const UChar
*target
,
8472 int32_t targetLength
)
8474 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
/*
 * ucol_greaterOrEqual: boolean wrapper around ucol_strcoll().
 *
 * coll, source, sourceLength, target, targetLength are forwarded unchanged
 * to ucol_strcoll().
 * NOTE(review): the value the result is compared against is not visible in
 * this listing; presumably "not UCOL_LESS" - confirm.
 */
8478 /* convenience function for comparing strings */
8479 U_CAPI UBool U_EXPORT2
8480 ucol_greaterOrEqual( const UCollator
*coll
,
8481 const UChar
*source
,
8482 int32_t sourceLength
,
8483 const UChar
*target
,
8484 int32_t targetLength
)
8486 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
/*
 * ucol_equal: boolean wrapper around ucol_strcoll().
 *
 * coll, source, sourceLength, target, targetLength are forwarded unchanged
 * to ucol_strcoll().
 * NOTE(review): the value the result is compared against is not visible in
 * this listing; presumably UCOL_EQUAL - confirm.
 */
8490 /* convenience function for comparing strings */
8491 U_CAPI UBool U_EXPORT2
8492 ucol_equal( const UCollator
*coll
,
8493 const UChar
*source
,
8494 int32_t sourceLength
,
8495 const UChar
*target
,
8496 int32_t targetLength
)
8498 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
/*
 * ucol_getLocale: looks up one of the locale names associated with a collator.
 *
 * coll   - collator; validLocale, elements, rb and requestedLocale are read.
 * type   - which locale to report:
 *            ULOC_ACTUAL_LOCALE    - validLocale if set, else
 *                                    ures_getLocale(coll->elements)
 *            ULOC_VALID_LOCALE     - validLocale if set, else
 *                                    ures_getLocale(coll->rb)
 *            ULOC_REQUESTED_LOCALE - requestedLocale
 * status - in/out error code; a NULL pointer or an existing failure takes the
 *          early guard. One visible path sets U_ILLEGAL_ARGUMENT_ERROR;
 *          presumably that is the default case for an unrecognized type -
 *          confirm.
 *
 * `result` starts as NULL and stays NULL when neither alternative in a case
 * applies.
 */
8502 /* returns the locale name the collation data comes from */
8503 U_CAPI
const char * U_EXPORT2
8504 ucol_getLocale(const UCollator
*coll
, ULocDataLocaleType type
, UErrorCode
*status
) {
8505 const char *result
= NULL
;
8506 if(status
== NULL
|| U_FAILURE(*status
)) {
8510 case ULOC_ACTUAL_LOCALE
:
8511 // validLocale is set only if service registration has explicitly set the
8512 // requested and valid locales. if this is the case, the actual locale
8513 // is considered to be the valid locale.
8514 if (coll
->validLocale
!= NULL
) {
8515 result
= coll
->validLocale
;
8516 } else if(coll
->elements
!= NULL
) {
8517 result
= ures_getLocale(coll
->elements
, status
);
8520 case ULOC_VALID_LOCALE
:
8521 if (coll
->validLocale
!= NULL
) {
8522 result
= coll
->validLocale
;
8523 } else if(coll
->rb
!= NULL
) {
8524 result
= ures_getLocale(coll
->rb
, status
);
8527 case ULOC_REQUESTED_LOCALE
:
8528 result
= coll
->requestedLocale
;
8531 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
8536 U_CAPI USet
* U_EXPORT2
8537 ucol_getTailoredSet(const UCollator
*coll
, UErrorCode
*status
)
8539 if(status
== NULL
|| U_FAILURE(*status
)) {
8543 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
8545 UParseError parseError
;
8546 UColTokenParser src
;
8547 int32_t rulesLen
= 0;
8548 const UChar
*rules
= ucol_getRules(coll
, &rulesLen
);
8549 const UChar
*current
= NULL
;
8550 UBool startOfRules
= TRUE
;
8551 // we internally use the C++ class, for the following reasons:
8552 // 1. we need to utilize canonical iterator, which is a C++ only class
8553 // 2. canonical iterator returns UnicodeStrings - USet cannot take them
8554 // 3. USet is internally really UnicodeSet, C is just a wrapper
8555 UnicodeSet
*tailored
= new UnicodeSet();
8556 UnicodeString pattern
;
8557 CanonicalIterator
it("", *status
);
8560 // The idea is to tokenize the rule set. For each non-reset token,
8561 // we add all the canonicaly equivalent FCD sequences
8562 ucol_tok_initTokenList(&src
, rules
, rulesLen
, UCA
, status
);
8563 while ((current
= ucol_tok_parseNextToken(&src
, startOfRules
, &parseError
, status
)) != NULL
) {
8564 startOfRules
= FALSE
;
8565 if(src
.parsedToken
.strength
!= UCOL_TOK_RESET
) {
8566 const UChar
*stuff
= src
.source
+(src
.parsedToken
.charsOffset
);
8567 it
.setSource(UnicodeString(stuff
, src
.parsedToken
.charsLen
), *status
);
8568 pattern
= it
.next();
8569 while(!pattern
.isBogus()) {
8570 if(Normalizer::quickCheck(pattern
, UNORM_FCD
, *status
) != UNORM_NO
) {
8571 tailored
->add(pattern
);
8573 pattern
= it
.next();
8577 ucol_tok_closeTokenList(&src
);
8578 return (USet
*)tailored
;
/*
 * ucol_equals: decides whether two collators are equivalent.
 *
 * Checks visible in this function, in order:
 *   1. the two pointers are compared;
 *   2. every attribute index in [0, UCOL_ATTRIBUTE_COUNT) is read from both
 *      collators with ucol_getAttribute() and compared (a failing status
 *      takes the same branch as a mismatch);
 *   3. the rule strings from ucol_getRules() are compared by length and
 *      then with u_strncmp();
 *   4. otherwise both rule strings are tokenized with
 *      ucol_tok_initTokenList() / ucol_tok_assembleTokenList() and the
 *      resulting token lists are compared entry by entry.
 *
 * source, target: the collators to compare.
 * Declared to return UBool.
 *
 * NOTE(review): the statements inside most branches below (the returns and
 * the updates of `result`) are not part of this excerpt. The existing
 * comments say which branches mean "equal" and which "not equal"; confirm
 * the exact statements against the full file.
 */
8581 U_CAPI UBool U_EXPORT2
8582 ucol_equals(const UCollator
*source
, const UCollator
*target
) {
8583 UErrorCode status
= U_ZERO_ERROR
;
8584 // if pointers are equal, collators are equal
8585 if(source
== target
) {
8588 int32_t i
= 0, j
= 0;
8589 // if any of attributes are different, collators are not equal
8590 for(i
= 0; i
< UCOL_ATTRIBUTE_COUNT
; i
++) {
// Both lookups share one status; a failure is treated like a mismatch.
8591 if(ucol_getAttribute(source
, (UColAttribute
)i
, &status
) != ucol_getAttribute(target
, (UColAttribute
)i
, &status
) || U_FAILURE(status
)) {
8596 int32_t sourceRulesLen
= 0, targetRulesLen
= 0;
8597 const UChar
*sourceRules
= ucol_getRules(source
, &sourceRulesLen
);
8598 const UChar
*targetRules
= ucol_getRules(target
, &targetRulesLen
);
// Cheap textual check first: same length and same UChar contents.
8600 if(sourceRulesLen
== targetRulesLen
&& u_strncmp(sourceRules
, targetRules
, sourceRulesLen
) == 0) {
8601 // all the attributes are equal and the rules are equal - collators are equal
8604 // hard part, need to construct tree from rules and see if they yield the same tailoring
8605 UBool result
= TRUE
;
8606 UParseError parseError
;
8607 UColTokenParser sourceParser
, targetParser
;
8608 int32_t sourceListLen
= 0, targetListLen
= 0;
// Both rule strings are tokenized against the file-level UCA collator.
// NOTE(review): status is not inspected in the visible code after these
// four calls -- confirm a tokenizer failure cannot leave both list lengths
// equal and `result` still TRUE.
8609 ucol_tok_initTokenList(&sourceParser
, sourceRules
, sourceRulesLen
, UCA
, &status
);
8610 ucol_tok_initTokenList(&targetParser
, targetRules
, targetRulesLen
, UCA
, &status
);
8611 sourceListLen
= ucol_tok_assembleTokenList(&sourceParser
, &parseError
, &status
);
8612 targetListLen
= ucol_tok_assembleTokenList(&targetParser
, &parseError
, &status
);
8614 if(sourceListLen
!= targetListLen
) {
8615 // different number of resets
8618 UColToken
*sourceReset
= NULL
, *targetReset
= NULL
;
8619 UChar
*sourceResetString
= NULL
, *targetResetString
= NULL
;
8620 int32_t sourceStringLen
= 0, targetStringLen
= 0;
// For every list header of the source parser, look for a target header
// with the same reset string.
8621 for(i
= 0; i
< sourceListLen
; i
++) {
// A token's packed `source` field is unpacked here as: low 24 bits =
// offset added to the parser's source buffer, bits above that = length.
8622 sourceReset
= sourceParser
.lh
[i
].reset
;
8623 sourceResetString
= sourceParser
.source
+(sourceReset
->source
& 0xFFFFFF);
8624 sourceStringLen
= sourceReset
->source
>> 24;
// NOTE(review): the bound is sourceListLen although targetParser.lh is
// indexed; presumably this code is only reached when the two list lengths
// are equal (the branch structure around it is not visible) -- confirm.
8625 for(j
= 0; j
< sourceListLen
; j
++) {
8626 targetReset
= targetParser
.lh
[j
].reset
;
8627 targetResetString
= targetParser
.source
+(targetReset
->source
& 0xFFFFFF);
8628 targetStringLen
= targetReset
->source
>> 24;
8629 if(sourceStringLen
== targetStringLen
&& (u_strncmp(sourceResetString
, targetResetString
, sourceStringLen
) == 0)) {
// Same reset string: from here sourceReset/targetReset are reused as
// cursors over the two headers' `first` -> `next` token chains, which are
// walked in lockstep.
8630 sourceReset
= sourceParser
.lh
[i
].first
;
8631 targetReset
= targetParser
.lh
[j
].first
;
8632 while(sourceReset
!= NULL
&& targetReset
!= NULL
) {
8633 sourceResetString
= sourceParser
.source
+(sourceReset
->source
& 0xFFFFFF);
8634 sourceStringLen
= sourceReset
->source
>> 24;
8635 targetResetString
= targetParser
.source
+(targetReset
->source
& 0xFFFFFF);
8636 targetStringLen
= targetReset
->source
>> 24;
// The token strings must agree in length and contents.
8637 if(sourceStringLen
!= targetStringLen
|| (u_strncmp(sourceResetString
, targetResetString
, sourceStringLen
) != 0)) {
8641 // probably also need to check the expansions
// `expansion` uses the same offset/length packing as `source`; a zero
// value is tested as "no expansion".
8642 if(sourceReset
->expansion
) {
8643 if(!targetReset
->expansion
) {
8647 // compare expansions
8648 sourceResetString
= sourceParser
.source
+(sourceReset
->expansion
& 0xFFFFFF);
8649 sourceStringLen
= sourceReset
->expansion
>> 24;
8650 targetResetString
= targetParser
.source
+(targetReset
->expansion
& 0xFFFFFF);
8651 targetStringLen
= targetReset
->expansion
>> 24;
8652 if(sourceStringLen
!= targetStringLen
|| (u_strncmp(sourceResetString
, targetResetString
, sourceStringLen
) != 0)) {
// Source token has no expansion: check whether the target token has one.
8658 if(targetReset
->expansion
) {
8663 sourceReset
= sourceReset
->next
;
8664 targetReset
= targetReset
->next
;
// The walk above stops as soon as either cursor is NULL, so the cursors
// differ exactly when one chain is longer than the other.
8666 if(sourceReset
!= targetReset
) { // at least one is not NULL
8667 // there are more tailored elements in one list
8676 // couldn't find the reset anchor, so the collators are not equal
// j reaches the loop bound only when the inner search ran to completion.
8677 if(j
== sourceListLen
) {
// Both token parsers are closed before the function finishes.
8685 ucol_tok_closeTokenList(&sourceParser
);
8686 ucol_tok_closeTokenList(&targetParser
);
8690 #endif /* #if !UCONFIG_NO_COLLATION */