2 *******************************************************************************
3 * Copyright (C) 1996-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * tab size: 8 (not used)
11 * Modification history
13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
19 #include "unicode/utypes.h"
23 #if !UCONFIG_NO_COLLATION
25 #include "unicode/uloc.h"
26 #include "unicode/coll.h"
27 #include "unicode/tblcoll.h"
28 #include "unicode/coleitr.h"
29 #include "unicode/unorm.h"
30 #include "unicode/udata.h"
31 #include "unicode/uchar.h"
32 #include "unicode/caniter.h"
56 /* added by synwee for trie manipulation*/
57 #define STAGE_1_SHIFT_ 10
58 #define STAGE_2_SHIFT_ 4
59 #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
60 #define STAGE_3_MASK_ 0xF
61 #define LAST_BYTE_MASK_ 0xFF
62 #define SECOND_LAST_BYTE_SHIFT_ 8
64 #define ZERO_CC_LIMIT_ 0xC0
66 // static UCA. There is only one. Collators don't use it.
67 // It is referenced only in ucol_initUCA and ucol_cleanup
68 static UCollator
* _staticUCA
= NULL
;
69 // static pointer to udata memory. Inited in ucol_initUCA
70 // used for cleanup in ucol_cleanup
71 static UDataMemory
* UCA_DATA_MEM
= NULL
;
73 // this is static pointer to the normalizer fcdTrieIndex
74 // it is always the same between calls to u_cleanup
75 // and therefore writing to it is not synchronized.
76 // It is cleaned in ucol_cleanup
77 static const uint16_t *fcdTrieIndex
=NULL
;
80 static UBool U_CALLCONV
81 isAcceptableUCA(void * /*context*/,
82 const char * /*type*/, const char * /*name*/,
83 const UDataInfo
*pInfo
){
84 /* context, type & name are intentionally not used */
85 if( pInfo
->size
>=20 &&
86 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
87 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
88 pInfo
->dataFormat
[0]==UCA_DATA_FORMAT_0
&& /* dataFormat="UCol" */
89 pInfo
->dataFormat
[1]==UCA_DATA_FORMAT_1
&&
90 pInfo
->dataFormat
[2]==UCA_DATA_FORMAT_2
&&
91 pInfo
->dataFormat
[3]==UCA_DATA_FORMAT_3
&&
92 pInfo
->formatVersion
[0]==UCA_FORMAT_VERSION_0
&&
93 pInfo
->formatVersion
[1]>=UCA_FORMAT_VERSION_1
// &&
94 //pInfo->formatVersion[1]==UCA_FORMAT_VERSION_1 &&
95 //pInfo->formatVersion[2]==UCA_FORMAT_VERSION_2 && // Too harsh
96 //pInfo->formatVersion[3]==UCA_FORMAT_VERSION_3 && // Too harsh
98 UVersionInfo UCDVersion
;
99 u_getUnicodeVersion(UCDVersion
);
100 if(pInfo
->dataVersion
[0]==UCDVersion
[0] &&
101 pInfo
->dataVersion
[1]==UCDVersion
[1]) { // &&
102 //pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2] &&
103 //pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]) {
114 static int32_t U_CALLCONV
115 _getFoldingOffset(uint32_t data
) {
116 return (int32_t)(data
&0xFFFFFF);
122 inline void IInit_collIterate(const UCollator
*collator
, const UChar
*sourceString
,
123 int32_t sourceLen
, collIterate
*s
) {
124 (s
)->string
= (s
)->pos
= (UChar
*)(sourceString
);
127 if (sourceLen
>= 0) {
128 s
->flags
|= UCOL_ITER_HASLEN
;
129 (s
)->endp
= (UChar
*)sourceString
+sourceLen
;
132 /* change to enable easier checking for end of string for fcdPosition */
135 (s
)->CEpos
= (s
)->toReturn
= (s
)->CEs
;
136 (s
)->writableBuffer
= (s
)->stackWritableBuffer
;
137 (s
)->writableBufSize
= UCOL_WRITABLE_BUFFER_SIZE
;
138 (s
)->coll
= (collator
);
139 (s
)->fcdPosition
= 0;
140 if(collator
->normalizationMode
== UCOL_ON
) {
141 (s
)->flags
|= UCOL_ITER_NORM
;
143 if(collator
->hiraganaQ
== UCOL_ON
&& collator
->strength
>= UCOL_QUATERNARY
) {
144 (s
)->flags
|= UCOL_HIRAGANA_Q
;
146 (s
)->iterator
= NULL
;
147 //(s)->iteratorIndex = 0;
150 U_CAPI
void U_EXPORT2
151 uprv_init_collIterate(const UCollator
*collator
, const UChar
*sourceString
,
152 int32_t sourceLen
, collIterate
*s
){
153 /* Out-of-line version for use from other files. */
154 IInit_collIterate(collator
, sourceString
, sourceLen
, s
);
159 * Backup the state of the collIterate struct data
160 * @param data collIterate to backup
161 * @param backup storage
164 inline void backupState(const collIterate
*data
, collIterateState
*backup
)
166 backup
->fcdPosition
= data
->fcdPosition
;
167 backup
->flags
= data
->flags
;
168 backup
->origFlags
= data
->origFlags
;
169 backup
->pos
= data
->pos
;
170 backup
->bufferaddress
= data
->writableBuffer
;
171 backup
->buffersize
= data
->writableBufSize
;
172 if(data
->iterator
!= NULL
) {
173 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
174 backup
->iteratorIndex
= data
->iterator
->getState(data
->iterator
);
175 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
176 backup
->iteratorMove
= 0;
177 if(backup
->iteratorIndex
== UITER_NO_STATE
) {
178 while((backup
->iteratorIndex
= data
->iterator
->getState(data
->iterator
)) == UITER_NO_STATE
) {
179 backup
->iteratorMove
++;
180 data
->iterator
->move(data
->iterator
, -1, UITER_CURRENT
);
182 data
->iterator
->move(data
->iterator
, backup
->iteratorMove
, UITER_CURRENT
);
188 * Loads the state into the collIterate struct data
189 * @param data collIterate to backup
190 * @param backup storage
191 * @param forwards boolean to indicate if forwards iteration is used,
192 * false indicates backwards iteration
195 inline void loadState(collIterate
*data
, const collIterateState
*backup
,
198 UErrorCode status
= U_ZERO_ERROR
;
199 data
->flags
= backup
->flags
;
200 data
->origFlags
= backup
->origFlags
;
201 if(data
->iterator
!= NULL
) {
202 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
203 data
->iterator
->setState(data
->iterator
, backup
->iteratorIndex
, &status
);
204 if(backup
->iteratorMove
!= 0) {
205 data
->iterator
->move(data
->iterator
, backup
->iteratorMove
, UITER_CURRENT
);
208 data
->pos
= backup
->pos
;
209 if ((data
->flags
& UCOL_ITER_INNORMBUF
) &&
210 data
->writableBuffer
!= backup
->bufferaddress
) {
212 this is when a new buffer has been reallocated and we'll have to
213 calculate the new position.
214 note the new buffer has to contain the contents of the old buffer.
217 data
->pos
= data
->writableBuffer
+
218 (data
->pos
- backup
->bufferaddress
);
221 /* backwards direction */
222 uint32_t temp
= backup
->buffersize
-
223 (data
->pos
- backup
->bufferaddress
);
224 data
->pos
= data
->writableBuffer
+ (data
->writableBufSize
- temp
);
227 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
229 this is a little tricky.
230 if we are initially not in the normalization buffer, even if we
231 normalize in the later stage, the data in the buffer will be
232 ignored, since we skip back up to the data string.
233 however if we are already in the normalization buffer, any
234 further normalization will pull data into the normalization
235 buffer and modify the fcdPosition.
236 since we are keeping the data in the buffer for use, the
237 fcdPosition can not be reverted back.
240 data
->fcdPosition
= backup
->fcdPosition
;
247 * Checks for a collIterate being positioned at the end of
252 inline UBool
collIter_eos(collIterate
*s
) {
253 if(s
->flags
& UCOL_USE_ITERATOR
) {
254 return !(s
->iterator
->hasNext(s
->iterator
));
256 if ((s
->flags
& UCOL_ITER_HASLEN
) == 0 && *s
->pos
!= 0) {
257 // Null terminated string, but not at null, so not at end.
258 // Whether in main or normalization buffer doesn't matter.
262 // String with length. Can't be in normalization buffer, which is always
264 if (s
->flags
& UCOL_ITER_HASLEN
) {
265 return (s
->pos
== s
->endp
);
268 // We are at a null termination, could be either normalization buffer or main string.
269 if ((s
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
270 // At null at end of main string.
274 // At null at end of normalization buffer. Need to check whether there are
275 // any characters left in the main buffer.
276 if(s
->origFlags
& UCOL_USE_ITERATOR
) {
277 return !(s
->iterator
->hasNext(s
->iterator
));
278 } else if ((s
->origFlags
& UCOL_ITER_HASLEN
) == 0) {
279 // Null terminated main string. fcdPosition is the 'return' position into main buf.
280 return (*s
->fcdPosition
== 0);
283 // Main string with an end pointer.
284 return s
->fcdPosition
== s
->endp
;
290 * Checks for a collIterate being positioned at the start of
295 inline UBool
collIter_bos(collIterate
*source
) {
296 // if we're going backwards, we need to know whether there is more in the
297 // iterator, even if we are in the side buffer
298 if(source
->flags
& UCOL_USE_ITERATOR
|| source
->origFlags
& UCOL_USE_ITERATOR
) {
299 return !source
->iterator
->hasPrevious(source
->iterator
);
301 if (source
->pos
<= source
->string
||
302 ((source
->flags
& UCOL_ITER_INNORMBUF
) &&
303 *(source
->pos
- 1) == 0 && source
->fcdPosition
== NULL
)) {
310 inline UBool
collIter_SimpleBos(collIterate
*source
) {
311 // if we're going backwards, we need to know whether there is more in the
312 // iterator, even if we are in the side buffer
313 if(source
->flags
& UCOL_USE_ITERATOR
|| source
->origFlags
& UCOL_USE_ITERATOR
) {
314 return !source
->iterator
->hasPrevious(source
->iterator
);
316 if (source
->pos
== source
->string
) {
321 //return (data->pos == data->string) ||
325 * Checks and free writable buffer if it is not the original stack buffer
326 * in collIterate. This function does not reassign the writable buffer.
327 * @param data collIterate struct to determine and free the writable buffer
330 inline void freeHeapWritableBuffer(collIterate
*data
)
332 if (data
->writableBuffer
!= data
->stackWritableBuffer
) {
333 uprv_free(data
->writableBuffer
);
338 /****************************************************************************/
339 /* Following are the open/close functions */
341 /****************************************************************************/
343 tryOpeningFromRules(UResourceBundle
*collElem
, UErrorCode
*status
) {
344 int32_t rulesLen
= 0;
345 const UChar
*rules
= ures_getStringByKey(collElem
, "Sequence", &rulesLen
, status
);
346 return ucol_openRules(rules
, rulesLen
, UCOL_DEFAULT
, UCOL_DEFAULT
, NULL
, status
);
352 ucol_open(const char *loc
,
355 UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN
);
356 UTRACE_DATA1(UTRACE_INFO
, "locale = \"%s\"", loc
);
357 UCollator
*result
= NULL
;
360 #if !UCONFIG_NO_SERVICE
361 result
= Collator::createUCollator(loc
, status
);
365 result
= ucol_open_internal(loc
, status
);
367 UTRACE_EXIT_PTR_STATUS(result
, *status
);
374 ucol_open_internal(const char *loc
,
377 const UCollator
* UCA
= ucol_initUCA(status
);
380 if(U_FAILURE(*status
)) return 0;
384 UCollator
*result
= NULL
;
385 UResourceBundle
*b
= ures_open(U_ICUDATA_COLL
, loc
, status
);
387 /* we try to find stuff from keyword */
388 UResourceBundle
*collations
= ures_getByKey(b
, "collations", NULL
, status
);
389 UResourceBundle
*collElem
= NULL
;
391 // if there is a keyword, we pick it up and try to get elements
392 if(!uloc_getKeywordValue(loc
, "collation", keyBuffer
, 256, status
)) {
393 // no keyword. we try to find the default setting, which will give us the keyword value
394 UResourceBundle
*defaultColl
= ures_getByKeyWithFallback(collations
, "default", NULL
, status
);
395 if(U_SUCCESS(*status
)) {
396 int32_t defaultKeyLen
= 0;
397 const UChar
*defaultKey
= ures_getString(defaultColl
, &defaultKeyLen
, status
);
398 u_UCharsToChars(defaultKey
, keyBuffer
, defaultKeyLen
);
399 keyBuffer
[defaultKeyLen
] = 0;
401 *status
= U_INTERNAL_PROGRAM_ERROR
;
404 ures_close(defaultColl
);
406 collElem
= ures_getByKeyWithFallback(collations
, keyBuffer
, collElem
, status
);
408 UResourceBundle
*binary
= NULL
;
409 UErrorCode binaryStatus
= U_ZERO_ERROR
;
411 if(*status
== U_MISSING_RESOURCE_ERROR
) { /* We didn't find the tailoring data, we fallback to the UCA */
412 *status
= U_USING_DEFAULT_WARNING
;
413 result
= ucol_initCollator(UCA
->image
, result
, UCA
, status
);
414 // if we use UCA, real locale is root
415 result
->rb
= ures_open(U_ICUDATA_COLL
, "", status
);
416 result
->elements
= ures_open(U_ICUDATA_COLL
, "", status
);
417 if(U_FAILURE(*status
)) {
421 result
->hasRealData
= FALSE
;
422 } else if(U_SUCCESS(*status
)) {
423 binary
= ures_getByKey(collElem
, "%%CollationBin", NULL
, &binaryStatus
);
425 if(binaryStatus
== U_MISSING_RESOURCE_ERROR
) { /* we didn't find the binary image, we should use the rules */
427 result
= tryOpeningFromRules(collElem
, status
);
428 if(U_FAILURE(*status
)) {
431 } else if(U_SUCCESS(*status
)) { /* otherwise, we'll pick a collation data that exists */
433 const uint8_t *inData
= ures_getBinary(binary
, &len
, status
);
434 UCATableHeader
*colData
= (UCATableHeader
*)inData
;
435 if(uprv_memcmp(colData
->UCAVersion
, UCA
->image
->UCAVersion
, sizeof(UVersionInfo
)) != 0 ||
436 uprv_memcmp(colData
->UCDVersion
, UCA
->image
->UCDVersion
, sizeof(UVersionInfo
)) != 0 ||
437 colData
->version
[0] != UCOL_BUILDER_VERSION
) {
438 *status
= U_DIFFERENT_UCA_VERSION
;
439 result
= tryOpeningFromRules(collElem
, status
);
441 if(U_FAILURE(*status
)){
444 if((uint32_t)len
> (paddedsize(sizeof(UCATableHeader
)) + paddedsize(sizeof(UColOptionSet
)))) {
445 result
= ucol_initCollator((const UCATableHeader
*)inData
, result
, UCA
, status
);
446 if(U_FAILURE(*status
)){
449 result
->hasRealData
= TRUE
;
451 result
= ucol_initCollator(UCA
->image
, result
, UCA
, status
);
452 ucol_setOptionsFromHeader(result
, (UColOptionSet
*)(inData
+((const UCATableHeader
*)inData
)->options
), status
);
453 if(U_FAILURE(*status
)){
456 result
->hasRealData
= FALSE
;
458 result
->freeImageOnClose
= FALSE
;
462 result
->elements
= collElem
;
463 } else { /* There is another error, and we're just gonna clean up */
466 ures_close(collElem
);
467 ures_close(collations
);
472 result
->validLocale
= NULL
; // default is to use rb info
475 loc
= ures_getLocale(result
->rb
, status
);
477 result
->requestedLocale
= (char *)uprv_malloc((uprv_strlen(loc
)+1)*sizeof(char));
479 if (result
->requestedLocale
== NULL
) {
480 *status
= U_MEMORY_ALLOCATION_ERROR
;
481 ures_close(b
); // ??? appears needed
482 ures_close(collElem
);
483 ures_close(collations
);
484 ures_close(binary
); // ??? appears needed
487 uprv_strcpy(result
->requestedLocale
, loc
);
490 ures_close(collations
); //??? we have to decide on that. Probably affects something :)
495 U_CAPI
void U_EXPORT2
496 ucol_setReqValidLocales(UCollator
*coll
, char *requestedLocaleToAdopt
, char *validLocaleToAdopt
)
499 if (coll
->validLocale
) {
500 uprv_free(coll
->validLocale
);
502 coll
->validLocale
= validLocaleToAdopt
;
503 if (coll
->requestedLocale
) { // should always have
504 uprv_free(coll
->requestedLocale
);
506 coll
->requestedLocale
= requestedLocaleToAdopt
;
510 U_CAPI
void U_EXPORT2
511 ucol_close(UCollator
*coll
)
513 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE
);
514 UTRACE_DATA1(UTRACE_INFO
, "coll = %p", coll
);
516 // these are always owned by each UCollator struct,
517 // so we always free them
518 if(coll
->validLocale
!= NULL
) {
519 uprv_free(coll
->validLocale
);
521 if(coll
->requestedLocale
!= NULL
) {
522 uprv_free(coll
->requestedLocale
);
525 /* Here, it would be advisable to close: */
526 /* - UData for UCA (unless we stuff it in the root resb */
527 /* Again, do we need additional housekeeping... HMMM! */
528 UTRACE_DATA1(UTRACE_INFO
, "coll->freeOnClose: %d", coll
->freeOnClose
);
529 if(coll
->freeOnClose
){
530 /* for safeClone, if freeOnClose is FALSE,
531 don't free the other instance data */
532 if(coll
->freeOptionsOnClose
!= FALSE
) {
533 if(coll
->options
!= NULL
) {
534 uprv_free(coll
->options
);
537 if(coll
->mapping
!= NULL
) {
538 /*ucmpe32_close(coll->mapping);*/
539 uprv_free(coll
->mapping
);
541 if(coll
->rules
!= NULL
&& coll
->freeRulesOnClose
) {
542 uprv_free((UChar
*)coll
->rules
);
544 if(coll
->rb
!= NULL
) { /* pointing to read-only memory */
545 ures_close(coll
->rb
);
547 if(coll
->freeImageOnClose
== TRUE
) {
548 uprv_free((UCATableHeader
*)coll
->image
);
550 if(coll
->elements
!= NULL
) {
551 ures_close(coll
->elements
);
553 if(coll
->latinOneCEs
!= NULL
) {
554 uprv_free(coll
->latinOneCEs
);
562 U_CAPI UCollator
* U_EXPORT2
563 ucol_openRules( const UChar
*rules
,
565 UColAttributeValue normalizationMode
,
566 UCollationStrength strength
,
567 UParseError
*parseError
,
570 uint32_t listLen
= 0;
572 UColAttributeValue norm
;
575 if(status
== NULL
|| U_FAILURE(*status
)){
580 if (U_FAILURE(*status
)) {
584 if(rulesLength
< -1 || (rules
== NULL
&& rulesLength
!= 0)) {
585 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
589 if(rulesLength
== -1) {
590 rulesLength
= u_strlen(rules
);
593 if(parseError
== NULL
){
597 switch(normalizationMode
) {
601 norm
= normalizationMode
;
604 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
608 UCollator
*UCA
= ucol_initUCA(status
);
610 if(U_FAILURE(*status
)){
614 ucol_tok_initTokenList(&src
, rules
, rulesLength
, UCA
, status
);
615 listLen
= ucol_tok_assembleTokenList(&src
,parseError
, status
);
617 if(U_FAILURE(*status
)) {
618 /* if status is U_ILLEGAL_ARGUMENT_ERROR, src->current points at the offending option */
619 /* if status is U_INVALID_FORMAT_ERROR, src->current points after the problematic part of the rules */
620 /* so something might be done here... or on lower level */
622 if(*status
== U_ILLEGAL_ARGUMENT_ERROR
) {
623 fprintf(stderr
, "bad option starting at offset %i\n", src
.current
-src
.source
);
625 fprintf(stderr
, "invalid rule just before offset %i\n", src
.current
-src
.source
);
628 ucol_tok_closeTokenList(&src
);
631 UCollator
*result
= NULL
;
632 UCATableHeader
*table
= NULL
;
634 if(src
.resultLen
> 0 || src
.removeSet
!= NULL
) { /* we have a set of rules, let's make something of it */
635 /* also, if we wanted to remove some contractions, we should make a tailoring */
636 table
= ucol_assembleTailoringTable(&src
, status
);
637 if(U_SUCCESS(*status
)) {
639 table
->version
[0] = UCOL_BUILDER_VERSION
;
640 // no tailoring information on this level
641 table
->version
[1] = table
->version
[2] = table
->version
[3] = 0;
643 u_getUnicodeVersion(table
->UCDVersion
);
645 uprv_memcpy(table
->UCAVersion
, UCA
->image
->UCAVersion
, sizeof(UVersionInfo
));
646 result
= ucol_initCollator(table
, 0, UCA
, status
);
647 result
->hasRealData
= TRUE
;
648 result
->freeImageOnClose
= TRUE
;
650 } else { /* no rules, but no error either */
651 // must be only options
652 // We will init the collator from UCA
653 result
= ucol_initCollator(UCA
->image
, 0, UCA
, status
);
654 // And set only the options
655 UColOptionSet
*opts
= (UColOptionSet
*)uprv_malloc(sizeof(UColOptionSet
));
658 *status
= U_MEMORY_ALLOCATION_ERROR
;
661 uprv_memcpy(opts
, src
.opts
, sizeof(UColOptionSet
));
662 ucol_setOptionsFromHeader(result
, opts
, status
);
663 result
->freeOptionsOnClose
= TRUE
;
664 result
->hasRealData
= FALSE
;
665 result
->freeImageOnClose
= FALSE
;
668 if(U_SUCCESS(*status
)) {
670 result
->dataInfo
.dataVersion
[0] = UCOL_BUILDER_VERSION
;
671 if(rulesLength
> 0) {
672 newRules
= (UChar
*)uprv_malloc((rulesLength
+1)*U_SIZEOF_UCHAR
);
674 if (newRules
== NULL
) {
675 *status
= U_MEMORY_ALLOCATION_ERROR
;
678 uprv_memcpy(newRules
, rules
, rulesLength
*U_SIZEOF_UCHAR
);
679 newRules
[rulesLength
]=0;
680 result
->rules
= newRules
;
681 result
->rulesLength
= rulesLength
;
682 result
->freeRulesOnClose
= TRUE
;
685 result
->elements
= NULL
;
686 result
->validLocale
= NULL
;
687 result
->requestedLocale
= NULL
;
688 ucol_setAttribute(result
, UCOL_STRENGTH
, strength
, status
);
689 ucol_setAttribute(result
, UCOL_NORMALIZATION_MODE
, norm
, status
);
702 ucol_tok_closeTokenList(&src
);
707 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
708 /* you should be able to get the binary chunk to write out... Doesn't look very full now */
709 U_CAPI
uint8_t* U_EXPORT2
710 ucol_cloneRuleData(const UCollator
*coll
, int32_t *length
, UErrorCode
*status
)
712 uint8_t *result
= NULL
;
713 if(U_FAILURE(*status
)) {
716 if(coll
->hasRealData
== TRUE
) {
717 *length
= coll
->image
->size
;
718 result
= (uint8_t *)uprv_malloc(*length
);
720 if (result
== NULL
) {
721 *status
= U_MEMORY_ALLOCATION_ERROR
;
724 uprv_memcpy(result
, coll
->image
, *length
);
726 *length
= (int32_t)(paddedsize(sizeof(UCATableHeader
))+paddedsize(sizeof(UColOptionSet
)));
727 result
= (uint8_t *)uprv_malloc(*length
);
729 if (result
== NULL
) {
730 *status
= U_MEMORY_ALLOCATION_ERROR
;
734 /* build the UCATableHeader with minimal entries */
735 /* do not copy the header from the UCA file because its values are wrong! */
736 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
738 /* reset everything */
739 uprv_memset(result
, 0, *length
);
741 /* set the tailoring-specific values */
742 UCATableHeader
*myData
= (UCATableHeader
*)result
;
743 myData
->size
= *length
;
745 /* offset for the options, the only part of the data that is present after the header */
746 myData
->options
= sizeof(UCATableHeader
);
748 /* need to always set the expansion value for an upper bound of the options */
749 myData
->expansion
= myData
->options
+ sizeof(UColOptionSet
);
751 myData
->magic
= UCOL_HEADER_MAGIC
;
752 myData
->isBigEndian
= U_IS_BIG_ENDIAN
;
753 myData
->charSetFamily
= U_CHARSET_FAMILY
;
755 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
756 uprv_memcpy(myData
->version
, coll
->image
->version
, sizeof(UVersionInfo
));
758 uprv_memcpy(myData
->UCAVersion
, coll
->image
->UCAVersion
, sizeof(UVersionInfo
));
759 uprv_memcpy(myData
->UCDVersion
, coll
->image
->UCDVersion
, sizeof(UVersionInfo
));
760 uprv_memcpy(myData
->formatVersion
, coll
->image
->formatVersion
, sizeof(UVersionInfo
));
761 myData
->jamoSpecial
= coll
->image
->jamoSpecial
;
763 /* copy the collator options */
764 uprv_memcpy(result
+paddedsize(sizeof(UCATableHeader
)), coll
->options
, sizeof(UColOptionSet
));
769 void ucol_setOptionsFromHeader(UCollator
* result
, UColOptionSet
* opts
, UErrorCode
*status
) {
770 if(U_FAILURE(*status
)) {
773 result
->caseFirst
= (UColAttributeValue
)opts
->caseFirst
;
774 result
->caseLevel
= (UColAttributeValue
)opts
->caseLevel
;
775 result
->frenchCollation
= (UColAttributeValue
)opts
->frenchCollation
;
776 result
->normalizationMode
= (UColAttributeValue
)opts
->normalizationMode
;
777 result
->strength
= (UColAttributeValue
)opts
->strength
;
778 result
->variableTopValue
= opts
->variableTopValue
;
779 result
->alternateHandling
= (UColAttributeValue
)opts
->alternateHandling
;
780 result
->hiraganaQ
= (UColAttributeValue
)opts
->hiraganaQ
;
781 result
->numericCollation
= (UColAttributeValue
)opts
->numericCollation
;
783 result
->caseFirstisDefault
= TRUE
;
784 result
->caseLevelisDefault
= TRUE
;
785 result
->frenchCollationisDefault
= TRUE
;
786 result
->normalizationModeisDefault
= TRUE
;
787 result
->strengthisDefault
= TRUE
;
788 result
->variableTopValueisDefault
= TRUE
;
789 result
->hiraganaQisDefault
= TRUE
;
790 result
->numericCollationisDefault
= TRUE
;
792 ucol_updateInternalState(result
, status
);
794 result
->options
= opts
;
798 // doesn't look like anybody is using this
799 void ucol_putOptionsToHeader(UCollator
* result
, UColOptionSet
* opts
, UErrorCode
*status
) {
800 if(U_FAILURE(*status
)) {
803 opts
->caseFirst
= result
->caseFirst
;
804 opts
->caseLevel
= result
->caseLevel
;
805 opts
->frenchCollation
= result
->frenchCollation
;
806 opts
->normalizationMode
= result
->normalizationMode
;
807 opts
->strength
= result
->strength
;
808 opts
->variableTopValue
= result
->variableTopValue
;
809 opts
->alternateHandling
= result
->alternateHandling
;
810 opts
->hiraganaQ
= result
->hiraganaQ
;
811 opts
->numericCollation
= result
->numericCollation
;
817 * Approximate determination if a character is at a contraction end.
818 * Guaranteed to be TRUE if a character is at the end of a contraction,
819 * otherwise it is not deterministic.
820 * @param c character to be determined
821 * @param coll collator
824 inline UBool
ucol_contractionEndCP(UChar c
, const UCollator
*coll
) {
825 if (UTF_IS_TRAIL(c
)) {
829 if (c
< coll
->minContrEndCP
) {
835 if (hash
>= UCOL_UNSAFECP_TABLE_SIZE
*8) {
836 hash
= (hash
& UCOL_UNSAFECP_TABLE_MASK
) + 256;
838 htbyte
= coll
->contrEndCP
[hash
>>3];
839 return (((htbyte
>> (hash
& 7)) & 1) == 1);
845 * i_getCombiningClass()
846 * A fast, at least partly inline version of u_getCombiningClass()
847 * This is a candidate for further optimization. Used heavily
848 * in contraction processing.
851 inline uint8_t i_getCombiningClass(UChar c
, const UCollator
*coll
) {
853 if (c
>= 0x300 && ucol_unsafeCP(c
, coll
)) {
854 sCC
= u_getCombiningClass(c
);
860 UCollator
* ucol_initCollator(const UCATableHeader
*image
, UCollator
*fillIn
, const UCollator
*UCA
, UErrorCode
*status
) {
862 UCollator
*result
= fillIn
;
863 if(U_FAILURE(*status
) || image
== NULL
) {
868 result
= (UCollator
*)uprv_malloc(sizeof(UCollator
));
870 *status
= U_MEMORY_ALLOCATION_ERROR
;
873 result
->freeOnClose
= TRUE
;
875 result
->freeOnClose
= FALSE
;
878 result
->image
= image
;
879 const uint8_t *mapping
= (uint8_t*)result
->image
+result
->image
->mappingPosition
;
880 /*CompactEIntArray *newUCAmapping = ucmpe32_openFromData(&mapping, status);*/
881 UTrie
*newUCAmapping
= (UTrie
*)uprv_malloc(sizeof(UTrie
));
882 if(newUCAmapping
!= NULL
) {
883 utrie_unserialize(newUCAmapping
, mapping
, result
->image
->endExpansionCE
- result
->image
->mappingPosition
, status
);
885 *status
= U_MEMORY_ALLOCATION_ERROR
;
886 if(result
->freeOnClose
== TRUE
) {
892 if(U_SUCCESS(*status
)) {
893 result
->mapping
= newUCAmapping
;
895 if(result
->freeOnClose
== TRUE
) {
899 uprv_free(newUCAmapping
);
903 /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
904 result
->latinOneMapping
= UTRIE_GET32_LATIN1(result
->mapping
);
905 result
->contractionCEs
= (uint32_t*)((uint8_t*)result
->image
+result
->image
->contractionCEs
);
906 result
->contractionIndex
= (UChar
*)((uint8_t*)result
->image
+result
->image
->contractionIndex
);
907 result
->expansion
= (uint32_t*)((uint8_t*)result
->image
+result
->image
->expansion
);
909 result
->options
= (UColOptionSet
*)((uint8_t*)result
->image
+result
->image
->options
);
910 result
->freeOptionsOnClose
= FALSE
;
913 result
->caseFirst
= (UColAttributeValue
)result
->options
->caseFirst
;
914 result
->caseLevel
= (UColAttributeValue
)result
->options
->caseLevel
;
915 result
->frenchCollation
= (UColAttributeValue
)result
->options
->frenchCollation
;
916 result
->normalizationMode
= (UColAttributeValue
)result
->options
->normalizationMode
;
917 result
->strength
= (UColAttributeValue
)result
->options
->strength
;
918 result
->variableTopValue
= result
->options
->variableTopValue
;
919 result
->alternateHandling
= (UColAttributeValue
)result
->options
->alternateHandling
;
920 result
->hiraganaQ
= (UColAttributeValue
)result
->options
->hiraganaQ
;
921 result
->numericCollation
= (UColAttributeValue
)result
->options
->numericCollation
;
923 result
->caseFirstisDefault
= TRUE
;
924 result
->caseLevelisDefault
= TRUE
;
925 result
->frenchCollationisDefault
= TRUE
;
926 result
->normalizationModeisDefault
= TRUE
;
927 result
->strengthisDefault
= TRUE
;
928 result
->variableTopValueisDefault
= TRUE
;
929 result
->alternateHandlingisDefault
= TRUE
;
930 result
->hiraganaQisDefault
= TRUE
;
931 result
->numericCollationisDefault
= TRUE
;
933 result
->scriptOrder
= NULL
;
935 result
->rules
= NULL
;
936 result
->rulesLength
= 0;
938 /* get the version info from UCATableHeader and populate the Collator struct*/
939 result
->dataInfo
.dataVersion
[0] = result
->image
->version
[0]; /* UCA Builder version*/
940 result
->dataInfo
.dataVersion
[1] = result
->image
->version
[1]; /* UCA Tailoring rules version*/
942 result
->unsafeCP
= (uint8_t *)result
->image
+ result
->image
->unsafeCP
;
943 result
->minUnsafeCP
= 0;
944 for (c
=0; c
<0x300; c
++) { // Find the smallest unsafe char.
945 if (ucol_unsafeCP(c
, result
)) break;
947 result
->minUnsafeCP
= c
;
949 result
->contrEndCP
= (uint8_t *)result
->image
+ result
->image
->contrEndCP
;
950 result
->minContrEndCP
= 0;
951 for (c
=0; c
<0x300; c
++) { // Find the Contraction-ending char.
952 if (ucol_contractionEndCP(c
, result
)) break;
954 result
->minContrEndCP
= c
;
956 /* max expansion tables */
957 result
->endExpansionCE
= (uint32_t*)((uint8_t*)result
->image
+
958 result
->image
->endExpansionCE
);
959 result
->lastEndExpansionCE
= result
->endExpansionCE
+
960 result
->image
->endExpansionCECount
- 1;
961 result
->expansionCESize
= (uint8_t*)result
->image
+
962 result
->image
->expansionCESize
;
965 //result->errorCode = *status;
967 result
->latinOneCEs
= NULL
;
969 result
->latinOneRegenTable
= FALSE
;
970 result
->latinOneFailed
= FALSE
;
973 ucol_updateInternalState(result
, status
);
979 /* new Mark's code */
982 * For generation of Implicit CEs
985 * Cleaned up so that changes can be made more easily.
987 # First Implicit: E26A792D
988 # Last Implicit: E3DC70C0
989 # First CJK: E0030300
991 # First CJK_A: E0A9DF00
992 # Last CJK_A: E0DE3100
994 /* Following is a port of Mark's code for new treatment of implicits.
995 * It is positioned here, since ucol_initUCA need to initialize the
996 * variables below according to the data in the fractional UCA.
1001 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
1002 * b) bump any non-CJK characters by 10FFFF.
1003 * The relevant blocks are:
1004 * A: 4E00..9FFF; CJK Unified Ideographs
1005 * F900..FAFF; CJK Compatibility Ideographs
1006 * B: 3400..4DBF; CJK Unified Ideographs Extension A
1007 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
1009 * no new B characters are allocated between 4E00 and FAFF, and
1010 * no new A characters are outside of this range,
1011 * (very high probability) this simple code will work.
1012 * The reordered blocks are:
1014 * Block2 is CJK_COMPAT_USED
1017 * Any other CJK gets its normal code point
1018 * Any non-CJK gets +10FFFF
1019 * When we reorder Block1, we make sure that it is at the very start,
1020 * so that it will use a 3-byte form.
1021 * Warning: we only pick up the compatibility characters that are
1022 * NOT decomposed, so that block is smaller!
1026 static const UChar32
1027 NON_CJK_OFFSET
= 0x110000,
1028 UCOL_MAX_INPUT
= 0x220001; // 2 * Unicode range + 2
1031 * Precomputed by constructor
1034 final3Multiplier
= 0,
1035 final4Multiplier
= 0,
1048 static const UChar32
1050 CJK_LIMIT
= 0x9FFF+1,
1051 CJK_COMPAT_USED_BASE
= 0xFA0E,
1052 CJK_COMPAT_USED_LIMIT
= 0xFA2F+1,
1053 CJK_A_BASE
= 0x3400,
1054 CJK_A_LIMIT
= 0x4DBF+1,
1055 CJK_B_BASE
= 0x20000,
1056 CJK_B_LIMIT
= 0x2A6DF+1;
1058 static UChar32
swapCJK(UChar32 i
) {
1060 if (i
>= CJK_BASE
) {
1061 if (i
< CJK_LIMIT
) return i
- CJK_BASE
;
1063 if (i
< CJK_COMPAT_USED_BASE
) return i
+ NON_CJK_OFFSET
;
1065 if (i
< CJK_COMPAT_USED_LIMIT
) return i
- CJK_COMPAT_USED_BASE
1066 + (CJK_LIMIT
- CJK_BASE
);
1067 if (i
< CJK_B_BASE
) return i
+ NON_CJK_OFFSET
;
1069 if (i
< CJK_B_LIMIT
) return i
; // non-BMP-CJK
1071 return i
+ NON_CJK_OFFSET
; // non-CJK
1073 if (i
< CJK_A_BASE
) return i
+ NON_CJK_OFFSET
;
1075 if (i
< CJK_A_LIMIT
) return i
- CJK_A_BASE
1076 + (CJK_LIMIT
- CJK_BASE
)
1077 + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
);
1078 return i
+ NON_CJK_OFFSET
; // non-CJK
1081 U_CAPI UChar32 U_EXPORT2
1082 uprv_uca_getRawFromCodePoint(UChar32 i
) {
1083 return swapCJK(i
)+1;
1086 U_CAPI UChar32 U_EXPORT2
1087 uprv_uca_getCodePointFromRaw(UChar32 i
) {
1090 if(i
>= NON_CJK_OFFSET
) {
1091 result
= i
- NON_CJK_OFFSET
;
1092 } else if(i
>= CJK_B_BASE
) {
1094 } else if(i
< CJK_A_LIMIT
+ (CJK_LIMIT
- CJK_BASE
) + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
)) { // rest of CJKs, compacted
1095 if(i
< CJK_LIMIT
- CJK_BASE
) {
1096 result
= i
+ CJK_BASE
;
1097 } else if(i
< (CJK_LIMIT
- CJK_BASE
) + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
)) {
1098 result
= i
+ CJK_COMPAT_USED_BASE
- (CJK_LIMIT
- CJK_BASE
);
1100 result
= i
+ CJK_A_BASE
- (CJK_LIMIT
- CJK_BASE
) - (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
);
1108 // GET IMPLICIT PRIMARY WEIGHTS
1109 // Return value is left justified primary key
1110 U_CAPI
uint32_t U_EXPORT2
1111 uprv_uca_getImplicitFromRaw(UChar32 cp
) {
1113 if (cp < 0 || cp > UCOL_MAX_INPUT) {
1114 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1117 int32_t last0
= cp
- min4Boundary
;
1119 int32_t last1
= cp
/ final3Count
;
1120 last0
= cp
% final3Count
;
1122 int32_t last2
= last1
/ medialCount
;
1123 last1
%= medialCount
;
1125 last0
= minTrail
+ last0
*final3Multiplier
; // spread out, leaving gap at start
1126 last1
= minTrail
+ last1
; // offset
1127 last2
= min3Primary
+ last2
; // offset
1129 if (last2 >= min4Primary) {
1130 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1133 return (last2
<< 24) + (last1
<< 16) + (last0
<< 8);
1135 int32_t last1
= last0
/ final4Count
;
1136 last0
%= final4Count
;
1138 int32_t last2
= last1
/ medialCount
;
1139 last1
%= medialCount
;
1141 int32_t last3
= last2
/ medialCount
;
1142 last2
%= medialCount
;
1144 last0
= minTrail
+ last0
*final4Multiplier
; // spread out, leaving gap at start
1145 last1
= minTrail
+ last1
; // offset
1146 last2
= minTrail
+ last2
; // offset
1147 last3
= min4Primary
+ last3
; // offset
1149 if (last3 > max4Primary) {
1150 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1153 return (last3
<< 24) + (last2
<< 16) + (last1
<< 8) + last0
;
1157 U_CAPI
uint32_t U_EXPORT2
1158 uprv_uca_getImplicitPrimary(UChar32 cp
) {
1159 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1163 // we now have a range of numbers from 0 to 21FFFF.
1165 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1167 return uprv_uca_getImplicitFromRaw(cp
);
1171 * Converts implicit CE into raw integer ("code point")
1173 * @return -1 if illegal format
1175 U_CAPI UChar32 U_EXPORT2
1176 uprv_uca_getRawFromImplicit(uint32_t implicit
) {
1178 UChar32 b3
= implicit
& 0xFF;
1180 UChar32 b2
= implicit
& 0xFF;
1182 UChar32 b1
= implicit
& 0xFF;
1184 UChar32 b0
= implicit
& 0xFF;
1186 // simple parameter checks
1187 if (b0
< min3Primary
|| b0
> max4Primary
1188 || b1
< minTrail
|| b1
> maxTrail
) return -1;
1192 // take care of the final values, and compose
1193 if (b0
< min4Primary
) {
1194 if (b2
< minTrail
|| b2
> max3Trail
|| b3
!= 0) return -1;
1196 UChar32 remainder
= b2
% final3Multiplier
;
1197 if (remainder
!= 0) return -1;
1199 b2
/= final3Multiplier
;
1200 result
= ((b0
* medialCount
) + b1
) * final3Count
+ b2
;
1202 if (b2
< minTrail
|| b2
> maxTrail
1203 || b3
< minTrail
|| b3
> max4Trail
) return -1;
1206 UChar32 remainder
= b3
% final4Multiplier
;
1207 if (remainder
!= 0) return -1;
1208 b3
/= final4Multiplier
;
1210 result
= (((b0
* medialCount
) + b1
) * medialCount
+ b2
) * final4Count
+ b3
+ min4Boundary
;
1213 if (result
< 0 || result
> UCOL_MAX_INPUT
) return -1;
1218 static inline int32_t divideAndRoundUp(int a
, int b
) {
1222 /* this function is either called from initUCA or from genUCA before
1223 * doing canonical closure for the UCA.
1227 * Set up to generate implicits.
1230 * @param minTrail final byte
1231 * @param maxTrail final byte
1232 * @param gap3 the gap we leave for tailoring for 3-byte forms
1233 * @param gap4 the gap we leave for tailoring for 4-byte forms
1235 static void initImplicitConstants(int minPrimary
, int maxPrimary
,
1236 int minTrailIn
, int maxTrailIn
,
1237 int gap3
, int primaries3count
,
1238 UErrorCode
*status
) {
1239 // some simple parameter checks
1240 if (minPrimary
< 0 || minPrimary
>= maxPrimary
|| maxPrimary
> 0xFF) {
1241 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1244 if (minTrailIn
< 0 || minTrailIn
>= maxTrailIn
|| maxTrailIn
> 0xFF) {
1245 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1248 if (primaries3count
< 1) {
1249 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1253 minTrail
= minTrailIn
;
1254 maxTrail
= maxTrailIn
;
1256 min3Primary
= minPrimary
;
1257 max4Primary
= maxPrimary
;
1258 // compute constants for use later.
1259 // number of values we can use in trailing bytes
1260 // leave room for empty values between AND above, e.g. if gap = 2
1261 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1262 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1263 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1264 final3Multiplier
= gap3
+ 1;
1265 final3Count
= (maxTrail
- minTrail
+ 1) / final3Multiplier
;
1266 max3Trail
= minTrail
+ (final3Count
- 1) * final3Multiplier
;
1268 // medials can use full range
1269 medialCount
= (maxTrail
- minTrail
+ 1);
1270 // find out how many values fit in each form
1271 int32_t threeByteCount
= medialCount
* final3Count
;
1272 // now determine where the 3/4 boundary is.
1273 // we use 3 bytes below the boundary, and 4 above
1274 int32_t primariesAvailable
= maxPrimary
- minPrimary
+ 1;
1275 int32_t primaries4count
= primariesAvailable
- primaries3count
;
1278 int32_t min3ByteCoverage
= primaries3count
* threeByteCount
;
1279 min4Primary
= minPrimary
+ primaries3count
;
1280 min4Boundary
= min3ByteCoverage
;
1281 // Now expand out the multiplier for the 4 bytes, and redo.
1283 int32_t totalNeeded
= UCOL_MAX_INPUT
- min4Boundary
;
1284 int32_t neededPerPrimaryByte
= divideAndRoundUp(totalNeeded
, primaries4count
);
1285 //if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
1286 int32_t neededPerFinalByte
= divideAndRoundUp(neededPerPrimaryByte
, medialCount
* medialCount
);
1287 //if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
1288 int32_t gap4
= (maxTrail
- minTrail
- 1) / neededPerFinalByte
;
1289 //if (DEBUG) System.out.println("expandedGap: " + gap4);
1291 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1294 final4Multiplier
= gap4
+ 1;
1295 final4Count
= neededPerFinalByte
;
1296 max4Trail
= minTrail
+ (final4Count
- 1) * final4Multiplier
;
1299 System.out.println("final4Count: " + final4Count);
1300 for (int counter = 0; counter <= final4Count; ++counter) {
1301 int value = minTrail + (1 + counter)*final4Multiplier;
1302 System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
1309 * Supply parameters for generating implicit CEs
1311 U_CAPI
void U_EXPORT2
1312 uprv_uca_initImplicitConstants(int32_t minPrimary
, int32_t maxPrimary
, UErrorCode
*status
) {
1313 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1314 initImplicitConstants(minPrimary
, maxPrimary
, 0x04, 0xFE, 1, 1, status
);
1318 static UBool U_CALLCONV
1322 udata_close(UCA_DATA_MEM
);
1323 UCA_DATA_MEM
= NULL
;
1326 ucol_close(_staticUCA
);
1329 fcdTrieIndex
= NULL
;
1334 /* do not close UCA returned by ucol_initUCA! */
1336 ucol_initUCA(UErrorCode
*status
) {
1337 if(U_FAILURE(*status
)) {
1341 UBool f
= (_staticUCA
== NULL
);
1345 UCollator
*newUCA
= NULL
;
1346 UDataMemory
*result
= udata_openChoice(NULL
, UCA_DATA_TYPE
, UCA_DATA_NAME
, isAcceptableUCA
, NULL
, status
);
1348 if(U_FAILURE(*status
)) {
1350 udata_close(result
);
1356 if (fcdTrieIndex
== NULL
) {
1357 fcdTrieIndex
= unorm_getFCDTrie(status
);
1358 ucln_i18n_registerCleanup(UCLN_I18N_UCOL
, ucol_cleanup
);
1361 if(result
!= NULL
) { /* It looks like sometimes we can fail to find the data file */
1362 newUCA
= ucol_initCollator((const UCATableHeader
*)udata_getMemory(result
), newUCA
, newUCA
, status
);
1363 if(U_SUCCESS(*status
)){
1365 newUCA
->elements
= NULL
;
1366 newUCA
->validLocale
= NULL
;
1367 newUCA
->requestedLocale
= NULL
;
1368 newUCA
->hasRealData
= FALSE
; // real data lives in .dat file...
1369 newUCA
->freeImageOnClose
= FALSE
;
1371 if(_staticUCA
== NULL
) {
1372 _staticUCA
= newUCA
;
1373 UCA_DATA_MEM
= result
;
1379 if(newUCA
!= NULL
) {
1380 udata_close(result
);
1384 ucln_i18n_registerCleanup(UCLN_I18N_UCOL
, ucol_cleanup
);
1386 // Initalize variables for implicit generation
1387 const UCAConstants
*UCAconsts
= (UCAConstants
*)((uint8_t *)_staticUCA
->image
+ _staticUCA
->image
->UCAConsts
);
1388 uprv_uca_initImplicitConstants(UCAconsts
->UCA_PRIMARY_IMPLICIT_MIN
, UCAconsts
->UCA_PRIMARY_IMPLICIT_MAX
, status
);
1389 _staticUCA
->mapping
->getFoldingOffset
= _getFoldingOffset
;
1391 udata_close(result
);
1401 /* collIterNormalize Incremental Normalization happens here. */
1402 /* pick up the range of chars identified by FCD, */
1403 /* normalize it into the collIterate's writable buffer, */
1404 /* switch the collIterate's state to use the writable buffer. */
1407 void collIterNormalize(collIterate
*collationSource
)
1409 UErrorCode status
= U_ZERO_ERROR
;
1412 UChar
*srcP
= collationSource
->pos
- 1; /* Start of chars to normalize */
1413 UChar
*endP
= collationSource
->fcdPosition
; /* End of region to normalize+1 */
1415 normLen
= unorm_decompose(collationSource
->writableBuffer
, (int32_t)collationSource
->writableBufSize
,
1416 srcP
, (int32_t)(endP
- srcP
),
1419 if(status
== U_BUFFER_OVERFLOW_ERROR
|| status
== U_STRING_NOT_TERMINATED_WARNING
) {
1420 // reallocate and terminate
1421 if(!u_growBufferFromStatic(collationSource
->stackWritableBuffer
,
1422 &collationSource
->writableBuffer
,
1423 (int32_t *)&collationSource
->writableBufSize
, normLen
+ 1,
1427 fprintf(stderr
, "collIterNormalize(), out of memory\n");
1431 status
= U_ZERO_ERROR
;
1432 normLen
= unorm_decompose(collationSource
->writableBuffer
, (int32_t)collationSource
->writableBufSize
,
1433 srcP
, (int32_t)(endP
- srcP
),
1437 if (U_FAILURE(status
)) {
1439 fprintf(stderr
, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status
));
1444 if(collationSource
->writableBuffer
!= collationSource
->stackWritableBuffer
) {
1445 collationSource
->flags
|= UCOL_ITER_ALLOCATED
;
1447 collationSource
->pos
= collationSource
->writableBuffer
;
1448 collationSource
->origFlags
= collationSource
->flags
;
1449 collationSource
->flags
|= UCOL_ITER_INNORMBUF
;
1450 collationSource
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
1454 // This function takes the iterator and extracts normalized stuff up to the next boundary
1455 // It is similar in the end results to the collIterNormalize, but for the cases when we
1458 inline void normalizeIterator(collIterate
*collationSource
) {
1459 UErrorCode status
= U_ZERO_ERROR
;
1460 UBool wasNormalized
= FALSE
;
1461 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1462 uint32_t iterIndex
= collationSource
->iterator
->getState(collationSource
->iterator
);
1463 int32_t normLen
= unorm_next(collationSource
->iterator
, collationSource
->writableBuffer
,
1464 (int32_t)collationSource
->writableBufSize
, UNORM_FCD
, 0, TRUE
, &wasNormalized
, &status
);
1465 if(status
== U_BUFFER_OVERFLOW_ERROR
|| normLen
== (int32_t)collationSource
->writableBufSize
) {
1466 // reallocate and terminate
1467 if(!u_growBufferFromStatic(collationSource
->stackWritableBuffer
,
1468 &collationSource
->writableBuffer
,
1469 (int32_t *)&collationSource
->writableBufSize
, normLen
+ 1,
1473 fprintf(stderr
, "normalizeIterator(), out of memory\n");
1477 status
= U_ZERO_ERROR
;
1478 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1479 collationSource
->iterator
->setState(collationSource
->iterator
, iterIndex
, &status
);
1480 normLen
= unorm_next(collationSource
->iterator
, collationSource
->writableBuffer
,
1481 (int32_t)collationSource
->writableBufSize
, UNORM_FCD
, 0, TRUE
, &wasNormalized
, &status
);
1483 // Terminate the buffer - we already checked that it is big enough
1484 collationSource
->writableBuffer
[normLen
] = 0;
1485 if(collationSource
->writableBuffer
!= collationSource
->stackWritableBuffer
) {
1486 collationSource
->flags
|= UCOL_ITER_ALLOCATED
;
1488 collationSource
->pos
= collationSource
->writableBuffer
;
1489 collationSource
->origFlags
= collationSource
->flags
;
1490 collationSource
->flags
|= UCOL_ITER_INNORMBUF
;
1491 collationSource
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
1495 /* Incremental FCD check and normalize */
1496 /* Called from getNextCE when normalization state is suspect. */
1497 /* When entering, the state is known to be this: */
1498 /* o We are working in the main buffer of the collIterate, not the side */
1499 /* writable buffer. When in the side buffer, normalization mode is always off, */
1500 /* so we won't get here. */
1501 /* o The leading combining class from the current character is 0 or */
1502 /* the trailing combining class of the previous char was zero. */
1503 /* True because the previous call to this function will have always exited */
1504 /* that way, and we get called for every char where cc might be non-zero. */
1506 inline UBool
collIterFCD(collIterate
*collationSource
) {
1508 const UChar
*srcP
, *endP
;
1510 uint8_t prevTrailingCC
= 0;
1512 UBool needNormalize
= FALSE
;
1514 srcP
= collationSource
->pos
-1;
1516 if (collationSource
->flags
& UCOL_ITER_HASLEN
) {
1517 endP
= collationSource
->endp
;
1522 // Get the trailing combining class of the current character. If it's zero,
1526 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1528 if (UTF_IS_FIRST_SURROGATE(c
)) {
1529 if ((endP
== NULL
|| srcP
!= endP
) && UTF_IS_SECOND_SURROGATE(c2
=*srcP
)) {
1531 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c2
);
1537 prevTrailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1539 if (prevTrailingCC
!= 0) {
1540 // The current char has a non-zero trailing CC. Scan forward until we find
1541 // a char with a leading cc of zero.
1542 while (endP
== NULL
|| srcP
!= endP
)
1544 const UChar
*savedSrcP
= srcP
;
1548 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1549 if (fcd
!= 0 && UTF_IS_FIRST_SURROGATE(c
)) {
1550 if ((endP
== NULL
|| srcP
!= endP
) && UTF_IS_SECOND_SURROGATE(c2
=*srcP
)) {
1552 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c2
);
1557 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1558 if (leadingCC
== 0) {
1559 srcP
= savedSrcP
; // Hit char that is not part of combining sequence.
1560 // back up over it. (Could be surrogate pair!)
1564 if (leadingCC
< prevTrailingCC
) {
1565 needNormalize
= TRUE
;
1568 prevTrailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1573 collationSource
->fcdPosition
= (UChar
*)srcP
;
1575 return needNormalize
;
1578 /****************************************************************************/
1579 /* Following are the CE retrieval functions */
1581 /****************************************************************************/
1583 /* there should be a macro version of this function in the header file */
1584 /* This is the first function that tries to fetch a collation element */
1585 /* If it's not successful or it encounters a more difficult situation */
1586 /* some more sophisticated and slower functions are invoked */
1588 inline uint32_t ucol_IGetNextCE(const UCollator
*coll
, collIterate
*collationSource
, UErrorCode
*status
) {
1590 if (collationSource
->CEpos
> collationSource
->toReturn
) { /* Are there any CEs from previous expansions? */
1591 order
= *(collationSource
->toReturn
++); /* if so, return them */
1592 if(collationSource
->CEpos
== collationSource
->toReturn
) {
1593 collationSource
->CEpos
= collationSource
->toReturn
= collationSource
->CEs
;
1600 for (;;) /* Loop handles case when incremental normalize switches */
1601 { /* to or from the side buffer / original string, and we */
1602 /* need to start again to get the next character. */
1604 if ((collationSource
->flags
& (UCOL_ITER_HASLEN
| UCOL_ITER_INNORMBUF
| UCOL_ITER_NORM
| UCOL_HIRAGANA_Q
| UCOL_USE_ITERATOR
)) == 0)
1606 // The source string is null terminated and we're not working from the side buffer,
1607 // and we're not normalizing. This is the fast path.
1608 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1609 ch
= *collationSource
->pos
++;
1614 return UCOL_NO_MORE_CES
;
1618 if (collationSource
->flags
& UCOL_ITER_HASLEN
) {
1619 // Normal path for strings when length is specified.
1620 // (We can't be in side buffer because it is always null terminated.)
1621 if (collationSource
->pos
>= collationSource
->endp
) {
1622 // Ran off of the end of the main source string. We're done.
1623 return UCOL_NO_MORE_CES
;
1625 ch
= *collationSource
->pos
++;
1627 else if(collationSource
->flags
& UCOL_USE_ITERATOR
) {
1628 UChar32 iterCh
= collationSource
->iterator
->next(collationSource
->iterator
);
1629 if(iterCh
== U_SENTINEL
) {
1630 return UCOL_NO_MORE_CES
;
1636 // Null terminated string.
1637 ch
= *collationSource
->pos
++;
1639 // Ran off end of buffer.
1640 if ((collationSource
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1641 // Ran off end of main string. backing up one character.
1642 collationSource
->pos
--;
1643 return UCOL_NO_MORE_CES
;
1647 // Hit null in the normalize side buffer.
1648 // Usually this means the end of the normalized data,
1649 // except for one odd case: a null followed by combining chars,
1650 // which is the case if we are at the start of the buffer.
1651 if (collationSource
->pos
== collationSource
->writableBuffer
+1) {
1655 // Null marked end of side buffer.
1656 // Revert to the main string and
1657 // loop back to top to try again to get a character.
1658 collationSource
->pos
= collationSource
->fcdPosition
;
1659 collationSource
->flags
= collationSource
->origFlags
;
1665 if(collationSource
->flags
&UCOL_HIRAGANA_Q
) {
1666 if((ch
>=0x3040 && ch
<=0x3094) || ch
== 0x309d || ch
== 0x309e) {
1667 collationSource
->flags
|= UCOL_WAS_HIRAGANA
;
1669 collationSource
->flags
&= ~UCOL_WAS_HIRAGANA
;
1673 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1674 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1675 if ((collationSource
->flags
& UCOL_ITER_NORM
) == 0) {
1679 if (collationSource
->fcdPosition
>= collationSource
->pos
) {
1680 // An earlier FCD check has already covered the current character.
1681 // We can go ahead and process this char.
1685 if (ch
< ZERO_CC_LIMIT_
) {
1686 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1690 if (ch
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1691 // We need to peek at the next character in order to tell if we are FCD
1692 if ((collationSource
->flags
& UCOL_ITER_HASLEN
) && collationSource
->pos
>= collationSource
->endp
) {
1693 // We are at the last char of source string.
1694 // It is always OK for FCD check.
1698 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1699 if (*collationSource
->pos
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1705 // Need a more complete FCD check and possible normalization.
1706 if (collIterFCD(collationSource
)) {
1707 collIterNormalize(collationSource
);
1709 if ((collationSource
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1710 // No normalization was needed. Go ahead and process the char we already had.
1714 // Some normalization happened. Next loop iteration will pick up a char
1715 // from the normalization buffer.
1721 /* For latin-1 characters we never need to fall back to the UCA table */
1722 /* because all of the UCA data is replicated in the latinOneMapping array */
1723 order
= coll
->latinOneMapping
[ch
];
1724 if (order
> UCOL_NOT_FOUND
) {
1725 order
= ucol_prv_getSpecialCE(coll
, ch
, order
, collationSource
, status
);
1730 order
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, ch
);
1731 if(order
> UCOL_NOT_FOUND
) { /* if a CE is special */
1732 order
= ucol_prv_getSpecialCE(coll
, ch
, order
, collationSource
, status
); /* and try to get the special CE */
1734 if(order
== UCOL_NOT_FOUND
&& coll
->UCA
) { /* We couldn't find a good CE in the tailoring */
1735 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1736 order
= UTRIE_GET32_FROM_LEAD(coll
->UCA
->mapping
, ch
);
1738 if(order
> UCOL_NOT_FOUND
) { /* UCA also gives us a special CE */
1739 order
= ucol_prv_getSpecialCE(coll
->UCA
, ch
, order
, collationSource
, status
);
1743 return order
; /* return the CE */
1746 /* ucol_getNextCE, out-of-line version for use from other files. */
1747 U_CAPI
uint32_t U_EXPORT2
1748 ucol_getNextCE(const UCollator
*coll
, collIterate
*collationSource
, UErrorCode
*status
) {
1749 return ucol_IGetNextCE(coll
, collationSource
, status
);
1754 * Incremental previous normalization happens here. Pick up the range of chars
1755 * identified by FCD, normalize it into the collIterate's writable buffer,
1756 * switch the collIterate's state to use the writable buffer.
1757 * @param data collation iterator data
1760 void collPrevIterNormalize(collIterate
*data
)
1762 UErrorCode status
= U_ZERO_ERROR
;
1763 UChar
*pEnd
= data
->pos
; /* End normalize + 1 */
1768 /* Start normalize */
1769 if (data
->fcdPosition
== NULL
) {
1770 pStart
= data
->string
;
1773 pStart
= data
->fcdPosition
+ 1;
1776 normLen
= unorm_normalize(pStart
, (pEnd
- pStart
) + 1, UNORM_NFD
, 0,
1777 data
->writableBuffer
, 0, &status
);
1779 if (data
->writableBufSize
<= normLen
) {
1780 freeHeapWritableBuffer(data
);
1781 data
->writableBuffer
= (UChar
*)uprv_malloc((normLen
+ 1) *
1783 if(data
->writableBuffer
== NULL
) { // something is wrong here, return
1786 data
->flags
|= UCOL_ITER_ALLOCATED
;
1787 /* to handle the zero termination */
1788 data
->writableBufSize
= normLen
+ 1;
1790 status
= U_ZERO_ERROR
;
1792 this puts the null termination infront of the normalized string instead
1795 pStartNorm
= data
->writableBuffer
+ (data
->writableBufSize
- normLen
);
1796 *(pStartNorm
- 1) = 0;
1797 unorm_normalize(pStart
, (pEnd
- pStart
) + 1, UNORM_NFD
, 0, pStartNorm
,
1800 data
->pos
= data
->writableBuffer
+ data
->writableBufSize
;
1801 data
->origFlags
= data
->flags
;
1802 data
->flags
|= UCOL_ITER_INNORMBUF
;
1803 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
1808 * Incremental FCD check for previous iteration and normalize. Called from
1809 * getPrevCE when normalization state is suspect.
1810 * When entering, the state is known to be this:
1811 * o We are working in the main buffer of the collIterate, not the side
1812 * writable buffer. When in the side buffer, normalization mode is always
1813 * off, so we won't get here.
1814 * o The leading combining class from the current character is 0 or the
1815 * trailing combining class of the previous char was zero.
1816 * True because the previous call to this function will have always exited
1817 * that way, and we get called for every char where cc might be non-zero.
1818 * @param data collation iterate struct
1819 * @return normalization status, TRUE for normalization to be done, FALSE
1823 inline UBool
collPrevIterFCD(collIterate
*data
)
1825 const UChar
*src
, *start
;
1828 uint8_t trailingCC
= 0;
1830 UBool result
= FALSE
;
1832 start
= data
->string
;
1833 src
= data
->pos
+ 1;
1835 /* Get the trailing combining class of the current character. */
1837 if (!UTF_IS_SURROGATE(c
)) {
1838 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1839 } else if (UTF_IS_SECOND_SURROGATE(c
) && start
< src
&& UTF_IS_FIRST_SURROGATE(c2
= *(src
- 1))) {
1841 fcd
= unorm_getFCD16(fcdTrieIndex
, c2
);
1843 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c
);
1845 } else /* unpaired surrogate */ {
1849 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1851 if (leadingCC
!= 0) {
1853 The current char has a non-zero leading combining class.
1854 Scan backward until we find a char with a trailing cc of zero.
1859 data
->fcdPosition
= NULL
;
1864 if (!UTF_IS_SURROGATE(c
)) {
1865 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1866 } else if (UTF_IS_SECOND_SURROGATE(c
) && start
< src
&& UTF_IS_FIRST_SURROGATE(c2
= *(src
- 1))) {
1868 fcd
= unorm_getFCD16(fcdTrieIndex
, c2
);
1870 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c
);
1872 } else /* unpaired surrogate */ {
1876 trailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1878 if (trailingCC
== 0) {
1882 if (leadingCC
< trailingCC
) {
1886 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1890 data
->fcdPosition
= (UChar
*)src
;
1895 /** gets a character from the string at a given offset
1896 * Handles both normal and iterative cases.
1897 * No error checking - caller beware!
1900 UChar
peekCharacter(collIterate
*source
, int32_t offset
) {
1901 if(source
->pos
!= NULL
) {
1902 return *(source
->pos
+ offset
);
1903 } else if(source
->iterator
!= NULL
) {
1905 source
->iterator
->move(source
->iterator
, offset
, UITER_CURRENT
);
1906 UChar toReturn
= (UChar
)source
->iterator
->next(source
->iterator
);
1907 source
->iterator
->move(source
->iterator
, -offset
-1, UITER_CURRENT
);
1910 return (UChar
)source
->iterator
->current(source
->iterator
);
1913 return (UChar
)U_SENTINEL
;
1918 * Determines if we are at the start of the data string in the backwards
1919 * collation iterator
1920 * @param data collation iterator
1921 * @return TRUE if we are at the start
1924 inline UBool
isAtStartPrevIterate(collIterate
*data
) {
1925 if(data
->pos
== NULL
&& data
->iterator
!= NULL
) {
1926 return !data
->iterator
->hasPrevious(data
->iterator
);
1928 //return (collIter_bos(data)) ||
1929 return (data
->pos
== data
->string
) ||
1930 ((data
->flags
& UCOL_ITER_INNORMBUF
) &&
1931 *(data
->pos
- 1) == 0 && data
->fcdPosition
== NULL
);
1935 inline void goBackOne(collIterate
*data
) {
1937 // somehow, it looks like we need to keep iterator synced up
1938 // at all times, as above.
1942 if(data
->iterator
) {
1943 data
->iterator
->previous(data
->iterator
);
1946 if(data
->iterator
&& (data
->flags
& UCOL_USE_ITERATOR
)) {
1947 data
->iterator
->previous(data
->iterator
);
1955 * Inline function that gets a simple CE.
1956 * So what it does is that it will first check the expansion buffer. If the
1957 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1958 * is different from the string pointer, we return the collation element at the
1959 * return pointer and decrement it.
1960 * For more complicated CEs it resorts to getComplicatedCE.
1961 * @param coll collator data
1962 * @param data collation iterator struct
1963 * @param status error status
1966 inline uint32_t ucol_IGetPrevCE(const UCollator
*coll
, collIterate
*data
,
1969 uint32_t result
= (uint32_t)UCOL_NULLORDER
;
1970 if (data
->toReturn
> data
->CEs
) {
1972 result
= *(data
->toReturn
);
1973 if (data
->CEs
== data
->toReturn
) {
1974 data
->CEpos
= data
->toReturn
;
1980 Loop handles case when incremental normalize switches to or from the
1981 side buffer / original string, and we need to start again to get the
1985 if (data
->flags
& UCOL_ITER_HASLEN
) {
1987 Normal path for strings when length is specified.
1988 Not in side buffer because it is always null terminated.
1990 if (data
->pos
<= data
->string
) {
1991 /* End of the main source string */
1992 return UCOL_NO_MORE_CES
;
1997 // we are using an iterator to go back. Pray for us!
1998 else if (data
->flags
& UCOL_USE_ITERATOR
) {
1999 UChar32 iterCh
= data
->iterator
->previous(data
->iterator
);
2000 if(iterCh
== U_SENTINEL
) {
2001 return UCOL_NO_MORE_CES
;
2009 /* we are in the side buffer. */
2012 At the start of the normalize side buffer.
2014 Because pointer points to the last accessed character,
2015 hence we have to increment it by one here.
2017 if (data
->fcdPosition
== NULL
) {
2018 data
->pos
= data
->string
;
2019 return UCOL_NO_MORE_CES
;
2022 data
->pos
= data
->fcdPosition
+ 1;
2024 data
->flags
= data
->origFlags
;
2029 if(data
->flags
&UCOL_HIRAGANA_Q
) {
2030 if(ch
>=0x3040 && ch
<=0x309f) {
2031 data
->flags
|= UCOL_WAS_HIRAGANA
;
2033 data
->flags
&= ~UCOL_WAS_HIRAGANA
;
2038 * got a character to determine if there's fcd and/or normalization
2040 * if the current character is not fcd.
2041 * if current character is at the start of the string
2042 * Trailing combining class == 0.
2043 * Note if pos is in the writablebuffer, norm is always 0
2045 if (ch
< ZERO_CC_LIMIT_
||
2046 // this should propel us out of the loop in the iterator case
2047 (data
->flags
& UCOL_ITER_NORM
) == 0 ||
2048 (data
->fcdPosition
!= NULL
&& data
->fcdPosition
<= data
->pos
)
2049 || data
->string
== data
->pos
) {
2053 if (ch
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
2054 /* if next character is FCD */
2055 if (data
->pos
== data
->string
) {
2056 /* First char of string is always OK for FCD check */
2060 /* Not first char of string, do the FCD fast test */
2061 if (*(data
->pos
- 1) < NFC_ZERO_CC_BLOCK_LIMIT_
) {
2066 /* Need a more complete FCD check and possible normalization. */
2067 if (collPrevIterFCD(data
)) {
2068 collPrevIterNormalize(data
);
2071 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
2072 /* No normalization. Go ahead and process the char. */
2077 Some normalization happened.
2078 Next loop picks up a char from the normalization buffer.
2082 /* attempt to handle contractions, after removal of the backwards
2085 if (ucol_contractionEndCP(ch
, coll
) && !isAtStartPrevIterate(data
)) {
2086 result
= ucol_prv_getSpecialPrevCE(coll
, ch
, UCOL_CONTRACTION
, data
, status
);
2089 // TODO: fix me for THAI - I reference *(data->pos-1)
2090 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0 &&
2091 /*UCOL_ISTHAIBASECONSONANT(ch) &&*/ // This is from the old specs - we now rearrange unconditionally
2092 // makes sure that we're not at the beggining of the string
2093 //data->pos > data->string &&
2094 !collIter_bos(data
) &&
2095 UCOL_ISTHAIPREVOWEL(peekCharacter(data
, -1)))
2096 //UCOL_ISTHAIPREVOWEL(*(data->pos -1)))
2098 collIterateState entryState
;
2099 backupState(data
, &entryState
);
2100 // we have to check if the previous character is also Thai
2101 // if not, we can just set the result
2103 if(collIter_bos(data
) || !UCOL_ISTHAIPREVOWEL(peekCharacter(data
, -1))) {
2104 loadState(data
, &entryState
, FALSE
);
2106 } else { // previous is also reordered
2107 // we need to go back as long as they are being reordered
2108 // count over the range of reorderable characters and see
2109 // if there is an even or odd number of them
2110 // if even, we should not reorder. If odd we should reorder.
2111 int32_t noReordered
= 1; // the one we already detected
2112 while(!collIter_bos(data
) && UCOL_ISTHAIPREVOWEL(peekCharacter(data
, -1))) {
2116 if(noReordered
& 1) { // odd number of reorderables
2119 result
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, ch
);
2121 loadState(data
, &entryState
, FALSE
);
2124 else if (ch
<= 0xFF) {
2125 result
= coll
->latinOneMapping
[ch
];
2126 //if (result > UCOL_NOT_FOUND) {
2127 //result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
2131 /*result = ucmpe32_get(coll->mapping, ch);*/
2132 result
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, ch
);
2134 if (result
> UCOL_NOT_FOUND
) {
2135 result
= ucol_prv_getSpecialPrevCE(coll
, ch
, result
, data
, status
);
2137 if (result
== UCOL_NOT_FOUND
) {
2138 if (!isAtStartPrevIterate(data
) &&
2139 ucol_contractionEndCP(ch
, data
->coll
)) {
2140 result
= UCOL_CONTRACTION
;
2143 /*result = ucmpe32_get(UCA->mapping, ch);*/
2145 result
= UTRIE_GET32_FROM_LEAD(coll
->UCA
->mapping
, ch
);
2149 if (result
> UCOL_NOT_FOUND
&& coll
->UCA
) {
2150 result
= ucol_prv_getSpecialPrevCE(coll
->UCA
, ch
, result
, data
, status
);
2159 /* ucol_getPrevCE, out-of-line version for use from other files. */
2160 U_CAPI
uint32_t U_EXPORT2
2161 ucol_getPrevCE(const UCollator
*coll
, collIterate
*data
,
2162 UErrorCode
*status
) {
2163 return ucol_IGetPrevCE(coll
, data
, status
);
2167 /* this should be connected to special Jamo handling */
// Initialises the collation iterator colIt over the one-unit text at &u
// (length argument 1) and reads the next collation element from it into
// order via ucol_IGetNextCE.
// NOTE(review): the declarations of colIt/order and the return statement are
// not visible in this excerpt; presumably order is the returned value.
2168 U_CAPI
uint32_t U_EXPORT2
2169 ucol_getFirstCE(const UCollator
*coll
, UChar u
, UErrorCode
*status
) {
2172 IInit_collIterate(coll
, &u
, 1, &colIt
);
2173 order
= ucol_IGetNextCE(coll
, &colIt
, status
);
2174 /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
2179 * Inserts the argument character into the end of the buffer pushing back the
2181 * @param data collIterate struct data
2182 * @param pNull pointer to the null termination
2183 * @param ch character to be appended
2184 * @return the position of the new addition
// Single-character overload. The visible code first checks whether
// data->writableBuffer still has room after pNull; otherwise it allocates a
// replacement buffer, copies the old contents into it, releases the old heap
// buffer and installs the new one in data.
2187 inline UChar
* insertBufferEnd(collIterate
*data
, UChar
*pNull
, UChar ch
)
2189 uint32_t size
= data
->writableBufSize
;
2191 const uint32_t incsize
= 5;
// Room check: the slot following pNull must still lie inside the buffer.
2193 if ((data
->writableBuffer
+ size
) > (pNull
+ 1)) {
2200 buffer will always be null terminated at the end.
2201 giving extra space since it is likely that more characters will be added.
// Growth path: the replacement buffer holds `size` code units.
2204 newbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) * size
);
2205 if(newbuffer
!= NULL
) { // allocation succeeded; a failure cannot be reported (no status parameter)
// Copy the whole old buffer (old writableBufSize units) to the new one.
2206 uprv_memcpy(newbuffer
, data
->writableBuffer
,
2207 data
->writableBufSize
* sizeof(UChar
));
2209 freeHeapWritableBuffer(data
);
2210 data
->writableBufSize
= size
;
2211 data
->writableBuffer
= newbuffer
;
// NOTE(review): writableBufSize has just been set to the NEW size, so
// newbuffer + writableBufSize is one past the end of the allocation made
// above and the terminator store below lands outside it (unless size is
// changed on a line not shown here). The insert position looks like it
// should be derived from pNull's offset in the old buffer -- confirm.
2213 newbuffer
= newbuffer
+ data
->writableBufSize
;
2215 *(newbuffer
+ 1) = 0;
2221 * Inserts the argument string into the end of the buffer pushing back the
2223 * @param data collIterate struct data
2224 * @param pNull pointer to the null termination
2225 * @param str string to be appended
2226 * @param length length of the string to be appended
2227 * @return the position of the new addition
// String overload. Appends `length` code units from str at pNull. If the
// current buffer cannot hold them plus a terminator, a new buffer of exactly
// size + length + 1 units is allocated and filled with the existing prefix
// followed by str, and it replaces data->writableBuffer.
2230 inline UChar
* insertBufferEnd(collIterate
*data
, UChar
*pNull
, UChar
*str
,
// size = number of code units currently stored in front of pNull.
2233 uint32_t size
= pNull
- data
->writableBuffer
;
// Fast path: enough room, copy in place and terminate right after the copy.
2236 if (data
->writableBuffer
+ data
->writableBufSize
> pNull
+ length
+ 1) {
2237 uprv_memcpy(pNull
, str
, length
* sizeof(UChar
));
2238 *(pNull
+ length
) = 0;
2243 buffer will always be null terminated at the end.
2244 giving extra space since it is likely that more characters will be added.
// NOTE(review): despite the remark about extra space, the allocation below
// is sized for exactly the old prefix, the new string and one terminator.
2246 newbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) * (size
+ length
+ 1));
2247 if(newbuffer
!= NULL
) {
// Old prefix first, then the appended string directly behind it.
2248 uprv_memcpy(newbuffer
, data
->writableBuffer
, size
* sizeof(UChar
));
2249 uprv_memcpy(newbuffer
+ size
, str
, length
* sizeof(UChar
));
2251 freeHeapWritableBuffer(data
);
2252 data
->writableBufSize
= size
+ length
+ 1;
2253 data
->writableBuffer
= newbuffer
;
2260 * Special normalization function for contraction in the forwards iterator.
2261 * This normalization sequence will place the current character at source->pos
2262 * and its following normalized sequence into the buffer.
2263 * The fcd position, pos will be changed.
2264 * pos will now point to positions in the buffer.
2265 * Flags will be changed accordingly.
2266 * @param data collation iterator data
// Normalizes (NFD) the source text from one unit before data->pos up to
// data->fcdPosition and places the result in data->writableBuffer behind the
// text already stored there, growing the buffer when needed. On exit
// data->pos points into the buffer and the flags mark the iterator as being
// in the normalization buffer.
2269 inline void normalizeNextContraction(collIterate
*data
)
2271 UChar
*buffer
= data
->writableBuffer
;
2272 uint32_t buffersize
= data
->writableBufSize
;
2274 UErrorCode status
= U_ZERO_ERROR
;
2275 /* because the pointer points to the next character */
2276 UChar
*pStart
= data
->pos
- 1;
// Not yet in the normalization buffer: seed the buffer's first slot with
// the unit that precedes pStart in the source text.
2281 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
2282 *data
->writableBuffer
= *(pStart
- 1);
// strsize = length of the zero-terminated text currently in the buffer.
2286 strsize
= u_strlen(data
->writableBuffer
);
2289 pEnd
= data
->fcdPosition
;
// First call passes a destination capacity of 0; its return value normLen
// is used below as the number of units the normalized text needs.
2291 normLen
= unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, buffer
, 0,
// Grow when existing text + normalized text + terminator do not fit.
2294 if (buffersize
<= normLen
+ strsize
) {
2295 uint32_t size
= strsize
+ normLen
+ 1;
2296 UChar
*temp
= (UChar
*)uprv_malloc(size
* sizeof(UChar
));
// Keep the strsize units that were already in the old buffer.
2298 uprv_memcpy(temp
, buffer
, sizeof(UChar
) * strsize
);
2299 freeHeapWritableBuffer(data
);
2300 data
->writableBuffer
= temp
;
2301 data
->writableBufSize
= size
;
2302 data
->flags
|= UCOL_ITER_ALLOCATED
;
// Reset the status left over from the capacity-0 call above.
2306 status
= U_ZERO_ERROR
;
2307 pStartNorm
= buffer
+ strsize
;
2308 /* null-termination will be added here */
2309 unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, pStartNorm
,
2310 normLen
+ 1, &status
);
// Continue reading from the start of the freshly normalized text; remember
// the previous flags in origFlags before switching to buffer mode.
2312 data
->pos
= data
->writableBuffer
+ strsize
;
2313 data
->origFlags
= data
->flags
;
2314 data
->flags
|= UCOL_ITER_INNORMBUF
;
2315 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
2319 * Contraction character management function that returns the next character
2320 * for the forwards iterator.
2321 * Does nothing if the next character is in buffer and not the first character
2323 * Else it checks next character in data string to see if it is normalizable.
2324 * If it is not, the character is simply copied into the buffer, else
2325 * the whole normalized substring is copied into the buffer, including the
2326 * current character.
2327 * @param data collation element iterator data
2328 * @return next character
// Returns the next code unit for forward contraction matching.
// Visible paths:
//  - no normalization requested and not in the normalization buffer: read
//    straight from the character iterator or from *pos;
//  - next unit already available (inside the buffer before its terminator,
//    or in the source string before fcdPosition): return *pos++;
//  - otherwise decide between an FCD check with normalization and appending
//    unnormalized text to the writable buffer through insertBufferEnd.
2331 inline UChar
getNextNormalizedChar(collIterate
*data
)
2335 // Here we need to add the iterator code. One problem is the way
2336 // end of string is handled. If we just return next char, it could
2337 // be the sentinel. Most of the cases already check for this, but we
2339 if ((data
->flags
& (UCOL_ITER_NORM
| UCOL_ITER_INNORMBUF
)) == 0 ) {
2340 /* if no normalization and not in buffer. */
2341 if(data
->flags
& UCOL_USE_ITERATOR
) {
2342 return (UChar
)data
->iterator
->next(data
->iterator
);
2344 return *(data
->pos
++);
2348 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2349 //normalizeIterator(data);
2352 UChar
*pEndWritableBuffer
= NULL
;
2353 UBool innormbuf
= (UBool
)(data
->flags
& UCOL_ITER_INNORMBUF
);
// Either still inside the buffer (current unit is not the 0 terminator), or
// reading the source string at a position before fcdPosition.
2354 if ((innormbuf
&& *data
->pos
!= 0) ||
2355 (data
->fcdPosition
!= NULL
&& !innormbuf
&&
2356 data
->pos
< data
->fcdPosition
)) {
2358 if next character is in normalized buffer, no further normalization
2361 return *(data
->pos
++);
2364 if (data
->flags
& UCOL_ITER_HASLEN
) {
2365 /* in data string */
// Last unit of a length-delimited string: nothing follows it to check.
2366 if (data
->pos
+ 1 == data
->endp
) {
2367 return *(data
->pos
++);
2372 // inside the normalization buffer, but at the end
2373 // (since we encountered zero). This means, in the
2374 // case we're using char iterator, that we need to
2375 // do another round of normalization.
2376 //if(data->origFlags & UCOL_USE_ITERATOR) {
2377 // we need to restore original flags,
2378 // otherwise, we'll lose them
2379 //data->flags = data->origFlags;
2380 //normalizeIterator(data);
2381 //return *(data->pos++);
2384 in writable buffer, at this point fcdPosition can not be
2385 pointing to the end of the data string. see contracting tag.
2387 if(data
->fcdPosition
) {
// The unit at fcdPosition is the last one of the source string (followed by
// a 0 or by endp): append it to the buffer and return it.
2388 if (*(data
->fcdPosition
+ 1) == 0 ||
2389 data
->fcdPosition
+ 1 == data
->endp
) {
2390 /* at the end of the string, dump it into the normalizer */
2391 data
->pos
= insertBufferEnd(data
, data
->pos
,
2392 *(data
->fcdPosition
)) + 1;
2393 return *(data
->fcdPosition
++);
// Remember where the buffer currently ends and continue reading the source
// string from fcdPosition.
2395 pEndWritableBuffer
= data
->pos
;
2396 data
->pos
= data
->fcdPosition
;
2397 } else if(data
->origFlags
& UCOL_USE_ITERATOR
) {
2398 // if we are here, we're using a normalizing iterator.
2399 // we should just continue further.
2400 data
->flags
= data
->origFlags
;
2402 return (UChar
)data
->iterator
->next(data
->iterator
);
// Unit after pos is the 0 terminator: return the current one as is.
2407 if (*(data
->pos
+ 1) == 0) {
2408 return *(data
->pos
++);
2414 nextch
= *data
->pos
;
2417 * if the current character is not fcd.
2418 * Trailing combining class == 0.
// A fuller FCD check is only attempted when fcdPosition is unset or behind
// pos and at least one of the two units is at or above
// NFC_ZERO_CC_BLOCK_LIMIT_.
2420 if ((data
->fcdPosition
== NULL
|| data
->fcdPosition
< data
->pos
) &&
2421 (nextch
>= NFC_ZERO_CC_BLOCK_LIMIT_
||
2422 ch
>= NFC_ZERO_CC_BLOCK_LIMIT_
)) {
2424 Need a more complete FCD check and possible normalization.
2425 normalize substring will be appended to buffer
2427 if (collIterFCD(data
)) {
2428 normalizeNextContraction(data
);
2429 return *(data
->pos
++);
2431 else if (innormbuf
) {
2432 /* fcdposition shifted even when there's no normalization, if we
2433 don't input the rest into this, we'll get the wrong position when
2434 we reach the end of the writableBuffer */
// Append the units from pos - 1 through fcdPosition to the buffer, using
// the string overload of insertBufferEnd.
2435 int32_t length
= data
->fcdPosition
- data
->pos
+ 1;
2436 data
->pos
= insertBufferEnd(data
, pEndWritableBuffer
,
2437 data
->pos
- 1, length
);
2438 return *(data
->pos
++);
2444 no normalization is to be done hence only one character will be
2445 appended to the buffer.
2447 data
->pos
= insertBufferEnd(data
, pEndWritableBuffer
, ch
) + 1;
2450 /* points back to the pos in string */
2457 * Function to copy the buffer into writableBuffer and sets the fcd position to
2458 * the correct position
2459 * @param source data string source
2460 * @param buffer character buffer
2461 * @param tempdb current position in buffer that has been used up
// Installs the text collected in `buffer` as the iterator's normalization
// buffer: the writable buffer is replaced when it is too small, `buffer` is
// copied into it and source->pos is set to its start. When the iterator was
// not yet in the normalization buffer, fcdPosition and origFlags record the
// position and flags to return to.
2464 inline void setDiscontiguosAttribute(collIterate
*source
, UChar
*buffer
,
2467 /* okay confusing part here. to ensure that the skipped characters are
2468 considered later, we need to place it in the appropriate position in the
2469 normalization buffer and reassign the pos pointer. simple case if pos
2470 reside in string, simply copy to normalization buffer and
2471 fcdposition = pos, pos = start of normalization buffer. if pos in
2472 normalization buffer, we'll insert the copy infront of pos and point pos
2473 to the start of the normalization buffer. why am i doing these copies?
2474 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2475 not require any changes, which would be really painful. */
2476 uint32_t length
= u_strlen(buffer
);;
2477 if (source
->flags
& UCOL_ITER_INNORMBUF
) {
// Already in the normalization buffer: the unread remainder at pos is
// copied to tempdb.
// NOTE(review): length was measured before this copy; if tempdb points into
// `buffer` (as in getDiscontiguous), length no longer covers the appended
// text when it is used for the size check below -- confirm.
2478 u_strcpy(tempdb
, source
->pos
);
2481 source
->fcdPosition
= source
->pos
;
2482 source
->origFlags
= source
->flags
;
2483 source
->flags
|= UCOL_ITER_INNORMBUF
;
2484 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
// Replace the writable buffer when `length` units plus terminator do not fit.
2487 if (length
>= source
->writableBufSize
) {
2488 freeHeapWritableBuffer(source
);
2489 source
->writableBuffer
=
2490 (UChar
*)uprv_malloc((length
+ 1) * sizeof(UChar
));
2491 if(source
->writableBuffer
== NULL
) {
// NOTE(review): length + 1 units were allocated but the recorded size is
// length -- confirm whether the terminator slot is meant to be excluded.
2494 source
->writableBufSize
= length
;
2497 u_strcpy(source
->writableBuffer
, buffer
);
2498 source
->pos
= source
->writableBuffer
;
2502 * Function to get the discontiguous collation element within the source.
2503 * Note this function will set the position to the appropriate places.
2504 * @param coll current collator used
2505 * @param source data string source
2506 * @param constart index to the start character in the contraction table
2507 * @return discontiguous collation element offset
// Tries to complete a discontiguous contraction: characters that do not
// match the contraction table are collected in a local buffer while later
// combining characters are matched against the table that starts at
// constart. The iterator state is saved on entry (discState) and restored
// with loadState when no match is found; in that case the CE stored for
// constart itself is returned.
2510 uint32_t getDiscontiguous(const UCollator
*coll
, collIterate
*source
,
2511 const UChar
*constart
)
2513 /* source->pos currently points to the second combining character after
2514 the start character */
2515 UChar
*temppos
= source
->pos
;
// Local scratch text for the skipped characters; tempdb is the write cursor.
2516 UChar buffer
[4*UCOL_MAX_BUFFER
];
2517 UChar
*tempdb
= buffer
;
2518 const UChar
*tempconstart
= constart
;
2519 uint8_t tempflags
= source
->flags
;
2520 UBool multicontraction
= FALSE
;
2521 UChar
*tempbufferpos
= 0;
2522 collIterateState discState
;
2524 backupState(source
, &discState
);
2526 //*tempdb = *(source->pos - 1);
// The scratch text starts with the character just before the current one.
2527 *tempdb
= peekCharacter(source
, -1);
// Stop conditions: end of a length-delimited string, a 0 unit that is a real
// end (see the sub-conditions), or a current character whose combining
// class is 0.
2535 if (((source
->flags
& UCOL_ITER_HASLEN
) && source
->pos
>= source
->endp
)
2536 || (peekCharacter(source
, 0) == 0 &&
2537 //|| (*source->pos == 0 &&
2538 ((source
->flags
& UCOL_ITER_INNORMBUF
) == 0 ||
2539 source
->fcdPosition
== NULL
||
2540 source
->fcdPosition
== source
->endp
||
2541 *(source
->fcdPosition
) == 0 ||
2542 u_getCombiningClass(*(source
->fcdPosition
)) == 0)) ||
2543 /* end of string in null terminated string or stopped by a
2544 null character, note fcd does not always point to a base
2545 character after the discontiguous change */
2546 u_getCombiningClass(peekCharacter(source
, 0)) == 0) {
2547 //u_getCombiningClass(*(source->pos)) == 0) {
2548 //constart = (UChar *)coll->image + getContractOffset(CE);
// A shorter contraction was remembered earlier: go back to the position
// saved in temppos, hand the skipped text to the iterator and return the CE
// stored for tempconstart.
2549 if (multicontraction
) {
2551 source
->pos
= temppos
- 1;
2552 setDiscontiguosAttribute(source
, buffer
, tempdb
);
2553 return *(coll
->contractionCEs
+
2554 (tempconstart
- coll
->contractionIndex
));
2556 constart
= tempconstart
;
2560 UCharOffset
= (UChar
*)(tempconstart
+ 1); /* skip the backward offset*/
2561 schar
= getNextNormalizedChar(source
);
// Advance through the table entries while the source unit is greater than
// the table unit.
2563 while (schar
> (tchar
= *UCharOffset
)) {
2567 if (schar
!= tchar
) {
2568 /* not the correct codepoint. we stuff the current codepoint into
2569 the discontiguous buffer and try the next character */
// Compares the combining class of schar with that of the character two
// positions back.
2575 if (u_getCombiningClass(schar
) ==
2576 u_getCombiningClass(peekCharacter(source
, -2))) {
2577 //u_getCombiningClass(*(source->pos - 2))) {
// CE stored in the contraction table for the entry UCharOffset points at.
2582 result
= *(coll
->contractionCEs
+
2583 (UCharOffset
- coll
->contractionIndex
));
2587 if (result
== UCOL_NOT_FOUND
) {
2589 } else if (isContraction(result
)) {
2590 /* this is a multi-contraction*/
2591 tempconstart
= (UChar
*)coll
->image
+ getContractOffset(result
);
// Only remember a fallback point when the contraction at constart has a CE
// of its own.
2592 if (*(coll
->contractionCEs
+ (constart
- coll
->contractionIndex
))
2593 != UCOL_NOT_FOUND
) {
2594 multicontraction
= TRUE
;
2595 temppos
= source
->pos
+ 1;
2596 tempbufferpos
= buffer
+ u_strlen(buffer
);
2599 setDiscontiguosAttribute(source
, buffer
, tempdb
);
2604 /* no problems simply reverting just like that,
2605 if we are in string before getting into this function, points back to
2606 string hence no problem.
2607 if we are in normalization buffer before getting into this function,
2608 since we'll never use another normalization within this function, we
2609 know that fcdposition points to a base character. the normalization buffer
2610 never change, hence this revert works. */
2611 loadState(source
, &discState
, TRUE
);
2614 //source->pos = temppos - 1;
2615 source
->flags
= tempflags
;
2616 return *(coll
->contractionCEs
+ (constart
- coll
->contractionIndex
));
// Tests cp against three ranges: values whose low 16 bits are 0xFFFE or
// 0xFFFF (the mask ignores higher bits, so this holds in every plane),
// U+FDD0..U+FDEF, and the surrogate range U+D800..U+DFFF.
// NOTE(review): the statements that produce the return value are not visible
// in this excerpt.
2620 inline UBool
isNonChar(UChar32 cp
) {
2621 if ((cp
& 0xFFFE) == 0xFFFE || (0xFDD0 <= cp
&& cp
<= 0xFDEF) || (0xD800 <= cp
&& cp
<= 0xDFFF)) {
2627 /* now uses Mark's getImplicitPrimary code */
// Builds the implicit CE pair for cp from the value r returned by
// uprv_uca_getImplicitPrimary: the low 16 bits of r are moved to the top
// half of a second CE that is appended at collationSource->CEpos (which is
// advanced), and the first CE -- r masked with UCOL_PRIMARYMASK, low half
// 0x0505 -- is returned.
// NOTE(review): 0xC0 in the appended CE is presumably the continuation
// marker -- confirm against UCOL_CONTINUATION_MARKER.
2629 inline uint32_t getImplicit(UChar32 cp
, collIterate
*collationSource
) {
2633 uint32_t r
= uprv_uca_getImplicitPrimary(cp
);
2634 *(collationSource
->CEpos
++) = ((r
& 0x0000FFFF)<<16) | 0x000000C0;
2635 return (r
& UCOL_PRIMARYMASK
) | 0x00000505; // This was 'order'
2639 * Inserts the argument character into the front of the buffer replacing the
2640 * front null terminator.
2641 * @param data collation element iterator data
2642 * @param pNull pointer to the null terminator
2643 * @param ch character to be appended
2644 * @return position of added character
// Front-insertion counterpart of insertBufferEnd: the buffer is filled from
// the back and pNull marks the terminator in front of the stored text. When
// there is no free slot in front of pNull, a replacement buffer is allocated
// and the old contents are copied into it starting incsize units from its
// beginning, which leaves the gap at the front.
2647 inline UChar
* insertBufferFront(collIterate
*data
, UChar
*pNull
, UChar ch
)
2649 uint32_t size
= data
->writableBufSize
;
2652 const uint32_t incsize
= 5;
// Room check: pNull must be more than one unit past the buffer start.
2654 if (pNull
> data
->writableBuffer
+ 1) {
2661 buffer will always be null terminated infront.
2662 giving extra space since it is likely that more characters will be added.
2665 newbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) * size
);
2666 if(newbuffer
== NULL
) {
// Old contents go behind a gap of incsize units.
2669 end
= newbuffer
+ incsize
;
2670 uprv_memcpy(end
, data
->writableBuffer
,
2671 data
->writableBufSize
* sizeof(UChar
));
2675 freeHeapWritableBuffer(data
);
2677 data
->writableBufSize
= size
;
2678 data
->writableBuffer
= newbuffer
;
2683 * Special normalization function for contraction in the previous iterator.
2684 * This normalization sequence will place the current character at source->pos
2685 * and its following normalized sequence into the buffer.
2686 * The fcd position, pos will be changed.
2687 * pos will now point to positions in the buffer.
2688 * Flags will be changed accordingly.
2689 * @param data collation iterator data
// Backward-iteration counterpart of normalizeNextContraction: normalizes
// (NFD) the source text between the previous FCD boundary and data->pos and
// stores it at the back of data->writableBuffer, in front of the text
// already there, with a 0 terminator placed before it. On exit data->pos
// points into the buffer and the flags mark the iterator as being in the
// normalization buffer.
2692 inline void normalizePrevContraction(collIterate
*data
)
2694 UChar
*buffer
= data
->writableBuffer
;
2695 uint32_t buffersize
= data
->writableBufSize
;
2696 uint32_t nulltermsize
;
2697 UErrorCode status
= U_ZERO_ERROR
;
2698 UChar
*pEnd
= data
->pos
+ 1; /* End normalize + 1 */
2703 if (data
->flags
& UCOL_ITER_HASLEN
) {
2705 normalization buffer not used yet, we'll pull down the next
2706 character into the end of the buffer
// Seed the last slot of the buffer with the unit after pos; the free area
// in front of it is buffersize - 1 units.
2708 *(buffer
+ (buffersize
- 1)) = *(data
->pos
+ 1);
2709 nulltermsize
= buffersize
- 1;
// Otherwise scan backwards from the end of the buffer for the 0 terminator
// that precedes the stored text.
2712 nulltermsize
= buffersize
;
2713 UChar
*temp
= buffer
+ (nulltermsize
- 1);
2714 while (*(temp
--) != 0) {
2719 /* Start normalize */
// Start of the range: the string start when no FCD position is recorded,
// else the unit after fcdPosition.
2720 if (data
->fcdPosition
== NULL
) {
2721 pStart
= data
->string
;
2724 pStart
= data
->fcdPosition
+ 1;
// First call passes a destination capacity of 0; its return value normLen
// is used below as the number of units the normalized text needs.
2727 normLen
= unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, buffer
, 0,
// Grow when the free area in front cannot hold normLen units plus the
// terminator.
2730 if (nulltermsize
<= normLen
) {
2731 uint32_t size
= buffersize
- nulltermsize
+ normLen
+ 1;
2732 UChar
*temp
= (UChar
*)uprv_malloc(size
* sizeof(UChar
));
// NOTE(review): nulltermsize is overwritten here, before it is used in the
// copy length below, so that length is buffersize - (normLen + 1) rather
// than the old tail length used to compute size; the copy also starts at
// the beginning of the old buffer -- confirm this is intended.
2734 nulltermsize
= normLen
+ 1;
2735 uprv_memcpy(temp
+ normLen
, buffer
,
2736 sizeof(UChar
) * (buffersize
- nulltermsize
));
2737 freeHeapWritableBuffer(data
);
2738 data
->writableBuffer
= temp
;
2739 data
->writableBufSize
= size
;
// Reset the status left over from the capacity-0 call above.
2743 status
= U_ZERO_ERROR
;
2745 this puts the null termination infront of the normalized string instead
2748 pStartNorm
= buffer
+ (nulltermsize
- normLen
);
2749 *(pStartNorm
- 1) = 0;
2750 unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, pStartNorm
, normLen
,
// Continue reading backwards from the end of the normalized text; remember
// the previous flags in origFlags before switching to buffer mode.
2753 data
->pos
= data
->writableBuffer
+ nulltermsize
;
2754 data
->origFlags
= data
->flags
;
2755 data
->flags
|= UCOL_ITER_INNORMBUF
;
2756 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
2760 * Contraction character management function that returns the previous character
2761 * for the backwards iterator.
2762 * Does nothing if the previous character is in buffer and not the first
2764 * Else it checks previous character in data string to see if it is
2766 * If it is not, the character is simply copied into the buffer, else
2767 * the whole normalized substring is copied into the buffer, including the
2768 * current character.
2769 * @param data collation element iterator data
2770 * @return previous character
// Returns the previous code unit for backward contraction matching.
// Visible paths:
//  - no normalization requested, or inside the normalization buffer with a
//    non-zero unit before pos: read that unit directly;
//  - otherwise decide between an FCD check with normalization
//    (collPrevIterFCD / normalizePrevContraction) and pushing a single
//    unnormalized unit to the front of the writable buffer.
2773 inline UChar
getPrevNormalizedChar(collIterate
*data
)
2778 UBool innormbuf
= (UBool
)(data
->flags
& UCOL_ITER_INNORMBUF
);
2779 UChar
*pNull
= NULL
;
2780 if ((data
->flags
& (UCOL_ITER_NORM
| UCOL_ITER_INNORMBUF
)) == 0 ||
2781 (innormbuf
&& *(data
->pos
- 1) != 0)) {
2783 if no normalization.
2784 if previous character is in normalized buffer, no further normalization
// Iterator case: step back one unit, then next() returns that unit and
// moves forward again, so the iterator position is unchanged overall.
2787 if(data
->flags
& UCOL_USE_ITERATOR
) {
2788 data
->iterator
->move(data
->iterator
, -1, UITER_CURRENT
);
2789 return (UChar
)data
->iterator
->next(data
->iterator
);
// Pointer case: read the unit before pos without moving pos.
2791 return *(data
->pos
- 1);
2796 if (data
->flags
& UCOL_ITER_HASLEN
) {
2797 /* in data string */
// The unit before start is the first unit of the string.
2798 if ((start
- 1) == data
->string
) {
2799 return *(start
- 1);
2803 prevch
= *(start
- 1);
2807 in writable buffer, at this point fcdPosition can not be NULL.
2808 see contracting tag.
2810 if (data
->fcdPosition
== data
->string
) {
2811 /* at the start of the string, just dump it into the normalizer */
2812 insertBufferFront(data
, data
->pos
- 1, *(data
->fcdPosition
));
2813 data
->fcdPosition
= NULL
;
2814 return *(data
->pos
- 1);
// Remember the terminator slot in front of the buffered text and continue
// from the recorded FCD position in the source string.
2816 pNull
= data
->pos
- 1;
2817 start
= data
->fcdPosition
;
2819 prevch
= *(start
- 1);
2822 * if the current character is not fcd.
2823 * Trailing combining class == 0.
// A fuller FCD check is only attempted when fcdPosition is beyond start and
// at least one of the two units is at or above NFC_ZERO_CC_BLOCK_LIMIT_.
2825 if (data
->fcdPosition
> start
&&
2826 (ch
>= NFC_ZERO_CC_BLOCK_LIMIT_
|| prevch
>= NFC_ZERO_CC_BLOCK_LIMIT_
))
2829 Need a more complete FCD check and possible normalization.
2830 normalize substring will be appended to buffer
2832 UChar
*backuppos
= data
->pos
;
2834 if (collPrevIterFCD(data
)) {
2835 normalizePrevContraction(data
);
2836 return *(data
->pos
- 1);
// Check passed without normalization: restore pos from the backup and move
// fcdPosition forward by one.
2838 data
->pos
= backuppos
;
2839 data
->fcdPosition
++;
2844 no normalization is to be done hence only one character will be
2845 appended to the buffer.
2847 insertBufferFront(data
, pNull
, ch
);
2848 data
->fcdPosition
--;
2854 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2855 /* It is called by getNextCE */
2857 uint32_t ucol_prv_getSpecialCE(const UCollator
*coll
, UChar ch
, uint32_t CE
, collIterate
*source
, UErrorCode
*status
) {
2858 collIterateState entryState
;
2859 backupState(source
, &entryState
);
2863 // This loop will repeat only in the case of contractions, and only when a contraction
2864 // is found and the first CE resulting from that contraction is itself a special
2865 // (an expansion, for example.) All other special CE types are fully handled the
2866 // first time through, and the loop exits.
2868 const uint32_t *CEOffset
= NULL
;
2869 switch(getCETag(CE
)) {
2871 /* This one is not found, and we'll let somebody else bother about it... no more games */
2874 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
2875 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
2876 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
2877 /* we return 0 (completely ignorable - per UCA specification) */
2880 collIterateState state
;
2881 backupState(source
, &state
);
2882 if (collIter_eos(source
) || !(UTF16_IS_TRAIL((trail
= getNextNormalizedChar(source
))))) {
2883 // we could have stepped one char forward and it might have turned out that it
2884 // was not a trail surrogate. In that case, we have to backup.
2885 loadState(source
, &state
, TRUE
);
2888 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
2889 CE
= UTRIE_GET32_FROM_OFFSET_TRAIL(coll
->mapping
, CE
&0xFFFFFF, trail
);
2890 if(CE
== UCOL_NOT_FOUND
) { // there are tailored surrogates in this block, but not this one.
2891 // We need to backup
2892 loadState(source
, &state
, TRUE
);
2895 // calculate the supplementary code point value, if surrogate was not tailored
2896 cp
= ((((uint32_t)ch
)<<10UL)+(trail
)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
2901 /* Thai/Lao reordering */
2902 if (((source
->flags
) & UCOL_ITER_INNORMBUF
) /* Already Swapped || */
2903 || collIter_eos(source
)) /* At end of string. No swap possible */
2905 // Treat Thai as a length one expansion */
2906 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
2911 // Move the prevowel and the following base Consonant into the normalization buffer
2912 // with their order swapped
2913 // Note: this operation might activate the normalization buffer. We have to check for
2914 // that and act accordingly.
2915 UChar thCh
= getNextNormalizedChar(source
);
2917 if(U16_IS_LEAD(thCh
)) {
2918 if(!collIter_eos(source
)) {
2919 collIterateState thaiState
;
2920 backupState(source
, &thaiState
);
2921 UChar trailCh
= getNextNormalizedChar(source
);
2922 if(U16_IS_TRAIL(trailCh
)) {
2923 cp
= U16_GET_SUPPLEMENTARY(thCh
, trailCh
);
2925 loadState(source
, &thaiState
, TRUE
);
2934 // Now we have the character that needs to be decomposed
2935 // if the normalizing buffer was not used, we can just use our structure and be happy.
2936 if((source
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
2937 // decompose into writable buffer
2938 int32_t decompLen
= unorm_getDecomposition(cp
, FALSE
, &(source
->writableBuffer
[1]), UCOL_WRITABLE_BUFFER_SIZE
-1);
2940 decompLen
= -decompLen
;
2942 // reorder Thai and the character after it
2943 if(decompLen
>= 2 && U16_IS_LEAD(source
->writableBuffer
[1]) && U16_IS_TRAIL(source
->writableBuffer
[2])) {
2944 source
->writableBuffer
[0] = source
->writableBuffer
[1];
2945 source
->writableBuffer
[1] = source
->writableBuffer
[2];
2946 source
->writableBuffer
[2] = ch
;
2948 source
->writableBuffer
[0] = source
->writableBuffer
[1];
2949 source
->writableBuffer
[1] = ch
;
2951 // zero terminate, since normalization buffer is always zero terminated
2952 source
->writableBuffer
[decompLen
+1] = 0; // we added the prevowel
2954 source
->fcdPosition
= source
->pos
; // Indicate where to continue in main input string
2955 // after exhausting the writableBuffer
2957 source
->pos
= source
->writableBuffer
;
2958 source
->origFlags
= source
->flags
;
2959 source
->flags
|= UCOL_ITER_INNORMBUF
;
2960 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
2963 // stuff is already normalized... what to do here???
2965 // if we are in the normalization buffer, thCh must be in it
2966 // prove by contradiction
2967 // if thCh is not in the normalization buffer,
2968 // that means that trailCh is the normalization buffer
2969 // that means that trailCh is a trail surrogate by the above
2970 // bounding if block, this is a contradiction because there
2971 // are no characters at the moment that decomposes to an
2972 // unmatched surrogate. qed.
2973 if (cp
>= 0x10000) {
2974 source
->writableBuffer
[0] = source
->writableBuffer
[1];
2975 source
->writableBuffer
[1] = source
->writableBuffer
[2];
2976 source
->writableBuffer
[2] = ch
;
2979 source
->writableBuffer
[0] = source
->writableBuffer
[1];
2980 source
->writableBuffer
[1] = ch
;
2982 source
->pos
= source
->writableBuffer
;
2984 CE
= UCOL_IGNORABLE
;
2989 // Special processing is getting a CE that is preceded by a certain prefix
2990 // Currently this is only needed for optimizing Japanese length and iteration marks.
2991 // When we encounter a special processing tag, we go backwards and try to see if
2993 // Contraction tables are used - so the whole process is not unlike contraction.
2994 // prefix data is stored backwards in the table.
2995 const UChar
*UCharOffset
;
2997 collIterateState prefixState
;
2998 backupState(source
, &prefixState
);
2999 loadState(source
, &entryState
, TRUE
);
3000 goBackOne(source
); // We want to look at the point where we entered - actually one
3004 // This loop will run once per source string character, for as long as we
3005 // are matching a potential contraction sequence
3007 // First we position ourselves at the beginning of the contraction sequence
3008 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
3009 if (collIter_bos(source
)) {
3010 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
3013 schar
= getPrevNormalizedChar(source
);
3016 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3020 if (schar
== tchar
) {
3021 // Found the source string char in the table.
3022 // Pick up the corresponding CE from the table.
3023 CE
= *(coll
->contractionCEs
+
3024 (UCharOffset
- coll
->contractionIndex
));
3028 // if there is a completely ignorable code point in the middle of
3029 // a prefix, we need to act as if it's not there
3030 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3031 // lone surrogates cannot be set to zero as it would break other processing
3032 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, schar
);
3033 // it's easy for BMP code points
3036 } else if(UTF_IS_TRAIL(schar
) || UTF_IS_LEAD(schar
)) {
3037 // for supplementary code points, we have to check the next one
3038 // situations where we are going to ignore
3039 // 1. beginning of the string: schar is a lone surrogate
3040 // 2. schar is a lone surrogate
3041 // 3. schar is a trail surrogate in a valid surrogate sequence
3042 // that is explicitly set to zero.
3043 if (!collIter_bos(source
)) {
3045 if(UTF_IS_LEAD(lead
= getPrevNormalizedChar(source
))) {
3046 isZeroCE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, lead
);
3047 if(getCETag(isZeroCE
) == SURROGATE_TAG
) {
3048 uint32_t finalCE
= UTRIE_GET32_FROM_OFFSET_TRAIL(coll
->mapping
, isZeroCE
&0xFFFFFF, schar
);
3050 // this is a real, assigned completely ignorable code point
3056 // lone surrogate, completely ignorable
3060 // lone surrogate at the beginning, completely ignorable
3064 // Source string char was not in the table.
3065 // We have not found the prefix.
3066 CE
= *(coll
->contractionCEs
+
3067 (ContractionStart
- coll
->contractionIndex
));
3071 // The source string char was in the contraction table, and the corresponding
3072 // CE is not a prefix CE. We found the prefix, break
3073 // out of loop, this CE will end up being returned. This is the normal
3074 // way out of prefix handling when the source actually contained
3079 if(CE
!= UCOL_NOT_FOUND
) { // we found something and we can merilly continue
3080 loadState(source
, &prefixState
, TRUE
);
3081 if(source
->origFlags
& UCOL_USE_ITERATOR
) {
3082 source
->flags
= source
->origFlags
;
3084 } else { // prefix search was a failure, we have to backup all the way to the start
3085 loadState(source
, &entryState
, TRUE
);
3089 case CONTRACTION_TAG
:
3091 /* This should handle contractions */
3092 collIterateState state
;
3093 backupState(source
, &state
);
3094 uint32_t firstCE
= UCOL_NOT_FOUND
;
3095 const UChar
*UCharOffset
;
3099 /* This loop will run once per source string character, for as long as we */
3100 /* are matching a potential contraction sequence */
3102 /* First we position ourselves at the beginning of the contraction sequence */
3103 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
3105 if (collIter_eos(source
)) {
3106 // Ran off the end of the source string.
3107 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
3108 // So we'll pick whatever we have at the point...
3109 if (CE
== UCOL_NOT_FOUND
) {
3110 // back up the source over all the chars we scanned going into this contraction.
3112 loadState(source
, &state
, TRUE
);
3113 if(source
->origFlags
& UCOL_USE_ITERATOR
) {
3114 source
->flags
= source
->origFlags
;
3120 uint8_t maxCC
= (uint8_t)(*(UCharOffset
)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
3121 uint8_t allSame
= (uint8_t)(*(UCharOffset
++)>>8);
3123 schar
= getNextNormalizedChar(source
);
3124 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3128 if (schar
== tchar
) {
3129 // Found the source string char in the contraction table.
3130 // Pick up the corresponding CE from the table.
3131 CE
= *(coll
->contractionCEs
+
3132 (UCharOffset
- coll
->contractionIndex
));
3136 // if there is a completely ignorable code point in the middle of
3137 // contraction, we need to act as if it's not there
3138 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, schar
);
3139 // it's easy for BMP code points
3142 } else if(UTF_IS_LEAD(schar
)) {
3143 if(!collIter_eos(source
)) {
3144 backupState(source
, &state
);
3145 UChar trail
= getNextNormalizedChar(source
);
3146 if(UTF_IS_TRAIL(trail
)) { // do stuff with trail
3147 if(getCETag(isZeroCE
) == SURROGATE_TAG
) {
3148 uint32_t finalCE
= UTRIE_GET32_FROM_OFFSET_TRAIL(coll
->mapping
, isZeroCE
&0xFFFFFF, trail
);
3154 // broken surrogate sequence, thus completely ignorable
3155 loadState(source
, &state
, TRUE
);
3158 loadState(source
, &state
, TRUE
);
3159 } else { // no more characters, so broken surrogate pair...
3160 // this contraction will ultimately fail, but not because of us
3163 } // else if(UTF_IS_LEAD(schar))
3165 // Source string char was not in contraction table.
3166 // Unless we have a discontiguous contraction, we have finished
3167 // with this contraction.
3169 if (schar
< 0x300 ||
3171 (sCC
= i_getCombiningClass(schar
, coll
)) == 0 ||
3173 (allSame
!= 0 && sCC
== maxCC
) ||
3174 collIter_eos(source
)) {
3175 // Contraction can not be discontiguous.
3176 goBackOne(source
); // back up the source string by one,
3177 // because the character we just looked at was
3178 // not part of the contraction. */
3179 CE
= *(coll
->contractionCEs
+
3180 (ContractionStart
- coll
->contractionIndex
));
3183 // Contraction is possibly discontiguous.
3184 // Scan more of source string looking for a match
3187 /* find the next character if schar is not a base character
3188 and we are not yet at the end of the string */
3189 tempchar
= getNextNormalizedChar(source
);
3191 if (i_getCombiningClass(tempchar
, coll
) == 0) {
3193 /* Spit out the last char of the string, wasn't tasty enough */
3194 CE
= *(coll
->contractionCEs
+
3195 (ContractionStart
- coll
->contractionIndex
));
3197 CE
= getDiscontiguous(coll
, source
, ContractionStart
);
3200 } // else after if(schar == tchar)
3202 if(CE
== UCOL_NOT_FOUND
) {
3203 /* The Source string did not match the contraction that we were checking. */
3204 /* Back up the source position to undo the effects of having partially */
3205 /* scanned through what ultimately proved to not be a contraction. */
3206 loadState(source
, &state
, TRUE
);
3211 if(!isContraction(CE
)) {
3212 // The source string char was in the contraction table, and the corresponding
3213 // CE is not a contraction CE. We completed the contraction, break
3214 // out of loop, this CE will end up being returned. This is the normal
3215 // way out of contraction handling when the source actually contained
3221 // The source string char was in the contraction table, and the corresponding
3222 // CE IS a contraction CE. We will continue looping to check the source
3223 // string for the remaining chars in the contraction.
3224 uint32_t tempCE
= *(coll
->contractionCEs
+ (ContractionStart
- coll
->contractionIndex
));
3225 if(tempCE
!= UCOL_NOT_FOUND
) {
3226 // We have scanned a section of source string for which there is a
3227 // CE from the contraction table. Remember the CE and scan position, so
3228 // that we can return to this point if further scanning fails to
3229 // match a longer contraction sequence.
3233 backupState(source
, &state
);
3234 getNextNormalizedChar(source
);
3236 // Another way to do this is:
3237 //collIterateState tempState;
3238 //backupState(source, &tempState);
3239 //goBackOne(source);
3240 //backupState(source, &state);
3241 //loadState(source, &tempState, TRUE);
3243 // The problem is that for incomplete contractions we have to remember the previous
3244 // position. Before, the only thing I needed to do was state.pos--;
3245 // After iterator introduction and especially after introduction of normalizing
3246 // iterators, it became much more difficult to decrease the saved state.
3247 // I'm not yet sure which of the two methods above is faster.
3251 } // case CONTRACTION_TAG:
3252 case LONG_PRIMARY_TAG
:
3254 *(source
->CEpos
++) = ((CE
& 0xFF)<<24)|UCOL_CONTINUATION_MARKER
;
3255 CE
= ((CE
& 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON
;
3260 /* This should handle expansion. */
3261 /* NOTE: we can encounter both continuations and expansions in an expansion! */
3262 /* I have to decide where continuations are going to be dealt with */
3264 uint32_t i
; /* general counter */
3265 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
3266 size
= getExpansionCount(CE
);
3268 if(size
!= 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3269 for(i
= 1; i
<size
; i
++) {
3270 *(source
->CEpos
++) = *CEOffset
++;
3272 } else { /* else, we do */
3273 while(*CEOffset
!= 0) {
3274 *(source
->CEpos
++) = *CEOffset
++;
3282 We do a check to see if we want to collate digits as numbers; if so we generate
3283 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3286 uint32_t i
; /* general counter */
3287 collIterateState digitState
;
3289 if (source
->coll
->numericCollation
== UCOL_ON
){
3292 uint32_t digIndx
= 0;
3293 uint32_t endIndex
= 0;
3294 uint32_t trailingZeroIndex
= 0;
3296 uint32_t primWeight
= 0;
3299 uint8_t collateVal
= 0;
3301 UBool nonZeroValReached
= FALSE
;
3303 uint8_t *numTempBuf
;
3304 uint8_t stackNumTempBuf
[UCOL_MAX_BUFFER
]; // I just need a temporary place to store my generated CEs.
3305 uint32_t numTempBufSize
= UCOL_MAX_BUFFER
;
3307 numTempBuf
= stackNumTempBuf
;
3309 We parse the source string until we hit a char that's NOT a digit.
3310 Use this u_charDigitValue. This might be slow because we have to
3311 handle surrogates...
3314 if (U16_IS_LEAD(ch)){
3315 if (!collIter_eos(source)) {
3316 backupState(source, &digitState);
3317 UChar trail = getNextNormalizedChar(source);
3318 if(U16_IS_TRAIL(trail)) {
3319 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3321 loadState(source, &digitState, TRUE);
3330 digVal = u_charDigitValue(char32);
3332 digVal
= u_charDigitValue(cp
); // if we have arrived here, we have
3333 // already processed possible supplementaries that triggered the digit tag -
3334 // all supplementaries are marked in the UCA.
3336 We pad a zero in front of the first element anyways. This takes
3337 care of the (probably) most common case where people are sorting things followed
3342 // Make sure we have enough space.
3343 if (digIndx
>= ((numTempBufSize
- 2) * 2) + 1)
3345 numTempBufSize
*= 2;
3346 if (numTempBuf
== stackNumTempBuf
){
3347 numTempBuf
= (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize
);
3348 uprv_memcpy(numTempBuf
, stackNumTempBuf
, UCOL_MAX_BUFFER
);
3350 uprv_realloc(numTempBuf
, numTempBufSize
);
3353 // Skipping over leading zeroes.
3354 if (digVal
!= 0 || nonZeroValReached
){
3355 if (digVal
!= 0 && !nonZeroValReached
)
3356 nonZeroValReached
= TRUE
;
3359 We parse the digit string into base 100 numbers (this fits into a byte).
3360 We only add to the buffer in twos, thus if we are parsing an odd character,
3361 that serves as the 'tens' digit while the if we are parsing an even one, that
3362 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3363 a buffer. We multiply each collateVal by 2 (to give us room) and add 6 (to avoid
3364 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3365 than all the other bytes.
3368 if (digIndx
% 2 == 1){
3369 collateVal
+= (uint8_t)digVal
;
3371 // We don't enter the low-order-digit case unless we've already seen
3372 // the high order, or for the first digit, which is always non-zero.
3373 if (collateVal
!= 0)
3374 trailingZeroIndex
= 0;
3376 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3380 // We drop the collation value into the buffer so if we need to do
3381 // a "front patch" we don't have to check to see if we're hitting the
3383 collateVal
= (uint8_t)(digVal
* 10);
3385 // Check for trailing zeroes.
3386 if (collateVal
== 0)
3388 if (!trailingZeroIndex
)
3389 trailingZeroIndex
= (digIndx
/2) + 2;
3392 trailingZeroIndex
= 0;
3394 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3399 // Get next character.
3400 if (!collIter_eos(source
)){
3401 ch
= getNextNormalizedChar(source
);
3402 if (U16_IS_LEAD(ch
)){
3403 if (!collIter_eos(source
)) {
3404 backupState(source
, &digitState
);
3405 UChar trail
= getNextNormalizedChar(source
);
3406 if(U16_IS_TRAIL(trail
)) {
3407 char32
= U16_GET_SUPPLEMENTARY(ch
, trail
);
3409 loadState(source
, &digitState
, TRUE
);
3417 if ((digVal
= u_charDigitValue(char32
)) == -1){
3418 // Resetting position to point to the next unprocessed char. We
3419 // overshot it when doing our test/set for numbers.
3420 if (char32
> 0xFFFF) { // For surrogates.
3421 loadState(source
, &digitState
, TRUE
);
3422 //goBackOne(source);
3432 if (nonZeroValReached
== FALSE
){
3437 endIndex
= trailingZeroIndex
? trailingZeroIndex
: ((digIndx
/2) + 2) ;
3438 if (digIndx
% 2 != 0){
3440 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3441 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3442 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3443 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3446 for(i
= 2; i
< endIndex
; i
++){
3447 numTempBuf
[i
] = (((((numTempBuf
[i
] - 6)/2) % 10) * 10) +
3448 (((numTempBuf
[i
+1])-6)/2) / 10) * 2 + 6;
3453 // Subtract one off of the last byte.
3454 numTempBuf
[endIndex
-1] -= 1;
3457 We want to skip over the first two slots in the buffer. The first slot
3458 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3459 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3461 numTempBuf
[0] = UCOL_CODAN_PLACEHOLDER
;
3462 numTempBuf
[1] = (uint8_t)(0x80 + ((digIndx
/2) & 0x7F));
3464 // Now transfer the collation key to our collIterate struct.
3465 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3466 size
= ((endIndex
+1) & ~1)/2;
3467 CE
= (((numTempBuf
[0] << 8) | numTempBuf
[1]) << UCOL_PRIMARYORDERSHIFT
) | //Primary weight
3468 (UCOL_BYTE_COMMON
<< UCOL_SECONDARYORDERSHIFT
) | // Secondary weight
3469 UCOL_BYTE_COMMON
; // Tertiary weight.
3470 i
= 2; // Reset the index into the buffer.
3473 primWeight
= numTempBuf
[i
++] << 8;
3475 primWeight
|= numTempBuf
[i
++];
3476 *(source
->CEpos
++) = (primWeight
<< UCOL_PRIMARYORDERSHIFT
) | UCOL_CONTINUATION_MARKER
;
3479 if (numTempBuf
!= stackNumTempBuf
)
3480 uprv_free(numTempBuf
);
3482 // no numeric mode, we'll just switch to whatever we stashed and continue
3483 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
3487 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
3488 size
= getExpansionCount(CE
);
3490 if(size
!= 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3491 for(i
= 1; i
<size
; i
++) {
3492 *(source
->CEpos
++) = *CEOffset
++;
3494 } else { /* else, we do */
3495 while(*CEOffset
!= 0) {
3496 *(source
->CEpos
++) = *CEOffset
++;
3503 /* various implicits optimization */
3504 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3505 case CJK_IMPLICIT_TAG
: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3506 //return getImplicit(cp, source, 0x04000000);
3507 return getImplicit(cp
, source
);
3508 case IMPLICIT_TAG
: /* everything that is not defined otherwise */
3509 /* UCA is filled with these. Tailorings are NOT_FOUND */
3510 //return getImplicit(cp, source, 0);
3511 return getImplicit(cp
, source
);
3512 case TRAIL_SURROGATE_TAG
: /* DC00-DFFF*/
3513 return 0; /* broken surrogate sequence */
3514 case LEAD_SURROGATE_TAG
: /* D800-DBFF*/
3516 if( source
->flags
& UCOL_USE_ITERATOR
) {
3517 if(U_IS_TRAIL(nextChar
= (UChar
)source
->iterator
->current(source
->iterator
))) {
3518 cp
= U16_GET_SUPPLEMENTARY(ch
, nextChar
);
3519 source
->iterator
->next(source
->iterator
);
3520 return getImplicit(cp
, source
);
3524 } else if((((source
->flags
& UCOL_ITER_HASLEN
) == 0 ) || (source
->pos
<source
->endp
)) &&
3525 U_IS_TRAIL((nextChar
=*source
->pos
))) {
3526 cp
= U16_GET_SUPPLEMENTARY(ch
, nextChar
);
3528 return getImplicit(cp
, source
);
3530 return 0; /* completely ignorable */
3532 case HANGUL_SYLLABLE_TAG
: /* AC00-D7AF*/
3535 SBase
= 0xAC00, LBase
= 0x1100, VBase
= 0x1161, TBase
= 0x11A7;
3536 //const uint32_t LCount = 19;
3537 const uint32_t VCount
= 21;
3538 const uint32_t TCount
= 28;
3539 //const uint32_t NCount = VCount * TCount; // 588
3540 //const uint32_t SCount = LCount * NCount; // 11172
3541 uint32_t L
= ch
- SBase
;
3543 // divide into pieces
3545 uint32_t T
= L
% TCount
; // we do it in this order since some compilers can do % and / in one operation
3547 uint32_t V
= L
% VCount
;
3556 // return the first CE, but first put the rest into the expansion buffer
3557 if (!source
->coll
->image
->jamoSpecial
) { // FAST PATH
3559 /**(source->CEpos++) = ucmpe32_get(UCA->mapping, V);*/
3560 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
3561 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(coll
->mapping
, V
);
3563 /**(source->CEpos++) = ucmpe32_get(UCA->mapping, T);*/
3564 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
3565 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(coll
->mapping
, T
);
3568 /*return ucmpe32_get(UCA->mapping, L);*/ // return first one
3569 /*return UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
3570 return UTRIE_GET32_FROM_LEAD(coll
->mapping
, L
);
3572 } else { // Jamo is Special
3573 // Since Hanguls pass the FCD check, it is
3574 // guaranteed that we won't be in
3575 // the normalization buffer if something like this happens
3576 // However, if we are using a uchar iterator and normalization
3577 // is ON, the Hangul that lead us here is going to be in that
3578 // normalization buffer. Here we want to restore the uchar
3579 // iterator state and pull out of the normalization buffer
3580 if(source
->iterator
!= NULL
&& source
->flags
& UCOL_ITER_INNORMBUF
) {
3581 source
->flags
= source
->origFlags
; // restore the iterator
3584 // Move Jamos into normalization buffer
3585 source
->writableBuffer
[0] = (UChar
)L
;
3586 source
->writableBuffer
[1] = (UChar
)V
;
3588 source
->writableBuffer
[2] = (UChar
)T
;
3589 source
->writableBuffer
[3] = 0;
3591 source
->writableBuffer
[2] = 0;
3594 source
->fcdPosition
= source
->pos
; // Indicate where to continue in main input string
3595 // after exhausting the writableBuffer
3596 source
->pos
= source
->writableBuffer
;
3597 source
->origFlags
= source
->flags
;
3598 source
->flags
|= UCOL_ITER_INNORMBUF
;
3599 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
3601 return(UCOL_IGNORABLE
);
3605 /* not yet implemented */
3606 /* probably after 1.8 */
3607 return UCOL_NOT_FOUND
;
3609 *status
= U_INTERNAL_PROGRAM_ERROR
;
3613 if (CE
<= UCOL_NOT_FOUND
) break;
3619 /* now uses Mark's getImplicitPrimary code */
// Builds the implicit collation elements for code point `cp` on behalf of the
// backward ("Prev") special-CE handler.
//   cp              - code point whose implicit primary weight is requested.
//   collationSource - iterator state; one CE is appended at CEpos and
//                     toReturn is updated.
// Returns a CE built from the low 16 bits of the implicit primary.
3621 inline uint32_t getPrevImplicit(UChar32 cp
, collIterate
*collationSource
) {
// r is the 32-bit implicit primary weight computed for cp.
3626 uint32_t r
= uprv_uca_getImplicitPrimary(cp
);
// Append a CE carrying the bits of r selected by UCOL_PRIMARYMASK; the low
// 0x0505 is presumably the common secondary/tertiary weights - TODO confirm.
3628 *(collationSource
->CEpos
++) = (r
& UCOL_PRIMARYMASK
) | 0x00000505;
// CEpos was post-incremented above, so toReturn now points just past the
// CE that was stored.
3629 collationSource
->toReturn
= collationSource
->CEpos
;
// Return the low 16 bits of r moved into the upper half of the CE; the 0xC0
// low byte presumably marks it as a continuation CE - TODO confirm against
// UCOL_CONTINUATION_MARKER.
3630 return ((r
& 0x0000FFFF)<<16) | 0x000000C0;
3634 * This function handles the special CEs like contractions, expansions,
3636 * It is called by both getPrevCE
3638 uint32_t ucol_prv_getSpecialPrevCE(const UCollator
*coll
, UChar ch
, uint32_t CE
,
3639 collIterate
*source
,
3642 const uint32_t *CEOffset
= NULL
;
3643 UChar
*UCharOffset
= NULL
;
3645 const UChar
*constart
= NULL
;
3647 UChar buffer
[UCOL_MAX_BUFFER
];
3648 uint32_t *endCEBuffer
;
3650 int32_t noChars
= 0;
3654 /* the only ces that loops are thai and contractions */
3655 switch (getCETag(CE
))
3657 case NOT_FOUND_TAG
: /* this tag always returns */
3659 case SURROGATE_TAG
: /* This is a surrogate pair */
3660 /* essentially an engaged lead surrogate. */
3661 /* if you have encountered it here, it means that a */
3662 /* broken sequence was encountered and this is an error */
3665 if ((source
->flags
& UCOL_ITER_INNORMBUF
) || /* Already Swapped || */
3666 source
->string
== source
->pos
|| /* At start of string.|| */
3667 /* previous char not Thai prevowel */
3668 /*UCOL_ISTHAIBASECONSONANT(*(source->pos)) == FALSE ||*/ // This is from the old specs - we now rearrange unconditionally
3669 UCOL_ISTHAIPREVOWEL(peekCharacter(source
, -1)) == FALSE
)
3670 //UCOL_ISTHAIPREVOWEL(*(source->pos - 1)) == FALSE)
3672 /* Treat Thai as a length one expansion */
3673 /* find the offset to expansion table */
3674 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
);
3680 Move the prevowel and the following base Consonant into the
3681 normalization buffer with their order swapped
3683 UChar32 cp
= (UChar32
)peekCharacter(source
, 0);
3684 UBool reorder
= TRUE
;
3686 int32_t decompLen
= unorm_getDecomposition(cp
, FALSE
, source
->writableBuffer
, UCOL_WRITABLE_BUFFER_SIZE
-1);
3688 decompLen
= -decompLen
; // there was no decomposition
3689 } else { // we need to check if we will hit a contraction trigger because of decomposition
3690 int32_t i
= decompLen
;
3691 for(i
= 0; i
< decompLen
; i
++) {
3692 if(ucol_contractionEndCP(source
->writableBuffer
[i
], coll
)) {
3698 UChar
*tempbuffer
= source
->writableBuffer
+
3699 (source
->writableBufSize
- 1);
3700 uprv_memcpy(tempbuffer
-decompLen
+ 1, source
->writableBuffer
, sizeof(UChar
)*decompLen
);
3702 *(tempbuffer
- decompLen
) = *(tempbuffer
- decompLen
+ 1);
3703 *(tempbuffer
- decompLen
+ 1) = peekCharacter(source
, -1);
3705 *(tempbuffer
- decompLen
) = peekCharacter(source
, -1);
3707 *(tempbuffer
- decompLen
- 1) = 0;
3711 UChar *tempbuffer = source->writableBuffer +
3712 (source->writableBufSize - 1);
3713 *(tempbuffer - 2) = 0;
3714 *(tempbuffer - 1) = peekCharacter(source, 0);
3715 *(tempbuffer) = peekCharacter(source, -1);
3718 Indicate where to continue in main input string after exhausting
3721 if (source
->pos
- 1 == source
->string
) {
3722 source
->fcdPosition
= NULL
;
3724 source
->fcdPosition
= source
->pos
-2;
3727 source
->pos
= tempbuffer
+1; // we're doing predecrement, right?
3728 source
->origFlags
= source
->flags
;
3729 source
->flags
|= UCOL_ITER_INNORMBUF
;
3730 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
3732 //CE = UCOL_IGNORABLE;
3733 return(UCOL_IGNORABLE
);
3738 // Special processing is getting a CE that is preceded by a certain prefix
3739 // Currently this is only needed for optimizing Japanese length and iteration marks.
3740 // When we encounter a special processing tag, we go backwards and try to see if
3742 // Contraction tables are used - so the whole process is not unlike contraction.
3743 // prefix data is stored backwards in the table.
3744 const UChar
*UCharOffset
;
3746 collIterateState prefixState
;
3747 backupState(source
, &prefixState
);
3749 // This loop will run once per source string character, for as long as we
3750 // are matching a potential contraction sequence
3752 // First we position ourselves at the beginning of contraction sequence
3753 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
3755 if (collIter_bos(source
)) {
3756 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
3759 schar
= getPrevNormalizedChar(source
);
3762 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3766 if (schar
== tchar
) {
3767 // Found the source string char in the table.
3768 // Pick up the corresponding CE from the table.
3769 CE
= *(coll
->contractionCEs
+
3770 (UCharOffset
- coll
->contractionIndex
));
3774 // if there is a completely ignorable code point in the middle of
3775 // a prefix, we need to act as if it's not there
3776 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3777 // lone surrogates cannot be set to zero as it would break other processing
3778 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, schar
);
3779 // it's easy for BMP code points
3782 } else if(UTF_IS_TRAIL(schar
) || UTF_IS_LEAD(schar
)) {
3783 // for supplementary code points, we have to check the next one
3784 // situations where we are going to ignore
3785 // 1. beginning of the string: schar is a lone surrogate
3786 // 2. schar is a lone surrogate
3787 // 3. schar is a trail surrogate in a valid surrogate sequence
3788 // that is explicitly set to zero.
3789 if (!collIter_bos(source
)) {
3791 if(UTF_IS_LEAD(lead
= getPrevNormalizedChar(source
))) {
3792 isZeroCE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, lead
);
3793 if(getCETag(isZeroCE
) == SURROGATE_TAG
) {
3794 uint32_t finalCE
= UTRIE_GET32_FROM_OFFSET_TRAIL(coll
->mapping
, isZeroCE
&0xFFFFFF, schar
);
3796 // this is a real, assigned completely ignorable code point
3802 // lone surrogate, completely ignorable
3806 // lone surrogate at the beginning, completely ignorable
3810 // Source string char was not in the table.
3811 // We have not found the prefix.
3812 CE
= *(coll
->contractionCEs
+
3813 (ContractionStart
- coll
->contractionIndex
));
3817 // The source string char was in the contraction table, and the corresponding
3818 // CE is not a prefix CE. We found the prefix, break
3819 // out of loop, this CE will end up being returned. This is the normal
3820 // way out of prefix handling when the source actually contained
3825 loadState(source
, &prefixState
, TRUE
);
3829 case CONTRACTION_TAG
:
3830 /* to ensure that the backwards and forwards iteration matches, we
3831 take the current region of most possible match and pass it through
3832 the forward iteration. this will ensure that the obstinate problem of
3833 overlapping contractions will not occur.
3835 schar
= peekCharacter(source
, 0);
3836 constart
= (UChar
*)coll
->image
+ getContractOffset(CE
);
3837 if (isAtStartPrevIterate(source
)
3838 /* commented away contraction end checks after adding the checks
3840 /* start of string or this is not the end of any contraction */
3841 CE
= *(coll
->contractionCEs
+
3842 (constart
- coll
->contractionIndex
));
3846 UCharOffset
= strbuffer
+ (UCOL_MAX_BUFFER
- 1);
3847 *(UCharOffset
--) = 0;
3849 // have to swap thai characters
3850 while (ucol_unsafeCP(schar
, coll
) || UCOL_ISTHAIPREVOWEL(peekCharacter(source
, -1))) {
3851 // we might have ended here after trying to reorder Thai, but seeing that there are unsafe points
3852 // in the backward processing
3853 *(UCharOffset
) = schar
;
3856 schar
= getPrevNormalizedChar(source
);
3858 // TODO: when we exhaust the contraction buffer,
3859 // it needs to get reallocated. The problem is
3860 // that the size depends on the string which is
3861 // not iterated over. However, since we're travelling
3862 // backwards, we already had to set the iterator at
3863 // the end - so we might as well know where we are?
3864 if (UCharOffset
+ 1 == buffer
) {
3865 /* we have exhausted the buffer */
3866 int32_t newsize
= 0;
3867 if(source
->pos
) { // actually dealing with a position
3868 newsize
= source
->pos
- source
->string
+ 1;
3869 } else { // iterator
3870 newsize
= 4 * UCOL_MAX_BUFFER
;
3872 strbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) *
3873 (newsize
+ UCOL_MAX_BUFFER
));
3875 if (strbuffer
== NULL
) {
3876 *status
= U_MEMORY_ALLOCATION_ERROR
;
3877 return UCOL_NO_MORE_CES
;
3879 UCharOffset
= strbuffer
+ newsize
;
3880 uprv_memcpy(UCharOffset
, buffer
,
3881 UCOL_MAX_BUFFER
* sizeof(UChar
));
3884 if ((source
->pos
&& (source
->pos
== source
->string
||
3885 ((source
->flags
& UCOL_ITER_INNORMBUF
) &&
3886 *(source
->pos
- 1) == 0 && source
->fcdPosition
== NULL
)))
3887 || (source
->iterator
&& !source
->iterator
->hasPrevious(source
->iterator
))) {
3891 /* adds the initial base character to the string */
3892 *(UCharOffset
) = schar
;
3895 /* a new collIterate is used to simplify things, since using the current
3896 collIterate will mean that the forward and backwards iteration will
3897 share and change the same buffers. we don't want to get into that. */
3899 //IInit_collIterate(coll, UCharOffset, -1, &temp);
3900 IInit_collIterate(coll
, UCharOffset
, noChars
, &temp
);
3901 temp
.flags
&= ~UCOL_ITER_NORM
;
3903 CE
= ucol_IGetNextCE(coll
, &temp
, status
);
3904 endCEBuffer
= source
->CEs
+ UCOL_EXPAND_CE_BUFFER_SIZE
;
3905 while (CE
!= UCOL_NO_MORE_CES
) {
3906 *(source
->CEpos
++) = CE
;
3907 if (source
->CEpos
== endCEBuffer
) {
3908 /* ran out of CE space, bail.
3909 there's no guarantee of the right character position after
3911 *status
= U_BUFFER_OVERFLOW_ERROR
;
3912 source
->CEpos
= source
->CEs
;
3913 freeHeapWritableBuffer(&temp
);
3914 if (strbuffer
!= buffer
) {
3915 uprv_free(strbuffer
);
3917 return (uint32_t)UCOL_NULLORDER
;
3919 CE
= ucol_IGetNextCE(coll
, &temp
, status
);
3921 freeHeapWritableBuffer(&temp
);
3922 if (strbuffer
!= buffer
) {
3923 uprv_free(strbuffer
);
3925 source
->toReturn
= source
->CEpos
- 1;
3926 if (source
->toReturn
== source
->CEs
) {
3927 source
->CEpos
= source
->CEs
;
3929 return *(source
->toReturn
);
3930 case LONG_PRIMARY_TAG
:
3932 *(source
->CEpos
++) = ((CE
& 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON
;
3933 *(source
->CEpos
++) = ((CE
& 0xFF)<<24)|UCOL_CONTINUATION_MARKER
;
3934 source
->toReturn
= source
->CEpos
- 1;
3935 return *(source
->toReturn
);
3937 case EXPANSION_TAG
: /* this tag always returns */
3939 This should handle expansion.
3940 NOTE: we can encounter both continuations and expansions in an expansion!
3941 I have to decide where continuations are going to be dealt with
3943 /* find the offset to expansion table */
3944 CEOffset
= (uint32_t *)coll
->image
+ getExpansionOffset(CE
);
3945 size
= getExpansionCount(CE
);
3948 if there are less than 16 elements in expansion, we don't terminate
3951 for (count
= 0; count
< size
; count
++) {
3952 *(source
->CEpos
++) = *CEOffset
++;
3957 while (*CEOffset
!= 0) {
3958 *(source
->CEpos
++) = *CEOffset
++;
3961 source
->toReturn
= source
->CEpos
- 1;
3962 // in case of one element expansion, we
3963 // want to immediately return CEpos
3964 if(source
->toReturn
== source
->CEs
) {
3965 source
->CEpos
= source
->CEs
;
3967 return *(source
->toReturn
);
3971 We do a check to see if we want to collate digits as numbers; if so we generate
3972 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3975 uint32_t i
; /* general counter */
3976 collIterateState state
;
3978 if (source
->coll
->numericCollation
== UCOL_ON
){
3981 uint32_t digIndx
= 0;
3982 uint32_t endIndex
= 0;
3983 uint32_t leadingZeroIndex
= 0;
3984 uint32_t trailingZeroCount
= 0;
3986 uint32_t primWeight
= 0;
3989 uint8_t collateVal
= 0;
3991 UBool nonZeroValReached
= FALSE
;
3993 uint8_t *numTempBuf
;
3994 uint8_t stackNumTempBuf
[UCOL_MAX_BUFFER
]; // I just need a temporary place to store my generated CEs.
3995 uint32_t numTempBufSize
= UCOL_MAX_BUFFER
;
3997 numTempBuf
= stackNumTempBuf
;
3999 We parse the source string until we hit a char that's NOT a digit.
4000 Use this u_charDigitValue. This might be slow because we have to
4001 handle surrogates...
4004 if (U16_IS_TRAIL (ch
)){
4005 if (!collIter_bos(source
)){
4006 UChar lead
= getPrevNormalizedChar(source
);
4007 if(U16_IS_LEAD(lead
)) {
4008 char32
= U16_GET_SUPPLEMENTARY(lead
,ch
);
4019 digVal
= u_charDigitValue(char32
);
4022 // Make sure we have enough space.
4023 if (digIndx
>= ((numTempBufSize
- 2) * 2) + 1)
4025 numTempBufSize
*= 2;
4026 if (numTempBuf
== stackNumTempBuf
){
4027 numTempBuf
= (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize
);
4028 uprv_memcpy(numTempBuf
, stackNumTempBuf
, UCOL_MAX_BUFFER
);
4030 uprv_realloc(numTempBuf
, numTempBufSize
);
4033 // Skip over trailing zeroes, and keep a count of them.
4035 nonZeroValReached
= TRUE
;
4036 if (nonZeroValReached
){
4038 We parse the digit string into base 100 numbers (this fits into a byte).
4039 We only add to the buffer in twos, thus if we are parsing an odd character,
4040 that serves as the 'tens' digit while the if we are parsing an even one, that
4041 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
4042 a buffer. We multiply each collateVal by 2 (to give us room) and add 6 (to avoid
4043 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
4044 than all the other bytes.
4046 Since we're doing in this reverse we want to put the first digit encountered into the
4047 ones place and the second digit encountered into the tens place.
4050 if ((digIndx
+ trailingZeroCount
) % 2 == 1){
4051 // High-order digit case (tens place)
4052 collateVal
+= (uint8_t)(digVal
* 10);
4054 // We cannot set leadingZeroIndex unless it has been set for the
4055 // low-order digit. Therefore, all we can do for the high-order
4056 // digit is turn it off, never on.
4057 // The only time we will have a high digit without a low is for
4058 // the very first non-zero digit, so no zero check is necessary.
4059 if (collateVal
!= 0)
4060 leadingZeroIndex
= 0;
4062 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
4066 // Low-order digit case (ones place)
4067 collateVal
= (uint8_t)digVal
;
4069 // Check for leading zeroes.
4070 if (collateVal
== 0)
4072 if (!leadingZeroIndex
)
4073 leadingZeroIndex
= (digIndx
/2) + 2;
4076 leadingZeroIndex
= 0;
4078 // No need to write to buffer; the case of a last odd digit
4079 // is handled below.
4084 ++trailingZeroCount
;
4086 if (!collIter_bos(source
)){
4087 ch
= getPrevNormalizedChar(source
);
4088 //goBackOne(source);
4089 if (U16_IS_TRAIL(ch
)){
4090 backupState(source
, &state
);
4091 if (!collIter_bos(source
))
4094 UChar lead
= getPrevNormalizedChar(source
);
4095 if(U16_IS_LEAD(lead
)) {
4096 char32
= U16_GET_SUPPLEMENTARY(lead
,ch
);
4098 loadState(source
, &state
, FALSE
);
4106 if ((digVal
= u_charDigitValue(char32
)) == -1){
4107 if (char32
> 0xFFFF) {// For surrogates.
4108 loadState(source
, &state
, FALSE
);
4110 // Don't need to "reverse" the goBackOne call,
4111 // as this points to the next position to process..
4112 //if (char32 > 0xFFFF) // For surrogates.
4113 //getNextNormalizedChar(source);
4121 if (nonZeroValReached
== FALSE
){
4123 trailingZeroCount
= 0;
4127 if ((digIndx
+ trailingZeroCount
) % 2 != 0){
4128 numTempBuf
[((digIndx
)/2) + 2] = collateVal
*2 + 6;
4129 digIndx
+= 1; // The implicit leading zero
4131 if (trailingZeroCount
% 2 != 0){
4132 // We had to consume one trailing zero for the low digit
4133 // of the least significant byte
4134 digIndx
+= 1; // The trailing zero not in the exponent
4135 trailingZeroCount
-= 1;
4138 endIndex
= leadingZeroIndex
? leadingZeroIndex
: ((digIndx
/2) + 2) ;
4140 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
4144 We want to skip over the first two slots in the buffer. The first slot
4145 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
4146 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
4147 The exponent must be adjusted by the number of leading zeroes, and the number of
4150 numTempBuf
[0] = UCOL_CODAN_PLACEHOLDER
;
4151 uint32_t exponent
= (digIndx
+trailingZeroCount
)/2;
4152 if (leadingZeroIndex
)
4153 exponent
-= ((digIndx
/2) + 2 - leadingZeroIndex
);
4154 numTempBuf
[1] = (uint8_t)(0x80 + (exponent
& 0x7F));
4156 // Now transfer the collation key to our collIterate struct.
4157 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
4158 //size = ((endIndex+1) & ~1)/2;
4159 *(source
->CEpos
++) = (((numTempBuf
[0] << 8) | numTempBuf
[1]) << UCOL_PRIMARYORDERSHIFT
) | //Primary weight
4160 (UCOL_BYTE_COMMON
<< UCOL_SECONDARYORDERSHIFT
) | // Secondary weight
4161 UCOL_BYTE_COMMON
; // Tertiary weight.
4162 i
= endIndex
- 1; // Reset the index into the buffer.
4165 primWeight
= numTempBuf
[i
--] << 8;
4167 primWeight
|= numTempBuf
[i
--];
4168 *(source
->CEpos
++) = (primWeight
<< UCOL_PRIMARYORDERSHIFT
) | UCOL_CONTINUATION_MARKER
;
4170 if (numTempBuf
!= stackNumTempBuf
)
4171 uprv_free(numTempBuf
);
4173 source
->toReturn
= source
->CEpos
-1;
4174 return *(source
->toReturn
);
4177 CEOffset
= (uint32_t *)coll
->image
+ getExpansionOffset(CE
);
4181 /* find the offset to expansion table */
4182 CEOffset
= (uint32_t *)coll
->image
+ getExpansionOffset(CE
);
4183 size
= getExpansionCount(CE
);
4186 if there are less than 16 elements in expansion, we don't terminate
4189 for (count
= 0; count
< size
; count
++) {
4190 *(source
->CEpos
++) = *CEOffset
++;
4195 while (*CEOffset
!= 0) {
4196 *(source
->CEpos
++) = *CEOffset
++;
4199 source
->toReturn
= source
->CEpos
- 1;
4200 // in case of one element expansion, we
4201 // want to immediately return CEpos
4202 if(source
->toReturn
== source
->CEs
) {
4203 source
->CEpos
= source
->CEs
;
4205 return *(source
->toReturn
);
4209 case HANGUL_SYLLABLE_TAG
: /* AC00-D7AF*/
4212 SBase
= 0xAC00, LBase
= 0x1100, VBase
= 0x1161, TBase
= 0x11A7;
4213 //const uint32_t LCount = 19;
4214 const uint32_t VCount
= 21;
4215 const uint32_t TCount
= 28;
4216 //const uint32_t NCount = VCount * TCount; /* 588 */
4217 //const uint32_t SCount = LCount * NCount; /* 11172 */
4219 uint32_t L
= ch
- SBase
;
4222 we do it in this order since some compilers can do % and / in one
4225 uint32_t T
= L
% TCount
;
4227 uint32_t V
= L
% VCount
;
4236 return the first CE, but first put the rest into the expansion buffer
4238 if (!source
->coll
->image
->jamoSpecial
)
4240 /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, L);*/
4241 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
4242 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(coll
->mapping
, L
);
4243 /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, V);*/
4244 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
4245 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(coll
->mapping
, V
);
4247 /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, T);*/
4248 /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
4249 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(coll
->mapping
, T
);
4251 source
->toReturn
= source
->CEpos
- 1;
4252 return *(source
->toReturn
);
4254 // Since Hanguls pass the FCD check, it is
4255 // guaranteed that we won't be in
4256 // the normalization buffer if something like this happens
4257 // Move Jamos into normalization buffer
4259 Move the Jamos into the
4260 normalization buffer
4262 UChar
*tempbuffer
= source
->writableBuffer
+
4263 (source
->writableBufSize
- 1);
4266 *(tempbuffer
- 1) = (UChar
)T
;
4267 *(tempbuffer
- 2) = (UChar
)V
;
4268 *(tempbuffer
- 3) = (UChar
)L
;
4269 *(tempbuffer
- 4) = 0;
4271 *(tempbuffer
- 1) = (UChar
)V
;
4272 *(tempbuffer
- 2) = (UChar
)L
;
4273 *(tempbuffer
- 3) = 0;
4277 Indicate where to continue in main input string after exhausting
4280 if (source
->pos
== source
->string
) {
4281 source
->fcdPosition
= NULL
;
4283 source
->fcdPosition
= source
->pos
-1;
4286 source
->pos
= tempbuffer
;
4287 source
->origFlags
= source
->flags
;
4288 source
->flags
|= UCOL_ITER_INNORMBUF
;
4289 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
4291 return(UCOL_IGNORABLE
);
4294 case LEAD_SURROGATE_TAG
: /* D800-DBFF*/
4295 return 0; /* broken surrogate sequence */
4296 case TRAIL_SURROGATE_TAG
: /* DC00-DFFF*/
4301 if (isAtStartPrevIterate(source
)) {
4302 /* we are at the start of the string, wrong place to be at */
4305 if (source
->pos
!= source
->writableBuffer
) {
4306 prev
= source
->pos
- 1;
4308 prev
= source
->fcdPosition
;
4312 /* Handles Han and Supplementary characters here.*/
4313 if (UTF_IS_FIRST_SURROGATE(prevChar
)) {
4314 cp
= ((((uint32_t)prevChar
)<<10UL)+(ch
)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4317 return 0; /* completely ignorable */
4319 return getPrevImplicit(cp
, source
);
4321 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4322 case CJK_IMPLICIT_TAG
: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4323 return getPrevImplicit(ch
, source
);
4324 case IMPLICIT_TAG
: /* everything that is not defined otherwise */
4325 return getPrevImplicit(ch
, source
);
4326 /* UCA is filled with these. Tailorings are NOT_FOUND */
4327 /* not yet implemented */
4328 case CHARSET_TAG
: /* this tag always returns */
4329 /* probably after 1.8 */
4330 return UCOL_NOT_FOUND
;
4331 default: /* this tag always returns */
4332 *status
= U_INTERNAL_PROGRAM_ERROR
;
4336 if (CE
<= UCOL_NOT_FOUND
) {
4343 /* This should really be a macro */
4344 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
4347 uint8_t *reallocateBuffer(uint8_t **secondaries
, uint8_t *secStart
, uint8_t *second
, uint32_t *secSize
, uint32_t newSize
, UErrorCode
*status
) {
4349 fprintf(stderr
, ".");
4351 uint8_t *newStart
= NULL
;
4352 uint32_t offset
= *secondaries
-secStart
;
4354 if(secStart
==second
) {
4355 newStart
=(uint8_t*)uprv_malloc(newSize
);
4356 if(newStart
==NULL
) {
4357 *status
= U_MEMORY_ALLOCATION_ERROR
;
4360 uprv_memcpy(newStart
, secStart
, *secondaries
-secStart
);
4362 newStart
=(uint8_t*)uprv_realloc(secStart
, newSize
);
4363 if(newStart
==NULL
) {
4364 *status
= U_MEMORY_ALLOCATION_ERROR
;
4368 *secondaries
=newStart
+offset
;
/* This is used to reverse parts of a buffer. We need this operation when doing continuation */
/* secondaries in French                                                                      */
/*
 * uprv_ucol_reverse_buffer(TYPE, start, end)
 *   TYPE  - element type of the buffer (the callers pass uint8_t)
 *   start - pointer to the first element of the range to reverse
 *   end   - pointer to the LAST element of the range (inclusive)
 *
 * start and end must be modifiable pointer variables: the macro walks them
 * towards each other while swapping, so both have changed when it finishes.
 * The expansion is a brace block, so it has to be used as a statement.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    TYPE tempA; \
    while((start)<(end)) { \
        tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
}
4397 /****************************************************************************/
4398 /* Following are the sortkey generation functions */
4400 /****************************************************************************/
4403 * Merge two sort keys.
4404 * This is useful, for example, to combine sort keys from first and last names
4405 * to sort such pairs.
4406 * Merged sort keys consider on each collation level the first part first entirely,
4407 * then the second one.
4408 * It is possible to merge multiple sort keys by consecutively merging
4409 * another one with the intermediate result.
4411 * The length of the merge result is the sum of the lengths of the input sort keys
4414 * @param src1 the first sort key
4415 * @param src1Length the length of the first sort key, including the zero byte at the end;
4416 * can be -1 if the function is to find the length
4417 * @param src2 the second sort key
4418 * @param src2Length the length of the second sort key, including the zero byte at the end;
4419 * can be -1 if the function is to find the length
4420 * @param dest the buffer where the merged sort key is written,
4421 * can be NULL if destCapacity==0
4422 * @param destCapacity the number of bytes in the dest buffer
4423 * @return the length of the merged sort key, src1Length+src2Length-1;
4424 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
4425 * in which cases the contents of dest is undefined
4429 U_CAPI
int32_t U_EXPORT2
4430 ucol_mergeSortkeys(const uint8_t *src1
, int32_t src1Length
,
4431 const uint8_t *src2
, int32_t src2Length
,
4432 uint8_t *dest
, int32_t destCapacity
) {
4436 /* check arguments */
4437 if( src1
==NULL
|| src1Length
<-2 || src1Length
==0 || (src1Length
>0 && src1
[src1Length
-1]!=0) ||
4438 src2
==NULL
|| src2Length
<-2 || src2Length
==0 || (src2Length
>0 && src2
[src2Length
-1]!=0) ||
4439 destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)
4441 /* error, attempt to write a zero byte and return 0 */
4442 if(dest
!=NULL
&& destCapacity
>0) {
4448 /* check lengths and capacity */
4450 src1Length
=(int32_t)uprv_strlen((const char *)src1
)+1;
4453 src2Length
=(int32_t)uprv_strlen((const char *)src2
)+1;
4456 destLength
=src1Length
+src2Length
-1;
4457 if(destLength
>destCapacity
) {
4458 /* the merged sort key does not fit into the destination */
4462 /* merge the sort keys with the same number of levels */
4463 while(*src1
!=0 && *src2
!=0) { /* while both have another level */
4464 /* copy level from src1 not including 00 or 01 */
4465 while((b
=*src1
)>=2) {
4470 /* add a 02 merge separator */
4473 /* copy level from src2 not including 00 or 01 */
4474 while((b
=*src2
)>=2) {
4479 /* if both sort keys have another level, then add a 01 level separator and continue */
4480 if(*src1
==1 && *src2
==1) {
4488 * here, at least one sort key is finished now, but the other one
4489 * might have some contents left from containing more levels;
4490 * that contents is just appended to the result
4493 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4496 /* append src2, "the other, unfinished sort key" */
4497 uprv_strcpy((char *)dest
, (const char *)src2
);
4499 /* trust that neither sort key contained illegally embedded zero bytes */
4504 U_CAPI
int32_t U_EXPORT2
4505 ucol_getSortKey(const UCollator
*coll
,
4506 const UChar
*source
,
4507 int32_t sourceLength
,
4509 int32_t resultLength
)
4511 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY
);
4512 if (UTRACE_LEVEL(UTRACE_VERBOSE
)) {
4513 int32_t actualSrcLen
= sourceLength
;
4514 if (actualSrcLen
==-1 && source
!=NULL
) {
4515 actualSrcLen
= u_strlen(source
);
4517 UTRACE_DATA3(UTRACE_VERBOSE
, "coll=%p, source string = %vh ", coll
, source
, actualSrcLen
);
4520 UErrorCode status
= U_ZERO_ERROR
;
4521 int32_t keySize
= 0;
4523 if(source
!= NULL
) {
4524 // source == NULL is actually an error situation, but we would need to
4525 // have an error code to return it. Until we introduce a new
4526 // API, it stays like this
4528 /* this uses the function pointer that is set in updateinternalstate */
4529 /* currently, there are two funcs: */
4530 /*ucol_calcSortKey(...);*/
4531 /*ucol_calcSortKeySimpleTertiary(...);*/
4533 keySize
= coll
->sortKeyGen(coll
, source
, sourceLength
, &result
, resultLength
, FALSE
, &status
);
4534 //((UCollator *)coll)->errorCode = status; /*semantically const */
4536 UTRACE_DATA2(UTRACE_VERBOSE
, "Sort Key = %vb", result
, keySize
);
4537 UTRACE_EXIT_STATUS(status
);
4541 /* this function is called by the C++ API for sortkey generation */
/*
 * Thin wrapper: forwards to the collator's sortKeyGen function pointer with a
 * result length of 0 and TRUE as the sixth argument (the position that
 * ucol_calcSortKey names allocateSKBuffer), and returns whatever sortKeyGen
 * returns.
 * NOTE(review): pResult is used below, but its declaration and the line
 * carrying the return type are not in this listing (the embedded numbers skip
 * 4542, 4545 and 4547) - confirm the full signature against the complete source.
 */
4543 ucol_getSortKeyWithAllocation(const UCollator
*coll
,
4544 const UChar
*source
, int32_t sourceLength
,
4546 UErrorCode
*pErrorCode
) {
4548 return coll
->sortKeyGen(coll
, source
, sourceLength
, pResult
, 0, TRUE
, pErrorCode
);
4551 #define UCOL_FSEC_BUF_SIZE 256
4553 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
4554 /* or if we run out of space while making a sortkey and want to return ASAP */
/*
 * Pulls collation elements from the iterator s with ucol_IGetNextCE and adds to
 * currentSize the number of bytes each enabled level would contribute to a sort
 * key; no key bytes are written.
 *   coll        - collator; caseLevel, alternateHandling, hiraganaQ,
 *                 frenchCollation, variableTopValue and the tertiary settings
 *                 read below decide what is counted
 *   s           - iterator the CEs are read from
 *   currentSize - running size the per-level counts are added to
 *   strength    - turned into compareSec / compareTer / compareQuad /
 *                 compareIdent, which switch the higher levels on or off
 *   len         - passed, with s->string, to u_lengthOfIdenticalLevelRun
 * c2, c3 and c4 are the compression run counters named in their declaration.
 * With French secondaries on, secondary weights are collected in fSecs (stack
 * buffer first, heap when it fills up) and counted in a second pass.
 * NOTE(review): the leading numbers look like line numbers of the original file
 * fused into this listing; gaps in them (the loop header, counter updates and
 * the return statement, among others) are lines this listing does not contain.
 * ucol_calcSortKey assigns the return value to its key size - confirm the
 * return statement against the complete source.
 */
4555 int32_t ucol_getSortKeySize(const UCollator
*coll
, collIterate
*s
, int32_t currentSize
, UColAttributeValue strength
, int32_t len
) {
4556 UErrorCode status
= U_ZERO_ERROR
;
4557 const UCAConstants
*UCAconsts
= (UCAConstants
*)((uint8_t *)coll
->UCA
->image
+ coll
->image
->UCAConsts
);
4558 uint8_t compareSec
= (uint8_t)((strength
>= UCOL_SECONDARY
)?0:0xFF);
4559 uint8_t compareTer
= (uint8_t)((strength
>= UCOL_TERTIARY
)?0:0xFF);
4560 uint8_t compareQuad
= (uint8_t)((strength
>= UCOL_QUATERNARY
)?0:0xFF);
4561 UBool compareIdent
= (strength
== UCOL_IDENTICAL
);
4562 UBool doCase
= (coll
->caseLevel
== UCOL_ON
);
4563 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
4564 //UBool qShifted = shifted && (compareQuad == 0);
4565 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && (compareQuad
== 0);
4566 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && (compareSec
== 0);
4567 uint8_t fSecsBuff
[UCOL_FSEC_BUF_SIZE
];
4568 uint8_t *fSecs
= fSecsBuff
;
4569 uint32_t fSecsLen
= 0, fSecsMaxLen
= UCOL_FSEC_BUF_SIZE
;
4570 uint8_t *frenchStartPtr
= NULL
, *frenchEndPtr
= NULL
;
4572 uint32_t variableTopValue
= coll
->variableTopValue
;
4573 uint8_t UCOL_COMMON_BOT4
= (uint8_t)((coll
->variableTopValue
>>8)+1);
4576 /* allocate one more space for hiragana */
4578 uint8_t UCOL_BOT_COUNT4
= (uint8_t)(0xFF - UCOL_COMMON_BOT4
);
4580 uint32_t order
= UCOL_NO_MORE_CES
;
4581 uint8_t primary1
= 0;
4582 uint8_t primary2
= 0;
4583 uint8_t secondary
= 0;
4584 uint8_t tertiary
= 0;
4585 int32_t caseShift
= 0;
4586 uint32_t c2
= 0, c3
= 0, c4
= 0; /* variables for compression */
4588 uint8_t caseSwitch
= coll
->caseSwitch
;
4589 uint8_t tertiaryMask
= coll
->tertiaryMask
;
4590 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
4592 UBool wasShifted
= FALSE
;
4593 UBool notIsContinuation
= FALSE
;
4594 uint8_t leadPrimary
= 0;
4598 order
= ucol_IGetNextCE(coll
, s
, &status
);
4599 if(order
== UCOL_NO_MORE_CES
) {
4607 notIsContinuation
= !isContinuation(order
);
4610 if(notIsContinuation
) {
4611 tertiary
= (uint8_t)((order
& UCOL_BYTE_SIZE_MASK
));
4613 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
4615 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4616 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4617 primary1
= (uint8_t)(order
>> 8);
4620 if(shifted
&& ((notIsContinuation
&& order
<= variableTopValue
&& primary1
> 0)
4621 || (!notIsContinuation
&& wasShifted
))
4622 || (wasShifted
&& primary1
== 0)) { /* amendment to the UCA says that primary ignorables */
4623 /* and other ignorables should be removed if following a shifted code point */
4624 if(primary1
== 0) { /* if we were shifted and we got an ignorable code point */
4625 /* we should just completely ignore it */
4628 if(compareQuad
== 0) {
/* NOTE(review): this divides c2 by the quaternary run length UCOL_BOT_COUNT4, */
/* while the Hiragana branch and the final quaternary count further down use c4 */
/* with UCOL_BOT_COUNT4 - presumably c4 was intended here; confirm. */
4630 currentSize
+= (c2
/UCOL_BOT_COUNT4
)+1;
4641 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4642 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4643 /* calculate sortkey size */
4644 if(primary1
!= UCOL_IGNORABLE
) {
4645 if(notIsContinuation
) {
4646 if(leadPrimary
== primary1
) {
4649 if(leadPrimary
!= 0) {
4652 if(primary2
== UCOL_IGNORABLE
) {
4653 /* one byter, not compressed */
4656 } else if(primary1
<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
||
4657 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4658 (primary1
> (*UCAconsts
->UCA_LAST_NON_VARIABLE
>>24) && primary1
< (*UCAconsts
->UCA_FIRST_IMPLICIT
>>24))) {
4659 /* not compressible */
4662 } else { /* compress */
4663 leadPrimary
= primary1
;
4667 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4669 if(primary2
!= UCOL_IGNORABLE
) {
4675 if(secondary
> compareSec
) { /* I think that != 0 test should be != IGNORABLE */
4677 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
4681 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4682 currentSize
+= (c2
/(uint32_t)UCOL_TOP_COUNT2
)+1;
4684 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+1;
4691 fSecs
[fSecsLen
++] = secondary
;
4692 if(fSecsLen
== fSecsMaxLen
) {
4693 if(fSecs
== fSecsBuff
) {
/* NOTE(review): the buffer is full here. No line of this listing copies the */
/* bytes already stored in fSecsBuff into the block obtained below, and the */
/* realloc result in the other branch is stored straight into fSecs, so a */
/* failed realloc would lose the only pointer to the old block - check both */
/* against the complete source. */
4694 fSecs
= (uint8_t *)uprv_malloc(2*fSecsLen
);
4696 fSecs
= (uint8_t *)uprv_realloc(fSecs
, 2*fSecsLen
);
4699 status
= U_MEMORY_ALLOCATION_ERROR
;
4704 if(notIsContinuation
) {
4705 if (frenchStartPtr
!= NULL
) {
4706 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4707 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4708 frenchStartPtr
= NULL
;
4711 if (frenchStartPtr
== NULL
) {
4712 frenchStartPtr
= fSecs
+fSecsLen
-2;
4714 frenchEndPtr
= fSecs
+fSecsLen
-1;
4720 if (caseShift
== 0) {
4722 caseShift
= UCOL_CASE_SHIFT_START
;
4724 if((tertiary
&0x3F) > 0 && notIsContinuation
) {
4726 if((tertiary
&0xC0) != 0) {
4727 if (caseShift
== 0) {
4729 caseShift
= UCOL_CASE_SHIFT_START
;
4735 if(notIsContinuation
) {
4736 tertiary
^= caseSwitch
;
4740 tertiary
&= tertiaryMask
;
4741 if(tertiary
> compareTer
) { /* I think that != 0 test should be != IGNORABLE */
4742 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
4746 if((tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
)
4747 || (tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
)) {
4748 currentSize
+= (c3
/(uint32_t)coll
->tertiaryTopCount
)+1;
4750 currentSize
+= (c3
/(uint32_t)coll
->tertiaryBottomCount
)+1;
4758 if(/*qShifted*/(compareQuad
==0) && notIsContinuation
) {
4759 if(s
->flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
4760 if(c4
>0) { // Close this part
4761 currentSize
+= (c4
/UCOL_BOT_COUNT4
)+1;
4764 currentSize
++; // Add the Hiragana
4765 } else { // This wasn't Hiragana, so we can continue adding stuff
4775 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4779 if(frenchStartPtr
!= NULL
) {
4780 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4782 for(i
= 0; i
<fSecsLen
; i
++) {
4783 secondary
= *(fSecs
+fSecsLen
-i
-1);
4784 /* This is compression code. */
4785 if (secondary
== UCOL_COMMON2
) {
4789 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4790 currentSize
+= (c2
/(uint32_t)UCOL_TOP_COUNT2
)+((c2
%(uint32_t)UCOL_TOP_COUNT2
!= 0)?1:0);
4792 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4800 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4802 if(fSecs
!= fSecsBuff
) {
4808 currentSize
+= (c3
/(uint32_t)coll
->tertiaryBottomCount
) + ((c3
%(uint32_t)coll
->tertiaryBottomCount
!= 0)?1:0);
4811 if(c4
> 0 && compareQuad
== 0) {
4812 currentSize
+= (c4
/(uint32_t)UCOL_BOT_COUNT4
)+((c4
%(uint32_t)UCOL_BOT_COUNT4
!= 0)?1:0);
4816 currentSize
+= u_lengthOfIdenticalLevelRun(s
->string
, len
);
4823 inline void doCaseShift(uint8_t **cases
, uint32_t &caseShift
) {
4824 if (caseShift
== 0) {
4825 *(*cases
)++ = UCOL_CASE_BYTE_START
;
4826 caseShift
= UCOL_CASE_SHIFT_START
;
// Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we
// know how many values we wanted to add, even if we didn't add them all
//
// primaries - in/out: write position; advanced only when the value is stored
// limit     - one past the last writable byte; nothing is stored at or beyond it
// size      - in/out: counts every call, whether or not the value was stored
// value     - the byte to append
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    size++;
    if(primaries < limit) {
        *(primaries)++ = value;
    }
}
4840 // Packs the secondary buffer when processing French locale. Adds the terminator.
4842 inline uint8_t *packFrench(uint8_t *primaries
, uint8_t *primEnd
, uint8_t *secondaries
, uint32_t *secsize
, uint8_t *frenchStartPtr
, uint8_t *frenchEndPtr
) {
4845 uint32_t i
= 0, size
= 0;
4846 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4847 addWithIncrement(primaries
, primEnd
, i
, UCOL_LEVELTERMINATOR
);
4848 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4849 if(frenchStartPtr
!= NULL
) {
4850 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4852 for(i
= 0; i
<*secsize
; i
++) {
4853 secondary
= *(secondaries
-i
-1);
4854 /* This is compression code. */
4855 if (secondary
== UCOL_COMMON2
) {
4859 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4860 while (count2
> UCOL_TOP_COUNT2
) {
4861 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
));
4862 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
4864 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1)));
4866 while (count2
> UCOL_BOT_COUNT2
) {
4867 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
));
4868 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4870 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1)));
4874 addWithIncrement(primaries
, primEnd
, size
, secondary
);
4878 while (count2
> UCOL_BOT_COUNT2
) {
4879 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
));
4880 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4882 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1)));
4888 /* This is the sortkey work horse function */
4889 U_CFUNC
int32_t U_CALLCONV
4890 ucol_calcSortKey(const UCollator
*coll
,
4891 const UChar
*source
,
4892 int32_t sourceLength
,
4894 uint32_t resultLength
,
4895 UBool allocateSKBuffer
,
4898 const UCAConstants
*UCAconsts
= (UCAConstants
*)((uint8_t *)coll
->UCA
->image
+ coll
->image
->UCAConsts
);
4900 uint32_t i
= 0; /* general purpose counter */
4902 /* Stack allocated buffers for buffers we use */
4903 uint8_t prim
[UCOL_PRIMARY_MAX_BUFFER
], second
[UCOL_SECONDARY_MAX_BUFFER
], tert
[UCOL_TERTIARY_MAX_BUFFER
], caseB
[UCOL_CASE_MAX_BUFFER
], quad
[UCOL_QUAD_MAX_BUFFER
];
4905 uint8_t *primaries
= *result
, *secondaries
= second
, *tertiaries
= tert
, *cases
= caseB
, *quads
= quad
;
4907 if(U_FAILURE(*status
)) {
4911 if(primaries
== NULL
&& allocateSKBuffer
== TRUE
) {
4912 primaries
= *result
= prim
;
4913 resultLength
= UCOL_PRIMARY_MAX_BUFFER
;
4916 uint32_t secSize
= UCOL_SECONDARY_MAX_BUFFER
, terSize
= UCOL_TERTIARY_MAX_BUFFER
,
4917 caseSize
= UCOL_CASE_MAX_BUFFER
, quadSize
= UCOL_QUAD_MAX_BUFFER
;
4919 uint32_t sortKeySize
= 1; /* it is always \0 terminated */
4921 UChar normBuffer
[UCOL_NORMALIZATION_MAX_BUFFER
];
4922 UChar
*normSource
= normBuffer
;
4923 int32_t normSourceLen
= UCOL_NORMALIZATION_MAX_BUFFER
;
4925 int32_t len
= (sourceLength
== -1 ? u_strlen(source
) : sourceLength
);
4927 UColAttributeValue strength
= coll
->strength
;
4929 uint8_t compareSec
= (uint8_t)((strength
>= UCOL_SECONDARY
)?0:0xFF);
4930 uint8_t compareTer
= (uint8_t)((strength
>= UCOL_TERTIARY
)?0:0xFF);
4931 uint8_t compareQuad
= (uint8_t)((strength
>= UCOL_QUATERNARY
)?0:0xFF);
4932 UBool compareIdent
= (strength
== UCOL_IDENTICAL
);
4933 UBool doCase
= (coll
->caseLevel
== UCOL_ON
);
4934 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && (compareSec
== 0);
4935 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
4936 //UBool qShifted = shifted && (compareQuad == 0);
4937 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && (compareQuad
== 0);
4938 const uint8_t *scriptOrder
= coll
->scriptOrder
;
4940 uint32_t variableTopValue
= coll
->variableTopValue
;
4941 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4942 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4943 uint8_t UCOL_COMMON_BOT4
= (uint8_t)((coll
->variableTopValue
>>8)+1);
4944 uint8_t UCOL_HIRAGANA_QUAD
= 0;
4946 UCOL_HIRAGANA_QUAD
=UCOL_COMMON_BOT4
++;
4947 /* allocate one more space for hiragana, value for hiragana */
4949 uint8_t UCOL_BOT_COUNT4
= (uint8_t)(0xFF - UCOL_COMMON_BOT4
);
4951 /* support for special features like caselevel and funky secondaries */
4952 uint8_t *frenchStartPtr
= NULL
;
4953 uint8_t *frenchEndPtr
= NULL
;
4954 uint32_t caseShift
= 0;
4956 sortKeySize
+= ((compareSec
?0:1) + (compareTer
?0:1) + (doCase
?1:0) + /*(qShifted?1:0)*/(compareQuad
?0:1) + (compareIdent
?1:0));
4958 /* If we need to normalize, we'll do it all at once at the beginning! */
4959 UNormalizationMode normMode
;
4961 normMode
= UNORM_NFD
;
4962 } else if(coll
->normalizationMode
!= UCOL_OFF
) {
4963 normMode
= UNORM_FCD
;
4965 normMode
= UNORM_NONE
;
4968 if(normMode
!= UNORM_NONE
&& UNORM_YES
!= unorm_quickCheck(source
, len
, normMode
, status
)) {
4969 len
= unorm_internalNormalize(normSource
, normSourceLen
,
4973 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
4974 normSourceLen
= len
;
4975 normSource
= (UChar
*)uprv_malloc(len
*U_SIZEOF_UCHAR
);
4976 if(normSource
== NULL
) {
4977 *status
= U_MEMORY_ALLOCATION_ERROR
;
4980 *status
= U_ZERO_ERROR
;
4981 len
= unorm_internalNormalize(normSource
, normSourceLen
,
4987 if(U_FAILURE(*status
)) {
4990 source
= normSource
;
4994 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
4995 if(source
== normSource
) {
4996 s
.flags
&= ~UCOL_ITER_NORM
;
4999 if(resultLength
== 0 || primaries
== NULL
) {
5000 int32_t keyLen
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
5001 if(normSource
!= normBuffer
) {
5002 uprv_free(normSource
);
5006 uint8_t *primarySafeEnd
= primaries
+ resultLength
- 2;
5008 uint32_t minBufferSize
= UCOL_MAX_BUFFER
;
5010 uint8_t *primStart
= primaries
;
5011 uint8_t *secStart
= secondaries
;
5012 uint8_t *terStart
= tertiaries
;
5013 uint8_t *caseStart
= cases
;
5014 uint8_t *quadStart
= quads
;
5018 uint8_t primary1
= 0;
5019 uint8_t primary2
= 0;
5020 uint8_t secondary
= 0;
5021 uint8_t tertiary
= 0;
5022 uint8_t caseSwitch
= coll
->caseSwitch
;
5023 uint8_t tertiaryMask
= coll
->tertiaryMask
;
5024 int8_t tertiaryAddition
= (int8_t)coll
->tertiaryAddition
;
5025 uint8_t tertiaryTop
= coll
->tertiaryTop
;
5026 uint8_t tertiaryBottom
= coll
->tertiaryBottom
;
5027 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
5028 uint8_t caseBits
= 0;
5030 UBool finished
= FALSE
;
5031 UBool wasShifted
= FALSE
;
5032 UBool notIsContinuation
= FALSE
;
5034 uint32_t prevBuffSize
= 0;
5036 uint32_t count2
= 0, count3
= 0, count4
= 0;
5037 uint8_t leadPrimary
= 0;
5040 for(i
=prevBuffSize
; i
<minBufferSize
; ++i
) {
5042 order
= ucol_IGetNextCE(coll
, &s
, status
);
5043 if(order
== UCOL_NO_MORE_CES
) {
5052 notIsContinuation
= !isContinuation(order
);
5054 if(notIsContinuation
) {
5055 tertiary
= (uint8_t)(order
& UCOL_BYTE_SIZE_MASK
);
5057 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
5060 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
5061 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
5062 primary1
= (uint8_t)(order
>> 8);
5064 if(notIsContinuation
) {
5065 if(scriptOrder
!= NULL
) {
5066 primary1
= scriptOrder
[primary1
];
5070 if(shifted
&& ((notIsContinuation
&& order
<= variableTopValue
&& primary1
> 0)
5071 || (!notIsContinuation
&& wasShifted
))
5072 || (wasShifted
&& primary1
== 0)) { /* amendment to the UCA says that primary ignorables */
5073 /* and other ignorables should be removed if following a shifted code point */
5074 if(primary1
== 0) { /* if we were shifted and we got an ignorable code point */
5075 /* we should just completely ignore it */
5078 if(compareQuad
== 0) {
5080 while (count4
> UCOL_BOT_COUNT4
) {
5081 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
5082 count4
-= UCOL_BOT_COUNT4
;
5084 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
5087 /* We are dealing with a variable and we're treating them as shifted */
5088 /* This is a shifted ignorable */
5089 if(primary1
!= 0) { /* we need to check this since we could be in continuation */
5090 *quads
++ = primary1
;
5093 *quads
++ = primary2
;
5099 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5100 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5101 /* regular and simple sortkey calc */
5102 if(primary1
!= UCOL_IGNORABLE
) {
5103 if(notIsContinuation
) {
5104 if(leadPrimary
== primary1
) {
5105 *primaries
++ = primary2
;
5107 if(leadPrimary
!= 0) {
5108 *primaries
++ = (uint8_t)((primary1
> leadPrimary
) ? UCOL_BYTE_UNSHIFTED_MAX
: UCOL_BYTE_UNSHIFTED_MIN
);
5110 if(primary2
== UCOL_IGNORABLE
) {
5111 /* one byter, not compressed */
5112 *primaries
++ = primary1
;
5114 } else if(primary1
<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
||
5115 (primary1
> (*UCAconsts
->UCA_LAST_NON_VARIABLE
>>24) && primary1
< (*UCAconsts
->UCA_FIRST_IMPLICIT
>>24))) {
5116 /* not compressible */
5118 *primaries
++ = primary1
;
5119 *primaries
++ = primary2
;
5120 } else { /* compress */
5121 *primaries
++ = leadPrimary
= primary1
;
5122 *primaries
++ = primary2
;
5125 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5126 *primaries
++ = primary1
;
5127 if(primary2
!= UCOL_IGNORABLE
) {
5128 *primaries
++ = primary2
; /* second part */
5133 if(secondary
> compareSec
) {
5135 /* This is compression code. */
5136 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
5140 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
5141 while (count2
> UCOL_TOP_COUNT2
) {
5142 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
);
5143 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
5145 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1));
5147 while (count2
> UCOL_BOT_COUNT2
) {
5148 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5149 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5151 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5155 *secondaries
++ = secondary
;
5158 *secondaries
++ = secondary
;
5159 /* Do the special handling for French secondaries */
5160 /* We need to get continuation elements and do intermediate restore */
5161 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
5162 if(notIsContinuation
) {
5163 if (frenchStartPtr
!= NULL
) {
5164 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
5165 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
5166 frenchStartPtr
= NULL
;
5169 if (frenchStartPtr
== NULL
) {
5170 frenchStartPtr
= secondaries
- 2;
5172 frenchEndPtr
= secondaries
-1;
5178 doCaseShift(&cases
, caseShift
);
5179 if(notIsContinuation
) {
5180 caseBits
= (uint8_t)(tertiary
& 0xC0);
5183 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
5184 if((caseBits
& 0xC0) == 0) {
5185 *(cases
-1) |= 1 << (--caseShift
);
5187 *(cases
-1) |= 0 << (--caseShift
);
5189 doCaseShift(&cases
, caseShift
);
5190 *(cases
-1) |= ((caseBits
>>6)&1) << (--caseShift
);
5193 if((caseBits
& 0xC0) == 0) {
5194 *(cases
-1) |= 0 << (--caseShift
);
5196 *(cases
-1) |= 1 << (--caseShift
);
5198 doCaseShift(&cases
, caseShift
);
5199 *(cases
-1) |= ((caseBits
>>7)&1) << (--caseShift
);
5206 if(notIsContinuation
) {
5207 tertiary
^= caseSwitch
;
5211 tertiary
&= tertiaryMask
;
5212 if(tertiary
> compareTer
) {
5213 /* This is compression code. */
5214 /* sequence size check is included in the if clause */
5215 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
5218 if((tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
)
5219 || (tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
)) {
5220 tertiary
+= tertiaryAddition
;
5223 if ((tertiary
> tertiaryCommon
)) {
5224 while (count3
> coll
->tertiaryTopCount
) {
5225 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5226 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5228 *tertiaries
++ = (uint8_t)(tertiaryTop
- (count3
-1));
5230 while (count3
> coll
->tertiaryBottomCount
) {
5231 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5232 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5234 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5238 *tertiaries
++ = tertiary
;
5242 if(/*qShifted*/(compareQuad
==0) && notIsContinuation
) {
5243 if(s
.flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
5244 if(count4
>0) { // Close this part
5245 while (count4
> UCOL_BOT_COUNT4
) {
5246 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
5247 count4
-= UCOL_BOT_COUNT4
;
5249 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
5252 *quads
++ = UCOL_HIRAGANA_QUAD
; // Add the Hiragana
5253 } else { // This wasn't Hiragana, so we can continue adding stuff
5259 if(primaries
> primarySafeEnd
) { /* We have stepped over the primary buffer */
5260 if(allocateSKBuffer
== FALSE
) { /* need to save our butts if we cannot reallocate */
5261 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5262 if(source
== normSource
) {
5263 s
.flags
&= ~UCOL_ITER_NORM
;
5265 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
5266 *status
= U_BUFFER_OVERFLOW_ERROR
;
5269 } else { /* It's much nicer if we can actually reallocate */
5270 int32_t sks
= sortKeySize
+(primaries
- primStart
)+(secondaries
- secStart
)+(tertiaries
- terStart
)+(cases
-caseStart
)+(quads
-quadStart
);
5271 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sks
, status
);
5272 if(U_SUCCESS(*status
)) {
5273 *result
= primStart
;
5274 primarySafeEnd
= primStart
+ resultLength
- 2;
5276 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5277 if(source
== normSource
) {
5278 s
.flags
&= ~UCOL_ITER_NORM
;
5280 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
5290 prevBuffSize
= minBufferSize
;
5291 secStart
= reallocateBuffer(&secondaries
, secStart
, second
, &secSize
, 2*secSize
, status
);
5292 terStart
= reallocateBuffer(&tertiaries
, terStart
, tert
, &terSize
, 2*terSize
, status
);
5293 caseStart
= reallocateBuffer(&cases
, caseStart
, caseB
, &caseSize
, 2*caseSize
, status
);
5294 quadStart
= reallocateBuffer(&quads
, quadStart
, quad
, &quadSize
, 2*quadSize
, status
);
5296 if(U_FAILURE(*status
)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5297 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5298 if(source
== normSource
) {
5299 s
.flags
&= ~UCOL_ITER_NORM
;
5301 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
5307 /* Here, we are generally done with processing */
5308 /* bailing out would not be too productive */
5310 if(U_SUCCESS(*status
)) {
5311 sortKeySize
+= (primaries
- primStart
);
5312 /* we have done all the CE's, now let's put them together to form a key */
5313 if(compareSec
== 0) {
5315 while (count2
> UCOL_BOT_COUNT2
) {
5316 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5317 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5319 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5321 uint32_t secsize
= secondaries
-secStart
;
5322 if(!isFrenchSec
) { // Regular situation, we know the length of secondaries
5323 sortKeySize
+= secsize
;
5324 if(sortKeySize
<= resultLength
) {
5325 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5326 uprv_memcpy(primaries
, secStart
, secsize
);
5327 primaries
+= secsize
;
5329 if(allocateSKBuffer
== TRUE
) { /* need to save our butts if we cannot reallocate */
5330 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5331 if(U_SUCCESS(*status
)) {
5332 *result
= primStart
;
5333 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5334 uprv_memcpy(primaries
, secStart
, secsize
);
5335 primaries
+= secsize
;
5338 *status
= U_BUFFER_OVERFLOW_ERROR
;
5341 } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
5342 uint8_t *newPrim
= packFrench(primaries
, primStart
+resultLength
, secondaries
, &secsize
, frenchStartPtr
, frenchEndPtr
);
5343 sortKeySize
+= secsize
;
5344 if(sortKeySize
<= resultLength
) { // if we managed to pack fine
5345 primaries
= newPrim
; // update the primary pointer
5346 } else { // overflow, need to reallocate and redo
5347 if(allocateSKBuffer
== TRUE
) { /* need to save our butts if we cannot reallocate */
5348 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5349 if(U_SUCCESS(*status
)) {
5350 primaries
= packFrench(primaries
, primStart
+resultLength
, secondaries
, &secsize
, frenchStartPtr
, frenchEndPtr
);
5353 *status
= U_BUFFER_OVERFLOW_ERROR
;
5360 uint32_t casesize
= cases
- caseStart
;
5361 sortKeySize
+= casesize
;
5362 if(sortKeySize
<= resultLength
) {
5363 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5364 uprv_memcpy(primaries
, caseStart
, casesize
);
5365 primaries
+= casesize
;
5367 if(allocateSKBuffer
== TRUE
) {
5368 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5369 if(U_SUCCESS(*status
)) {
5370 *result
= primStart
;
5371 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5372 uprv_memcpy(primaries
, caseStart
, casesize
);
5375 *status
= U_BUFFER_OVERFLOW_ERROR
;
5380 if(compareTer
== 0) {
5382 if (coll
->tertiaryCommon
!= UCOL_COMMON_BOT3
) {
5383 while (count3
>= coll
->tertiaryTopCount
) {
5384 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5385 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5387 *tertiaries
++ = (uint8_t)(tertiaryTop
- count3
);
5389 while (count3
> coll
->tertiaryBottomCount
) {
5390 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5391 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5393 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5396 uint32_t tersize
= tertiaries
- terStart
;
5397 sortKeySize
+= tersize
;
5398 if(sortKeySize
<= resultLength
) {
5399 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5400 uprv_memcpy(primaries
, terStart
, tersize
);
5401 primaries
+= tersize
;
5403 if(allocateSKBuffer
== TRUE
) {
5404 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5405 if(U_SUCCESS(*status
)) {
5406 *result
= primStart
;
5407 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5408 uprv_memcpy(primaries
, terStart
, tersize
);
5411 *status
= U_BUFFER_OVERFLOW_ERROR
;
5415 if(compareQuad
== 0/*qShifted == TRUE*/) {
5417 while (count4
> UCOL_BOT_COUNT4
) {
5418 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
5419 count4
-= UCOL_BOT_COUNT4
;
5421 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
5423 uint32_t quadsize
= quads
- quadStart
;
5424 sortKeySize
+= quadsize
;
5425 if(sortKeySize
<= resultLength
) {
5426 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5427 uprv_memcpy(primaries
, quadStart
, quadsize
);
5428 primaries
+= quadsize
;
5430 if(allocateSKBuffer
== TRUE
) {
5431 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5432 if(U_SUCCESS(*status
)) {
5433 *result
= primStart
;
5434 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5435 uprv_memcpy(primaries
, quadStart
, quadsize
);
5438 *status
= U_BUFFER_OVERFLOW_ERROR
;
5444 sortKeySize
+= u_lengthOfIdenticalLevelRun(s
.string
, len
);
5445 if(sortKeySize
<= resultLength
) {
5446 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5447 primaries
+= u_writeIdenticalLevelRun(s
.string
, len
, primaries
);
5449 if(allocateSKBuffer
== TRUE
) {
5450 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, sortKeySize
, status
);
5451 if(U_SUCCESS(*status
)) {
5452 *result
= primStart
;
5453 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5454 u_writeIdenticalLevelRun(s
.string
, len
, primaries
);
5457 *status
= U_BUFFER_OVERFLOW_ERROR
;
5462 *(primaries
++) = '\0';
5465 if(terStart
!= tert
) {
5466 uprv_free(terStart
);
5467 uprv_free(secStart
);
5468 uprv_free(caseStart
);
5469 uprv_free(quadStart
);
5472 if(normSource
!= normBuffer
) {
5473 uprv_free(normSource
);
5476 if(allocateSKBuffer
== TRUE
) {
5477 *result
= (uint8_t*)uprv_malloc(sortKeySize
);
5479 if (*result
== NULL
) {
5480 *status
= U_MEMORY_ALLOCATION_ERROR
;
5483 uprv_memcpy(*result
, primStart
, sortKeySize
);
5484 if(primStart
!= prim
) {
5485 uprv_free(primStart
);
5493 U_CFUNC
int32_t U_CALLCONV
5494 ucol_calcSortKeySimpleTertiary(const UCollator
*coll
,
5495 const UChar
*source
,
5496 int32_t sourceLength
,
5498 uint32_t resultLength
,
5499 UBool allocateSKBuffer
,
5504 const UCAConstants
*UCAconsts
= (UCAConstants
*)((uint8_t *)coll
->UCA
->image
+ coll
->image
->UCAConsts
);
5505 uint32_t i
= 0; /* general purpose counter */
5507 /* Stack allocated buffers for buffers we use */
5508 uint8_t prim
[UCOL_PRIMARY_MAX_BUFFER
], second
[UCOL_SECONDARY_MAX_BUFFER
], tert
[UCOL_TERTIARY_MAX_BUFFER
];
5510 uint8_t *primaries
= *result
, *secondaries
= second
, *tertiaries
= tert
;
5512 if(U_FAILURE(*status
)) {
5516 if(primaries
== NULL
&& allocateSKBuffer
== TRUE
) {
5517 primaries
= *result
= prim
;
5518 resultLength
= UCOL_PRIMARY_MAX_BUFFER
;
5521 uint32_t secSize
= UCOL_SECONDARY_MAX_BUFFER
, terSize
= UCOL_TERTIARY_MAX_BUFFER
;
5523 uint32_t sortKeySize
= 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5525 UChar normBuffer
[UCOL_NORMALIZATION_MAX_BUFFER
];
5526 UChar
*normSource
= normBuffer
;
5527 int32_t normSourceLen
= UCOL_NORMALIZATION_MAX_BUFFER
;
5529 int32_t len
= sourceLength
;
5531 /* If we need to normalize, we'll do it all at once at the beginning! */
5532 if(coll
->normalizationMode
!= UCOL_OFF
&& UNORM_YES
!= unorm_quickCheck(source
, len
, UNORM_FCD
, status
)) {
5533 len
= unorm_internalNormalize(normSource
, normSourceLen
,
5537 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
5538 normSourceLen
= len
;
5539 normSource
= (UChar
*)uprv_malloc(len
*U_SIZEOF_UCHAR
);
5540 if(normSource
== NULL
) {
5541 *status
= U_MEMORY_ALLOCATION_ERROR
;
5544 *status
= U_ZERO_ERROR
;
5545 len
= unorm_internalNormalize(normSource
, normSourceLen
,
5551 if(U_FAILURE(*status
)) {
5554 source
= normSource
;
5558 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5559 if(source
== normSource
) {
5560 s
.flags
&= ~UCOL_ITER_NORM
;
5563 if(resultLength
== 0 || primaries
== NULL
) {
5564 int32_t t
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5565 if(normSource
!= normBuffer
) {
5566 uprv_free(normSource
);
5571 uint8_t *primarySafeEnd
= primaries
+ resultLength
- 2;
5573 uint32_t minBufferSize
= UCOL_MAX_BUFFER
;
5575 uint8_t *primStart
= primaries
;
5576 uint8_t *secStart
= secondaries
;
5577 uint8_t *terStart
= tertiaries
;
5581 uint8_t primary1
= 0;
5582 uint8_t primary2
= 0;
5583 uint8_t secondary
= 0;
5584 uint8_t tertiary
= 0;
5585 uint8_t caseSwitch
= coll
->caseSwitch
;
5586 uint8_t tertiaryMask
= coll
->tertiaryMask
;
5587 int8_t tertiaryAddition
= (int8_t)coll
->tertiaryAddition
;
5588 uint8_t tertiaryTop
= coll
->tertiaryTop
;
5589 uint8_t tertiaryBottom
= coll
->tertiaryBottom
;
5590 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
5592 uint32_t prevBuffSize
= 0;
5594 UBool finished
= FALSE
;
5595 UBool notIsContinuation
= FALSE
;
5597 uint32_t count2
= 0, count3
= 0;
5598 uint8_t leadPrimary
= 0;
5601 for(i
=prevBuffSize
; i
<minBufferSize
; ++i
) {
5603 order
= ucol_IGetNextCE(coll
, &s
, status
);
5609 if(order
== UCOL_NO_MORE_CES
) {
5614 notIsContinuation
= !isContinuation(order
);
5616 if(notIsContinuation
) {
5617 tertiary
= (uint8_t)((order
& tertiaryMask
));
5619 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
5621 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
5622 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
5623 primary1
= (uint8_t)(order
>> 8);
5625 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5626 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5627 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5628 /* regular and simple sortkey calc */
5629 if(primary1
!= UCOL_IGNORABLE
) {
5630 if(notIsContinuation
) {
5631 if(leadPrimary
== primary1
) {
5632 *primaries
++ = primary2
;
5634 if(leadPrimary
!= 0) {
5635 *primaries
++ = (uint8_t)((primary1
> leadPrimary
) ? UCOL_BYTE_UNSHIFTED_MAX
: UCOL_BYTE_UNSHIFTED_MIN
);
5637 if(primary2
== UCOL_IGNORABLE
) {
5638 /* one byter, not compressed */
5639 *primaries
++ = primary1
;
5641 } else if(primary1
<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
||
5642 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5643 (primary1
> (*UCAconsts
->UCA_LAST_NON_VARIABLE
>>24) && primary1
< (*UCAconsts
->UCA_FIRST_IMPLICIT
>>24))) {
5644 /* not compressible */
5646 *primaries
++ = primary1
;
5647 *primaries
++ = primary2
;
5648 } else { /* compress */
5649 *primaries
++ = leadPrimary
= primary1
;
5650 *primaries
++ = primary2
;
5653 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5654 *primaries
++ = primary1
;
5655 if(primary2
!= UCOL_IGNORABLE
) {
5656 *primaries
++ = primary2
; /* second part */
5661 if(secondary
> 0) { /* I think that != 0 test should be != IGNORABLE */
5662 /* This is compression code. */
5663 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
5667 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
5668 while (count2
> UCOL_TOP_COUNT2
) {
5669 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
);
5670 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
5672 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1));
5674 while (count2
> UCOL_BOT_COUNT2
) {
5675 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5676 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5678 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5682 *secondaries
++ = secondary
;
5686 if(notIsContinuation
) {
5687 tertiary
^= caseSwitch
;
5691 /* This is compression code. */
5692 /* sequence size check is included in the if clause */
5693 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
5696 if(tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
) {
5697 tertiary
+= tertiaryAddition
;
5698 } else if (tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
) {
5699 tertiary
-= tertiaryAddition
;
5702 if ((tertiary
> tertiaryCommon
)) {
5703 while (count3
> coll
->tertiaryTopCount
) {
5704 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5705 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5707 *tertiaries
++ = (uint8_t)(tertiaryTop
- (count3
-1));
5709 while (count3
> coll
->tertiaryBottomCount
) {
5710 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5711 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5713 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5717 *tertiaries
++ = tertiary
;
5721 if(primaries
> primarySafeEnd
) { /* We have stepped over the primary buffer */
5722 if(allocateSKBuffer
== FALSE
) { /* need to save our butts if we cannot reallocate */
5723 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5724 if(source
== normSource
) {
5725 s
.flags
&= ~UCOL_ITER_NORM
;
5727 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5728 *status
= U_BUFFER_OVERFLOW_ERROR
;
5731 } else { /* It's much nicer if we can actually reallocate */
5732 int32_t sks
= sortKeySize
+(primaries
- primStart
)+(secondaries
- secStart
)+(tertiaries
- terStart
);
5733 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sks
, status
);
5734 if(U_SUCCESS(*status
)) {
5735 *result
= primStart
;
5736 primarySafeEnd
= primStart
+ resultLength
- 2;
5738 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5739 if(source
== normSource
) {
5740 s
.flags
&= ~UCOL_ITER_NORM
;
5742 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5752 prevBuffSize
= minBufferSize
;
5753 secStart
= reallocateBuffer(&secondaries
, secStart
, second
, &secSize
, 2*secSize
, status
);
5754 terStart
= reallocateBuffer(&tertiaries
, terStart
, tert
, &terSize
, 2*terSize
, status
);
5756 if(U_FAILURE(*status
)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5757 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5758 if(source
== normSource
) {
5759 s
.flags
&= ~UCOL_ITER_NORM
;
5761 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5767 if(U_SUCCESS(*status
)) {
5768 sortKeySize
+= (primaries
- primStart
);
5769 /* we have done all the CE's, now let's put them together to form a key */
5771 while (count2
> UCOL_BOT_COUNT2
) {
5772 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5773 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5775 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5777 uint32_t secsize
= secondaries
-secStart
;
5778 sortKeySize
+= secsize
;
5779 if(sortKeySize
<= resultLength
) {
5780 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5781 uprv_memcpy(primaries
, secStart
, secsize
);
5782 primaries
+= secsize
;
5784 if(allocateSKBuffer
== TRUE
) {
5785 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5786 if(U_SUCCESS(*status
)) {
5787 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5788 *result
= primStart
;
5789 uprv_memcpy(primaries
, secStart
, secsize
);
5792 *status
= U_BUFFER_OVERFLOW_ERROR
;
5797 if (coll
->tertiaryCommon
!= UCOL_COMMON3_NORMAL
) {
5798 while (count3
>= coll
->tertiaryTopCount
) {
5799 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5800 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5802 *tertiaries
++ = (uint8_t)(tertiaryTop
- count3
);
5804 while (count3
> coll
->tertiaryBottomCount
) {
5805 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5806 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5808 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5811 uint32_t tersize
= tertiaries
- terStart
;
5812 sortKeySize
+= tersize
;
5813 if(sortKeySize
<= resultLength
) {
5814 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5815 uprv_memcpy(primaries
, terStart
, tersize
);
5816 primaries
+= tersize
;
5818 if(allocateSKBuffer
== TRUE
) {
5819 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5820 if(U_SUCCESS(*status
)) {
5821 *result
= primStart
;
5822 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5823 uprv_memcpy(primaries
, terStart
, tersize
);
5826 *status
= U_MEMORY_ALLOCATION_ERROR
;
5830 *(primaries
++) = '\0';
5833 if(terStart
!= tert
) {
5834 uprv_free(terStart
);
5835 uprv_free(secStart
);
5838 if(normSource
!= normBuffer
) {
5839 uprv_free(normSource
);
5842 if(allocateSKBuffer
== TRUE
) {
5843 *result
= (uint8_t*)uprv_malloc(sortKeySize
);
5845 if (*result
== NULL
) {
5846 *status
= U_MEMORY_ALLOCATION_ERROR
;
5849 uprv_memcpy(*result
, primStart
, sortKeySize
);
5850 if(primStart
!= prim
) {
5851 uprv_free(primStart
);
5859 UBool
isShiftedCE(uint32_t CE
, uint32_t LVT
, UBool
*wasShifted
) {
5860 UBool notIsContinuation
= !isContinuation(CE
);
5861 uint8_t primary1
= (uint8_t)((CE
>> 24) & 0xFF);
5862 if(LVT
&& ((notIsContinuation
&& (CE
& 0xFFFF0000)<= LVT
&& primary1
> 0)
5863 || (!notIsContinuation
&& *wasShifted
))
5864 || (*wasShifted
&& primary1
== 0)) { /* amendment to the UCA says that primary ignorables */
5865 // The stuff below should probably be in the sortkey code... maybe not...
5866 if(primary1
!= 0) { /* if we were shifted and we got an ignorable code point */
5867 /* we should just completely ignore it */
5871 //*wasShifted = TRUE;
5874 *wasShifted
= FALSE
;
5879 void terminatePSKLevel(int32_t level
, int32_t maxLevel
, int32_t &i
, uint8_t *dest
) {
5880 if(level
< maxLevel
) {
5881 dest
[i
++] = UCOL_LEVELTERMINATOR
;
/**
 * Level identifiers for partial sort key generation.
 * A level value is stored in three bits of the collation state.
 */
enum {
  UCOL_PSK_PRIMARY    = 0,
  UCOL_PSK_SECONDARY  = 1,
  UCOL_PSK_CASE       = 2,
  UCOL_PSK_TERTIARY   = 3,
  UCOL_PSK_QUATERNARY = 4,
  /** This is an extra level, not used - but we have three bits to blow */
  UCOL_PSK_QUIN       = 5,
  UCOL_PSK_IDENTICAL  = 6,
  /** level for the end of sort key. Will just produce zeros */
  UCOL_PSK_NULL       = 7,
  UCOL_PSK_LIMIT
};
/**
 * Layout of the collation state word kept in state[1].
 * For every field, *_SHIFT is the number of bits to shift the state right
 * to bring the field to the bottom, and *_MASK is the value to AND with
 * the shifted state to isolate it.
 */
enum {
  /** level identifier. stores one of the UCOL_PSK_* level values */
  UCOL_PSK_LEVEL_SHIFT = 0,
  /** three bits */
  UCOL_PSK_LEVEL_MASK = 7,

  /** number of bytes of primary or quaternary already written */
  UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
  /** can be only 0 or 1, since we get up to two bytes from primary or quaternary.
   *  This field is also used to denote that the French secondary level is finished
   */
  UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,

  /** was the last value shifted */
  UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
  /** can be 0 or 1 (Boolean) */
  UCOL_PSK_WAS_SHIFTED_MASK = 1,

  /** how many French bytes have we already written */
  UCOL_PSK_USED_FRENCH_SHIFT = 5,
  /** up to 4 bytes.
   *  When we do French we need to reverse secondary values. However, continuations
   *  need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
   */
  UCOL_PSK_USED_FRENCH_MASK = 3,

  UCOL_PSK_USED_ELEMENTS_SHIFT = 7,
  UCOL_PSK_USED_ELEMENTS_MASK = 0x3FF,

  UCOL_PSK_ITER_SKIP_SHIFT = 17,
  UCOL_PSK_ITER_SKIP_MASK = 0x7FFF
};
5927 /** main sortkey part procedure. On the first call,
5928 * you should pass in a collator, an iterator, empty state
5929 * state[0] == state[1] == 0, a buffer to hold results
5930 * number of bytes you need and an error code pointer.
5931 * Make sure your buffer is big enough to hold the wanted
5932 * number of sortkey bytes. I don't check.
5933 * The only meaningful status you can get back is
5934 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5935 * have been dealt a raw deal and that you probably won't
5936 * be able to use partial sortkey generation for this
5937 * particular combination of string and collator. This
5938 * is highly unlikely, but you should still check the error code.
5939 * Any other status means that you're not in a sane situation
5940 * anymore. After the first call, preserve state values and
5941 * use them on subsequent calls to obtain more bytes of a sortkey.
5942 * Use until the number of bytes written is smaller than the requested
5943 * number of bytes. Generated sortkey is not compatible with the
5944 * one generated by ucol_getSortKey, as we don't do any compression.
5945 * However, levels are still terminated by a 1 (one) and the sortkey
5946 * is terminated by a 0 (zero). Identical level is the same as in the
5947 * regular sortkey - internal bocu-1 implementation is used.
5948 * For curious, although you cannot do much about this, here is
5949 * the structure of state words.
5950 * state[0] - iterator state. Depends on the iterator implementation,
5951 * but allows the iterator to continue where it stopped in
5952 * the last iteration.
5953 * state[1] - collation processing state. Here is the distribution
5955 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5956 * quaternary, quin (we don't use this one), identical and
5957 * null (producing only zeroes - first one to terminate the
5958 * sortkey and subsequent to fill the buffer).
5959 * 3 - byte count. Number of bytes written on the primary level.
5960 * 4 - was shifted. Whether the previous iteration finished in the
5962 * 5, 6 - French continuation bytes written. See the comment in the enum
5963 * 7..16 - Used elements. Number of CEs that were already used from the
5964 * expansion buffer or number of bytes from a bocu sequence on
5965 * the identical level.
5966 * 17..31 - iterator skip. Number of move operations iterator needs to
5967 * skip from the current state in order to continue. This is used
5968 * only if normalization is turned on, since the normalizing iterator
5969 * can return undefined state, which means that it's in the middle
5970 * of normalizing sequence.
5972 U_CAPI
int32_t U_EXPORT2
5973 ucol_nextSortKeyPart(const UCollator
*coll
,
5974 UCharIterator
*iter
,
5976 uint8_t *dest
, int32_t count
,
5977 UErrorCode
*status
) {
5978 /* error checking */
5979 if(status
==NULL
|| U_FAILURE(*status
)) {
5982 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART
);
5983 if( coll
==NULL
|| iter
==NULL
||
5985 count
<0 || (count
>0 && dest
==NULL
)
5987 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
5990 UTRACE_DATA6(UTRACE_VERBOSE
, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5991 coll
, iter
, state
[0], state
[1], dest
, count
);
5995 UTRACE_EXIT_VALUE(0);
5999 /** Setting up situation according to the state we got from the previous iteration */
6000 // The state of the iterator from the previous invocation
6001 uint32_t iterState
= state
[0];
6002 // Has the last iteration ended in the shifted state
6003 UBool wasShifted
= ((state
[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT
) & UCOL_PSK_WAS_SHIFTED_MASK
)?TRUE
:FALSE
;
6004 // What is the current level of the sortkey?
6005 int32_t level
= (state
[1] >> UCOL_PSK_LEVEL_SHIFT
) & UCOL_PSK_LEVEL_MASK
;
6006 // Have we written only one byte from a two byte primary in the previous iteration?
6007 // Also on secondary level - have we finished with the French secondary?
6008 int32_t byteCountOrFrenchDone
= (state
[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
;
6009 // number of bytes in the continuation buffer for French
6010 int32_t usedFrench
= (state
[1] >> UCOL_PSK_USED_FRENCH_SHIFT
) & UCOL_PSK_USED_FRENCH_MASK
;
6011 // Skip the CEs that we got from an extraction
6012 // and delivered in the previous call
6013 int32_t usedElements
= (state
[1] >> UCOL_PSK_USED_ELEMENTS_SHIFT
) & UCOL_PSK_USED_ELEMENTS_MASK
;
6014 // Number of times to skip because the iterator returned
6015 // UITER_NO_STATE when it was stopped in the last iteration, so we had to save the
6016 // last valid state.
6017 int32_t iterSkips
= (state
[1] >> UCOL_PSK_ITER_SKIP_SHIFT
) & UCOL_PSK_ITER_SKIP_MASK
;
6019 /** values that depend on the collator attributes */
6020 // strength of the collator.
6021 int32_t strength
= ucol_getAttribute(coll
, UCOL_STRENGTH
, status
);
6022 // maximal level of the partial sortkey. Need to take whether case level is done
6023 int32_t maxLevel
= 0;
6024 if(strength
< UCOL_TERTIARY
) {
6025 if(ucol_getAttribute(coll
, UCOL_CASE_LEVEL
, status
) == UCOL_ON
) {
6026 maxLevel
= UCOL_PSK_CASE
;
6028 maxLevel
= strength
;
6031 if(strength
== UCOL_TERTIARY
) {
6032 maxLevel
= UCOL_PSK_TERTIARY
;
6033 } else if(strength
== UCOL_QUATERNARY
) {
6034 maxLevel
= UCOL_PSK_QUATERNARY
;
6035 } else { // identical
6036 maxLevel
= UCOL_IDENTICAL
;
6039 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
6040 uint8_t UCOL_HIRAGANA_QUAD
=
6041 (ucol_getAttribute(coll
, UCOL_HIRAGANA_QUATERNARY_MODE
, status
) == UCOL_ON
)?0xFE:0xFF;
6042 // Boundary value that decides whether a CE is shifted or not
6043 uint32_t LVT
= (coll
->alternateHandling
== UCOL_SHIFTED
)?(coll
->variableTopValue
<<16):0;
6044 // Are we doing French collation?
6045 UBool doingFrench
= (ucol_getAttribute(coll
, UCOL_FRENCH_COLLATION
, status
) == UCOL_ON
);
6047 /** initializing the collation state */
6048 UBool notIsContinuation
= FALSE
;
6049 uint32_t CE
= UCOL_NO_MORE_CES
;
6052 IInit_collIterate(coll
, NULL
, -1, &s
);
6054 s
.flags
|= UCOL_USE_ITERATOR
;
6055 // This variable tells us whether we have produced some other levels in this iteration
6056 // before we moved to the identical level. In that case, we need to switch the
6057 // type of the iterator.
6058 UBool doingIdenticalFromStart
= FALSE
;
6059 // Normalizing iterator
6060 // The division for the array length may truncate the array size to
6061 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6062 // for all platforms anyway.
6063 UAlignedMemory stackNormIter
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
6064 UNormIterator
*normIter
= NULL
;
6065 // If the normalization is turned on for the collator and we are below identical level
6066 // we will use a FCD normalizing iterator
6067 if(ucol_getAttribute(coll
, UCOL_NORMALIZATION_MODE
, status
) == UCOL_ON
&& level
< UCOL_PSK_IDENTICAL
) {
6068 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
6069 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_FCD
, status
);
6070 s
.flags
&= ~UCOL_ITER_NORM
;
6071 if(U_FAILURE(*status
)) {
6072 UTRACE_EXIT_STATUS(*status
);
6075 } else if(level
== UCOL_PSK_IDENTICAL
) {
6076 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
6077 // will be updating the state - and this cannot be done on an ordinary iterator.
6078 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
6079 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
6080 s
.flags
&= ~UCOL_ITER_NORM
;
6081 if(U_FAILURE(*status
)) {
6082 UTRACE_EXIT_STATUS(*status
);
6085 doingIdenticalFromStart
= TRUE
;
6088 // This is the tentative new state of the iterator. The problem
6089 // is that the iterator might return an undefined state, in
6090 // which case we should save the last valid state and increase
6091 // the iterator skip value.
6092 uint32_t newState
= 0;
6094 // First, we set the iterator to the last valid position
6095 // from the last iteration. This was saved in state[0].
6096 if(iterState
== 0) {
6098 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
&& !byteCountOrFrenchDone
) {
6099 s
.iterator
->move(s
.iterator
, 0, UITER_LIMIT
);
6101 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6104 /* reset to previous state */
6105 s
.iterator
->setState(s
.iterator
, iterState
, status
);
6106 if(U_FAILURE(*status
)) {
6107 UTRACE_EXIT_STATUS(*status
);
6112 // Then, we may have to move more, if the normalizing iterator
6113 // was going through a normalizing sequence.
6115 // if we are on secondary level AND we do French, we need to go backward instead of forward
6116 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
) {
6117 s
.iterator
->move(s
.iterator
, -iterSkips
, UITER_CURRENT
);
6119 s
.iterator
->move(s
.iterator
, iterSkips
, UITER_CURRENT
);
6124 // Number of expansion CEs that were already consumed in the
6125 // previous iteration for the last code point processed. We
6126 // want to clean out the expansion buffer, so that we can
6127 // get correct CEs. This value is persistent over iterations,
6128 // since we can have several iterations on the one expansion
6130 int32_t consumedExpansionCEs
= usedElements
;
6131 // Number of bytes already written from a bocsu sequence. Since
6132 // the longest bocsu sequence is 4 long, this can be up to 3. It
6133 // shares the state field with consumedExpansionCEs value, since
6134 // they cannot simultaneously appear on the same level
6135 int32_t bocsuBytesUsed
= 0;
6136 // Clean out the expansion buffer unless we are on
6137 // identical level. In that case we use this field
6138 // to store the number of bytes already written
6139 // from the previous bocsu sequence.
6140 if(level
< UCOL_PSK_IDENTICAL
&& usedElements
!= 0) {
6141 while(usedElements
-->0) {
6142 // If we're doing French and we are on the secondary level,
6144 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
) {
6145 CE
= ucol_IGetPrevCE(coll
, &s
, status
);
6147 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6149 if(CE
==UCOL_NO_MORE_CES
) {
6150 /* should not happen */
6151 *status
=U_INTERNAL_PROGRAM_ERROR
;
6152 UTRACE_EXIT_STATUS(*status
);
6157 bocsuBytesUsed
= usedElements
;
6160 // This variable prevents the adjusting of iterator
6161 // skip variable when we are the first time on a
6162 // level. I hope there is a better way to do it, but
6163 // I could not think of it.
6164 UBool firstTimeOnLevel
= TRUE
;
6165 // French secondary needs to know whether the iterator state of zero came from previous level OR
6166 // from a new invocation...
6167 UBool wasDoingPrimary
= FALSE
;
6168 // Case level is kind of goofy. This variable tells us that
6169 // we are still not done with the case level.
6170 UBool dontAdvanceIteratorBecauseWeNeedALevelTerminator
= FALSE
;
6171 // destination buffer byte counter. When this guy
6172 // gets to count, we're done with the iteration
6174 // used to count the zero bytes written after we
6175 // have finished with the sort key
6179 // Hm.... I think we're ready to plunge in. Basic story is as following:
6180 // we have a fall through case based on level. This is used for initial
6181 // positioning on iteration start. Every level processor contains a
6182 // for(;;) which will be broken when we exhaust all the CEs. Other
6183 // way to exit is a goto saveState, which happens when we have filled
6186 case UCOL_PSK_PRIMARY
:
6187 wasDoingPrimary
= TRUE
;
6192 // We should save the state only if we
6193 // are sure that we are done with the
6194 // previous iterator state
6195 if(consumedExpansionCEs
== 0 && byteCountOrFrenchDone
== 0) {
6196 newState
= s
.iterator
->getState(s
.iterator
);
6197 if(newState
!= UITER_NO_STATE
) {
6198 iterState
= newState
;
6201 if(!firstTimeOnLevel
&& !byteCountOrFrenchDone
) {
6206 firstTimeOnLevel
= FALSE
;
6207 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6208 if(CE
==UCOL_NO_MORE_CES
) {
6209 // Add the level separator
6210 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6211 byteCountOrFrenchDone
=0;
6212 // Restart the iteration an move to the
6214 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6215 level
= UCOL_PSK_SECONDARY
;
6218 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6219 CE
>>= UCOL_PRIMARYORDERSHIFT
; /* get primary */
6221 if(byteCountOrFrenchDone
== 0) {
6222 // get the second byte of primary
6223 dest
[i
++]=(uint8_t)(CE
>> 8);
6225 byteCountOrFrenchDone
= 0;
6227 if((CE
&=0xff)!=0) {
6230 byteCountOrFrenchDone
=1;
6233 dest
[i
++]=(uint8_t)CE
;
6237 if(s
.CEpos
- s
.toReturn
|| (s
.pos
&& *s
.pos
!= 0)) {
6238 // s.pos != NULL means there is a normalization buffer in effect
6239 // in iterative case, this means that we are doing Thai (maybe discontiguous)
6240 consumedExpansionCEs
++;
6242 consumedExpansionCEs
= 0;
6244 if(s
.pos
&& *s
.pos
== 0) {
6245 // maybe it is the end of Thai - we have to have
6250 /* fall through to next level */
6251 case UCOL_PSK_SECONDARY
:
6252 if(strength
>= UCOL_SECONDARY
) {
6258 // We should save the state only if we
6259 // are sure that we are done with the
6260 // previous iterator state
6261 if(consumedExpansionCEs
== 0) {
6262 newState
= s
.iterator
->getState(s
.iterator
);
6263 if(newState
!= UITER_NO_STATE
) {
6264 iterState
= newState
;
6267 if(!firstTimeOnLevel
) {
6272 firstTimeOnLevel
= FALSE
;
6273 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6274 if(CE
==UCOL_NO_MORE_CES
) {
6275 // Add the level separator
6276 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6277 byteCountOrFrenchDone
=0;
6278 // Restart the iteration an move to the
6280 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6281 level
= UCOL_PSK_CASE
;
6284 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6285 CE
>>= 8; /* get secondary */
6287 dest
[i
++]=(uint8_t)CE
;
6290 if(s
.CEpos
- s
.toReturn
|| (s
.pos
&& *s
.pos
!= 0)) {
6291 consumedExpansionCEs
++;
6293 consumedExpansionCEs
= 0;
6295 if(s
.pos
&& *s
.pos
== 0) {
6299 } else { // French secondary processing
6300 uint8_t frenchBuff
[UCOL_MAX_BUFFER
];
6301 int32_t frenchIndex
= 0;
6302 // Here we are going backwards.
6303 // If the iterator is at the beggining, it should be
6305 if(wasDoingPrimary
) {
6306 s
.iterator
->move(s
.iterator
, 0, UITER_LIMIT
);
6312 if(consumedExpansionCEs
== 0) {
6313 newState
= s
.iterator
->getState(s
.iterator
);
6314 if(newState
!= UITER_NO_STATE
) {
6315 iterState
= newState
;
6318 if(!firstTimeOnLevel
) {
6323 firstTimeOnLevel
= FALSE
;
6324 CE
= ucol_IGetPrevCE(coll
, &s
, status
);
6325 if(CE
==UCOL_NO_MORE_CES
) {
6326 // Add the level separator
6327 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6328 byteCountOrFrenchDone
=0;
6329 // Restart the iteration an move to the next level
6330 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6331 level
= UCOL_PSK_CASE
;
6334 if(isContinuation(CE
)) { // if it's a continuation, we want to save it and
6335 // reverse when we get a first non-continuation CE.
6337 frenchBuff
[frenchIndex
++] = (uint8_t)CE
;
6338 } else if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6339 CE
>>= 8; /* get secondary */
6342 dest
[i
++]=(uint8_t)CE
;
6345 frenchBuff
[frenchIndex
++] = (uint8_t)CE
;
6346 frenchIndex
-= usedFrench
;
6348 while(i
< count
&& frenchIndex
) {
6349 dest
[i
++] = frenchBuff
[--frenchIndex
];
6354 if(s
.CEpos
- s
.toReturn
|| (s
.pos
&& *s
.pos
!= 0)) {
6355 consumedExpansionCEs
++;
6357 consumedExpansionCEs
= 0;
6359 if(s
.pos
&& *s
.pos
== 0) {
6365 level
= UCOL_PSK_CASE
;
6367 /* fall through to next level */
6369 if(ucol_getAttribute(coll
, UCOL_CASE_LEVEL
, status
) == UCOL_ON
) {
6370 uint32_t caseShift
= UCOL_CASE_SHIFT_START
;
6371 uint8_t caseByte
= UCOL_CASE_BYTE_START
;
6372 uint8_t caseBits
= 0;
6378 // We should save the state only if we
6379 // are sure that we are done with the
6380 // previous iterator state
6381 if(consumedExpansionCEs
== 0) {
6382 newState
= s
.iterator
->getState(s
.iterator
);
6383 if(newState
!= UITER_NO_STATE
) {
6384 iterState
= newState
;
6387 if(!firstTimeOnLevel
) {
6392 firstTimeOnLevel
= FALSE
;
6393 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6394 if(CE
==UCOL_NO_MORE_CES
) {
6395 // On the case level we might have an unfinished
6396 // case byte. Add one if it's started.
6397 if(caseShift
!= UCOL_CASE_SHIFT_START
) {
6398 dest
[i
++] = caseByte
;
6400 // This is kind of tricky - situation where
6401 // we need to keep the iterator in the old
6402 // state, but don't need to bring anything
6403 // to the next invocation
6405 // Add the level separator
6406 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6407 // Restart the iteration and move to the
6409 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6410 level
= UCOL_PSK_TERTIARY
;
6412 dontAdvanceIteratorBecauseWeNeedALevelTerminator
= TRUE
;
6417 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6418 if(!isContinuation(CE
)) {
6419 CE
= (uint8_t)(CE
& UCOL_BYTE_SIZE_MASK
);
6420 caseBits
= (uint8_t)(CE
& 0xC0);
6421 // this copies the case level logic from the
6422 // sort key generation code
6424 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
6425 if((caseBits
& 0xC0) == 0) {
6426 caseByte
|= 1 << (--caseShift
);
6428 caseByte
|= 0 << (--caseShift
);
6430 if(caseShift
== 0) {
6431 dest
[i
++] = caseByte
;
6432 caseShift
= UCOL_CASE_SHIFT_START
;
6433 caseByte
= UCOL_CASE_BYTE_START
;
6435 caseByte
|= ((caseBits
>>6)&1) << (--caseShift
);
6438 if((caseBits
& 0xC0) == 0) {
6439 caseByte
|= 0 << (--caseShift
);
6441 caseByte
|= 1 << (--caseShift
);
6443 if(caseShift
== 0) {
6444 dest
[i
++] = caseByte
;
6445 caseShift
= UCOL_CASE_SHIFT_START
;
6446 caseByte
= UCOL_CASE_BYTE_START
;
6448 caseByte
|= ((caseBits
>>7)&1) << (--caseShift
);
6455 // Not sure this is correct for the case level - revisit
6456 if(s
.CEpos
- s
.toReturn
|| (s
.pos
&& *s
.pos
!= 0)) {
6457 consumedExpansionCEs
++;
6459 consumedExpansionCEs
= 0;
6461 if(s
.pos
&& *s
.pos
== 0) {
6466 level
= UCOL_PSK_TERTIARY
;
6468 /* fall through to next level */
6469 case UCOL_PSK_TERTIARY
:
6470 if(strength
>= UCOL_TERTIARY
) {
6475 // We should save the state only if we
6476 // are sure that we are done with the
6477 // previous iterator state
6478 if(consumedExpansionCEs
== 0) {
6479 newState
= s
.iterator
->getState(s
.iterator
);
6480 if(newState
!= UITER_NO_STATE
) {
6481 iterState
= newState
;
6484 if(!firstTimeOnLevel
) {
6489 firstTimeOnLevel
= FALSE
;
6490 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6491 if(CE
==UCOL_NO_MORE_CES
) {
6492 // Add the level separator
6493 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6494 byteCountOrFrenchDone
=0;
6495 // Restart the iteration an move to the
6497 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6498 level
= UCOL_PSK_QUATERNARY
;
6501 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6502 notIsContinuation
= !isContinuation(CE
);
6504 if(notIsContinuation
) {
6505 CE
= (uint8_t)(CE
& UCOL_BYTE_SIZE_MASK
);
6506 CE
^= coll
->caseSwitch
;
6507 CE
&= coll
->tertiaryMask
;
6509 CE
= (uint8_t)((CE
& UCOL_REMOVE_CONTINUATION
));
6513 dest
[i
++]=(uint8_t)CE
;
6516 if(s
.CEpos
- s
.toReturn
|| (s
.pos
&& *s
.pos
!= 0)) {
6517 consumedExpansionCEs
++;
6519 consumedExpansionCEs
= 0;
6521 if(s
.pos
&& *s
.pos
== 0) {
6526 // if we're not doing tertiary
6528 level
= UCOL_PSK_NULL
;
6530 /* fall through to next level */
6531 case UCOL_PSK_QUATERNARY
:
6532 if(strength
>= UCOL_QUATERNARY
) {
6537 // We should save the state only if we
6538 // are sure that we are done with the
6539 // previous iterator state
6540 if(consumedExpansionCEs
== 0) {
6541 newState
= s
.iterator
->getState(s
.iterator
);
6542 if(newState
!= UITER_NO_STATE
) {
6543 iterState
= newState
;
6546 if(!firstTimeOnLevel
) {
6551 firstTimeOnLevel
= FALSE
;
6552 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6553 if(CE
==UCOL_NO_MORE_CES
) {
6554 // Add the level separator
6555 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6556 //dest[i++] = UCOL_LEVELTERMINATOR;
6557 byteCountOrFrenchDone
=0;
6558 // Restart the iteration an move to the
6560 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6561 level
= UCOL_PSK_QUIN
;
6564 if(isShiftedCE(CE
, LVT
, &wasShifted
)) {
6565 CE
>>= 16; /* get primary */
6567 if(byteCountOrFrenchDone
== 0) {
6568 dest
[i
++]=(uint8_t)(CE
>> 8);
6570 byteCountOrFrenchDone
= 0;
6572 if((CE
&=0xff)!=0) {
6575 byteCountOrFrenchDone
=1;
6578 dest
[i
++]=(uint8_t)CE
;
6582 notIsContinuation
= !isContinuation(CE
);
6583 if(notIsContinuation
) {
6584 if(s
.flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
6585 dest
[i
++] = UCOL_HIRAGANA_QUAD
;
6591 if(s
.CEpos
- s
.toReturn
|| (s
.pos
&& *s
.pos
!= 0)) {
6592 consumedExpansionCEs
++;
6594 consumedExpansionCEs
= 0;
6596 if(s
.pos
&& *s
.pos
== 0) {
6601 // if we're not doing quaternary
6603 level
= UCOL_PSK_NULL
;
6605 /* fall through to next level */
6607 level
= UCOL_PSK_IDENTICAL
;
6608 /* fall through to next level */
6609 case UCOL_PSK_IDENTICAL
:
6610 if(strength
>= UCOL_IDENTICAL
) {
6611 UChar32 first
, second
;
6612 int32_t bocsuBytesWritten
= 0;
6613 // We always need to do identical on
6614 // the NFD form of the string.
6615 if(normIter
== NULL
) {
6616 // we arrived from the level below and
6617 // normalization was not turned on.
6618 // therefore, we need to make a fresh NFD iterator
6619 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
6620 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
6621 } else if(!doingIdenticalFromStart
) {
6622 // there is an iterator, but we did some other levels.
6623 // therefore, we have a FCD iterator - need to make
6625 // normIter being at the beginning does not guarantee
6626 // that the underlying iterator is at the beginning
6627 iter
->move(iter
, 0, UITER_START
);
6628 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
6630 // At this point we have a NFD iterator that is positioned
6631 // in the right place
6632 if(U_FAILURE(*status
)) {
6633 UTRACE_EXIT_STATUS(*status
);
6636 first
= uiter_previous32(s
.iterator
);
6637 // maybe we're at the start of the string
6638 if(first
== U_SENTINEL
) {
6641 uiter_next32(s
.iterator
);
6647 if(j
+1 < bocsuBytesWritten
) {
6648 bocsuBytesUsed
= j
+1;
6653 // On identical level, we will always save
6654 // the state if we reach this point, since
6655 // we don't depend on getNextCE for content
6656 // all the content is in our buffer and we
6657 // already either stored the full buffer OR
6658 // otherwise we won't arrive here.
6659 newState
= s
.iterator
->getState(s
.iterator
);
6660 if(newState
!= UITER_NO_STATE
) {
6661 iterState
= newState
;
6668 second
= uiter_next32(s
.iterator
);
6670 // end condition for identical level
6671 if(second
== U_SENTINEL
) {
6672 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6673 level
= UCOL_PSK_NULL
;
6676 bocsuBytesWritten
= u_writeIdenticalLevelRunTwoChars(first
, second
, buff
);
6680 if(bocsuBytesUsed
!= 0) {
6681 while(bocsuBytesUsed
-->0) {
6686 while(i
< count
&& j
< bocsuBytesWritten
) {
6687 dest
[i
++] = buff
[j
++];
6692 level
= UCOL_PSK_NULL
;
6694 /* fall through to next level */
6702 *status
= U_INTERNAL_PROGRAM_ERROR
;
6703 UTRACE_EXIT_STATUS(*status
);
6708 // Now we need to return stuff. First we want to see whether we have
6709 // done everything for the current state of iterator.
6710 if(consumedExpansionCEs
|| byteCountOrFrenchDone
6711 || dontAdvanceIteratorBecauseWeNeedALevelTerminator
) {
6712 // Any of above mean that the previous transaction
6713 // wasn't finished and that we should store the
6714 // previous iterator state.
6715 state
[0] = iterState
;
6717 // The transaction is complete. We will continue in
6719 if((newState
= s
.iterator
->getState(s
.iterator
))!= UITER_NO_STATE
) {
6720 state
[0] = s
.iterator
->getState(s
.iterator
);
6723 state
[0] = iterState
;
6727 // Store the number of elements processed. On CE levels, this is
6728 // the number of expansion CEs processed. On identical level, this
6729 // is the number of bocsu bytes written.
6730 if(level
< UCOL_PSK_IDENTICAL
) {
6731 if((consumedExpansionCEs
& UCOL_PSK_USED_ELEMENTS_MASK
) != consumedExpansionCEs
) {
6732 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6734 state
[1] = (consumedExpansionCEs
& UCOL_PSK_USED_ELEMENTS_MASK
) << UCOL_PSK_USED_ELEMENTS_SHIFT
;
6736 if((bocsuBytesUsed
& UCOL_PSK_USED_ELEMENTS_MASK
) != bocsuBytesUsed
) {
6737 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6739 state
[1] = (bocsuBytesUsed
& UCOL_PSK_USED_ELEMENTS_MASK
) << UCOL_PSK_USED_ELEMENTS_SHIFT
;
6742 // Next we put in the level of comparison
6743 state
[1] |= ((level
& UCOL_PSK_LEVEL_MASK
) << UCOL_PSK_LEVEL_SHIFT
);
6745 // If we are doing French, we need to store whether we have just finished the French level
6746 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
) {
6747 state
[1] |= (((state
[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
);
6749 state
[1] |= ((byteCountOrFrenchDone
& UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
);
6752 // Was the latest CE shifted
6754 state
[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT
;
6756 // Check for iterSkips overflow
6757 if((iterSkips
& UCOL_PSK_ITER_SKIP_MASK
) != iterSkips
) {
6758 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6761 state
[1] |= ((iterSkips
& UCOL_PSK_ITER_SKIP_MASK
) << UCOL_PSK_ITER_SKIP_SHIFT
);
6763 // Check for French overflow
6764 if((usedFrench
& UCOL_PSK_USED_FRENCH_MASK
) != usedFrench
) {
6765 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6767 // Store number of bytes written in the French secondary continuation sequence
6768 state
[1] |= ((usedFrench
& UCOL_PSK_USED_FRENCH_MASK
) << UCOL_PSK_USED_FRENCH_SHIFT
);
6771 // If we have used normalizing iterator, get rid of it
6772 if(normIter
!= NULL
) {
6773 unorm_closeIter(normIter
);
6776 // Return number of meaningful sortkey bytes.
6777 UTRACE_DATA4(UTRACE_VERBOSE
, "dest = %vb, state=%d %d",
6778 dest
,i
, state
[0], state
[1]);
6779 UTRACE_EXIT_VALUE(i
);
6784 * Produce a bound for a given sortkey and a number of levels.
6786 U_CAPI
int32_t U_EXPORT2
6787 ucol_getBound(const uint8_t *source
,
6788 int32_t sourceLength
,
6789 UColBoundMode boundType
,
6790 uint32_t noOfLevels
,
6792 int32_t resultLength
,
6793 UErrorCode
*status
) {
6794 // consistency checks
6795 if(status
== NULL
|| U_FAILURE(*status
)) {
6798 if(source
== NULL
) {
6799 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6803 int32_t sourceIndex
= 0;
6804 // Scan the string until we skip enough of the key OR reach the end of the key
6807 if(source
[sourceIndex
] == UCOL_LEVELTERMINATOR
) {
6810 } while (noOfLevels
> 0
6811 && (source
[sourceIndex
] != 0 || sourceIndex
< sourceLength
));
6813 if((source
[sourceIndex
] == 0 || sourceIndex
== sourceLength
)
6814 && noOfLevels
> 0) {
6815 *status
= U_SORT_KEY_TOO_SHORT_WARNING
;
6819 // READ ME: this code assumes that the values for boundType
6820 // enum will not changes. They are set so that the enum value
6821 // corresponds to the number of extra bytes each bound type
6823 if(result
!= NULL
&& resultLength
>= sourceIndex
+boundType
) {
6824 uprv_memcpy(result
, source
, sourceIndex
);
6826 // Lower bound just gets terminated. No extra bytes
6827 case UCOL_BOUND_LOWER
: // = 0
6829 // Upper bound needs one extra byte
6830 case UCOL_BOUND_UPPER
: // = 1
6831 result
[sourceIndex
++] = 2;
6833 // Upper long bound needs two extra bytes
6834 case UCOL_BOUND_UPPER_LONG
: // = 2
6835 result
[sourceIndex
++] = 0xFF;
6836 result
[sourceIndex
++] = 0xFF;
6839 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6842 result
[sourceIndex
++] = 0;
6846 return sourceIndex
+boundType
+1;
6851 inline void uprv_appendByteToHexString(char *dst
, uint8_t val
) {
6852 uint32_t len
= (uint32_t)uprv_strlen(dst
);
6853 *(dst
+len
) = T_CString_itosOffset((val
>> 4));
6854 *(dst
+len
+1) = T_CString_itosOffset((val
& 0xF));
6858 /* this function makes a string with representation of a sortkey */
6859 U_CAPI
char* U_EXPORT2
ucol_sortKeyToString(const UCollator
*coll
, const uint8_t *sortkey
, char *buffer
, uint32_t *len
) {
6860 int32_t strength
= UCOL_PRIMARY
;
6861 uint32_t res_size
= 0;
6862 UBool doneCase
= FALSE
;
6864 char *current
= buffer
;
6865 const uint8_t *currentSk
= sortkey
;
6867 uprv_strcpy(current
, "[");
6869 while(strength
<= UCOL_QUATERNARY
&& strength
<= coll
->strength
) {
6870 if(strength
> UCOL_PRIMARY
) {
6871 uprv_strcat(current
, " . ");
6873 while(*currentSk
!= 0x01 && *currentSk
!= 0x00) { /* print a level */
6874 uprv_appendByteToHexString(current
, *currentSk
++);
6875 uprv_strcat(current
, " ");
6877 if(coll
->caseLevel
== UCOL_ON
&& strength
== UCOL_SECONDARY
&& doneCase
== FALSE
) {
6879 } else if(coll
->caseLevel
== UCOL_OFF
|| doneCase
== TRUE
|| strength
!= UCOL_SECONDARY
) {
6882 uprv_appendByteToHexString(current
, *currentSk
++); /* This should print '01' */
6883 if(strength
== UCOL_QUATERNARY
&& coll
->alternateHandling
== UCOL_NON_IGNORABLE
) {
6888 if(coll
->strength
== UCOL_IDENTICAL
) {
6889 uprv_strcat(current
, " . ");
6890 while(*currentSk
!= 0) {
6891 uprv_appendByteToHexString(current
, *currentSk
++);
6892 uprv_strcat(current
, " ");
6895 uprv_appendByteToHexString(current
, *currentSk
++);
6897 uprv_strcat(current
, "]");
6899 if(res_size
> *len
) {
6907 /****************************************************************************/
6908 /* Following are the functions that deal with the properties of a collator */
6909 /* there are new APIs and some compatibility APIs */
6910 /****************************************************************************/
6913 ucol_addLatinOneEntry(UCollator
*coll
, UChar ch
, uint32_t CE
,
6914 int32_t *primShift
, int32_t *secShift
, int32_t *terShift
) {
6915 uint8_t primary1
= 0, primary2
= 0, secondary
= 0, tertiary
= 0;
6916 UBool reverseSecondary
= FALSE
;
6917 if(!isContinuation(CE
)) {
6918 tertiary
= (uint8_t)((CE
& coll
->tertiaryMask
));
6919 tertiary
^= coll
->caseSwitch
;
6920 reverseSecondary
= TRUE
;
6922 tertiary
= (uint8_t)((CE
& UCOL_REMOVE_CONTINUATION
));
6923 tertiary
&= UCOL_REMOVE_CASE
;
6924 reverseSecondary
= FALSE
;
6927 secondary
= (uint8_t)((CE
>>= 8) & UCOL_BYTE_SIZE_MASK
);
6928 primary2
= (uint8_t)((CE
>>= 8) & UCOL_BYTE_SIZE_MASK
);
6929 primary1
= (uint8_t)(CE
>> 8);
6932 coll
->latinOneCEs
[ch
] |= (primary1
<< *primShift
);
6936 if(*primShift
< 0) {
6937 coll
->latinOneCEs
[ch
] = UCOL_BAIL_OUT_CE
;
6938 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6939 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6942 coll
->latinOneCEs
[ch
] |= (primary2
<< *primShift
);
6945 if(secondary
!= 0) {
6946 if(reverseSecondary
&& coll
->frenchCollation
== UCOL_ON
) { // reverse secondary
6947 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] >>= 8; // make space for secondary
6948 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] |= (secondary
<< 24);
6949 } else { // normal case
6950 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] |= (secondary
<< *secShift
);
6955 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] |= (tertiary
<< *terShift
);
6961 ucol_resizeLatinOneTable(UCollator
*coll
, int32_t size
, UErrorCode
*status
) {
6962 uint32_t *newTable
= (uint32_t *)uprv_malloc(size
*sizeof(uint32_t)*3);
6963 if(newTable
== NULL
) {
6964 *status
= U_MEMORY_ALLOCATION_ERROR
;
6965 coll
->latinOneFailed
= TRUE
;
6968 int32_t sizeToCopy
= ((size
<coll
->latinOneTableLen
)?size
:coll
->latinOneTableLen
)*sizeof(uint32_t);
6969 uprv_memset(newTable
, 0, size
*sizeof(uint32_t)*3);
6970 uprv_memcpy(newTable
, coll
->latinOneCEs
, sizeToCopy
);
6971 uprv_memcpy(newTable
+size
, coll
->latinOneCEs
+coll
->latinOneTableLen
, sizeToCopy
);
6972 uprv_memcpy(newTable
+2*size
, coll
->latinOneCEs
+2*coll
->latinOneTableLen
, sizeToCopy
);
6973 coll
->latinOneTableLen
= size
;
6974 uprv_free(coll
->latinOneCEs
);
6975 coll
->latinOneCEs
= newTable
;
6980 ucol_setUpLatinOne(UCollator
*coll
, UErrorCode
*status
) {
6981 UBool result
= TRUE
;
6982 if(coll
->latinOneCEs
== NULL
) {
6983 coll
->latinOneCEs
= (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN
*3);
6984 if(coll
->latinOneCEs
== NULL
) {
6985 *status
= U_MEMORY_ALLOCATION_ERROR
;
6988 coll
->latinOneTableLen
= UCOL_LATINONETABLELEN
;
6991 UCollationElements
*it
= ucol_openElements(coll
, &ch
, 1, status
);
6992 uprv_memset(coll
->latinOneCEs
, 0, sizeof(uint32_t)*coll
->latinOneTableLen
*3);
6994 int32_t primShift
= 24, secShift
= 24, terShift
= 24;
6996 int32_t contractionOffset
= UCOL_ENDOFLATINONERANGE
+1;
6998 // TODO: make safe if you get more than you wanted...
6999 for(ch
= 0; ch
<= UCOL_ENDOFLATINONERANGE
; ch
++) {
7000 primShift
= 24; secShift
= 24; terShift
= 24;
7002 CE
= coll
->latinOneMapping
[ch
];
7004 CE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, ch
);
7005 if(CE
== UCOL_NOT_FOUND
&& coll
->UCA
) {
7006 CE
= UTRIE_GET32_FROM_LEAD(coll
->UCA
->mapping
, ch
);
7009 if(CE
< UCOL_NOT_FOUND
) {
7010 ucol_addLatinOneEntry(coll
, ch
, CE
, &primShift
, &secShift
, &terShift
);
7012 switch (getCETag(CE
)) {
7015 ucol_setText(it
, &ch
, 1, status
);
7016 while((int32_t)(CE
= ucol_next(it
, status
)) != UCOL_NULLORDER
) {
7017 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
7018 coll
->latinOneCEs
[ch
] = UCOL_BAIL_OUT_CE
;
7019 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
7020 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
7023 ucol_addLatinOneEntry(coll
, ch
, CE
, &primShift
, &secShift
, &terShift
);
7026 case CONTRACTION_TAG
:
7027 // here is the trick
7028 // F2 is contraction. We do something very similar to contractions
7029 // but have two indices, one in the real contraction table and the
7030 // other to where we stuffed things. This hopes that we don't have
7031 // many contractions (this should work for latin-1 tables).
7033 if((CE
& 0x00FFF000) != 0) {
7034 *status
= U_UNSUPPORTED_ERROR
;
7035 coll
->latinOneFailed
= TRUE
;
7039 const UChar
*UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
7041 CE
|= (contractionOffset
& 0xFFF) << 12; // insert the offset in latin-1 table
7043 coll
->latinOneCEs
[ch
] = CE
;
7044 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = CE
;
7045 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = CE
;
7047 // We're going to jump into contraction table, pick the elements
7050 CE
= *(coll
->contractionCEs
+
7051 (UCharOffset
- coll
->contractionIndex
));
7052 if(CE
> UCOL_NOT_FOUND
&& getCETag(CE
) == EXPANSION_TAG
) {
7054 uint32_t i
; /* general counter */
7055 uint32_t *CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
7056 size
= getExpansionCount(CE
);
7058 if(size
!= 0) { /* if there are less than 16 elements in expansion, we don't terminate */
7059 for(i
= 0; i
<size
; i
++) {
7060 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
7061 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
7062 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
7063 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
7066 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
, *CEOffset
++, &primShift
, &secShift
, &terShift
);
7068 } else { /* else, we do */
7069 while(*CEOffset
!= 0) {
7070 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
7071 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
7072 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
7073 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
7076 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
, *CEOffset
++, &primShift
, &secShift
, &terShift
);
7079 contractionOffset
++;
7080 } else if(CE
< UCOL_NOT_FOUND
) {
7081 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
++, CE
, &primShift
, &secShift
, &terShift
);
7083 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
7084 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
7085 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
7086 contractionOffset
++;
7089 primShift
= 24; secShift
= 24; terShift
= 24;
7090 if(contractionOffset
== coll
->latinOneTableLen
) { // we need to reallocate
7091 if(!ucol_resizeLatinOneTable(coll
, 2*coll
->latinOneTableLen
, status
)) {
7092 coll
->latinOneFailed
= TRUE
;
7096 } while(*UCharOffset
!= 0xFFFF);
7100 coll
->latinOneFailed
= TRUE
;
7106 ucol_closeElements(it
);
7108 if(contractionOffset
< coll
->latinOneTableLen
) {
7109 if(!ucol_resizeLatinOneTable(coll
, contractionOffset
, status
)) {
7110 coll
->latinOneFailed
= TRUE
;
7117 void ucol_updateInternalState(UCollator
*coll
, UErrorCode
*status
) {
7118 if(U_SUCCESS(*status
)) {
7119 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
7120 coll
->caseSwitch
= UCOL_CASE_SWITCH
;
7122 coll
->caseSwitch
= UCOL_NO_CASE_SWITCH
;
7125 if(coll
->caseLevel
== UCOL_ON
|| coll
->caseFirst
== UCOL_OFF
) {
7126 coll
->tertiaryMask
= UCOL_REMOVE_CASE
;
7127 coll
->tertiaryCommon
= UCOL_COMMON3_NORMAL
;
7128 coll
->tertiaryAddition
= UCOL_FLAG_BIT_MASK_CASE_SW_OFF
;
7129 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_OFF
;
7130 coll
->tertiaryBottom
= UCOL_COMMON_BOT3
;
7132 coll
->tertiaryMask
= UCOL_KEEP_CASE
;
7133 coll
->tertiaryAddition
= UCOL_FLAG_BIT_MASK_CASE_SW_ON
;
7134 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
7135 coll
->tertiaryCommon
= UCOL_COMMON3_UPPERFIRST
;
7136 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_UPPER
;
7137 coll
->tertiaryBottom
= UCOL_COMMON_BOTTOM3_CASE_SW_UPPER
;
7139 coll
->tertiaryCommon
= UCOL_COMMON3_NORMAL
;
7140 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_LOWER
;
7141 coll
->tertiaryBottom
= UCOL_COMMON_BOTTOM3_CASE_SW_LOWER
;
7145 /* Set the compression values */
7146 uint8_t tertiaryTotal
= (uint8_t)(coll
->tertiaryTop
- UCOL_COMMON_BOT3
-1);
7147 coll
->tertiaryTopCount
= (uint8_t)(UCOL_PROPORTION3
*tertiaryTotal
); /* we multilply double with int, but need only int */
7148 coll
->tertiaryBottomCount
= (uint8_t)(tertiaryTotal
- coll
->tertiaryTopCount
);
7150 if(coll
->caseLevel
== UCOL_OFF
&& coll
->strength
== UCOL_TERTIARY
7151 && coll
->frenchCollation
== UCOL_OFF
&& coll
->alternateHandling
== UCOL_NON_IGNORABLE
) {
7152 coll
->sortKeyGen
= ucol_calcSortKeySimpleTertiary
;
7154 coll
->sortKeyGen
= ucol_calcSortKey
;
7156 if(coll
->caseLevel
== UCOL_OFF
&& coll
->strength
<= UCOL_TERTIARY
&& coll
->numericCollation
== UCOL_OFF
7157 && coll
->alternateHandling
== UCOL_NON_IGNORABLE
&& !coll
->latinOneFailed
) {
7158 if(coll
->latinOneCEs
== NULL
|| coll
->latinOneRegenTable
) {
7159 if(ucol_setUpLatinOne(coll
, status
)) { // if we succeed in building latin1 table, we'll use it
7160 //fprintf(stderr, "F");
7161 coll
->latinOneUse
= TRUE
;
7163 coll
->latinOneUse
= FALSE
;
7165 if(*status
== U_UNSUPPORTED_ERROR
) {
7166 *status
= U_ZERO_ERROR
;
7168 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
7169 coll
->latinOneUse
= TRUE
;
7172 coll
->latinOneUse
= FALSE
;
7178 U_CAPI
uint32_t U_EXPORT2
7179 ucol_setVariableTop(UCollator
*coll
, const UChar
*varTop
, int32_t len
, UErrorCode
*status
) {
7180 if(U_FAILURE(*status
) || coll
== NULL
) {
7184 len
= u_strlen(varTop
);
7187 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7192 IInit_collIterate(coll
, varTop
, len
, &s
);
7194 uint32_t CE
= ucol_IGetNextCE(coll
, &s
, status
);
7196 /* here we check if we have consumed all characters */
7197 /* you can put in either one character or a contraction */
7198 /* you shouldn't put more... */
7199 if(s
.pos
!= s
.endp
|| CE
== UCOL_NO_MORE_CES
) {
7200 *status
= U_CE_NOT_FOUND_ERROR
;
7204 uint32_t nextCE
= ucol_IGetNextCE(coll
, &s
, status
);
7206 if(isContinuation(nextCE
) && (nextCE
& UCOL_PRIMARYMASK
) != 0) {
7207 *status
= U_PRIMARY_TOO_LONG_ERROR
;
7210 if(coll
->variableTopValue
!= (CE
& UCOL_PRIMARYMASK
)>>16) {
7211 coll
->variableTopValueisDefault
= FALSE
;
7212 coll
->variableTopValue
= (CE
& UCOL_PRIMARYMASK
)>>16;
7215 return CE
& UCOL_PRIMARYMASK
;
7218 U_CAPI
uint32_t U_EXPORT2
ucol_getVariableTop(const UCollator
*coll
, UErrorCode
*status
) {
7219 if(U_FAILURE(*status
) || coll
== NULL
) {
7222 return coll
->variableTopValue
<<16;
7225 U_CAPI
void U_EXPORT2
7226 ucol_restoreVariableTop(UCollator
*coll
, const uint32_t varTop
, UErrorCode
*status
) {
7227 if(U_FAILURE(*status
) || coll
== NULL
) {
7231 if(coll
->variableTopValue
!= (varTop
& UCOL_PRIMARYMASK
)>>16) {
7232 coll
->variableTopValueisDefault
= FALSE
;
7233 coll
->variableTopValue
= (varTop
& UCOL_PRIMARYMASK
)>>16;
7236 /* Attribute setter API */
7237 U_CAPI
void U_EXPORT2
7238 ucol_setAttribute(UCollator
*coll
, UColAttribute attr
, UColAttributeValue value
, UErrorCode
*status
) {
7239 if(U_FAILURE(*status
) || coll
== NULL
) {
7242 UColAttributeValue oldFrench
= coll
->frenchCollation
;
7243 UColAttributeValue oldCaseFirst
= coll
->caseFirst
;
7245 case UCOL_NUMERIC_COLLATION
: /* sort substrings of digits as numbers */
7246 if(value
== UCOL_ON
) {
7247 coll
->numericCollation
= UCOL_ON
;
7248 coll
->numericCollationisDefault
= FALSE
;
7249 } else if (value
== UCOL_OFF
) {
7250 coll
->numericCollation
= UCOL_OFF
;
7251 coll
->numericCollationisDefault
= FALSE
;
7252 } else if (value
== UCOL_DEFAULT
) {
7253 coll
->numericCollationisDefault
= TRUE
;
7254 coll
->numericCollation
= (UColAttributeValue
)coll
->options
->numericCollation
;
7256 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7259 case UCOL_HIRAGANA_QUATERNARY_MODE
: /* special quaternary values for Hiragana */
7260 if(value
== UCOL_ON
) {
7261 coll
->hiraganaQ
= UCOL_ON
;
7262 coll
->hiraganaQisDefault
= FALSE
;
7263 } else if (value
== UCOL_OFF
) {
7264 coll
->hiraganaQ
= UCOL_OFF
;
7265 coll
->hiraganaQisDefault
= FALSE
;
7266 } else if (value
== UCOL_DEFAULT
) {
7267 coll
->hiraganaQisDefault
= TRUE
;
7268 coll
->hiraganaQ
= (UColAttributeValue
)coll
->options
->hiraganaQ
;
7270 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7273 case UCOL_FRENCH_COLLATION
: /* attribute for direction of secondary weights*/
7274 if(value
== UCOL_ON
) {
7275 coll
->frenchCollation
= UCOL_ON
;
7276 coll
->frenchCollationisDefault
= FALSE
;
7277 } else if (value
== UCOL_OFF
) {
7278 coll
->frenchCollation
= UCOL_OFF
;
7279 coll
->frenchCollationisDefault
= FALSE
;
7280 } else if (value
== UCOL_DEFAULT
) {
7281 coll
->frenchCollationisDefault
= TRUE
;
7282 coll
->frenchCollation
= (UColAttributeValue
)coll
->options
->frenchCollation
;
7284 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7287 case UCOL_ALTERNATE_HANDLING
: /* attribute for handling variable elements*/
7288 if(value
== UCOL_SHIFTED
) {
7289 coll
->alternateHandling
= UCOL_SHIFTED
;
7290 coll
->alternateHandlingisDefault
= FALSE
;
7291 } else if (value
== UCOL_NON_IGNORABLE
) {
7292 coll
->alternateHandling
= UCOL_NON_IGNORABLE
;
7293 coll
->alternateHandlingisDefault
= FALSE
;
7294 } else if (value
== UCOL_DEFAULT
) {
7295 coll
->alternateHandlingisDefault
= TRUE
;
7296 coll
->alternateHandling
= (UColAttributeValue
)coll
->options
->alternateHandling
;
7298 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7301 case UCOL_CASE_FIRST
: /* who goes first, lower case or uppercase */
7302 if(value
== UCOL_LOWER_FIRST
) {
7303 coll
->caseFirst
= UCOL_LOWER_FIRST
;
7304 coll
->caseFirstisDefault
= FALSE
;
7305 } else if (value
== UCOL_UPPER_FIRST
) {
7306 coll
->caseFirst
= UCOL_UPPER_FIRST
;
7307 coll
->caseFirstisDefault
= FALSE
;
7308 } else if (value
== UCOL_OFF
) {
7309 coll
->caseFirst
= UCOL_OFF
;
7310 coll
->caseFirstisDefault
= FALSE
;
7311 } else if (value
== UCOL_DEFAULT
) {
7312 coll
->caseFirst
= (UColAttributeValue
)coll
->options
->caseFirst
;
7313 coll
->caseFirstisDefault
= TRUE
;
7315 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7318 case UCOL_CASE_LEVEL
: /* do we have an extra case level */
7319 if(value
== UCOL_ON
) {
7320 coll
->caseLevel
= UCOL_ON
;
7321 coll
->caseLevelisDefault
= FALSE
;
7322 } else if (value
== UCOL_OFF
) {
7323 coll
->caseLevel
= UCOL_OFF
;
7324 coll
->caseLevelisDefault
= FALSE
;
7325 } else if (value
== UCOL_DEFAULT
) {
7326 coll
->caseLevel
= (UColAttributeValue
)coll
->options
->caseLevel
;
7327 coll
->caseLevelisDefault
= TRUE
;
7329 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7332 case UCOL_NORMALIZATION_MODE
: /* attribute for normalization */
7333 if(value
== UCOL_ON
) {
7334 coll
->normalizationMode
= UCOL_ON
;
7335 coll
->normalizationModeisDefault
= FALSE
;
7336 } else if (value
== UCOL_OFF
) {
7337 coll
->normalizationMode
= UCOL_OFF
;
7338 coll
->normalizationModeisDefault
= FALSE
;
7339 } else if (value
== UCOL_DEFAULT
) {
7340 coll
->normalizationModeisDefault
= TRUE
;
7341 coll
->normalizationMode
= (UColAttributeValue
)coll
->options
->normalizationMode
;
7343 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7346 case UCOL_STRENGTH
: /* attribute for strength */
7347 if (value
== UCOL_DEFAULT
) {
7348 coll
->strengthisDefault
= TRUE
;
7349 coll
->strength
= (UColAttributeValue
)coll
->options
->strength
;
7350 } else if (value
<= UCOL_IDENTICAL
) {
7351 coll
->strengthisDefault
= FALSE
;
7352 coll
->strength
= value
;
7354 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7357 case UCOL_ATTRIBUTE_COUNT
:
7359 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7362 if(oldFrench
!= coll
->frenchCollation
|| oldCaseFirst
!= coll
->caseFirst
) {
7363 coll
->latinOneRegenTable
= TRUE
;
7365 coll
->latinOneRegenTable
= FALSE
;
7367 ucol_updateInternalState(coll
, status
);
7370 U_CAPI UColAttributeValue U_EXPORT2
7371 ucol_getAttribute(const UCollator
*coll
, UColAttribute attr
, UErrorCode
*status
) {
7372 if(U_FAILURE(*status
) || coll
== NULL
) {
7373 return UCOL_DEFAULT
;
7376 case UCOL_NUMERIC_COLLATION
:
7377 return coll
->numericCollation
;
7378 case UCOL_HIRAGANA_QUATERNARY_MODE
:
7379 return coll
->hiraganaQ
;
7380 case UCOL_FRENCH_COLLATION
: /* attribute for direction of secondary weights*/
7381 return coll
->frenchCollation
;
7382 case UCOL_ALTERNATE_HANDLING
: /* attribute for handling variable elements*/
7383 return coll
->alternateHandling
;
7384 case UCOL_CASE_FIRST
: /* who goes first, lower case or uppercase */
7385 return coll
->caseFirst
;
7386 case UCOL_CASE_LEVEL
: /* do we have an extra case level */
7387 return coll
->caseLevel
;
7388 case UCOL_NORMALIZATION_MODE
: /* attribute for normalization */
7389 return coll
->normalizationMode
;
7390 case UCOL_STRENGTH
: /* attribute for strength */
7391 return coll
->strength
;
7392 case UCOL_ATTRIBUTE_COUNT
:
7394 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7397 return UCOL_DEFAULT
;
7400 U_CAPI
void U_EXPORT2
7401 ucol_setStrength( UCollator
*coll
,
7402 UCollationStrength strength
)
7404 UErrorCode status
= U_ZERO_ERROR
;
7405 ucol_setAttribute(coll
, UCOL_STRENGTH
, strength
, &status
);
7408 U_CAPI UCollationStrength U_EXPORT2
7409 ucol_getStrength(const UCollator
*coll
)
7411 UErrorCode status
= U_ZERO_ERROR
;
7412 return ucol_getAttribute(coll
, UCOL_STRENGTH
, &status
);
7415 /****************************************************************************/
7416 /* Following are misc functions */
7417 /* there are new APIs and some compatibility APIs */
7418 /****************************************************************************/
7420 U_CAPI UCollator
* U_EXPORT2
7421 ucol_safeClone(const UCollator
*coll
, void *stackBuffer
, int32_t * pBufferSize
, UErrorCode
*status
)
7423 UCollator
* localCollator
;
7424 int32_t bufferSizeNeeded
= (int32_t)sizeof(UCollator
);
7425 char *stackBufferChars
= (char *)stackBuffer
;
7427 if (status
== NULL
|| U_FAILURE(*status
)){
7430 if ((stackBuffer
&& !pBufferSize
) || !coll
){
7431 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7434 /* Pointers on 64-bit platforms need to be aligned
7435 * on a 64-bit boundry in memory.
7437 if (U_ALIGNMENT_OFFSET(stackBuffer
) != 0) {
7438 int32_t offsetUp
= (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars
);
7439 *pBufferSize
-= offsetUp
;
7440 stackBufferChars
+= offsetUp
;
7442 stackBuffer
= (void *)stackBufferChars
;
7444 if (stackBuffer
&& *pBufferSize
<= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
7445 *pBufferSize
= bufferSizeNeeded
;
7448 if (!stackBuffer
|| *pBufferSize
< bufferSizeNeeded
) {
7449 /* allocate one here...*/
7451 const UChar
* rules
= ucol_getRules(coll
, &length
);
7453 localCollator
= ucol_openRules(rules
,
7455 ucol_getAttribute(coll
, UCOL_NORMALIZATION_MODE
, status
),
7456 ucol_getStrength(coll
),
7459 if (U_SUCCESS(*status
))
7461 *status
= U_SAFECLONE_ALLOCATED_WARNING
;
7464 localCollator
= (UCollator
*)stackBuffer
;
7465 uprv_memcpy(localCollator
, coll
, sizeof(UCollator
));
7466 localCollator
->freeOnClose
= FALSE
;
7467 localCollator
->requestedLocale
= NULL
; // zero copies of pointers
7468 localCollator
->validLocale
= NULL
;
7470 return localCollator
;
// Copies the collator's rule text into `buffer`. When `delta` is
// UCOL_FULL_RULES, the string stored under the "%%UCARULES" key of coll->rb
// is copied first and the collator's own rules are appended after it.
// Copies are clipped to `bufferLen`; the value returned is whatever
// u_terminateUChars() yields for the combined length len+UCAlen.
7473 U_CAPI
int32_t U_EXPORT2
7474 ucol_getRulesEx(const UCollator
*coll
, UColRuleOption delta
, UChar
*buffer
, int32_t bufferLen
) {
7475 UErrorCode status
= U_ZERO_ERROR
;
7478 const UChar
* ucaRules
= 0;
7479 const UChar
*rules
= ucol_getRules(coll
, &len
);
7480 if(delta
== UCOL_FULL_RULES
) {
7481 /* take the UCA rules and append real rules at the end */
7482 /* UCA rules will be probably coming from the root RB */
7483 ucaRules
= ures_getStringByKey(coll
->rb
,"%%UCARULES",&UCAlen
,&status
);
// NOTE(review): the next three statements look like a retired alternative
// lookup (collations -> UCA -> Sequence); confirm they are disabled in the
// real file, since no matching close of cresb/uca is visible here.
7485 UResourceBundle* cresb = ures_getByKeyWithFallback(coll->rb, "collations", NULL, &status);
7486 UResourceBundle* uca = ures_getByKeyWithFallback(cresb, "UCA", NULL, &status);
7487 ucaRules = ures_getStringByKey(uca,"Sequence",&UCAlen,&status);
// The resource lookup result is checked before anything is copied.
7492 if(U_FAILURE(status
)) {
7495 if(buffer
!=0 && bufferLen
>0){
// UCA part first, clipped to the buffer capacity.
7498 u_memcpy(buffer
, ucaRules
, uprv_min(UCAlen
, bufferLen
));
// The collator's own rules follow, only if room is left after the UCA part.
7500 if(len
> 0 && bufferLen
> UCAlen
) {
7501 u_memcpy(buffer
+UCAlen
, rules
, uprv_min(len
, bufferLen
-UCAlen
));
7504 return u_terminateUChars(buffer
, bufferLen
, len
+UCAlen
, &status
);
// A single zero UChar, i.e. an empty NUL-terminated string.
// NOTE(review): presumably what ucol_getRules hands back when a collator has
// no rule string - confirm against the full function body.
7507 static const UChar _NUL
= 0;
// Returns the collator's rule string and stores its length in *length.
// The string is cached on the collator: when coll->rules is already set its
// stored length is reported; otherwise the "Sequence" string of
// coll->elements is fetched and remembered in coll->rules / coll->rulesLength.
7509 U_CAPI
const UChar
* U_EXPORT2
7510 ucol_getRules( const UCollator
*coll
,
7513 if(coll
->rules
!= NULL
) {
7514 *length
= coll
->rulesLength
;
7517 UErrorCode status
= U_ZERO_ERROR
;
7518 if(coll
->elements
!= NULL
) {
7519 if(U_SUCCESS(status
)) {
// The collator is const for callers; the casts below let this getter fill in
// its cache fields anyway.
7521 ((UCollator
*)coll
)->rules
= ures_getStringByKey(coll
->elements
, "Sequence", length
, &status
);
7522 ((UCollator
*)coll
)->rulesLength
= *length
;
// NOTE(review): FALSE presumably because the string belongs to the resource
// bundle rather than to the collator - confirm.
7523 ((UCollator
*)coll
)->freeRulesOnClose
= FALSE
;
7532 U_CAPI
int32_t U_EXPORT2
7533 ucol_getDisplayName( const char *objLoc
,
7534 const char *dispLoc
,
7536 int32_t resultLength
,
7540 if(U_FAILURE(*status
)) return -1;
7542 if(!(result
==NULL
&& resultLength
==0)) {
7543 // NULL destination for pure preflighting: empty dummy string
7544 // otherwise, alias the destination buffer
7545 dst
.setTo(result
, 0, resultLength
);
7547 Collator::getDisplayName(Locale(objLoc
), Locale(dispLoc
), dst
);
7548 return dst
.extract(result
, resultLength
, *status
);
7551 U_CAPI
const char* U_EXPORT2
7552 ucol_getAvailable(int32_t index
)
7554 return uloc_getAvailable(index
);
7557 U_CAPI
int32_t U_EXPORT2
7558 ucol_countAvailable()
7560 return uloc_countAvailable();
7563 #if !UCONFIG_NO_SERVICE
7564 U_CAPI UEnumeration
* U_EXPORT2
7565 ucol_openAvailableLocales(UErrorCode
*status
) {
7566 // This is a wrapper over Collator::getAvailableLocales()
7567 if (U_FAILURE(*status
)) {
7570 StringEnumeration
*s
= Collator::getAvailableLocales();
7572 *status
= U_MEMORY_ALLOCATION_ERROR
;
7575 return uenum_openStringEnumeration(s
, status
);
7579 // Note: KEYWORDS[0] != RESOURCE_NAME - alan
// Resource name passed to ures_getKeywordValues ("collations", plural).
7581 static const char* RESOURCE_NAME
= "collations";
// The keyword names handled in this file; currently only "collation" (singular).
7583 static const char* KEYWORDS
[] = { "collation" };
// Number of entries in KEYWORDS, computed from the array itself.
7585 #define KEYWORD_COUNT (sizeof(KEYWORDS)/sizeof(KEYWORDS[0]))
7587 U_CAPI UEnumeration
* U_EXPORT2
7588 ucol_getKeywords(UErrorCode
*status
) {
7589 UEnumeration
*result
= NULL
;
7590 if (U_SUCCESS(*status
)) {
7591 return uenum_openCharStringsEnumeration(KEYWORDS
, KEYWORD_COUNT
, status
);
7596 U_CAPI UEnumeration
* U_EXPORT2
7597 ucol_getKeywordValues(const char *keyword
, UErrorCode
*status
) {
7598 // hard-coded to accept exactly one collation keyword
7599 // modify if additional collation keyword is added later
7600 if (U_SUCCESS(*status
) &&
7601 keyword
==NULL
|| uprv_strcmp(keyword
, KEYWORDS
[0])!=0) {
7602 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7605 return ures_getKeywordValues(U_ICUDATA_COLL
, RESOURCE_NAME
, status
);
7608 U_CAPI
int32_t U_EXPORT2
7609 ucol_getFunctionalEquivalent(char* result
, int32_t resultCapacity
,
7610 const char* keyword
, const char* locale
,
7611 UBool
* isAvailable
, UErrorCode
* status
) {
7612 // N.B.: Resource name is "collations" but keyword is "collation"
7613 return ures_getFunctionalEquivalent(result
, resultCapacity
, U_ICUDATA_COLL
,
7614 "collations", keyword
, locale
,
7615 isAvailable
, TRUE
, status
);
// Fills versionInfo[0..3] for the collator:
//   [0],[1]  high and low byte of a packed runtime/builder/charset version,
//   [2]      coll->image->version[1],
//   [3]      coll->UCA->image->UCAVersion[0].
7618 U_CAPI
void U_EXPORT2
7619 ucol_getVersion(const UCollator
* coll
,
7620 UVersionInfo versionInfo
)
7622 /* RunTime version */
7623 uint8_t rtVersion
= UCOL_RUNTIME_VERSION
;
7624 /* Builder version*/
7625 uint8_t bdVersion
= coll
->image
->version
[0];
7627 /* Charset Version. Need to get the version from cnv files
7628 * makeconv should populate cnv files with version and
7629 * an api has to be provided in ucnv.h to obtain this version
7631 uint8_t csVersion
= 0;
7633 /* combine the version info */
// Packing: runtime version shifted left by 11, builder version shifted left
// by 6, charset version (currently always 0) in the low bits.
7634 uint16_t cmbVersion
= (uint16_t)((rtVersion
<<11) | (bdVersion
<<6) | (csVersion
));
7636 /* Tailoring rules */
7637 versionInfo
[0] = (uint8_t)(cmbVersion
>>8);
7638 versionInfo
[1] = (uint8_t)cmbVersion
;
7639 versionInfo
[2] = coll
->image
->version
[1];
// NOTE(review): coll->UCA is dereferenced here; confirm a NULL check guards
// this store in the full source.
7641 versionInfo
[3] = coll
->UCA
->image
->UCAVersion
[0];
7648 /* This internal API checks whether a character is tailored or not */
// Steps visible here: a collator that is its own UCA is handled first; for
// u < 0x100 the CE comes from latinOneMapping and is compared with the UCA's
// entry for the same code unit; otherwise the CE comes from the mapping trie.
// A contraction CE is replaced by the CE stored for the contraction start,
// and the final CE is compared with UCOL_NOT_FOUND.
7649 U_CAPI UBool U_EXPORT2
7650 ucol_isTailored(const UCollator
*coll
, const UChar u
, UErrorCode
*status
) {
7651 uint32_t CE
= UCOL_NOT_FOUND
;
7652 const UChar
*ContractionStart
= NULL
;
7653 if(U_SUCCESS(*status
) && coll
!= NULL
) {
7654 if(coll
== coll
->UCA
) {
7656 } else if(u
< 0x100) { /* latin-1 */
7657 CE
= coll
->latinOneMapping
[u
];
// Same CE as the UCA's Latin-1 table entry for this code unit.
7658 if(coll
->UCA
&& CE
== coll
->UCA
->latinOneMapping
[u
]) {
7661 } else { /* regular */
7662 /*CE = ucmpe32_get(coll->mapping, u);*/
7663 CE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, u
);
// For a contraction, use the CE stored at the same position in contractionCEs
// as the contraction start has in contractionIndex.
7667 if(isContraction(CE
)) {
7668 ContractionStart
= (UChar
*)coll
->image
+getContractOffset(CE
);
7669 CE
= *(coll
->contractionCEs
+ (ContractionStart
- coll
->contractionIndex
));
7672 if(CE
== UCOL_NOT_FOUND
) {
7683 /****************************************************************************/
7684 /* Following are the string compare functions */
7686 /****************************************************************************/
7689 /* ucol_checkIdent internal function. Does byte level string compare. */
7690 /* Used by strcoll if strength == identical and strings */
7691 /* are otherwise equal. Moved out-of-line because this */
7692 /* is a rare case. */
7694 /* Comparison must be done on NFD normalized strings. */
7695 /* FCD is not good enough. */
7697 /* TODO: make an incremental NFD Comparison function, which could */
7698 /* be of general use */
// Compares the full text behind two collIterates in code point order after
// NFD normalization and maps the outcome to a UCollationResult.
// Iterator-backed input is compared through NFD-normalizing UNormIterators;
// string-backed input is decomposed into each iterate's writableBuffer when
// the NFD quick check does not answer UNORM_YES, then compared directly.
7701 UCollationResult
ucol_checkIdent(collIterate
*sColl
, collIterate
*tColl
, UBool normalize
, UErrorCode
*status
)
7704 // TODO: When we have an UChar iterator, we need to access the whole string. One
7705 // useful modification would be a UChar iterator extract API, since reset next next...
7707 // TODO: Handle long strings. Do the same in compareUsingSortKeys.
7709 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
7710 // of same type, but that doesn't really mean that it will stay that way.
7712 // The division for the array length may truncate the array size to
7713 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7714 // for all platforms anyway.
7715 UAlignedMemory stackNormIter1
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
7716 UAlignedMemory stackNormIter2
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
7717 //UChar sStackBuf[256], tStackBuf[256];
7718 //int32_t sBufSize = 256, tBufSize = 256;
7724 UBool freeSBuf
= FALSE
, freeTBuf
= FALSE
;
// Iterator input: rewind both iterators, wrap each in an NFD normalizing
// iterator opened in the stack storage above, and compare the wrapped streams.
7726 if (sColl
->flags
& UCOL_USE_ITERATOR
) {
7727 UNormIterator
*sNIt
= NULL
, *tNIt
= NULL
;
7728 sNIt
= unorm_openIter(stackNormIter1
, sizeof(stackNormIter1
), status
);
7729 tNIt
= unorm_openIter(stackNormIter2
, sizeof(stackNormIter2
), status
);
7730 sColl
->iterator
->move(sColl
->iterator
, 0, UITER_START
);
7731 tColl
->iterator
->move(tColl
->iterator
, 0, UITER_START
);
7732 UCharIterator
*sIt
= unorm_setIter(sNIt
, sColl
->iterator
, UNORM_NFD
, status
);
7733 UCharIterator
*tIt
= unorm_setIter(tNIt
, tColl
->iterator
, UNORM_NFD
, status
);
7734 comparison
= u_strCompareIter(sIt
, tIt
, TRUE
);
7735 unorm_closeIter(sNIt
);
7736 unorm_closeIter(tNIt
);
// String input: a length of -1 marks a string without a recorded length
// (UCOL_ITER_HASLEN not set).
7738 sLen
= (sColl
->flags
& UCOL_ITER_HASLEN
) ? sColl
->endp
- sColl
->string
: -1;
7739 sBuf
= sColl
->string
;
7740 tLen
= (tColl
->flags
& UCOL_ITER_HASLEN
) ? tColl
->endp
- tColl
->string
: -1;
7741 tBuf
= tColl
->string
;
// NOTE(review): status is reset here, which discards any error reported
// earlier - confirm this is intended.
7744 *status
= U_ZERO_ERROR
;
// Source: decompose only when the quick check cannot confirm NFD.
7745 if (unorm_quickCheck(sBuf
, sLen
, UNORM_NFD
, status
) != UNORM_YES
) {
7746 sLen
= unorm_decompose(sColl
->writableBuffer
, (int32_t)sColl
->writableBufSize
,
// Overflow: grow the writable buffer, then decompose a second time.
7750 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
7751 if(!u_growBufferFromStatic(sColl
->stackWritableBuffer
,
7752 &sColl
->writableBuffer
,
7753 (int32_t *)&sColl
->writableBufSize
, sLen
,
7756 *status
= U_MEMORY_ALLOCATION_ERROR
;
7757 return UCOL_LESS
; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7759 *status
= U_ZERO_ERROR
;
7760 sLen
= unorm_decompose(sColl
->writableBuffer
, (int32_t)sColl
->writableBufSize
,
7769 sBuf
= sColl
->writableBuffer
;
// The buffer is no longer the in-struct one, so flag the iterate as holding
// an allocated buffer.
7770 if (sBuf
!= sColl
->stackWritableBuffer
) {
7771 sColl
->flags
|= UCOL_ITER_ALLOCATED
;
// Target: same steps as for the source.
7775 *status
= U_ZERO_ERROR
;
7776 if (unorm_quickCheck(tBuf
, tLen
, UNORM_NFD
, status
) != UNORM_YES
) {
7777 tLen
= unorm_decompose(tColl
->writableBuffer
, (int32_t)tColl
->writableBufSize
,
7781 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
7782 if(!u_growBufferFromStatic(tColl
->stackWritableBuffer
,
7783 &tColl
->writableBuffer
,
7784 (int32_t *)&tColl
->writableBufSize
, tLen
,
7787 *status
= U_MEMORY_ALLOCATION_ERROR
;
7788 return UCOL_LESS
; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7790 *status
= U_ZERO_ERROR
;
7791 tLen
= unorm_decompose(tColl
->writableBuffer
, (int32_t)tColl
->writableBufSize
,
7800 tBuf
= tColl
->writableBuffer
;
7801 if (tBuf
!= tColl
->stackWritableBuffer
) {
7802 tColl
->flags
|= UCOL_ITER_ALLOCATED
;
// Both strings without a recorded length: compare as NUL-terminated strings.
7807 if (sLen
== -1 && tLen
== -1) {
7808 comparison
= u_strcmpCodePointOrder(sBuf
, tBuf
);
// Otherwise obtain the lengths, compare the common prefix, and let the
// length difference decide when the prefix is equal.
7811 sLen
= u_strlen(sBuf
);
7814 tLen
= u_strlen(tBuf
);
7816 comparison
= u_memcmpCodePointOrder(sBuf
, tBuf
, uprv_min(sLen
, tLen
));
7817 if (comparison
== 0) {
7818 comparison
= sLen
- tLen
;
// Translate the signed comparison into the collation result enum.
7823 if (comparison
< 0) {
7825 } else if (comparison
== 0) {
7827 } else /* comparison > 0 */ {
7828 return UCOL_GREATER
;
7832 /* CEBuf - A struct and some inline functions to handle the saving */
7833 /* of CEs in a buffer within ucol_strcoll */
// Number of uint32_t slots in the in-struct array; a buffer starts out using
// this array (see UCOL_INIT_CEBUF).
7835 #define UCOL_CEBUF_SIZE 512
7836 typedef struct ucol_CEBuf
{
// In-struct storage used until the buffer has to be expanded.
7840 uint32_t localArray
[UCOL_CEBUF_SIZE
];
7845 inline void UCOL_INIT_CEBUF(ucol_CEBuf
*b
) {
7846 (b
)->buf
= (b
)->pos
= (b
)->localArray
;
7847 (b
)->endp
= (b
)->buf
+ UCOL_CEBUF_SIZE
;
// Doubles the capacity of a CE buffer: allocates a block twice the current
// fill size, copies the stored CEs into it and re-points endp and pos.
// The owning collIterate is flagged UCOL_ITER_ALLOCATED first.
// NOTE(review): when uprv_malloc fails nothing is changed and no error is
// reported, so the buffer stays full - confirm how callers cope with that.
7851 void ucol_CEBuf_Expand(ucol_CEBuf
*b
, collIterate
*ci
) {
7856 ci
->flags
|= UCOL_ITER_ALLOCATED
;
// Expansion is requested when pos has reached endp (see UCOL_CEBUF_PUT), so
// the fill size computed here is the whole current capacity.
7857 oldSize
= b
->pos
- b
->buf
;
7858 newSize
= oldSize
* 2;
7859 newBuf
= (uint32_t *)uprv_malloc(newSize
* sizeof(uint32_t));
7860 if(newBuf
!= NULL
) {
7861 uprv_memcpy(newBuf
, b
->buf
, oldSize
* sizeof(uint32_t));
// The in-struct array is told apart from an earlier heap block here.
7862 if (b
->buf
!= b
->localArray
) {
7866 b
->endp
= b
->buf
+ newSize
;
7867 b
->pos
= b
->buf
+ oldSize
;
// Stores one CE in the buffer; when the write position has reached the end
// of the buffer, the buffer is expanded first.
7872 inline void UCOL_CEBUF_PUT(ucol_CEBuf
*b
, uint32_t ce
, collIterate
*ci
) {
7873 if (b
->pos
== b
->endp
) {
7874 ucol_CEBuf_Expand(b
, ci
);
7879 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7880 /* It is used when compare gets in trouble and needs to bail out */
// Builds a sort key for each string with ucol_getSortKey and compares the two
// keys with uprv_strcmp. Keys start in stack arrays of UCOL_MAX_BUFFER bytes;
// a key that needs more room is rebuilt in a heap block, which is freed
// before returning.
7881 static UCollationResult
ucol_compareUsingSortKeys(collIterate
*sColl
,
7884 uint8_t sourceKey
[UCOL_MAX_BUFFER
], targetKey
[UCOL_MAX_BUFFER
];
7885 uint8_t *sourceKeyP
= sourceKey
;
7886 uint8_t *targetKeyP
= targetKey
;
7887 int32_t sourceKeyLen
= UCOL_MAX_BUFFER
, targetKeyLen
= UCOL_MAX_BUFFER
;
7888 const UCollator
*coll
= sColl
->coll
;
7889 UChar
*source
= NULL
;
7890 UChar
*target
= NULL
;
7891 UChar sStackBuf
[256], tStackBuf
[256];
7892 int32_t sourceLength
= (sColl
->flags
&UCOL_ITER_HASLEN
)?(sColl
->endp
-sColl
->string
):-1;
7893 int32_t targetLength
= (tColl
->flags
&UCOL_ITER_HASLEN
)?(tColl
->endp
-tColl
->string
):-1;
7895 // TODO: Handle long strings. Do the same in ucol_checkIdent.
// Iterator input: rewind both iterators and drain them into flat buffers.
// NOTE(review): no bounds check is visible on these copies; confirm the
// destination buffers can hold the whole iterator contents.
7896 if(sColl
->flags
& UCOL_USE_ITERATOR
) {
7897 sColl
->iterator
->move(sColl
->iterator
, 0, UITER_START
);
7898 tColl
->iterator
->move(tColl
->iterator
, 0, UITER_START
);
7900 UChar
*sBufp
= source
;
7902 UChar
*tBufp
= target
;
7903 while(sColl
->iterator
->hasNext(sColl
->iterator
)) {
7904 *sBufp
++ = (UChar
)sColl
->iterator
->next(sColl
->iterator
);
7906 while(tColl
->iterator
->hasNext(tColl
->iterator
)) {
7907 *tBufp
++ = (UChar
)tColl
->iterator
->next(tColl
->iterator
);
7909 sourceLength
= sBufp
- source
;
7910 targetLength
= tBufp
- target
;
7911 } else { // no iterators
7912 sourceLength
= (sColl
->flags
&UCOL_ITER_HASLEN
)?(sColl
->endp
-sColl
->string
):-1;
7913 targetLength
= (tColl
->flags
&UCOL_ITER_HASLEN
)?(tColl
->endp
-tColl
->string
):-1;
7914 source
= sColl
->string
;
7915 target
= tColl
->string
;
// Source key: first attempt in the stack array; a reported length above
// UCOL_MAX_BUFFER triggers a second attempt in a heap block of that length.
7920 sourceKeyLen
= ucol_getSortKey(coll
, source
, sourceLength
, sourceKeyP
, sourceKeyLen
);
7921 if(sourceKeyLen
> UCOL_MAX_BUFFER
) {
7922 sourceKeyP
= (uint8_t*)uprv_malloc(sourceKeyLen
*sizeof(uint8_t));
7923 if(sourceKeyP
!= NULL
) {
7924 sourceKeyLen
= ucol_getSortKey(coll
, source
, sourceLength
, sourceKeyP
, sourceKeyLen
);
// Target key: same two-step scheme.
7928 targetKeyLen
= ucol_getSortKey(coll
, target
, targetLength
, targetKeyP
, targetKeyLen
);
7929 if(targetKeyLen
> UCOL_MAX_BUFFER
) {
7930 targetKeyP
= (uint8_t*)uprv_malloc(targetKeyLen
*sizeof(uint8_t));
7931 if(targetKeyP
!= NULL
) {
7932 targetKeyLen
= ucol_getSortKey(coll
, target
, targetLength
, targetKeyP
, targetKeyLen
);
// NOTE(review): the key pointers are handed to uprv_strcmp even if one of the
// allocations above returned NULL - confirm that case is handled.
7936 int32_t result
= uprv_strcmp((const char*)sourceKeyP
, (const char*)targetKeyP
);
// Release a key only if it left its stack array.
7938 if(sourceKeyP
!= sourceKey
) {
7939 uprv_free(sourceKeyP
);
7942 if(targetKeyP
!= targetKey
) {
7943 uprv_free(targetKeyP
);
7948 } else if(result
>0) {
7949 return UCOL_GREATER
;
// General string comparison over two collIterates.
// CEs are fetched with ucol_IGetNextCE and saved in one ucol_CEBuf per string
// while the primary weights are compared. If the primaries do not decide, the
// saved CEs are re-examined for the secondary level (forward, or backward for
// French), the case level, the tertiary level and the shifted quaternary
// level; for identical strength ucol_checkIdent is the final step.
// Buffers that moved to the heap are released before returning.
7956 static inline UCollationResult
7957 ucol_strcollRegular( collIterate
*sColl
, collIterate
*tColl
,
7958 // const UCollator *coll,
7959 // const UChar *source,
7960 // int32_t sourceLength,
7961 // const UChar *target,
7962 // int32_t targetLength,
7967 const UCollator
*coll
= sColl
->coll
;
7970 // setting up the collator parameters
7971 UColAttributeValue strength
= coll
->strength
;
7972 UBool initialCheckSecTer
= (strength
>= UCOL_SECONDARY
);
7974 UBool checkSecTer
= initialCheckSecTer
;
7975 UBool checkTertiary
= (strength
>= UCOL_TERTIARY
);
7976 UBool checkQuad
= (strength
>= UCOL_QUATERNARY
);
7977 UBool checkIdent
= (strength
== UCOL_IDENTICAL
);
7978 UBool checkCase
= (coll
->caseLevel
== UCOL_ON
);
7979 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && checkSecTer
;
7980 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
7981 UBool qShifted
= shifted
&& checkQuad
;
7982 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && checkQuad
;
// Hiragana quaternary combined with shifted handling is not processed here;
// that combination is delegated to the sort-key based comparison.
7984 if(doHiragana
&& shifted
) {
7985 return (ucol_compareUsingSortKeys(sColl
, tColl
));
7987 uint8_t caseSwitch
= coll
->caseSwitch
;
7988 uint8_t tertiaryMask
= coll
->tertiaryMask
;
7990 // This is the lowest primary value that will not be ignored if shifted
7991 uint32_t LVT
= (shifted
)?(coll
->variableTopValue
<<16):0;
7993 UCollationResult result
= UCOL_EQUAL
;
7994 UCollationResult hirResult
= UCOL_EQUAL
;
7996 // Preparing the CE buffers. They will be filled during the primary phase
7999 UCOL_INIT_CEBUF(&sCEs
);
8000 UCOL_INIT_CEBUF(&tCEs
);
8002 uint32_t secS
= 0, secT
= 0;
8003 uint32_t sOrder
=0, tOrder
=0;
8005 // Non shifted primary processing is quite simple
8009 // We fetch CEs until we hit a non ignorable primary or end.
8011 // We get the next CE
8012 sOrder
= ucol_IGetNextCE(coll
, sColl
, status
);
8013 // Stuff it in the buffer
8014 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
8015 // And keep just the primary part.
8016 sOrder
&= UCOL_PRIMARYMASK
;
8017 } while(sOrder
== 0);
8019 // see the comments on the above block
8021 tOrder
= ucol_IGetNextCE(coll
, tColl
, status
);
8022 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
8023 tOrder
&= UCOL_PRIMARYMASK
;
8024 } while(tOrder
== 0);
8026 // if both primaries are the same
8027 if(sOrder
== tOrder
) {
8028 // and there are no more CEs, we advance to the next level
8029 if(sOrder
== UCOL_NO_MORE_CES_PRIMARY
) {
// Remember the first position where exactly one side was flagged
// UCOL_WAS_HIRAGANA; the side carrying the flag is recorded as the lesser one.
8032 if(doHiragana
&& hirResult
== UCOL_EQUAL
) {
8033 if((sColl
->flags
& UCOL_WAS_HIRAGANA
) != (tColl
->flags
& UCOL_WAS_HIRAGANA
)) {
8034 hirResult
= ((sColl
->flags
& UCOL_WAS_HIRAGANA
) > (tColl
->flags
& UCOL_WAS_HIRAGANA
))
8035 ? UCOL_LESS
:UCOL_GREATER
;
8039 // if two primaries are different, we are done
8040 result
= (sOrder
< tOrder
) ? UCOL_LESS
: UCOL_GREATER
;
8043 } // no primary difference... do the rest from the buffers
8044 } else { // shifted - do a slightly more complicated processing :)
8046 UBool sInShifted
= FALSE
;
8047 UBool tInShifted
= FALSE
;
8048 // This version of code can be refactored. However, it seems easier to understand this way.
8049 // Source loop. Same as the target loop.
8051 sOrder
= ucol_IGetNextCE(coll
, sColl
, status
);
8052 if(sOrder
== UCOL_NO_MORE_CES
) {
8053 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
8055 } else if(sOrder
== 0
8056 || (sInShifted
&& (sOrder
& UCOL_PRIMARYMASK
) == 0)) {
8057 /* UCA amendment - ignore ignorables that follow shifted code points */
8059 } else if(isContinuation(sOrder
)) {
8060 if((sOrder
& UCOL_PRIMARYMASK
) > 0) { /* There is primary value */
8062 sOrder
= (sOrder
& UCOL_PRIMARYMASK
) | 0xC0; /* preserve interesting continuation */
8063 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
8066 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
8069 } else { /* Just lower level values */
8073 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
8077 } else { /* regular */
// Primaries above LVT are stored unchanged; at or below it only the primary
// part is kept (see the masking further down).
8078 if((sOrder
& UCOL_PRIMARYMASK
) > LVT
) {
8079 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
8082 if((sOrder
& UCOL_PRIMARYMASK
) > 0) {
8084 sOrder
&= UCOL_PRIMARYMASK
;
8085 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
8088 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
8095 sOrder
&= UCOL_PRIMARYMASK
;
// Target loop: mirrors the source loop above.
8099 tOrder
= ucol_IGetNextCE(coll
, tColl
, status
);
8100 if(tOrder
== UCOL_NO_MORE_CES
) {
8101 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
8103 } else if(tOrder
== 0
8104 || (tInShifted
&& (tOrder
& UCOL_PRIMARYMASK
) == 0)) {
8105 /* UCA amendment - ignore ignorables that follow shifted code points */
8107 } else if(isContinuation(tOrder
)) {
8108 if((tOrder
& UCOL_PRIMARYMASK
) > 0) { /* There is primary value */
8110 tOrder
= (tOrder
& UCOL_PRIMARYMASK
) | 0xC0; /* preserve interesting continuation */
8111 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
8114 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
8117 } else { /* Just lower level values */
8121 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
8125 } else { /* regular */
8126 if((tOrder
& UCOL_PRIMARYMASK
) > LVT
) {
8127 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
8130 if((tOrder
& UCOL_PRIMARYMASK
) > 0) {
8132 tOrder
&= UCOL_PRIMARYMASK
;
8133 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
8136 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
8143 tOrder
&= UCOL_PRIMARYMASK
;
8146 if(sOrder
== tOrder
) {
// NOTE(review): the next four lines use sColl.flags although sColl is a
// pointer here; they look like disabled code - confirm they are commented out
// in the real file.
8148 if(doHiragana && hirResult == UCOL_EQUAL) {
8149 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
8150 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
8151 ? UCOL_LESS:UCOL_GREATER;
8155 if(sOrder
== UCOL_NO_MORE_CES_PRIMARY
) {
8158 sOrder
= 0; tOrder
= 0;
8162 result
= (sOrder
< tOrder
) ? UCOL_LESS
: UCOL_GREATER
;
8165 } /* no primary difference... do the rest from the buffers */
8168 /* now, we're gonna reexamine collected CEs */
8172 /* This is the secondary level of comparison */
8174 if(!isFrenchSec
) { /* normal */
8179 secS
= *(sCE
++) & UCOL_SECONDARYMASK
;
8183 secT
= *(tCE
++) & UCOL_SECONDARYMASK
;
8187 if(secS
== UCOL_NO_MORE_CES_SECONDARY
) {
8194 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
8198 } else { /* do the French */
// French secondaries are compared from the end of the saved CEs towards the
// start; a run of continuation CEs is still read in forward order, using the
// saved pointer to resume the backward walk afterwards.
8199 uint32_t *sCESave
= NULL
;
8200 uint32_t *tCESave
= NULL
;
8201 sCE
= sCEs
.pos
-2; /* this could also be sCEs-- if needs to be optimized */
8204 while (secS
== 0 && sCE
>= sCEs
.buf
) {
8207 if(isContinuation(secS
)) {
8208 while(isContinuation(secS
= *(sCE
--)));
8209 /* after this, secS has the start of continuation, and sCEs points before that */
8210 sCESave
= sCE
; /* we save it, so that we know where to come back AND that we need to go forward */
8211 sCE
+=2; /* need to point to the first continuation CP */
8212 /* However, now you can just continue doing stuff */
8216 if(!isContinuation(secS
)) { /* This means we have finished with this cont */
8217 sCE
= sCESave
; /* reset the pointer to before continuation */
8222 secS
&= UCOL_SECONDARYMASK
; /* remove the continuation bit */
8225 while(secT
== 0 && tCE
>= tCEs
.buf
) {
8228 if(isContinuation(secT
)) {
8229 while(isContinuation(secT
= *(tCE
--)));
8230 /* after this, secS has the start of continuation, and sCEs points before that */
8231 tCESave
= tCE
; /* we save it, so that we know where to come back AND that we need to go forward */
8232 tCE
+=2; /* need to point to the first continuation CP */
8233 /* However, now you can just continue doing stuff */
8237 if(!isContinuation(secT
)) { /* This means we have finished with this cont */
8238 tCE
= tCESave
; /* reset the pointer to before continuation */
8243 secT
&= UCOL_SECONDARYMASK
; /* remove the continuation bit */
8247 if(secS
== UCOL_NO_MORE_CES_SECONDARY
|| (sCE
< sCEs
.buf
&& tCE
< tCEs
.buf
)) {
8254 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
8261 /* doing the case bit */
8266 while((secS
& UCOL_REMOVE_CASE
) == 0) {
8267 if(!isContinuation(*sCE
++)) {
8268 secS
=*(sCE
-1) & UCOL_TERT_CASE_MASK
;
8275 while((secT
& UCOL_REMOVE_CASE
) == 0) {
8276 if(!isContinuation(*tCE
++)) {
8277 secT
= *(tCE
-1) & UCOL_TERT_CASE_MASK
;
8284 if((secS
& UCOL_CASE_BIT_MASK
) < (secT
& UCOL_CASE_BIT_MASK
)) {
8287 } else if((secS
& UCOL_CASE_BIT_MASK
) > (secT
& UCOL_CASE_BIT_MASK
)) {
8288 result
= UCOL_GREATER
;
8292 if((secS
& UCOL_REMOVE_CASE
) == UCOL_NO_MORE_CES_TERTIARY
|| (secT
& UCOL_REMOVE_CASE
) == UCOL_NO_MORE_CES_TERTIARY
) {
8301 /* Tertiary level */
8308 while((secS
& UCOL_REMOVE_CASE
) == 0) {
8309 secS
= *(sCE
++) & tertiaryMask
;
8310 if(!isContinuation(secS
)) {
8313 secS
&= UCOL_REMOVE_CASE
;
8317 while((secT
& UCOL_REMOVE_CASE
) == 0) {
8318 secT
= *(tCE
++) & tertiaryMask
;
8319 if(!isContinuation(secT
)) {
8322 secT
&= UCOL_REMOVE_CASE
;
8327 if((secS
& UCOL_REMOVE_CASE
) == 1) {
8334 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
// Quaternary level: only taken for shifted alternate handling at quaternary
// strength or above (qShifted).
8341 if(qShifted
/*checkQuad*/) {
8342 UBool sInShifted
= TRUE
;
8343 UBool tInShifted
= TRUE
;
8349 while(secS
== 0 && secS
!= UCOL_NO_MORE_CES
|| (isContinuation(secS
) && !sInShifted
)) {
8351 if(isContinuation(secS
)) {
8355 } else if(secS
> LVT
|| (secS
& UCOL_PRIMARYMASK
) == 0) { /* non continuation */
8356 secS
= UCOL_PRIMARYMASK
;
8362 secS
&= UCOL_PRIMARYMASK
;
8365 while(secT
== 0 && secT
!= UCOL_NO_MORE_CES
|| (isContinuation(secT
) && !tInShifted
)) {
8367 if(isContinuation(secT
)) {
8371 } else if(secT
> LVT
|| (secT
& UCOL_PRIMARYMASK
) == 0) {
8372 secT
= UCOL_PRIMARYMASK
;
8378 secT
&= UCOL_PRIMARYMASK
;
8381 if(secS
== UCOL_NO_MORE_CES_PRIMARY
) {
8388 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
8392 } else if(doHiragana
&& hirResult
!= UCOL_EQUAL
) {
8393 // If we're fine on quaternaries, we might be different
8394 // on Hiragana. This, however, might fail us in shifted.
8399 /* For IDENTICAL comparisons, we use a bitwise character comparison */
8400 /* as a tiebreaker if all else is equal. */
8401 /* Getting here should be quite rare - strings are not identical - */
8402 /* that is checked first, but compared == through all other checks. */
8405 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
8406 result
= ucol_checkIdent(sColl
, tColl
, TRUE
, status
);
// Cleanup: release writable buffers and CE buffers that left their in-struct
// storage.
8410 if ((sColl
->flags
| tColl
->flags
) & UCOL_ITER_ALLOCATED
) {
8411 freeHeapWritableBuffer(sColl
);
8412 freeHeapWritableBuffer(tColl
);
8414 if (sCEs
.buf
!= sCEs
.localArray
) {
8415 uprv_free(sCEs
.buf
);
8417 if (tCEs
.buf
!= tCEs
.localArray
) {
8418 uprv_free(tCEs
.buf
);
// Resolves a contraction CE met by the Latin-1 comparison.
// The low 12 bits of CE give the offset of the contraction's code unit list
// in the image; bits 12..23 give latinOneOffset, the position of the
// contraction's CEs in latinOneCEs. Results are read from latinOneCEs at
// strength*latinOneTableLen + latinOneOffset, plus the list offset when the
// following code unit matches a list entry. UCOL_BAIL_OUT_CE is returned when
// the following code unit is outside Latin-1.
8426 static inline uint32_t
8427 ucol_getLatinOneContraction(const UCollator
*coll
, int32_t strength
,
8428 uint32_t CE
, const UChar
*s
, int32_t *index
, int32_t len
) {
8429 const UChar
*UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
&0xFFF);
8430 int32_t latinOneOffset
= (CE
& 0x00FFF000) >> 12;
8432 UChar schar
= 0, tchar
= 0;
// Nothing follows in the string: use the base entry of the contraction.
8436 if(s
[*index
] == 0) { // end of string
8437 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
8443 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
8449 while(schar
> (tchar
= *(UCharOffset
+offset
))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
// Match: the entry at the same offset in latinOneCEs belongs to this code unit.
8453 if (schar
== tchar
) {
8455 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
+offset
]);
// Any bit set in the high byte means the code unit is above 0xFF.
8459 if(schar
& 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8460 return UCOL_BAIL_OUT_CE
;
8462 // skip completely ignorables
8463 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(coll
->mapping
, schar
);
8464 if(isZeroCE
== 0) { // we have to ignore completely ignorables
8469 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
8476 * This is a fast strcoll, geared towards text in Latin-1.
8477 * It supports contractions of size two, French secondaries
8478 * and case switching. You can use it with strengths primary
8479 * to tertiary. It does not support shifted and case level.
8480 * It relies on the table built by setupLatin1Table. If it
8481 * doesn't understand something, it will go to the regular
8484 static inline UCollationResult
8485 ucol_strcollUseLatin1( const UCollator
*coll
,
8486 const UChar
*source
,
8488 const UChar
*target
,
8493 int32_t strength
= coll
->strength
;
8495 int32_t sIndex
= 0, tIndex
= 0;
8496 UChar sChar
= 0, tChar
= 0;
8497 uint32_t sOrder
=0, tOrder
=0;
8499 UBool endOfSource
= FALSE
, endOfTarget
= FALSE
;
8501 uint32_t *elements
= coll
->latinOneCEs
;
8503 UBool haveContractions
= FALSE
; // if we have contractions in our string
8504 // we cannot do French secondary
8506 // Do the primary level
8508 while(sOrder
==0) { // this loop skips primary ignorables
8509 // sOrder=getNextlatinOneCE(source);
8510 if(sLen
==-1) { // handling zero terminated strings
8511 sChar
=source
[sIndex
++];
8516 } else { // handling strings with known length
8521 sChar
=source
[sIndex
++];
8523 if(sChar
&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8524 //fprintf(stderr, "R");
8526 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8528 sOrder
= elements
[sChar
];
8529 if(sOrder
>= UCOL_NOT_FOUND
) { // if we got a special
8530 // specials can basically be either contractions or bail-out signs. If we get anything
8531 // else, we'll bail out anywasy
8532 if(getCETag(sOrder
) == CONTRACTION_TAG
) {
8533 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_PRIMARY
, sOrder
, source
, &sIndex
, sLen
);
8534 haveContractions
= TRUE
; // if there are contractions, we cannot do French secondary
8535 // However, if there are contractions in the table, but we always use just one char,
8536 // we might be able to do French. This should be checked out.
8538 if(sOrder
>= UCOL_NOT_FOUND
/*== UCOL_BAIL_OUT_CE*/) {
8539 //fprintf(stderr, "S");
8541 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8546 while(tOrder
==0) { // this loop skips primary ignorables
8547 // tOrder=getNextlatinOneCE(target);
8548 if(tLen
==-1) { // handling zero terminated strings
8549 tChar
=target
[tIndex
++];
8551 if(endOfSource
) { // this is different than source loop,
8552 // as we already know that source loop is done here,
8553 // so we can either finish the primary loop if both
8554 // strings are done or anounce the result if only
8555 // target is done. Same below.
8558 return UCOL_GREATER
;
8561 } else { // handling strings with known length
8566 return UCOL_GREATER
;
8569 tChar
=target
[tIndex
++];
8571 if(tChar
&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8572 //fprintf(stderr, "R");
8574 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8576 tOrder
= elements
[tChar
];
8577 if(tOrder
>= UCOL_NOT_FOUND
) {
8578 // Handling specials, see the comments for source
8579 if(getCETag(tOrder
) == CONTRACTION_TAG
) {
8580 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_PRIMARY
, tOrder
, target
, &tIndex
, tLen
);
8581 haveContractions
= TRUE
;
8583 if(tOrder
>= UCOL_NOT_FOUND
/*== UCOL_BAIL_OUT_CE*/) {
8584 //fprintf(stderr, "S");
8586 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8590 if(endOfSource
) { // source is finished, but target is not, say the result.
8594 if(sOrder
== tOrder
) { // if we have same CEs, we continue the loop
8595 sOrder
= 0; tOrder
= 0;
8598 // compare current top bytes
8599 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
8600 // top bytes differ, return difference
8601 if(sOrder
< tOrder
) {
8603 } else if(sOrder
> tOrder
) {
8604 return UCOL_GREATER
;
8606 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8607 // since we must return enum value
8610 // top bytes match, continue with following bytes
8617 // after primary loop, we definitely know the sizes of strings,
8618 // so we set it and use simpler loop for secondaries and tertiaries
8619 sLen
= sIndex
; tLen
= tIndex
;
8620 if(strength
>= UCOL_SECONDARY
) {
8621 // adjust the table beggining
8622 elements
+= coll
->latinOneTableLen
;
8623 endOfSource
= FALSE
; endOfTarget
= FALSE
;
8625 if(coll
->frenchCollation
== UCOL_OFF
) { // non French
8626 // This loop is a simplified copy of primary loop
8627 // at this point we know that whole strings are latin-1, so we don't
8628 // check for that. We also know that we only have contractions as
8630 sIndex
= 0; tIndex
= 0;
8637 sChar
=source
[sIndex
++];
8638 sOrder
= elements
[sChar
];
8639 if(sOrder
> UCOL_NOT_FOUND
) {
8640 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_SECONDARY
, sOrder
, source
, &sIndex
, sLen
);
8649 return UCOL_GREATER
;
8652 tChar
=target
[tIndex
++];
8653 tOrder
= elements
[tChar
];
8654 if(tOrder
> UCOL_NOT_FOUND
) {
8655 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_SECONDARY
, tOrder
, target
, &tIndex
, tLen
);
8662 if(sOrder
== tOrder
) {
8663 sOrder
= 0; tOrder
= 0;
8666 // see primary loop for comments on this
8667 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
8668 if(sOrder
< tOrder
) {
8670 } else if(sOrder
> tOrder
) {
8671 return UCOL_GREATER
;
8679 if(haveContractions
) { // if we have contractions, we have to bail out
8680 // since we don't really know how to handle them here
8682 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8684 // For French, we go backwards
8685 sIndex
= sLen
; tIndex
= tLen
;
8692 sChar
=source
[--sIndex
];
8693 sOrder
= elements
[sChar
];
8694 // don't even look for contractions
8702 return UCOL_GREATER
;
8705 tChar
=target
[--tIndex
];
8706 tOrder
= elements
[tChar
];
8707 // don't even look for contractions
8713 if(sOrder
== tOrder
) {
8714 sOrder
= 0; tOrder
= 0;
8717 // see the primary loop for comments
8718 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
8719 if(sOrder
< tOrder
) {
8721 } else if(sOrder
> tOrder
) {
8722 return UCOL_GREATER
;
8733 if(strength
>= UCOL_TERTIARY
) {
8734 // tertiary loop is the same as secondary (except no French)
8735 elements
+= coll
->latinOneTableLen
;
8736 sIndex
= 0; tIndex
= 0;
8737 endOfSource
= FALSE
; endOfTarget
= FALSE
;
8744 sChar
=source
[sIndex
++];
8745 sOrder
= elements
[sChar
];
8746 if(sOrder
> UCOL_NOT_FOUND
) {
8747 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_TERTIARY
, sOrder
, source
, &sIndex
, sLen
);
8753 return UCOL_EQUAL
; // if both strings are at the end, they are equal
8755 return UCOL_GREATER
;
8758 tChar
=target
[tIndex
++];
8759 tOrder
= elements
[tChar
];
8760 if(tOrder
> UCOL_NOT_FOUND
) {
8761 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_TERTIARY
, tOrder
, target
, &tIndex
, tLen
);
8767 if(sOrder
== tOrder
) {
8768 sOrder
= 0; tOrder
= 0;
8771 if(((sOrder
^tOrder
)&0xff000000)!=0) {
8772 if(sOrder
< tOrder
) {
8774 } else if(sOrder
> tOrder
) {
8775 return UCOL_GREATER
;
8786 // Preparing the context objects for iterating over strings
8787 collIterate sColl
, tColl
;
8789 IInit_collIterate(coll
, source
, sLen
, &sColl
);
8790 IInit_collIterate(coll
, target
, tLen
, &tColl
);
8791 return ucol_strcollRegular(&sColl
, &tColl
, status
);
// ucol_strcollIter: compares the text delivered by two UCharIterators
// (sIter, tIter) using the collator coll.
//  - Identical iterator pointers take a traced exit with UCOL_EQUAL.
//  - A NULL sIter, tIter or coll sets *status to U_ILLEGAL_ARGUMENT_ERROR.
//  - Otherwise the common leading run is skipped and the value traced on exit
//    is the one produced by ucol_strcollRegular().
// NOTE(review): several statements of this function (returns, closing braces)
// are not visible in this listing; the comments below describe visible lines only.
8795 U_CAPI UCollationResult U_EXPORT2
8796 ucol_strcollIter( const UCollator
*coll
,
8797 UCharIterator
*sIter
,
8798 UCharIterator
*tIter
,
8799 UErrorCode
*status
) {
// Nothing is done when status is NULL or already holds a failure.
8800 if(!status
|| U_FAILURE(*status
)) {
8804 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER
);
8805 UTRACE_DATA3(UTRACE_VERBOSE
, "coll=%p, sIter=%p, tIter=%p", coll
, sIter
, tIter
);
// Same iterator object on both sides: traced as UCOL_EQUAL.
8807 if (sIter
== tIter
) {
8808 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL
, *status
)
// Argument validation happens after the pointer-equality shortcut above.
8811 if(sIter
== NULL
|| tIter
== NULL
|| coll
== NULL
) {
8812 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
8813 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL
, *status
)
8817 UCollationResult result
= UCOL_EQUAL
;
8819 // Preparing the context objects for iterating over strings
8820 collIterate sColl
, tColl
;
8821 // The division for the array length may truncate the array size to
8822 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8823 // for all platforms anyway.
8824 UAlignedMemory stackNormIter1
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
8825 UAlignedMemory stackNormIter2
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
8826 UNormIterator
*sNormIter
= NULL
, *tNormIter
= NULL
;
// Both collIterate objects are initialised without a string (NULL, -1) and
// then pointed at the caller's iterators with UCOL_USE_ITERATOR set.
8828 IInit_collIterate(coll
, NULL
, -1, &sColl
);
8829 sColl
.iterator
= sIter
;
8830 sColl
.flags
|= UCOL_USE_ITERATOR
;
8831 IInit_collIterate(coll
, NULL
, -1, &tColl
);
8832 tColl
.flags
|= UCOL_USE_ITERATOR
;
8833 tColl
.iterator
= tIter
;
// With normalization switched on, each iterator is wrapped in a UNormIterator
// (backed by the stack buffers above) in UNORM_FCD mode, and UCOL_ITER_NORM is
// cleared on the corresponding collIterate.
8835 if(ucol_getAttribute(coll
, UCOL_NORMALIZATION_MODE
, status
) == UCOL_ON
) {
8836 sNormIter
= unorm_openIter(stackNormIter1
, sizeof(stackNormIter1
), status
);
8837 sColl
.iterator
= unorm_setIter(sNormIter
, sIter
, UNORM_FCD
, status
);
8838 sColl
.flags
&= ~UCOL_ITER_NORM
;
8840 tNormIter
= unorm_openIter(stackNormIter2
, sizeof(stackNormIter2
), status
);
8841 tColl
.iterator
= unorm_setIter(tNormIter
, tIter
, UNORM_FCD
, status
);
8842 tColl
.flags
&= ~UCOL_ITER_NORM
;
8845 UChar32 sChar
= U_SENTINEL
, tChar
= U_SENTINEL
;
// Advance both iterators in lockstep for as long as next() returns the same
// value on both sides.
8847 while((sChar
= sColl
.iterator
->next(sColl
.iterator
)) ==
8848 (tChar
= tColl
.iterator
->next(tColl
.iterator
))) {
8849 if(UCOL_ISTHAIPREVOWEL(sChar
)) {
8852 if(sChar
== U_SENTINEL
) {
8853 result
= UCOL_EQUAL
;
// The iterators are moved back with previous(); the surrounding control flow
// is only partly visible in this listing.
8858 if(sChar
== U_SENTINEL
) {
8859 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8862 if(tChar
== U_SENTINEL
) {
8863 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8866 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8867 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8869 if (ucol_unsafeCP((UChar
)sChar
, coll
) || ucol_unsafeCP((UChar
)tChar
, coll
))
8871 // We are stopped in the middle of a contraction.
8872 // Scan backwards through the == part of the string looking for the start of the contraction.
8873 // It doesn't matter which string we scan, since they are the same in this region.
8876 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8877 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8879 while (sChar
!= U_SENTINEL
&& ucol_unsafeCP((UChar
)sChar
, coll
));
// The regular comparison runs only if no error has been recorded so far.
8883 if(U_SUCCESS(*status
)) {
8884 result
= ucol_strcollRegular(&sColl
, &tColl
, status
);
// Both normalizing iterators are closed when at least one of them is set.
8888 if(sNormIter
|| tNormIter
) {
8889 unorm_closeIter(sNormIter
);
8890 unorm_closeIter(tNormIter
);
8893 UTRACE_EXIT_VALUE_STATUS(result
, *status
)
8900 /* ucol_strcoll Main public API string comparison function */
// Compares source and target with the collator coll. A length of -1 marks a
// NUL-terminated string (see the sourceLength/targetLength == -1 tests below).
// The function has no UErrorCode parameter; it keeps a local status that is
// handed to the internal comparison routines.
// Outline of the visible steps:
//   1. scan the identical leading portion of both strings,
//   2. back up if that portion ends at an "unsafe" code point,
//   3. compare the remainders with ucol_strcollRegular() or
//      ucol_strcollUseLatin1().
// NOTE(review): several statements (returns, loop headers, closing braces) are
// not visible in this listing.
8902 U_CAPI UCollationResult U_EXPORT2
8903 ucol_strcoll( const UCollator
*coll
,
8904 const UChar
*source
,
8905 int32_t sourceLength
,
8906 const UChar
*target
,
8907 int32_t targetLength
) {
8910 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL
);
8911 if (UTRACE_LEVEL(UTRACE_VERBOSE
)) {
8912 UTRACE_DATA3(UTRACE_VERBOSE
, "coll=%p, source=%p, target=%p", coll
, source
, target
);
8913 UTRACE_DATA2(UTRACE_VERBOSE
, "source string = %vh ", source
, sourceLength
);
8914 UTRACE_DATA2(UTRACE_VERBOSE
, "target string = %vh ", target
, targetLength
);
8917 UErrorCode status
= U_ZERO_ERROR
;
8918 if(source
== NULL
|| target
== NULL
) {
8919 // do not crash, but return. Should have
8920 // status argument to return error.
8921 UTRACE_EXIT_VALUE(UTRACE_UCOL_STRCOLL
);
8924 collIterate sColl
, tColl
;
8926 /* Scan the strings. Find: */
8927 /* The length of any leading portion that is equal */
8928 /* Whether they are exactly equal. (in which case we just return) */
8929 const UChar
*pSrc
= source
;
8930 const UChar
*pTarg
= target
;
8931 int32_t equalLength
;
8933 if (sourceLength
== -1 && targetLength
== -1) {
8934 // Both strings are null terminated.
8935 // Check for them being the same string, and scan through
8936 // any leading equal portion.
8937 if (source
==target
) {
8938 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8943 if ( *pSrc
!= *pTarg
|| *pSrc
== 0) {
8946 if(UCOL_ISTHAIPREVOWEL(*pSrc
)) {
8952 if (*pSrc
== 0 && *pTarg
== 0) {
8953 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8956 equalLength
= pSrc
- source
;
8960 // One or both strings has an explicit length.
8961 /* check if source and target are same strings */
8963 if (source
==target
&& sourceLength
==targetLength
) {
8964 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
// NOTE(review): when one of the lengths is -1 the matching end pointer lies
// one element before the start of that string; the pEnd<p tests further down
// appear to rely on this to recognise the NUL-terminated case.
8967 const UChar
*pSrcEnd
= source
+ sourceLength
;
8968 const UChar
*pTargEnd
= target
+ targetLength
;
8971 // Scan while the strings are bitwise ==, or until one is exhausted.
8973 if (pSrc
== pSrcEnd
|| pTarg
== pTargEnd
) {
8976 if ((*pSrc
== 0 && sourceLength
== -1) || (*pTarg
== 0 && targetLength
== -1)) {
8979 if (*pSrc
!= *pTarg
) {
8982 if(UCOL_ISTHAIPREVOWEL(*pSrc
)) { // they are the same here, so any will do
8988 equalLength
= pSrc
- source
;
8990 // If we made it all the way through both strings, we are done. They are ==
8991 if ((pSrc
==pSrcEnd
|| (pSrcEnd
<pSrc
&& *pSrc
==0)) && /* At end of src string, however it was specified. */
8992 (pTarg
==pTargEnd
|| (pTargEnd
<pTarg
&& *pTarg
==0))) { /* and also at end of dest string */
8993 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8997 if (equalLength
> 0) {
8998 /* There is an identical portion at the beginning of the two strings. */
8999 /* If the identical portion ends within a contraction or a combining */
9000 /* character sequence, back up to the start of that sequence. */
9001 pSrc
= source
+ equalLength
; /* point to the first differing chars */
9002 pTarg
= target
+ equalLength
;
// && binds tighter than ||: each side is "not at the end of that string AND
// the code unit there is unsafe".
9003 if (pSrc
!= source
+sourceLength
&& ucol_unsafeCP(*pSrc
, coll
) ||
9004 pTarg
!= target
+targetLength
&& ucol_unsafeCP(*pTarg
, coll
))
9006 // We are stopped in the middle of a contraction.
9007 // Scan backwards through the == part of the string looking for the start of the contraction.
9008 // It doesn't matter which string we scan, since they are the same in this region.
9014 while (equalLength
>0 && ucol_unsafeCP(*pSrc
, coll
));
// Skip the identical prefix; only explicit (positive) lengths are shortened,
// a -1 length is left as it is.
9017 source
+= equalLength
;
9018 target
+= equalLength
;
9019 if (sourceLength
> 0) {
9020 sourceLength
-= equalLength
;
9022 if (targetLength
> 0) {
9023 targetLength
-= equalLength
;
9027 UCollationResult returnVal
;
// The regular path is taken when the collator has latinOneUse unset, or when
// the first remaining code unit of an explicit-length string has bits set in
// 0xff00. The first code unit is only inspected when the length is > 0.
9028 if(!coll
->latinOneUse
|| (sourceLength
> 0 && *source
&0xff00) || (targetLength
> 0 && *target
&0xff00)) {
9029 // Preparing the context objects for iterating over strings
9030 IInit_collIterate(coll
, source
, sourceLength
, &sColl
);
9031 IInit_collIterate(coll
, target
, targetLength
, &tColl
);
9032 returnVal
= ucol_strcollRegular(&sColl
, &tColl
, &status
);
// Latin-1 specialised comparison (presumably the else branch; its header is
// not visible in this listing).
9034 returnVal
= ucol_strcollUseLatin1(coll
, source
, sourceLength
, target
, targetLength
, &status
);
9036 UTRACE_EXIT_VALUE(returnVal
);
9040 /* convenience function for comparing strings */
// ucol_greater: forwards all five arguments unchanged to ucol_strcoll() and
// returns a UBool derived from its result.
// NOTE(review): the comparison applied to the ucol_strcoll() result is not
// visible in this listing; presumably it tests for UCOL_GREATER - confirm.
9041 U_CAPI UBool U_EXPORT2
9042 ucol_greater( const UCollator
*coll
,
9043 const UChar
*source
,
9044 int32_t sourceLength
,
9045 const UChar
*target
,
9046 int32_t targetLength
)
9048 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
9052 /* convenience function for comparing strings */
// ucol_greaterOrEqual: forwards all five arguments unchanged to ucol_strcoll()
// and returns a UBool derived from its result.
// NOTE(review): the comparison applied to the ucol_strcoll() result is not
// visible in this listing; presumably it tests for "not UCOL_LESS" - confirm.
9053 U_CAPI UBool U_EXPORT2
9054 ucol_greaterOrEqual( const UCollator
*coll
,
9055 const UChar
*source
,
9056 int32_t sourceLength
,
9057 const UChar
*target
,
9058 int32_t targetLength
)
9060 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
9064 /* convenience function for comparing strings */
// ucol_equal: forwards all five arguments unchanged to ucol_strcoll() and
// returns a UBool derived from its result.
// NOTE(review): the comparison applied to the ucol_strcoll() result is not
// visible in this listing; presumably it tests for UCOL_EQUAL - confirm.
9065 U_CAPI UBool U_EXPORT2
9066 ucol_equal( const UCollator
*coll
,
9067 const UChar
*source
,
9068 int32_t sourceLength
,
9069 const UChar
*target
,
9070 int32_t targetLength
)
9072 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
9076 /* returns the locale name the collation data comes from */
// ucol_getLocale: thin wrapper; passes coll, type and status straight through
// to ucol_getLocaleByType() and returns whatever that call returns.
9077 U_CAPI
const char * U_EXPORT2
9078 ucol_getLocale(const UCollator
*coll
, ULocDataLocaleType type
, UErrorCode
*status
) {
9079 return ucol_getLocaleByType(coll
, type
, status
);
// ucol_getLocaleByType: looks up one of the locale names associated with coll.
//   ULOC_ACTUAL_LOCALE    - coll->validLocale if set, else the locale of the
//                           coll->elements resource bundle (if that is set)
//   ULOC_VALID_LOCALE     - coll->validLocale if set, else the locale of the
//                           coll->rb resource bundle (if that is set)
//   ULOC_REQUESTED_LOCALE - coll->requestedLocale
// result starts out as NULL, so it stays NULL when none of the sources above
// is available. *status is set to U_ILLEGAL_ARGUMENT_ERROR on one path
// (presumably the switch default; that label is not visible in this listing).
9082 U_CAPI
const char * U_EXPORT2
9083 ucol_getLocaleByType(const UCollator
*coll
, ULocDataLocaleType type
, UErrorCode
*status
) {
9084 const char *result
= NULL
;
9085 if(status
== NULL
|| U_FAILURE(*status
)) {
9088 UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE
);
9089 UTRACE_DATA1(UTRACE_INFO
, "coll=%p", coll
);
9092 case ULOC_ACTUAL_LOCALE
:
9093 // validLocale is set only if service registration has explicitly set the
9094 // requested and valid locales. if this is the case, the actual locale
9095 // is considered to be the valid locale.
9096 if (coll
->validLocale
!= NULL
) {
9097 result
= coll
->validLocale
;
9098 } else if(coll
->elements
!= NULL
) {
9099 result
= ures_getLocale(coll
->elements
, status
);
9102 case ULOC_VALID_LOCALE
:
9103 if (coll
->validLocale
!= NULL
) {
9104 result
= coll
->validLocale
;
9105 } else if(coll
->rb
!= NULL
) {
9106 result
= ures_getLocale(coll
->rb
, status
);
9109 case ULOC_REQUESTED_LOCALE
:
9110 result
= coll
->requestedLocale
;
9113 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
9115 UTRACE_DATA1(UTRACE_INFO
, "result = %s", result
);
9116 UTRACE_EXIT_STATUS(*status
);
// ucol_getTailoredSet: builds a set of the strings tailored by coll's rules.
// The rule string from ucol_getRules() is tokenized; for every token that is
// not a reset, each string produced by CanonicalIterator for the token's
// characters is added to the set unless its FCD quick check yields UNORM_NO.
// Returns a heap-allocated UnicodeSet cast to USet*.
// A NULL coll or NULL coll->UCA sets *status to U_ILLEGAL_ARGUMENT_ERROR.
// NOTE(review): no NULL check on the result of `new UnicodeSet()` is visible
// in this listing; ownership of the returned set presumably passes to the
// caller - confirm against the public header.
9120 U_CAPI USet
* U_EXPORT2
9121 ucol_getTailoredSet(const UCollator
*coll
, UErrorCode
*status
)
9123 if(status
== NULL
|| U_FAILURE(*status
)) {
9126 if(coll
== NULL
|| coll
->UCA
== NULL
) {
9127 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
9129 UParseError parseError
;
9130 UColTokenParser src
;
9131 int32_t rulesLen
= 0;
9132 const UChar
*rules
= ucol_getRules(coll
, &rulesLen
);
9133 const UChar
*current
= NULL
;
9134 UBool startOfRules
= TRUE
;
9135 // we internally use the C++ class, for the following reasons:
9136 // 1. we need to utilize canonical iterator, which is a C++ only class
9137 // 2. canonical iterator returns UnicodeStrings - USet cannot take them
9138 // 3. USet is internally really UnicodeSet, C is just a wrapper
9139 UnicodeSet
*tailored
= new UnicodeSet();
9140 UnicodeString pattern
;
9141 UnicodeString empty
;
9142 CanonicalIterator
it(empty
, *status
);
9145 // The idea is to tokenize the rule set. For each non-reset token,
9146 // we add all the canonically equivalent FCD sequences
9147 ucol_tok_initTokenList(&src
, rules
, rulesLen
, coll
->UCA
, status
);
// startOfRules is TRUE only for the first call to the token parser.
9148 while ((current
= ucol_tok_parseNextToken(&src
, startOfRules
, &parseError
, status
)) != NULL
) {
9149 startOfRules
= FALSE
;
9150 if(src
.parsedToken
.strength
!= UCOL_TOK_RESET
) {
// The token's characters are addressed by offset/length inside src.source.
9151 const UChar
*stuff
= src
.source
+(src
.parsedToken
.charsOffset
);
9152 it
.setSource(UnicodeString(stuff
, src
.parsedToken
.charsLen
), *status
);
9153 pattern
= it
.next();
// The iterator signals exhaustion with a bogus string.
9154 while(!pattern
.isBogus()) {
9155 if(Normalizer::quickCheck(pattern
, UNORM_FCD
, *status
) != UNORM_NO
) {
9156 tailored
->add(pattern
);
9158 pattern
= it
.next();
9162 ucol_tok_closeTokenList(&src
);
9163 return (USet
*)tailored
;
// ucol_equals: decides whether two collators are equivalent. Visible steps:
//   1. identical pointers are treated as equal,
//   2. every attribute index below UCOL_ATTRIBUTE_COUNT is fetched from both
//      collators and compared,
//   3. rule strings of equal length and content count as equal,
//   4. otherwise both rule strings are parsed into token lists and the lists
//      are compared reset by reset, token by token, including expansions.
// Both token lists are closed at the end.
// NOTE(review): the return statements and several closing braces are not
// visible in this listing; the value returned at each exit is therefore not
// documented here.
9166 U_CAPI UBool U_EXPORT2
9167 ucol_equals(const UCollator
*source
, const UCollator
*target
) {
9168 UErrorCode status
= U_ZERO_ERROR
;
9169 // if pointers are equal, collators are equal
9170 if(source
== target
) {
9173 int32_t i
= 0, j
= 0;
9174 // if any of attributes are different, collators are not equal
9175 for(i
= 0; i
< UCOL_ATTRIBUTE_COUNT
; i
++) {
9176 if(ucol_getAttribute(source
, (UColAttribute
)i
, &status
) != ucol_getAttribute(target
, (UColAttribute
)i
, &status
) || U_FAILURE(status
)) {
9181 int32_t sourceRulesLen
= 0, targetRulesLen
= 0;
9182 const UChar
*sourceRules
= ucol_getRules(source
, &sourceRulesLen
);
9183 const UChar
*targetRules
= ucol_getRules(target
, &targetRulesLen
);
9185 if(sourceRulesLen
== targetRulesLen
&& u_strncmp(sourceRules
, targetRules
, sourceRulesLen
) == 0) {
9186 // all the attributes are equal and the rules are equal - collators are equal
9189 // hard part, need to construct tree from rules and see if they yield the same tailoring
9190 UBool result
= TRUE
;
9191 UParseError parseError
;
9192 UColTokenParser sourceParser
, targetParser
;
9193 int32_t sourceListLen
= 0, targetListLen
= 0;
9194 ucol_tok_initTokenList(&sourceParser
, sourceRules
, sourceRulesLen
, source
->UCA
, &status
);
9195 ucol_tok_initTokenList(&targetParser
, targetRules
, targetRulesLen
, target
->UCA
, &status
);
9196 sourceListLen
= ucol_tok_assembleTokenList(&sourceParser
, &parseError
, &status
);
9197 targetListLen
= ucol_tok_assembleTokenList(&targetParser
, &parseError
, &status
);
9199 if(sourceListLen
!= targetListLen
) {
9200 // different number of resets
9203 UColToken
*sourceReset
= NULL
, *targetReset
= NULL
;
9204 UChar
*sourceResetString
= NULL
, *targetResetString
= NULL
;
9205 int32_t sourceStringLen
= 0, targetStringLen
= 0;
// For each reset in the source list, look for a reset with the same string in
// the target list. A token's `source` field is split here into an offset
// (low 24 bits) and a length (bits above 24).
9206 for(i
= 0; i
< sourceListLen
; i
++) {
9207 sourceReset
= sourceParser
.lh
[i
].reset
;
9208 sourceResetString
= sourceParser
.source
+(sourceReset
->source
& 0xFFFFFF);
9209 sourceStringLen
= sourceReset
->source
>> 24;
// The inner loop is bounded by sourceListLen; at this point the two list
// lengths have already been compared (see the sourceListLen != targetListLen
// test above).
9210 for(j
= 0; j
< sourceListLen
; j
++) {
9211 targetReset
= targetParser
.lh
[j
].reset
;
9212 targetResetString
= targetParser
.source
+(targetReset
->source
& 0xFFFFFF);
9213 targetStringLen
= targetReset
->source
>> 24;
9214 if(sourceStringLen
== targetStringLen
&& (u_strncmp(sourceResetString
, targetResetString
, sourceStringLen
) == 0)) {
// Matching reset found: walk the two token chains in parallel from `first`.
9215 sourceReset
= sourceParser
.lh
[i
].first
;
9216 targetReset
= targetParser
.lh
[j
].first
;
9217 while(sourceReset
!= NULL
&& targetReset
!= NULL
) {
9218 sourceResetString
= sourceParser
.source
+(sourceReset
->source
& 0xFFFFFF);
9219 sourceStringLen
= sourceReset
->source
>> 24;
9220 targetResetString
= targetParser
.source
+(targetReset
->source
& 0xFFFFFF);
9221 targetStringLen
= targetReset
->source
>> 24;
9222 if(sourceStringLen
!= targetStringLen
|| (u_strncmp(sourceResetString
, targetResetString
, sourceStringLen
) != 0)) {
9226 // probably also need to check the expansions
9227 if(sourceReset
->expansion
) {
9228 if(!targetReset
->expansion
) {
9232 // compare expansions
9233 sourceResetString
= sourceParser
.source
+(sourceReset
->expansion
& 0xFFFFFF);
9234 sourceStringLen
= sourceReset
->expansion
>> 24;
9235 targetResetString
= targetParser
.source
+(targetReset
->expansion
& 0xFFFFFF);
9236 targetStringLen
= targetReset
->expansion
>> 24;
9237 if(sourceStringLen
!= targetStringLen
|| (u_strncmp(sourceResetString
, targetResetString
, sourceStringLen
) != 0)) {
9243 if(targetReset
->expansion
) {
9248 sourceReset
= sourceReset
->next
;
9249 targetReset
= targetReset
->next
;
9251 if(sourceReset
!= targetReset
) { // at least one is not NULL
9252 // there are more tailored elements in one list
9261 // couldn't find the reset anchor, so the collators are not equal
9262 if(j
== sourceListLen
) {
9270 ucol_tok_closeTokenList(&sourceParser
);
9271 ucol_tok_closeTokenList(&targetParser
);
// ucol_getUCAVersion: copies sizeof(UVersionInfo) bytes of
// coll->UCA->image->UCAVersion into info. The copy is made only when both coll
// and coll->UCA are non-NULL; in the visible lines info is not written
// otherwise.
9276 U_CAPI
void U_EXPORT2
9277 ucol_getUCAVersion(const UCollator
* coll
, UVersionInfo info
) {
9278 if(coll
&& coll
->UCA
) {
9279 uprv_memcpy(info
, coll
->UCA
->image
->UCAVersion
, sizeof(UVersionInfo
));
// ucol_cloneBinary: writes a binary image of coll into buffer.
//  - When coll->hasRealData is TRUE, the image (coll->image->size bytes) is
//    copied if it fits into capacity.
//  - Otherwise a minimal image is built if it fits: a zeroed UCATableHeader
//    followed by the collator's UColOptionSet, with version fields and
//    jamoSpecial taken from coll->image.
// NOTE(review): the last parameter, the declaration of `length`, the handling
// of "length > capacity" and the return statement are not visible in this
// listing.
9283 U_CAPI
int32_t U_EXPORT2
9284 ucol_cloneBinary(const UCollator
*coll
,
9285 uint8_t *buffer
, int32_t capacity
,
9289 if(U_FAILURE(*status
)) {
9292 if(coll
->hasRealData
== TRUE
) {
9293 length
= coll
->image
->size
;
9294 if(length
<= capacity
) {
9295 uprv_memcpy(buffer
, coll
->image
, length
);
// Size of the synthesised image: padded header plus padded option set.
9298 length
= (int32_t)(paddedsize(sizeof(UCATableHeader
))+paddedsize(sizeof(UColOptionSet
)));
9299 if(length
<= capacity
) {
9300 /* build the UCATableHeader with minimal entries */
9301 /* do not copy the header from the UCA file because its values are wrong! */
9302 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
9304 /* reset everything */
9305 uprv_memset(buffer
, 0, length
);
9307 /* set the tailoring-specific values */
9308 UCATableHeader
*myData
= (UCATableHeader
*)buffer
;
9309 myData
->size
= length
;
9311 /* offset for the options, the only part of the data that is present after the header */
// NOTE(review): the offset stored here is sizeof(UCATableHeader), while the
// options are copied below at paddedsize(sizeof(UCATableHeader)); the two
// agree only if the header size needs no padding - confirm.
9312 myData
->options
= sizeof(UCATableHeader
);
9314 /* need to always set the expansion value for an upper bound of the options */
9315 myData
->expansion
= myData
->options
+ sizeof(UColOptionSet
);
9317 myData
->magic
= UCOL_HEADER_MAGIC
;
9318 myData
->isBigEndian
= U_IS_BIG_ENDIAN
;
9319 myData
->charSetFamily
= U_CHARSET_FAMILY
;
9321 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
9322 uprv_memcpy(myData
->version
, coll
->image
->version
, sizeof(UVersionInfo
));
9324 uprv_memcpy(myData
->UCAVersion
, coll
->image
->UCAVersion
, sizeof(UVersionInfo
));
9325 uprv_memcpy(myData
->UCDVersion
, coll
->image
->UCDVersion
, sizeof(UVersionInfo
));
9326 uprv_memcpy(myData
->formatVersion
, coll
->image
->formatVersion
, sizeof(UVersionInfo
));
9327 myData
->jamoSpecial
= coll
->image
->jamoSpecial
;
9329 /* copy the collator options */
9330 uprv_memcpy(buffer
+paddedsize(sizeof(UCATableHeader
)), coll
->options
, sizeof(UColOptionSet
));
// ucol_openBinary: creates a collator from a binary image bin of length bytes,
// using base as the underlying collator.
//  - *status becomes U_COLLATOR_VERSION_MISMATCH when the image's UCAVersion
//    or UCDVersion differs from base->image, or when version[0] is not
//    UCOL_BUILDER_VERSION.
//  - If length is larger than the padded header plus padded option set, the
//    collator is initialised directly on bin and hasRealData is set to TRUE.
//  - In the other visible path the collator is initialised on base->image, its
//    options are read from bin at the header's `options` offset, and
//    hasRealData is set to FALSE.
// The result does not own the image (freeImageOnClose = FALSE) and has its
// locale, rules and elements fields cleared.
// NOTE(review): the last parameter, the branch conditions around the
// "null base" and U_USELESS_COLLATOR_ERROR assignments, and the return
// statements are not visible in this listing. No check that length covers
// sizeof(UCATableHeader) before colData's fields are read is visible either.
9336 U_CAPI UCollator
* U_EXPORT2
9337 ucol_openBinary(const uint8_t *bin
, int32_t length
,
9338 const UCollator
*base
,
9341 UCollator
*result
= NULL
;
9342 if(U_FAILURE(*status
)){
9346 // we don't support null base yet
9347 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
9350 UCATableHeader
*colData
= (UCATableHeader
*)bin
;
9351 // do we want version check here? We're trying to figure out whether collators are compatible
9352 if(uprv_memcmp(colData
->UCAVersion
, base
->image
->UCAVersion
, sizeof(UVersionInfo
)) != 0 ||
9353 uprv_memcmp(colData
->UCDVersion
, base
->image
->UCDVersion
, sizeof(UVersionInfo
)) != 0 ||
9354 colData
->version
[0] != UCOL_BUILDER_VERSION
) {
9355 *status
= U_COLLATOR_VERSION_MISMATCH
;
// length is compared as unsigned against the size of a header-plus-options
// image; anything larger is treated as a full image.
9358 if((uint32_t)length
> (paddedsize(sizeof(UCATableHeader
)) + paddedsize(sizeof(UColOptionSet
)))) {
9359 result
= ucol_initCollator((const UCATableHeader
*)bin
, result
, base
, status
);
9360 if(U_FAILURE(*status
)){
9363 result
->hasRealData
= TRUE
;
// Options-only image: the collator data comes from base->image and only the
// option set is taken from bin.
9366 result
= ucol_initCollator(base
->image
, result
, base
, status
);
9367 ucol_setOptionsFromHeader(result
, (UColOptionSet
*)(bin
+((const UCATableHeader
*)bin
)->options
), status
);
9368 if(U_FAILURE(*status
)){
9371 result
->hasRealData
= FALSE
;
9373 *status
= U_USELESS_COLLATOR_ERROR
;
9377 result
->freeImageOnClose
= FALSE
;
9379 result
->validLocale
= NULL
;
9380 result
->requestedLocale
= NULL
;
9381 result
->rules
= NULL
;
9382 result
->rulesLength
= 0;
9383 result
->freeRulesOnClose
= FALSE
;
9385 result
->elements
= NULL
;
9389 #endif /* #if !UCONFIG_NO_COLLATION */