2 *******************************************************************************
3 * Copyright (C) 1996-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * tab size: 8 (not used)
11 * Modification history
13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_COLLATION
23 #include "unicode/coleitr.h"
24 #include "unicode/unorm.h"
25 #include "unicode/udata.h"
26 #include "unicode/ustring.h"
/* Shift and mask constants used for trie manipulation (added by synwee). */
#define STAGE_1_SHIFT_            10
#define STAGE_2_SHIFT_            4
#define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
#define STAGE_3_MASK_             0xF
#define LAST_BYTE_MASK_           0xFF
#define SECOND_LAST_BYTE_SHIFT_   8

#define ZERO_CC_LIMIT_            0xC0
// Static pointer to the normalizer's FCD trie index.
// It is always the same between calls to u_cleanup,
// and therefore writing to it is not synchronized.
// It is cleaned in ucol_cleanup.
static const uint16_t *fcdTrieIndex = NULL;

// These are values from UCA required for
// implicit generation and suppressing sort key compression.
// They should regularly be in the UCA, but if one
// is running without UCA, it could be a problem.
static const int32_t maxRegularPrimary  = 0xA0;
static const int32_t minImplicitPrimary = 0xE0;
static const int32_t maxImplicitPrimary = 0xE4;
72 static UBool U_CALLCONV
79 static int32_t U_CALLCONV
80 _getFoldingOffset(uint32_t data
) {
81 return (int32_t)(data
&0xFFFFFF);
87 inline void IInit_collIterate(const UCollator
*collator
, const UChar
*sourceString
,
88 int32_t sourceLen
, collIterate
*s
)
90 (s
)->string
= (s
)->pos
= (UChar
*)(sourceString
);
94 s
->flags
|= UCOL_ITER_HASLEN
;
95 (s
)->endp
= (UChar
*)sourceString
+sourceLen
;
98 /* change to enable easier checking for end of string for fcdpositon */
101 (s
)->extendCEs
= NULL
;
102 (s
)->extendCEsSize
= 0;
103 (s
)->CEpos
= (s
)->toReturn
= (s
)->CEs
;
104 (s
)->offsetBuffer
= NULL
;
105 (s
)->offsetBufferSize
= 0;
106 (s
)->offsetReturn
= (s
)->offsetStore
= NULL
;
107 (s
)->offsetRepeatCount
= (s
)->offsetRepeatValue
= 0;
108 (s
)->writableBuffer
= (s
)->stackWritableBuffer
;
109 (s
)->writableBufSize
= UCOL_WRITABLE_BUFFER_SIZE
;
110 (s
)->coll
= (collator
);
111 (s
)->fcdPosition
= 0;
112 if(collator
->normalizationMode
== UCOL_ON
) {
113 (s
)->flags
|= UCOL_ITER_NORM
;
115 if(collator
->hiraganaQ
== UCOL_ON
&& collator
->strength
>= UCOL_QUATERNARY
) {
116 (s
)->flags
|= UCOL_HIRAGANA_Q
;
118 (s
)->iterator
= NULL
;
119 //(s)->iteratorIndex = 0;
122 U_CAPI
void U_EXPORT2
123 uprv_init_collIterate(const UCollator
*collator
, const UChar
*sourceString
,
124 int32_t sourceLen
, collIterate
*s
){
125 /* Out-of-line version for use from other files. */
126 IInit_collIterate(collator
, sourceString
, sourceLen
, s
);
131 * Backup the state of the collIterate struct data
132 * @param data collIterate to backup
133 * @param backup storage
136 inline void backupState(const collIterate
*data
, collIterateState
*backup
)
138 backup
->fcdPosition
= data
->fcdPosition
;
139 backup
->flags
= data
->flags
;
140 backup
->origFlags
= data
->origFlags
;
141 backup
->pos
= data
->pos
;
142 backup
->bufferaddress
= data
->writableBuffer
;
143 backup
->buffersize
= data
->writableBufSize
;
144 backup
->iteratorMove
= 0;
145 backup
->iteratorIndex
= 0;
146 if(data
->iterator
!= NULL
) {
147 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
148 backup
->iteratorIndex
= data
->iterator
->getState(data
->iterator
);
149 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
150 if(backup
->iteratorIndex
== UITER_NO_STATE
) {
151 while((backup
->iteratorIndex
= data
->iterator
->getState(data
->iterator
)) == UITER_NO_STATE
) {
152 backup
->iteratorMove
++;
153 data
->iterator
->move(data
->iterator
, -1, UITER_CURRENT
);
155 data
->iterator
->move(data
->iterator
, backup
->iteratorMove
, UITER_CURRENT
);
161 * Loads the state into the collIterate struct data
162 * @param data collIterate to backup
163 * @param backup storage
164 * @param forwards boolean to indicate if forwards iteration is used,
165 * false indicates backwards iteration
168 inline void loadState(collIterate
*data
, const collIterateState
*backup
,
171 UErrorCode status
= U_ZERO_ERROR
;
172 data
->flags
= backup
->flags
;
173 data
->origFlags
= backup
->origFlags
;
174 if(data
->iterator
!= NULL
) {
175 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
176 data
->iterator
->setState(data
->iterator
, backup
->iteratorIndex
, &status
);
177 if(backup
->iteratorMove
!= 0) {
178 data
->iterator
->move(data
->iterator
, backup
->iteratorMove
, UITER_CURRENT
);
181 data
->pos
= backup
->pos
;
183 if ((data
->flags
& UCOL_ITER_INNORMBUF
) &&
184 data
->writableBuffer
!= backup
->bufferaddress
) {
186 this is when a new buffer has been reallocated and we'll have to
187 calculate the new position.
188 note the new buffer has to contain the contents of the old buffer.
191 data
->pos
= data
->writableBuffer
+
192 (data
->pos
- backup
->bufferaddress
);
195 /* backwards direction */
196 uint32_t temp
= backup
->buffersize
-
197 (data
->pos
- backup
->bufferaddress
);
198 data
->pos
= data
->writableBuffer
+ (data
->writableBufSize
- temp
);
201 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
203 this is alittle tricky.
204 if we are initially not in the normalization buffer, even if we
205 normalize in the later stage, the data in the buffer will be
206 ignored, since we skip back up to the data string.
207 however if we are already in the normalization buffer, any
208 further normalization will pull data into the normalization
209 buffer and modify the fcdPosition.
210 since we are keeping the data in the buffer for use, the
211 fcdPosition can not be reverted back.
214 data
->fcdPosition
= backup
->fcdPosition
;
221 * Checks for a collIterate being positioned at the end of
226 inline UBool
collIter_eos(collIterate
*s
) {
227 if(s
->flags
& UCOL_USE_ITERATOR
) {
228 return !(s
->iterator
->hasNext(s
->iterator
));
230 if ((s
->flags
& UCOL_ITER_HASLEN
) == 0 && *s
->pos
!= 0) {
231 // Null terminated string, but not at null, so not at end.
232 // Whether in main or normalization buffer doesn't matter.
236 // String with length. Can't be in normalization buffer, which is always
238 if (s
->flags
& UCOL_ITER_HASLEN
) {
239 return (s
->pos
== s
->endp
);
242 // We are at a null termination, could be either normalization buffer or main string.
243 if ((s
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
244 // At null at end of main string.
248 // At null at end of normalization buffer. Need to check whether there there are
249 // any characters left in the main buffer.
250 if(s
->origFlags
& UCOL_USE_ITERATOR
) {
251 return !(s
->iterator
->hasNext(s
->iterator
));
252 } else if ((s
->origFlags
& UCOL_ITER_HASLEN
) == 0) {
253 // Null terminated main string. fcdPosition is the 'return' position into main buf.
254 return (*s
->fcdPosition
== 0);
257 // Main string with an end pointer.
258 return s
->fcdPosition
== s
->endp
;
264 * Checks for a collIterate being positioned at the start of
269 inline UBool
collIter_bos(collIterate
*source
) {
270 // if we're going backwards, we need to know whether there is more in the
271 // iterator, even if we are in the side buffer
272 if(source
->flags
& UCOL_USE_ITERATOR
|| source
->origFlags
& UCOL_USE_ITERATOR
) {
273 return !source
->iterator
->hasPrevious(source
->iterator
);
275 if (source
->pos
<= source
->string
||
276 ((source
->flags
& UCOL_ITER_INNORMBUF
) &&
277 *(source
->pos
- 1) == 0 && source
->fcdPosition
== NULL
)) {
284 inline UBool collIter_SimpleBos(collIterate *source) {
285 // if we're going backwards, we need to know whether there is more in the
286 // iterator, even if we are in the side buffer
287 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
288 return !source->iterator->hasPrevious(source->iterator);
290 if (source->pos == source->string) {
295 //return (data->pos == data->string) ||
299 * Checks and free writable buffer if it is not the original stack buffer
300 * in collIterate. This function does not reassign the writable buffer.
301 * @param data collIterate struct to determine and free the writable buffer
304 inline void freeHeapWritableBuffer(collIterate
*data
)
306 if (data
->writableBuffer
!= data
->stackWritableBuffer
) {
307 uprv_free(data
->writableBuffer
);
312 /****************************************************************************/
313 /* Following are the open/close functions */
315 /****************************************************************************/
318 ucol_initFromBinary(const uint8_t *bin
, int32_t length
,
319 const UCollator
*base
,
323 UCollator
*result
= fillIn
;
324 if(U_FAILURE(*status
)) {
329 // we don't support null base yet
330 *status = U_ILLEGAL_ARGUMENT_ERROR;
334 // We need these and we could be running without UCA
335 uprv_uca_initImplicitConstants(status
);
336 UCATableHeader
*colData
= (UCATableHeader
*)bin
;
337 // do we want version check here? We're trying to figure out whether collators are compatible
338 if((base
&& (uprv_memcmp(colData
->UCAVersion
, base
->image
->UCAVersion
, sizeof(UVersionInfo
)) != 0 ||
339 uprv_memcmp(colData
->UCDVersion
, base
->image
->UCDVersion
, sizeof(UVersionInfo
)) != 0)) ||
340 colData
->version
[0] != UCOL_BUILDER_VERSION
)
342 *status
= U_COLLATOR_VERSION_MISMATCH
;
346 if((uint32_t)length
> (paddedsize(sizeof(UCATableHeader
)) + paddedsize(sizeof(UColOptionSet
)))) {
347 result
= ucol_initCollator((const UCATableHeader
*)bin
, result
, base
, status
);
348 if(U_FAILURE(*status
)){
351 result
->hasRealData
= TRUE
;
355 result
= ucol_initCollator(base
->image
, result
, base
, status
);
356 ucol_setOptionsFromHeader(result
, (UColOptionSet
*)(bin
+((const UCATableHeader
*)bin
)->options
), status
);
357 if(U_FAILURE(*status
)){
360 result
->hasRealData
= FALSE
;
363 *status
= U_USELESS_COLLATOR_ERROR
;
367 result
->freeImageOnClose
= FALSE
;
369 result
->actualLocale
= NULL
;
370 result
->validLocale
= NULL
;
371 result
->requestedLocale
= NULL
;
372 result
->rules
= NULL
;
373 result
->rulesLength
= 0;
374 result
->freeRulesOnClose
= FALSE
;
375 result
->ucaRules
= NULL
;
379 U_CAPI UCollator
* U_EXPORT2
380 ucol_openBinary(const uint8_t *bin
, int32_t length
,
381 const UCollator
*base
,
384 return ucol_initFromBinary(bin
, length
, base
, NULL
, status
);
387 U_CAPI
int32_t U_EXPORT2
388 ucol_cloneBinary(const UCollator
*coll
,
389 uint8_t *buffer
, int32_t capacity
,
393 if(U_FAILURE(*status
)) {
397 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
400 if(coll
->hasRealData
== TRUE
) {
401 length
= coll
->image
->size
;
402 if(length
<= capacity
) {
403 uprv_memcpy(buffer
, coll
->image
, length
);
405 *status
= U_BUFFER_OVERFLOW_ERROR
;
408 length
= (int32_t)(paddedsize(sizeof(UCATableHeader
))+paddedsize(sizeof(UColOptionSet
)));
409 if(length
<= capacity
) {
410 /* build the UCATableHeader with minimal entries */
411 /* do not copy the header from the UCA file because its values are wrong! */
412 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
414 /* reset everything */
415 uprv_memset(buffer
, 0, length
);
417 /* set the tailoring-specific values */
418 UCATableHeader
*myData
= (UCATableHeader
*)buffer
;
419 myData
->size
= length
;
421 /* offset for the options, the only part of the data that is present after the header */
422 myData
->options
= sizeof(UCATableHeader
);
424 /* need to always set the expansion value for an upper bound of the options */
425 myData
->expansion
= myData
->options
+ sizeof(UColOptionSet
);
427 myData
->magic
= UCOL_HEADER_MAGIC
;
428 myData
->isBigEndian
= U_IS_BIG_ENDIAN
;
429 myData
->charSetFamily
= U_CHARSET_FAMILY
;
431 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
432 uprv_memcpy(myData
->version
, coll
->image
->version
, sizeof(UVersionInfo
));
434 uprv_memcpy(myData
->UCAVersion
, coll
->image
->UCAVersion
, sizeof(UVersionInfo
));
435 uprv_memcpy(myData
->UCDVersion
, coll
->image
->UCDVersion
, sizeof(UVersionInfo
));
436 uprv_memcpy(myData
->formatVersion
, coll
->image
->formatVersion
, sizeof(UVersionInfo
));
437 myData
->jamoSpecial
= coll
->image
->jamoSpecial
;
439 /* copy the collator options */
440 uprv_memcpy(buffer
+paddedsize(sizeof(UCATableHeader
)), coll
->options
, sizeof(UColOptionSet
));
442 *status
= U_BUFFER_OVERFLOW_ERROR
;
448 U_CAPI UCollator
* U_EXPORT2
449 ucol_safeClone(const UCollator
*coll
, void *stackBuffer
, int32_t * pBufferSize
, UErrorCode
*status
)
451 UCollator
* localCollator
;
452 int32_t bufferSizeNeeded
= (int32_t)sizeof(UCollator
);
453 char *stackBufferChars
= (char *)stackBuffer
;
454 int32_t imageSize
= 0;
455 int32_t rulesSize
= 0;
456 int32_t rulesPadding
= 0;
459 UBool colAllocated
= FALSE
;
460 UBool imageAllocated
= FALSE
;
462 if (status
== NULL
|| U_FAILURE(*status
)){
465 if ((stackBuffer
&& !pBufferSize
) || !coll
){
466 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
469 if (coll
->rules
&& coll
->freeRulesOnClose
) {
470 rulesSize
= (int32_t)(coll
->rulesLength
+ 1)*sizeof(UChar
);
471 rulesPadding
= (int32_t)(bufferSizeNeeded
% sizeof(UChar
));
472 bufferSizeNeeded
+= rulesSize
+ rulesPadding
;
475 if (stackBuffer
&& *pBufferSize
<= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
476 *pBufferSize
= bufferSizeNeeded
;
480 /* Pointers on 64-bit platforms need to be aligned
481 * on a 64-bit boundry in memory.
483 if (U_ALIGNMENT_OFFSET(stackBuffer
) != 0) {
484 int32_t offsetUp
= (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars
);
485 if (*pBufferSize
> offsetUp
) {
486 *pBufferSize
-= offsetUp
;
487 stackBufferChars
+= offsetUp
;
490 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
494 stackBuffer
= (void *)stackBufferChars
;
496 if (stackBuffer
== NULL
|| *pBufferSize
< bufferSizeNeeded
) {
497 /* allocate one here...*/
498 stackBufferChars
= (char *)uprv_malloc(bufferSizeNeeded
);
499 // Null pointer check.
500 if (stackBufferChars
== NULL
) {
501 *status
= U_MEMORY_ALLOCATION_ERROR
;
505 if (U_SUCCESS(*status
)) {
506 *status
= U_SAFECLONE_ALLOCATED_WARNING
;
509 localCollator
= (UCollator
*)stackBufferChars
;
510 rules
= (UChar
*)(stackBufferChars
+ sizeof(UCollator
) + rulesPadding
);
512 UErrorCode tempStatus
= U_ZERO_ERROR
;
513 imageSize
= ucol_cloneBinary(coll
, NULL
, 0, &tempStatus
);
515 if (coll
->freeImageOnClose
) {
516 image
= (uint8_t *)uprv_malloc(imageSize
);
517 // Null pointer check
519 *status
= U_MEMORY_ALLOCATION_ERROR
;
522 ucol_cloneBinary(coll
, image
, imageSize
, status
);
523 imageAllocated
= TRUE
;
526 image
= (uint8_t *)coll
->image
;
528 localCollator
= ucol_initFromBinary(image
, imageSize
, coll
->UCA
, localCollator
, status
);
529 if (U_FAILURE(*status
)) {
534 if (coll
->freeRulesOnClose
) {
535 localCollator
->rules
= u_strcpy(rules
, coll
->rules
);
536 //bufferEnd += rulesSize;
539 localCollator
->rules
= coll
->rules
;
541 localCollator
->freeRulesOnClose
= FALSE
;
542 localCollator
->rulesLength
= coll
->rulesLength
;
546 for(i
= 0; i
< UCOL_ATTRIBUTE_COUNT
; i
++) {
547 ucol_setAttribute(localCollator
, (UColAttribute
)i
, ucol_getAttribute(coll
, (UColAttribute
)i
, status
), status
);
549 // zero copies of pointers
550 localCollator
->actualLocale
= NULL
;
551 localCollator
->validLocale
= NULL
;
552 localCollator
->requestedLocale
= NULL
;
553 localCollator
->ucaRules
= coll
->ucaRules
; // There should only be one copy here.
554 localCollator
->freeOnClose
= colAllocated
;
555 localCollator
->freeImageOnClose
= imageAllocated
;
556 return localCollator
;
559 U_CAPI
void U_EXPORT2
560 ucol_close(UCollator
*coll
)
562 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE
);
563 UTRACE_DATA1(UTRACE_INFO
, "coll = %p", coll
);
565 // these are always owned by each UCollator struct,
566 // so we always free them
567 if(coll
->validLocale
!= NULL
) {
568 uprv_free(coll
->validLocale
);
570 if(coll
->actualLocale
!= NULL
) {
571 uprv_free(coll
->actualLocale
);
573 if(coll
->requestedLocale
!= NULL
) {
574 uprv_free(coll
->requestedLocale
);
576 if(coll
->latinOneCEs
!= NULL
) {
577 uprv_free(coll
->latinOneCEs
);
579 if(coll
->options
!= NULL
&& coll
->freeOptionsOnClose
) {
580 uprv_free(coll
->options
);
582 if(coll
->rules
!= NULL
&& coll
->freeRulesOnClose
) {
583 uprv_free((UChar
*)coll
->rules
);
585 if(coll
->image
!= NULL
&& coll
->freeImageOnClose
) {
586 uprv_free((UCATableHeader
*)coll
->image
);
589 /* Here, it would be advisable to close: */
590 /* - UData for UCA (unless we stuff it in the root resb */
591 /* Again, do we need additional housekeeping... HMMM! */
592 UTRACE_DATA1(UTRACE_INFO
, "coll->freeOnClose: %d", coll
->freeOnClose
);
593 if(coll
->freeOnClose
){
594 /* for safeClone, if freeOnClose is FALSE,
595 don't free the other instance data */
602 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
603 /* you should be able to get the binary chunk to write out... Doesn't look very full now */
604 U_CFUNC
uint8_t* U_EXPORT2
605 ucol_cloneRuleData(const UCollator
*coll
, int32_t *length
, UErrorCode
*status
)
607 uint8_t *result
= NULL
;
608 if(U_FAILURE(*status
)) {
611 if(coll
->hasRealData
== TRUE
) {
612 *length
= coll
->image
->size
;
613 result
= (uint8_t *)uprv_malloc(*length
);
615 if (result
== NULL
) {
616 *status
= U_MEMORY_ALLOCATION_ERROR
;
619 uprv_memcpy(result
, coll
->image
, *length
);
621 *length
= (int32_t)(paddedsize(sizeof(UCATableHeader
))+paddedsize(sizeof(UColOptionSet
)));
622 result
= (uint8_t *)uprv_malloc(*length
);
624 if (result
== NULL
) {
625 *status
= U_MEMORY_ALLOCATION_ERROR
;
629 /* build the UCATableHeader with minimal entries */
630 /* do not copy the header from the UCA file because its values are wrong! */
631 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
633 /* reset everything */
634 uprv_memset(result
, 0, *length
);
636 /* set the tailoring-specific values */
637 UCATableHeader
*myData
= (UCATableHeader
*)result
;
638 myData
->size
= *length
;
640 /* offset for the options, the only part of the data that is present after the header */
641 myData
->options
= sizeof(UCATableHeader
);
643 /* need to always set the expansion value for an upper bound of the options */
644 myData
->expansion
= myData
->options
+ sizeof(UColOptionSet
);
646 myData
->magic
= UCOL_HEADER_MAGIC
;
647 myData
->isBigEndian
= U_IS_BIG_ENDIAN
;
648 myData
->charSetFamily
= U_CHARSET_FAMILY
;
650 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
651 uprv_memcpy(myData
->version
, coll
->image
->version
, sizeof(UVersionInfo
));
653 uprv_memcpy(myData
->UCAVersion
, coll
->image
->UCAVersion
, sizeof(UVersionInfo
));
654 uprv_memcpy(myData
->UCDVersion
, coll
->image
->UCDVersion
, sizeof(UVersionInfo
));
655 uprv_memcpy(myData
->formatVersion
, coll
->image
->formatVersion
, sizeof(UVersionInfo
));
656 myData
->jamoSpecial
= coll
->image
->jamoSpecial
;
658 /* copy the collator options */
659 uprv_memcpy(result
+paddedsize(sizeof(UCATableHeader
)), coll
->options
, sizeof(UColOptionSet
));
664 void ucol_setOptionsFromHeader(UCollator
* result
, UColOptionSet
* opts
, UErrorCode
*status
) {
665 if(U_FAILURE(*status
)) {
668 result
->caseFirst
= (UColAttributeValue
)opts
->caseFirst
;
669 result
->caseLevel
= (UColAttributeValue
)opts
->caseLevel
;
670 result
->frenchCollation
= (UColAttributeValue
)opts
->frenchCollation
;
671 result
->normalizationMode
= (UColAttributeValue
)opts
->normalizationMode
;
672 result
->strength
= (UColAttributeValue
)opts
->strength
;
673 result
->variableTopValue
= opts
->variableTopValue
;
674 result
->alternateHandling
= (UColAttributeValue
)opts
->alternateHandling
;
675 result
->hiraganaQ
= (UColAttributeValue
)opts
->hiraganaQ
;
676 result
->numericCollation
= (UColAttributeValue
)opts
->numericCollation
;
678 result
->caseFirstisDefault
= TRUE
;
679 result
->caseLevelisDefault
= TRUE
;
680 result
->frenchCollationisDefault
= TRUE
;
681 result
->normalizationModeisDefault
= TRUE
;
682 result
->strengthisDefault
= TRUE
;
683 result
->variableTopValueisDefault
= TRUE
;
684 result
->hiraganaQisDefault
= TRUE
;
685 result
->numericCollationisDefault
= TRUE
;
687 ucol_updateInternalState(result
, status
);
689 result
->options
= opts
;
694 * Approximate determination if a character is at a contraction end.
695 * Guaranteed to be TRUE if a character is at the end of a contraction,
696 * otherwise it is not deterministic.
697 * @param c character to be determined
698 * @param coll collator
701 inline UBool
ucol_contractionEndCP(UChar c
, const UCollator
*coll
) {
702 if (c
< coll
->minContrEndCP
) {
708 if (hash
>= UCOL_UNSAFECP_TABLE_SIZE
*8) {
709 if (U16_IS_TRAIL(c
)) {
712 hash
= (hash
& UCOL_UNSAFECP_TABLE_MASK
) + 256;
714 htbyte
= coll
->contrEndCP
[hash
>>3];
715 return (((htbyte
>> (hash
& 7)) & 1) == 1);
721 * i_getCombiningClass()
722 * A fast, at least partly inline version of u_getCombiningClass()
723 * This is a candidate for further optimization. Used heavily
724 * in contraction processing.
727 inline uint8_t i_getCombiningClass(UChar32 c
, const UCollator
*coll
) {
729 if ((c
>= 0x300 && ucol_unsafeCP(c
, coll
)) || c
> 0xFFFF) {
730 sCC
= u_getCombiningClass(c
);
735 UCollator
* ucol_initCollator(const UCATableHeader
*image
, UCollator
*fillIn
, const UCollator
*UCA
, UErrorCode
*status
) {
737 UCollator
*result
= fillIn
;
738 if(U_FAILURE(*status
) || image
== NULL
) {
743 result
= (UCollator
*)uprv_malloc(sizeof(UCollator
));
745 *status
= U_MEMORY_ALLOCATION_ERROR
;
748 result
->freeOnClose
= TRUE
;
750 result
->freeOnClose
= FALSE
;
754 if (fcdTrieIndex
== NULL
) {
755 // The result is constant, until the library is reloaded.
756 fcdTrieIndex
= unorm_getFCDTrie(status
);
757 ucln_i18n_registerCleanup(UCLN_I18N_UCOL
, ucol_cleanup
);
760 result
->image
= image
;
761 result
->mapping
.getFoldingOffset
= _getFoldingOffset
;
762 const uint8_t *mapping
= (uint8_t*)result
->image
+result
->image
->mappingPosition
;
763 utrie_unserialize(&result
->mapping
, mapping
, result
->image
->endExpansionCE
- result
->image
->mappingPosition
, status
);
764 if(U_FAILURE(*status
)) {
765 if(result
->freeOnClose
== TRUE
) {
772 /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
773 result
->latinOneMapping
= UTRIE_GET32_LATIN1(&result
->mapping
);
774 result
->contractionCEs
= (uint32_t*)((uint8_t*)result
->image
+result
->image
->contractionCEs
);
775 result
->contractionIndex
= (UChar
*)((uint8_t*)result
->image
+result
->image
->contractionIndex
);
776 result
->expansion
= (uint32_t*)((uint8_t*)result
->image
+result
->image
->expansion
);
778 result
->options
= (UColOptionSet
*)((uint8_t*)result
->image
+result
->image
->options
);
779 result
->freeOptionsOnClose
= FALSE
;
782 result
->caseFirst
= (UColAttributeValue
)result
->options
->caseFirst
;
783 result
->caseLevel
= (UColAttributeValue
)result
->options
->caseLevel
;
784 result
->frenchCollation
= (UColAttributeValue
)result
->options
->frenchCollation
;
785 result
->normalizationMode
= (UColAttributeValue
)result
->options
->normalizationMode
;
786 result
->strength
= (UColAttributeValue
)result
->options
->strength
;
787 result
->variableTopValue
= result
->options
->variableTopValue
;
788 result
->alternateHandling
= (UColAttributeValue
)result
->options
->alternateHandling
;
789 result
->hiraganaQ
= (UColAttributeValue
)result
->options
->hiraganaQ
;
790 result
->numericCollation
= (UColAttributeValue
)result
->options
->numericCollation
;
792 result
->caseFirstisDefault
= TRUE
;
793 result
->caseLevelisDefault
= TRUE
;
794 result
->frenchCollationisDefault
= TRUE
;
795 result
->normalizationModeisDefault
= TRUE
;
796 result
->strengthisDefault
= TRUE
;
797 result
->variableTopValueisDefault
= TRUE
;
798 result
->alternateHandlingisDefault
= TRUE
;
799 result
->hiraganaQisDefault
= TRUE
;
800 result
->numericCollationisDefault
= TRUE
;
802 /*result->scriptOrder = NULL;*/
804 result
->rules
= NULL
;
805 result
->rulesLength
= 0;
806 result
->freeRulesOnClose
= FALSE
;
808 /* get the version info from UCATableHeader and populate the Collator struct*/
809 result
->dataVersion
[0] = result
->image
->version
[0]; /* UCA Builder version*/
810 result
->dataVersion
[1] = result
->image
->version
[1]; /* UCA Tailoring rules version*/
811 result
->dataVersion
[2] = 0;
812 result
->dataVersion
[3] = 0;
814 result
->unsafeCP
= (uint8_t *)result
->image
+ result
->image
->unsafeCP
;
815 result
->minUnsafeCP
= 0;
816 for (c
=0; c
<0x300; c
++) { // Find the smallest unsafe char.
817 if (ucol_unsafeCP(c
, result
)) break;
819 result
->minUnsafeCP
= c
;
821 result
->contrEndCP
= (uint8_t *)result
->image
+ result
->image
->contrEndCP
;
822 result
->minContrEndCP
= 0;
823 for (c
=0; c
<0x300; c
++) { // Find the Contraction-ending char.
824 if (ucol_contractionEndCP(c
, result
)) break;
826 result
->minContrEndCP
= c
;
828 /* max expansion tables */
829 result
->endExpansionCE
= (uint32_t*)((uint8_t*)result
->image
+
830 result
->image
->endExpansionCE
);
831 result
->lastEndExpansionCE
= result
->endExpansionCE
+
832 result
->image
->endExpansionCECount
- 1;
833 result
->expansionCESize
= (uint8_t*)result
->image
+
834 result
->image
->expansionCESize
;
837 //result->errorCode = *status;
839 result
->latinOneCEs
= NULL
;
841 result
->latinOneRegenTable
= FALSE
;
842 result
->latinOneFailed
= FALSE
;
845 ucol_updateInternalState(result
, status
);
847 /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
848 result
->ucaRules
= NULL
;
849 result
->actualLocale
= NULL
;
850 result
->validLocale
= NULL
;
851 result
->requestedLocale
= NULL
;
852 result
->hasRealData
= FALSE
; // real data lives in .dat file...
853 result
->freeImageOnClose
= FALSE
;
858 /* new Mark's code */
861 * For generation of Implicit CEs
864 * Cleaned up so that changes can be made more easily.
866 # First Implicit: E26A792D
867 # Last Implicit: E3DC70C0
868 # First CJK: E0030300
870 # First CJK_A: E0A9DF00
871 # Last CJK_A: E0DE3100
873 /* Following is a port of Mark's code for new treatment of implicits.
874 * It is positioned here, since ucol_initUCA need to initialize the
875 * variables below according to the data in the fractional UCA.
880 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
881 * b) bump any non-CJK characters by 10FFFF.
882 * The relevant blocks are:
883 * A: 4E00..9FFF; CJK Unified Ideographs
884 * F900..FAFF; CJK Compatibility Ideographs
885 * B: 3400..4DBF; CJK Unified Ideographs Extension A
886 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
888 * no new B characters are allocated between 4E00 and FAFF, and
889 * no new A characters are outside of this range,
890 * (very high probability) this simple code will work.
891 * The reordered blocks are:
893 * Block2 is CJK_COMPAT_USED
896 * Any other CJK gets its normal code point
897 * Any non-CJK gets +10FFFF
898 * When we reorder Block1, we make sure that it is at the very start,
899 * so that it will use a 3-byte form.
900 * Warning: we only pick up the compatibility characters that are
901 * NOT decomposed, so that block is smaller!
906 NON_CJK_OFFSET
= 0x110000,
907 UCOL_MAX_INPUT
= 0x220001; // 2 * Unicode range + 2
910 * Precomputed by constructor
913 final3Multiplier
= 0,
914 final4Multiplier
= 0,
929 CJK_LIMIT
= 0x9FFF+1,
930 CJK_COMPAT_USED_BASE
= 0xFA0E,
931 CJK_COMPAT_USED_LIMIT
= 0xFA2F+1,
933 CJK_A_LIMIT
= 0x4DBF+1,
934 CJK_B_BASE
= 0x20000,
935 CJK_B_LIMIT
= 0x2A6DF+1;
937 static UChar32
swapCJK(UChar32 i
) {
940 if (i
< CJK_LIMIT
) return i
- CJK_BASE
;
942 if (i
< CJK_COMPAT_USED_BASE
) return i
+ NON_CJK_OFFSET
;
944 if (i
< CJK_COMPAT_USED_LIMIT
) return i
- CJK_COMPAT_USED_BASE
945 + (CJK_LIMIT
- CJK_BASE
);
946 if (i
< CJK_B_BASE
) return i
+ NON_CJK_OFFSET
;
948 if (i
< CJK_B_LIMIT
) return i
; // non-BMP-CJK
950 return i
+ NON_CJK_OFFSET
; // non-CJK
952 if (i
< CJK_A_BASE
) return i
+ NON_CJK_OFFSET
;
954 if (i
< CJK_A_LIMIT
) return i
- CJK_A_BASE
955 + (CJK_LIMIT
- CJK_BASE
)
956 + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
);
957 return i
+ NON_CJK_OFFSET
; // non-CJK
960 U_CAPI UChar32 U_EXPORT2
961 uprv_uca_getRawFromCodePoint(UChar32 i
) {
965 U_CAPI UChar32 U_EXPORT2
966 uprv_uca_getCodePointFromRaw(UChar32 i
) {
969 if(i
>= NON_CJK_OFFSET
) {
970 result
= i
- NON_CJK_OFFSET
;
971 } else if(i
>= CJK_B_BASE
) {
973 } else if(i
< CJK_A_LIMIT
+ (CJK_LIMIT
- CJK_BASE
) + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
)) { // rest of CJKs, compacted
974 if(i
< CJK_LIMIT
- CJK_BASE
) {
975 result
= i
+ CJK_BASE
;
976 } else if(i
< (CJK_LIMIT
- CJK_BASE
) + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
)) {
977 result
= i
+ CJK_COMPAT_USED_BASE
- (CJK_LIMIT
- CJK_BASE
);
979 result
= i
+ CJK_A_BASE
- (CJK_LIMIT
- CJK_BASE
) - (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
);
987 // GET IMPLICIT PRIMARY WEIGHTS
988 // Return value is left justified primary key
989 U_CAPI
uint32_t U_EXPORT2
990 uprv_uca_getImplicitFromRaw(UChar32 cp
) {
992 if (cp < 0 || cp > UCOL_MAX_INPUT) {
993 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
996 int32_t last0
= cp
- min4Boundary
;
998 int32_t last1
= cp
/ final3Count
;
999 last0
= cp
% final3Count
;
1001 int32_t last2
= last1
/ medialCount
;
1002 last1
%= medialCount
;
1004 last0
= minTrail
+ last0
*final3Multiplier
; // spread out, leaving gap at start
1005 last1
= minTrail
+ last1
; // offset
1006 last2
= min3Primary
+ last2
; // offset
1008 if (last2 >= min4Primary) {
1009 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1012 return (last2
<< 24) + (last1
<< 16) + (last0
<< 8);
1014 int32_t last1
= last0
/ final4Count
;
1015 last0
%= final4Count
;
1017 int32_t last2
= last1
/ medialCount
;
1018 last1
%= medialCount
;
1020 int32_t last3
= last2
/ medialCount
;
1021 last2
%= medialCount
;
1023 last0
= minTrail
+ last0
*final4Multiplier
; // spread out, leaving gap at start
1024 last1
= minTrail
+ last1
; // offset
1025 last2
= minTrail
+ last2
; // offset
1026 last3
= min4Primary
+ last3
; // offset
1028 if (last3 > max4Primary) {
1029 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1032 return (last3
<< 24) + (last2
<< 16) + (last1
<< 8) + last0
;
1036 static uint32_t U_EXPORT2
1037 uprv_uca_getImplicitPrimary(UChar32 cp
) {
1038 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1042 // we now have a range of numbers from 0 to 21FFFF.
1044 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1046 return uprv_uca_getImplicitFromRaw(cp
);
1050 * Converts implicit CE into raw integer ("code point")
1052 * @return -1 if illegal format
1054 U_CAPI UChar32 U_EXPORT2
1055 uprv_uca_getRawFromImplicit(uint32_t implicit
) {
1057 UChar32 b3
= implicit
& 0xFF;
1058 UChar32 b2
= (implicit
>> 8) & 0xFF;
1059 UChar32 b1
= (implicit
>> 16) & 0xFF;
1060 UChar32 b0
= (implicit
>> 24) & 0xFF;
1062 // simple parameter checks
1063 if (b0
< min3Primary
|| b0
> max4Primary
1064 || b1
< minTrail
|| b1
> maxTrail
)
1069 // take care of the final values, and compose
1070 if (b0
< min4Primary
) {
1071 if (b2
< minTrail
|| b2
> max3Trail
|| b3
!= 0)
1074 UChar32 remainder
= b2
% final3Multiplier
;
1078 b2
/= final3Multiplier
;
1079 result
= ((b0
* medialCount
) + b1
) * final3Count
+ b2
;
1081 if (b2
< minTrail
|| b2
> maxTrail
1082 || b3
< minTrail
|| b3
> max4Trail
)
1086 UChar32 remainder
= b3
% final4Multiplier
;
1089 b3
/= final4Multiplier
;
1091 result
= (((b0
* medialCount
) + b1
) * medialCount
+ b2
) * final4Count
+ b3
+ min4Boundary
;
1094 if (result
< 0 || result
> UCOL_MAX_INPUT
)
1100 static inline int32_t divideAndRoundUp(int a
, int b
) {
1104 /* this function is either called from initUCA or from genUCA before
1105 * doing canonical closure for the UCA.
1109 * Set up to generate implicits.
1112 * @param minTrail final byte
1113 * @param maxTrail final byte
1114 * @param gap3 the gap we leave for tailoring for 3-byte forms
1115 * @param gap4 the gap we leave for tailoring for 4-byte forms
1117 static void initImplicitConstants(int minPrimary
, int maxPrimary
,
1118 int minTrailIn
, int maxTrailIn
,
1119 int gap3
, int primaries3count
,
1120 UErrorCode
*status
) {
1121 // some simple parameter checks
1122 if ((minPrimary
< 0 || minPrimary
>= maxPrimary
|| maxPrimary
> 0xFF)
1123 || (minTrailIn
< 0 || minTrailIn
>= maxTrailIn
|| maxTrailIn
> 0xFF)
1124 || (primaries3count
< 1))
1126 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1130 minTrail
= minTrailIn
;
1131 maxTrail
= maxTrailIn
;
1133 min3Primary
= minPrimary
;
1134 max4Primary
= maxPrimary
;
1135 // compute constants for use later.
1136 // number of values we can use in trailing bytes
1137 // leave room for empty values between AND above, e.g. if gap = 2
1138 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1139 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1140 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1141 final3Multiplier
= gap3
+ 1;
1142 final3Count
= (maxTrail
- minTrail
+ 1) / final3Multiplier
;
1143 max3Trail
= minTrail
+ (final3Count
- 1) * final3Multiplier
;
1145 // medials can use full range
1146 medialCount
= (maxTrail
- minTrail
+ 1);
1147 // find out how many values fit in each form
1148 int32_t threeByteCount
= medialCount
* final3Count
;
1149 // now determine where the 3/4 boundary is.
1150 // we use 3 bytes below the boundary, and 4 above
1151 int32_t primariesAvailable
= maxPrimary
- minPrimary
+ 1;
1152 int32_t primaries4count
= primariesAvailable
- primaries3count
;
1155 int32_t min3ByteCoverage
= primaries3count
* threeByteCount
;
1156 min4Primary
= minPrimary
+ primaries3count
;
1157 min4Boundary
= min3ByteCoverage
;
1158 // Now expand out the multiplier for the 4 bytes, and redo.
1160 int32_t totalNeeded
= UCOL_MAX_INPUT
- min4Boundary
;
1161 int32_t neededPerPrimaryByte
= divideAndRoundUp(totalNeeded
, primaries4count
);
1162 int32_t neededPerFinalByte
= divideAndRoundUp(neededPerPrimaryByte
, medialCount
* medialCount
);
1163 int32_t gap4
= (maxTrail
- minTrail
- 1) / neededPerFinalByte
;
1165 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1168 final4Multiplier
= gap4
+ 1;
1169 final4Count
= neededPerFinalByte
;
1170 max4Trail
= minTrail
+ (final4Count
- 1) * final4Multiplier
;
1174 * Supply parameters for generating implicit CEs
1176 U_CAPI
void U_EXPORT2
1177 uprv_uca_initImplicitConstants(UErrorCode
*status
) {
1178 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1179 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1180 initImplicitConstants(minImplicitPrimary
, maxImplicitPrimary
, 0x04, 0xFE, 1, 1, status
);
1184 /* collIterNormalize Incremental Normalization happens here. */
1185 /* pick up the range of chars identified by FCD, */
1186 /* normalize it into the collIterate's writable buffer, */
1187 /* switch the collIterate's state to use the writable buffer. */
1190 void collIterNormalize(collIterate
*collationSource
)
1192 UErrorCode status
= U_ZERO_ERROR
;
1195 UChar
*srcP
= collationSource
->pos
- 1; /* Start of chars to normalize */
1196 UChar
*endP
= collationSource
->fcdPosition
; /* End of region to normalize+1 */
1198 normLen
= unorm_decompose(collationSource
->writableBuffer
, (int32_t)collationSource
->writableBufSize
,
1199 srcP
, (int32_t)(endP
- srcP
),
1202 if(status
== U_BUFFER_OVERFLOW_ERROR
|| status
== U_STRING_NOT_TERMINATED_WARNING
) {
1203 // reallocate and terminate
1204 if(!u_growBufferFromStatic(collationSource
->stackWritableBuffer
,
1205 &collationSource
->writableBuffer
,
1206 (int32_t *)&collationSource
->writableBufSize
, normLen
+ 1,
1210 fprintf(stderr
, "collIterNormalize(), out of memory\n");
1214 status
= U_ZERO_ERROR
;
1215 normLen
= unorm_decompose(collationSource
->writableBuffer
, (int32_t)collationSource
->writableBufSize
,
1216 srcP
, (int32_t)(endP
- srcP
),
1220 if (U_FAILURE(status
)) {
1222 fprintf(stderr
, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status
));
1227 if(collationSource
->writableBuffer
!= collationSource
->stackWritableBuffer
) {
1228 collationSource
->flags
|= UCOL_ITER_ALLOCATED
;
1230 collationSource
->pos
= collationSource
->writableBuffer
;
1231 collationSource
->origFlags
= collationSource
->flags
;
1232 collationSource
->flags
|= UCOL_ITER_INNORMBUF
;
1233 collationSource
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
1237 // This function takes the iterator and extracts normalized stuff up to the next boundary
1238 // It is similar in the end results to the collIterNormalize, but for the cases when we
1241 inline void normalizeIterator(collIterate *collationSource) {
1242 UErrorCode status = U_ZERO_ERROR;
1243 UBool wasNormalized = FALSE;
1244 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1245 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1246 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1247 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1248 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1249 // reallocate and terminate
1250 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1251 &collationSource->writableBuffer,
1252 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1256 fprintf(stderr, "normalizeIterator(), out of memory\n");
1260 status = U_ZERO_ERROR;
1261 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1262 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1263 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1264 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1266 // Terminate the buffer - we already checked that it is big enough
1267 collationSource->writableBuffer[normLen] = 0;
1268 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1269 collationSource->flags |= UCOL_ITER_ALLOCATED;
1271 collationSource->pos = collationSource->writableBuffer;
1272 collationSource->origFlags = collationSource->flags;
1273 collationSource->flags |= UCOL_ITER_INNORMBUF;
1274 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1278 /* Incremental FCD check and normalize */
1279 /* Called from getNextCE when normalization state is suspect. */
1280 /* When entering, the state is known to be this: */
1281 /* o We are working in the main buffer of the collIterate, not the side */
1282 /* writable buffer. When in the side buffer, normalization mode is always off, */
1283 /* so we won't get here. */
1284 /* o The leading combining class from the current character is 0 or */
1285 /* the trailing combining class of the previous char was zero. */
1286 /* True because the previous call to this function will have always exited */
1287 /* that way, and we get called for every char where cc might be non-zero. */
1289 inline UBool
collIterFCD(collIterate
*collationSource
) {
1291 const UChar
*srcP
, *endP
;
1293 uint8_t prevTrailingCC
= 0;
1295 UBool needNormalize
= FALSE
;
1297 srcP
= collationSource
->pos
-1;
1299 if (collationSource
->flags
& UCOL_ITER_HASLEN
) {
1300 endP
= collationSource
->endp
;
1305 // Get the trailing combining class of the current character. If it's zero,
1309 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1311 if (U16_IS_LEAD(c
)) {
1312 if ((endP
== NULL
|| srcP
!= endP
) && U16_IS_TRAIL(c2
=*srcP
)) {
1314 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c2
);
1320 prevTrailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1322 if (prevTrailingCC
!= 0) {
1323 // The current char has a non-zero trailing CC. Scan forward until we find
1324 // a char with a leading cc of zero.
1325 while (endP
== NULL
|| srcP
!= endP
)
1327 const UChar
*savedSrcP
= srcP
;
1331 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1332 if (fcd
!= 0 && U16_IS_LEAD(c
)) {
1333 if ((endP
== NULL
|| srcP
!= endP
) && U16_IS_TRAIL(c2
=*srcP
)) {
1335 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c2
);
1340 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1341 if (leadingCC
== 0) {
1342 srcP
= savedSrcP
; // Hit char that is not part of combining sequence.
1343 // back up over it. (Could be surrogate pair!)
1347 if (leadingCC
< prevTrailingCC
) {
1348 needNormalize
= TRUE
;
1351 prevTrailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1356 collationSource
->fcdPosition
= (UChar
*)srcP
;
1358 return needNormalize
;
1361 /****************************************************************************/
1362 /* Following are the CE retrieval functions */
1364 /****************************************************************************/
1366 static uint32_t getImplicit(UChar32 cp
, collIterate
*collationSource
);
1367 static uint32_t getPrevImplicit(UChar32 cp
, collIterate
*collationSource
);
1369 /* there should be a macro version of this function in the header file */
1370 /* This is the first function that tries to fetch a collation element */
1371 /* If it's not successful or it encounters a more difficult situation */
1372 /* some more sophisticated and slower functions are invoked */
1374 inline uint32_t ucol_IGetNextCE(const UCollator
*coll
, collIterate
*collationSource
, UErrorCode
*status
) {
1376 if (collationSource
->CEpos
> collationSource
->toReturn
) { /* Are there any CEs from previous expansions? */
1377 order
= *(collationSource
->toReturn
++); /* if so, return them */
1378 if(collationSource
->CEpos
== collationSource
->toReturn
) {
1379 collationSource
->CEpos
= collationSource
->toReturn
= collationSource
->extendCEs
? collationSource
->extendCEs
: collationSource
->CEs
;
1385 collationSource
->offsetReturn
= NULL
;
1387 for (;;) /* Loop handles case when incremental normalize switches */
1388 { /* to or from the side buffer / original string, and we */
1389 /* need to start again to get the next character. */
1391 if ((collationSource
->flags
& (UCOL_ITER_HASLEN
| UCOL_ITER_INNORMBUF
| UCOL_ITER_NORM
| UCOL_HIRAGANA_Q
| UCOL_USE_ITERATOR
)) == 0)
1393 // The source string is null terminated and we're not working from the side buffer,
1394 // and we're not normalizing. This is the fast path.
1395 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1396 ch
= *collationSource
->pos
++;
1401 return UCOL_NO_MORE_CES
;
1405 if (collationSource
->flags
& UCOL_ITER_HASLEN
) {
1406 // Normal path for strings when length is specified.
1407 // (We can't be in side buffer because it is always null terminated.)
1408 if (collationSource
->pos
>= collationSource
->endp
) {
1409 // Ran off of the end of the main source string. We're done.
1410 return UCOL_NO_MORE_CES
;
1412 ch
= *collationSource
->pos
++;
1414 else if(collationSource
->flags
& UCOL_USE_ITERATOR
) {
1415 UChar32 iterCh
= collationSource
->iterator
->next(collationSource
->iterator
);
1416 if(iterCh
== U_SENTINEL
) {
1417 return UCOL_NO_MORE_CES
;
1423 // Null terminated string.
1424 ch
= *collationSource
->pos
++;
1426 // Ran off end of buffer.
1427 if ((collationSource
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1428 // Ran off end of main string. backing up one character.
1429 collationSource
->pos
--;
1430 return UCOL_NO_MORE_CES
;
1434 // Hit null in the normalize side buffer.
1435 // Usually this means the end of the normalized data,
1436 // except for one odd case: a null followed by combining chars,
1437 // which is the case if we are at the start of the buffer.
1438 if (collationSource
->pos
== collationSource
->writableBuffer
+1) {
1442 // Null marked end of side buffer.
1443 // Revert to the main string and
1444 // loop back to top to try again to get a character.
1445 collationSource
->pos
= collationSource
->fcdPosition
;
1446 collationSource
->flags
= collationSource
->origFlags
;
1452 if(collationSource
->flags
&UCOL_HIRAGANA_Q
) {
1453 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1454 * based on whether the previous codepoint was Hiragana or Katakana.
1456 if(((ch
>=0x3040 && ch
<=0x3096) || (ch
>= 0x309d && ch
<= 0x309f)) ||
1457 ((collationSource
->flags
& UCOL_WAS_HIRAGANA
) && (ch
>= 0x3099 && ch
<= 0x309C))) {
1458 collationSource
->flags
|= UCOL_WAS_HIRAGANA
;
1460 collationSource
->flags
&= ~UCOL_WAS_HIRAGANA
;
1464 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1465 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1466 if ((collationSource
->flags
& UCOL_ITER_NORM
) == 0) {
1470 if (collationSource
->fcdPosition
>= collationSource
->pos
) {
1471 // An earlier FCD check has already covered the current character.
1472 // We can go ahead and process this char.
1476 if (ch
< ZERO_CC_LIMIT_
) {
1477 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1481 if (ch
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1482 // We need to peek at the next character in order to tell if we are FCD
1483 if ((collationSource
->flags
& UCOL_ITER_HASLEN
) && collationSource
->pos
>= collationSource
->endp
) {
1484 // We are at the last char of source string.
1485 // It is always OK for FCD check.
1489 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1490 if (*collationSource
->pos
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1496 // Need a more complete FCD check and possible normalization.
1497 if (collIterFCD(collationSource
)) {
1498 collIterNormalize(collationSource
);
1500 if ((collationSource
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1501 // No normalization was needed. Go ahead and process the char we already had.
1505 // Some normalization happened. Next loop iteration will pick up a char
1506 // from the normalization buffer.
1512 /* For latin-1 characters we never need to fall back to the UCA table */
1513 /* because all of the UCA data is replicated in the latinOneMapping array */
1514 order
= coll
->latinOneMapping
[ch
];
1515 if (order
> UCOL_NOT_FOUND
) {
1516 order
= ucol_prv_getSpecialCE(coll
, ch
, order
, collationSource
, status
);
1521 order
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, ch
);
1522 if(order
> UCOL_NOT_FOUND
) { /* if a CE is special */
1523 order
= ucol_prv_getSpecialCE(coll
, ch
, order
, collationSource
, status
); /* and try to get the special CE */
1525 if(order
== UCOL_NOT_FOUND
&& coll
->UCA
) { /* We couldn't find a good CE in the tailoring */
1526 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1527 order
= UTRIE_GET32_FROM_LEAD(&coll
->UCA
->mapping
, ch
);
1529 if(order
> UCOL_NOT_FOUND
) { /* UCA also gives us a special CE */
1530 order
= ucol_prv_getSpecialCE(coll
->UCA
, ch
, order
, collationSource
, status
);
1534 if(order
== UCOL_NOT_FOUND
) {
1535 order
= getImplicit(ch
, collationSource
);
1537 return order
; /* return the CE */
1540 /* ucol_getNextCE, out-of-line version for use from other files. */
1541 U_CAPI
uint32_t U_EXPORT2
1542 ucol_getNextCE(const UCollator
*coll
, collIterate
*collationSource
, UErrorCode
*status
) {
1543 return ucol_IGetNextCE(coll
, collationSource
, status
);
1548 * Incremental previous normalization happens here. Pick up the range of chars
1549 * identified by FCD, normalize it into the collIterate's writable buffer,
1550 * switch the collIterate's state to use the writable buffer.
1551 * @param data collation iterator data
1554 void collPrevIterNormalize(collIterate
*data
)
1556 UErrorCode status
= U_ZERO_ERROR
;
1557 UChar
*pEnd
= data
->pos
; /* End normalize + 1 */
1562 /* Start normalize */
1563 if (data
->fcdPosition
== NULL
) {
1564 pStart
= data
->string
;
1567 pStart
= data
->fcdPosition
+ 1;
1570 normLen
= unorm_normalize(pStart
, (pEnd
- pStart
) + 1, UNORM_NFD
, 0,
1571 data
->writableBuffer
, 0, &status
);
1573 if (data
->writableBufSize
<= normLen
) {
1574 freeHeapWritableBuffer(data
);
1575 data
->writableBuffer
= (UChar
*)uprv_malloc((normLen
+ 1) *
1577 if(data
->writableBuffer
== NULL
) { // something is wrong here, return
1578 data
->writableBufSize
= 0; // Reset writableBufSize
1581 data
->flags
|= UCOL_ITER_ALLOCATED
;
1582 /* to handle the zero termination */
1583 data
->writableBufSize
= normLen
+ 1;
1585 status
= U_ZERO_ERROR
;
1587 this puts the null termination in front of the normalized string instead
1590 pStartNorm
= data
->writableBuffer
+ (data
->writableBufSize
- normLen
);
1591 *(pStartNorm
- 1) = 0;
1592 unorm_normalize(pStart
, (pEnd
- pStart
) + 1, UNORM_NFD
, 0, pStartNorm
,
1595 if (data
->offsetBuffer
== NULL
) {
1596 int32_t len
= normLen
>= UCOL_EXPAND_CE_BUFFER_SIZE
? normLen
+ 1 : UCOL_EXPAND_CE_BUFFER_SIZE
;
1598 data
->offsetBufferSize
= len
;
1599 data
->offsetBuffer
= (int32_t *) uprv_malloc(sizeof(int32_t) * len
);
1600 data
->offsetStore
= data
->offsetBuffer
;
1601 } else if(data
->offsetBufferSize
< (int32_t) normLen
) {
1602 int32_t storeIX
= data
->offsetStore
- data
->offsetBuffer
;
1603 int32_t *tob
= (int32_t *) uprv_realloc(data
->offsetBuffer
, sizeof(int32_t) * (normLen
+ 1));
1606 data
->offsetBuffer
= tob
;
1607 data
->offsetStore
= &data
->offsetBuffer
[storeIX
];
1608 data
->offsetBufferSize
= normLen
+ 1;
1613 * The usual case at this point is that we've got a base
1614 * character followed by marks that were normalized. If
1615 * fcdPosition is NULL, that means that we backed up to
1616 * the beginning of the string and there's no base character.
1618 * Forward processing will usually normalize when it sees
1619 * the first mark, so that mark will get its natural offset
1620 * and the rest will get the offset of the character following
1621 * the marks. The base character will also get its natural offset.
1623 * We write the offset of the base character, if there is one,
1624 * followed by the offset of the first mark and then the offsets
1625 * of the rest of the marks.
1627 int32_t firstMarkOffset
= 0;
1628 int32_t trailOffset
= data
->pos
- data
->string
+ 1;
1629 int32_t trailCount
= normLen
- 1;
1631 if (data
->fcdPosition
!= NULL
) {
1632 int32_t baseOffset
= data
->fcdPosition
- data
->string
;
1633 UChar baseChar
= *data
->fcdPosition
;
1635 firstMarkOffset
= baseOffset
+ 1;
1638 * If the base character is the start of a contraction, forward processing
1639 * will normalize the marks while checking for the contraction, which means
1640 * that the offset of the first mark will be the same as the other marks.
1642 * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1644 if (baseChar
>= 0x100) {
1645 uint32_t baseOrder
= UTRIE_GET32_FROM_LEAD(&data
->coll
->mapping
, baseChar
);
1647 if (baseOrder
== UCOL_NOT_FOUND
&& data
->coll
->UCA
) {
1648 baseOrder
= UTRIE_GET32_FROM_LEAD(&data
->coll
->UCA
->mapping
, baseChar
);
1651 if (baseOrder
> UCOL_NOT_FOUND
&& getCETag(baseOrder
) == CONTRACTION_TAG
) {
1652 firstMarkOffset
= trailOffset
;
1656 *(data
->offsetStore
++) = baseOffset
;
1659 *(data
->offsetStore
++) = firstMarkOffset
;
1661 for (int32_t i
= 0; i
< trailCount
; i
+= 1) {
1662 *(data
->offsetStore
++) = trailOffset
;
1665 data
->offsetRepeatValue
= trailOffset
;
1667 data
->offsetReturn
= data
->offsetStore
- 1;
1668 if (data
->offsetReturn
== data
->offsetBuffer
) {
1669 data
->offsetStore
= data
->offsetBuffer
;
1672 data
->pos
= data
->writableBuffer
+ data
->writableBufSize
;
1673 data
->origFlags
= data
->flags
;
1674 data
->flags
|= UCOL_ITER_INNORMBUF
;
1675 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
1680 * Incremental FCD check for previous iteration and normalize. Called from
1681 * getPrevCE when normalization state is suspect.
1682 * When entering, the state is known to be this:
1683 * o We are working in the main buffer of the collIterate, not the side
1684 * writable buffer. When in the side buffer, normalization mode is always
1685 * off, so we won't get here.
1686 * o The leading combining class from the current character is 0 or the
1687 * trailing combining class of the previous char was zero.
1688 * True because the previous call to this function will have always exited
1689 * that way, and we get called for every char where cc might be non-zero.
1690 * @param data collation iterate struct
1691 * @return normalization status, TRUE for normalization to be done, FALSE
1695 inline UBool
collPrevIterFCD(collIterate
*data
)
1697 const UChar
*src
, *start
;
1700 uint8_t trailingCC
= 0;
1702 UBool result
= FALSE
;
1704 start
= data
->string
;
1705 src
= data
->pos
+ 1;
1707 /* Get the trailing combining class of the current character. */
1709 if (!U16_IS_SURROGATE(c
)) {
1710 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1711 } else if (U16_IS_TRAIL(c
) && start
< src
&& U16_IS_LEAD(c2
= *(src
- 1))) {
1713 fcd
= unorm_getFCD16(fcdTrieIndex
, c2
);
1715 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c
);
1717 } else /* unpaired surrogate */ {
1721 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1723 if (leadingCC
!= 0) {
1725 The current char has a non-zero leading combining class.
1726 Scan backward until we find a char with a trailing cc of zero.
1731 data
->fcdPosition
= NULL
;
1736 if (!U16_IS_SURROGATE(c
)) {
1737 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1738 } else if (U16_IS_TRAIL(c
) && start
< src
&& U16_IS_LEAD(c2
= *(src
- 1))) {
1740 fcd
= unorm_getFCD16(fcdTrieIndex
, c2
);
1742 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c
);
1744 } else /* unpaired surrogate */ {
1748 trailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1750 if (trailingCC
== 0) {
1754 if (leadingCC
< trailingCC
) {
1758 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1762 data
->fcdPosition
= (UChar
*)src
;
1767 /** gets a character from the string at a given offset
1768 * Handles both normal and iterative cases.
1769 * No error checking - caller beware!
1772 UChar
peekCharacter(collIterate
*source
, int32_t offset
) {
1773 if(source
->pos
!= NULL
) {
1774 return *(source
->pos
+ offset
);
1775 } else if(source
->iterator
!= NULL
) {
1777 source
->iterator
->move(source
->iterator
, offset
, UITER_CURRENT
);
1778 UChar toReturn
= (UChar
)source
->iterator
->next(source
->iterator
);
1779 source
->iterator
->move(source
->iterator
, -offset
-1, UITER_CURRENT
);
1782 return (UChar
)source
->iterator
->current(source
->iterator
);
1785 return (UChar
)U_SENTINEL
;
1790 * Determines if we are at the start of the data string in the backwards
1791 * collation iterator
1792 * @param data collation iterator
1793 * @return TRUE if we are at the start
1796 inline UBool
isAtStartPrevIterate(collIterate
*data
) {
1797 if(data
->pos
== NULL
&& data
->iterator
!= NULL
) {
1798 return !data
->iterator
->hasPrevious(data
->iterator
);
1800 //return (collIter_bos(data)) ||
1801 return (data
->pos
== data
->string
) ||
1802 ((data
->flags
& UCOL_ITER_INNORMBUF
) &&
1803 *(data
->pos
- 1) == 0 && data
->fcdPosition
== NULL
);
1807 inline void goBackOne(collIterate
*data
) {
1809 // somehow, it looks like we need to keep iterator synced up
1810 // at all times, as above.
1814 if(data
->iterator
) {
1815 data
->iterator
->previous(data
->iterator
);
1818 if(data
->iterator
&& (data
->flags
& UCOL_USE_ITERATOR
)) {
1819 data
->iterator
->previous(data
->iterator
);
1827 * Inline function that gets a simple CE.
1828 * So what it does is that it will first check the expansion buffer. If the
1829 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1830 * is different from the string pointer, we return the collation element at the
1831 * return pointer and decrement it.
1832 * For more complicated CEs it resorts to getComplicatedCE.
1833 * @param coll collator data
1834 * @param data collation iterator struct
1835 * @param status error status
1838 inline uint32_t ucol_IGetPrevCE(const UCollator
*coll
, collIterate
*data
,
1841 uint32_t result
= (uint32_t)UCOL_NULLORDER
;
1843 if (data
->offsetReturn
!= NULL
) {
1844 if (data
->offsetRepeatCount
> 0) {
1845 data
->offsetRepeatCount
-= 1;
1847 if (data
->offsetReturn
== data
->offsetBuffer
) {
1848 data
->offsetReturn
= NULL
;
1849 data
->offsetStore
= data
->offsetBuffer
;
1851 data
->offsetReturn
-= 1;
1856 if ((data
->extendCEs
&& data
->toReturn
> data
->extendCEs
) ||
1857 (!data
->extendCEs
&& data
->toReturn
> data
->CEs
))
1859 data
->toReturn
-= 1;
1860 result
= *(data
->toReturn
);
1861 if (data
->CEs
== data
->toReturn
|| data
->extendCEs
== data
->toReturn
) {
1862 data
->CEpos
= data
->toReturn
;
1869 Loop handles case when incremental normalize switches to or from the
1870 side buffer / original string, and we need to start again to get the
1874 if (data
->flags
& UCOL_ITER_HASLEN
) {
1876 Normal path for strings when length is specified.
1877 Not in side buffer because it is always null terminated.
1879 if (data
->pos
<= data
->string
) {
1880 /* End of the main source string */
1881 return UCOL_NO_MORE_CES
;
1886 // we are using an iterator to go back. Pray for us!
1887 else if (data
->flags
& UCOL_USE_ITERATOR
) {
1888 UChar32 iterCh
= data
->iterator
->previous(data
->iterator
);
1889 if(iterCh
== U_SENTINEL
) {
1890 return UCOL_NO_MORE_CES
;
1898 /* we are in the side buffer. */
1901 At the start of the normalize side buffer.
1903 Because pointer points to the last accessed character,
1904 hence we have to increment it by one here.
1906 data
->flags
= data
->origFlags
;
1907 data
->offsetRepeatValue
= 0;
1909 if (data
->fcdPosition
== NULL
) {
1910 data
->pos
= data
->string
;
1911 return UCOL_NO_MORE_CES
;
1914 data
->pos
= data
->fcdPosition
+ 1;
1921 if(data
->flags
&UCOL_HIRAGANA_Q
) {
1922 if(ch
>=0x3040 && ch
<=0x309f) {
1923 data
->flags
|= UCOL_WAS_HIRAGANA
;
1925 data
->flags
&= ~UCOL_WAS_HIRAGANA
;
1930 * got a character to determine if there's fcd and/or normalization
1932 * if the current character is not fcd.
1933 * if current character is at the start of the string
1934 * Trailing combining class == 0.
1935 * Note if pos is in the writablebuffer, norm is always 0
1937 if (ch
< ZERO_CC_LIMIT_
||
1938 // this should propel us out of the loop in the iterator case
1939 (data
->flags
& UCOL_ITER_NORM
) == 0 ||
1940 (data
->fcdPosition
!= NULL
&& data
->fcdPosition
<= data
->pos
)
1941 || data
->string
== data
->pos
) {
1945 if (ch
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1946 /* if next character is FCD */
1947 if (data
->pos
== data
->string
) {
1948 /* First char of string is always OK for FCD check */
1952 /* Not first char of string, do the FCD fast test */
1953 if (*(data
->pos
- 1) < NFC_ZERO_CC_BLOCK_LIMIT_
) {
1958 /* Need a more complete FCD check and possible normalization. */
1959 if (collPrevIterFCD(data
)) {
1960 collPrevIterNormalize(data
);
1963 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1964 /* No normalization. Go ahead and process the char. */
1969 Some normalization happened.
1970 Next loop picks up a char from the normalization buffer.
1974 /* attempt to handle contractions, after removal of the backwards
1977 if (ucol_contractionEndCP(ch
, coll
) && !isAtStartPrevIterate(data
)) {
1978 result
= ucol_prv_getSpecialPrevCE(coll
, ch
, UCOL_CONTRACTION
, data
, status
);
1981 result
= coll
->latinOneMapping
[ch
];
1984 result
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, ch
);
1986 if (result
> UCOL_NOT_FOUND
) {
1987 result
= ucol_prv_getSpecialPrevCE(coll
, ch
, result
, data
, status
);
1989 if (result
== UCOL_NOT_FOUND
) { // Not found in master list
1990 if (!isAtStartPrevIterate(data
) &&
1991 ucol_contractionEndCP(ch
, data
->coll
))
1993 result
= UCOL_CONTRACTION
;
1996 result
= UTRIE_GET32_FROM_LEAD(&coll
->UCA
->mapping
, ch
);
2000 if (result
> UCOL_NOT_FOUND
) {
2002 result
= ucol_prv_getSpecialPrevCE(coll
->UCA
, ch
, result
, data
, status
);
2008 if(result
== UCOL_NOT_FOUND
) {
2009 result
= getPrevImplicit(ch
, data
);
2017 /* ucol_getPrevCE, out-of-line version for use from other files. */
2018 U_CFUNC
uint32_t U_EXPORT2
2019 ucol_getPrevCE(const UCollator
*coll
, collIterate
*data
,
2020 UErrorCode
*status
) {
2021 return ucol_IGetPrevCE(coll
, data
, status
);
2025 /* this should be connected to special Jamo handling */
2026 U_CFUNC
uint32_t U_EXPORT2
2027 ucol_getFirstCE(const UCollator
*coll
, UChar u
, UErrorCode
*status
) {
2030 IInit_collIterate(coll
, &u
, 1, &colIt
);
2031 order
= ucol_IGetNextCE(coll
, &colIt
, status
);
2032 /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
2037 * Inserts the argument character into the end of the buffer pushing back the
2039 * @param data collIterate struct data
2040 * @param pNull pointer to the null termination
2041 * @param ch character to be appended
2042 * @return the position of the new addition
2045 inline UChar
* insertBufferEnd(collIterate
*data
, UChar
*pNull
, UChar ch
)
2047 uint32_t size
= data
->writableBufSize
;
2049 static const uint32_t INCSIZE
= 5;
2051 if ((data
->writableBuffer
+ size
) > (pNull
+ 1)) {
2058 buffer will always be null terminated at the end.
2059 giving extra space since it is likely that more characters will be added.
2062 newbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) * size
);
2063 if(newbuffer
!= NULL
) { // something wrong, but no status
2064 uprv_memcpy(newbuffer
, data
->writableBuffer
,
2065 data
->writableBufSize
* sizeof(UChar
));
2067 freeHeapWritableBuffer(data
);
2068 data
->writableBufSize
= size
;
2069 data
->writableBuffer
= newbuffer
;
2071 newbuffer
= newbuffer
+ data
->writableBufSize
;
2073 *(newbuffer
+ 1) = 0;
2079 * Inserts the argument string into the end of the buffer pushing back the
2081 * @param data collIterate struct data
2082 * @param pNull pointer to the null termination
2083 * @param string to be appended
2084 * @param length of the string to be appended
2085 * @return the position of the new addition
2088 inline UChar
* insertBufferEnd(collIterate
*data
, UChar
*pNull
, UChar
*str
,
2091 uint32_t size
= pNull
- data
->writableBuffer
;
2094 if (data
->writableBuffer
+ data
->writableBufSize
> pNull
+ length
+ 1) {
2095 uprv_memcpy(pNull
, str
, length
* sizeof(UChar
));
2096 *(pNull
+ length
) = 0;
2101 buffer will always be null terminated at the end.
2102 giving extra space since it is likely that more characters will be added.
2104 newbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) * (size
+ length
+ 1));
2105 if(newbuffer
!= NULL
) {
2106 uprv_memcpy(newbuffer
, data
->writableBuffer
, size
* sizeof(UChar
));
2107 uprv_memcpy(newbuffer
+ size
, str
, length
* sizeof(UChar
));
2109 freeHeapWritableBuffer(data
);
2110 data
->writableBufSize
= size
+ length
+ 1;
2111 data
->writableBuffer
= newbuffer
;
2118 * Special normalization function for contraction in the forwards iterator.
2119 * This normalization sequence will place the current character at source->pos
2120 * and its following normalized sequence into the buffer.
2121 * The fcd position, pos will be changed.
2122 * pos will now point to positions in the buffer.
2123 * Flags will be changed accordingly.
2124 * @param data collation iterator data
2127 inline void normalizeNextContraction(collIterate
*data
)
2129 UChar
*buffer
= data
->writableBuffer
;
2130 uint32_t buffersize
= data
->writableBufSize
;
2132 UErrorCode status
= U_ZERO_ERROR
;
2133 /* because the pointer points to the next character */
2134 UChar
*pStart
= data
->pos
- 1;
2139 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
2140 *data
->writableBuffer
= *(pStart
- 1);
2144 strsize
= u_strlen(data
->writableBuffer
);
2147 pEnd
= data
->fcdPosition
;
2149 normLen
= unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, buffer
, 0,
2152 if (buffersize
<= normLen
+ strsize
) {
2153 uint32_t size
= strsize
+ normLen
+ 1;
2154 UChar
*temp
= (UChar
*)uprv_malloc(size
* sizeof(UChar
));
2156 uprv_memcpy(temp
, buffer
, sizeof(UChar
) * strsize
);
2157 freeHeapWritableBuffer(data
);
2158 data
->writableBuffer
= temp
;
2159 data
->writableBufSize
= size
;
2160 data
->flags
|= UCOL_ITER_ALLOCATED
;
2162 return; // Avoid writing past bound of buffer->writableBuffer.
2166 status
= U_ZERO_ERROR
;
2167 pStartNorm
= buffer
+ strsize
;
2168 /* null-termination will be added here */
2169 unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, pStartNorm
,
2170 normLen
+ 1, &status
);
2172 data
->pos
= data
->writableBuffer
+ strsize
;
2173 data
->origFlags
= data
->flags
;
2174 data
->flags
|= UCOL_ITER_INNORMBUF
;
2175 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
2179 * Contraction character management function that returns the next character
2180 * for the forwards iterator.
2181 * Does nothing if the next character is in buffer and not the first character
2183 * Else it checks next character in data string to see if it is normalizable.
2184 * If it is not, the character is simply copied into the buffer, else
2185 * the whole normalized substring is copied into the buffer, including the
2186 * current character.
2187 * @param data collation element iterator data
2188 * @return next character
2191 inline UChar
getNextNormalizedChar(collIterate
*data
)
2195 // Here we need to add the iterator code. One problem is the way
2196 // end of string is handled. If we just return next char, it could
2197 // be the sentinel. Most of the cases already check for this, but we
2199 if ((data
->flags
& (UCOL_ITER_NORM
| UCOL_ITER_INNORMBUF
)) == 0 ) {
2200 /* if no normalization and not in buffer. */
2201 if(data
->flags
& UCOL_USE_ITERATOR
) {
2202 return (UChar
)data
->iterator
->next(data
->iterator
);
2204 return *(data
->pos
++);
2208 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2209 //normalizeIterator(data);
2212 UChar
*pEndWritableBuffer
= NULL
;
2213 UBool innormbuf
= (UBool
)(data
->flags
& UCOL_ITER_INNORMBUF
);
2214 if ((innormbuf
&& *data
->pos
!= 0) ||
2215 (data
->fcdPosition
!= NULL
&& !innormbuf
&&
2216 data
->pos
< data
->fcdPosition
)) {
2218 if next character is in normalized buffer, no further normalization
2221 return *(data
->pos
++);
2224 if (data
->flags
& UCOL_ITER_HASLEN
) {
2225 /* in data string */
2226 if (data
->pos
+ 1 == data
->endp
) {
2227 return *(data
->pos
++);
2232 // inside the normalization buffer, but at the end
2233 // (since we encountered zero). This means, in the
2234 // case we're using char iterator, that we need to
2235 // do another round of normalization.
2236 //if(data->origFlags & UCOL_USE_ITERATOR) {
2237 // we need to restore original flags,
2238 // otherwise, we'll lose them
2239 //data->flags = data->origFlags;
2240 //normalizeIterator(data);
2241 //return *(data->pos++);
2244 in writable buffer, at this point fcdPosition can not be
2245 pointing to the end of the data string. see contracting tag.
2247 if(data
->fcdPosition
) {
2248 if (*(data
->fcdPosition
+ 1) == 0 ||
2249 data
->fcdPosition
+ 1 == data
->endp
) {
2250 /* at the end of the string, dump it into the normalizer */
2251 data
->pos
= insertBufferEnd(data
, data
->pos
,
2252 *(data
->fcdPosition
)) + 1;
2253 // Check if data->pos received a null pointer
2254 if (data
->pos
== NULL
) {
2255 return (UChar
)-1; // Return to indicate error.
2257 return *(data
->fcdPosition
++);
2259 pEndWritableBuffer
= data
->pos
;
2260 data
->pos
= data
->fcdPosition
;
2261 } else if(data
->origFlags
& UCOL_USE_ITERATOR
) {
2262 // if we are here, we're using a normalizing iterator.
2263 // we should just continue further.
2264 data
->flags
= data
->origFlags
;
2266 return (UChar
)data
->iterator
->next(data
->iterator
);
2271 if (*(data
->pos
+ 1) == 0) {
2272 return *(data
->pos
++);
2278 nextch
= *data
->pos
;
2281 * if the current character is not fcd.
2282 * Trailing combining class == 0.
2284 if ((data
->fcdPosition
== NULL
|| data
->fcdPosition
< data
->pos
) &&
2285 (nextch
>= NFC_ZERO_CC_BLOCK_LIMIT_
||
2286 ch
>= NFC_ZERO_CC_BLOCK_LIMIT_
)) {
2288 Need a more complete FCD check and possible normalization.
2289 normalize substring will be appended to buffer
2291 if (collIterFCD(data
)) {
2292 normalizeNextContraction(data
);
2293 return *(data
->pos
++);
2295 else if (innormbuf
) {
2296 /* fcdposition shifted even when there's no normalization, if we
2297 don't input the rest into this, we'll get the wrong position when
2298 we reach the end of the writableBuffer */
2299 int32_t length
= data
->fcdPosition
- data
->pos
+ 1;
2300 data
->pos
= insertBufferEnd(data
, pEndWritableBuffer
,
2301 data
->pos
- 1, length
);
2302 // Check if data->pos received a null pointer
2303 if (data
->pos
== NULL
) {
2304 return (UChar
)-1; // Return to indicate error.
2306 return *(data
->pos
++);
2312 no normalization is to be done hence only one character will be
2313 appended to the buffer.
2315 data
->pos
= insertBufferEnd(data
, pEndWritableBuffer
, ch
) + 1;
2316 // Check if data->pos received a null pointer
2317 if (data
->pos
== NULL
) {
2318 return (UChar
)-1; // Return to indicate error.
2322 /* points back to the pos in string */
2329 * Function to copy the buffer into writableBuffer and sets the fcd position to
2330 * the correct position
2331 * @param source data string source
2332 * @param buffer character buffer
2333 * @param tempdb current position in buffer that has been used up
2336 inline void setDiscontiguosAttribute(collIterate
*source
, UChar
*buffer
,
2339 /* okay confusing part here. to ensure that the skipped characters are
2340 considered later, we need to place it in the appropriate position in the
2341 normalization buffer and reassign the pos pointer. simple case if pos
2342 reside in string, simply copy to normalization buffer and
2343 fcdposition = pos, pos = start of normalization buffer. if pos in
2344 normalization buffer, we'll insert the copy infront of pos and point pos
2345 to the start of the normalization buffer. why am i doing these copies?
2346 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2347 not require any changes, which be really painful. */
2348 uint32_t length
= u_strlen(buffer
);;
2349 if (source
->flags
& UCOL_ITER_INNORMBUF
) {
2350 u_strcpy(tempdb
, source
->pos
);
2353 source
->fcdPosition
= source
->pos
;
2354 source
->origFlags
= source
->flags
;
2355 source
->flags
|= UCOL_ITER_INNORMBUF
;
2356 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
2359 if (length
>= source
->writableBufSize
) {
2360 freeHeapWritableBuffer(source
);
2361 source
->writableBuffer
=
2362 (UChar
*)uprv_malloc((length
+ 1) * sizeof(UChar
));
2363 if(source
->writableBuffer
== NULL
) {
2364 source
->writableBufSize
= 0; // Reset size
2367 source
->writableBufSize
= length
;
2370 u_strcpy(source
->writableBuffer
, buffer
);
2371 source
->pos
= source
->writableBuffer
;
2375 * Function to get the discontiguos collation element within the source.
2376 * Note this function will set the position to the appropriate places.
2377 * @param coll current collator used
2378 * @param source data string source
2379 * @param constart index to the start character in the contraction table
2380 * @return discontiguos collation element offset
2383 uint32_t getDiscontiguous(const UCollator
*coll
, collIterate
*source
,
2384 const UChar
*constart
)
2386 /* source->pos currently points to the second combining character after
2387 the start character */
2388 UChar
*temppos
= source
->pos
;
2389 UChar buffer
[4*UCOL_MAX_BUFFER
];
2390 UChar
*tempdb
= buffer
;
2391 const UChar
*tempconstart
= constart
;
2392 uint8_t tempflags
= source
->flags
;
2393 UBool multicontraction
= FALSE
;
2394 UChar
*tempbufferpos
= 0;
2395 collIterateState discState
;
2397 backupState(source
, &discState
);
2399 //*tempdb = *(source->pos - 1);
2400 *tempdb
= peekCharacter(source
, -1);
2408 if (((source
->flags
& UCOL_ITER_HASLEN
) && source
->pos
>= source
->endp
)
2409 || (peekCharacter(source
, 0) == 0 &&
2410 //|| (*source->pos == 0 &&
2411 ((source
->flags
& UCOL_ITER_INNORMBUF
) == 0 ||
2412 source
->fcdPosition
== NULL
||
2413 source
->fcdPosition
== source
->endp
||
2414 *(source
->fcdPosition
) == 0 ||
2415 u_getCombiningClass(*(source
->fcdPosition
)) == 0)) ||
2416 /* end of string in null terminated string or stopped by a
2417 null character, note fcd does not always point to a base
2418 character after the discontiguos change */
2419 u_getCombiningClass(peekCharacter(source
, 0)) == 0) {
2420 //u_getCombiningClass(*(source->pos)) == 0) {
2421 //constart = (UChar *)coll->image + getContractOffset(CE);
2422 if (multicontraction
) {
2424 source
->pos
= temppos
- 1;
2425 setDiscontiguosAttribute(source
, buffer
, tempdb
);
2426 return *(coll
->contractionCEs
+
2427 (tempconstart
- coll
->contractionIndex
));
2429 constart
= tempconstart
;
2433 UCharOffset
= (UChar
*)(tempconstart
+ 1); /* skip the backward offset*/
2434 schar
= getNextNormalizedChar(source
);
2436 while (schar
> (tchar
= *UCharOffset
)) {
2440 if (schar
!= tchar
) {
2441 /* not the correct codepoint. we stuff the current codepoint into
2442 the discontiguos buffer and try the next character */
2448 if (u_getCombiningClass(schar
) ==
2449 u_getCombiningClass(peekCharacter(source
, -2))) {
2450 //u_getCombiningClass(*(source->pos - 2))) {
2455 result
= *(coll
->contractionCEs
+
2456 (UCharOffset
- coll
->contractionIndex
));
2460 if (result
== UCOL_NOT_FOUND
) {
2462 } else if (isContraction(result
)) {
2463 /* this is a multi-contraction*/
2464 tempconstart
= (UChar
*)coll
->image
+ getContractOffset(result
);
2465 if (*(coll
->contractionCEs
+ (constart
- coll
->contractionIndex
))
2466 != UCOL_NOT_FOUND
) {
2467 multicontraction
= TRUE
;
2468 temppos
= source
->pos
+ 1;
2469 tempbufferpos
= buffer
+ u_strlen(buffer
);
2472 setDiscontiguosAttribute(source
, buffer
, tempdb
);
2477 /* no problems simply reverting just like that,
2478 if we are in string before getting into this function, points back to
2479 string hence no problem.
2480 if we are in normalization buffer before getting into this function,
2481 since we'll never use another normalization within this function, we
2482 know that fcdposition points to a base character. the normalization buffer
2483 never change, hence this revert works. */
2484 loadState(source
, &discState
, TRUE
);
2487 //source->pos = temppos - 1;
2488 source
->flags
= tempflags
;
2489 return *(coll
->contractionCEs
+ (constart
- coll
->contractionIndex
));
2493 inline UBool
isNonChar(UChar32 cp
) {
2494 return (UBool
)((cp
& 0xFFFE) == 0xFFFE || (0xFDD0 <= cp
&& cp
<= 0xFDEF) || (0xD800 <= cp
&& cp
<= 0xDFFF));
2497 /* now uses Mark's getImplicitPrimary code */
2499 inline uint32_t getImplicit(UChar32 cp
, collIterate
*collationSource
) {
2503 uint32_t r
= uprv_uca_getImplicitPrimary(cp
);
2504 *(collationSource
->CEpos
++) = ((r
& 0x0000FFFF)<<16) | 0x000000C0;
2505 collationSource
->offsetRepeatCount
+= 1;
2506 return (r
& UCOL_PRIMARYMASK
) | 0x00000505; // This was 'order'
2510 * Inserts the argument character into the front of the buffer replacing the
2511 * front null terminator.
2512 * @param data collation element iterator data
2513 * @param pNull pointer to the null terminator
2514 * @param ch character to be appended
2515 * @return positon of added character
2518 inline UChar
* insertBufferFront(collIterate
*data
, UChar
*pNull
, UChar ch
)
2520 uint32_t size
= data
->writableBufSize
;
2523 static const uint32_t INCSIZE
= 5;
2525 if (pNull
> data
->writableBuffer
+ 1) {
2532 buffer will always be null terminated infront.
2533 giving extra space since it is likely that more characters will be added.
2536 newbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) * size
);
2537 if(newbuffer
== NULL
) {
2540 end
= newbuffer
+ INCSIZE
;
2541 uprv_memcpy(end
, data
->writableBuffer
,
2542 data
->writableBufSize
* sizeof(UChar
));
2546 freeHeapWritableBuffer(data
);
2548 data
->writableBufSize
= size
;
2549 data
->writableBuffer
= newbuffer
;
2554 * Special normalization function for contraction in the previous iterator.
2555 * This normalization sequence will place the current character at source->pos
2556 * and its following normalized sequence into the buffer.
2557 * The fcd position, pos will be changed.
2558 * pos will now point to positions in the buffer.
2559 * Flags will be changed accordingly.
2560 * @param data collation iterator data
2563 inline void normalizePrevContraction(collIterate
*data
, UErrorCode
*status
)
2565 uint32_t nulltermsize
;
2566 UErrorCode localstatus
= U_ZERO_ERROR
;
2567 UChar
*pEnd
= data
->pos
+ 1; /* End normalize + 1 */
2572 if (data
->flags
& UCOL_ITER_HASLEN
) {
2574 normalization buffer not used yet, we'll pull down the next
2575 character into the end of the buffer
2577 *(data
->writableBuffer
+ (data
->writableBufSize
- 1)) = *(data
->pos
+ 1);
2578 nulltermsize
= data
->writableBufSize
- 1;
2581 nulltermsize
= data
->writableBufSize
;
2582 UChar
*temp
= data
->writableBuffer
+ (nulltermsize
- 1);
2583 while (*(temp
--) != 0) {
2588 /* Start normalize */
2589 if (data
->fcdPosition
== NULL
) {
2590 pStart
= data
->string
;
2593 pStart
= data
->fcdPosition
+ 1;
2596 normLen
= unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, data
->writableBuffer
, 0,
2599 if (nulltermsize
<= normLen
) {
2600 uint32_t size
= data
->writableBufSize
- nulltermsize
+ normLen
+ 1;
2601 UChar
*temp
= (UChar
*)uprv_malloc(size
* sizeof(UChar
));
2603 *status
= U_MEMORY_ALLOCATION_ERROR
;
2606 nulltermsize
= normLen
+ 1;
2607 uprv_memcpy(temp
+ normLen
, data
->writableBuffer
,
2608 sizeof(UChar
) * (data
->writableBufSize
- nulltermsize
));
2609 freeHeapWritableBuffer(data
);
2610 data
->writableBuffer
= temp
;
2611 data
->writableBufSize
= size
;
2615 this puts the null termination infront of the normalized string instead
2618 pStartNorm
= data
->writableBuffer
+ (nulltermsize
- normLen
);
2619 *(pStartNorm
- 1) = 0;
2620 unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, pStartNorm
, normLen
,
2623 data
->pos
= data
->writableBuffer
+ nulltermsize
;
2624 data
->origFlags
= data
->flags
;
2625 data
->flags
|= UCOL_ITER_INNORMBUF
;
2626 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
2630 * Contraction character management function that returns the previous character
2631 * for the backwards iterator.
2632 * Does nothing if the previous character is in buffer and not the first
2634 * Else it checks previous character in data string to see if it is
2636 * If it is not, the character is simply copied into the buffer, else
2637 * the whole normalized substring is copied into the buffer, including the
2638 * current character.
2639 * @param data collation element iterator data
2640 * @return previous character
2643 inline UChar
getPrevNormalizedChar(collIterate
*data
, UErrorCode
*status
)
2648 UBool innormbuf
= (UBool
)(data
->flags
& UCOL_ITER_INNORMBUF
);
2649 UChar
*pNull
= NULL
;
2650 if ((data
->flags
& (UCOL_ITER_NORM
| UCOL_ITER_INNORMBUF
)) == 0 ||
2651 (innormbuf
&& *(data
->pos
- 1) != 0)) {
2653 if no normalization.
2654 if previous character is in normalized buffer, no further normalization
2657 if(data
->flags
& UCOL_USE_ITERATOR
) {
2658 data
->iterator
->move(data
->iterator
, -1, UITER_CURRENT
);
2659 return (UChar
)data
->iterator
->next(data
->iterator
);
2661 return *(data
->pos
- 1);
2666 if ((data
->fcdPosition
==NULL
)||(data
->flags
& UCOL_ITER_HASLEN
)) {
2667 /* in data string */
2668 if ((start
- 1) == data
->string
) {
2669 return *(start
- 1);
2673 prevch
= *(start
- 1);
2677 in writable buffer, at this point fcdPosition can not be NULL.
2678 see contracting tag.
2680 if (data
->fcdPosition
== data
->string
) {
2681 /* at the start of the string, just dump it into the normalizer */
2682 insertBufferFront(data
, data
->pos
- 1, *(data
->fcdPosition
));
2683 data
->fcdPosition
= NULL
;
2684 return *(data
->pos
- 1);
2686 pNull
= data
->pos
- 1;
2687 start
= data
->fcdPosition
;
2689 prevch
= *(start
- 1);
2692 * if the current character is not fcd.
2693 * Trailing combining class == 0.
2695 if (data
->fcdPosition
> start
&&
2696 (ch
>= NFC_ZERO_CC_BLOCK_LIMIT_
|| prevch
>= NFC_ZERO_CC_BLOCK_LIMIT_
))
2699 Need a more complete FCD check and possible normalization.
2700 normalize substring will be appended to buffer
2702 UChar
*backuppos
= data
->pos
;
2704 if (collPrevIterFCD(data
)) {
2705 normalizePrevContraction(data
, status
);
2706 return *(data
->pos
- 1);
2708 data
->pos
= backuppos
;
2709 data
->fcdPosition
++;
2714 no normalization is to be done hence only one character will be
2715 appended to the buffer.
2717 insertBufferFront(data
, pNull
, ch
);
2718 data
->fcdPosition
--;
2724 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2725 /* It is called by getNextCE */
2727 /* The following should be even */
2728 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
2730 uint32_t ucol_prv_getSpecialCE(const UCollator
*coll
, UChar ch
, uint32_t CE
, collIterate
*source
, UErrorCode
*status
) {
2731 collIterateState entryState
;
2732 backupState(source
, &entryState
);
2736 // This loop will repeat only in the case of contractions, and only when a contraction
2737 // is found and the first CE resulting from that contraction is itself a special
2738 // (an expansion, for example.) All other special CE types are fully handled the
2739 // first time through, and the loop exits.
2741 const uint32_t *CEOffset
= NULL
;
2742 switch(getCETag(CE
)) {
2744 /* This one is not found, and we'll let somebody else bother about it... no more games */
2748 // Special processing is getting a CE that is preceded by a certain prefix
2749 // Currently this is only needed for optimizing Japanese length and iteration marks.
2750 // When we encouter a special processing tag, we go backwards and try to see if
2752 // Contraction tables are used - so the whole process is not unlike contraction.
2753 // prefix data is stored backwards in the table.
2754 const UChar
*UCharOffset
;
2756 collIterateState prefixState
;
2757 backupState(source
, &prefixState
);
2758 loadState(source
, &entryState
, TRUE
);
2759 goBackOne(source
); // We want to look at the point where we entered - actually one
2763 // This loop will run once per source string character, for as long as we
2764 // are matching a potential contraction sequence
2766 // First we position ourselves at the begining of contraction sequence
2767 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
2768 if (collIter_bos(source
)) {
2769 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
2772 schar
= getPrevNormalizedChar(source
, status
);
2775 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2779 if (schar
== tchar
) {
2780 // Found the source string char in the table.
2781 // Pick up the corresponding CE from the table.
2782 CE
= *(coll
->contractionCEs
+
2783 (UCharOffset
- coll
->contractionIndex
));
2787 // Source string char was not in the table.
2788 // We have not found the prefix.
2789 CE
= *(coll
->contractionCEs
+
2790 (ContractionStart
- coll
->contractionIndex
));
2794 // The source string char was in the contraction table, and the corresponding
2795 // CE is not a prefix CE. We found the prefix, break
2796 // out of loop, this CE will end up being returned. This is the normal
2797 // way out of prefix handling when the source actually contained
2802 if(CE
!= UCOL_NOT_FOUND
) { // we found something and we can merilly continue
2803 loadState(source
, &prefixState
, TRUE
);
2804 if(source
->origFlags
& UCOL_USE_ITERATOR
) {
2805 source
->flags
= source
->origFlags
;
2807 } else { // prefix search was a failure, we have to backup all the way to the start
2808 loadState(source
, &entryState
, TRUE
);
2812 case CONTRACTION_TAG
:
2814 /* This should handle contractions */
2815 collIterateState state
;
2816 backupState(source
, &state
);
2817 uint32_t firstCE
= *(coll
->contractionCEs
+ ((UChar
*)coll
->image
+getContractOffset(CE
) - coll
->contractionIndex
)); //UCOL_NOT_FOUND;
2818 const UChar
*UCharOffset
;
2822 /* This loop will run once per source string character, for as long as we */
2823 /* are matching a potential contraction sequence */
2825 /* First we position ourselves at the begining of contraction sequence */
2826 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
2828 if (collIter_eos(source
)) {
2829 // Ran off the end of the source string.
2830 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
2831 // So we'll pick whatever we have at the point...
2832 if (CE
== UCOL_NOT_FOUND
) {
2833 // back up the source over all the chars we scanned going into this contraction.
2835 loadState(source
, &state
, TRUE
);
2836 if(source
->origFlags
& UCOL_USE_ITERATOR
) {
2837 source
->flags
= source
->origFlags
;
2843 uint8_t maxCC
= (uint8_t)(*(UCharOffset
)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2844 uint8_t allSame
= (uint8_t)(*(UCharOffset
++)>>8);
2846 schar
= getNextNormalizedChar(source
);
2847 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2851 if (schar
== tchar
) {
2852 // Found the source string char in the contraction table.
2853 // Pick up the corresponding CE from the table.
2854 CE
= *(coll
->contractionCEs
+
2855 (UCharOffset
- coll
->contractionIndex
));
2859 // Source string char was not in contraction table.
2860 // Unless we have a discontiguous contraction, we have finished
2861 // with this contraction.
2862 // in order to do the proper detection, we
2863 // need to see if we're dealing with a supplementary
2864 /* We test whether the next two char are surrogate pairs.
2865 * This test is done if the iterator is not NULL.
2866 * If there is no surrogate pair, the iterator
2867 * goes back one if needed. */
2868 UChar32 miss
= schar
;
2869 if (source
->iterator
) {
2870 UChar32 surrNextChar
; /* the next char in the iteration to test */
2871 int32_t prevPos
; /* holds the previous position before move forward of the source iterator */
2872 if(U16_IS_LEAD(schar
) && source
->iterator
->hasNext(source
->iterator
)) {
2873 prevPos
= source
->iterator
->index
;
2874 surrNextChar
= getNextNormalizedChar(source
);
2875 if (U16_IS_TRAIL(surrNextChar
)) {
2876 miss
= U16_GET_SUPPLEMENTARY(schar
, surrNextChar
);
2877 } else if (prevPos
< source
->iterator
->index
){
2881 } else if (U16_IS_LEAD(schar
)) {
2882 miss
= U16_GET_SUPPLEMENTARY(schar
, getNextNormalizedChar(source
));
2888 (sCC
= i_getCombiningClass(miss
, coll
)) == 0 ||
2890 (allSame
!= 0 && sCC
== maxCC
) ||
2891 collIter_eos(source
))
2893 // Contraction can not be discontiguous.
2894 goBackOne(source
); // back up the source string by one,
2895 // because the character we just looked at was
2896 // not part of the contraction. */
2897 if(U_IS_SUPPLEMENTARY(miss
)) {
2900 CE
= *(coll
->contractionCEs
+
2901 (ContractionStart
- coll
->contractionIndex
));
2904 // Contraction is possibly discontiguous.
2905 // Scan more of source string looking for a match
2908 /* find the next character if schar is not a base character
2909 and we are not yet at the end of the string */
2910 tempchar
= getNextNormalizedChar(source
);
2911 // probably need another supplementary thingie here
2913 if (i_getCombiningClass(tempchar
, coll
) == 0) {
2915 if(U_IS_SUPPLEMENTARY(miss
)) {
2918 /* Spit out the last char of the string, wasn't tasty enough */
2919 CE
= *(coll
->contractionCEs
+
2920 (ContractionStart
- coll
->contractionIndex
));
2922 CE
= getDiscontiguous(coll
, source
, ContractionStart
);
2925 } // else after if(schar == tchar)
2927 if(CE
== UCOL_NOT_FOUND
) {
2928 /* The Source string did not match the contraction that we were checking. */
2929 /* Back up the source position to undo the effects of having partially */
2930 /* scanned through what ultimately proved to not be a contraction. */
2931 loadState(source
, &state
, TRUE
);
2936 if(!isContraction(CE
)) {
2937 // The source string char was in the contraction table, and the corresponding
2938 // CE is not a contraction CE. We completed the contraction, break
2939 // out of loop, this CE will end up being returned. This is the normal
2940 // way out of contraction handling when the source actually contained
2946 // The source string char was in the contraction table, and the corresponding
2947 // CE is IS a contraction CE. We will continue looping to check the source
2948 // string for the remaining chars in the contraction.
2949 uint32_t tempCE
= *(coll
->contractionCEs
+ (ContractionStart
- coll
->contractionIndex
));
2950 if(tempCE
!= UCOL_NOT_FOUND
) {
2951 // We have scanned a a section of source string for which there is a
2952 // CE from the contraction table. Remember the CE and scan position, so
2953 // that we can return to this point if further scanning fails to
2954 // match a longer contraction sequence.
2958 backupState(source
, &state
);
2959 getNextNormalizedChar(source
);
2961 // Another way to do this is:
2962 //collIterateState tempState;
2963 //backupState(source, &tempState);
2964 //goBackOne(source);
2965 //backupState(source, &state);
2966 //loadState(source, &tempState, TRUE);
2968 // The problem is that for incomplete contractions we have to remember the previous
2969 // position. Before, the only thing I needed to do was state.pos--;
2970 // After iterator introduction and especially after introduction of normalizing
2971 // iterators, it became much more difficult to decrease the saved state.
2972 // I'm not yet sure which of the two methods above is faster.
2976 } // case CONTRACTION_TAG:
2977 case LONG_PRIMARY_TAG
:
2979 *(source
->CEpos
++) = ((CE
& 0xFF)<<24)|UCOL_CONTINUATION_MARKER
;
2980 CE
= ((CE
& 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON
;
2981 source
->offsetRepeatCount
+= 1;
2986 /* This should handle expansion. */
2987 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2988 /* I have to decide where continuations are going to be dealt with */
2990 uint32_t i
; /* general counter */
2992 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
2993 size
= getExpansionCount(CE
);
2995 //source->offsetRepeatCount = -1;
2997 if(size
!= 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2998 for(i
= 1; i
<size
; i
++) {
2999 *(source
->CEpos
++) = *CEOffset
++;
3000 source
->offsetRepeatCount
+= 1;
3002 } else { /* else, we do */
3003 while(*CEOffset
!= 0) {
3004 *(source
->CEpos
++) = *CEOffset
++;
3005 source
->offsetRepeatCount
+= 1;
3014 We do a check to see if we want to collate digits as numbers; if so we generate
3015 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3018 uint32_t i
; /* general counter */
3020 if (source
->coll
->numericCollation
== UCOL_ON
){
3021 collIterateState digitState
= {0,0,0,0,0,0,0,0,0};
3025 uint32_t digIndx
= 0;
3026 uint32_t endIndex
= 0;
3027 uint32_t trailingZeroIndex
= 0;
3029 uint8_t collateVal
= 0;
3031 UBool nonZeroValReached
= FALSE
;
3033 uint8_t numTempBuf
[UCOL_MAX_DIGITS_FOR_NUMBER
/2 + 3]; // I just need a temporary place to store my generated CEs.
3035 We parse the source string until we hit a char that's NOT a digit.
3036 Use this u_charDigitValue. This might be slow because we have to
3037 handle surrogates...
3040 if (U16_IS_LEAD(ch)){
3041 if (!collIter_eos(source)) {
3042 backupState(source, &digitState);
3043 UChar trail = getNextNormalizedChar(source);
3044 if(U16_IS_TRAIL(trail)) {
3045 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3047 loadState(source, &digitState, TRUE);
3056 digVal = u_charDigitValue(char32);
3058 digVal
= u_charDigitValue(cp
); // if we have arrived here, we have
3059 // already processed possible supplementaries that trigered the digit tag -
3060 // all supplementaries are marked in the UCA.
3062 We pad a zero in front of the first element anyways. This takes
3063 care of the (probably) most common case where people are sorting things followed
3068 // Make sure we have enough space. No longer needed;
3069 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3070 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3071 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3073 // Skipping over leading zeroes.
3075 nonZeroValReached
= TRUE
;
3077 if (nonZeroValReached
) {
3079 We parse the digit string into base 100 numbers (this fits into a byte).
3080 We only add to the buffer in twos, thus if we are parsing an odd character,
3081 that serves as the 'tens' digit while the if we are parsing an even one, that
3082 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3083 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3084 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3085 than all the other bytes.
3088 if (digIndx
% 2 == 1){
3089 collateVal
+= (uint8_t)digVal
;
3091 // We don't enter the low-order-digit case unless we've already seen
3092 // the high order, or for the first digit, which is always non-zero.
3093 if (collateVal
!= 0)
3094 trailingZeroIndex
= 0;
3096 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3100 // We drop the collation value into the buffer so if we need to do
3101 // a "front patch" we don't have to check to see if we're hitting the
3103 collateVal
= (uint8_t)(digVal
* 10);
3105 // Check for trailing zeroes.
3106 if (collateVal
== 0)
3108 if (!trailingZeroIndex
)
3109 trailingZeroIndex
= (digIndx
/2) + 2;
3112 trailingZeroIndex
= 0;
3114 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3119 // Get next character.
3120 if (!collIter_eos(source
)){
3121 ch
= getNextNormalizedChar(source
);
3122 if (U16_IS_LEAD(ch
)){
3123 if (!collIter_eos(source
)) {
3124 backupState(source
, &digitState
);
3125 UChar trail
= getNextNormalizedChar(source
);
3126 if(U16_IS_TRAIL(trail
)) {
3127 char32
= U16_GET_SUPPLEMENTARY(ch
, trail
);
3129 loadState(source
, &digitState
, TRUE
);
3137 if ((digVal
= u_charDigitValue(char32
)) == -1 || digIndx
> UCOL_MAX_DIGITS_FOR_NUMBER
){
3138 // Resetting position to point to the next unprocessed char. We
3139 // overshot it when doing our test/set for numbers.
3140 if (char32
> 0xFFFF) { // For surrogates.
3141 loadState(source
, &digitState
, TRUE
);
3142 //goBackOne(source);
3152 if (nonZeroValReached
== FALSE
){
3157 endIndex
= trailingZeroIndex
? trailingZeroIndex
: ((digIndx
/2) + 2) ;
3158 if (digIndx
% 2 != 0){
3160 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3161 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3162 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3163 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3166 for(i
= 2; i
< endIndex
; i
++){
3167 numTempBuf
[i
] = (((((numTempBuf
[i
] - 6)/2) % 10) * 10) +
3168 (((numTempBuf
[i
+1])-6)/2) / 10) * 2 + 6;
3173 // Subtract one off of the last byte.
3174 numTempBuf
[endIndex
-1] -= 1;
3177 We want to skip over the first two slots in the buffer. The first slot
3178 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3179 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3181 numTempBuf
[0] = UCOL_CODAN_PLACEHOLDER
;
3182 numTempBuf
[1] = (uint8_t)(0x80 + ((digIndx
/2) & 0x7F));
3184 // Now transfer the collation key to our collIterate struct.
3185 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3186 //size = ((endIndex+1) & ~1)/2;
3187 CE
= (((numTempBuf
[0] << 8) | numTempBuf
[1]) << UCOL_PRIMARYORDERSHIFT
) | //Primary weight
3188 (UCOL_BYTE_COMMON
<< UCOL_SECONDARYORDERSHIFT
) | // Secondary weight
3189 UCOL_BYTE_COMMON
; // Tertiary weight.
3190 i
= 2; // Reset the index into the buffer.
3193 uint32_t primWeight
= numTempBuf
[i
++] << 8;
3195 primWeight
|= numTempBuf
[i
++];
3196 *(source
->CEpos
++) = (primWeight
<< UCOL_PRIMARYORDERSHIFT
) | UCOL_CONTINUATION_MARKER
;
3200 // no numeric mode, we'll just switch to whatever we stashed and continue
3201 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
3207 /* various implicits optimization */
3208 case IMPLICIT_TAG
: /* everything that is not defined otherwise */
3209 /* UCA is filled with these. Tailorings are NOT_FOUND */
3210 return getImplicit(cp
, source
);
3211 case CJK_IMPLICIT_TAG
: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3212 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3213 return getImplicit(cp
, source
);
3214 case HANGUL_SYLLABLE_TAG
: /* AC00-D7AF*/
3216 static const uint32_t
3217 SBase
= 0xAC00, LBase
= 0x1100, VBase
= 0x1161, TBase
= 0x11A7;
3218 //const uint32_t LCount = 19;
3219 static const uint32_t VCount
= 21;
3220 static const uint32_t TCount
= 28;
3221 //const uint32_t NCount = VCount * TCount; // 588
3222 //const uint32_t SCount = LCount * NCount; // 11172
3223 uint32_t L
= ch
- SBase
;
3225 // divide into pieces
3227 uint32_t T
= L
% TCount
; // we do it in this order since some compilers can do % and / in one operation
3229 uint32_t V
= L
% VCount
;
3238 // return the first CE, but first put the rest into the expansion buffer
3239 if (!source
->coll
->image
->jamoSpecial
) { // FAST PATH
3241 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, V
);
3243 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, T
);
3246 return UTRIE_GET32_FROM_LEAD(&coll
->mapping
, L
);
3248 } else { // Jamo is Special
3249 // Since Hanguls pass the FCD check, it is
3250 // guaranteed that we won't be in
3251 // the normalization buffer if something like this happens
3252 // However, if we are using a uchar iterator and normalization
3253 // is ON, the Hangul that lead us here is going to be in that
3254 // normalization buffer. Here we want to restore the uchar
3255 // iterator state and pull out of the normalization buffer
3256 if(source
->iterator
!= NULL
&& source
->flags
& UCOL_ITER_INNORMBUF
) {
3257 source
->flags
= source
->origFlags
; // restore the iterator
3260 // Move Jamos into normalization buffer
3261 source
->writableBuffer
[0] = (UChar
)L
;
3262 source
->writableBuffer
[1] = (UChar
)V
;
3264 source
->writableBuffer
[2] = (UChar
)T
;
3265 source
->writableBuffer
[3] = 0;
3267 source
->writableBuffer
[2] = 0;
3270 source
->fcdPosition
= source
->pos
; // Indicate where to continue in main input string
3271 // after exhausting the writableBuffer
3272 source
->pos
= source
->writableBuffer
;
3273 source
->origFlags
= source
->flags
;
3274 source
->flags
|= UCOL_ITER_INNORMBUF
;
3275 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
3277 return(UCOL_IGNORABLE
);
3281 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3282 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3283 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3284 /* we return 0 (completely ignorable - per UCA specification) */
3287 collIterateState state
;
3288 backupState(source
, &state
);
3289 if (collIter_eos(source
) || !(U16_IS_TRAIL((trail
= getNextNormalizedChar(source
))))) {
3290 // we could have stepped one char forward and it might have turned out that it
3291 // was not a trail surrogate. In that case, we have to backup.
3292 loadState(source
, &state
, TRUE
);
3295 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3296 CE
= UTRIE_GET32_FROM_OFFSET_TRAIL(&coll
->mapping
, CE
&0xFFFFFF, trail
);
3297 if(CE
== UCOL_NOT_FOUND
) { // there are tailored surrogates in this block, but not this one.
3298 // We need to backup
3299 loadState(source
, &state
, TRUE
);
3302 // calculate the supplementary code point value, if surrogate was not tailored
3303 cp
= ((((uint32_t)ch
)<<10UL)+(trail
)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3307 case LEAD_SURROGATE_TAG
: /* D800-DBFF*/
3309 if( source
->flags
& UCOL_USE_ITERATOR
) {
3310 if(U_IS_TRAIL(nextChar
= (UChar
)source
->iterator
->current(source
->iterator
))) {
3311 cp
= U16_GET_SUPPLEMENTARY(ch
, nextChar
);
3312 source
->iterator
->next(source
->iterator
);
3313 return getImplicit(cp
, source
);
3317 } else if((((source
->flags
& UCOL_ITER_HASLEN
) == 0 ) || (source
->pos
<source
->endp
)) &&
3318 U_IS_TRAIL((nextChar
=*source
->pos
))) {
3319 cp
= U16_GET_SUPPLEMENTARY(ch
, nextChar
);
3321 return getImplicit(cp
, source
);
3323 return 0; /* completely ignorable */
3325 case TRAIL_SURROGATE_TAG
: /* DC00-DFFF*/
3326 return 0; /* broken surrogate sequence */
3328 /* not yet implemented */
3329 /* probably after 1.8 */
3330 return UCOL_NOT_FOUND
;
3332 *status
= U_INTERNAL_PROGRAM_ERROR
;
3336 if (CE
<= UCOL_NOT_FOUND
) break;
3342 /* now uses Mark's getImplicitPrimary code */
/*
 * getPrevImplicit: produces the implicit collation elements for code point
 * cp on the backward-iteration path (it is called from
 * ucol_prv_getSpecialPrevCE).
 *
 * What the visible statements do:
 *  - obtain the implicit primary weight r from uprv_uca_getImplicitPrimary(cp);
 *  - append (r & UCOL_PRIMARYMASK) | 0x00000505 to the CE buffer at
 *    collationSource->CEpos and set toReturn to the advanced CEpos;
 *  - lazily allocate collationSource->offsetBuffer and record source offsets;
 *  - return the CE built from the low 16 bits of r moved into the upper half,
 *    OR-ed with 0x000000C0.
 *
 * @param cp               code point to generate implicit weights for
 * @param collationSource  iterator state; its CE buffer and offset buffer
 *                         fields are modified
 * @return ((r & 0x0000FFFF) << 16) | 0x000000C0
 *
 * NOTE(review): 0x00000505 presumably supplies the common secondary/tertiary
 * bytes and 0x000000C0 presumably is the continuation marker - confirm
 * against the UCOL_* constants.
 * NOTE(review): the result of uprv_malloc is not checked in the lines
 * visible here.
 */
3344 inline uint32_t getPrevImplicit(UChar32 cp
, collIterate
*collationSource
) {
// Implicit primary weight for cp; both CEs below are derived from it.
3349 uint32_t r
= uprv_uca_getImplicitPrimary(cp
);
// Store the CE carrying the masked primary in the CE buffer ...
3351 *(collationSource
->CEpos
++) = (r
& UCOL_PRIMARYMASK
) | 0x00000505;
// ... and make toReturn point just past the CE that was stored.
3352 collationSource
->toReturn
= collationSource
->CEpos
;
// First use: create the offset buffer with the default CE buffer capacity.
3354 if (collationSource
->offsetBuffer
== NULL
) {
3355 collationSource
->offsetBufferSize
= UCOL_EXPAND_CE_BUFFER_SIZE
;
3356 collationSource
->offsetBuffer
= (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE
);
3357 collationSource
->offsetStore
= collationSource
->offsetBuffer
;
3360 // **** doesn't work if using iterator ****
// While reading from the normalization buffer only a repeat count is recorded.
3361 if (collationSource
->flags
& UCOL_ITER_INNORMBUF
) {
3362 collationSource
->offsetRepeatCount
= 1;
// Offset of the current position relative to the start of the source string.
// NOTE(review): presumably this is the else-branch of the test above; the
// brace/else lines are not present in this extract.
3364 int32_t firstOffset
= (int32_t)(collationSource
->pos
- collationSource
->string
);
// Two offsets are stored: firstOffset and firstOffset + 1.
3366 *(collationSource
->offsetStore
++) = firstOffset
;
3367 *(collationSource
->offsetStore
++) = firstOffset
+ 1;
// offsetReturn points at the last offset stored.
3369 collationSource
->offsetReturn
= collationSource
->offsetStore
- 1;
3370 *(collationSource
->offsetBuffer
) = firstOffset
;
// If offsetReturn is already at the buffer start, rewind offsetStore too.
3371 if (collationSource
->offsetReturn
== collationSource
->offsetBuffer
) {
3372 collationSource
->offsetStore
= collationSource
->offsetBuffer
;
// The CE built from the low 16 bits of r is the value handed back.
3376 return ((r
& 0x0000FFFF)<<16) | 0x000000C0;
3380 * This function handles the special CEs like contractions, expansions,
3382 * It is called by both getPrevCE
3384 uint32_t ucol_prv_getSpecialPrevCE(const UCollator
*coll
, UChar ch
, uint32_t CE
,
3385 collIterate
*source
,
3388 const uint32_t *CEOffset
= NULL
;
3389 UChar
*UCharOffset
= NULL
;
3391 const UChar
*constart
= NULL
;
3393 UChar buffer
[UCOL_MAX_BUFFER
];
3394 uint32_t *endCEBuffer
;
3396 int32_t noChars
= 0;
3397 int32_t CECount
= 0;
3401 /* the only CEs that loop are Thai and contractions */
3402 switch (getCETag(CE
))
3404 case NOT_FOUND_TAG
: /* this tag always returns */
3409 // Special processing is getting a CE that is preceded by a certain prefix
3410 // Currently this is only needed for optimizing Japanese length and iteration marks.
3411 // When we encounter a special processing tag, we go backwards and try to see if
3413 // Contraction tables are used - so the whole process is not unlike contraction.
3414 // prefix data is stored backwards in the table.
3415 const UChar
*UCharOffset
;
3417 collIterateState prefixState
;
3418 backupState(source
, &prefixState
);
3420 // This loop will run once per source string character, for as long as we
3421 // are matching a potential contraction sequence
3423 // First we position ourselves at the beginning of contraction sequence
3424 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
3426 if (collIter_bos(source
)) {
3427 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
3430 schar
= getPrevNormalizedChar(source
, status
);
3433 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3437 if (schar
== tchar
) {
3438 // Found the source string char in the table.
3439 // Pick up the corresponding CE from the table.
3440 CE
= *(coll
->contractionCEs
+
3441 (UCharOffset
- coll
->contractionIndex
));
3445 // if there is a completely ignorable code point in the middle of
3446 // a prefix, we need to act as if it's not there
3447 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3448 // lone surrogates cannot be set to zero as it would break other processing
3449 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, schar
);
3450 // it's easy for BMP code points
3453 } else if(U16_IS_TRAIL(schar
) || U16_IS_LEAD(schar
)) {
3454 // for supplementary code points, we have to check the next one
3455 // situations where we are going to ignore
3456 // 1. beginning of the string: schar is a lone surrogate
3457 // 2. schar is a lone surrogate
3458 // 3. schar is a trail surrogate in a valid surrogate sequence
3459 // that is explicitly set to zero.
3460 if (!collIter_bos(source
)) {
3462 if(U16_IS_LEAD(lead
= getPrevNormalizedChar(source
, status
))) {
3463 isZeroCE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, lead
);
3464 if(getCETag(isZeroCE
) == SURROGATE_TAG
) {
3465 uint32_t finalCE
= UTRIE_GET32_FROM_OFFSET_TRAIL(&coll
->mapping
, isZeroCE
&0xFFFFFF, schar
);
3467 // this is a real, assigned completely ignorable code point
3473 // lone surrogate, completely ignorable
3477 // lone surrogate at the beginning, completely ignorable
3481 // Source string char was not in the table.
3482 // We have not found the prefix.
3483 CE
= *(coll
->contractionCEs
+
3484 (ContractionStart
- coll
->contractionIndex
));
3488 // The source string char was in the contraction table, and the corresponding
3489 // CE is not a prefix CE. We found the prefix, break
3490 // out of loop, this CE will end up being returned. This is the normal
3491 // way out of prefix handling when the source actually contained
3496 loadState(source
, &prefixState
, TRUE
);
3500 case CONTRACTION_TAG
:
3501 /* to ensure that the backwards and forwards iteration matches, we
3502 take the current region of most possible match and pass it through
3503 the forward iteration. this will ensure that the obstinate problem of
3504 overlapping contractions will not occur.
3506 schar
= peekCharacter(source
, 0);
3507 constart
= (UChar
*)coll
->image
+ getContractOffset(CE
);
3508 if (isAtStartPrevIterate(source
)
3509 /* commented away contraction end checks after adding the checks
3511 /* start of string or this is not the end of any contraction */
3512 CE
= *(coll
->contractionCEs
+
3513 (constart
- coll
->contractionIndex
));
3517 UCharOffset
= strbuffer
+ (UCOL_MAX_BUFFER
- 1);
3518 *(UCharOffset
--) = 0;
3520 // have to swap thai characters
3521 while (ucol_unsafeCP(schar
, coll
)) {
3522 *(UCharOffset
) = schar
;
3525 schar
= getPrevNormalizedChar(source
, status
);
3527 // TODO: when we exhaust the contraction buffer,
3528 // it needs to get reallocated. The problem is
3529 // that the size depends on the string which is
3530 // not iterated over. However, since we're travelling
3531 // backwards, we already had to set the iterator at
3532 // the end - so we might as well know where we are?
3533 if (UCharOffset
+ 1 == buffer
) {
3534 /* we have exhausted the buffer */
3535 int32_t newsize
= 0;
3536 if(source
->pos
) { // actually dealing with a position
3537 newsize
= source
->pos
- source
->string
+ 1;
3538 } else { // iterator
3539 newsize
= 4 * UCOL_MAX_BUFFER
;
3541 strbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) *
3542 (newsize
+ UCOL_MAX_BUFFER
));
3544 if (strbuffer
== NULL
) {
3545 *status
= U_MEMORY_ALLOCATION_ERROR
;
3546 return UCOL_NO_MORE_CES
;
3548 UCharOffset
= strbuffer
+ newsize
;
3549 uprv_memcpy(UCharOffset
, buffer
,
3550 UCOL_MAX_BUFFER
* sizeof(UChar
));
3553 if ((source
->pos
&& (source
->pos
== source
->string
||
3554 ((source
->flags
& UCOL_ITER_INNORMBUF
) &&
3555 *(source
->pos
- 1) == 0 && source
->fcdPosition
== NULL
)))
3556 || (source
->iterator
&& !source
->iterator
->hasPrevious(source
->iterator
))) {
3560 /* adds the initial base character to the string */
3561 *(UCharOffset
) = schar
;
3567 if (source
->offsetReturn
!= NULL
) {
3568 source
->offsetStore
= source
->offsetReturn
- noChars
;
3571 // **** doesn't work if using iterator ****
3572 if (source
->flags
& UCOL_ITER_INNORMBUF
) {
3573 if (source
->fcdPosition
== NULL
) {
3576 offsetBias
= (int32_t)(source
->fcdPosition
- source
->string
);
3579 offsetBias
= (int32_t)(source
->pos
- source
->string
);
3583 // **** doesn't work if using iterator ****
3584 if (source
->flags
& UCOL_ITER_INNORMBUF
) {
3588 if (source
->fcdPosition
== NULL
) {
3591 offsetBias
= (int32_t)(source
->fcdPosition
- source
->string
);
3595 offsetBias
= (int32_t)(source
->pos
- source
->string
);
3599 /* a new collIterate is used to simplify things, since using the current
3600 collIterate will mean that the forward and backwards iteration will
3601 share and change the same buffers. we don't want to get into that. */
3605 //IInit_collIterate(coll, UCharOffset, -1, &temp);
3606 IInit_collIterate(coll
, UCharOffset
, noChars
, &temp
);
3607 temp
.flags
&= ~UCOL_ITER_NORM
;
3609 rawOffset
= temp
.pos
- temp
.string
; // should always be zero?
3610 CE
= ucol_IGetNextCE(coll
, &temp
, status
);
3612 if (source
->extendCEs
) {
3613 endCEBuffer
= source
->extendCEs
+ source
->extendCEsSize
;
3614 CECount
= (source
->CEpos
- source
->extendCEs
)/sizeof(uint32_t);
3616 endCEBuffer
= source
->CEs
+ UCOL_EXPAND_CE_BUFFER_SIZE
;
3617 CECount
= (source
->CEpos
- source
->CEs
)/sizeof(uint32_t);
3620 if (source
->offsetBuffer
== NULL
) {
3621 source
->offsetBufferSize
= UCOL_EXPAND_CE_BUFFER_SIZE
;
3622 source
->offsetBuffer
= (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE
);
3623 source
->offsetStore
= source
->offsetBuffer
;
3626 while (CE
!= UCOL_NO_MORE_CES
) {
3627 *(source
->CEpos
++) = CE
;
3629 if (offsetBias
>= 0) {
3630 *(source
->offsetStore
++) = rawOffset
+ offsetBias
;
3634 if (source
->CEpos
== endCEBuffer
) {
3635 /* ran out of CE space, reallocate to new buffer.
3636 If reallocation fails, reset pointers and bail out,
3637 there's no guarantee of the right character position after
3639 if (source
->extendCEs
== NULL
) {
3640 source
->extendCEs
= (uint32_t *)uprv_malloc(sizeof(uint32_t) *
3641 (source
->extendCEsSize
=UCOL_EXPAND_CE_BUFFER_SIZE
+ UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE
));
3642 if (source
->extendCEs
== NULL
) {
3643 // Handle error later.
3646 source
->extendCEs
= (uint32_t *)uprv_memcpy(source
->extendCEs
, source
->CEs
, UCOL_EXPAND_CE_BUFFER_SIZE
* sizeof(uint32_t));
3649 uint32_t *tempBufCE
= (uint32_t *)uprv_realloc(source
->extendCEs
,
3650 sizeof(uint32_t) * (source
->extendCEsSize
+= UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE
));
3651 if (tempBufCE
== NULL
) {
3652 // Handle error later.
3656 source
->extendCEs
= tempBufCE
;
3660 if (CECount
== -1) {
3661 *status
= U_MEMORY_ALLOCATION_ERROR
;
3662 source
->extendCEsSize
= 0;
3663 source
->CEpos
= source
->CEs
;
3664 freeHeapWritableBuffer(&temp
);
3666 if (strbuffer
!= buffer
) {
3667 uprv_free(strbuffer
);
3670 return (uint32_t)UCOL_NULLORDER
;
3673 source
->CEpos
= source
->extendCEs
+ CECount
;
3674 endCEBuffer
= source
->extendCEs
+ source
->extendCEsSize
;
3677 if (offsetBias
>= 0 && source
->offsetStore
>= &source
->offsetBuffer
[source
->offsetBufferSize
]) {
3678 int32_t storeIX
= source
->offsetStore
- source
->offsetBuffer
;
3679 int32_t *tob
= (int32_t *) uprv_realloc(source
->offsetBuffer
,
3680 sizeof(int32_t) * (source
->offsetBufferSize
+ UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE
));
3683 source
->offsetBuffer
= tob
;
3684 source
->offsetStore
= &source
->offsetBuffer
[storeIX
];
3685 source
->offsetBufferSize
+= UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE
;
3688 *status
= U_MEMORY_ALLOCATION_ERROR
;
3689 source
->CEpos
= source
->CEs
;
3690 freeHeapWritableBuffer(&temp
);
3692 if (strbuffer
!= buffer
) {
3693 uprv_free(strbuffer
);
3696 return (uint32_t) UCOL_NULLORDER
;
3700 rawOffset
= temp
.pos
- temp
.string
;
3701 CE
= ucol_IGetNextCE(coll
, &temp
, status
);
3704 if (source
->offsetRepeatValue
!= 0) {
3705 if (CECount
> noChars
) {
3706 source
->offsetRepeatCount
+= temp
.offsetRepeatCount
;
3708 // **** does this really skip the right offsets? ****
3709 source
->offsetReturn
-= (noChars
- CECount
);
3713 freeHeapWritableBuffer(&temp
);
3715 if (strbuffer
!= buffer
) {
3716 uprv_free(strbuffer
);
3719 if (offsetBias
>= 0) {
3720 source
->offsetReturn
= source
->offsetStore
- 1;
3721 if (source
->offsetReturn
== source
->offsetBuffer
) {
3722 source
->offsetStore
= source
->offsetBuffer
;
3726 source
->toReturn
= source
->CEpos
- 1;
3727 if (source
->toReturn
== source
->CEs
) {
3728 source
->CEpos
= source
->CEs
;
3731 return *(source
->toReturn
);
3733 case LONG_PRIMARY_TAG
:
3735 *(source
->CEpos
++) = ((CE
& 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON
;
3736 *(source
->CEpos
++) = ((CE
& 0xFF)<<24)|UCOL_CONTINUATION_MARKER
;
3737 source
->toReturn
= source
->CEpos
- 1;
3739 if (source
->offsetBuffer
== NULL
) {
3740 source
->offsetBufferSize
= UCOL_EXPAND_CE_BUFFER_SIZE
;
3741 source
->offsetBuffer
= (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE
);
3742 source
->offsetStore
= source
->offsetBuffer
;
3745 if (source
->flags
& UCOL_ITER_INNORMBUF
) {
3746 source
->offsetRepeatCount
= 1;
3748 int32_t firstOffset
= (int32_t)(source
->pos
- source
->string
);
3750 *(source
->offsetStore
++) = firstOffset
;
3751 *(source
->offsetStore
++) = firstOffset
+ 1;
3753 source
->offsetReturn
= source
->offsetStore
- 1;
3754 *(source
->offsetBuffer
) = firstOffset
;
3755 if (source
->offsetReturn
== source
->offsetBuffer
) {
3756 source
->offsetStore
= source
->offsetBuffer
;
3761 return *(source
->toReturn
);
3764 case EXPANSION_TAG
: /* this tag always returns */
3767 This should handle expansion.
3768 NOTE: we can encounter both continuations and expansions in an expansion!
3769 I have to decide where continuations are going to be dealt with
3771 int32_t firstOffset
= (int32_t)(source
->pos
- source
->string
);
3773 // **** doesn't work if using iterator ****
3774 if (source
->offsetReturn
!= NULL
) {
3775 if (! (source
->flags
& UCOL_ITER_INNORMBUF
) && source
->offsetReturn
== source
->offsetBuffer
) {
3776 source
->offsetStore
= source
->offsetBuffer
;
3782 if (source
->offsetBuffer
== NULL
) {
3783 source
->offsetBufferSize
= UCOL_EXPAND_CE_BUFFER_SIZE
;
3784 source
->offsetBuffer
= (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE
);
3785 source
->offsetStore
= source
->offsetBuffer
;
3788 /* find the offset to expansion table */
3789 CEOffset
= (uint32_t *)coll
->image
+ getExpansionOffset(CE
);
3790 size
= getExpansionCount(CE
);
3793 if there are less than 16 elements in expansion, we don't terminate
3797 for (count
= 0; count
< size
; count
++) {
3798 *(source
->CEpos
++) = *CEOffset
++;
3800 if (firstOffset
>= 0) {
3801 *(source
->offsetStore
++) = firstOffset
+ 1;
3806 while (*CEOffset
!= 0) {
3807 *(source
->CEpos
++) = *CEOffset
++;
3809 if (firstOffset
>= 0) {
3810 *(source
->offsetStore
++) = firstOffset
+ 1;
3815 if (firstOffset
>= 0) {
3816 source
->offsetReturn
= source
->offsetStore
- 1;
3817 *(source
->offsetBuffer
) = firstOffset
;
3818 if (source
->offsetReturn
== source
->offsetBuffer
) {
3819 source
->offsetStore
= source
->offsetBuffer
;
3822 source
->offsetRepeatCount
+= size
- 1;
3825 source
->toReturn
= source
->CEpos
- 1;
3826 // in case of one element expansion, we
3827 // want to immediately return CEpos
3828 if(source
->toReturn
== source
->CEs
) {
3829 source
->CEpos
= source
->CEs
;
3832 return *(source
->toReturn
);
3838 We do a check to see if we want to collate digits as numbers; if so we generate
3839 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3842 uint32_t i
; /* general counter */
3844 if (source
->coll
->numericCollation
== UCOL_ON
){
3845 uint32_t digIndx
= 0;
3846 uint32_t endIndex
= 0;
3847 uint32_t leadingZeroIndex
= 0;
3848 uint32_t trailingZeroCount
= 0;
3850 uint8_t collateVal
= 0;
3852 UBool nonZeroValReached
= FALSE
;
3854 uint8_t numTempBuf
[UCOL_MAX_DIGITS_FOR_NUMBER
/2 + 2]; // I just need a temporary place to store my generated CEs.
3856 We parse the source string until we hit a char that's NOT a digit.
3857 Use this u_charDigitValue. This might be slow because we have to
3858 handle surrogates...
3861 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3862 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3863 element we process when going backward. To determine how long that chunk might be, we may need to make
3864 two passes through the loop that collects digits - one to see how long the string is (and how much is
3865 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3866 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3867 element chunk after resetting the state to the initialState at the right side of the digit string.
3869 uint32_t ceLimit
= 0;
3870 UChar initial_ch
= ch
;
3871 collIterateState initialState
= {0,0,0,0,0,0,0,0,0};
3872 backupState(source
, &initialState
);
3875 collIterateState state
= {0,0,0,0,0,0,0,0,0};
3879 if (U16_IS_TRAIL (ch
)) {
3880 if (!collIter_bos(source
)){
3881 UChar lead
= getPrevNormalizedChar(source
, status
);
3882 if(U16_IS_LEAD(lead
)) {
3883 char32
= U16_GET_SUPPLEMENTARY(lead
,ch
);
3894 digVal
= u_charDigitValue(char32
);
3897 // Make sure we have enough space. No longer needed;
3898 // at this point the largest value of digIndx when we need to save data in numTempBuf
3899 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3900 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3902 // Skip over trailing zeroes, and keep a count of them.
3904 nonZeroValReached
= TRUE
;
3906 if (nonZeroValReached
) {
3908 We parse the digit string into base 100 numbers (this fits into a byte).
3909 We only add to the buffer in twos, thus if we are parsing an odd character,
3910 that serves as the 'tens' digit, while if we are parsing an even one, that
3911 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3912 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3913 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3914 than all the other bytes.
3916 Since we're doing this in reverse, we want to put the first digit encountered into the
3917 ones place and the second digit encountered into the tens place.
3920 if ((digIndx
+ trailingZeroCount
) % 2 == 1) {
3921 // High-order digit case (tens place)
3922 collateVal
+= (uint8_t)(digVal
* 10);
3924 // We cannot set leadingZeroIndex unless it has been set for the
3925 // low-order digit. Therefore, all we can do for the high-order
3926 // digit is turn it off, never on.
3927 // The only time we will have a high digit without a low is for
3928 // the very first non-zero digit, so no zero check is necessary.
3929 if (collateVal
!= 0)
3930 leadingZeroIndex
= 0;
3932 // The first pass through, digIndx may exceed the limit, but in that case
3933 // we no longer care about numTempBuf contents since they will be discarded
3934 if ( digIndx
< UCOL_MAX_DIGITS_FOR_NUMBER
) {
3935 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3939 // Low-order digit case (ones place)
3940 collateVal
= (uint8_t)digVal
;
3942 // Check for leading zeroes.
3943 if (collateVal
== 0) {
3944 if (!leadingZeroIndex
)
3945 leadingZeroIndex
= (digIndx
/2) + 2;
3947 leadingZeroIndex
= 0;
3949 // No need to write to buffer; the case of a last odd digit
3950 // is handled below.
3954 ++trailingZeroCount
;
3956 if (!collIter_bos(source
)) {
3957 ch
= getPrevNormalizedChar(source
, status
);
3958 //goBackOne(source);
3959 if (U16_IS_TRAIL(ch
)) {
3960 backupState(source
, &state
);
3961 if (!collIter_bos(source
)) {
3963 UChar lead
= getPrevNormalizedChar(source
, status
);
3965 if(U16_IS_LEAD(lead
)) {
3966 char32
= U16_GET_SUPPLEMENTARY(lead
,ch
);
3968 loadState(source
, &state
, FALSE
);
3975 if ((digVal
= u_charDigitValue(char32
)) == -1 || (ceLimit
> 0 && (digIndx
+ trailingZeroCount
) >= ceLimit
)) {
3976 if (char32
> 0xFFFF) {// For surrogates.
3977 loadState(source
, &state
, FALSE
);
3979 // Don't need to "reverse" the goBackOne call,
3980 // as this points to the next position to process..
3981 //if (char32 > 0xFFFF) // For surrogates.
3982 //getNextNormalizedChar(source);
3991 if (digIndx
+ trailingZeroCount
<= UCOL_MAX_DIGITS_FOR_NUMBER
) {
3992 // our collation element is not too big, go ahead and finish with it
3995 // our digit string is too long for a collation element;
3996 // set the limit for it, reset the state and begin again
3997 ceLimit
= (digIndx
+ trailingZeroCount
) % UCOL_MAX_DIGITS_FOR_NUMBER
;
3998 if ( ceLimit
== 0 ) {
3999 ceLimit
= UCOL_MAX_DIGITS_FOR_NUMBER
;
4002 loadState(source
, &initialState
, FALSE
);
4003 digIndx
= endIndex
= leadingZeroIndex
= trailingZeroCount
= 0;
4005 nonZeroValReached
= FALSE
;
4008 if (! nonZeroValReached
) {
4010 trailingZeroCount
= 0;
4014 if ((digIndx
+ trailingZeroCount
) % 2 != 0) {
4015 numTempBuf
[((digIndx
)/2) + 2] = collateVal
*2 + 6;
4016 digIndx
+= 1; // The implicit leading zero
4018 if (trailingZeroCount
% 2 != 0) {
4019 // We had to consume one trailing zero for the low digit
4020 // of the least significant byte
4021 digIndx
+= 1; // The trailing zero not in the exponent
4022 trailingZeroCount
-= 1;
4025 endIndex
= leadingZeroIndex
? leadingZeroIndex
: ((digIndx
/2) + 2) ;
4027 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
4031 We want to skip over the first two slots in the buffer. The first slot
4032 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
4033 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
4034 The exponent must be adjusted by the number of leading zeroes, and the number of
4037 numTempBuf
[0] = UCOL_CODAN_PLACEHOLDER
;
4038 uint32_t exponent
= (digIndx
+trailingZeroCount
)/2;
4039 if (leadingZeroIndex
)
4040 exponent
-= ((digIndx
/2) + 2 - leadingZeroIndex
);
4041 numTempBuf
[1] = (uint8_t)(0x80 + (exponent
& 0x7F));
4043 // Now transfer the collation key to our collIterate struct.
4044 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
4045 //size = ((endIndex+1) & ~1)/2;
4046 *(source
->CEpos
++) = (((numTempBuf
[0] << 8) | numTempBuf
[1]) << UCOL_PRIMARYORDERSHIFT
) | //Primary weight
4047 (UCOL_BYTE_COMMON
<< UCOL_SECONDARYORDERSHIFT
) | // Secondary weight
4048 UCOL_BYTE_COMMON
; // Tertiary weight.
4049 i
= endIndex
- 1; // Reset the index into the buffer.
4051 uint32_t primWeight
= numTempBuf
[i
--] << 8;
4053 primWeight
|= numTempBuf
[i
--];
4054 *(source
->CEpos
++) = (primWeight
<< UCOL_PRIMARYORDERSHIFT
) | UCOL_CONTINUATION_MARKER
;
4057 source
->toReturn
= source
->CEpos
-1;
4058 return *(source
->toReturn
);
4060 CEOffset
= (uint32_t *)coll
->image
+ getExpansionOffset(CE
);
4066 case HANGUL_SYLLABLE_TAG
: /* AC00-D7AF*/
4068 static const uint32_t
4069 SBase
= 0xAC00, LBase
= 0x1100, VBase
= 0x1161, TBase
= 0x11A7;
4070 //const uint32_t LCount = 19;
4071 static const uint32_t VCount
= 21;
4072 static const uint32_t TCount
= 28;
4073 //const uint32_t NCount = VCount * TCount; /* 588 */
4074 //const uint32_t SCount = LCount * NCount; /* 11172 */
4076 uint32_t L
= ch
- SBase
;
4079 we do it in this order since some compilers can do % and / in one
4082 uint32_t T
= L
% TCount
;
4084 uint32_t V
= L
% VCount
;
4092 if (source
->offsetBuffer
== NULL
) {
4093 source
->offsetBufferSize
= UCOL_EXPAND_CE_BUFFER_SIZE
;
4094 source
->offsetBuffer
= (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE
);
4095 source
->offsetStore
= source
->offsetBuffer
;
4098 int32_t firstOffset
= (int32_t)(source
->pos
- source
->string
);
4100 *(source
->offsetStore
++) = firstOffset
;
4103 * return the first CE, but first put the rest into the expansion buffer
4105 if (!source
->coll
->image
->jamoSpecial
) {
4106 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, L
);
4107 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, V
);
4108 *(source
->offsetStore
++) = firstOffset
+ 1;
4111 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, T
);
4112 *(source
->offsetStore
++) = firstOffset
+ 1;
4115 source
->toReturn
= source
->CEpos
- 1;
4117 source
->offsetReturn
= source
->offsetStore
- 1;
4118 if (source
->offsetReturn
== source
->offsetBuffer
) {
4119 source
->offsetStore
= source
->offsetBuffer
;
4122 return *(source
->toReturn
);
4124 // Since Hanguls pass the FCD check, it is
4125 // guaranteed that we won't be in
4126 // the normalization buffer if something like this happens
4127 // Move Jamos into normalization buffer
4129 Move the Jamos into the
4130 normalization buffer
4132 UChar
*tempbuffer
= source
->writableBuffer
+
4133 (source
->writableBufSize
- 1);
4136 *(tempbuffer
- 1) = (UChar
)T
;
4137 *(tempbuffer
- 2) = (UChar
)V
;
4138 *(tempbuffer
- 3) = (UChar
)L
;
4139 *(tempbuffer
- 4) = 0;
4141 *(tempbuffer
- 1) = (UChar
)V
;
4142 *(tempbuffer
- 2) = (UChar
)L
;
4143 *(tempbuffer
- 3) = 0;
4147 Indicate where to continue in main input string after exhausting
4150 if (source
->pos
== source
->string
) {
4151 source
->fcdPosition
= NULL
;
4153 source
->fcdPosition
= source
->pos
-1;
4156 source
->pos
= tempbuffer
;
4157 source
->origFlags
= source
->flags
;
4158 source
->flags
|= UCOL_ITER_INNORMBUF
;
4159 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
4161 return(UCOL_IGNORABLE
);
4165 case IMPLICIT_TAG
: /* everything that is not defined otherwise */
4167 if (source
->offsetBuffer
== NULL
) {
4168 source
->offsetBufferSize
= UCOL_EXPAND_CE_BUFFER_SIZE
;
4169 source
->offsetBuffer
= (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE
);
4170 source
->offsetStore
= source
->offsetBuffer
;
4173 // **** doesn't work if using iterator ****
4174 if (source
->flags
& UCOL_ITER_INNORMBUF
) {
4175 source
->offsetRepeatCount
= 1;
4177 int32_t firstOffset
= (int32_t)(source
->pos
- source
->string
);
4179 *(source
->offsetStore
++) = firstOffset
;
4180 *(source
->offsetStore
++) = firstOffset
+ 1;
4182 source
->offsetReturn
= source
->offsetStore
- 1;
4183 if (source
->offsetReturn
== source
->offsetBuffer
) {
4184 source
->offsetStore
= source
->offsetBuffer
;
4189 return getPrevImplicit(ch
, source
);
4191 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4192 case CJK_IMPLICIT_TAG
: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4193 return getPrevImplicit(ch
, source
);
4195 case SURROGATE_TAG
: /* This is a surrogate pair */
4196 /* essentially an engaged lead surrogate. */
4197 /* if you have encountered it here, it means that a */
4198 /* broken sequence was encountered and this is an error */
4201 case LEAD_SURROGATE_TAG
: /* D800-DBFF*/
4202 return 0; /* broken surrogate sequence */
4204 case TRAIL_SURROGATE_TAG
: /* DC00-DFFF*/
4209 if (isAtStartPrevIterate(source
)) {
4210 /* we are at the start of the string, wrong place to be at */
4213 if (source
->pos
!= source
->writableBuffer
) {
4214 prev
= source
->pos
- 1;
4216 prev
= source
->fcdPosition
;
4220 /* Handles Han and Supplementary characters here.*/
4221 if (U16_IS_LEAD(prevChar
)) {
4222 cp
= ((((uint32_t)prevChar
)<<10UL)+(ch
)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4225 return 0; /* completely ignorable */
4228 return getPrevImplicit(cp
, source
);
4231 /* UCA is filled with these. Tailorings are NOT_FOUND */
4232 /* not yet implemented */
4233 case CHARSET_TAG
: /* this tag always returns */
4234 /* probably after 1.8 */
4235 return UCOL_NOT_FOUND
;
4237 default: /* this tag always returns */
4238 *status
= U_INTERNAL_PROGRAM_ERROR
;
4243 if (CE
<= UCOL_NOT_FOUND
) {
4251 /* This should really be a macro */
4252 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
/*
 * reallocateBuffer: obtains newSize bytes of storage for a byte buffer and
 * repositions the caller's write pointer into the new storage.
 *
 * @param secondaries  in/out: current write position inside the buffer; on
 *                     the path visible here it is set to the same offset
 *                     inside the new storage
 * @param secStart     start of the current buffer
 * @param second       buffer compared against secStart: when they are equal
 *                     fresh storage comes from uprv_malloc and the bytes
 *                     written so far are copied over; otherwise uprv_realloc
 *                     is applied to secStart (presumably `second` is the
 *                     caller's stack buffer, which cannot be realloc'ed -
 *                     confirm against callers)
 * @param secSize      NOTE(review): not referenced in the lines visible here
 * @param newSize      requested size in bytes
 * @param status       set to U_MEMORY_ALLOCATION_ERROR when either
 *                     allocation returns NULL
 *
 * NOTE(review): the return statements are not present in this extract, so
 * the returned pointer on each path must be confirmed against the full file.
 */
4255 uint8_t *reallocateBuffer(uint8_t **secondaries
, uint8_t *secStart
, uint8_t *second
, uint32_t *secSize
, uint32_t newSize
, UErrorCode
*status
) {
// NOTE(review): presumably debug-only tracing; any guarding #ifdef is not
// visible in this extract.
4257 fprintf(stderr
, ".");
4259 uint8_t *newStart
= NULL
;
// Number of bytes already written; used to reposition *secondaries afterwards.
4260 uint32_t offset
= *secondaries
-secStart
;
// secStart == second: allocate fresh storage and copy, instead of realloc.
4262 if(secStart
==second
) {
4263 newStart
=(uint8_t*)uprv_malloc(newSize
);
4264 if(newStart
==NULL
) {
4265 *status
= U_MEMORY_ALLOCATION_ERROR
;
// Carry over the bytes written so far.
4268 uprv_memcpy(newStart
, secStart
, *secondaries
-secStart
);
// Otherwise resize the existing allocation.
4270 newStart
=(uint8_t*)uprv_realloc(secStart
, newSize
);
4271 if(newStart
==NULL
) {
4272 *status
= U_MEMORY_ALLOCATION_ERROR
;
4273 /* Since we're reallocating, return original reference so we don't lose it. */
// Point the caller's cursor at the same offset inside the new storage.
4277 *secondaries
=newStart
+offset
;
4283 /* This should really be a macro */
4284 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
4285 /* secondaries in French */
4287 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
/*
 * Macro form of the buffer-reversal helper. It loops while (start) < (end);
 * the one body statement visible here copies *(end) into *(start) and
 * advances (start). The arguments are expanded several times and (start) is
 * modified, so they must be plain pointer lvalues without side effects.
 * NOTE(review): the remaining lines of the macro body (including any use of
 * TYPE) are not present in this extract.
 */
4297 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \
4299 while((start)<(end)) { \
4301 *(start)++ = *(end); \
4306 /****************************************************************************/
4307 /* Following are the sortkey generation functions */
4309 /****************************************************************************/
4312 * Merge two sort keys.
4313 * This is useful, for example, to combine sort keys from first and last names
4314 * to sort such pairs.
4315 * Merged sort keys consider on each collation level the first part first entirely,
4316 * then the second one.
4317 * It is possible to merge multiple sort keys by consecutively merging
4318 * another one with the intermediate result.
4320 * The length of the merge result is the sum of the lengths of the input sort keys
4323 * @param src1 the first sort key
4324 * @param src1Length the length of the first sort key, including the zero byte at the end;
4325 * can be -1 if the function is to find the length
4326 * @param src2 the second sort key
4327 * @param src2Length the length of the second sort key, including the zero byte at the end;
4328 * can be -1 if the function is to find the length
4329 * @param dest the buffer where the merged sort key is written,
4330 * can be NULL if destCapacity==0
4331 * @param destCapacity the number of bytes in the dest buffer
4332 * @return the length of the merged sort key, src1Length+src2Length-1;
4333 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
4334 * in which case the contents of dest are undefined
/*
 * Implementation of the sort key merge described in the comment block that
 * precedes this function.
 * NOTE(review): this listing omits several statements of the body (local
 * declarations, the byte copies inside the loops, the return statements).
 * The comments below describe only what the visible lines show.
 */
4338 U_CAPI
int32_t U_EXPORT2
4339 ucol_mergeSortkeys(const uint8_t *src1
, int32_t src1Length
,
4340 const uint8_t *src2
, int32_t src2Length
,
4341 uint8_t *dest
, int32_t destCapacity
) {
4345 /* check arguments */
/* A positive length must include a terminating zero byte as its last byte. */
/* NOTE(review): lengths below -2 are rejected, so -2 passes this check although only -1 is documented - confirm whether that is intended. */
4346 if( src1
==NULL
|| src1Length
<-2 || src1Length
==0 || (src1Length
>0 && src1
[src1Length
-1]!=0) ||
4347 src2
==NULL
|| src2Length
<-2 || src2Length
==0 || (src2Length
>0 && src2
[src2Length
-1]!=0) ||
4348 destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)
4350 /* error, attempt to write a zero byte and return 0 */
4351 if(dest
!=NULL
&& destCapacity
>0) {
4357 /* check lengths and capacity */
/* Lengths computed here are uprv_strlen()+1, i.e. they count the terminating zero byte. */
/* NOTE(review): the conditions guarding these two assignments are not visible; presumably they apply when a negative length was passed - confirm. */
4359 src1Length
=(int32_t)uprv_strlen((const char *)src1
)+1;
4362 src2Length
=(int32_t)uprv_strlen((const char *)src2
)+1;
/* Both input lengths include a terminator, the merged key needs only one: hence the -1. */
4365 destLength
=src1Length
+src2Length
-1;
4366 if(destLength
>destCapacity
) {
4367 /* the merged sort key does not fit into the destination */
4371 /* merge the sort keys with the same number of levels */
4372 while(*src1
!=0 && *src2
!=0) { /* while both have another level */
4373 /* copy level from src1 not including 00 or 01 */
4374 while((b
=*src1
)>=2) {
4379 /* add a 02 merge separator */
4382 /* copy level from src2 not including 00 or 01 */
4383 while((b
=*src2
)>=2) {
4388 /* if both sort keys have another level, then add a 01 level separator and continue */
4389 if(*src1
==1 && *src2
==1) {
4397 * here, at least one sort key is finished now, but the other one
4398 * might have some contents left from containing more levels;
4399 * that contents is just appended to the result
4402 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4405 /* append src2, "the other, unfinished sort key" */
/* uprv_strcpy stops at the first zero byte of src2, so the copy relies on the key being zero-terminated. */
4406 uprv_strcpy((char *)dest
, (const char *)src2
);
4408 /* trust that neither sort key contained illegally embedded zero bytes */
/*
 * Public C API entry point that produces the sort key for `source` by calling
 * the collator's sortKeyGen function pointer with the allocation flag FALSE.
 *
 * coll         - collator whose sortKeyGen function pointer is invoked.
 * source       - text to generate the key for; tested against NULL before use.
 * sourceLength - length of source; -1 is resolved with u_strlen for the trace
 *                output and is otherwise passed through unchanged.
 * resultLength - passed through to sortKeyGen.
 *
 * NOTE(review): the declaration of `result` and the return statement are not
 * visible in this listing. `result` is passed by address to sortKeyGen.
 * The UErrorCode used here is a local variable: it is handed to sortKeyGen and
 * to UTRACE_EXIT_STATUS, and the visible signature has no way to report it
 * to the caller.
 */
4413 U_CAPI
int32_t U_EXPORT2
4414 ucol_getSortKey(const UCollator
*coll
,
4415 const UChar
*source
,
4416 int32_t sourceLength
,
4418 int32_t resultLength
)
4420 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY
);
4421 if (UTRACE_LEVEL(UTRACE_VERBOSE
)) {
4422 UTRACE_DATA3(UTRACE_VERBOSE
, "coll=%p, source string = %vh ", coll
, source
,
4423 ((sourceLength
==-1 && source
!=NULL
) ? u_strlen(source
) : sourceLength
));
4426 UErrorCode status
= U_ZERO_ERROR
;
/* keySize starts at 0 and is only assigned from the sortKeyGen call below. */
4427 int32_t keySize
= 0;
4429 if(source
!= NULL
) {
4430 // source == NULL is actually an error situation, but we would need to
4431 // have an error code to return it. Until we introduce a new
4432 // API, it stays like this
4434 /* this uses the function pointer that is set in updateinternalstate */
4435 /* currently, there are two funcs: */
4436 /*ucol_calcSortKey(...);*/
4437 /*ucol_calcSortKeySimpleTertiary(...);*/
4439 keySize
= coll
->sortKeyGen(coll
, source
, sourceLength
, &result
, resultLength
, FALSE
, &status
);
4440 //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) {
4441 // That's not good. Something unusual happened.
4442 // We don't know how much we initialized before we failed.
4443 // NULL terminate for safety.
4444 // We have no way say that we have generated a partial sort key.
4449 UTRACE_DATA2(UTRACE_VERBOSE
, "Sort Key = %vb", result
, keySize
);
4450 UTRACE_EXIT_STATUS(status
);
4454 /* this function is called by the C++ API for sortkey generation */
/*
 * Thin wrapper around the collator's sortKeyGen function pointer: forwards
 * coll, source and sourceLength, passes pResult as the result pointer, 0 as
 * the result length and TRUE as the allocation flag, and returns whatever
 * sortKeyGen returns. Errors are reported through pErrorCode by sortKeyGen.
 * NOTE(review): the return type line and the declaration of pResult are not
 * visible in this listing.
 */
4456 ucol_getSortKeyWithAllocation(const UCollator
*coll
,
4457 const UChar
*source
, int32_t sourceLength
,
4459 UErrorCode
*pErrorCode
) {
4461 return coll
->sortKeyGen(coll
, source
, sourceLength
, pResult
, 0, TRUE
, pErrorCode
);
4464 #define UCOL_FSEC_BUF_SIZE 256
4466 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
4467 /* or if we run out of space while making a sortkey and want to return ASAP */
/*
 * Computes the size of a sort key without writing the key itself: collation
 * elements are fetched from `s` with ucol_IGetNextCE and the number of bytes
 * each level would contribute is added to currentSize.
 *
 * coll        - collator supplying the attribute settings read below.
 * s           - collation iterator positioned by the caller.
 * currentSize - running size; callers in this file pass the size they have
 *               already accounted for.
 * strength    - collation strength used to decide which levels are counted.
 * len         - passed together with s->string to u_lengthOfIdenticalLevelRun.
 *
 * NOTE(review): this listing omits many lines of the body (the loop header,
 * several counter updates, else branches, the return statement). The comments
 * below describe only what the visible lines show.
 */
4468 int32_t ucol_getSortKeySize(const UCollator
*coll
, collIterate
*s
, int32_t currentSize
, UColAttributeValue strength
, int32_t len
) {
4469 UErrorCode status
= U_ZERO_ERROR
;
4470 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
/* Each compare* value is 0 when its level is active and 0xFF otherwise, so a later "byte > compare*" test can never succeed for an inactive level. */
4471 uint8_t compareSec
= (uint8_t)((strength
>= UCOL_SECONDARY
)?0:0xFF);
4472 uint8_t compareTer
= (uint8_t)((strength
>= UCOL_TERTIARY
)?0:0xFF);
4473 uint8_t compareQuad
= (uint8_t)((strength
>= UCOL_QUATERNARY
)?0:0xFF);
4474 UBool compareIdent
= (strength
== UCOL_IDENTICAL
);
4475 UBool doCase
= (coll
->caseLevel
== UCOL_ON
);
4476 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
4477 //UBool qShifted = shifted && (compareQuad == 0);
4478 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && (compareQuad
== 0);
4479 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && (compareSec
== 0);
/* Secondary bytes are collected here; the buffer starts on the stack and fSecs is compared against fSecsBuff later to tell whether it has been moved to the heap. */
4480 uint8_t fSecsBuff
[UCOL_FSEC_BUF_SIZE
];
4481 uint8_t *fSecs
= fSecsBuff
;
4482 uint32_t fSecsLen
= 0, fSecsMaxLen
= UCOL_FSEC_BUF_SIZE
;
4483 uint8_t *frenchStartPtr
= NULL
, *frenchEndPtr
= NULL
;
4485 uint32_t variableTopValue
= coll
->variableTopValue
;
/* Despite the upper-case names, UCOL_COMMON_BOT4 and UCOL_BOT_COUNT4 are local variables derived from the collator's variableTopValue. */
4486 uint8_t UCOL_COMMON_BOT4
= (uint8_t)((coll
->variableTopValue
>>8)+1);
4489 /* allocate one more space for hiragana */
4491 uint8_t UCOL_BOT_COUNT4
= (uint8_t)(0xFF - UCOL_COMMON_BOT4
);
4493 uint32_t order
= UCOL_NO_MORE_CES
;
4494 uint8_t primary1
= 0;
4495 uint8_t primary2
= 0;
4496 uint8_t secondary
= 0;
4497 uint8_t tertiary
= 0;
4498 int32_t caseShift
= 0;
4499 uint32_t c2
= 0, c3
= 0, c4
= 0; /* variables for compression */
4501 uint8_t caseSwitch
= coll
->caseSwitch
;
4502 uint8_t tertiaryMask
= coll
->tertiaryMask
;
4503 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
4505 UBool wasShifted
= FALSE
;
4506 UBool notIsContinuation
= FALSE
;
4507 uint8_t leadPrimary
= 0;
/* Fetch the next collation element; UCOL_NO_MORE_CES marks the end of the input. */
4511 order
= ucol_IGetNextCE(coll
, s
, &status
);
4512 if(order
== UCOL_NO_MORE_CES
) {
4520 notIsContinuation
= !isContinuation(order
);
/* Split the 32-bit element into its bytes. `order` itself is shifted right by 8 twice in the process, so afterwards it holds only the upper 16 bits. */
4523 if(notIsContinuation
) {
4524 tertiary
= (uint8_t)((order
& UCOL_BYTE_SIZE_MASK
));
4526 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
4528 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4529 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4530 primary1
= (uint8_t)(order
>> 8);
/* The comparison with variableTopValue below uses the already shifted `order`, i.e. the upper 16 bits of the element. */
4533 if(shifted
&& ((notIsContinuation
&& order
<= variableTopValue
&& primary1
> 0)
4534 || (!notIsContinuation
&& wasShifted
))
4535 || (wasShifted
&& primary1
== 0)) { /* amendment to the UCA says that primary ignorables */
4536 /* and other ignorables should be removed if following a shifted code point */
4537 if(primary1
== 0) { /* if we were shifted and we got an ignorable code point */
4538 /* we should just completely ignore it */
4541 if(compareQuad
== 0) {
4543 currentSize
+= (c2
/UCOL_BOT_COUNT4
)+1;
4554 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4555 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4556 /* calculate sortkey size */
4557 if(primary1
!= UCOL_IGNORABLE
) {
4558 if(notIsContinuation
) {
4559 if(leadPrimary
== primary1
) {
4562 if(leadPrimary
!= 0) {
4565 if(primary2
== UCOL_IGNORABLE
) {
4566 /* one byter, not compressed */
4570 else if(primary1
<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
||
4571 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4572 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4573 (primary1
> maxRegularPrimary
&& primary1
< minImplicitPrimary
))
4575 /* not compressible */
4579 else { /* compress */
4580 leadPrimary
= primary1
;
4584 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4586 if(primary2
!= UCOL_IGNORABLE
) {
4592 if(secondary
> compareSec
) { /* I think that != 0 test should be != IGNORABLE */
4594 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
4598 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4599 currentSize
+= (c2
/(uint32_t)UCOL_TOP_COUNT2
)+1;
4601 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+1;
/* Secondary bytes are appended to fSecs; when the buffer is full it is grown to twice the current length, with uprv_malloc while still on the stack buffer and uprv_realloc afterwards. */
/* NOTE(review): the copy of the existing bytes after uprv_malloc and the update of fSecs/fSecsMaxLen are not visible in this listing. */
4608 fSecs
[fSecsLen
++] = secondary
;
4609 if(fSecsLen
== fSecsMaxLen
) {
4611 if(fSecs
== fSecsBuff
) {
4612 fSecsTemp
= (uint8_t *)uprv_malloc(2*fSecsLen
);
4614 fSecsTemp
= (uint8_t *)uprv_realloc(fSecs
, 2*fSecsLen
);
4616 if(fSecsTemp
== NULL
) {
4617 status
= U_MEMORY_ALLOCATION_ERROR
;
4623 if(notIsContinuation
) {
4624 if (frenchStartPtr
!= NULL
) {
4625 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4626 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4627 frenchStartPtr
= NULL
;
4630 if (frenchStartPtr
== NULL
) {
4631 frenchStartPtr
= fSecs
+fSecsLen
-2;
4633 frenchEndPtr
= fSecs
+fSecsLen
-1;
4638 if(doCase
&& (primary1
> 0 || strength
>= UCOL_SECONDARY
)) {
4639 // do the case level if we need to do it. We don't want to calculate
4640 // case level for primary ignorables if we have only primary strength and case level
4641 // otherwise we would break well formedness of CEs
4642 if (caseShift
== 0) {
4644 caseShift
= UCOL_CASE_SHIFT_START
;
4646 if((tertiary
&0x3F) > 0 && notIsContinuation
) {
4648 if((tertiary
&0xC0) != 0) {
4649 if (caseShift
== 0) {
4651 caseShift
= UCOL_CASE_SHIFT_START
;
4657 if(notIsContinuation
) {
4658 tertiary
^= caseSwitch
;
4662 tertiary
&= tertiaryMask
;
4663 if(tertiary
> compareTer
) { /* I think that != 0 test should be != IGNORABLE */
4664 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
4668 if((tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
)
4669 || (tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
)) {
4670 currentSize
+= (c3
/(uint32_t)coll
->tertiaryTopCount
)+1;
4672 currentSize
+= (c3
/(uint32_t)coll
->tertiaryBottomCount
)+1;
4680 if(/*qShifted*/(compareQuad
==0) && notIsContinuation
) {
4681 if(s
->flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
4682 if(c4
>0) { // Close this part
4683 currentSize
+= (c4
/UCOL_BOT_COUNT4
)+1;
4686 currentSize
++; // Add the Hiragana
4687 } else { // This wasn't Hiragana, so we can continue adding stuff
/* From here on the pending compression counters are flushed: each adds count/limit bytes, plus one more when there is a remainder. */
4696 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4700 if(frenchStartPtr
!= NULL
) {
4701 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
/* The collected secondaries are read from the last byte to the first. */
4703 for(i
= 0; i
<fSecsLen
; i
++) {
4704 secondary
= *(fSecs
+fSecsLen
-i
-1);
4705 /* This is compression code. */
4706 if (secondary
== UCOL_COMMON2
) {
4710 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4711 currentSize
+= (c2
/(uint32_t)UCOL_TOP_COUNT2
)+((c2
%(uint32_t)UCOL_TOP_COUNT2
!= 0)?1:0);
4713 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4721 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
/* fSecs differing from fSecsBuff means the buffer was moved to the heap. NOTE(review): the statement guarded by this test is not visible; presumably it releases the heap buffer - confirm. */
4723 if(fSecs
!= fSecsBuff
) {
4729 currentSize
+= (c3
/(uint32_t)coll
->tertiaryBottomCount
) + ((c3
%(uint32_t)coll
->tertiaryBottomCount
!= 0)?1:0);
4732 if(c4
> 0 && compareQuad
== 0) {
4733 currentSize
+= (c4
/(uint32_t)UCOL_BOT_COUNT4
)+((c4
%(uint32_t)UCOL_BOT_COUNT4
!= 0)?1:0);
/* Size of the identical-level run for the whole string. NOTE(review): the condition guarding this line is not visible; presumably compareIdent - confirm. */
4737 currentSize
+= u_lengthOfIdenticalLevelRun(s
->string
, len
);
/*
 * Starts a new case-level byte when no bit positions remain in the current one.
 * When caseShift is 0, UCOL_CASE_BYTE_START is written through *cases, *cases
 * is advanced by one byte, and caseShift is reset to UCOL_CASE_SHIFT_START.
 *
 * cases     - address of the caller's write pointer into the case buffer.
 * caseShift - C++ reference, so the reset is visible to the caller.
 */
4743 inline void doCaseShift(uint8_t **cases
, uint32_t &caseShift
) {
4744 if (caseShift
== 0) {
4745 *(*cases
)++ = UCOL_CASE_BYTE_START
;
4746 caseShift
= UCOL_CASE_SHIFT_START
;
4750 // Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we
4751 // know how many values we wanted to add, even if we didn't add them all
/*
 * Bounded append: `value` is stored at `primaries` and the pointer advanced
 * only while primaries < limit, so nothing is written at or past `limit`.
 *
 * primaries - reference to the caller's write pointer; advanced on a write.
 * limit     - first position that must not be written.
 * size      - reference to the caller's counter.
 *             NOTE(review): no visible line touches `size`; the comment that
 *             precedes this function says it counts attempted writes, so the
 *             increment is presumably on a line omitted from this listing.
 * value     - byte to append.
 */
4753 inline void addWithIncrement(uint8_t *&primaries
, uint8_t *limit
, uint32_t &size
, const uint8_t value
) {
4755 if(primaries
< limit
) {
4756 *(primaries
)++ = value
;
4760 // Packs the secondary buffer when processing French locale. Adds the terminator.
/*
 * Appends the secondary level to the key for French (reversed) secondaries.
 * A level terminator is written first, then the collected secondary bytes are
 * read from last to first and appended through addWithIncrement, with runs of
 * UCOL_COMMON2 replaced by compressed count bytes.
 *
 * primaries      - write position in the key buffer.
 * primEnd        - limit handed to addWithIncrement; nothing is written at or
 *                  beyond it.
 * secondaries    - read backwards as *(secondaries - i - 1), so it must point
 *                  one past the last collected secondary byte.
 * secsize        - in: number of collected secondary bytes.
 *                  NOTE(review): callers add *secsize to the key size after
 *                  the call, so it is presumably overwritten with `size` on a
 *                  line not visible here - confirm.
 * frenchStartPtr, frenchEndPtr - bounds of a pending run of continuation
 *                  secondaries, or frenchStartPtr == NULL when there is none.
 *
 * NOTE(review): the declarations of `secondary` and `count2`, several
 * branches and the return statement are omitted from this listing. Callers
 * assign the result to their key write pointer.
 */
4762 inline uint8_t *packFrench(uint8_t *primaries
, uint8_t *primEnd
, uint8_t *secondaries
, uint32_t *secsize
, uint8_t *frenchStartPtr
, uint8_t *frenchEndPtr
) {
4765 uint32_t i
= 0, size
= 0;
4766 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4767 addWithIncrement(primaries
, primEnd
, i
, UCOL_LEVELTERMINATOR
);
4768 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4769 if(frenchStartPtr
!= NULL
) {
4770 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
/* Walk the collected secondaries from the last byte to the first. */
4772 for(i
= 0; i
<*secsize
; i
++) {
4773 secondary
= *(secondaries
-i
-1);
4774 /* This is compression code. */
4775 if (secondary
== UCOL_COMMON2
) {
/* A byte above UCOL_COMMON2 closes the pending run with bytes counted down from UCOL_COMMON_TOP2. */
4779 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4780 while (count2
> UCOL_TOP_COUNT2
) {
4781 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
));
4782 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
4784 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1)));
/* Otherwise the run is closed with bytes counted up from UCOL_COMMON_BOT2. */
4786 while (count2
> UCOL_BOT_COUNT2
) {
4787 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
));
4788 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4790 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1)));
4794 addWithIncrement(primaries
, primEnd
, size
, secondary
);
/* Flush of a run still pending after the loop. NOTE(review): the guarding condition is not visible in this listing. */
4798 while (count2
> UCOL_BOT_COUNT2
) {
4799 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
));
4800 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4802 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1)));
4808 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4810 /* This is the sortkey work horse function */
4811 U_CFUNC
int32_t U_CALLCONV
4812 ucol_calcSortKey(const UCollator
*coll
,
4813 const UChar
*source
,
4814 int32_t sourceLength
,
4816 uint32_t resultLength
,
4817 UBool allocateSKBuffer
,
4820 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4822 uint32_t i
= 0; /* general purpose counter */
4824 /* Stack allocated buffers for buffers we use */
4825 uint8_t prim
[UCOL_PRIMARY_MAX_BUFFER
], second
[UCOL_SECONDARY_MAX_BUFFER
], tert
[UCOL_TERTIARY_MAX_BUFFER
], caseB
[UCOL_CASE_MAX_BUFFER
], quad
[UCOL_QUAD_MAX_BUFFER
];
4827 uint8_t *primaries
= *result
, *secondaries
= second
, *tertiaries
= tert
, *cases
= caseB
, *quads
= quad
;
4829 if(U_FAILURE(*status
)) {
4833 if(primaries
== NULL
&& allocateSKBuffer
== TRUE
) {
4834 primaries
= *result
= prim
;
4835 resultLength
= UCOL_PRIMARY_MAX_BUFFER
;
4838 uint32_t secSize
= UCOL_SECONDARY_MAX_BUFFER
, terSize
= UCOL_TERTIARY_MAX_BUFFER
,
4839 caseSize
= UCOL_CASE_MAX_BUFFER
, quadSize
= UCOL_QUAD_MAX_BUFFER
;
4841 uint32_t sortKeySize
= 1; /* it is always \0 terminated */
4843 UChar normBuffer
[UCOL_NORMALIZATION_MAX_BUFFER
];
4844 UChar
*normSource
= normBuffer
;
4845 int32_t normSourceLen
= UCOL_NORMALIZATION_MAX_BUFFER
;
4847 int32_t len
= (sourceLength
== -1 ? u_strlen(source
) : sourceLength
);
4849 UColAttributeValue strength
= coll
->strength
;
4851 uint8_t compareSec
= (uint8_t)((strength
>= UCOL_SECONDARY
)?0:0xFF);
4852 uint8_t compareTer
= (uint8_t)((strength
>= UCOL_TERTIARY
)?0:0xFF);
4853 uint8_t compareQuad
= (uint8_t)((strength
>= UCOL_QUATERNARY
)?0:0xFF);
4854 UBool compareIdent
= (strength
== UCOL_IDENTICAL
);
4855 UBool doCase
= (coll
->caseLevel
== UCOL_ON
);
4856 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && (compareSec
== 0);
4857 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
4858 //UBool qShifted = shifted && (compareQuad == 0);
4859 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && (compareQuad
== 0);
4860 /*const uint8_t *scriptOrder = coll->scriptOrder;*/
4862 uint32_t variableTopValue
= coll
->variableTopValue
;
4863 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4864 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4865 uint8_t UCOL_COMMON_BOT4
= (uint8_t)((coll
->variableTopValue
>>8)+1);
4866 uint8_t UCOL_HIRAGANA_QUAD
= 0;
4868 UCOL_HIRAGANA_QUAD
=UCOL_COMMON_BOT4
++;
4869 /* allocate one more space for hiragana, value for hiragana */
4871 uint8_t UCOL_BOT_COUNT4
= (uint8_t)(0xFF - UCOL_COMMON_BOT4
);
4873 /* support for special features like caselevel and funky secondaries */
4874 uint8_t *frenchStartPtr
= NULL
;
4875 uint8_t *frenchEndPtr
= NULL
;
4876 uint32_t caseShift
= 0;
4878 sortKeySize
+= ((compareSec
?0:1) + (compareTer
?0:1) + (doCase
?1:0) + /*(qShifted?1:0)*/(compareQuad
?0:1) + (compareIdent
?1:0));
4880 /* If we need to normalize, we'll do it all at once at the beginning! */
4881 UNormalizationMode normMode
;
4883 normMode
= UNORM_NFD
;
4884 } else if(coll
->normalizationMode
!= UCOL_OFF
) {
4885 normMode
= UNORM_FCD
;
4887 normMode
= UNORM_NONE
;
4890 if(normMode
!= UNORM_NONE
&& UNORM_YES
!= unorm_quickCheck(source
, len
, normMode
, status
)) {
4891 len
= unorm_internalNormalize(normSource
, normSourceLen
,
4895 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
4896 normSourceLen
= len
;
4897 normSource
= (UChar
*)uprv_malloc(len
*U_SIZEOF_UCHAR
);
4898 if(normSource
== NULL
) {
4899 *status
= U_MEMORY_ALLOCATION_ERROR
;
4902 *status
= U_ZERO_ERROR
;
4903 len
= unorm_internalNormalize(normSource
, normSourceLen
,
4909 if(U_FAILURE(*status
)) {
4912 source
= normSource
;
4916 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
4917 if(source
== normSource
) {
4918 s
.flags
&= ~UCOL_ITER_NORM
;
4921 if(resultLength
== 0 || primaries
== NULL
) {
4922 int32_t keyLen
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
4923 if(normSource
!= normBuffer
) {
4924 uprv_free(normSource
);
4928 uint8_t *primarySafeEnd
= primaries
+ resultLength
- 1;
4929 if(strength
> UCOL_PRIMARY
) {
4933 uint32_t minBufferSize
= UCOL_MAX_BUFFER
;
4935 uint8_t *primStart
= primaries
;
4936 uint8_t *secStart
= secondaries
;
4937 uint8_t *terStart
= tertiaries
;
4938 uint8_t *caseStart
= cases
;
4939 uint8_t *quadStart
= quads
;
4943 uint8_t primary1
= 0;
4944 uint8_t primary2
= 0;
4945 uint8_t secondary
= 0;
4946 uint8_t tertiary
= 0;
4947 uint8_t caseSwitch
= coll
->caseSwitch
;
4948 uint8_t tertiaryMask
= coll
->tertiaryMask
;
4949 int8_t tertiaryAddition
= coll
->tertiaryAddition
;
4950 uint8_t tertiaryTop
= coll
->tertiaryTop
;
4951 uint8_t tertiaryBottom
= coll
->tertiaryBottom
;
4952 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
4953 uint8_t caseBits
= 0;
4955 UBool finished
= FALSE
;
4956 UBool wasShifted
= FALSE
;
4957 UBool notIsContinuation
= FALSE
;
4959 uint32_t prevBuffSize
= 0;
4961 uint32_t count2
= 0, count3
= 0, count4
= 0;
4962 uint8_t leadPrimary
= 0;
4965 for(i
=prevBuffSize
; i
<minBufferSize
; ++i
) {
4967 order
= ucol_IGetNextCE(coll
, &s
, status
);
4968 if(order
== UCOL_NO_MORE_CES
) {
4977 notIsContinuation
= !isContinuation(order
);
4979 if(notIsContinuation
) {
4980 tertiary
= (uint8_t)(order
& UCOL_BYTE_SIZE_MASK
);
4982 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
4985 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4986 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4987 primary1
= (uint8_t)(order
>> 8);
4989 /*if(notIsContinuation && scriptOrder != NULL) {
4990 primary1 = scriptOrder[primary1];
4993 if(shifted
&& ((notIsContinuation
&& order
<= variableTopValue
&& primary1
> 0)
4994 || (!notIsContinuation
&& wasShifted
))
4995 || (wasShifted
&& primary1
== 0)) /* amendment to the UCA says that primary ignorables */
4997 /* and other ignorables should be removed if following a shifted code point */
4998 if(primary1
== 0) { /* if we were shifted and we got an ignorable code point */
4999 /* we should just completely ignore it */
5002 if(compareQuad
== 0) {
5004 while (count4
> UCOL_BOT_COUNT4
) {
5005 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
5006 count4
-= UCOL_BOT_COUNT4
;
5008 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
5011 /* We are dealing with a variable and we're treating them as shifted */
5012 /* This is a shifted ignorable */
5013 if(primary1
!= 0) { /* we need to check this since we could be in continuation */
5014 *quads
++ = primary1
;
5017 *quads
++ = primary2
;
5023 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5024 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5025 /* regular and simple sortkey calc */
5026 if(primary1
!= UCOL_IGNORABLE
) {
5027 if(notIsContinuation
) {
5028 if(leadPrimary
== primary1
) {
5029 *primaries
++ = primary2
;
5031 if(leadPrimary
!= 0) {
5032 *primaries
++ = (uint8_t)((primary1
> leadPrimary
) ? UCOL_BYTE_UNSHIFTED_MAX
: UCOL_BYTE_UNSHIFTED_MIN
);
5034 if(primary2
== UCOL_IGNORABLE
) {
5035 /* one byter, not compressed */
5036 *primaries
++ = primary1
;
5038 } else if(primary1
<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
||
5039 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5040 (primary1
> maxRegularPrimary
&& primary1
< minImplicitPrimary
)) {
5041 /* not compressible */
5043 *primaries
++ = primary1
;
5044 if(primaries
<= primarySafeEnd
) {
5045 *primaries
++ = primary2
;
5047 } else { /* compress */
5048 *primaries
++ = leadPrimary
= primary1
;
5049 if(primaries
<= primarySafeEnd
) {
5050 *primaries
++ = primary2
;
5054 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5055 *primaries
++ = primary1
;
5056 if((primary2
!= UCOL_IGNORABLE
) && (primaries
<= primarySafeEnd
)) {
5057 *primaries
++ = primary2
; /* second part */
5062 if(secondary
> compareSec
) {
5064 /* This is compression code. */
5065 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
5069 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
5070 while (count2
> UCOL_TOP_COUNT2
) {
5071 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
);
5072 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
5074 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1));
5076 while (count2
> UCOL_BOT_COUNT2
) {
5077 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5078 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5080 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5084 *secondaries
++ = secondary
;
5087 *secondaries
++ = secondary
;
5088 /* Do the special handling for French secondaries */
5089 /* We need to get continuation elements and do intermediate restore */
5090 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
5091 if(notIsContinuation
) {
5092 if (frenchStartPtr
!= NULL
) {
5093 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
5094 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
5095 frenchStartPtr
= NULL
;
5098 if (frenchStartPtr
== NULL
) {
5099 frenchStartPtr
= secondaries
- 2;
5101 frenchEndPtr
= secondaries
-1;
5106 if(doCase
&& (primary1
> 0 || strength
>= UCOL_SECONDARY
)) {
5107 // do the case level if we need to do it. We don't want to calculate
5108 // case level for primary ignorables if we have only primary strength and case level
5109 // otherwise we would break well formedness of CEs
5110 doCaseShift(&cases
, caseShift
);
5111 if(notIsContinuation
) {
5112 caseBits
= (uint8_t)(tertiary
& 0xC0);
5115 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
5116 if((caseBits
& 0xC0) == 0) {
5117 *(cases
-1) |= 1 << (--caseShift
);
5119 *(cases
-1) |= 0 << (--caseShift
);
5121 doCaseShift(&cases
, caseShift
);
5122 *(cases
-1) |= ((caseBits
>>6)&1) << (--caseShift
);
5125 if((caseBits
& 0xC0) == 0) {
5126 *(cases
-1) |= 0 << (--caseShift
);
5128 *(cases
-1) |= 1 << (--caseShift
);
5130 doCaseShift(&cases
, caseShift
);
5131 *(cases
-1) |= ((caseBits
>>7)&1) << (--caseShift
);
5138 if(notIsContinuation
) {
5139 tertiary
^= caseSwitch
;
5143 tertiary
&= tertiaryMask
;
5144 if(tertiary
> compareTer
) {
5145 /* This is compression code. */
5146 /* sequence size check is included in the if clause */
5147 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
5150 if(tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
) {
5151 tertiary
+= tertiaryAddition
;
5152 } else if(tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
) {
5153 tertiary
-= tertiaryAddition
;
5156 if ((tertiary
> tertiaryCommon
)) {
5157 while (count3
> coll
->tertiaryTopCount
) {
5158 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5159 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5161 *tertiaries
++ = (uint8_t)(tertiaryTop
- (count3
-1));
5163 while (count3
> coll
->tertiaryBottomCount
) {
5164 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5165 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5167 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5171 *tertiaries
++ = tertiary
;
5175 if(/*qShifted*/(compareQuad
==0) && notIsContinuation
) {
5176 if(s
.flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
5177 if(count4
>0) { // Close this part
5178 while (count4
> UCOL_BOT_COUNT4
) {
5179 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
5180 count4
-= UCOL_BOT_COUNT4
;
5182 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
5185 *quads
++ = UCOL_HIRAGANA_QUAD
; // Add the Hiragana
5186 } else { // This wasn't Hiragana, so we can continue adding stuff
5192 if(primaries
> primarySafeEnd
) { /* We have stepped over the primary buffer */
5193 if(allocateSKBuffer
== FALSE
) { /* need to save our butts if we cannot reallocate */
5194 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5195 if(source
== normSource
) {
5196 s
.flags
&= ~UCOL_ITER_NORM
;
5198 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
5199 *status
= U_BUFFER_OVERFLOW_ERROR
;
5202 } else { /* It's much nicer if we can actually reallocate */
5203 int32_t sks
= sortKeySize
+(primaries
- primStart
)+(secondaries
- secStart
)+(tertiaries
- terStart
)+(cases
-caseStart
)+(quads
-quadStart
);
5204 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sks
, status
);
5205 if(U_SUCCESS(*status
)) {
5206 *result
= primStart
;
5207 primarySafeEnd
= primStart
+ resultLength
- 1;
5208 if(strength
> UCOL_PRIMARY
) {
5212 /* We ran out of memory!? We can't recover. */
5213 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5223 prevBuffSize
= minBufferSize
;
5225 uint32_t frenchStartOffset
= 0, frenchEndOffset
= 0;
5226 if (frenchStartPtr
!= NULL
) {
5227 frenchStartOffset
= frenchStartPtr
- secStart
;
5228 frenchEndOffset
= frenchEndPtr
- secStart
;
5230 secStart
= reallocateBuffer(&secondaries
, secStart
, second
, &secSize
, 2*secSize
, status
);
5231 terStart
= reallocateBuffer(&tertiaries
, terStart
, tert
, &terSize
, 2*terSize
, status
);
5232 caseStart
= reallocateBuffer(&cases
, caseStart
, caseB
, &caseSize
, 2*caseSize
, status
);
5233 quadStart
= reallocateBuffer(&quads
, quadStart
, quad
, &quadSize
, 2*quadSize
, status
);
5234 if(U_FAILURE(*status
)) {
5235 /* We ran out of memory!? We can't recover. */
5236 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5239 if (frenchStartPtr
!= NULL
) {
5240 frenchStartPtr
= secStart
+ frenchStartOffset
;
5241 frenchEndPtr
= secStart
+ frenchEndOffset
;
5247 /* Here, we are generally done with processing */
5248 /* bailing out would not be too productive */
5250 if(U_SUCCESS(*status
)) {
5251 sortKeySize
+= (primaries
- primStart
);
5252 /* we have done all the CE's, now let's put them together to form a key */
5253 if(compareSec
== 0) {
5255 while (count2
> UCOL_BOT_COUNT2
) {
5256 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5257 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5259 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5261 uint32_t secsize
= secondaries
-secStart
;
5262 if(!isFrenchSec
) { // Regular situation, we know the length of secondaries
5263 sortKeySize
+= secsize
;
5264 if(sortKeySize
<= resultLength
) {
5265 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5266 uprv_memcpy(primaries
, secStart
, secsize
);
5267 primaries
+= secsize
;
5269 if(allocateSKBuffer
== TRUE
) { /* need to save our butts if we cannot reallocate */
5270 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5271 if(U_SUCCESS(*status
)) {
5272 *result
= primStart
;
5273 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5274 uprv_memcpy(primaries
, secStart
, secsize
);
5275 primaries
+= secsize
;
5278 /* We ran out of memory!? We can't recover. */
5279 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5283 *status
= U_BUFFER_OVERFLOW_ERROR
;
5286 } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
5287 uint8_t *newPrim
= packFrench(primaries
, primStart
+resultLength
, secondaries
, &secsize
, frenchStartPtr
, frenchEndPtr
);
5288 sortKeySize
+= secsize
;
5289 if(sortKeySize
<= resultLength
) { // if we managed to pack fine
5290 primaries
= newPrim
; // update the primary pointer
5291 } else { // overflow, need to reallocate and redo
5292 if(allocateSKBuffer
== TRUE
) { /* need to save our butts if we cannot reallocate */
5293 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5294 if(U_SUCCESS(*status
)) {
5295 primaries
= packFrench(primaries
, primStart
+resultLength
, secondaries
, &secsize
, frenchStartPtr
, frenchEndPtr
);
5298 /* We ran out of memory!? We can't recover. */
5299 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5303 *status
= U_BUFFER_OVERFLOW_ERROR
;
5310 uint32_t casesize
= cases
- caseStart
;
5311 sortKeySize
+= casesize
;
5312 if(sortKeySize
<= resultLength
) {
5313 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5314 uprv_memcpy(primaries
, caseStart
, casesize
);
5315 primaries
+= casesize
;
5317 if(allocateSKBuffer
== TRUE
) {
5318 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5319 if(U_SUCCESS(*status
)) {
5320 *result
= primStart
;
5321 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5322 uprv_memcpy(primaries
, caseStart
, casesize
);
5325 /* We ran out of memory!? We can't recover. */
5326 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5330 *status
= U_BUFFER_OVERFLOW_ERROR
;
5335 if(compareTer
== 0) {
5337 if (coll
->tertiaryCommon
!= UCOL_COMMON_BOT3
) {
5338 while (count3
>= coll
->tertiaryTopCount
) {
5339 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5340 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5342 *tertiaries
++ = (uint8_t)(tertiaryTop
- count3
);
5344 while (count3
> coll
->tertiaryBottomCount
) {
5345 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5346 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5348 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5351 uint32_t tersize
= tertiaries
- terStart
;
5352 sortKeySize
+= tersize
;
5353 if(sortKeySize
<= resultLength
) {
5354 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5355 uprv_memcpy(primaries
, terStart
, tersize
);
5356 primaries
+= tersize
;
5358 if(allocateSKBuffer
== TRUE
) {
5359 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5360 if(U_SUCCESS(*status
)) {
5361 *result
= primStart
;
5362 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5363 uprv_memcpy(primaries
, terStart
, tersize
);
5366 /* We ran out of memory!? We can't recover. */
5367 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5371 *status
= U_BUFFER_OVERFLOW_ERROR
;
5375 if(compareQuad
== 0/*qShifted == TRUE*/) {
5377 while (count4
> UCOL_BOT_COUNT4
) {
5378 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
5379 count4
-= UCOL_BOT_COUNT4
;
5381 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
5383 uint32_t quadsize
= quads
- quadStart
;
5384 sortKeySize
+= quadsize
;
5385 if(sortKeySize
<= resultLength
) {
5386 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5387 uprv_memcpy(primaries
, quadStart
, quadsize
);
5388 primaries
+= quadsize
;
5390 if(allocateSKBuffer
== TRUE
) {
5391 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5392 if(U_SUCCESS(*status
)) {
5393 *result
= primStart
;
5394 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5395 uprv_memcpy(primaries
, quadStart
, quadsize
);
5398 /* We ran out of memory!? We can't recover. */
5399 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5403 *status
= U_BUFFER_OVERFLOW_ERROR
;
5409 sortKeySize
+= u_lengthOfIdenticalLevelRun(s
.string
, len
);
5410 if(sortKeySize
<= resultLength
) {
5411 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5412 primaries
+= u_writeIdenticalLevelRun(s
.string
, len
, primaries
);
5414 if(allocateSKBuffer
== TRUE
) {
5415 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, sortKeySize
, status
);
5416 if(U_SUCCESS(*status
)) {
5417 *result
= primStart
;
5418 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5419 u_writeIdenticalLevelRun(s
.string
, len
, primaries
);
5422 /* We ran out of memory!? We can't recover. */
5423 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5427 *status
= U_BUFFER_OVERFLOW_ERROR
;
5432 *(primaries
++) = '\0';
5435 if(allocateSKBuffer
== TRUE
) {
5436 *result
= (uint8_t*)uprv_malloc(sortKeySize
);
5438 if (*result
== NULL
) {
5439 *status
= U_MEMORY_ALLOCATION_ERROR
;
5442 uprv_memcpy(*result
, primStart
, sortKeySize
);
5443 if(primStart
!= prim
) {
5444 uprv_free(primStart
);
5449 if (allocateSKBuffer
== FALSE
&& resultLength
> 0 && U_FAILURE(*status
) && *status
!= U_BUFFER_OVERFLOW_ERROR
) {
5450 /* NULL terminate for safety */
5453 if(terStart
!= tert
) {
5454 uprv_free(terStart
);
5455 uprv_free(secStart
);
5456 uprv_free(caseStart
);
5457 uprv_free(quadStart
);
5460 /* To avoid memory leak, free the offset buffer if necessary. */
5461 freeOffsetBuffer(&s
);
5463 if(normSource
!= normBuffer
) {
5464 uprv_free(normSource
);
/*
 * ucol_calcSortKeySimpleTertiary
 *
 * Sort key generation that collects only primary, secondary and tertiary
 * weights. Primaries are written straight into the output buffer; secondaries
 * and tertiaries are gathered in scratch buffers (second/tert, moved to the
 * heap when they grow) and appended afterwards, each preceded by
 * UCOL_LEVELTERMINATOR, followed by a final '\0'.
 *
 *   coll              collator; normalizationMode, strength, caseSwitch and
 *                     the tertiary* fields are read from it.
 *   source            input text; replaced by a normalized copy when
 *                     coll->normalizationMode != UCOL_OFF and the FCD quick
 *                     check does not return UNORM_YES.
 *   sourceLength      length handed to normalization and iterator setup.
 *   result            in/out pointer to the output buffer (dereferenced as
 *                     *result throughout the body).
 *   resultLength      capacity of *result in bytes.
 *   allocateSKBuffer  TRUE: the routine may grow/allocate the output buffer.
 *                     FALSE: *result is a fixed buffer that must not be resized.
 *   status            in/out error code. U_BUFFER_OVERFLOW_ERROR is reported
 *                     when a fixed buffer is too small, and
 *                     U_MEMORY_ALLOCATION_ERROR when a heap allocation fails.
 *
 * NOTE(review): the return statements are not visible in this extract;
 * sortKeySize is the running key size and is presumably what is returned.
 */
5471 U_CFUNC
int32_t U_CALLCONV
5472 ucol_calcSortKeySimpleTertiary(const UCollator
*coll
,
5473 const UChar
*source
,
5474 int32_t sourceLength
,
5476 uint32_t resultLength
,
5477 UBool allocateSKBuffer
,
5482 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5483 uint32_t i
= 0; /* general purpose counter */
5485 /* Stack allocated buffers for buffers we use */
5486 uint8_t prim
[UCOL_PRIMARY_MAX_BUFFER
], second
[UCOL_SECONDARY_MAX_BUFFER
], tert
[UCOL_TERTIARY_MAX_BUFFER
];
5488 uint8_t *primaries
= *result
, *secondaries
= second
, *tertiaries
= tert
;
5490 if(U_FAILURE(*status
)) {
/* No caller buffer, but we may allocate: start out in the stack buffer prim. */
5494 if(primaries
== NULL
&& allocateSKBuffer
== TRUE
) {
5495 primaries
= *result
= prim
;
5496 resultLength
= UCOL_PRIMARY_MAX_BUFFER
;
5499 uint32_t secSize
= UCOL_SECONDARY_MAX_BUFFER
, terSize
= UCOL_TERTIARY_MAX_BUFFER
;
5501 uint32_t sortKeySize
= 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5503 UChar normBuffer
[UCOL_NORMALIZATION_MAX_BUFFER
];
5504 UChar
*normSource
= normBuffer
;
5505 int32_t normSourceLen
= UCOL_NORMALIZATION_MAX_BUFFER
;
5507 int32_t len
= sourceLength
;
5509 /* If we need to normalize, we'll do it all at once at the beginning! */
5510 if(coll
->normalizationMode
!= UCOL_OFF
&& UNORM_YES
!= unorm_quickCheck(source
, len
, UNORM_FCD
, status
)) {
5511 len
= unorm_internalNormalize(normSource
, normSourceLen
,
/* The stack buffer was too small: retry with a heap buffer of the reported length. */
5515 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
5516 normSourceLen
= len
;
5517 normSource
= (UChar
*)uprv_malloc(len
*U_SIZEOF_UCHAR
);
5518 if(normSource
== NULL
) {
5519 *status
= U_MEMORY_ALLOCATION_ERROR
;
5522 *status
= U_ZERO_ERROR
;
5523 len
= unorm_internalNormalize(normSource
, normSourceLen
,
5527 if(U_FAILURE(*status
)) {
5528 /* Should never happen. */
5529 uprv_free(normSource
);
5530 normSource
= normBuffer
;
5534 if(U_FAILURE(*status
)) {
5537 source
= normSource
;
5541 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
/* The text was normalized above, so the UCOL_ITER_NORM flag is cleared on the iterator. */
5542 if(source
== normSource
) {
5543 s
.flags
&= ~UCOL_ITER_NORM
;
/* No usable output buffer: only the key size is computed (and the heap normalization buffer released). */
5546 if(resultLength
== 0 || primaries
== NULL
) {
5547 int32_t t
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5548 if(normSource
!= normBuffer
) {
5549 uprv_free(normSource
);
/* Two bytes short of the end of the primary buffer; crossing it triggers the overflow handling in the CE loop. */
5554 uint8_t *primarySafeEnd
= primaries
+ resultLength
- 2;
5556 uint32_t minBufferSize
= UCOL_MAX_BUFFER
;
5558 uint8_t *primStart
= primaries
;
5559 uint8_t *secStart
= secondaries
;
5560 uint8_t *terStart
= tertiaries
;
5564 uint8_t primary1
= 0;
5565 uint8_t primary2
= 0;
5566 uint8_t secondary
= 0;
5567 uint8_t tertiary
= 0;
5568 uint8_t caseSwitch
= coll
->caseSwitch
;
5569 uint8_t tertiaryMask
= coll
->tertiaryMask
;
5570 int8_t tertiaryAddition
= coll
->tertiaryAddition
;
5571 uint8_t tertiaryTop
= coll
->tertiaryTop
;
5572 uint8_t tertiaryBottom
= coll
->tertiaryBottom
;
5573 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
5575 uint32_t prevBuffSize
= 0;
5577 UBool finished
= FALSE
;
5578 UBool notIsContinuation
= FALSE
;
5580 uint32_t count2
= 0, count3
= 0;
5581 uint8_t leadPrimary
= 0;
5584 for(i
=prevBuffSize
; i
<minBufferSize
; ++i
) {
5586 order
= ucol_IGetNextCE(coll
, &s
, status
);
5592 if(order
== UCOL_NO_MORE_CES
) {
5597 notIsContinuation
= !isContinuation(order
);
5599 if(notIsContinuation
) {
5600 tertiary
= (uint8_t)((order
& tertiaryMask
));
5602 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
/* Split the CE into bytes: each ">>= 8" drops the byte that was just extracted. */
5604 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
5605 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
5606 primary1
= (uint8_t)(order
>> 8);
5608 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5609 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5610 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5611 /* regular and simple sortkey calc */
5612 if(primary1
!= UCOL_IGNORABLE
) {
5613 if(notIsContinuation
) {
5614 if(leadPrimary
== primary1
) {
5615 *primaries
++ = primary2
;
5617 if(leadPrimary
!= 0) {
5618 *primaries
++ = (uint8_t)((primary1
> leadPrimary
) ? UCOL_BYTE_UNSHIFTED_MAX
: UCOL_BYTE_UNSHIFTED_MIN
);
5620 if(primary2
== UCOL_IGNORABLE
) {
5621 /* one byter, not compressed */
5622 *primaries
++ = primary1
;
5624 } else if(primary1
<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
||
5625 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5626 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5627 (primary1
> maxRegularPrimary
&& primary1
< minImplicitPrimary
)) {
5628 /* not compressible */
5630 *primaries
++ = primary1
;
5631 *primaries
++ = primary2
;
5632 } else { /* compress */
5633 *primaries
++ = leadPrimary
= primary1
;
5634 *primaries
++ = primary2
;
5637 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5638 *primaries
++ = primary1
;
5639 if(primary2
!= UCOL_IGNORABLE
) {
5640 *primaries
++ = primary2
; /* second part */
5645 if(secondary
> 0) { /* I think that != 0 test should be != IGNORABLE */
5646 /* This is compression code. */
5647 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
5651 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
5652 while (count2
> UCOL_TOP_COUNT2
) {
5653 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
);
5654 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
5656 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1));
5658 while (count2
> UCOL_BOT_COUNT2
) {
5659 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5660 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5662 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5666 *secondaries
++ = secondary
;
5670 if(notIsContinuation
) {
5671 tertiary
^= caseSwitch
;
5675 /* This is compression code. */
5676 /* sequence size check is included in the if clause */
5677 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
5680 if(tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
) {
5681 tertiary
+= tertiaryAddition
;
5682 } else if (tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
) {
5683 tertiary
-= tertiaryAddition
;
5686 if ((tertiary
> tertiaryCommon
)) {
5687 while (count3
> coll
->tertiaryTopCount
) {
5688 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5689 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5691 *tertiaries
++ = (uint8_t)(tertiaryTop
- (count3
-1));
5693 while (count3
> coll
->tertiaryBottomCount
) {
5694 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5695 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5697 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5701 *tertiaries
++ = tertiary
;
5705 if(primaries
> primarySafeEnd
) { /* We have stepped over the primary buffer */
5706 if(allocateSKBuffer
== FALSE
) { /* need to save our butts if we cannot reallocate */
5707 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5708 if(source
== normSource
) {
5709 s
.flags
&= ~UCOL_ITER_NORM
;
5711 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5712 *status
= U_BUFFER_OVERFLOW_ERROR
;
5715 } else { /* It's much nicer if we can actually reallocate */
5716 int32_t sks
= sortKeySize
+(primaries
- primStart
)+(secondaries
- secStart
)+(tertiaries
- terStart
);
5717 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sks
, status
);
5718 if(U_SUCCESS(*status
)) {
5719 *result
= primStart
;
5720 primarySafeEnd
= primStart
+ resultLength
- 2;
5722 /* We ran out of memory!? We can't recover. */
5723 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
/* Grow the secondary and tertiary scratch buffers, requesting twice their current size. */
5733 prevBuffSize
= minBufferSize
;
5734 secStart
= reallocateBuffer(&secondaries
, secStart
, second
, &secSize
, 2*secSize
, status
);
5735 terStart
= reallocateBuffer(&tertiaries
, terStart
, tert
, &terSize
, 2*terSize
, status
);
5737 if(U_FAILURE(*status
)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5738 /* We ran out of memory!? We can't recover. */
5739 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5745 if(U_SUCCESS(*status
)) {
5746 sortKeySize
+= (primaries
- primStart
);
5747 /* we have done all the CE's, now let's put them together to form a key */
/* NOTE(review): count2 presumably holds a pending run of common secondaries; its increment is not visible in this extract. */
5749 while (count2
> UCOL_BOT_COUNT2
) {
5750 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5751 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5753 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5755 uint32_t secsize
= secondaries
-secStart
;
5756 sortKeySize
+= secsize
;
5757 if(sortKeySize
<= resultLength
) {
5758 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5759 uprv_memcpy(primaries
, secStart
, secsize
);
5760 primaries
+= secsize
;
5762 if(allocateSKBuffer
== TRUE
) {
5763 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5764 if(U_SUCCESS(*status
)) {
5765 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5766 *result
= primStart
;
5767 uprv_memcpy(primaries
, secStart
, secsize
);
5770 /* We ran out of memory!? We can't recover. */
5771 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5775 *status
= U_BUFFER_OVERFLOW_ERROR
;
5780 if (coll
->tertiaryCommon
!= UCOL_COMMON3_NORMAL
) {
5781 while (count3
>= coll
->tertiaryTopCount
) {
5782 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5783 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5785 *tertiaries
++ = (uint8_t)(tertiaryTop
- count3
);
5787 while (count3
> coll
->tertiaryBottomCount
) {
5788 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5789 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5791 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5794 uint32_t tersize
= tertiaries
- terStart
;
5795 sortKeySize
+= tersize
;
5796 if(sortKeySize
<= resultLength
) {
5797 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5798 uprv_memcpy(primaries
, terStart
, tersize
);
5799 primaries
+= tersize
;
5801 if(allocateSKBuffer
== TRUE
) {
5802 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5803 if(U_SUCCESS(*status
)) {
5804 *result
= primStart
;
5805 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5806 uprv_memcpy(primaries
, terStart
, tersize
);
5809 /* We ran out of memory!? We can't recover. */
5810 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
/* The fixed caller buffer cannot hold the tertiary level. No allocation was attempted, */
/* so report a buffer overflow exactly like the secondary level does; sortKeySize already */
/* holds the size needed, and the "NULL terminate for safety" path below is not triggered. */
5814 *status
= U_BUFFER_OVERFLOW_ERROR
;
5818 *(primaries
++) = '\0';
/* When allocating on behalf of the caller, hand back a heap copy of exactly sortKeySize bytes. */
5821 if(allocateSKBuffer
== TRUE
) {
5822 *result
= (uint8_t*)uprv_malloc(sortKeySize
);
5824 if (*result
== NULL
) {
5825 *status
= U_MEMORY_ALLOCATION_ERROR
;
5828 uprv_memcpy(*result
, primStart
, sortKeySize
);
5829 if(primStart
!= prim
) {
5830 uprv_free(primStart
);
5835 if (allocateSKBuffer
== FALSE
&& resultLength
> 0 && U_FAILURE(*status
) && *status
!= U_BUFFER_OVERFLOW_ERROR
) {
5836 /* NULL terminate for safety */
/* Both scratch buffers are reallocated back to back above, so the tertiary pointer is the test used before freeing them. */
5839 if(terStart
!= tert
) {
5840 uprv_free(terStart
);
5841 uprv_free(secStart
);
5844 /* To avoid memory leak, free the offset buffer if necessary. */
5845 freeOffsetBuffer(&s
);
5847 if(normSource
!= normBuffer
) {
5848 uprv_free(normSource
);
// isShiftedCE: tests whether the collation element CE falls into the
// "shifted" category relative to the boundary LVT, and maintains the
// caller's *wasShifted flag across consecutive CEs.
//   CE          collation element to examine
//   LVT         boundary value; 0 disables the first group of tests below
//   wasShifted  in/out flag carried from the previous CE
// NOTE(review): the return statements and part of the branch bodies are
// missing from this extract; callers use the result as a boolean
// (if(!isShiftedCE(...))), so the first branch presumably returns TRUE and
// the second FALSE - confirm against the full source.
5855 UBool
isShiftedCE(uint32_t CE
, uint32_t LVT
, UBool
*wasShifted
) {
5856 UBool notIsContinuation
= !isContinuation(CE
);
// primary1 is the most significant byte of the CE.
5857 uint8_t primary1
= (uint8_t)((CE
>> 24) & 0xFF);
// The condition holds when
//  - LVT is non-zero and either a non-continuation CE has its upper 16 bits
//    <= LVT with a non-zero lead byte, or a continuation follows a CE that
//    was shifted; or
//  - the previous CE was shifted and this CE has a zero lead primary byte.
5858 if(LVT
&& ((notIsContinuation
&& (CE
& 0xFFFF0000)<= LVT
&& primary1
> 0)
5859 || (!notIsContinuation
&& *wasShifted
))
5860 || (*wasShifted
&& primary1
== 0)) /* amendment to the UCA says that primary ignorables */
5862 // The stuff below should probably be in the sortkey code... maybe not...
5863 if(primary1
!= 0) { /* if we were shifted and we got an ignorable code point */
5864 /* we should just completely ignore it */
5868 //*wasShifted = TRUE;
// Clears the carried flag (presumably the not-shifted branch; its enclosing else is not visible here).
5871 *wasShifted
= FALSE
;
// terminatePSKLevel: writes a level separator into the partial sort key.
//   level     level that has just been finished
//   maxLevel  last level that will be produced
//   i         write index into dest, taken by reference and advanced by one
//             when a separator is written
//   dest      output buffer
// Nothing is written when level >= maxLevel. No check of i against the
// buffer capacity is made in the visible lines.
5876 void terminatePSKLevel(int32_t level
, int32_t maxLevel
, int32_t &i
, uint8_t *dest
) {
5877 if(level
< maxLevel
) {
5878 dest
[i
++] = UCOL_LEVELTERMINATOR
;
5884 /** enumeration of level identifiers for partial sort key generation */
// The current level is kept in state[1] under UCOL_PSK_LEVEL_SHIFT (0) and
// UCOL_PSK_LEVEL_MASK (7), i.e. three bits, so identifiers must stay in 0..7.
// NOTE(review): no enumerator with value 2 is visible in this extract;
// UCOL_PSK_CASE, which ucol_nextSortKeyPart uses, presumably fills that
// slot - confirm against the full source.
5886 UCOL_PSK_PRIMARY
= 0,
5887 UCOL_PSK_SECONDARY
= 1,
5889 UCOL_PSK_TERTIARY
= 3,
5890 UCOL_PSK_QUATERNARY
= 4,
5891 UCOL_PSK_QUIN
= 5, /** This is an extra level, not used - but we have three bits to blow */
5892 UCOL_PSK_IDENTICAL
= 6,
5893 UCOL_PSK_NULL
= 7, /** level for the end of sort key. Will just produce zeros */
5897 /** collation state enum. *_SHIFT value is how much to shift right
5898 * to get the state piece to the right. *_MASK value should be
5899 * ANDed with the shifted state. This data is stored in state[1]
// Bit layout of state[1] implied by the shift/mask pairs below:
//   bits 0..2   level
//   bit  3      byte count / French done
//   bit  4      was shifted
//   bits 5..6   French bytes used
//   bits 7..8   bocsu bytes used
//   bits 9..27  consumed CEs (the mask 0x7FFFF is 19 bits wide)
5903 UCOL_PSK_LEVEL_SHIFT
= 0, /** level identifier. stores an enum value from above */
5904 UCOL_PSK_LEVEL_MASK
= 7, /** three bits */
5905 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
= 3, /** number of bytes of primary or quaternary already written */
5906 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
= 1,
5907 /** can be only 0 or 1, since we get up to two bytes from primary or quaternary
5908 * This field is also used to denote that the French secondary level is finished
5910 UCOL_PSK_WAS_SHIFTED_SHIFT
= 4,/** was the last value shifted */
5911 UCOL_PSK_WAS_SHIFTED_MASK
= 1, /** can be 0 or 1 (Boolean) */
5912 UCOL_PSK_USED_FRENCH_SHIFT
= 5,/** how many French bytes have we already written */
5913 UCOL_PSK_USED_FRENCH_MASK
= 3, /** up to 4 bytes. See comment just below */
5914 /** When we do French we need to reverse secondary values. However, continuations
5915 * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
5917 UCOL_PSK_BOCSU_BYTES_SHIFT
= 7,
5918 UCOL_PSK_BOCSU_BYTES_MASK
= 3,
5919 UCOL_PSK_CONSUMED_CES_SHIFT
= 9,
5920 UCOL_PSK_CONSUMED_CES_MASK
= 0x7FFFF
// macro calculating the number of expansion CEs available:
// the distance between the CEpos and toReturn members of the iterator s.
// The expansion is fully parenthesized so it stays a single operand when the
// macro is combined with other operators (e.g. !m(s) or 2 * m(s)).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5927 /** main sortkey part procedure. On the first call,
5928 * you should pass in a collator, an iterator, empty state
5929 * state[0] == state[1] == 0, a buffer to hold results
5930 * number of bytes you need and an error code pointer.
5931 * Make sure your buffer is big enough to hold the wanted
5932 * number of sortkey bytes. I don't check.
5933 * The only meaningful status you can get back is
5934 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5935 * have been dealt a raw deal and that you probably won't
5936 * be able to use partial sortkey generation for this
5937 * particular combination of string and collator. This
5938 * is highly unlikely, but you should still check the error code.
5939 * Any other status means that you're not in a sane situation
5940 * anymore. After the first call, preserve state values and
5941 * use them on subsequent calls to obtain more bytes of a sortkey.
5942 * Use until the number of bytes written is smaller than the requested
5943 * number of bytes. Generated sortkey is not compatible with the
5944 * one generated by ucol_getSortKey, as we don't do any compression.
5945 * However, levels are still terminated by a 1 (one) and the sortkey
5946 * is terminated by a 0 (zero). Identical level is the same as in the
5947 * regular sortkey - internal bocu-1 implementation is used.
5948 * For curious, although you cannot do much about this, here is
5949 * the structure of state words.
5950 * state[0] - iterator state. Depends on the iterator implementation,
5951 * but allows the iterator to continue where it stopped in
5952 * the last iteration.
5953 * state[1] - collation processing state. Here is the distribution
5955 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5956 * quaternary, quin (we don't use this one), identical and
5957 * null (producing only zeroes - first one to terminate the
5958 * sortkey and subsequent to fill the buffer).
5959 * 3 - byte count. Number of bytes written on the primary level.
5960 * 4 - was shifted. Whether the previous iteration finished in the
5962 * 5, 6 - French continuation bytes written. See the comment in the enum
5963 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on
5964 * the identical level.
5965 * 9..31 - CEs consumed. Number of getCE or next32 operations performed
5966 * since the last successful update of the iterator state.
5968 U_CAPI
int32_t U_EXPORT2
5969 ucol_nextSortKeyPart(const UCollator
*coll
,
5970 UCharIterator
*iter
,
5972 uint8_t *dest
, int32_t count
,
5975 /* error checking */
5976 if(status
==NULL
|| U_FAILURE(*status
)) {
5979 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART
);
5980 if( coll
==NULL
|| iter
==NULL
||
5982 count
<0 || (count
>0 && dest
==NULL
)
5984 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
5985 UTRACE_EXIT_STATUS(status
);
5989 UTRACE_DATA6(UTRACE_VERBOSE
, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5990 coll
, iter
, state
[0], state
[1], dest
, count
);
5994 UTRACE_EXIT_VALUE(0);
5997 /** Setting up situation according to the state we got from the previous iteration */
5998 // The state of the iterator from the previous invocation
5999 uint32_t iterState
= state
[0];
6000 // Has the last iteration ended in the shifted state
6001 UBool wasShifted
= ((state
[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT
) & UCOL_PSK_WAS_SHIFTED_MASK
)?TRUE
:FALSE
;
6002 // What is the current level of the sortkey?
6003 int32_t level
= (state
[1] >> UCOL_PSK_LEVEL_SHIFT
) & UCOL_PSK_LEVEL_MASK
;
6004 // Have we written only one byte from a two byte primary in the previous iteration?
6005 // Also on secondary level - have we finished with the French secondary?
6006 int32_t byteCountOrFrenchDone
= (state
[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
;
6007 // number of bytes in the continuation buffer for French
6008 int32_t usedFrench
= (state
[1] >> UCOL_PSK_USED_FRENCH_SHIFT
) & UCOL_PSK_USED_FRENCH_MASK
;
6009 // Number of bytes already written from a bocsu sequence. Since
6010 // the longest bocsu sequence is 4 long, this can be up to 3.
6011 int32_t bocsuBytesUsed
= (state
[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT
) & UCOL_PSK_BOCSU_BYTES_MASK
;
6012 // Number of elements that need to be consumed in this iteration because
6013 // the iterator returned UITER_NO_STATE at the end of the last iteration,
6014 // so we had to save the last valid state.
6015 int32_t cces
= (state
[1] >> UCOL_PSK_CONSUMED_CES_SHIFT
) & UCOL_PSK_CONSUMED_CES_MASK
;
6017 /** values that depend on the collator attributes */
6018 // strength of the collator.
6019 int32_t strength
= ucol_getAttribute(coll
, UCOL_STRENGTH
, status
);
6020 // maximal level of the partial sortkey. Need to take whether case level is done
6021 int32_t maxLevel
= 0;
6022 if(strength
< UCOL_TERTIARY
) {
6023 if(ucol_getAttribute(coll
, UCOL_CASE_LEVEL
, status
) == UCOL_ON
) {
6024 maxLevel
= UCOL_PSK_CASE
;
6026 maxLevel
= strength
;
6029 if(strength
== UCOL_TERTIARY
) {
6030 maxLevel
= UCOL_PSK_TERTIARY
;
6031 } else if(strength
== UCOL_QUATERNARY
) {
6032 maxLevel
= UCOL_PSK_QUATERNARY
;
6033 } else { // identical
6034 maxLevel
= UCOL_IDENTICAL
;
6037 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
6038 uint8_t UCOL_HIRAGANA_QUAD
=
6039 (ucol_getAttribute(coll
, UCOL_HIRAGANA_QUATERNARY_MODE
, status
) == UCOL_ON
)?0xFE:0xFF;
6040 // Boundary value that decides whether a CE is shifted or not
6041 uint32_t LVT
= (coll
->alternateHandling
== UCOL_SHIFTED
)?(coll
->variableTopValue
<<16):0;
6042 // Are we doing French collation?
6043 UBool doingFrench
= (ucol_getAttribute(coll
, UCOL_FRENCH_COLLATION
, status
) == UCOL_ON
);
6045 /** initializing the collation state */
6046 UBool notIsContinuation
= FALSE
;
6047 uint32_t CE
= UCOL_NO_MORE_CES
;
6050 IInit_collIterate(coll
, NULL
, -1, &s
);
6052 s
.flags
|= UCOL_USE_ITERATOR
;
6053 // This variable tells us whether we have produced some other levels in this iteration
6054 // before we moved to the identical level. In that case, we need to switch the
6055 // type of the iterator.
6056 UBool doingIdenticalFromStart
= FALSE
;
6057 // Normalizing iterator
6058 // The division for the array length may truncate the array size to
6059 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6060 // for all platforms anyway.
6061 UAlignedMemory stackNormIter
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
6062 UNormIterator
*normIter
= NULL
;
6063 // If the normalization is turned on for the collator and we are below identical level
6064 // we will use a FCD normalizing iterator
6065 if(ucol_getAttribute(coll
, UCOL_NORMALIZATION_MODE
, status
) == UCOL_ON
&& level
< UCOL_PSK_IDENTICAL
) {
6066 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
6067 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_FCD
, status
);
6068 s
.flags
&= ~UCOL_ITER_NORM
;
6069 if(U_FAILURE(*status
)) {
6070 UTRACE_EXIT_STATUS(*status
);
6073 } else if(level
== UCOL_PSK_IDENTICAL
) {
6074 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
6075 // will be updating the state - and this cannot be done on an ordinary iterator.
6076 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
6077 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
6078 s
.flags
&= ~UCOL_ITER_NORM
;
6079 if(U_FAILURE(*status
)) {
6080 UTRACE_EXIT_STATUS(*status
);
6083 doingIdenticalFromStart
= TRUE
;
6086 // This is the tentative new state of the iterator. The problem
6087 // is that the iterator might return an undefined state, in
6088 // which case we should save the last valid state and increase
6089 // the iterator skip value.
6090 uint32_t newState
= 0;
6092 // First, we set the iterator to the last valid position
6093 // from the last iteration. This was saved in state[0].
6094 if(iterState
== 0) {
6096 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
&& !byteCountOrFrenchDone
) {
6097 s
.iterator
->move(s
.iterator
, 0, UITER_LIMIT
);
6099 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6102 /* reset to previous state */
6103 s
.iterator
->setState(s
.iterator
, iterState
, status
);
6104 if(U_FAILURE(*status
)) {
6105 UTRACE_EXIT_STATUS(*status
);
6112 // This variable tells us whether we can attempt to update the state
6113 // of iterator. Situations where we don't want to update iterator state
6114 // are the existence of expansion CEs that are not yet processed, and
6115 // finishing the case level without enough space in the buffer to insert
6116 // a level terminator.
6117 UBool canUpdateState
= TRUE
;
6119 // Consume all the CEs that were consumed at the end of the previous
6120 // iteration without updating the iterator state. On identical level,
6121 // consume the code points.
6122 int32_t counter
= cces
;
6123 if(level
< UCOL_PSK_IDENTICAL
) {
6124 while(counter
-->0) {
6125 // If we're doing French and we are on the secondary level,
6127 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
) {
6128 CE
= ucol_IGetPrevCE(coll
, &s
, status
);
6130 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6132 if(CE
==UCOL_NO_MORE_CES
) {
6133 /* should not happen */
6134 *status
=U_INTERNAL_PROGRAM_ERROR
;
6135 UTRACE_EXIT_STATUS(*status
);
6138 if(uprv_numAvailableExpCEs(s
)) {
6139 canUpdateState
= FALSE
;
6143 while(counter
-->0) {
6144 uiter_next32(s
.iterator
);
6148 // French secondary needs to know whether the iterator state of zero came from previous level OR
6149 // from a new invocation...
6150 UBool wasDoingPrimary
= FALSE
;
6151 // destination buffer byte counter. When this guy
6152 // gets to count, we're done with the iteration
6154 // used to count the zero bytes written after we
6155 // have finished with the sort key
6159 // Hm.... I think we're ready to plunge in. Basic story is as following:
6160 // we have a fall through case based on level. This is used for initial
6161 // positioning on iteration start. Every level processor contains a
6162 // for(;;) which will be broken when we exhaust all the CEs. Other
6163 // way to exit is a goto saveState, which happens when we have filled
6166 case UCOL_PSK_PRIMARY
:
6167 wasDoingPrimary
= TRUE
;
6172 // We should save the state only if we
6173 // are sure that we are done with the
6174 // previous iterator state
6175 if(canUpdateState
&& byteCountOrFrenchDone
== 0) {
6176 newState
= s
.iterator
->getState(s
.iterator
);
6177 if(newState
!= UITER_NO_STATE
) {
6178 iterState
= newState
;
6182 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6184 if(CE
==UCOL_NO_MORE_CES
) {
6185 // Add the level separator
6186 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6187 byteCountOrFrenchDone
=0;
6188 // Restart the iteration an move to the
6190 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6192 level
= UCOL_PSK_SECONDARY
;
6195 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6196 CE
>>= UCOL_PRIMARYORDERSHIFT
; /* get primary */
6198 if(byteCountOrFrenchDone
== 0) {
6199 // get the second byte of primary
6200 dest
[i
++]=(uint8_t)(CE
>> 8);
6202 byteCountOrFrenchDone
= 0;
6204 if((CE
&=0xff)!=0) {
6207 byteCountOrFrenchDone
= 1;
6211 dest
[i
++]=(uint8_t)CE
;
6215 if(uprv_numAvailableExpCEs(s
)) {
6216 canUpdateState
= FALSE
;
6218 canUpdateState
= TRUE
;
6221 /* fall through to next level */
6222 case UCOL_PSK_SECONDARY
:
6223 if(strength
>= UCOL_SECONDARY
) {
6229 // We should save the state only if we
6230 // are sure that we are done with the
6231 // previous iterator state
6232 if(canUpdateState
) {
6233 newState
= s
.iterator
->getState(s
.iterator
);
6234 if(newState
!= UITER_NO_STATE
) {
6235 iterState
= newState
;
6239 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6241 if(CE
==UCOL_NO_MORE_CES
) {
6242 // Add the level separator
6243 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6244 byteCountOrFrenchDone
= 0;
6245 // Restart the iteration an move to the
6247 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6249 level
= UCOL_PSK_CASE
;
6252 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6253 CE
>>= 8; /* get secondary */
6255 dest
[i
++]=(uint8_t)CE
;
6258 if(uprv_numAvailableExpCEs(s
)) {
6259 canUpdateState
= FALSE
;
6261 canUpdateState
= TRUE
;
6264 } else { // French secondary processing
6265 uint8_t frenchBuff
[UCOL_MAX_BUFFER
];
6266 int32_t frenchIndex
= 0;
6267 // Here we are going backwards.
6268 // If the iterator is at the beginning, it should be
6270 if(wasDoingPrimary
) {
6271 s
.iterator
->move(s
.iterator
, 0, UITER_LIMIT
);
6278 if(canUpdateState
) {
6279 newState
= s
.iterator
->getState(s
.iterator
);
6280 if(newState
!= UITER_NO_STATE
) {
6281 iterState
= newState
;
6285 CE
= ucol_IGetPrevCE(coll
, &s
, status
);
6287 if(CE
==UCOL_NO_MORE_CES
) {
6288 // Add the level separator
6289 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6290 byteCountOrFrenchDone
= 0;
6291 // Restart the iteration an move to the next level
6292 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6293 level
= UCOL_PSK_CASE
;
6296 if(isContinuation(CE
)) { // if it's a continuation, we want to save it and
6297 // reverse when we get a first non-continuation CE.
6299 frenchBuff
[frenchIndex
++] = (uint8_t)CE
;
6300 } else if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6301 CE
>>= 8; /* get secondary */
6304 dest
[i
++]=(uint8_t)CE
;
6307 frenchBuff
[frenchIndex
++] = (uint8_t)CE
;
6308 frenchIndex
-= usedFrench
;
6310 while(i
< count
&& frenchIndex
) {
6311 dest
[i
++] = frenchBuff
[--frenchIndex
];
6316 if(uprv_numAvailableExpCEs(s
)) {
6317 canUpdateState
= FALSE
;
6319 canUpdateState
= TRUE
;
6324 level
= UCOL_PSK_CASE
;
6326 /* fall through to next level */
6328 if(ucol_getAttribute(coll
, UCOL_CASE_LEVEL
, status
) == UCOL_ON
) {
6329 uint32_t caseShift
= UCOL_CASE_SHIFT_START
;
6330 uint8_t caseByte
= UCOL_CASE_BYTE_START
;
6331 uint8_t caseBits
= 0;
6337 // We should save the state only if we
6338 // are sure that we are done with the
6339 // previous iterator state
6340 if(canUpdateState
) {
6341 newState
= s
.iterator
->getState(s
.iterator
);
6342 if(newState
!= UITER_NO_STATE
) {
6343 iterState
= newState
;
6347 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6349 if(CE
==UCOL_NO_MORE_CES
) {
6350 // On the case level we might have an unfinished
6351 // case byte. Add one if it's started.
6352 if(caseShift
!= UCOL_CASE_SHIFT_START
) {
6353 dest
[i
++] = caseByte
;
6356 // We have finished processing CEs on this level.
6357 // However, we don't know if we have enough space
6358 // to add a case level terminator.
6360 // Add the level separator
6361 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6362 // Restart the iteration and move to the
6364 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6365 level
= UCOL_PSK_TERTIARY
;
6367 canUpdateState
= FALSE
;
6372 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6373 if(!isContinuation(CE
) && ((CE
& UCOL_PRIMARYMASK
) != 0 || strength
> UCOL_PRIMARY
)) {
6374 // do the case level if we need to do it. We don't want to calculate
6375 // case level for primary ignorables if we have only primary strength and case level
6376 // otherwise we would break well formedness of CEs
6377 CE
= (uint8_t)(CE
& UCOL_BYTE_SIZE_MASK
);
6378 caseBits
= (uint8_t)(CE
& 0xC0);
6379 // this copies the case level logic from the
6380 // sort key generation code
6382 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
6383 if((caseBits
& 0xC0) == 0) {
6384 caseByte
|= 1 << (--caseShift
);
6386 caseByte
|= 0 << (--caseShift
);
6388 if(caseShift
== 0) {
6389 dest
[i
++] = caseByte
;
6390 caseShift
= UCOL_CASE_SHIFT_START
;
6391 caseByte
= UCOL_CASE_BYTE_START
;
6393 caseByte
|= ((caseBits
>>6)&1) << (--caseShift
);
6396 if((caseBits
& 0xC0) == 0) {
6397 caseByte
|= 0 << (--caseShift
);
6399 caseByte
|= 1 << (--caseShift
);
6401 if(caseShift
== 0) {
6402 dest
[i
++] = caseByte
;
6403 caseShift
= UCOL_CASE_SHIFT_START
;
6404 caseByte
= UCOL_CASE_BYTE_START
;
6406 caseByte
|= ((caseBits
>>7)&1) << (--caseShift
);
6413 // Not sure this is correct for the case level - revisit
6414 if(uprv_numAvailableExpCEs(s
)) {
6415 canUpdateState
= FALSE
;
6417 canUpdateState
= TRUE
;
6421 level
= UCOL_PSK_TERTIARY
;
6423 /* fall through to next level */
6424 case UCOL_PSK_TERTIARY
:
6425 if(strength
>= UCOL_TERTIARY
) {
6430 // We should save the state only if we
6431 // are sure that we are done with the
6432 // previous iterator state
6433 if(canUpdateState
) {
6434 newState
= s
.iterator
->getState(s
.iterator
);
6435 if(newState
!= UITER_NO_STATE
) {
6436 iterState
= newState
;
6440 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6442 if(CE
==UCOL_NO_MORE_CES
) {
6443 // Add the level separator
6444 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6445 byteCountOrFrenchDone
= 0;
6446 // Restart the iteration an move to the
6448 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6450 level
= UCOL_PSK_QUATERNARY
;
6453 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6454 notIsContinuation
= !isContinuation(CE
);
6456 if(notIsContinuation
) {
6457 CE
= (uint8_t)(CE
& UCOL_BYTE_SIZE_MASK
);
6458 CE
^= coll
->caseSwitch
;
6459 CE
&= coll
->tertiaryMask
;
6461 CE
= (uint8_t)((CE
& UCOL_REMOVE_CONTINUATION
));
6465 dest
[i
++]=(uint8_t)CE
;
6468 if(uprv_numAvailableExpCEs(s
)) {
6469 canUpdateState
= FALSE
;
6471 canUpdateState
= TRUE
;
6475 // if we're not doing tertiary
6477 level
= UCOL_PSK_NULL
;
6479 /* fall through to next level */
6480 case UCOL_PSK_QUATERNARY
:
6481 if(strength
>= UCOL_QUATERNARY
) {
6486 // We should save the state only if we
6487 // are sure that we are done with the
6488 // previous iterator state
6489 if(canUpdateState
) {
6490 newState
= s
.iterator
->getState(s
.iterator
);
6491 if(newState
!= UITER_NO_STATE
) {
6492 iterState
= newState
;
6496 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6498 if(CE
==UCOL_NO_MORE_CES
) {
6499 // Add the level separator
6500 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6501 //dest[i++] = UCOL_LEVELTERMINATOR;
6502 byteCountOrFrenchDone
= 0;
6503 // Restart the iteration an move to the
6505 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6507 level
= UCOL_PSK_QUIN
;
6512 if(isShiftedCE(CE
, LVT
, &wasShifted
)) {
6513 CE
>>= 16; /* get primary */
6515 if(byteCountOrFrenchDone
== 0) {
6516 dest
[i
++]=(uint8_t)(CE
>> 8);
6518 byteCountOrFrenchDone
= 0;
6520 if((CE
&=0xff)!=0) {
6523 byteCountOrFrenchDone
= 1;
6526 dest
[i
++]=(uint8_t)CE
;
6530 notIsContinuation
= !isContinuation(CE
);
6531 if(notIsContinuation
) {
6532 if(s
.flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
6533 dest
[i
++] = UCOL_HIRAGANA_QUAD
;
6539 if(uprv_numAvailableExpCEs(s
)) {
6540 canUpdateState
= FALSE
;
6542 canUpdateState
= TRUE
;
6546 // if we're not doing quaternary
6548 level
= UCOL_PSK_NULL
;
6550 /* fall through to next level */
6552 level
= UCOL_PSK_IDENTICAL
;
6553 /* fall through to next level */
6554 case UCOL_PSK_IDENTICAL
:
6555 if(strength
>= UCOL_IDENTICAL
) {
6556 UChar32 first
, second
;
6557 int32_t bocsuBytesWritten
= 0;
6558 // We always need to do identical on
6559 // the NFD form of the string.
6560 if(normIter
== NULL
) {
6561 // we arrived from the level below and
6562 // normalization was not turned on.
6563 // therefore, we need to make a fresh NFD iterator
6564 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
6565 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
6566 } else if(!doingIdenticalFromStart
) {
6567 // there is an iterator, but we did some other levels.
6568 // therefore, we have a FCD iterator - need to make
6570 // normIter being at the beginning does not guarantee
6571 // that the underlying iterator is at the beginning
6572 iter
->move(iter
, 0, UITER_START
);
6573 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
6575 // At this point we have a NFD iterator that is positioned
6576 // in the right place
6577 if(U_FAILURE(*status
)) {
6578 UTRACE_EXIT_STATUS(*status
);
6581 first
= uiter_previous32(s
.iterator
);
6582 // maybe we're at the start of the string
6583 if(first
== U_SENTINEL
) {
6586 uiter_next32(s
.iterator
);
6592 if(j
+1 < bocsuBytesWritten
) {
6593 bocsuBytesUsed
= j
+1;
6598 // On identical level, we will always save
6599 // the state if we reach this point, since
6600 // we don't depend on getNextCE for content
6601 // all the content is in our buffer and we
6602 // already either stored the full buffer OR
6603 // otherwise we won't arrive here.
6604 newState
= s
.iterator
->getState(s
.iterator
);
6605 if(newState
!= UITER_NO_STATE
) {
6606 iterState
= newState
;
6611 second
= uiter_next32(s
.iterator
);
6614 // end condition for identical level
6615 if(second
== U_SENTINEL
) {
6616 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6617 level
= UCOL_PSK_NULL
;
6620 bocsuBytesWritten
= u_writeIdenticalLevelRunTwoChars(first
, second
, buff
);
6624 if(bocsuBytesUsed
!= 0) {
6625 while(bocsuBytesUsed
-->0) {
6630 while(i
< count
&& j
< bocsuBytesWritten
) {
6631 dest
[i
++] = buff
[j
++];
6636 level
= UCOL_PSK_NULL
;
6638 /* fall through to next level */
6646 *status
= U_INTERNAL_PROGRAM_ERROR
;
6647 UTRACE_EXIT_STATUS(*status
);
6652 // Now we need to return stuff. First we want to see whether we have
6653 // done everything for the current state of iterator.
6654 if(byteCountOrFrenchDone
6655 || canUpdateState
== FALSE
6656 || (newState
= s
.iterator
->getState(s
.iterator
)) == UITER_NO_STATE
)
6658 // Any of above mean that the previous transaction
6659 // wasn't finished and that we should store the
6660 // previous iterator state.
6661 state
[0] = iterState
;
6663 // The transaction is complete. We will continue in the next iteration.
6664 state
[0] = s
.iterator
->getState(s
.iterator
);
6667 // Store the number of bocsu bytes written.
6668 if((bocsuBytesUsed
& UCOL_PSK_BOCSU_BYTES_MASK
) != bocsuBytesUsed
) {
6669 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6671 state
[1] = (bocsuBytesUsed
& UCOL_PSK_BOCSU_BYTES_MASK
) << UCOL_PSK_BOCSU_BYTES_SHIFT
;
6673 // Next we put in the level of comparison
6674 state
[1] |= ((level
& UCOL_PSK_LEVEL_MASK
) << UCOL_PSK_LEVEL_SHIFT
);
6676 // If we are doing French, we need to store whether we have just finished the French level
6677 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
) {
6678 state
[1] |= (((state
[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
);
6680 state
[1] |= ((byteCountOrFrenchDone
& UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
);
6683 // Was the latest CE shifted
6685 state
[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT
;
6687 // Check for cces overflow
6688 if((cces
& UCOL_PSK_CONSUMED_CES_MASK
) != cces
) {
6689 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6692 state
[1] |= ((cces
& UCOL_PSK_CONSUMED_CES_MASK
) << UCOL_PSK_CONSUMED_CES_SHIFT
);
6694 // Check for French overflow
6695 if((usedFrench
& UCOL_PSK_USED_FRENCH_MASK
) != usedFrench
) {
6696 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6698 // Store number of bytes written in the French secondary continuation sequence
6699 state
[1] |= ((usedFrench
& UCOL_PSK_USED_FRENCH_MASK
) << UCOL_PSK_USED_FRENCH_SHIFT
);
6702 // If we have used normalizing iterator, get rid of it
6703 if(normIter
!= NULL
) {
6704 unorm_closeIter(normIter
);
6707 /* To avoid memory leak, free the offset buffer if necessary. */
6708 freeOffsetBuffer(&s
);
6710 // Return number of meaningful sortkey bytes.
6711 UTRACE_DATA4(UTRACE_VERBOSE
, "dest = %vb, state=%d %d",
6712 dest
,i
, state
[0], state
[1]);
6713 UTRACE_EXIT_VALUE(i
);
6718 * Produce a bound for a given sortkey and a number of levels.
6720 U_CAPI
int32_t U_EXPORT2
6721 ucol_getBound(const uint8_t *source
,
6722 int32_t sourceLength
,
6723 UColBoundMode boundType
,
6724 uint32_t noOfLevels
,
6726 int32_t resultLength
,
6729 // consistency checks
6730 if(status
== NULL
|| U_FAILURE(*status
)) {
6733 if(source
== NULL
) {
6734 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6738 int32_t sourceIndex
= 0;
6739 // Scan the string until we skip enough of the key OR reach the end of the key
6742 if(source
[sourceIndex
] == UCOL_LEVELTERMINATOR
) {
6745 } while (noOfLevels
> 0
6746 && (source
[sourceIndex
] != 0 || sourceIndex
< sourceLength
));
6748 if((source
[sourceIndex
] == 0 || sourceIndex
== sourceLength
)
6749 && noOfLevels
> 0) {
6750 *status
= U_SORT_KEY_TOO_SHORT_WARNING
;
6754 // READ ME: this code assumes that the values for boundType
6755 // enum will not changes. They are set so that the enum value
6756 // corresponds to the number of extra bytes each bound type
6758 if(result
!= NULL
&& resultLength
>= sourceIndex
+boundType
) {
6759 uprv_memcpy(result
, source
, sourceIndex
);
6761 // Lower bound just gets terminated. No extra bytes
6762 case UCOL_BOUND_LOWER
: // = 0
6764 // Upper bound needs one extra byte
6765 case UCOL_BOUND_UPPER
: // = 1
6766 result
[sourceIndex
++] = 2;
6768 // Upper long bound needs two extra bytes
6769 case UCOL_BOUND_UPPER_LONG
: // = 2
6770 result
[sourceIndex
++] = 0xFF;
6771 result
[sourceIndex
++] = 0xFF;
6774 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6777 result
[sourceIndex
++] = 0;
6781 return sourceIndex
+boundType
+1;
6785 /****************************************************************************/
6786 /* Following are the functions that deal with the properties of a collator */
6787 /* there are new APIs and some compatibility APIs */
6788 /****************************************************************************/
6791 ucol_addLatinOneEntry(UCollator
*coll
, UChar ch
, uint32_t CE
,
6792 int32_t *primShift
, int32_t *secShift
, int32_t *terShift
)
6794 uint8_t primary1
= 0, primary2
= 0, secondary
= 0, tertiary
= 0;
6795 UBool reverseSecondary
= FALSE
;
6796 if(!isContinuation(CE
)) {
6797 tertiary
= (uint8_t)((CE
& coll
->tertiaryMask
));
6798 tertiary
^= coll
->caseSwitch
;
6799 reverseSecondary
= TRUE
;
6801 tertiary
= (uint8_t)((CE
& UCOL_REMOVE_CONTINUATION
));
6802 tertiary
&= UCOL_REMOVE_CASE
;
6803 reverseSecondary
= FALSE
;
6806 secondary
= (uint8_t)((CE
>>= 8) & UCOL_BYTE_SIZE_MASK
);
6807 primary2
= (uint8_t)((CE
>>= 8) & UCOL_BYTE_SIZE_MASK
);
6808 primary1
= (uint8_t)(CE
>> 8);
6811 coll
->latinOneCEs
[ch
] |= (primary1
<< *primShift
);
6815 if(*primShift
< 0) {
6816 coll
->latinOneCEs
[ch
] = UCOL_BAIL_OUT_CE
;
6817 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6818 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6821 coll
->latinOneCEs
[ch
] |= (primary2
<< *primShift
);
6824 if(secondary
!= 0) {
6825 if(reverseSecondary
&& coll
->frenchCollation
== UCOL_ON
) { // reverse secondary
6826 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] >>= 8; // make space for secondary
6827 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] |= (secondary
<< 24);
6828 } else { // normal case
6829 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] |= (secondary
<< *secShift
);
6834 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] |= (tertiary
<< *terShift
);
6840 ucol_resizeLatinOneTable(UCollator
*coll
, int32_t size
, UErrorCode
*status
) {
6841 uint32_t *newTable
= (uint32_t *)uprv_malloc(size
*sizeof(uint32_t)*3);
6842 if(newTable
== NULL
) {
6843 *status
= U_MEMORY_ALLOCATION_ERROR
;
6844 coll
->latinOneFailed
= TRUE
;
6847 int32_t sizeToCopy
= ((size
<coll
->latinOneTableLen
)?size
:coll
->latinOneTableLen
)*sizeof(uint32_t);
6848 uprv_memset(newTable
, 0, size
*sizeof(uint32_t)*3);
6849 uprv_memcpy(newTable
, coll
->latinOneCEs
, sizeToCopy
);
6850 uprv_memcpy(newTable
+size
, coll
->latinOneCEs
+coll
->latinOneTableLen
, sizeToCopy
);
6851 uprv_memcpy(newTable
+2*size
, coll
->latinOneCEs
+2*coll
->latinOneTableLen
, sizeToCopy
);
6852 coll
->latinOneTableLen
= size
;
6853 uprv_free(coll
->latinOneCEs
);
6854 coll
->latinOneCEs
= newTable
;
6859 ucol_setUpLatinOne(UCollator
*coll
, UErrorCode
*status
) {
6860 UBool result
= TRUE
;
6861 if(coll
->latinOneCEs
== NULL
) {
6862 coll
->latinOneCEs
= (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN
*3);
6863 if(coll
->latinOneCEs
== NULL
) {
6864 *status
= U_MEMORY_ALLOCATION_ERROR
;
6867 coll
->latinOneTableLen
= UCOL_LATINONETABLELEN
;
6870 UCollationElements
*it
= ucol_openElements(coll
, &ch
, 1, status
);
6871 // Check for null pointer
6872 if (U_FAILURE(*status
)) {
6875 uprv_memset(coll
->latinOneCEs
, 0, sizeof(uint32_t)*coll
->latinOneTableLen
*3);
6877 int32_t primShift
= 24, secShift
= 24, terShift
= 24;
6879 int32_t contractionOffset
= UCOL_ENDOFLATINONERANGE
+1;
6881 // TODO: make safe if you get more than you wanted...
6882 for(ch
= 0; ch
<= UCOL_ENDOFLATINONERANGE
; ch
++) {
6883 primShift
= 24; secShift
= 24; terShift
= 24;
6885 CE
= coll
->latinOneMapping
[ch
];
6887 CE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, ch
);
6888 if(CE
== UCOL_NOT_FOUND
&& coll
->UCA
) {
6889 CE
= UTRIE_GET32_FROM_LEAD(&coll
->UCA
->mapping
, ch
);
6892 if(CE
< UCOL_NOT_FOUND
) {
6893 ucol_addLatinOneEntry(coll
, ch
, CE
, &primShift
, &secShift
, &terShift
);
6895 switch (getCETag(CE
)) {
6898 ucol_setText(it
, &ch
, 1, status
);
6899 while((int32_t)(CE
= ucol_next(it
, status
)) != UCOL_NULLORDER
) {
6900 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
6901 coll
->latinOneCEs
[ch
] = UCOL_BAIL_OUT_CE
;
6902 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6903 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6906 ucol_addLatinOneEntry(coll
, ch
, CE
, &primShift
, &secShift
, &terShift
);
6909 case CONTRACTION_TAG
:
6910 // here is the trick
6911 // F2 is contraction. We do something very similar to contractions
6912 // but have two indices, one in the real contraction table and the
6913 // other to where we stuffed things. This hopes that we don't have
6914 // many contractions (this should work for latin-1 tables).
6916 if((CE
& 0x00FFF000) != 0) {
6917 *status
= U_UNSUPPORTED_ERROR
;
6918 goto cleanup_after_failure
;
6921 const UChar
*UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
6923 CE
|= (contractionOffset
& 0xFFF) << 12; // insert the offset in latin-1 table
6925 coll
->latinOneCEs
[ch
] = CE
;
6926 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = CE
;
6927 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = CE
;
6929 // We're going to jump into contraction table, pick the elements
6932 CE
= *(coll
->contractionCEs
+
6933 (UCharOffset
- coll
->contractionIndex
));
6934 if(CE
> UCOL_NOT_FOUND
&& getCETag(CE
) == EXPANSION_TAG
) {
6936 uint32_t i
; /* general counter */
6937 uint32_t *CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
6938 size
= getExpansionCount(CE
);
6940 if(size
!= 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6941 for(i
= 0; i
<size
; i
++) {
6942 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
6943 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6944 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6945 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6948 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
, *CEOffset
++, &primShift
, &secShift
, &terShift
);
6950 } else { /* else, we do */
6951 while(*CEOffset
!= 0) {
6952 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
6953 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6954 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6955 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6958 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
, *CEOffset
++, &primShift
, &secShift
, &terShift
);
6961 contractionOffset
++;
6962 } else if(CE
< UCOL_NOT_FOUND
) {
6963 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
++, CE
, &primShift
, &secShift
, &terShift
);
6965 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6966 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6967 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6968 contractionOffset
++;
6971 primShift
= 24; secShift
= 24; terShift
= 24;
6972 if(contractionOffset
== coll
->latinOneTableLen
) { // we need to reallocate
6973 if(!ucol_resizeLatinOneTable(coll
, 2*coll
->latinOneTableLen
, status
)) {
6974 goto cleanup_after_failure
;
6977 } while(*UCharOffset
!= 0xFFFF);
6982 // 0xB7 is a precontext character defined in UCA5.1, a special
6983 // handle is implemeted in order to save LatinOne table for
6986 ucol_addLatinOneEntry(coll
, ch
, CE
, &primShift
, &secShift
, &terShift
);
6989 goto cleanup_after_failure
;
6994 goto cleanup_after_failure
;
6999 if(contractionOffset
< coll
->latinOneTableLen
) {
7000 if(!ucol_resizeLatinOneTable(coll
, contractionOffset
, status
)) {
7001 goto cleanup_after_failure
;
7004 ucol_closeElements(it
);
7007 cleanup_after_failure
:
7008 // status should already be set before arriving here.
7009 coll
->latinOneFailed
= TRUE
;
7010 ucol_closeElements(it
);
7014 void ucol_updateInternalState(UCollator
*coll
, UErrorCode
*status
) {
7015 if(U_SUCCESS(*status
)) {
7016 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
7017 coll
->caseSwitch
= UCOL_CASE_SWITCH
;
7019 coll
->caseSwitch
= UCOL_NO_CASE_SWITCH
;
7022 if(coll
->caseLevel
== UCOL_ON
|| coll
->caseFirst
== UCOL_OFF
) {
7023 coll
->tertiaryMask
= UCOL_REMOVE_CASE
;
7024 coll
->tertiaryCommon
= UCOL_COMMON3_NORMAL
;
7025 coll
->tertiaryAddition
= (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF
; /* Should be 0x80 */
7026 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_OFF
;
7027 coll
->tertiaryBottom
= UCOL_COMMON_BOT3
;
7029 coll
->tertiaryMask
= UCOL_KEEP_CASE
;
7030 coll
->tertiaryAddition
= UCOL_FLAG_BIT_MASK_CASE_SW_ON
;
7031 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
7032 coll
->tertiaryCommon
= UCOL_COMMON3_UPPERFIRST
;
7033 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_UPPER
;
7034 coll
->tertiaryBottom
= UCOL_COMMON_BOTTOM3_CASE_SW_UPPER
;
7036 coll
->tertiaryCommon
= UCOL_COMMON3_NORMAL
;
7037 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_LOWER
;
7038 coll
->tertiaryBottom
= UCOL_COMMON_BOTTOM3_CASE_SW_LOWER
;
7042 /* Set the compression values */
7043 uint8_t tertiaryTotal
= (uint8_t)(coll
->tertiaryTop
- UCOL_COMMON_BOT3
-1);
7044 coll
->tertiaryTopCount
= (uint8_t)(UCOL_PROPORTION3
*tertiaryTotal
); /* we multilply double with int, but need only int */
7045 coll
->tertiaryBottomCount
= (uint8_t)(tertiaryTotal
- coll
->tertiaryTopCount
);
7047 if(coll
->caseLevel
== UCOL_OFF
&& coll
->strength
== UCOL_TERTIARY
7048 && coll
->frenchCollation
== UCOL_OFF
&& coll
->alternateHandling
== UCOL_NON_IGNORABLE
)
7050 coll
->sortKeyGen
= ucol_calcSortKeySimpleTertiary
;
7052 coll
->sortKeyGen
= ucol_calcSortKey
;
7054 if(coll
->caseLevel
== UCOL_OFF
&& coll
->strength
<= UCOL_TERTIARY
&& coll
->numericCollation
== UCOL_OFF
7055 && coll
->alternateHandling
== UCOL_NON_IGNORABLE
&& !coll
->latinOneFailed
)
7057 if(coll
->latinOneCEs
== NULL
|| coll
->latinOneRegenTable
) {
7058 if(ucol_setUpLatinOne(coll
, status
)) { // if we succeed in building latin1 table, we'll use it
7059 //fprintf(stderr, "F");
7060 coll
->latinOneUse
= TRUE
;
7062 coll
->latinOneUse
= FALSE
;
7064 if(*status
== U_UNSUPPORTED_ERROR
) {
7065 *status
= U_ZERO_ERROR
;
7067 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
7068 coll
->latinOneUse
= TRUE
;
7071 coll
->latinOneUse
= FALSE
;
7076 U_CAPI
uint32_t U_EXPORT2
7077 ucol_setVariableTop(UCollator
*coll
, const UChar
*varTop
, int32_t len
, UErrorCode
*status
) {
7078 if(U_FAILURE(*status
) || coll
== NULL
) {
7082 len
= u_strlen(varTop
);
7085 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7090 IInit_collIterate(coll
, varTop
, len
, &s
);
7092 uint32_t CE
= ucol_IGetNextCE(coll
, &s
, status
);
7094 /* here we check if we have consumed all characters */
7095 /* you can put in either one character or a contraction */
7096 /* you shouldn't put more... */
7097 if(s
.pos
!= s
.endp
|| CE
== UCOL_NO_MORE_CES
) {
7098 *status
= U_CE_NOT_FOUND_ERROR
;
7102 uint32_t nextCE
= ucol_IGetNextCE(coll
, &s
, status
);
7104 if(isContinuation(nextCE
) && (nextCE
& UCOL_PRIMARYMASK
) != 0) {
7105 *status
= U_PRIMARY_TOO_LONG_ERROR
;
7108 if(coll
->variableTopValue
!= (CE
& UCOL_PRIMARYMASK
)>>16) {
7109 coll
->variableTopValueisDefault
= FALSE
;
7110 coll
->variableTopValue
= (CE
& UCOL_PRIMARYMASK
)>>16;
7113 /* To avoid memory leak, free the offset buffer if necessary. */
7114 freeOffsetBuffer(&s
);
7116 return CE
& UCOL_PRIMARYMASK
;
7119 U_CAPI
uint32_t U_EXPORT2
ucol_getVariableTop(const UCollator
*coll
, UErrorCode
*status
) {
7120 if(U_FAILURE(*status
) || coll
== NULL
) {
7123 return coll
->variableTopValue
<<16;
7126 U_CAPI
void U_EXPORT2
7127 ucol_restoreVariableTop(UCollator
*coll
, const uint32_t varTop
, UErrorCode
*status
) {
7128 if(U_FAILURE(*status
) || coll
== NULL
) {
7132 if(coll
->variableTopValue
!= (varTop
& UCOL_PRIMARYMASK
)>>16) {
7133 coll
->variableTopValueisDefault
= FALSE
;
7134 coll
->variableTopValue
= (varTop
& UCOL_PRIMARYMASK
)>>16;
7137 /* Attribute setter API */
7138 U_CAPI
void U_EXPORT2
7139 ucol_setAttribute(UCollator
*coll
, UColAttribute attr
, UColAttributeValue value
, UErrorCode
*status
) {
7140 if(U_FAILURE(*status
) || coll
== NULL
) {
7143 UColAttributeValue oldFrench
= coll
->frenchCollation
;
7144 UColAttributeValue oldCaseFirst
= coll
->caseFirst
;
7146 case UCOL_NUMERIC_COLLATION
: /* sort substrings of digits as numbers */
7147 if(value
== UCOL_ON
) {
7148 coll
->numericCollation
= UCOL_ON
;
7149 coll
->numericCollationisDefault
= FALSE
;
7150 } else if (value
== UCOL_OFF
) {
7151 coll
->numericCollation
= UCOL_OFF
;
7152 coll
->numericCollationisDefault
= FALSE
;
7153 } else if (value
== UCOL_DEFAULT
) {
7154 coll
->numericCollationisDefault
= TRUE
;
7155 coll
->numericCollation
= (UColAttributeValue
)coll
->options
->numericCollation
;
7157 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7160 case UCOL_HIRAGANA_QUATERNARY_MODE
: /* special quaternary values for Hiragana */
7161 if(value
== UCOL_ON
) {
7162 coll
->hiraganaQ
= UCOL_ON
;
7163 coll
->hiraganaQisDefault
= FALSE
;
7164 } else if (value
== UCOL_OFF
) {
7165 coll
->hiraganaQ
= UCOL_OFF
;
7166 coll
->hiraganaQisDefault
= FALSE
;
7167 } else if (value
== UCOL_DEFAULT
) {
7168 coll
->hiraganaQisDefault
= TRUE
;
7169 coll
->hiraganaQ
= (UColAttributeValue
)coll
->options
->hiraganaQ
;
7171 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7174 case UCOL_FRENCH_COLLATION
: /* attribute for direction of secondary weights*/
7175 if(value
== UCOL_ON
) {
7176 coll
->frenchCollation
= UCOL_ON
;
7177 coll
->frenchCollationisDefault
= FALSE
;
7178 } else if (value
== UCOL_OFF
) {
7179 coll
->frenchCollation
= UCOL_OFF
;
7180 coll
->frenchCollationisDefault
= FALSE
;
7181 } else if (value
== UCOL_DEFAULT
) {
7182 coll
->frenchCollationisDefault
= TRUE
;
7183 coll
->frenchCollation
= (UColAttributeValue
)coll
->options
->frenchCollation
;
7185 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7188 case UCOL_ALTERNATE_HANDLING
: /* attribute for handling variable elements*/
7189 if(value
== UCOL_SHIFTED
) {
7190 coll
->alternateHandling
= UCOL_SHIFTED
;
7191 coll
->alternateHandlingisDefault
= FALSE
;
7192 } else if (value
== UCOL_NON_IGNORABLE
) {
7193 coll
->alternateHandling
= UCOL_NON_IGNORABLE
;
7194 coll
->alternateHandlingisDefault
= FALSE
;
7195 } else if (value
== UCOL_DEFAULT
) {
7196 coll
->alternateHandlingisDefault
= TRUE
;
7197 coll
->alternateHandling
= (UColAttributeValue
)coll
->options
->alternateHandling
;
7199 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7202 case UCOL_CASE_FIRST
: /* who goes first, lower case or uppercase */
7203 if(value
== UCOL_LOWER_FIRST
) {
7204 coll
->caseFirst
= UCOL_LOWER_FIRST
;
7205 coll
->caseFirstisDefault
= FALSE
;
7206 } else if (value
== UCOL_UPPER_FIRST
) {
7207 coll
->caseFirst
= UCOL_UPPER_FIRST
;
7208 coll
->caseFirstisDefault
= FALSE
;
7209 } else if (value
== UCOL_OFF
) {
7210 coll
->caseFirst
= UCOL_OFF
;
7211 coll
->caseFirstisDefault
= FALSE
;
7212 } else if (value
== UCOL_DEFAULT
) {
7213 coll
->caseFirst
= (UColAttributeValue
)coll
->options
->caseFirst
;
7214 coll
->caseFirstisDefault
= TRUE
;
7216 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7219 case UCOL_CASE_LEVEL
: /* do we have an extra case level */
7220 if(value
== UCOL_ON
) {
7221 coll
->caseLevel
= UCOL_ON
;
7222 coll
->caseLevelisDefault
= FALSE
;
7223 } else if (value
== UCOL_OFF
) {
7224 coll
->caseLevel
= UCOL_OFF
;
7225 coll
->caseLevelisDefault
= FALSE
;
7226 } else if (value
== UCOL_DEFAULT
) {
7227 coll
->caseLevel
= (UColAttributeValue
)coll
->options
->caseLevel
;
7228 coll
->caseLevelisDefault
= TRUE
;
7230 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7233 case UCOL_NORMALIZATION_MODE
: /* attribute for normalization */
7234 if(value
== UCOL_ON
) {
7235 coll
->normalizationMode
= UCOL_ON
;
7236 coll
->normalizationModeisDefault
= FALSE
;
7237 } else if (value
== UCOL_OFF
) {
7238 coll
->normalizationMode
= UCOL_OFF
;
7239 coll
->normalizationModeisDefault
= FALSE
;
7240 } else if (value
== UCOL_DEFAULT
) {
7241 coll
->normalizationModeisDefault
= TRUE
;
7242 coll
->normalizationMode
= (UColAttributeValue
)coll
->options
->normalizationMode
;
7244 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7247 case UCOL_STRENGTH
: /* attribute for strength */
7248 if (value
== UCOL_DEFAULT
) {
7249 coll
->strengthisDefault
= TRUE
;
7250 coll
->strength
= (UColAttributeValue
)coll
->options
->strength
;
7251 } else if (value
<= UCOL_IDENTICAL
) {
7252 coll
->strengthisDefault
= FALSE
;
7253 coll
->strength
= value
;
7255 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7258 case UCOL_ATTRIBUTE_COUNT
:
7260 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7263 if(oldFrench
!= coll
->frenchCollation
|| oldCaseFirst
!= coll
->caseFirst
) {
7264 coll
->latinOneRegenTable
= TRUE
;
7266 coll
->latinOneRegenTable
= FALSE
;
7268 ucol_updateInternalState(coll
, status
);
7271 U_CAPI UColAttributeValue U_EXPORT2
7272 ucol_getAttribute(const UCollator
*coll
, UColAttribute attr
, UErrorCode
*status
) {
7273 if(U_FAILURE(*status
) || coll
== NULL
) {
7274 return UCOL_DEFAULT
;
7277 case UCOL_NUMERIC_COLLATION
:
7278 return coll
->numericCollation
;
7279 case UCOL_HIRAGANA_QUATERNARY_MODE
:
7280 return coll
->hiraganaQ
;
7281 case UCOL_FRENCH_COLLATION
: /* attribute for direction of secondary weights*/
7282 return coll
->frenchCollation
;
7283 case UCOL_ALTERNATE_HANDLING
: /* attribute for handling variable elements*/
7284 return coll
->alternateHandling
;
7285 case UCOL_CASE_FIRST
: /* who goes first, lower case or uppercase */
7286 return coll
->caseFirst
;
7287 case UCOL_CASE_LEVEL
: /* do we have an extra case level */
7288 return coll
->caseLevel
;
7289 case UCOL_NORMALIZATION_MODE
: /* attribute for normalization */
7290 return coll
->normalizationMode
;
7291 case UCOL_STRENGTH
: /* attribute for strength */
7292 return coll
->strength
;
7293 case UCOL_ATTRIBUTE_COUNT
:
7295 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7298 return UCOL_DEFAULT
;
7301 U_CAPI
void U_EXPORT2
7302 ucol_setStrength( UCollator
*coll
,
7303 UCollationStrength strength
)
7305 UErrorCode status
= U_ZERO_ERROR
;
7306 ucol_setAttribute(coll
, UCOL_STRENGTH
, strength
, &status
);
7309 U_CAPI UCollationStrength U_EXPORT2
7310 ucol_getStrength(const UCollator
*coll
)
7312 UErrorCode status
= U_ZERO_ERROR
;
7313 return ucol_getAttribute(coll
, UCOL_STRENGTH
, &status
);
7316 /****************************************************************************/
7317 /* Following are misc functions */
7318 /* there are new APIs and some compatibility APIs */
7319 /****************************************************************************/
7321 U_CAPI
void U_EXPORT2
7322 ucol_getVersion(const UCollator
* coll
,
7323 UVersionInfo versionInfo
)
7325 /* RunTime version */
7326 uint8_t rtVersion
= UCOL_RUNTIME_VERSION
;
7327 /* Builder version*/
7328 uint8_t bdVersion
= coll
->image
->version
[0];
7330 /* Charset Version. Need to get the version from cnv files
7331 * makeconv should populate cnv files with version and
7332 * an api has to be provided in ucnv.h to obtain this version
7334 uint8_t csVersion
= 0;
7336 /* combine the version info */
7337 uint16_t cmbVersion
= (uint16_t)((rtVersion
<<11) | (bdVersion
<<6) | (csVersion
));
7339 /* Tailoring rules */
7340 versionInfo
[0] = (uint8_t)(cmbVersion
>>8);
7341 versionInfo
[1] = (uint8_t)cmbVersion
;
7342 versionInfo
[2] = coll
->image
->version
[1];
7344 versionInfo
[3] = coll
->UCA
->image
->UCAVersion
[0];
7351 /* This internal API checks whether a character is tailored or not */
7352 U_CAPI UBool U_EXPORT2
7353 ucol_isTailored(const UCollator
*coll
, const UChar u
, UErrorCode
*status
) {
7354 if(U_FAILURE(*status
) || coll
== NULL
|| coll
== coll
->UCA
) {
7358 uint32_t CE
= UCOL_NOT_FOUND
;
7359 const UChar
*ContractionStart
= NULL
;
7360 if(u
< 0x100) { /* latin-1 */
7361 CE
= coll
->latinOneMapping
[u
];
7362 if(coll
->UCA
&& CE
== coll
->UCA
->latinOneMapping
[u
]) {
7365 } else { /* regular */
7366 CE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, u
);
7369 if(isContraction(CE
)) {
7370 ContractionStart
= (UChar
*)coll
->image
+getContractOffset(CE
);
7371 CE
= *(coll
->contractionCEs
+ (ContractionStart
- coll
->contractionIndex
));
7374 return (UBool
)(CE
!= UCOL_NOT_FOUND
);
7378 /****************************************************************************/
7379 /* Following are the string compare functions */
7381 /****************************************************************************/
7384 /* ucol_checkIdent internal function. Does byte level string compare. */
7385 /* Used by strcoll if strength == identical and strings */
7386 /* are otherwise equal. Moved out-of-line because this */
7387 /* is a rare case. */
7389 /* Comparison must be done on NFD normalized strings. */
7390 /* FCD is not good enough. */
7392 /* TODO: make an incremental NFD Comparison function, which could */
7393 /* be of general use */
7396 UCollationResult
ucol_checkIdent(collIterate
*sColl
, collIterate
*tColl
, UBool normalize
, UErrorCode
*status
)
7399 // TODO: When we have an UChar iterator, we need to access the whole string. One
7400 // useful modification would be a UChar iterator extract API, since reset next next...
7402 // TODO: Handle long strings. Do the same in compareUsingSortKeys.
7404 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
7405 // of same type, but that doesn't really mean that it will stay that way.
7407 // The division for the array length may truncate the array size to
7408 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7409 // for all platforms anyway.
7410 UAlignedMemory stackNormIter1
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
7411 UAlignedMemory stackNormIter2
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
7412 //UChar sStackBuf[256], tStackBuf[256];
7413 //int32_t sBufSize = 256, tBufSize = 256;
7419 UBool freeSBuf
= FALSE
, freeTBuf
= FALSE
;
7421 if (sColl
->flags
& UCOL_USE_ITERATOR
) {
7422 UNormIterator
*sNIt
= NULL
, *tNIt
= NULL
;
7423 sNIt
= unorm_openIter(stackNormIter1
, sizeof(stackNormIter1
), status
);
7424 tNIt
= unorm_openIter(stackNormIter2
, sizeof(stackNormIter2
), status
);
7425 sColl
->iterator
->move(sColl
->iterator
, 0, UITER_START
);
7426 tColl
->iterator
->move(tColl
->iterator
, 0, UITER_START
);
7427 UCharIterator
*sIt
= unorm_setIter(sNIt
, sColl
->iterator
, UNORM_NFD
, status
);
7428 UCharIterator
*tIt
= unorm_setIter(tNIt
, tColl
->iterator
, UNORM_NFD
, status
);
7429 comparison
= u_strCompareIter(sIt
, tIt
, TRUE
);
7430 unorm_closeIter(sNIt
);
7431 unorm_closeIter(tNIt
);
7433 sLen
= (sColl
->flags
& UCOL_ITER_HASLEN
) ? sColl
->endp
- sColl
->string
: -1;
7434 sBuf
= sColl
->string
;
7435 tLen
= (tColl
->flags
& UCOL_ITER_HASLEN
) ? tColl
->endp
- tColl
->string
: -1;
7436 tBuf
= tColl
->string
;
7439 *status
= U_ZERO_ERROR
;
7440 if (unorm_quickCheck(sBuf
, sLen
, UNORM_NFD
, status
) != UNORM_YES
) {
7441 sLen
= unorm_decompose(sColl
->writableBuffer
, (int32_t)sColl
->writableBufSize
,
7445 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
7446 if(!u_growBufferFromStatic(sColl
->stackWritableBuffer
,
7447 &sColl
->writableBuffer
,
7448 (int32_t *)&sColl
->writableBufSize
, sLen
,
7452 *status
= U_MEMORY_ALLOCATION_ERROR
;
7453 return UCOL_LESS
; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7455 *status
= U_ZERO_ERROR
;
7456 sLen
= unorm_decompose(sColl
->writableBuffer
, (int32_t)sColl
->writableBufSize
,
7465 sBuf
= sColl
->writableBuffer
;
7466 if (sBuf
!= sColl
->stackWritableBuffer
) {
7467 sColl
->flags
|= UCOL_ITER_ALLOCATED
;
7471 *status
= U_ZERO_ERROR
;
7472 if (unorm_quickCheck(tBuf
, tLen
, UNORM_NFD
, status
) != UNORM_YES
) {
7473 tLen
= unorm_decompose(tColl
->writableBuffer
, (int32_t)tColl
->writableBufSize
,
7477 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
7478 if(!u_growBufferFromStatic(tColl
->stackWritableBuffer
,
7479 &tColl
->writableBuffer
,
7480 (int32_t *)&tColl
->writableBufSize
, tLen
,
7484 *status
= U_MEMORY_ALLOCATION_ERROR
;
7485 return UCOL_LESS
; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7487 *status
= U_ZERO_ERROR
;
7488 tLen
= unorm_decompose(tColl
->writableBuffer
, (int32_t)tColl
->writableBufSize
,
7497 tBuf
= tColl
->writableBuffer
;
7498 if (tBuf
!= tColl
->stackWritableBuffer
) {
7499 tColl
->flags
|= UCOL_ITER_ALLOCATED
;
7504 if (sLen
== -1 && tLen
== -1) {
7505 comparison
= u_strcmpCodePointOrder(sBuf
, tBuf
);
7508 sLen
= u_strlen(sBuf
);
7511 tLen
= u_strlen(tBuf
);
7513 comparison
= u_memcmpCodePointOrder(sBuf
, tBuf
, uprv_min(sLen
, tLen
));
7514 if (comparison
== 0) {
7515 comparison
= sLen
- tLen
;
7520 if (comparison
< 0) {
7522 } else if (comparison
== 0) {
7524 } else /* comparison > 0 */ {
7525 return UCOL_GREATER
;
7529 /* CEBuf - A struct and some inline functions to handle the saving */
7530 /* of CEs in a buffer within ucol_strcoll */
7532 #define UCOL_CEBUF_SIZE 512
7533 typedef struct ucol_CEBuf
{
7537 uint32_t localArray
[UCOL_CEBUF_SIZE
];
7542 inline void UCOL_INIT_CEBUF(ucol_CEBuf
*b
) {
7543 (b
)->buf
= (b
)->pos
= (b
)->localArray
;
7544 (b
)->endp
= (b
)->buf
+ UCOL_CEBUF_SIZE
;
7548 void ucol_CEBuf_Expand(ucol_CEBuf
*b
, collIterate
*ci
, UErrorCode
*status
) {
7553 ci
->flags
|= UCOL_ITER_ALLOCATED
;
7554 oldSize
= b
->pos
- b
->buf
;
7555 newSize
= oldSize
* 2;
7556 newBuf
= (uint32_t *)uprv_malloc(newSize
* sizeof(uint32_t));
7557 if(newBuf
== NULL
) {
7558 *status
= U_MEMORY_ALLOCATION_ERROR
;
7561 uprv_memcpy(newBuf
, b
->buf
, oldSize
* sizeof(uint32_t));
7562 if (b
->buf
!= b
->localArray
) {
7566 b
->endp
= b
->buf
+ newSize
;
7567 b
->pos
= b
->buf
+ oldSize
;
7572 inline void UCOL_CEBUF_PUT(ucol_CEBuf
*b
, uint32_t ce
, collIterate
*ci
, UErrorCode
*status
) {
7573 if (b
->pos
== b
->endp
) {
7574 ucol_CEBuf_Expand(b
, ci
, status
);
7576 if (U_SUCCESS(*status
)) {
7581 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7582 /* It is used when compare gets in trouble and needs to bail out */
7583 static UCollationResult
ucol_compareUsingSortKeys(collIterate
*sColl
,
7587 uint8_t sourceKey
[UCOL_MAX_BUFFER
], targetKey
[UCOL_MAX_BUFFER
];
7588 uint8_t *sourceKeyP
= sourceKey
;
7589 uint8_t *targetKeyP
= targetKey
;
7590 int32_t sourceKeyLen
= UCOL_MAX_BUFFER
, targetKeyLen
= UCOL_MAX_BUFFER
;
7591 const UCollator
*coll
= sColl
->coll
;
7592 UChar
*source
= NULL
;
7593 UChar
*target
= NULL
;
7594 int32_t result
= UCOL_EQUAL
;
7595 UChar sStackBuf
[256], tStackBuf
[256];
7596 int32_t sourceLength
= (sColl
->flags
&UCOL_ITER_HASLEN
)?(sColl
->endp
-sColl
->string
):-1;
7597 int32_t targetLength
= (tColl
->flags
&UCOL_ITER_HASLEN
)?(tColl
->endp
-tColl
->string
):-1;
7599 // TODO: Handle long strings. Do the same in ucol_checkIdent.
7600 if(sColl
->flags
& UCOL_USE_ITERATOR
) {
7601 sColl
->iterator
->move(sColl
->iterator
, 0, UITER_START
);
7602 tColl
->iterator
->move(tColl
->iterator
, 0, UITER_START
);
7604 UChar
*sBufp
= source
;
7606 UChar
*tBufp
= target
;
7607 while(sColl
->iterator
->hasNext(sColl
->iterator
)) {
7608 *sBufp
++ = (UChar
)sColl
->iterator
->next(sColl
->iterator
);
7610 while(tColl
->iterator
->hasNext(tColl
->iterator
)) {
7611 *tBufp
++ = (UChar
)tColl
->iterator
->next(tColl
->iterator
);
7613 sourceLength
= sBufp
- source
;
7614 targetLength
= tBufp
- target
;
7615 } else { // no iterators
7616 sourceLength
= (sColl
->flags
&UCOL_ITER_HASLEN
)?(sColl
->endp
-sColl
->string
):-1;
7617 targetLength
= (tColl
->flags
&UCOL_ITER_HASLEN
)?(tColl
->endp
-tColl
->string
):-1;
7618 source
= sColl
->string
;
7619 target
= tColl
->string
;
7624 sourceKeyLen
= ucol_getSortKey(coll
, source
, sourceLength
, sourceKeyP
, sourceKeyLen
);
7625 if(sourceKeyLen
> UCOL_MAX_BUFFER
) {
7626 sourceKeyP
= (uint8_t*)uprv_malloc(sourceKeyLen
*sizeof(uint8_t));
7627 if(sourceKeyP
== NULL
) {
7628 *status
= U_MEMORY_ALLOCATION_ERROR
;
7629 goto cleanup_and_do_compare
;
7631 sourceKeyLen
= ucol_getSortKey(coll
, source
, sourceLength
, sourceKeyP
, sourceKeyLen
);
7634 targetKeyLen
= ucol_getSortKey(coll
, target
, targetLength
, targetKeyP
, targetKeyLen
);
7635 if(targetKeyLen
> UCOL_MAX_BUFFER
) {
7636 targetKeyP
= (uint8_t*)uprv_malloc(targetKeyLen
*sizeof(uint8_t));
7637 if(targetKeyP
== NULL
) {
7638 *status
= U_MEMORY_ALLOCATION_ERROR
;
7639 goto cleanup_and_do_compare
;
7641 targetKeyLen
= ucol_getSortKey(coll
, target
, targetLength
, targetKeyP
, targetKeyLen
);
7644 result
= uprv_strcmp((const char*)sourceKeyP
, (const char*)targetKeyP
);
7646 cleanup_and_do_compare
:
7647 if(sourceKeyP
!= NULL
&& sourceKeyP
!= sourceKey
) {
7648 uprv_free(sourceKeyP
);
7651 if(targetKeyP
!= NULL
&& targetKeyP
!= targetKey
) {
7652 uprv_free(targetKeyP
);
7657 } else if(result
>0) {
7658 return UCOL_GREATER
;
7665 static inline UCollationResult
7666 ucol_strcollRegular( collIterate
*sColl
, collIterate
*tColl
,
7667 // const UCollator *coll,
7668 // const UChar *source,
7669 // int32_t sourceLength,
7670 // const UChar *target,
7671 // int32_t targetLength,
7676 const UCollator
*coll
= sColl
->coll
;
7679 // setting up the collator parameters
7680 UColAttributeValue strength
= coll
->strength
;
7681 UBool initialCheckSecTer
= (strength
>= UCOL_SECONDARY
);
7683 UBool checkSecTer
= initialCheckSecTer
;
7684 UBool checkTertiary
= (strength
>= UCOL_TERTIARY
);
7685 UBool checkQuad
= (strength
>= UCOL_QUATERNARY
);
7686 UBool checkIdent
= (strength
== UCOL_IDENTICAL
);
7687 UBool checkCase
= (coll
->caseLevel
== UCOL_ON
);
7688 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && checkSecTer
;
7689 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
7690 UBool qShifted
= shifted
&& checkQuad
;
7691 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && checkQuad
;
7693 if(doHiragana
&& shifted
) {
7694 return (ucol_compareUsingSortKeys(sColl
, tColl
, status
));
7696 uint8_t caseSwitch
= coll
->caseSwitch
;
7697 uint8_t tertiaryMask
= coll
->tertiaryMask
;
7699 // This is the lowest primary value that will not be ignored if shifted
7700 uint32_t LVT
= (shifted
)?(coll
->variableTopValue
<<16):0;
7702 UCollationResult result
= UCOL_EQUAL
;
7703 UCollationResult hirResult
= UCOL_EQUAL
;
7705 // Preparing the CE buffers. They will be filled during the primary phase
7708 UCOL_INIT_CEBUF(&sCEs
);
7709 UCOL_INIT_CEBUF(&tCEs
);
7711 uint32_t secS
= 0, secT
= 0;
7712 uint32_t sOrder
=0, tOrder
=0;
7714 // Non shifted primary processing is quite simple
7718 // We fetch CEs until we hit a non ignorable primary or end.
7720 // We get the next CE
7721 sOrder
= ucol_IGetNextCE(coll
, sColl
, status
);
7722 // Stuff it in the buffer
7723 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7724 // And keep just the primary part.
7725 sOrder
&= UCOL_PRIMARYMASK
;
7726 } while(sOrder
== 0);
7728 // see the comments on the above block
7730 tOrder
= ucol_IGetNextCE(coll
, tColl
, status
);
7731 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7732 tOrder
&= UCOL_PRIMARYMASK
;
7733 } while(tOrder
== 0);
7735 // if both primaries are the same
7736 if(sOrder
== tOrder
) {
7737 // and there are no more CEs, we advance to the next level
7738 if(sOrder
== UCOL_NO_MORE_CES_PRIMARY
) {
7741 if(doHiragana
&& hirResult
== UCOL_EQUAL
) {
7742 if((sColl
->flags
& UCOL_WAS_HIRAGANA
) != (tColl
->flags
& UCOL_WAS_HIRAGANA
)) {
7743 hirResult
= ((sColl
->flags
& UCOL_WAS_HIRAGANA
) > (tColl
->flags
& UCOL_WAS_HIRAGANA
))
7744 ? UCOL_LESS
:UCOL_GREATER
;
7748 // if two primaries are different, we are done
7749 result
= (sOrder
< tOrder
) ? UCOL_LESS
: UCOL_GREATER
;
7752 } // no primary difference... do the rest from the buffers
7753 } else { // shifted - do a slightly more complicated processing :)
7755 UBool sInShifted
= FALSE
;
7756 UBool tInShifted
= FALSE
;
7757 // This version of code can be refactored. However, it seems easier to understand this way.
7758 // Source loop. Same as the target loop.
7760 sOrder
= ucol_IGetNextCE(coll
, sColl
, status
);
7761 if(sOrder
== UCOL_NO_MORE_CES
) {
7762 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7764 } else if(sOrder
== 0 || (sInShifted
&& (sOrder
& UCOL_PRIMARYMASK
) == 0)) {
7765 /* UCA amendment - ignore ignorables that follow shifted code points */
7767 } else if(isContinuation(sOrder
)) {
7768 if((sOrder
& UCOL_PRIMARYMASK
) > 0) { /* There is primary value */
7770 sOrder
= (sOrder
& UCOL_PRIMARYMASK
) | 0xC0; /* preserve interesting continuation */
7771 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7774 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7777 } else { /* Just lower level values */
7781 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7785 } else { /* regular */
7786 if((sOrder
& UCOL_PRIMARYMASK
) > LVT
) {
7787 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7790 if((sOrder
& UCOL_PRIMARYMASK
) > 0) {
7792 sOrder
&= UCOL_PRIMARYMASK
;
7793 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7796 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7803 sOrder
&= UCOL_PRIMARYMASK
;
7807 tOrder
= ucol_IGetNextCE(coll
, tColl
, status
);
7808 if(tOrder
== UCOL_NO_MORE_CES
) {
7809 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7811 } else if(tOrder
== 0 || (tInShifted
&& (tOrder
& UCOL_PRIMARYMASK
) == 0)) {
7812 /* UCA amendment - ignore ignorables that follow shifted code points */
7814 } else if(isContinuation(tOrder
)) {
7815 if((tOrder
& UCOL_PRIMARYMASK
) > 0) { /* There is primary value */
7817 tOrder
= (tOrder
& UCOL_PRIMARYMASK
) | 0xC0; /* preserve interesting continuation */
7818 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7821 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7824 } else { /* Just lower level values */
7828 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7832 } else { /* regular */
7833 if((tOrder
& UCOL_PRIMARYMASK
) > LVT
) {
7834 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7837 if((tOrder
& UCOL_PRIMARYMASK
) > 0) {
7839 tOrder
&= UCOL_PRIMARYMASK
;
7840 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7843 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7850 tOrder
&= UCOL_PRIMARYMASK
;
7853 if(sOrder
== tOrder
) {
7855 if(doHiragana && hirResult == UCOL_EQUAL) {
7856 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7857 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7858 ? UCOL_LESS:UCOL_GREATER;
7862 if(sOrder
== UCOL_NO_MORE_CES_PRIMARY
) {
7870 result
= (sOrder
< tOrder
) ? UCOL_LESS
: UCOL_GREATER
;
7873 } /* no primary difference... do the rest from the buffers */
7876 /* now, we're gonna reexamine collected CEs */
7880 /* This is the secondary level of comparison */
7882 if(!isFrenchSec
) { /* normal */
7887 secS
= *(sCE
++) & UCOL_SECONDARYMASK
;
7891 secT
= *(tCE
++) & UCOL_SECONDARYMASK
;
7895 if(secS
== UCOL_NO_MORE_CES_SECONDARY
) {
7902 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7906 } else { /* do the French */
7907 uint32_t *sCESave
= NULL
;
7908 uint32_t *tCESave
= NULL
;
7909 sCE
= sCEs
.pos
-2; /* this could also be sCEs-- if needs to be optimized */
7912 while (secS
== 0 && sCE
>= sCEs
.buf
) {
7915 if(isContinuation(secS
)) {
7916 while(isContinuation(secS
= *(sCE
--)))
7918 /* after this, secS has the start of continuation, and sCEs points before that */
7919 sCESave
= sCE
; /* we save it, so that we know where to come back AND that we need to go forward */
7920 sCE
+=2; /* need to point to the first continuation CP */
7921 /* However, now you can just continue doing stuff */
7925 if(!isContinuation(secS
)) { /* This means we have finished with this cont */
7926 sCE
= sCESave
; /* reset the pointer to before continuation */
7931 secS
&= UCOL_SECONDARYMASK
; /* remove the continuation bit */
7934 while(secT
== 0 && tCE
>= tCEs
.buf
) {
7937 if(isContinuation(secT
)) {
7938 while(isContinuation(secT
= *(tCE
--)))
7940 /* after this, secT has the start of continuation, and tCE points before that */
7941 tCESave
= tCE
; /* we save it, so that we know where to come back AND that we need to go forward */
7942 tCE
+=2; /* need to point to the first continuation CP */
7943 /* However, now you can just continue doing stuff */
7947 if(!isContinuation(secT
)) { /* This means we have finished with this cont */
7948 tCE
= tCESave
; /* reset the pointer to before continuation */
7953 secT
&= UCOL_SECONDARYMASK
; /* remove the continuation bit */
7957 if(secS
== UCOL_NO_MORE_CES_SECONDARY
|| (sCE
< sCEs
.buf
&& tCE
< tCEs
.buf
)) {
7964 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7971 /* doing the case bit */
7976 while((secS
& UCOL_REMOVE_CASE
) == 0) {
7977 if(!isContinuation(*sCE
++)) {
7979 if(((secS
& UCOL_PRIMARYMASK
) != 0) || strength
> UCOL_PRIMARY
) {
7980 // primary ignorables should not be considered on the case level when the strength is primary
7981 // otherwise, the CEs stop being well-formed
7982 secS
&= UCOL_TERT_CASE_MASK
;
7992 while((secT
& UCOL_REMOVE_CASE
) == 0) {
7993 if(!isContinuation(*tCE
++)) {
7995 if(((secT
& UCOL_PRIMARYMASK
) != 0) || strength
> UCOL_PRIMARY
) {
7996 // primary ignorables should not be considered on the case level when the strength is primary
7997 // otherwise, the CEs stop being well-formed
7998 secT
&= UCOL_TERT_CASE_MASK
;
8008 if((secS
& UCOL_CASE_BIT_MASK
) < (secT
& UCOL_CASE_BIT_MASK
)) {
8011 } else if((secS
& UCOL_CASE_BIT_MASK
) > (secT
& UCOL_CASE_BIT_MASK
)) {
8012 result
= UCOL_GREATER
;
8016 if((secS
& UCOL_REMOVE_CASE
) == UCOL_NO_MORE_CES_TERTIARY
|| (secT
& UCOL_REMOVE_CASE
) == UCOL_NO_MORE_CES_TERTIARY
) {
8025 /* Tertiary level */
8032 while((secS
& UCOL_REMOVE_CASE
) == 0) {
8033 secS
= *(sCE
++) & tertiaryMask
;
8034 if(!isContinuation(secS
)) {
8037 secS
&= UCOL_REMOVE_CASE
;
8041 while((secT
& UCOL_REMOVE_CASE
) == 0) {
8042 secT
= *(tCE
++) & tertiaryMask
;
8043 if(!isContinuation(secT
)) {
8046 secT
&= UCOL_REMOVE_CASE
;
8051 if((secS
& UCOL_REMOVE_CASE
) == 1) {
8058 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
8065 if(qShifted
/*checkQuad*/) {
8066 UBool sInShifted
= TRUE
;
8067 UBool tInShifted
= TRUE
;
8073 while(secS
== 0 && secS
!= UCOL_NO_MORE_CES
|| (isContinuation(secS
) && !sInShifted
)) {
8075 if(isContinuation(secS
)) {
8079 } else if(secS
> LVT
|| (secS
& UCOL_PRIMARYMASK
) == 0) { /* non continuation */
8080 secS
= UCOL_PRIMARYMASK
;
8086 secS
&= UCOL_PRIMARYMASK
;
8089 while(secT
== 0 && secT
!= UCOL_NO_MORE_CES
|| (isContinuation(secT
) && !tInShifted
)) {
8091 if(isContinuation(secT
)) {
8095 } else if(secT
> LVT
|| (secT
& UCOL_PRIMARYMASK
) == 0) {
8096 secT
= UCOL_PRIMARYMASK
;
8102 secT
&= UCOL_PRIMARYMASK
;
8105 if(secS
== UCOL_NO_MORE_CES_PRIMARY
) {
8112 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
8116 } else if(doHiragana
&& hirResult
!= UCOL_EQUAL
) {
8117 // If we're fine on quaternaries, we might be different
8118 // on Hiragana. This, however, might fail us in shifted.
8123 /* For IDENTICAL comparisons, we use a bitwise character comparison */
8124 /* as a tiebreaker if all else is equal. */
8125 /* Getting here should be quite rare - strings are not identical - */
8126 /* that is checked first, but compared == through all other checks. */
8129 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
8130 result
= ucol_checkIdent(sColl
, tColl
, TRUE
, status
);
8134 if ((sColl
->flags
| tColl
->flags
) & UCOL_ITER_ALLOCATED
) {
8135 freeHeapWritableBuffer(sColl
);
8136 freeHeapWritableBuffer(tColl
);
8138 if (sCEs
.buf
!= sCEs
.localArray
) {
8139 uprv_free(sCEs
.buf
);
8141 if (tCEs
.buf
!= tCEs
.localArray
) {
8142 uprv_free(tCEs
.buf
);
8150 static inline uint32_t
8151 ucol_getLatinOneContraction(const UCollator
*coll
, int32_t strength
,
8152 uint32_t CE
, const UChar
*s
, int32_t *index
, int32_t len
)
8154 const UChar
*UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
&0xFFF);
8155 int32_t latinOneOffset
= (CE
& 0x00FFF000) >> 12;
8157 UChar schar
= 0, tchar
= 0;
8161 if(s
[*index
] == 0) { // end of string
8162 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
8168 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
8174 while(schar
> (tchar
= *(UCharOffset
+offset
))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
8178 if (schar
== tchar
) {
8180 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
+offset
]);
8184 if(schar
& 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8185 return UCOL_BAIL_OUT_CE
;
8187 // skip completely ignorables
8188 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, schar
);
8189 if(isZeroCE
== 0) { // we have to ignore completely ignorables
8194 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
8201 * This is a fast strcoll, geared towards text in Latin-1.
8202 * It supports contractions of size two, French secondaries
8203 * and case switching. You can use it with strengths primary
8204 * to tertiary. It does not support shifted and case level.
8205 * It relies on the table build by setupLatin1Table. If it
8206 * doesn't understand something, it will go to the regular
8209 static inline UCollationResult
8210 ucol_strcollUseLatin1( const UCollator
*coll
,
8211 const UChar
*source
,
8213 const UChar
*target
,
8218 int32_t strength
= coll
->strength
;
8220 int32_t sIndex
= 0, tIndex
= 0;
8221 UChar sChar
= 0, tChar
= 0;
8222 uint32_t sOrder
=0, tOrder
=0;
8224 UBool endOfSource
= FALSE
;
8226 uint32_t *elements
= coll
->latinOneCEs
;
8228 UBool haveContractions
= FALSE
; // if we have contractions in our string
8229 // we cannot do French secondary
8231 // Do the primary level
8233 while(sOrder
==0) { // this loop skips primary ignorables
8234 // sOrder=getNextlatinOneCE(source);
8235 if(sLen
==-1) { // handling zero terminated strings
8236 sChar
=source
[sIndex
++];
8241 } else { // handling strings with known length
8246 sChar
=source
[sIndex
++];
8248 if(sChar
&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8249 //fprintf(stderr, "R");
8251 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8253 sOrder
= elements
[sChar
];
8254 if(sOrder
>= UCOL_NOT_FOUND
) { // if we got a special
8255 // specials can basically be either contractions or bail-out signs. If we get anything
8256 // else, we'll bail out anyway
8257 if(getCETag(sOrder
) == CONTRACTION_TAG
) {
8258 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_PRIMARY
, sOrder
, source
, &sIndex
, sLen
);
8259 haveContractions
= TRUE
; // if there are contractions, we cannot do French secondary
8260 // However, if there are contractions in the table, but we always use just one char,
8261 // we might be able to do French. This should be checked out.
8263 if(sOrder
>= UCOL_NOT_FOUND
/*== UCOL_BAIL_OUT_CE*/) {
8264 //fprintf(stderr, "S");
8266 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8271 while(tOrder
==0) { // this loop skips primary ignorables
8272 // tOrder=getNextlatinOneCE(target);
8273 if(tLen
==-1) { // handling zero terminated strings
8274 tChar
=target
[tIndex
++];
8276 if(endOfSource
) { // this is different than source loop,
8277 // as we already know that source loop is done here,
8278 // so we can either finish the primary loop if both
8279 // strings are done or announce the result if only
8280 // target is done. Same below.
8283 return UCOL_GREATER
;
8286 } else { // handling strings with known length
8291 return UCOL_GREATER
;
8294 tChar
=target
[tIndex
++];
8296 if(tChar
&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8297 //fprintf(stderr, "R");
8299 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8301 tOrder
= elements
[tChar
];
8302 if(tOrder
>= UCOL_NOT_FOUND
) {
8303 // Handling specials, see the comments for source
8304 if(getCETag(tOrder
) == CONTRACTION_TAG
) {
8305 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_PRIMARY
, tOrder
, target
, &tIndex
, tLen
);
8306 haveContractions
= TRUE
;
8308 if(tOrder
>= UCOL_NOT_FOUND
/*== UCOL_BAIL_OUT_CE*/) {
8309 //fprintf(stderr, "S");
8311 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8315 if(endOfSource
) { // source is finished, but target is not, say the result.
8319 if(sOrder
== tOrder
) { // if we have same CEs, we continue the loop
8320 sOrder
= 0; tOrder
= 0;
8323 // compare current top bytes
8324 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
8325 // top bytes differ, return difference
8326 if(sOrder
< tOrder
) {
8328 } else if(sOrder
> tOrder
) {
8329 return UCOL_GREATER
;
8331 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8332 // since we must return enum value
8335 // top bytes match, continue with following bytes
8342 // after primary loop, we definitely know the sizes of strings,
8343 // so we set it and use simpler loop for secondaries and tertiaries
8344 sLen
= sIndex
; tLen
= tIndex
;
8345 if(strength
>= UCOL_SECONDARY
) {
8346 // adjust the table beginning
8347 elements
+= coll
->latinOneTableLen
;
8348 endOfSource
= FALSE
;
8350 if(coll
->frenchCollation
== UCOL_OFF
) { // non French
8351 // This loop is a simplified copy of primary loop
8352 // at this point we know that whole strings are latin-1, so we don't
8353 // check for that. We also know that we only have contractions as
8355 sIndex
= 0; tIndex
= 0;
8362 sChar
=source
[sIndex
++];
8363 sOrder
= elements
[sChar
];
8364 if(sOrder
> UCOL_NOT_FOUND
) {
8365 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_SECONDARY
, sOrder
, source
, &sIndex
, sLen
);
8374 return UCOL_GREATER
;
8377 tChar
=target
[tIndex
++];
8378 tOrder
= elements
[tChar
];
8379 if(tOrder
> UCOL_NOT_FOUND
) {
8380 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_SECONDARY
, tOrder
, target
, &tIndex
, tLen
);
8387 if(sOrder
== tOrder
) {
8388 sOrder
= 0; tOrder
= 0;
8391 // see primary loop for comments on this
8392 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
8393 if(sOrder
< tOrder
) {
8395 } else if(sOrder
> tOrder
) {
8396 return UCOL_GREATER
;
8404 if(haveContractions
) { // if we have contractions, we have to bail out
8405 // since we don't really know how to handle them here
8407 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8409 // For French, we go backwards
8410 sIndex
= sLen
; tIndex
= tLen
;
8417 sChar
=source
[--sIndex
];
8418 sOrder
= elements
[sChar
];
8419 // don't even look for contractions
8427 return UCOL_GREATER
;
8430 tChar
=target
[--tIndex
];
8431 tOrder
= elements
[tChar
];
8432 // don't even look for contractions
8438 if(sOrder
== tOrder
) {
8439 sOrder
= 0; tOrder
= 0;
8442 // see the primary loop for comments
8443 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
8444 if(sOrder
< tOrder
) {
8446 } else if(sOrder
> tOrder
) {
8447 return UCOL_GREATER
;
8458 if(strength
>= UCOL_TERTIARY
) {
8459 // tertiary loop is the same as secondary (except no French)
8460 elements
+= coll
->latinOneTableLen
;
8461 sIndex
= 0; tIndex
= 0;
8462 endOfSource
= FALSE
;
8469 sChar
=source
[sIndex
++];
8470 sOrder
= elements
[sChar
];
8471 if(sOrder
> UCOL_NOT_FOUND
) {
8472 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_TERTIARY
, sOrder
, source
, &sIndex
, sLen
);
8478 return UCOL_EQUAL
; // if both strings are at the end, they are equal
8480 return UCOL_GREATER
;
8483 tChar
=target
[tIndex
++];
8484 tOrder
= elements
[tChar
];
8485 if(tOrder
> UCOL_NOT_FOUND
) {
8486 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_TERTIARY
, tOrder
, target
, &tIndex
, tLen
);
8492 if(sOrder
== tOrder
) {
8493 sOrder
= 0; tOrder
= 0;
8496 if(((sOrder
^tOrder
)&0xff000000)!=0) {
8497 if(sOrder
< tOrder
) {
8499 } else if(sOrder
> tOrder
) {
8500 return UCOL_GREATER
;
8511 // Preparing the context objects for iterating over strings
8512 collIterate sColl
, tColl
;
8514 IInit_collIterate(coll
, source
, sLen
, &sColl
);
8515 IInit_collIterate(coll
, target
, tLen
, &tColl
);
8516 return ucol_strcollRegular(&sColl
, &tColl
, status
);
8520 U_CAPI UCollationResult U_EXPORT2
8521 ucol_strcollIter( const UCollator
*coll
,
8522 UCharIterator
*sIter
,
8523 UCharIterator
*tIter
,
8526 if(!status
|| U_FAILURE(*status
)) {
8530 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER
);
8531 UTRACE_DATA3(UTRACE_VERBOSE
, "coll=%p, sIter=%p, tIter=%p", coll
, sIter
, tIter
);
8533 if (sIter
== tIter
) {
8534 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL
, *status
)
8537 if(sIter
== NULL
|| tIter
== NULL
|| coll
== NULL
) {
8538 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
8539 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL
, *status
)
8543 UCollationResult result
= UCOL_EQUAL
;
8545 // Preparing the context objects for iterating over strings
8546 collIterate sColl
, tColl
;
8547 // The division for the array length may truncate the array size to
8548 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8549 // for all platforms anyway.
8550 UAlignedMemory stackNormIter1
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
8551 UAlignedMemory stackNormIter2
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
8552 UNormIterator
*sNormIter
= NULL
, *tNormIter
= NULL
;
8554 IInit_collIterate(coll
, NULL
, -1, &sColl
);
8555 sColl
.iterator
= sIter
;
8556 sColl
.flags
|= UCOL_USE_ITERATOR
;
8557 IInit_collIterate(coll
, NULL
, -1, &tColl
);
8558 tColl
.flags
|= UCOL_USE_ITERATOR
;
8559 tColl
.iterator
= tIter
;
8561 if(ucol_getAttribute(coll
, UCOL_NORMALIZATION_MODE
, status
) == UCOL_ON
) {
8562 sNormIter
= unorm_openIter(stackNormIter1
, sizeof(stackNormIter1
), status
);
8563 sColl
.iterator
= unorm_setIter(sNormIter
, sIter
, UNORM_FCD
, status
);
8564 sColl
.flags
&= ~UCOL_ITER_NORM
;
8566 tNormIter
= unorm_openIter(stackNormIter2
, sizeof(stackNormIter2
), status
);
8567 tColl
.iterator
= unorm_setIter(tNormIter
, tIter
, UNORM_FCD
, status
);
8568 tColl
.flags
&= ~UCOL_ITER_NORM
;
8571 UChar32 sChar
= U_SENTINEL
, tChar
= U_SENTINEL
;
8573 while((sChar
= sColl
.iterator
->next(sColl
.iterator
)) ==
8574 (tChar
= tColl
.iterator
->next(tColl
.iterator
))) {
8575 if(sChar
== U_SENTINEL
) {
8576 result
= UCOL_EQUAL
;
8581 if(sChar
== U_SENTINEL
) {
8582 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8585 if(tChar
== U_SENTINEL
) {
8586 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8589 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8590 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8592 if (ucol_unsafeCP((UChar
)sChar
, coll
) || ucol_unsafeCP((UChar
)tChar
, coll
))
8594 // We are stopped in the middle of a contraction.
8595 // Scan backwards through the == part of the string looking for the start of the contraction.
8596 // It doesn't matter which string we scan, since they are the same in this region.
8599 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8600 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8602 while (sChar
!= U_SENTINEL
&& ucol_unsafeCP((UChar
)sChar
, coll
));
8606 if(U_SUCCESS(*status
)) {
8607 result
= ucol_strcollRegular(&sColl
, &tColl
, status
);
8611 if(sNormIter
|| tNormIter
) {
8612 unorm_closeIter(sNormIter
);
8613 unorm_closeIter(tNormIter
);
8616 UTRACE_EXIT_VALUE_STATUS(result
, *status
)
8622 /* ucol_strcoll Main public API string comparison function */
8624 U_CAPI UCollationResult U_EXPORT2
8625 ucol_strcoll( const UCollator
*coll
,
8626 const UChar
*source
,
8627 int32_t sourceLength
,
8628 const UChar
*target
,
8629 int32_t targetLength
)
8633 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL
);
8634 if (UTRACE_LEVEL(UTRACE_VERBOSE
)) {
8635 UTRACE_DATA3(UTRACE_VERBOSE
, "coll=%p, source=%p, target=%p", coll
, source
, target
);
8636 UTRACE_DATA2(UTRACE_VERBOSE
, "source string = %vh ", source
, sourceLength
);
8637 UTRACE_DATA2(UTRACE_VERBOSE
, "target string = %vh ", target
, targetLength
);
8640 if(source
== NULL
|| target
== NULL
) {
8641 // do not crash, but return. Should have
8642 // status argument to return error.
8643 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8647 /* Quick check if source and target are same strings. */
8648 /* They should either both be NULL terminated or the explicit length should be set on both. */
8649 if (source
==target
&& sourceLength
==targetLength
) {
8650 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8654 /* Scan the strings. Find: */
8655 /* The length of any leading portion that is equal */
8656 /* Whether they are exactly equal. (in which case we just return) */
8657 const UChar
*pSrc
= source
;
8658 const UChar
*pTarg
= target
;
8659 int32_t equalLength
;
8661 if (sourceLength
== -1 && targetLength
== -1) {
8662 // Both strings are null terminated.
8663 // Scan through any leading equal portion.
8664 while (*pSrc
== *pTarg
&& *pSrc
!= 0) {
8668 if (*pSrc
== 0 && *pTarg
== 0) {
8669 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8672 equalLength
= pSrc
- source
;
8676 // One or both strings has an explicit length.
8677 const UChar
*pSrcEnd
= source
+ sourceLength
;
8678 const UChar
*pTargEnd
= target
+ targetLength
;
8680 // Scan while the strings are bitwise ==, or until one is exhausted.
8682 if (pSrc
== pSrcEnd
|| pTarg
== pTargEnd
) {
8685 if ((*pSrc
== 0 && sourceLength
== -1) || (*pTarg
== 0 && targetLength
== -1)) {
8688 if (*pSrc
!= *pTarg
) {
8694 equalLength
= pSrc
- source
;
8696 // If we made it all the way through both strings, we are done. They are ==
8697 if ((pSrc
==pSrcEnd
|| (pSrcEnd
<pSrc
&& *pSrc
==0)) && /* At end of src string, however it was specified. */
8698 (pTarg
==pTargEnd
|| (pTargEnd
<pTarg
&& *pTarg
==0))) /* and also at end of dest string */
8700 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8704 if (equalLength
> 0) {
8705 /* There is an identical portion at the beginning of the two strings. */
8706 /* If the identical portion ends within a contraction or a comibining */
8707 /* character sequence, back up to the start of that sequence. */
8709 // These values should already be set by the code above.
8710 //pSrc = source + equalLength; /* point to the first differing chars */
8711 //pTarg = target + equalLength;
8712 if (pSrc
!= source
+sourceLength
&& ucol_unsafeCP(*pSrc
, coll
) ||
8713 pTarg
!= target
+targetLength
&& ucol_unsafeCP(*pTarg
, coll
))
8715 // We are stopped in the middle of a contraction.
8716 // Scan backwards through the == part of the string looking for the start of the contraction.
8717 // It doesn't matter which string we scan, since they are the same in this region.
8723 while (equalLength
>0 && ucol_unsafeCP(*pSrc
, coll
));
8726 source
+= equalLength
;
8727 target
+= equalLength
;
8728 if (sourceLength
> 0) {
8729 sourceLength
-= equalLength
;
8731 if (targetLength
> 0) {
8732 targetLength
-= equalLength
;
8736 UErrorCode status
= U_ZERO_ERROR
;
8737 UCollationResult returnVal
;
8738 if(!coll
->latinOneUse
|| (sourceLength
> 0 && *source
&0xff00) || (targetLength
> 0 && *target
&0xff00)) {
8739 collIterate sColl
, tColl
;
8740 // Preparing the context objects for iterating over strings
8741 IInit_collIterate(coll
, source
, sourceLength
, &sColl
);
8742 IInit_collIterate(coll
, target
, targetLength
, &tColl
);
8743 returnVal
= ucol_strcollRegular(&sColl
, &tColl
, &status
);
8745 returnVal
= ucol_strcollUseLatin1(coll
, source
, sourceLength
, target
, targetLength
, &status
);
8747 UTRACE_EXIT_VALUE(returnVal
);
8751 /* convenience function for comparing strings */
8752 U_CAPI UBool U_EXPORT2
8753 ucol_greater( const UCollator
*coll
,
8754 const UChar
*source
,
8755 int32_t sourceLength
,
8756 const UChar
*target
,
8757 int32_t targetLength
)
8759 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
8763 /* convenience function for comparing strings */
8764 U_CAPI UBool U_EXPORT2
8765 ucol_greaterOrEqual( const UCollator
*coll
,
8766 const UChar
*source
,
8767 int32_t sourceLength
,
8768 const UChar
*target
,
8769 int32_t targetLength
)
8771 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
8775 /* convenience function for comparing strings */
8776 U_CAPI UBool U_EXPORT2
8777 ucol_equal( const UCollator
*coll
,
8778 const UChar
*source
,
8779 int32_t sourceLength
,
8780 const UChar
*target
,
8781 int32_t targetLength
)
8783 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
8787 U_CAPI
void U_EXPORT2
8788 ucol_getUCAVersion(const UCollator
* coll
, UVersionInfo info
) {
8789 if(coll
&& coll
->UCA
) {
8790 uprv_memcpy(info
, coll
->UCA
->image
->UCAVersion
, sizeof(UVersionInfo
));
8794 #endif /* #if !UCONFIG_NO_COLLATION */