2 *******************************************************************************
3 * Copyright (C) 1996-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * tab size: 8 (not used)
11 * Modification history
13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
19 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_COLLATION
24 #include "unicode/coleitr.h"
25 #include "unicode/unorm.h"
26 #include "unicode/udata.h"
27 #include "unicode/ustring.h"
48 /* added by synwee for trie manipulation*/
49 #define STAGE_1_SHIFT_ 10
50 #define STAGE_2_SHIFT_ 4
51 #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
52 #define STAGE_3_MASK_ 0xF
53 #define LAST_BYTE_MASK_ 0xFF
54 #define SECOND_LAST_BYTE_SHIFT_ 8
56 #define ZERO_CC_LIMIT_ 0xC0
58 // static UCA. There is only one. Collators don't use it.
59 // It is referenced only in ucol_initUCA and ucol_cleanup
60 static UCollator
* _staticUCA
= NULL
;
61 // static pointer to udata memory. Inited in ucol_initUCA
62 // used for cleanup in ucol_cleanup
63 static UDataMemory
* UCA_DATA_MEM
= NULL
;
65 // this is static pointer to the normalizer fcdTrieIndex
66 // it is always the same between calls to u_cleanup
67 // and therefore writing to it is not synchronized.
68 // It is cleaned in ucol_cleanup
69 static const uint16_t *fcdTrieIndex
=NULL
;
71 // These are values from UCA required for
72 // implicit generation and supressing sort key compression
73 // they should regularly be in the UCA, but if one
74 // is running without UCA, it could be a problem
75 static int32_t maxRegularPrimary
= 0xA0;
76 static int32_t minImplicitPrimary
= 0xE0;
77 static int32_t maxImplicitPrimary
= 0xE4;
80 static UBool U_CALLCONV
81 isAcceptableUCA(void * /*context*/,
82 const char * /*type*/, const char * /*name*/,
83 const UDataInfo
*pInfo
){
84 /* context, type & name are intentionally not used */
85 if( pInfo
->size
>=20 &&
86 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
87 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
88 pInfo
->dataFormat
[0]==UCA_DATA_FORMAT_0
&& /* dataFormat="UCol" */
89 pInfo
->dataFormat
[1]==UCA_DATA_FORMAT_1
&&
90 pInfo
->dataFormat
[2]==UCA_DATA_FORMAT_2
&&
91 pInfo
->dataFormat
[3]==UCA_DATA_FORMAT_3
&&
92 pInfo
->formatVersion
[0]==UCA_FORMAT_VERSION_0
&&
93 pInfo
->formatVersion
[1]>=UCA_FORMAT_VERSION_1
// &&
94 //pInfo->formatVersion[1]==UCA_FORMAT_VERSION_1 &&
95 //pInfo->formatVersion[2]==UCA_FORMAT_VERSION_2 && // Too harsh
96 //pInfo->formatVersion[3]==UCA_FORMAT_VERSION_3 && // Too harsh
98 UVersionInfo UCDVersion
;
99 u_getUnicodeVersion(UCDVersion
);
100 if(pInfo
->dataVersion
[0]==UCDVersion
[0] &&
101 pInfo
->dataVersion
[1]==UCDVersion
[1]) { // &&
102 //pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2] &&
103 //pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]) {
114 static int32_t U_CALLCONV
115 _getFoldingOffset(uint32_t data
) {
116 return (int32_t)(data
&0xFFFFFF);
122 inline void IInit_collIterate(const UCollator
*collator
, const UChar
*sourceString
,
123 int32_t sourceLen
, collIterate
*s
) {
124 (s
)->string
= (s
)->pos
= (UChar
*)(sourceString
);
127 if (sourceLen
>= 0) {
128 s
->flags
|= UCOL_ITER_HASLEN
;
129 (s
)->endp
= (UChar
*)sourceString
+sourceLen
;
132 /* change to enable easier checking for end of string for fcdpositon */
135 (s
)->CEpos
= (s
)->toReturn
= (s
)->CEs
;
136 (s
)->writableBuffer
= (s
)->stackWritableBuffer
;
137 (s
)->writableBufSize
= UCOL_WRITABLE_BUFFER_SIZE
;
138 (s
)->coll
= (collator
);
139 (s
)->fcdPosition
= 0;
140 if(collator
->normalizationMode
== UCOL_ON
) {
141 (s
)->flags
|= UCOL_ITER_NORM
;
143 if(collator
->hiraganaQ
== UCOL_ON
&& collator
->strength
>= UCOL_QUATERNARY
) {
144 (s
)->flags
|= UCOL_HIRAGANA_Q
;
146 (s
)->iterator
= NULL
;
147 //(s)->iteratorIndex = 0;
150 U_CAPI
void U_EXPORT2
151 uprv_init_collIterate(const UCollator
*collator
, const UChar
*sourceString
,
152 int32_t sourceLen
, collIterate
*s
){
153 /* Out-of-line version for use from other files. */
154 IInit_collIterate(collator
, sourceString
, sourceLen
, s
);
159 * Backup the state of the collIterate struct data
160 * @param data collIterate to backup
161 * @param backup storage
164 inline void backupState(const collIterate
*data
, collIterateState
*backup
)
166 backup
->fcdPosition
= data
->fcdPosition
;
167 backup
->flags
= data
->flags
;
168 backup
->origFlags
= data
->origFlags
;
169 backup
->pos
= data
->pos
;
170 backup
->bufferaddress
= data
->writableBuffer
;
171 backup
->buffersize
= data
->writableBufSize
;
172 backup
->iteratorMove
= 0;
173 backup
->iteratorIndex
= 0;
174 if(data
->iterator
!= NULL
) {
175 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
176 backup
->iteratorIndex
= data
->iterator
->getState(data
->iterator
);
177 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
178 if(backup
->iteratorIndex
== UITER_NO_STATE
) {
179 while((backup
->iteratorIndex
= data
->iterator
->getState(data
->iterator
)) == UITER_NO_STATE
) {
180 backup
->iteratorMove
++;
181 data
->iterator
->move(data
->iterator
, -1, UITER_CURRENT
);
183 data
->iterator
->move(data
->iterator
, backup
->iteratorMove
, UITER_CURRENT
);
189 * Loads the state into the collIterate struct data
190 * @param data collIterate to backup
191 * @param backup storage
192 * @param forwards boolean to indicate if forwards iteration is used,
193 * false indicates backwards iteration
196 inline void loadState(collIterate
*data
, const collIterateState
*backup
,
199 UErrorCode status
= U_ZERO_ERROR
;
200 data
->flags
= backup
->flags
;
201 data
->origFlags
= backup
->origFlags
;
202 if(data
->iterator
!= NULL
) {
203 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
204 data
->iterator
->setState(data
->iterator
, backup
->iteratorIndex
, &status
);
205 if(backup
->iteratorMove
!= 0) {
206 data
->iterator
->move(data
->iterator
, backup
->iteratorMove
, UITER_CURRENT
);
209 data
->pos
= backup
->pos
;
210 if ((data
->flags
& UCOL_ITER_INNORMBUF
) &&
211 data
->writableBuffer
!= backup
->bufferaddress
) {
213 this is when a new buffer has been reallocated and we'll have to
214 calculate the new position.
215 note the new buffer has to contain the contents of the old buffer.
218 data
->pos
= data
->writableBuffer
+
219 (data
->pos
- backup
->bufferaddress
);
222 /* backwards direction */
223 uint32_t temp
= backup
->buffersize
-
224 (data
->pos
- backup
->bufferaddress
);
225 data
->pos
= data
->writableBuffer
+ (data
->writableBufSize
- temp
);
228 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
230 this is alittle tricky.
231 if we are initially not in the normalization buffer, even if we
232 normalize in the later stage, the data in the buffer will be
233 ignored, since we skip back up to the data string.
234 however if we are already in the normalization buffer, any
235 further normalization will pull data into the normalization
236 buffer and modify the fcdPosition.
237 since we are keeping the data in the buffer for use, the
238 fcdPosition can not be reverted back.
241 data
->fcdPosition
= backup
->fcdPosition
;
248 * Checks for a collIterate being positioned at the end of
253 inline UBool
collIter_eos(collIterate
*s
) {
254 if(s
->flags
& UCOL_USE_ITERATOR
) {
255 return !(s
->iterator
->hasNext(s
->iterator
));
257 if ((s
->flags
& UCOL_ITER_HASLEN
) == 0 && *s
->pos
!= 0) {
258 // Null terminated string, but not at null, so not at end.
259 // Whether in main or normalization buffer doesn't matter.
263 // String with length. Can't be in normalization buffer, which is always
265 if (s
->flags
& UCOL_ITER_HASLEN
) {
266 return (s
->pos
== s
->endp
);
269 // We are at a null termination, could be either normalization buffer or main string.
270 if ((s
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
271 // At null at end of main string.
275 // At null at end of normalization buffer. Need to check whether there there are
276 // any characters left in the main buffer.
277 if(s
->origFlags
& UCOL_USE_ITERATOR
) {
278 return !(s
->iterator
->hasNext(s
->iterator
));
279 } else if ((s
->origFlags
& UCOL_ITER_HASLEN
) == 0) {
280 // Null terminated main string. fcdPosition is the 'return' position into main buf.
281 return (*s
->fcdPosition
== 0);
284 // Main string with an end pointer.
285 return s
->fcdPosition
== s
->endp
;
291 * Checks for a collIterate being positioned at the start of
296 inline UBool
collIter_bos(collIterate
*source
) {
297 // if we're going backwards, we need to know whether there is more in the
298 // iterator, even if we are in the side buffer
299 if(source
->flags
& UCOL_USE_ITERATOR
|| source
->origFlags
& UCOL_USE_ITERATOR
) {
300 return !source
->iterator
->hasPrevious(source
->iterator
);
302 if (source
->pos
<= source
->string
||
303 ((source
->flags
& UCOL_ITER_INNORMBUF
) &&
304 *(source
->pos
- 1) == 0 && source
->fcdPosition
== NULL
)) {
311 inline UBool
collIter_SimpleBos(collIterate
*source
) {
312 // if we're going backwards, we need to know whether there is more in the
313 // iterator, even if we are in the side buffer
314 if(source
->flags
& UCOL_USE_ITERATOR
|| source
->origFlags
& UCOL_USE_ITERATOR
) {
315 return !source
->iterator
->hasPrevious(source
->iterator
);
317 if (source
->pos
== source
->string
) {
322 //return (data->pos == data->string) ||
326 * Checks and free writable buffer if it is not the original stack buffer
327 * in collIterate. This function does not reassign the writable buffer.
328 * @param data collIterate struct to determine and free the writable buffer
331 inline void freeHeapWritableBuffer(collIterate
*data
)
333 if (data
->writableBuffer
!= data
->stackWritableBuffer
) {
334 uprv_free(data
->writableBuffer
);
339 /****************************************************************************/
340 /* Following are the open/close functions */
342 /****************************************************************************/
345 ucol_initFromBinary(const uint8_t *bin
, int32_t length
,
346 const UCollator
*base
,
350 UCollator
*result
= fillIn
;
351 if(U_FAILURE(*status
)) {
356 // we don't support null base yet
357 *status = U_ILLEGAL_ARGUMENT_ERROR;
361 // We need these and we could be running without UCA
362 uprv_uca_initImplicitConstants(0, 0, status
);
363 UCATableHeader
*colData
= (UCATableHeader
*)bin
;
364 // do we want version check here? We're trying to figure out whether collators are compatible
365 if((base
&& (uprv_memcmp(colData
->UCAVersion
, base
->image
->UCAVersion
, sizeof(UVersionInfo
)) != 0 ||
366 uprv_memcmp(colData
->UCDVersion
, base
->image
->UCDVersion
, sizeof(UVersionInfo
)) != 0)) ||
367 colData
->version
[0] != UCOL_BUILDER_VERSION
)
369 *status
= U_COLLATOR_VERSION_MISMATCH
;
373 if((uint32_t)length
> (paddedsize(sizeof(UCATableHeader
)) + paddedsize(sizeof(UColOptionSet
)))) {
374 result
= ucol_initCollator((const UCATableHeader
*)bin
, result
, base
, status
);
375 if(U_FAILURE(*status
)){
378 result
->hasRealData
= TRUE
;
382 result
= ucol_initCollator(base
->image
, result
, base
, status
);
383 ucol_setOptionsFromHeader(result
, (UColOptionSet
*)(bin
+((const UCATableHeader
*)bin
)->options
), status
);
384 if(U_FAILURE(*status
)){
387 result
->hasRealData
= FALSE
;
390 *status
= U_USELESS_COLLATOR_ERROR
;
394 result
->freeImageOnClose
= FALSE
;
396 result
->validLocale
= NULL
;
397 result
->requestedLocale
= NULL
;
398 result
->rules
= NULL
;
399 result
->rulesLength
= 0;
400 result
->freeRulesOnClose
= FALSE
;
402 result
->elements
= NULL
;
406 U_CAPI UCollator
* U_EXPORT2
407 ucol_openBinary(const uint8_t *bin
, int32_t length
,
408 const UCollator
*base
,
411 return ucol_initFromBinary(bin
, length
, base
, NULL
, status
);
414 U_CAPI UCollator
* U_EXPORT2
415 ucol_safeClone(const UCollator
*coll
, void *stackBuffer
, int32_t * pBufferSize
, UErrorCode
*status
)
417 UCollator
* localCollator
;
418 int32_t bufferSizeNeeded
= (int32_t)sizeof(UCollator
);
419 char *stackBufferChars
= (char *)stackBuffer
;
420 int32_t imageSize
= 0;
421 int32_t rulesSize
= 0;
422 int32_t rulesPadding
= 0;
425 UBool colAllocated
= FALSE
;
426 UBool imageAllocated
= FALSE
;
428 if (status
== NULL
|| U_FAILURE(*status
)){
431 if ((stackBuffer
&& !pBufferSize
) || !coll
){
432 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
435 if (coll
->rules
&& coll
->freeRulesOnClose
) {
436 rulesSize
= (int32_t)(coll
->rulesLength
+ 1)*sizeof(UChar
);
437 rulesPadding
= (int32_t)(bufferSizeNeeded
% sizeof(UChar
));
438 bufferSizeNeeded
+= rulesSize
+ rulesPadding
;
441 if (stackBuffer
&& *pBufferSize
<= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
442 *pBufferSize
= bufferSizeNeeded
;
446 /* Pointers on 64-bit platforms need to be aligned
447 * on a 64-bit boundry in memory.
449 if (U_ALIGNMENT_OFFSET(stackBuffer
) != 0) {
450 int32_t offsetUp
= (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars
);
451 if (*pBufferSize
> offsetUp
) {
452 *pBufferSize
-= offsetUp
;
453 stackBufferChars
+= offsetUp
;
456 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
460 stackBuffer
= (void *)stackBufferChars
;
462 if (stackBuffer
== NULL
|| *pBufferSize
< bufferSizeNeeded
) {
463 /* allocate one here...*/
464 stackBufferChars
= (char *)uprv_malloc(bufferSizeNeeded
);
466 if (U_SUCCESS(*status
)) {
467 *status
= U_SAFECLONE_ALLOCATED_WARNING
;
470 localCollator
= (UCollator
*)stackBufferChars
;
471 rules
= (UChar
*)(stackBufferChars
+ sizeof(UCollator
) + rulesPadding
);
473 UErrorCode tempStatus
= U_ZERO_ERROR
;
474 imageSize
= ucol_cloneBinary(coll
, NULL
, 0, &tempStatus
);
476 if (coll
->freeImageOnClose
) {
477 image
= (uint8_t *)uprv_malloc(imageSize
);
478 ucol_cloneBinary(coll
, image
, imageSize
, status
);
479 imageAllocated
= TRUE
;
482 image
= (uint8_t *)coll
->image
;
484 localCollator
= ucol_initFromBinary(image
, imageSize
, coll
->UCA
, localCollator
, status
);
485 if (U_FAILURE(*status
)) {
490 if (coll
->freeRulesOnClose
) {
491 localCollator
->rules
= u_strcpy(rules
, coll
->rules
);
492 //bufferEnd += rulesSize;
495 localCollator
->rules
= coll
->rules
;
497 localCollator
->freeRulesOnClose
= FALSE
;
498 localCollator
->rulesLength
= coll
->rulesLength
;
502 for(i
= 0; i
< UCOL_ATTRIBUTE_COUNT
; i
++) {
503 ucol_setAttribute(localCollator
, (UColAttribute
)i
, ucol_getAttribute(coll
, (UColAttribute
)i
, status
), status
);
505 localCollator
->requestedLocale
= NULL
; // zero copies of pointers
506 localCollator
->validLocale
= NULL
;
507 localCollator
->rb
= NULL
;
508 localCollator
->elements
= NULL
;
509 localCollator
->freeOnClose
= colAllocated
;
510 localCollator
->freeImageOnClose
= imageAllocated
;
511 return localCollator
;
514 U_CAPI
void U_EXPORT2
515 ucol_close(UCollator
*coll
)
517 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE
);
518 UTRACE_DATA1(UTRACE_INFO
, "coll = %p", coll
);
520 // these are always owned by each UCollator struct,
521 // so we always free them
522 if(coll
->validLocale
!= NULL
) {
523 uprv_free(coll
->validLocale
);
525 if(coll
->requestedLocale
!= NULL
) {
526 uprv_free(coll
->requestedLocale
);
528 if(coll
->resCleaner
!= NULL
) {
529 coll
->resCleaner(coll
);
531 if(coll
->latinOneCEs
!= NULL
) {
532 uprv_free(coll
->latinOneCEs
);
534 if(coll
->options
!= NULL
&& coll
->freeOptionsOnClose
) {
535 uprv_free(coll
->options
);
537 if(coll
->rules
!= NULL
&& coll
->freeRulesOnClose
) {
538 uprv_free((UChar
*)coll
->rules
);
540 if(coll
->image
!= NULL
&& coll
->freeImageOnClose
) {
541 uprv_free((UCATableHeader
*)coll
->image
);
544 /* Here, it would be advisable to close: */
545 /* - UData for UCA (unless we stuff it in the root resb */
546 /* Again, do we need additional housekeeping... HMMM! */
547 UTRACE_DATA1(UTRACE_INFO
, "coll->freeOnClose: %d", coll
->freeOnClose
);
548 if(coll
->freeOnClose
){
549 /* for safeClone, if freeOnClose is FALSE,
550 don't free the other instance data */
557 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
558 /* you should be able to get the binary chunk to write out... Doesn't look very full now */
559 U_CAPI
uint8_t* U_EXPORT2
560 ucol_cloneRuleData(const UCollator
*coll
, int32_t *length
, UErrorCode
*status
)
562 uint8_t *result
= NULL
;
563 if(U_FAILURE(*status
)) {
566 if(coll
->hasRealData
== TRUE
) {
567 *length
= coll
->image
->size
;
568 result
= (uint8_t *)uprv_malloc(*length
);
570 if (result
== NULL
) {
571 *status
= U_MEMORY_ALLOCATION_ERROR
;
574 uprv_memcpy(result
, coll
->image
, *length
);
576 *length
= (int32_t)(paddedsize(sizeof(UCATableHeader
))+paddedsize(sizeof(UColOptionSet
)));
577 result
= (uint8_t *)uprv_malloc(*length
);
579 if (result
== NULL
) {
580 *status
= U_MEMORY_ALLOCATION_ERROR
;
584 /* build the UCATableHeader with minimal entries */
585 /* do not copy the header from the UCA file because its values are wrong! */
586 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
588 /* reset everything */
589 uprv_memset(result
, 0, *length
);
591 /* set the tailoring-specific values */
592 UCATableHeader
*myData
= (UCATableHeader
*)result
;
593 myData
->size
= *length
;
595 /* offset for the options, the only part of the data that is present after the header */
596 myData
->options
= sizeof(UCATableHeader
);
598 /* need to always set the expansion value for an upper bound of the options */
599 myData
->expansion
= myData
->options
+ sizeof(UColOptionSet
);
601 myData
->magic
= UCOL_HEADER_MAGIC
;
602 myData
->isBigEndian
= U_IS_BIG_ENDIAN
;
603 myData
->charSetFamily
= U_CHARSET_FAMILY
;
605 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
606 uprv_memcpy(myData
->version
, coll
->image
->version
, sizeof(UVersionInfo
));
608 uprv_memcpy(myData
->UCAVersion
, coll
->image
->UCAVersion
, sizeof(UVersionInfo
));
609 uprv_memcpy(myData
->UCDVersion
, coll
->image
->UCDVersion
, sizeof(UVersionInfo
));
610 uprv_memcpy(myData
->formatVersion
, coll
->image
->formatVersion
, sizeof(UVersionInfo
));
611 myData
->jamoSpecial
= coll
->image
->jamoSpecial
;
613 /* copy the collator options */
614 uprv_memcpy(result
+paddedsize(sizeof(UCATableHeader
)), coll
->options
, sizeof(UColOptionSet
));
619 void ucol_setOptionsFromHeader(UCollator
* result
, UColOptionSet
* opts
, UErrorCode
*status
) {
620 if(U_FAILURE(*status
)) {
623 result
->caseFirst
= (UColAttributeValue
)opts
->caseFirst
;
624 result
->caseLevel
= (UColAttributeValue
)opts
->caseLevel
;
625 result
->frenchCollation
= (UColAttributeValue
)opts
->frenchCollation
;
626 result
->normalizationMode
= (UColAttributeValue
)opts
->normalizationMode
;
627 result
->strength
= (UColAttributeValue
)opts
->strength
;
628 result
->variableTopValue
= opts
->variableTopValue
;
629 result
->alternateHandling
= (UColAttributeValue
)opts
->alternateHandling
;
630 result
->hiraganaQ
= (UColAttributeValue
)opts
->hiraganaQ
;
631 result
->numericCollation
= (UColAttributeValue
)opts
->numericCollation
;
633 result
->caseFirstisDefault
= TRUE
;
634 result
->caseLevelisDefault
= TRUE
;
635 result
->frenchCollationisDefault
= TRUE
;
636 result
->normalizationModeisDefault
= TRUE
;
637 result
->strengthisDefault
= TRUE
;
638 result
->variableTopValueisDefault
= TRUE
;
639 result
->hiraganaQisDefault
= TRUE
;
640 result
->numericCollationisDefault
= TRUE
;
642 ucol_updateInternalState(result
, status
);
644 result
->options
= opts
;
649 * Approximate determination if a character is at a contraction end.
650 * Guaranteed to be TRUE if a character is at the end of a contraction,
651 * otherwise it is not deterministic.
652 * @param c character to be determined
653 * @param coll collator
656 inline UBool
ucol_contractionEndCP(UChar c
, const UCollator
*coll
) {
657 if (U16_IS_TRAIL(c
)) {
661 if (c
< coll
->minContrEndCP
) {
667 if (hash
>= UCOL_UNSAFECP_TABLE_SIZE
*8) {
668 hash
= (hash
& UCOL_UNSAFECP_TABLE_MASK
) + 256;
670 htbyte
= coll
->contrEndCP
[hash
>>3];
671 return (((htbyte
>> (hash
& 7)) & 1) == 1);
677 * i_getCombiningClass()
678 * A fast, at least partly inline version of u_getCombiningClass()
679 * This is a candidate for further optimization. Used heavily
680 * in contraction processing.
683 inline uint8_t i_getCombiningClass(UChar32 c
, const UCollator
*coll
) {
685 if ((c
>= 0x300 && ucol_unsafeCP(c
, coll
)) || c
> 0xFFFF) {
686 sCC
= u_getCombiningClass(c
);
691 UCollator
* ucol_initCollator(const UCATableHeader
*image
, UCollator
*fillIn
, const UCollator
*UCA
, UErrorCode
*status
) {
693 UCollator
*result
= fillIn
;
694 if(U_FAILURE(*status
) || image
== NULL
) {
699 result
= (UCollator
*)uprv_malloc(sizeof(UCollator
));
701 *status
= U_MEMORY_ALLOCATION_ERROR
;
704 result
->freeOnClose
= TRUE
;
706 result
->freeOnClose
= FALSE
;
709 result
->image
= image
;
710 result
->mapping
.getFoldingOffset
= _getFoldingOffset
;
711 const uint8_t *mapping
= (uint8_t*)result
->image
+result
->image
->mappingPosition
;
712 utrie_unserialize(&result
->mapping
, mapping
, result
->image
->endExpansionCE
- result
->image
->mappingPosition
, status
);
713 if(U_FAILURE(*status
)) {
714 if(result
->freeOnClose
== TRUE
) {
721 /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
722 result
->latinOneMapping
= UTRIE_GET32_LATIN1(&result
->mapping
);
723 result
->contractionCEs
= (uint32_t*)((uint8_t*)result
->image
+result
->image
->contractionCEs
);
724 result
->contractionIndex
= (UChar
*)((uint8_t*)result
->image
+result
->image
->contractionIndex
);
725 result
->expansion
= (uint32_t*)((uint8_t*)result
->image
+result
->image
->expansion
);
727 result
->options
= (UColOptionSet
*)((uint8_t*)result
->image
+result
->image
->options
);
728 result
->freeOptionsOnClose
= FALSE
;
731 result
->caseFirst
= (UColAttributeValue
)result
->options
->caseFirst
;
732 result
->caseLevel
= (UColAttributeValue
)result
->options
->caseLevel
;
733 result
->frenchCollation
= (UColAttributeValue
)result
->options
->frenchCollation
;
734 result
->normalizationMode
= (UColAttributeValue
)result
->options
->normalizationMode
;
735 result
->strength
= (UColAttributeValue
)result
->options
->strength
;
736 result
->variableTopValue
= result
->options
->variableTopValue
;
737 result
->alternateHandling
= (UColAttributeValue
)result
->options
->alternateHandling
;
738 result
->hiraganaQ
= (UColAttributeValue
)result
->options
->hiraganaQ
;
739 result
->numericCollation
= (UColAttributeValue
)result
->options
->numericCollation
;
741 result
->caseFirstisDefault
= TRUE
;
742 result
->caseLevelisDefault
= TRUE
;
743 result
->frenchCollationisDefault
= TRUE
;
744 result
->normalizationModeisDefault
= TRUE
;
745 result
->strengthisDefault
= TRUE
;
746 result
->variableTopValueisDefault
= TRUE
;
747 result
->alternateHandlingisDefault
= TRUE
;
748 result
->hiraganaQisDefault
= TRUE
;
749 result
->numericCollationisDefault
= TRUE
;
751 /*result->scriptOrder = NULL;*/
753 result
->rules
= NULL
;
754 result
->rulesLength
= 0;
756 /* get the version info from UCATableHeader and populate the Collator struct*/
757 result
->dataVersion
[0] = result
->image
->version
[0]; /* UCA Builder version*/
758 result
->dataVersion
[1] = result
->image
->version
[1]; /* UCA Tailoring rules version*/
759 result
->dataVersion
[2] = 0;
760 result
->dataVersion
[3] = 0;
762 result
->unsafeCP
= (uint8_t *)result
->image
+ result
->image
->unsafeCP
;
763 result
->minUnsafeCP
= 0;
764 for (c
=0; c
<0x300; c
++) { // Find the smallest unsafe char.
765 if (ucol_unsafeCP(c
, result
)) break;
767 result
->minUnsafeCP
= c
;
769 result
->contrEndCP
= (uint8_t *)result
->image
+ result
->image
->contrEndCP
;
770 result
->minContrEndCP
= 0;
771 for (c
=0; c
<0x300; c
++) { // Find the Contraction-ending char.
772 if (ucol_contractionEndCP(c
, result
)) break;
774 result
->minContrEndCP
= c
;
776 /* max expansion tables */
777 result
->endExpansionCE
= (uint32_t*)((uint8_t*)result
->image
+
778 result
->image
->endExpansionCE
);
779 result
->lastEndExpansionCE
= result
->endExpansionCE
+
780 result
->image
->endExpansionCECount
- 1;
781 result
->expansionCESize
= (uint8_t*)result
->image
+
782 result
->image
->expansionCESize
;
785 //result->errorCode = *status;
787 result
->latinOneCEs
= NULL
;
789 result
->latinOneRegenTable
= FALSE
;
790 result
->latinOneFailed
= FALSE
;
792 result
->resCleaner
= NULL
;
794 ucol_updateInternalState(result
, status
);
800 /* new Mark's code */
803 * For generation of Implicit CEs
806 * Cleaned up so that changes can be made more easily.
808 # First Implicit: E26A792D
809 # Last Implicit: E3DC70C0
810 # First CJK: E0030300
812 # First CJK_A: E0A9DF00
813 # Last CJK_A: E0DE3100
815 /* Following is a port of Mark's code for new treatment of implicits.
816 * It is positioned here, since ucol_initUCA needs to initialize the
817 * variables below according to the data in the fractional UCA.
822 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
823 * b) bump any non-CJK characters by 10FFFF.
824 * The relevant blocks are:
825 * A: 4E00..9FFF; CJK Unified Ideographs
826 * F900..FAFF; CJK Compatibility Ideographs
827 * B: 3400..4DBF; CJK Unified Ideographs Extension A
828 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
830 * no new B characters are allocated between 4E00 and FAFF, and
831 * no new A characters are outside of this range,
832 * (very high probability) this simple code will work.
833 * The reordered blocks are:
835 * Block2 is CJK_COMPAT_USED
838 * Any other CJK gets its normal code point
839 * Any non-CJK gets +10FFFF
840 * When we reorder Block1, we make sure that it is at the very start,
841 * so that it will use a 3-byte form.
842 * Warning: we only pick up the compatibility characters that are
843 * NOT decomposed, so that block is smaller!
848 NON_CJK_OFFSET
= 0x110000,
849 UCOL_MAX_INPUT
= 0x220001; // 2 * Unicode range + 2
852 * Precomputed by constructor
855 final3Multiplier
= 0,
856 final4Multiplier
= 0,
871 CJK_LIMIT
= 0x9FFF+1,
872 CJK_COMPAT_USED_BASE
= 0xFA0E,
873 CJK_COMPAT_USED_LIMIT
= 0xFA2F+1,
875 CJK_A_LIMIT
= 0x4DBF+1,
876 CJK_B_BASE
= 0x20000,
877 CJK_B_LIMIT
= 0x2A6DF+1;
879 static UChar32
swapCJK(UChar32 i
) {
882 if (i
< CJK_LIMIT
) return i
- CJK_BASE
;
884 if (i
< CJK_COMPAT_USED_BASE
) return i
+ NON_CJK_OFFSET
;
886 if (i
< CJK_COMPAT_USED_LIMIT
) return i
- CJK_COMPAT_USED_BASE
887 + (CJK_LIMIT
- CJK_BASE
);
888 if (i
< CJK_B_BASE
) return i
+ NON_CJK_OFFSET
;
890 if (i
< CJK_B_LIMIT
) return i
; // non-BMP-CJK
892 return i
+ NON_CJK_OFFSET
; // non-CJK
894 if (i
< CJK_A_BASE
) return i
+ NON_CJK_OFFSET
;
896 if (i
< CJK_A_LIMIT
) return i
- CJK_A_BASE
897 + (CJK_LIMIT
- CJK_BASE
)
898 + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
);
899 return i
+ NON_CJK_OFFSET
; // non-CJK
902 U_CAPI UChar32 U_EXPORT2
903 uprv_uca_getRawFromCodePoint(UChar32 i
) {
907 U_CAPI UChar32 U_EXPORT2
908 uprv_uca_getCodePointFromRaw(UChar32 i
) {
911 if(i
>= NON_CJK_OFFSET
) {
912 result
= i
- NON_CJK_OFFSET
;
913 } else if(i
>= CJK_B_BASE
) {
915 } else if(i
< CJK_A_LIMIT
+ (CJK_LIMIT
- CJK_BASE
) + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
)) { // rest of CJKs, compacted
916 if(i
< CJK_LIMIT
- CJK_BASE
) {
917 result
= i
+ CJK_BASE
;
918 } else if(i
< (CJK_LIMIT
- CJK_BASE
) + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
)) {
919 result
= i
+ CJK_COMPAT_USED_BASE
- (CJK_LIMIT
- CJK_BASE
);
921 result
= i
+ CJK_A_BASE
- (CJK_LIMIT
- CJK_BASE
) - (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
);
929 // GET IMPLICIT PRIMARY WEIGHTS
930 // Return value is left justified primary key
931 U_CAPI
uint32_t U_EXPORT2
932 uprv_uca_getImplicitFromRaw(UChar32 cp
) {
934 if (cp < 0 || cp > UCOL_MAX_INPUT) {
935 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
938 int32_t last0
= cp
- min4Boundary
;
940 int32_t last1
= cp
/ final3Count
;
941 last0
= cp
% final3Count
;
943 int32_t last2
= last1
/ medialCount
;
944 last1
%= medialCount
;
946 last0
= minTrail
+ last0
*final3Multiplier
; // spread out, leaving gap at start
947 last1
= minTrail
+ last1
; // offset
948 last2
= min3Primary
+ last2
; // offset
950 if (last2 >= min4Primary) {
951 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
954 return (last2
<< 24) + (last1
<< 16) + (last0
<< 8);
956 int32_t last1
= last0
/ final4Count
;
957 last0
%= final4Count
;
959 int32_t last2
= last1
/ medialCount
;
960 last1
%= medialCount
;
962 int32_t last3
= last2
/ medialCount
;
963 last2
%= medialCount
;
965 last0
= minTrail
+ last0
*final4Multiplier
; // spread out, leaving gap at start
966 last1
= minTrail
+ last1
; // offset
967 last2
= minTrail
+ last2
; // offset
968 last3
= min4Primary
+ last3
; // offset
970 if (last3 > max4Primary) {
971 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
974 return (last3
<< 24) + (last2
<< 16) + (last1
<< 8) + last0
;
978 U_CAPI
uint32_t U_EXPORT2
979 uprv_uca_getImplicitPrimary(UChar32 cp
) {
980 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
984 // we now have a range of numbers from 0 to 21FFFF.
986 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
988 return uprv_uca_getImplicitFromRaw(cp
);
992 * Converts implicit CE into raw integer ("code point")
994 * @return -1 if illegal format
996 U_CAPI UChar32 U_EXPORT2
997 uprv_uca_getRawFromImplicit(uint32_t implicit
) {
999 UChar32 b3
= implicit
& 0xFF;
1001 UChar32 b2
= implicit
& 0xFF;
1003 UChar32 b1
= implicit
& 0xFF;
1005 UChar32 b0
= implicit
& 0xFF;
1007 // simple parameter checks
1008 if (b0
< min3Primary
|| b0
> max4Primary
1009 || b1
< minTrail
|| b1
> maxTrail
) return -1;
1013 // take care of the final values, and compose
1014 if (b0
< min4Primary
) {
1015 if (b2
< minTrail
|| b2
> max3Trail
|| b3
!= 0) return -1;
1017 UChar32 remainder
= b2
% final3Multiplier
;
1018 if (remainder
!= 0) return -1;
1020 b2
/= final3Multiplier
;
1021 result
= ((b0
* medialCount
) + b1
) * final3Count
+ b2
;
1023 if (b2
< minTrail
|| b2
> maxTrail
1024 || b3
< minTrail
|| b3
> max4Trail
) return -1;
1027 UChar32 remainder
= b3
% final4Multiplier
;
1028 if (remainder
!= 0) return -1;
1029 b3
/= final4Multiplier
;
1031 result
= (((b0
* medialCount
) + b1
) * medialCount
+ b2
) * final4Count
+ b3
+ min4Boundary
;
1034 if (result
< 0 || result
> UCOL_MAX_INPUT
) return -1;
1039 static inline int32_t divideAndRoundUp(int a
, int b
) {
1043 /* this function is either called from initUCA or from genUCA before
1044 * doing canonical closure for the UCA.
1048 * Set up to generate implicits.
1051 * @param minTrail final byte
1052 * @param maxTrail final byte
1053 * @param gap3 the gap we leave for tailoring for 3-byte forms
1054 * @param gap4 the gap we leave for tailoring for 4-byte forms
1056 static void initImplicitConstants(int minPrimary
, int maxPrimary
,
1057 int minTrailIn
, int maxTrailIn
,
1058 int gap3
, int primaries3count
,
1059 UErrorCode
*status
) {
1060 // some simple parameter checks
1061 if (minPrimary
< 0 || minPrimary
>= maxPrimary
|| maxPrimary
> 0xFF) {
1062 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1065 if (minTrailIn
< 0 || minTrailIn
>= maxTrailIn
|| maxTrailIn
> 0xFF) {
1066 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1069 if (primaries3count
< 1) {
1070 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1074 minTrail
= minTrailIn
;
1075 maxTrail
= maxTrailIn
;
1077 min3Primary
= minPrimary
;
1078 max4Primary
= maxPrimary
;
1079 // compute constants for use later.
1080 // number of values we can use in trailing bytes
1081 // leave room for empty values between AND above, e.g. if gap = 2
1082 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1083 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1084 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1085 final3Multiplier
= gap3
+ 1;
1086 final3Count
= (maxTrail
- minTrail
+ 1) / final3Multiplier
;
1087 max3Trail
= minTrail
+ (final3Count
- 1) * final3Multiplier
;
1089 // medials can use full range
1090 medialCount
= (maxTrail
- minTrail
+ 1);
1091 // find out how many values fit in each form
1092 int32_t threeByteCount
= medialCount
* final3Count
;
1093 // now determine where the 3/4 boundary is.
1094 // we use 3 bytes below the boundary, and 4 above
1095 int32_t primariesAvailable
= maxPrimary
- minPrimary
+ 1;
1096 int32_t primaries4count
= primariesAvailable
- primaries3count
;
1099 int32_t min3ByteCoverage
= primaries3count
* threeByteCount
;
1100 min4Primary
= minPrimary
+ primaries3count
;
1101 min4Boundary
= min3ByteCoverage
;
1102 // Now expand out the multiplier for the 4 bytes, and redo.
1104 int32_t totalNeeded
= UCOL_MAX_INPUT
- min4Boundary
;
1105 int32_t neededPerPrimaryByte
= divideAndRoundUp(totalNeeded
, primaries4count
);
1106 //if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
1107 int32_t neededPerFinalByte
= divideAndRoundUp(neededPerPrimaryByte
, medialCount
* medialCount
);
1108 //if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
1109 int32_t gap4
= (maxTrail
- minTrail
- 1) / neededPerFinalByte
;
1110 //if (DEBUG) System.out.println("expandedGap: " + gap4);
1112 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1115 final4Multiplier
= gap4
+ 1;
1116 final4Count
= neededPerFinalByte
;
1117 max4Trail
= minTrail
+ (final4Count
- 1) * final4Multiplier
;
1120 System.out.println("final4Count: " + final4Count);
1121 for (int counter = 0; counter <= final4Count; ++counter) {
1122 int value = minTrail + (1 + counter)*final4Multiplier;
1123 System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
1130 * Supply parameters for generating implicit CEs
/*
 * Sets up the constants used for implicit CE generation by forwarding to
 * initImplicitConstants() with the file-level minImplicitPrimary and
 * maxImplicitPrimary values and the literal arguments 0x04, 0xFE, 1, 1.
 * Both int32_t parameters are unnamed, so whatever the caller passes for
 * them is ignored here.
 * status is handed to initImplicitConstants() unchanged.
 */
1132 U_CAPI
void U_EXPORT2
1133 uprv_uca_initImplicitConstants(int32_t, int32_t, UErrorCode
*status
) {
1134 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1135 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1136 initImplicitConstants(minImplicitPrimary
, maxImplicitPrimary
, 0x04, 0xFE, 1, 1, status
);
/*
 * Cleanup routine for the file-level UCA statics.
 * NOTE(review): the line carrying the function name is missing from this
 * listing; it is presumably ucol_cleanup, the callback that is registered
 * through ucln_i18n_registerCleanup() elsewhere in this file - confirm.
 * Visible effects: closes UCA_DATA_MEM with udata_close() and resets it to
 * NULL, closes _staticUCA with ucol_close(), and resets fcdTrieIndex to NULL.
 */
1140 static UBool U_CALLCONV
1144 udata_close(UCA_DATA_MEM
);
1145 UCA_DATA_MEM
= NULL
;
1148 ucol_close(_staticUCA
);
1151 fcdTrieIndex
= NULL
;
1156 /* do not close UCA returned by ucol_initUCA! */
/*
 * Opens the UCA data with udata_openChoice() and, when a data handle was
 * obtained, builds a collator from it with ucol_initCollator().
 * The visible lines then clear several fields of the new collator, store it
 * (together with the data handle) in _staticUCA / UCA_DATA_MEM when
 * _staticUCA is still NULL, register ucol_cleanup, and seed the implicit-CE
 * constants from the UCAConstants located inside the collator image.
 * fcdTrieIndex is fetched with unorm_getFCDTrie() if it is still NULL.
 * NOTE(review): the return type, the return statements and any locking live
 * on lines that are missing from this listing - confirm against the full file.
 */
1158 ucol_initUCA(UErrorCode
*status
) {
1159 if(U_FAILURE(*status
)) {
1163 UBool f
= (_staticUCA
== NULL
);
1167 UCollator
*newUCA
= NULL
;
1168 UDataMemory
*result
= udata_openChoice(NULL
, UCA_DATA_TYPE
, UCA_DATA_NAME
, isAcceptableUCA
, NULL
, status
);
1170 if(U_FAILURE(*status
)) {
1172 udata_close(result
);
1178 if (fcdTrieIndex
== NULL
) {
1179 fcdTrieIndex
= unorm_getFCDTrie(status
);
1180 ucln_i18n_registerCleanup(UCLN_I18N_UCOL
, ucol_cleanup
);
1183 if(result
!= NULL
) { /* It looks like sometimes we can fail to find the data file */
1184 newUCA
= ucol_initCollator((const UCATableHeader
*)udata_getMemory(result
), newUCA
, newUCA
, status
);
1185 if(U_SUCCESS(*status
)){
1187 newUCA
->elements
= NULL
;
1188 newUCA
->validLocale
= NULL
;
1189 newUCA
->requestedLocale
= NULL
;
1190 newUCA
->hasRealData
= FALSE
; // real data lives in .dat file...
1191 newUCA
->freeImageOnClose
= FALSE
;
1193 if(_staticUCA
== NULL
) {
1194 _staticUCA
= newUCA
;
1195 UCA_DATA_MEM
= result
;
1201 if(newUCA
!= NULL
) {
1202 udata_close(result
);
1206 ucln_i18n_registerCleanup(UCLN_I18N_UCOL
, ucol_cleanup
);
1208 // Initialize variables for implicit generation
1209 const UCAConstants
*UCAconsts
= (UCAConstants
*)((uint8_t *)_staticUCA
->image
+ _staticUCA
->image
->UCAConsts
);
1210 uprv_uca_initImplicitConstants(UCAconsts
->UCA_PRIMARY_IMPLICIT_MIN
, UCAconsts
->UCA_PRIMARY_IMPLICIT_MAX
, status
);
1211 //_staticUCA->mapping.getFoldingOffset = _getFoldingOffset;
1213 udata_close(result
);
1223 /* collIterNormalize Incremental Normalization happens here. */
1224 /* pick up the range of chars identified by FCD, */
1225 /* normalize it into the collIterate's writable buffer, */
1226 /* switch the collIterate's state to use the writable buffer. */
/*
 * Decomposes the text from collationSource->pos - 1 up to
 * collationSource->fcdPosition into collationSource->writableBuffer using
 * unorm_decompose().
 * If the first attempt reports U_BUFFER_OVERFLOW_ERROR or
 * U_STRING_NOT_TERMINATED_WARNING, the buffer is grown to normLen + 1 with
 * u_growBufferFromStatic() and the decomposition is repeated.
 * Afterwards pos is pointed at the start of writableBuffer, the current flags
 * are saved in origFlags, UCOL_ITER_INNORMBUF is set and UCOL_ITER_NORM,
 * UCOL_ITER_HASLEN and UCOL_USE_ITERATOR are cleared.
 * NOTE(review): the declarations of normLen, the trailing arguments of the
 * unorm_decompose() calls and the early-exit paths are on lines missing from
 * this listing.
 */
1229 void collIterNormalize(collIterate
*collationSource
)
1231 UErrorCode status
= U_ZERO_ERROR
;
1234 UChar
*srcP
= collationSource
->pos
- 1; /* Start of chars to normalize */
1235 UChar
*endP
= collationSource
->fcdPosition
; /* End of region to normalize+1 */
1237 normLen
= unorm_decompose(collationSource
->writableBuffer
, (int32_t)collationSource
->writableBufSize
,
1238 srcP
, (int32_t)(endP
- srcP
),
1241 if(status
== U_BUFFER_OVERFLOW_ERROR
|| status
== U_STRING_NOT_TERMINATED_WARNING
) {
1242 // reallocate and terminate
1243 if(!u_growBufferFromStatic(collationSource
->stackWritableBuffer
,
1244 &collationSource
->writableBuffer
,
1245 (int32_t *)&collationSource
->writableBufSize
, normLen
+ 1,
1249 fprintf(stderr
, "collIterNormalize(), out of memory\n");
1253 status
= U_ZERO_ERROR
;
1254 normLen
= unorm_decompose(collationSource
->writableBuffer
, (int32_t)collationSource
->writableBufSize
,
1255 srcP
, (int32_t)(endP
- srcP
),
1259 if (U_FAILURE(status
)) {
1261 fprintf(stderr
, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status
));
/* Record that the writable buffer no longer is the built-in stack buffer. */
1266 if(collationSource
->writableBuffer
!= collationSource
->stackWritableBuffer
) {
1267 collationSource
->flags
|= UCOL_ITER_ALLOCATED
;
/* Switch the iterate over to the normalized text in the writable buffer. */
1269 collationSource
->pos
= collationSource
->writableBuffer
;
1270 collationSource
->origFlags
= collationSource
->flags
;
1271 collationSource
->flags
|= UCOL_ITER_INNORMBUF
;
1272 collationSource
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
1276 // This function takes the iterator and extracts normalized stuff up to the next boundary
1277 // It is similar in the end results to the collIterNormalize, but for the cases when we
/*
 * Iterator-based counterpart of collIterNormalize().
 * Remembers the iterator state with getState(), then asks unorm_next() to
 * write the next UNORM_FCD chunk into collationSource->writableBuffer.
 * When the call overflows, or fills the buffer exactly (leaving no room for
 * the terminator), the buffer is grown to normLen + 1 with
 * u_growBufferFromStatic(), the iterator is rewound with setState() and
 * unorm_next() is run again.
 * The result is zero-terminated, pos is pointed at the start of the writable
 * buffer, flags are saved in origFlags, UCOL_ITER_INNORMBUF is set and
 * UCOL_ITER_NORM, UCOL_ITER_HASLEN and UCOL_USE_ITERATOR are cleared.
 * NOTE(review): the remaining arguments of u_growBufferFromStatic() and the
 * out-of-memory exit path are on lines missing from this listing.
 */
1280 inline void normalizeIterator(collIterate
*collationSource
) {
1281 UErrorCode status
= U_ZERO_ERROR
;
1282 UBool wasNormalized
= FALSE
;
1283 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1284 uint32_t iterIndex
= collationSource
->iterator
->getState(collationSource
->iterator
);
1285 int32_t normLen
= unorm_next(collationSource
->iterator
, collationSource
->writableBuffer
,
1286 (int32_t)collationSource
->writableBufSize
, UNORM_FCD
, 0, TRUE
, &wasNormalized
, &status
);
1287 if(status
== U_BUFFER_OVERFLOW_ERROR
|| normLen
== (int32_t)collationSource
->writableBufSize
) {
1288 // reallocate and terminate
1289 if(!u_growBufferFromStatic(collationSource
->stackWritableBuffer
,
1290 &collationSource
->writableBuffer
,
1291 (int32_t *)&collationSource
->writableBufSize
, normLen
+ 1,
1295 fprintf(stderr
, "normalizeIterator(), out of memory\n");
1299 status
= U_ZERO_ERROR
;
1300 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1301 collationSource
->iterator
->setState(collationSource
->iterator
, iterIndex
, &status
);
1302 normLen
= unorm_next(collationSource
->iterator
, collationSource
->writableBuffer
,
1303 (int32_t)collationSource
->writableBufSize
, UNORM_FCD
, 0, TRUE
, &wasNormalized
, &status
);
1305 // Terminate the buffer - we already checked that it is big enough
1306 collationSource
->writableBuffer
[normLen
] = 0;
1307 if(collationSource
->writableBuffer
!= collationSource
->stackWritableBuffer
) {
1308 collationSource
->flags
|= UCOL_ITER_ALLOCATED
;
/* Switch the iterate over to the normalized text in the writable buffer. */
1310 collationSource
->pos
= collationSource
->writableBuffer
;
1311 collationSource
->origFlags
= collationSource
->flags
;
1312 collationSource
->flags
|= UCOL_ITER_INNORMBUF
;
1313 collationSource
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
1317 /* Incremental FCD check and normalize */
1318 /* Called from getNextCE when normalization state is suspect. */
1319 /* When entering, the state is known to be this: */
1320 /* o We are working in the main buffer of the collIterate, not the side */
1321 /* writable buffer. When in the side buffer, normalization mode is always off, */
1322 /* so we won't get here. */
1323 /* o The leading combining class from the current character is 0 or */
1324 /* the trailing combining class of the previous char was zero. */
1325 /* True because the previous call to this function will have always exited */
1326 /* that way, and we get called for every char where cc might be non-zero. */
1328 inline UBool
collIterFCD(collIterate
*collationSource
) {
1330 const UChar
*srcP
, *endP
;
1332 uint8_t prevTrailingCC
= 0;
1334 UBool needNormalize
= FALSE
;
1336 srcP
= collationSource
->pos
-1;
1338 if (collationSource
->flags
& UCOL_ITER_HASLEN
) {
1339 endP
= collationSource
->endp
;
1344 // Get the trailing combining class of the current character. If it's zero,
1348 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1350 if (U16_IS_LEAD(c
)) {
1351 if ((endP
== NULL
|| srcP
!= endP
) && U16_IS_TRAIL(c2
=*srcP
)) {
1353 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c2
);
1359 prevTrailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1361 if (prevTrailingCC
!= 0) {
1362 // The current char has a non-zero trailing CC. Scan forward until we find
1363 // a char with a leading cc of zero.
1364 while (endP
== NULL
|| srcP
!= endP
)
1366 const UChar
*savedSrcP
= srcP
;
1370 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1371 if (fcd
!= 0 && U16_IS_LEAD(c
)) {
1372 if ((endP
== NULL
|| srcP
!= endP
) && U16_IS_TRAIL(c2
=*srcP
)) {
1374 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c2
);
1379 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1380 if (leadingCC
== 0) {
1381 srcP
= savedSrcP
; // Hit char that is not part of combining sequence.
1382 // back up over it. (Could be surrogate pair!)
1386 if (leadingCC
< prevTrailingCC
) {
1387 needNormalize
= TRUE
;
1390 prevTrailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1395 collationSource
->fcdPosition
= (UChar
*)srcP
;
1397 return needNormalize
;
1400 /****************************************************************************/
1401 /* Following are the CE retrieval functions */
1403 /****************************************************************************/
1405 static uint32_t getImplicit(UChar32 cp
, collIterate
*collationSource
);
1406 static uint32_t getPrevImplicit(UChar32 cp
, collIterate
*collationSource
);
1408 /* there should be a macro version of this function in the header file */
1409 /* This is the first function that tries to fetch a collation element */
1410 /* If it's not successful or it encounters a more difficult situation */
1411 /* some more sophisticated and slower functions are invoked */
1413 inline uint32_t ucol_IGetNextCE(const UCollator
*coll
, collIterate
*collationSource
, UErrorCode
*status
) {
1415 if (collationSource
->CEpos
> collationSource
->toReturn
) { /* Are there any CEs from previous expansions? */
1416 order
= *(collationSource
->toReturn
++); /* if so, return them */
1417 if(collationSource
->CEpos
== collationSource
->toReturn
) {
1418 collationSource
->CEpos
= collationSource
->toReturn
= collationSource
->CEs
;
1425 for (;;) /* Loop handles case when incremental normalize switches */
1426 { /* to or from the side buffer / original string, and we */
1427 /* need to start again to get the next character. */
1429 if ((collationSource
->flags
& (UCOL_ITER_HASLEN
| UCOL_ITER_INNORMBUF
| UCOL_ITER_NORM
| UCOL_HIRAGANA_Q
| UCOL_USE_ITERATOR
)) == 0)
1431 // The source string is null terminated and we're not working from the side buffer,
1432 // and we're not normalizing. This is the fast path.
1433 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1434 ch
= *collationSource
->pos
++;
1439 return UCOL_NO_MORE_CES
;
1443 if (collationSource
->flags
& UCOL_ITER_HASLEN
) {
1444 // Normal path for strings when length is specified.
1445 // (We can't be in side buffer because it is always null terminated.)
1446 if (collationSource
->pos
>= collationSource
->endp
) {
1447 // Ran off of the end of the main source string. We're done.
1448 return UCOL_NO_MORE_CES
;
1450 ch
= *collationSource
->pos
++;
1452 else if(collationSource
->flags
& UCOL_USE_ITERATOR
) {
1453 UChar32 iterCh
= collationSource
->iterator
->next(collationSource
->iterator
);
1454 if(iterCh
== U_SENTINEL
) {
1455 return UCOL_NO_MORE_CES
;
1461 // Null terminated string.
1462 ch
= *collationSource
->pos
++;
1464 // Ran off end of buffer.
1465 if ((collationSource
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1466 // Ran off end of main string. backing up one character.
1467 collationSource
->pos
--;
1468 return UCOL_NO_MORE_CES
;
1472 // Hit null in the normalize side buffer.
1473 // Usually this means the end of the normalized data,
1474 // except for one odd case: a null followed by combining chars,
1475 // which is the case if we are at the start of the buffer.
1476 if (collationSource
->pos
== collationSource
->writableBuffer
+1) {
1480 // Null marked end of side buffer.
1481 // Revert to the main string and
1482 // loop back to top to try again to get a character.
1483 collationSource
->pos
= collationSource
->fcdPosition
;
1484 collationSource
->flags
= collationSource
->origFlags
;
1490 if(collationSource
->flags
&UCOL_HIRAGANA_Q
) {
1491 if((ch
>=0x3040 && ch
<=0x3094) || ch
== 0x309d || ch
== 0x309e) {
1492 collationSource
->flags
|= UCOL_WAS_HIRAGANA
;
1494 collationSource
->flags
&= ~UCOL_WAS_HIRAGANA
;
1498 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1499 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1500 if ((collationSource
->flags
& UCOL_ITER_NORM
) == 0) {
1504 if (collationSource
->fcdPosition
>= collationSource
->pos
) {
1505 // An earlier FCD check has already covered the current character.
1506 // We can go ahead and process this char.
1510 if (ch
< ZERO_CC_LIMIT_
) {
1511 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1515 if (ch
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1516 // We need to peek at the next character in order to tell if we are FCD
1517 if ((collationSource
->flags
& UCOL_ITER_HASLEN
) && collationSource
->pos
>= collationSource
->endp
) {
1518 // We are at the last char of source string.
1519 // It is always OK for FCD check.
1523 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1524 if (*collationSource
->pos
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1530 // Need a more complete FCD check and possible normalization.
1531 if (collIterFCD(collationSource
)) {
1532 collIterNormalize(collationSource
);
1534 if ((collationSource
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1535 // No normalization was needed. Go ahead and process the char we already had.
1539 // Some normalization happened. Next loop iteration will pick up a char
1540 // from the normalization buffer.
1546 /* For latin-1 characters we never need to fall back to the UCA table */
1547 /* because all of the UCA data is replicated in the latinOneMapping array */
1548 order
= coll
->latinOneMapping
[ch
];
1549 if (order
> UCOL_NOT_FOUND
) {
1550 order
= ucol_prv_getSpecialCE(coll
, ch
, order
, collationSource
, status
);
1555 order
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, ch
);
1556 if(order
> UCOL_NOT_FOUND
) { /* if a CE is special */
1557 order
= ucol_prv_getSpecialCE(coll
, ch
, order
, collationSource
, status
); /* and try to get the special CE */
1559 if(order
== UCOL_NOT_FOUND
&& coll
->UCA
) { /* We couldn't find a good CE in the tailoring */
1560 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1561 order
= UTRIE_GET32_FROM_LEAD(&coll
->UCA
->mapping
, ch
);
1563 if(order
> UCOL_NOT_FOUND
) { /* UCA also gives us a special CE */
1564 order
= ucol_prv_getSpecialCE(coll
->UCA
, ch
, order
, collationSource
, status
);
1568 if(order
== UCOL_NOT_FOUND
) {
1569 order
= getImplicit(ch
, collationSource
);
1571 return order
; /* return the CE */
1574 /* ucol_getNextCE, out-of-line version for use from other files. */
/*
 * Exported, out-of-line entry point that simply forwards its three arguments
 * to the inline ucol_IGetNextCE() and returns that call's result.
 */
1575 U_CAPI
uint32_t U_EXPORT2
1576 ucol_getNextCE(const UCollator
*coll
, collIterate
*collationSource
, UErrorCode
*status
) {
1577 return ucol_IGetNextCE(coll
, collationSource
, status
);
1582 * Incremental previous normalization happens here. Pick up the range of chars
1583 * identified by FCD, normalize it into the collIterate's writable buffer,
1584 * switch the collIterate's state to use the writable buffer.
1585 * @param data collation iterator data
/*
 * Backwards-iteration normalization step.
 * The range to normalize ends at data->pos (inclusive, hence the "+ 1" in
 * the length) and starts at data->string when fcdPosition is NULL, otherwise
 * at fcdPosition + 1.
 * A first unorm_normalize() call with capacity 0 only measures the NFD
 * length. If the writable buffer is not larger than that length it is
 * released and a buffer of normLen + 1 UChars is allocated.
 * The normalized text is then written so that it ends at the end of the
 * buffer, with a 0 stored immediately in front of it, and data->pos is set
 * to the end of the buffer (writableBuffer + writableBufSize).
 * Finally flags are saved in origFlags, UCOL_ITER_INNORMBUF is set and
 * UCOL_ITER_NORM and UCOL_ITER_HASLEN are cleared.
 * NOTE(review): declarations of pStart, pStartNorm and normLen, the tail of
 * the uprv_malloc() expression and the NULL-allocation exit are on lines
 * missing from this listing.
 */
1588 void collPrevIterNormalize(collIterate
*data
)
1590 UErrorCode status
= U_ZERO_ERROR
;
1591 UChar
*pEnd
= data
->pos
; /* End normalize + 1 */
1596 /* Start normalize */
1597 if (data
->fcdPosition
== NULL
) {
1598 pStart
= data
->string
;
1601 pStart
= data
->fcdPosition
+ 1;
1604 normLen
= unorm_normalize(pStart
, (pEnd
- pStart
) + 1, UNORM_NFD
, 0,
1605 data
->writableBuffer
, 0, &status
);
1607 if (data
->writableBufSize
<= normLen
) {
1608 freeHeapWritableBuffer(data
);
1609 data
->writableBuffer
= (UChar
*)uprv_malloc((normLen
+ 1) *
1611 if(data
->writableBuffer
== NULL
) { // something is wrong here, return
1614 data
->flags
|= UCOL_ITER_ALLOCATED
;
1615 /* to handle the zero termination */
1616 data
->writableBufSize
= normLen
+ 1;
1618 status
= U_ZERO_ERROR
;
1620 this puts the null termination infront of the normalized string instead
1623 pStartNorm
= data
->writableBuffer
+ (data
->writableBufSize
- normLen
);
1624 *(pStartNorm
- 1) = 0;
1625 unorm_normalize(pStart
, (pEnd
- pStart
) + 1, UNORM_NFD
, 0, pStartNorm
,
1628 data
->pos
= data
->writableBuffer
+ data
->writableBufSize
;
1629 data
->origFlags
= data
->flags
;
1630 data
->flags
|= UCOL_ITER_INNORMBUF
;
1631 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
1636 * Incremental FCD check for previous iteration and normalize. Called from
1637 * getPrevCE when normalization state is suspect.
1638 * When entering, the state is known to be this:
1639 * o We are working in the main buffer of the collIterate, not the side
1640 * writable buffer. When in the side buffer, normalization mode is always
1641 * off, so we won't get here.
1642 * o The leading combining class from the current character is 0 or the
1643 * trailing combining class of the previous char was zero.
1644 * True because the previous call to this function will have always exited
1645 * that way, and we get called for every char where cc might be non-zero.
1646 * @param data collation iterate struct
1647 * @return normalization status, TRUE for normalization to be done, FALSE
1651 inline UBool
collPrevIterFCD(collIterate
*data
)
1653 const UChar
*src
, *start
;
1656 uint8_t trailingCC
= 0;
1658 UBool result
= FALSE
;
1660 start
= data
->string
;
1661 src
= data
->pos
+ 1;
1663 /* Get the trailing combining class of the current character. */
1665 if (!U16_IS_SURROGATE(c
)) {
1666 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1667 } else if (U16_IS_TRAIL(c
) && start
< src
&& U16_IS_LEAD(c2
= *(src
- 1))) {
1669 fcd
= unorm_getFCD16(fcdTrieIndex
, c2
);
1671 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c
);
1673 } else /* unpaired surrogate */ {
1677 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1679 if (leadingCC
!= 0) {
1681 The current char has a non-zero leading combining class.
1682 Scan backward until we find a char with a trailing cc of zero.
1687 data
->fcdPosition
= NULL
;
1692 if (!U16_IS_SURROGATE(c
)) {
1693 fcd
= unorm_getFCD16(fcdTrieIndex
, c
);
1694 } else if (U16_IS_TRAIL(c
) && start
< src
&& U16_IS_LEAD(c2
= *(src
- 1))) {
1696 fcd
= unorm_getFCD16(fcdTrieIndex
, c2
);
1698 fcd
= unorm_getFCD16FromSurrogatePair(fcdTrieIndex
, fcd
, c
);
1700 } else /* unpaired surrogate */ {
1704 trailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1706 if (trailingCC
== 0) {
1710 if (leadingCC
< trailingCC
) {
1714 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1718 data
->fcdPosition
= (UChar
*)src
;
1723 /** gets a character from the string at a given offset
1724 * Handles both normal and iterative cases.
1725 * No error checking - caller beware!
/*
 * Returns the code unit located `offset` positions away from the current
 * position of `source` without (net) moving it.
 *  - When source->pos is non-NULL the value is read directly from
 *    *(source->pos + offset); no bounds check is made.
 *  - Otherwise, when source->iterator is non-NULL, the visible lines either
 *    move the iterator by `offset`, read one unit with next() and move back
 *    by -offset-1, or return the iterator's current() unit.
 *    NOTE(review): the condition choosing between those two iterator paths
 *    is on a line missing from this listing.
 *  - With neither a pointer nor an iterator, (UChar)U_SENTINEL is returned.
 */
1728 UChar
peekCharacter(collIterate
*source
, int32_t offset
) {
1729 if(source
->pos
!= NULL
) {
1730 return *(source
->pos
+ offset
);
1731 } else if(source
->iterator
!= NULL
) {
1733 source
->iterator
->move(source
->iterator
, offset
, UITER_CURRENT
);
1734 UChar toReturn
= (UChar
)source
->iterator
->next(source
->iterator
);
1735 source
->iterator
->move(source
->iterator
, -offset
-1, UITER_CURRENT
);
1738 return (UChar
)source
->iterator
->current(source
->iterator
);
1741 return (UChar
)U_SENTINEL
;
1746 * Determines if we are at the start of the data string in the backwards
1747 * collation iterator
1748 * @param data collation iterator
1749 * @return TRUE if we are at the start
/*
 * Tells whether the backwards iteration has reached the start of its data.
 *  - Iterator mode (data->pos is NULL and data->iterator is set): true when
 *    the iterator reports that it has no previous element.
 *  - Pointer mode: true when data->pos equals data->string, or when the
 *    iterate is in the normalization buffer (UCOL_ITER_INNORMBUF), the unit
 *    just before pos is 0 and fcdPosition is NULL.
 */
1752 inline UBool
isAtStartPrevIterate(collIterate
*data
) {
1753 if(data
->pos
== NULL
&& data
->iterator
!= NULL
) {
1754 return !data
->iterator
->hasPrevious(data
->iterator
);
1756 //return (collIter_bos(data)) ||
1757 return (data
->pos
== data
->string
) ||
1758 ((data
->flags
& UCOL_ITER_INNORMBUF
) &&
1759 *(data
->pos
- 1) == 0 && data
->fcdPosition
== NULL
);
/*
 * Steps the collation iterate back by one position.
 * The visible lines call data->iterator->previous(), once guarded only by
 * data->iterator being non-NULL and once additionally requiring the
 * UCOL_USE_ITERATOR flag.
 * NOTE(review): several lines of this body are missing from this listing
 * (including anything that adjusts data->pos, and any preprocessor guards
 * around the first call), so which of the two calls is actually compiled
 * cannot be confirmed from here.
 */
1763 inline void goBackOne(collIterate
*data
) {
1765 // somehow, it looks like we need to keep iterator synced up
1766 // at all times, as above.
1770 if(data
->iterator
) {
1771 data
->iterator
->previous(data
->iterator
);
1774 if(data
->iterator
&& (data
->flags
& UCOL_USE_ITERATOR
)) {
1775 data
->iterator
->previous(data
->iterator
);
1783 * Inline function that gets a simple CE.
1784 * So what it does is that it will first check the expansion buffer. If the
1785 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1786 * is different from the string pointer, we return the collation element at the
1787 * return pointer and decrement it.
1788 * For more complicated CEs it resorts to getComplicatedCE.
1789 * @param coll collator data
1790 * @param data collation iterator struct
1791 * @param status error status
1794 inline uint32_t ucol_IGetPrevCE(const UCollator
*coll
, collIterate
*data
,
1797 uint32_t result
= (uint32_t)UCOL_NULLORDER
;
1798 if (data
->toReturn
> data
->CEs
) {
1800 result
= *(data
->toReturn
);
1801 if (data
->CEs
== data
->toReturn
) {
1802 data
->CEpos
= data
->toReturn
;
1808 Loop handles case when incremental normalize switches to or from the
1809 side buffer / original string, and we need to start again to get the
1813 if (data
->flags
& UCOL_ITER_HASLEN
) {
1815 Normal path for strings when length is specified.
1816 Not in side buffer because it is always null terminated.
1818 if (data
->pos
<= data
->string
) {
1819 /* End of the main source string */
1820 return UCOL_NO_MORE_CES
;
1825 // we are using an iterator to go back. Pray for us!
1826 else if (data
->flags
& UCOL_USE_ITERATOR
) {
1827 UChar32 iterCh
= data
->iterator
->previous(data
->iterator
);
1828 if(iterCh
== U_SENTINEL
) {
1829 return UCOL_NO_MORE_CES
;
1837 /* we are in the side buffer. */
1840 At the start of the normalize side buffer.
1842 Because pointer points to the last accessed character,
1843 hence we have to increment it by one here.
1845 if (data
->fcdPosition
== NULL
) {
1846 data
->pos
= data
->string
;
1847 return UCOL_NO_MORE_CES
;
1850 data
->pos
= data
->fcdPosition
+ 1;
1852 data
->flags
= data
->origFlags
;
1857 if(data
->flags
&UCOL_HIRAGANA_Q
) {
1858 if(ch
>=0x3040 && ch
<=0x309f) {
1859 data
->flags
|= UCOL_WAS_HIRAGANA
;
1861 data
->flags
&= ~UCOL_WAS_HIRAGANA
;
1866 * got a character to determine if there's fcd and/or normalization
1868 * if the current character is not fcd.
1869 * if current character is at the start of the string
1870 * Trailing combining class == 0.
1871 * Note if pos is in the writablebuffer, norm is always 0
1873 if (ch
< ZERO_CC_LIMIT_
||
1874 // this should propel us out of the loop in the iterator case
1875 (data
->flags
& UCOL_ITER_NORM
) == 0 ||
1876 (data
->fcdPosition
!= NULL
&& data
->fcdPosition
<= data
->pos
)
1877 || data
->string
== data
->pos
) {
1881 if (ch
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1882 /* if next character is FCD */
1883 if (data
->pos
== data
->string
) {
1884 /* First char of string is always OK for FCD check */
1888 /* Not first char of string, do the FCD fast test */
1889 if (*(data
->pos
- 1) < NFC_ZERO_CC_BLOCK_LIMIT_
) {
1894 /* Need a more complete FCD check and possible normalization. */
1895 if (collPrevIterFCD(data
)) {
1896 collPrevIterNormalize(data
);
1899 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1900 /* No normalization. Go ahead and process the char. */
1905 Some normalization happened.
1906 Next loop picks up a char from the normalization buffer.
1910 /* attempt to handle contractions, after removal of the backwards
1913 if (ucol_contractionEndCP(ch
, coll
) && !isAtStartPrevIterate(data
)) {
1914 result
= ucol_prv_getSpecialPrevCE(coll
, ch
, UCOL_CONTRACTION
, data
, status
);
1917 result
= coll
->latinOneMapping
[ch
];
1920 result
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, ch
);
1922 if (result
> UCOL_NOT_FOUND
) {
1923 result
= ucol_prv_getSpecialPrevCE(coll
, ch
, result
, data
, status
);
1925 if (result
== UCOL_NOT_FOUND
) { // Not found in master list
1926 if (!isAtStartPrevIterate(data
) &&
1927 ucol_contractionEndCP(ch
, data
->coll
)) {
1928 result
= UCOL_CONTRACTION
;
1931 result
= UTRIE_GET32_FROM_LEAD(&coll
->UCA
->mapping
, ch
);
1935 if (result
> UCOL_NOT_FOUND
) {
1937 result
= ucol_prv_getSpecialPrevCE(coll
->UCA
, ch
, result
, data
, status
);
1942 if(result
== UCOL_NOT_FOUND
) {
1943 result
= getPrevImplicit(ch
, data
);
1950 /* ucol_getPrevCE, out-of-line version for use from other files. */
/*
 * Exported, out-of-line entry point that simply forwards its three arguments
 * to the inline ucol_IGetPrevCE() and returns that call's result.
 */
1951 U_CAPI
uint32_t U_EXPORT2
1952 ucol_getPrevCE(const UCollator
*coll
, collIterate
*data
,
1953 UErrorCode
*status
) {
1954 return ucol_IGetPrevCE(coll
, data
, status
);
1958 /* this should be connected to special Jamo handling */
/*
 * Fetches a collation element for the single code unit `u`.
 * A local collIterate is initialised over the one-unit string &u (length 1)
 * with IInit_collIterate(), and one CE is pulled from it with
 * ucol_IGetNextCE(); `status` is passed through to that call.
 * NOTE(review): the declarations of colIt and order and the return statement
 * are on lines missing from this listing; presumably `order` is returned.
 */
1959 U_CAPI
uint32_t U_EXPORT2
1960 ucol_getFirstCE(const UCollator
*coll
, UChar u
, UErrorCode
*status
) {
1963 IInit_collIterate(coll
, &u
, 1, &colIt
);
1964 order
= ucol_IGetNextCE(coll
, &colIt
, status
);
1965 /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
1970 * Inserts the argument character into the end of the buffer pushing back the
1972 * @param data collIterate struct data
1973 * @param pNull pointer to the null termination
1974 * @param ch character to be appended
1975 * @return the position of the new addition
/*
 * Single-character overload: appends `ch` at `pNull` (the position of the
 * current terminator inside data->writableBuffer) and re-terminates.
 * The first test checks whether pNull + 1 still lies inside the current
 * buffer; when it does not, a new buffer of `size` UChars is allocated, the
 * old contents (writableBufSize UChars) are copied over, the old heap buffer
 * is released and data->writableBuffer / writableBufSize are updated.
 * NOTE(review): the in-place fast path, the statement that enlarges `size`
 * by `incsize`, the store of `ch` and the return statements are on lines
 * missing from this listing.
 */
1978 inline UChar
* insertBufferEnd(collIterate
*data
, UChar
*pNull
, UChar ch
)
1980 uint32_t size
= data
->writableBufSize
;
1982 const uint32_t incsize
= 5;
1984 if ((data
->writableBuffer
+ size
) > (pNull
+ 1)) {
1991 buffer will always be null terminated at the end.
1992 giving extra space since it is likely that more characters will be added.
1995 newbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) * size
);
1996 if(newbuffer
!= NULL
) { // something wrong, but no status
1997 uprv_memcpy(newbuffer
, data
->writableBuffer
,
1998 data
->writableBufSize
* sizeof(UChar
));
2000 freeHeapWritableBuffer(data
);
2001 data
->writableBufSize
= size
;
2002 data
->writableBuffer
= newbuffer
;
/* NOTE(review): writableBufSize was just overwritten with the NEW size, so
   the pointer computed below appears to land one past the end of the fresh
   allocation (newbuffer + size) and the following store at newbuffer + 1 to
   fall outside it. The intended offset is presumably the old length or the
   offset of pNull in the old buffer - confirm and fix in the full file. */
2004 newbuffer
= newbuffer
+ data
->writableBufSize
;
2006 *(newbuffer
+ 1) = 0;
2012 * Inserts the argument string into the end of the buffer pushing back the
2014 * @param data collIterate struct data
2015 * @param pNull pointer to the null termination
2016 * @param string to be appended
2017 * @param length of the string to be appended
2018 * @return the position of the new addition
/*
 * String overload: appends `length` UChars from `str` at `pNull` (the
 * position of the current terminator inside data->writableBuffer).
 *  - If pNull + length + 1 still lies inside the current buffer, the text is
 *    copied in place and a 0 is stored after it.
 *  - Otherwise a buffer of size + length + 1 UChars is allocated, where
 *    `size` is the number of UChars in front of pNull; the old prefix and
 *    the new text are copied into it, the old heap buffer is released and
 *    data->writableBuffer / writableBufSize are updated.
 * NOTE(review): the `length` parameter declaration, the termination of the
 * newly allocated buffer and the return statements are on lines missing from
 * this listing.
 */
2021 inline UChar
* insertBufferEnd(collIterate
*data
, UChar
*pNull
, UChar
*str
,
2024 uint32_t size
= pNull
- data
->writableBuffer
;
2027 if (data
->writableBuffer
+ data
->writableBufSize
> pNull
+ length
+ 1) {
2028 uprv_memcpy(pNull
, str
, length
* sizeof(UChar
));
2029 *(pNull
+ length
) = 0;
2034 buffer will always be null terminated at the end.
2035 giving extra space since it is likely that more characters will be added.
2037 newbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) * (size
+ length
+ 1));
2038 if(newbuffer
!= NULL
) {
2039 uprv_memcpy(newbuffer
, data
->writableBuffer
, size
* sizeof(UChar
));
2040 uprv_memcpy(newbuffer
+ size
, str
, length
* sizeof(UChar
));
2042 freeHeapWritableBuffer(data
);
2043 data
->writableBufSize
= size
+ length
+ 1;
2044 data
->writableBuffer
= newbuffer
;
2051 * Special normalization function for contraction in the forwards iterator.
2052 * This normalization sequence will place the current character at source->pos
2053 * and its following normalized sequence into the buffer.
2054 * The fcd position, pos will be changed.
2055 * pos will now point to positions in the buffer.
2056 * Flags will be changed accordingly.
2057 * @param data collation iterator data
/*
 * Normalization helper for contraction handling in forward iteration.
 * Normalizes (NFD) the text from data->pos - 1 up to data->fcdPosition and
 * appends it to whatever is already held in data->writableBuffer:
 *  - when not yet in the normalization buffer, the buffer is first seeded
 *    with the code unit at pStart - 1; otherwise the existing buffer length
 *    is measured with u_strlen();
 *  - a first unorm_normalize() call with capacity 0 measures normLen;
 *  - if the buffer cannot hold strsize + normLen plus a terminator, a new
 *    buffer is allocated, the existing prefix is copied and the iterate is
 *    marked UCOL_ITER_ALLOCATED;
 *  - the second unorm_normalize() call writes the text plus terminator.
 * Afterwards pos points at writableBuffer + strsize, flags are saved in
 * origFlags, UCOL_ITER_INNORMBUF is set and UCOL_ITER_NORM and
 * UCOL_ITER_HASLEN are cleared.
 * NOTE(review): declarations of strsize, pEnd, normLen and pStartNorm and
 * the branch structure around the strsize assignments are on lines missing
 * from this listing.
 */
2060 inline void normalizeNextContraction(collIterate
*data
)
2062 UChar
*buffer
= data
->writableBuffer
;
2063 uint32_t buffersize
= data
->writableBufSize
;
2065 UErrorCode status
= U_ZERO_ERROR
;
2066 /* because the pointer points to the next character */
2067 UChar
*pStart
= data
->pos
- 1;
2072 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
2073 *data
->writableBuffer
= *(pStart
- 1);
2077 strsize
= u_strlen(data
->writableBuffer
);
2080 pEnd
= data
->fcdPosition
;
2082 normLen
= unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, buffer
, 0,
2085 if (buffersize
<= normLen
+ strsize
) {
2086 uint32_t size
= strsize
+ normLen
+ 1;
2087 UChar
*temp
= (UChar
*)uprv_malloc(size
* sizeof(UChar
));
/* NOTE(review): no NULL check on temp is visible before this copy; a guard
   may sit on a line missing from this listing - confirm. */
2089 uprv_memcpy(temp
, buffer
, sizeof(UChar
) * strsize
);
2090 freeHeapWritableBuffer(data
);
2091 data
->writableBuffer
= temp
;
2092 data
->writableBufSize
= size
;
2093 data
->flags
|= UCOL_ITER_ALLOCATED
;
2097 status
= U_ZERO_ERROR
;
/* NOTE(review): `buffer` was captured from data->writableBuffer at function
   entry. No visible line refreshes it after the reallocation above, so the
   destination computed here looks like it still points into the OLD buffer
   (possibly already released), while data->pos below is derived from the new
   data->writableBuffer. Presumably this should be
   data->writableBuffer + strsize - confirm against the full file. */
2098 pStartNorm
= buffer
+ strsize
;
2099 /* null-termination will be added here */
2100 unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, pStartNorm
,
2101 normLen
+ 1, &status
);
2103 data
->pos
= data
->writableBuffer
+ strsize
;
2104 data
->origFlags
= data
->flags
;
2105 data
->flags
|= UCOL_ITER_INNORMBUF
;
2106 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
2110 * Contraction character management function that returns the next character
2111 * for the forwards iterator.
2112 * Does nothing if the next character is in buffer and not the first character
2114 * Else it checks next character in data string to see if it is normalizable.
2115 * If it is not, the character is simply copied into the buffer, else
2116 * the whole normalized substring is copied into the buffer, including the
2117 * current character.
2118 * @param data collation element iterator data
2119 * @return next character
2122 inline UChar
getNextNormalizedChar(collIterate
*data
)
2126 // Here we need to add the iterator code. One problem is the way
2127 // end of string is handled. If we just return next char, it could
2128 // be the sentinel. Most of the cases already check for this, but we
2130 if ((data
->flags
& (UCOL_ITER_NORM
| UCOL_ITER_INNORMBUF
)) == 0 ) {
2131 /* if no normalization and not in buffer. */
2132 if(data
->flags
& UCOL_USE_ITERATOR
) {
2133 return (UChar
)data
->iterator
->next(data
->iterator
);
2135 return *(data
->pos
++);
2139 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2140 //normalizeIterator(data);
2143 UChar
*pEndWritableBuffer
= NULL
;
2144 UBool innormbuf
= (UBool
)(data
->flags
& UCOL_ITER_INNORMBUF
);
2145 if ((innormbuf
&& *data
->pos
!= 0) ||
2146 (data
->fcdPosition
!= NULL
&& !innormbuf
&&
2147 data
->pos
< data
->fcdPosition
)) {
2149 if next character is in normalized buffer, no further normalization
2152 return *(data
->pos
++);
2155 if (data
->flags
& UCOL_ITER_HASLEN
) {
2156 /* in data string */
2157 if (data
->pos
+ 1 == data
->endp
) {
2158 return *(data
->pos
++);
2163 // inside the normalization buffer, but at the end
2164 // (since we encountered zero). This means, in the
2165 // case we're using char iterator, that we need to
2166 // do another round of normalization.
2167 //if(data->origFlags & UCOL_USE_ITERATOR) {
2168 // we need to restore original flags,
2169 // otherwise, we'll lose them
2170 //data->flags = data->origFlags;
2171 //normalizeIterator(data);
2172 //return *(data->pos++);
2175 in writable buffer, at this point fcdPosition can not be
2176 pointing to the end of the data string. see contracting tag.
2178 if(data
->fcdPosition
) {
2179 if (*(data
->fcdPosition
+ 1) == 0 ||
2180 data
->fcdPosition
+ 1 == data
->endp
) {
2181 /* at the end of the string, dump it into the normalizer */
2182 data
->pos
= insertBufferEnd(data
, data
->pos
,
2183 *(data
->fcdPosition
)) + 1;
2184 return *(data
->fcdPosition
++);
2186 pEndWritableBuffer
= data
->pos
;
2187 data
->pos
= data
->fcdPosition
;
2188 } else if(data
->origFlags
& UCOL_USE_ITERATOR
) {
2189 // if we are here, we're using a normalizing iterator.
2190 // we should just continue further.
2191 data
->flags
= data
->origFlags
;
2193 return (UChar
)data
->iterator
->next(data
->iterator
);
2198 if (*(data
->pos
+ 1) == 0) {
2199 return *(data
->pos
++);
2205 nextch
= *data
->pos
;
2208 * if the current character is not fcd.
2209 * Trailing combining class == 0.
2211 if ((data
->fcdPosition
== NULL
|| data
->fcdPosition
< data
->pos
) &&
2212 (nextch
>= NFC_ZERO_CC_BLOCK_LIMIT_
||
2213 ch
>= NFC_ZERO_CC_BLOCK_LIMIT_
)) {
2215 Need a more complete FCD check and possible normalization.
2216 normalize substring will be appended to buffer
2218 if (collIterFCD(data
)) {
2219 normalizeNextContraction(data
);
2220 return *(data
->pos
++);
2222 else if (innormbuf
) {
2223 /* fcdposition shifted even when there's no normalization, if we
2224 don't input the rest into this, we'll get the wrong position when
2225 we reach the end of the writableBuffer */
2226 int32_t length
= data
->fcdPosition
- data
->pos
+ 1;
2227 data
->pos
= insertBufferEnd(data
, pEndWritableBuffer
,
2228 data
->pos
- 1, length
);
2229 return *(data
->pos
++);
2235 no normalization is to be done hence only one character will be
2236 appended to the buffer.
2238 data
->pos
= insertBufferEnd(data
, pEndWritableBuffer
, ch
) + 1;
2241 /* points back to the pos in string */
2248 * Function to copy the buffer into writableBuffer and sets the fcd position to
2249 * the correct position
2250 * @param source data string source
2251 * @param buffer character buffer
2252 * @param tempdb current position in buffer that has been used up
2255 inline void setDiscontiguosAttribute(collIterate
*source
, UChar
*buffer
,
2258 /* okay confusing part here. to ensure that the skipped characters are
2259 considered later, we need to place it in the appropriate position in the
2260 normalization buffer and reassign the pos pointer. simple case if pos
2261 reside in string, simply copy to normalization buffer and
2262 fcdposition = pos, pos = start of normalization buffer. if pos in
2263 normalization buffer, we'll insert the copy infront of pos and point pos
2264 to the start of the normalization buffer. why am i doing these copies?
2265 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2266 not require any changes, which be really painful. */
2267 uint32_t length
= u_strlen(buffer
);;
2268 if (source
->flags
& UCOL_ITER_INNORMBUF
) {
2269 u_strcpy(tempdb
, source
->pos
);
2272 source
->fcdPosition
= source
->pos
;
2273 source
->origFlags
= source
->flags
;
2274 source
->flags
|= UCOL_ITER_INNORMBUF
;
2275 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
2278 if (length
>= source
->writableBufSize
) {
2279 freeHeapWritableBuffer(source
);
2280 source
->writableBuffer
=
2281 (UChar
*)uprv_malloc((length
+ 1) * sizeof(UChar
));
2282 if(source
->writableBuffer
== NULL
) {
2285 source
->writableBufSize
= length
;
2288 u_strcpy(source
->writableBuffer
, buffer
);
2289 source
->pos
= source
->writableBuffer
;
2293 * Function to get the discontiguos collation element within the source.
2294 * Note this function will set the position to the appropriate places.
2295 * @param coll current collator used
2296 * @param source data string source
2297 * @param constart index to the start character in the contraction table
2298 * @return discontiguos collation element offset
2301 uint32_t getDiscontiguous(const UCollator
*coll
, collIterate
*source
,
2302 const UChar
*constart
)
2304 /* source->pos currently points to the second combining character after
2305 the start character */
2306 UChar
*temppos
= source
->pos
;
2307 UChar buffer
[4*UCOL_MAX_BUFFER
];
2308 UChar
*tempdb
= buffer
;
2309 const UChar
*tempconstart
= constart
;
2310 uint8_t tempflags
= source
->flags
;
2311 UBool multicontraction
= FALSE
;
2312 UChar
*tempbufferpos
= 0;
2313 collIterateState discState
;
2315 backupState(source
, &discState
);
2317 //*tempdb = *(source->pos - 1);
2318 *tempdb
= peekCharacter(source
, -1);
2326 if (((source
->flags
& UCOL_ITER_HASLEN
) && source
->pos
>= source
->endp
)
2327 || (peekCharacter(source
, 0) == 0 &&
2328 //|| (*source->pos == 0 &&
2329 ((source
->flags
& UCOL_ITER_INNORMBUF
) == 0 ||
2330 source
->fcdPosition
== NULL
||
2331 source
->fcdPosition
== source
->endp
||
2332 *(source
->fcdPosition
) == 0 ||
2333 u_getCombiningClass(*(source
->fcdPosition
)) == 0)) ||
2334 /* end of string in null terminated string or stopped by a
2335 null character, note fcd does not always point to a base
2336 character after the discontiguos change */
2337 u_getCombiningClass(peekCharacter(source
, 0)) == 0) {
2338 //u_getCombiningClass(*(source->pos)) == 0) {
2339 //constart = (UChar *)coll->image + getContractOffset(CE);
2340 if (multicontraction
) {
2342 source
->pos
= temppos
- 1;
2343 setDiscontiguosAttribute(source
, buffer
, tempdb
);
2344 return *(coll
->contractionCEs
+
2345 (tempconstart
- coll
->contractionIndex
));
2347 constart
= tempconstart
;
2351 UCharOffset
= (UChar
*)(tempconstart
+ 1); /* skip the backward offset*/
2352 schar
= getNextNormalizedChar(source
);
2354 while (schar
> (tchar
= *UCharOffset
)) {
2358 if (schar
!= tchar
) {
2359 /* not the correct codepoint. we stuff the current codepoint into
2360 the discontiguos buffer and try the next character */
2366 if (u_getCombiningClass(schar
) ==
2367 u_getCombiningClass(peekCharacter(source
, -2))) {
2368 //u_getCombiningClass(*(source->pos - 2))) {
2373 result
= *(coll
->contractionCEs
+
2374 (UCharOffset
- coll
->contractionIndex
));
2378 if (result
== UCOL_NOT_FOUND
) {
2380 } else if (isContraction(result
)) {
2381 /* this is a multi-contraction*/
2382 tempconstart
= (UChar
*)coll
->image
+ getContractOffset(result
);
2383 if (*(coll
->contractionCEs
+ (constart
- coll
->contractionIndex
))
2384 != UCOL_NOT_FOUND
) {
2385 multicontraction
= TRUE
;
2386 temppos
= source
->pos
+ 1;
2387 tempbufferpos
= buffer
+ u_strlen(buffer
);
2390 setDiscontiguosAttribute(source
, buffer
, tempdb
);
2395 /* no problems simply reverting just like that,
2396 if we are in string before getting into this function, points back to
2397 string hence no problem.
2398 if we are in normalization buffer before getting into this function,
2399 since we'll never use another normalization within this function, we
2400 know that fcdposition points to a base character. the normalization buffer
2401 never change, hence this revert works. */
2402 loadState(source
, &discState
, TRUE
);
2405 //source->pos = temppos - 1;
2406 source
->flags
= tempflags
;
2407 return *(coll
->contractionCEs
+ (constart
- coll
->contractionIndex
));
2411 inline UBool
isNonChar(UChar32 cp
) {
2412 if ((cp
& 0xFFFE) == 0xFFFE || (0xFDD0 <= cp
&& cp
<= 0xFDEF) || (0xD800 <= cp
&& cp
<= 0xDFFF)) {
2418 /* now uses Mark's getImplicitPrimary code */
2420 inline uint32_t getImplicit(UChar32 cp
, collIterate
*collationSource
) {
2424 uint32_t r
= uprv_uca_getImplicitPrimary(cp
);
2425 *(collationSource
->CEpos
++) = ((r
& 0x0000FFFF)<<16) | 0x000000C0;
2426 return (r
& UCOL_PRIMARYMASK
) | 0x00000505; // This was 'order'
2430 * Inserts the argument character into the front of the buffer replacing the
2431 * front null terminator.
2432 * @param data collation element iterator data
2433 * @param pNull pointer to the null terminator
2434 * @param ch character to be appended
2435 * @return positon of added character
2438 inline UChar
* insertBufferFront(collIterate
*data
, UChar
*pNull
, UChar ch
)
2440 uint32_t size
= data
->writableBufSize
;
2443 const uint32_t incsize
= 5;
2445 if (pNull
> data
->writableBuffer
+ 1) {
2452 buffer will always be null terminated infront.
2453 giving extra space since it is likely that more characters will be added.
2456 newbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) * size
);
2457 if(newbuffer
== NULL
) {
2460 end
= newbuffer
+ incsize
;
2461 uprv_memcpy(end
, data
->writableBuffer
,
2462 data
->writableBufSize
* sizeof(UChar
));
2466 freeHeapWritableBuffer(data
);
2468 data
->writableBufSize
= size
;
2469 data
->writableBuffer
= newbuffer
;
2474 * Special normalization function for contraction in the previous iterator.
2475 * This normalization sequence will place the current character at source->pos
2476 * and its following normalized sequence into the buffer.
2477 * The fcd position, pos will be changed.
2478 * pos will now point to positions in the buffer.
2479 * Flags will be changed accordingly.
2480 * @param data collation iterator data
2483 inline void normalizePrevContraction(collIterate
*data
, UErrorCode
*status
)
2485 UChar
*buffer
= data
->writableBuffer
;
2486 uint32_t buffersize
= data
->writableBufSize
;
2487 uint32_t nulltermsize
;
2488 UErrorCode localstatus
= U_ZERO_ERROR
;
2489 UChar
*pEnd
= data
->pos
+ 1; /* End normalize + 1 */
2494 if (data
->flags
& UCOL_ITER_HASLEN
) {
2496 normalization buffer not used yet, we'll pull down the next
2497 character into the end of the buffer
2499 *(buffer
+ (buffersize
- 1)) = *(data
->pos
+ 1);
2500 nulltermsize
= buffersize
- 1;
2503 nulltermsize
= buffersize
;
2504 UChar
*temp
= buffer
+ (nulltermsize
- 1);
2505 while (*(temp
--) != 0) {
2510 /* Start normalize */
2511 if (data
->fcdPosition
== NULL
) {
2512 pStart
= data
->string
;
2515 pStart
= data
->fcdPosition
+ 1;
2518 normLen
= unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, buffer
, 0,
2521 if (nulltermsize
<= normLen
) {
2522 uint32_t size
= buffersize
- nulltermsize
+ normLen
+ 1;
2523 UChar
*temp
= (UChar
*)uprv_malloc(size
* sizeof(UChar
));
2525 *status
= U_MEMORY_ALLOCATION_ERROR
;
2528 nulltermsize
= normLen
+ 1;
2529 uprv_memcpy(temp
+ normLen
, buffer
,
2530 sizeof(UChar
) * (buffersize
- nulltermsize
));
2531 freeHeapWritableBuffer(data
);
2532 data
->writableBuffer
= temp
;
2533 data
->writableBufSize
= size
;
2537 this puts the null termination infront of the normalized string instead
2540 pStartNorm
= buffer
+ (nulltermsize
- normLen
);
2541 *(pStartNorm
- 1) = 0;
2542 unorm_normalize(pStart
, pEnd
- pStart
, UNORM_NFD
, 0, pStartNorm
, normLen
,
2545 data
->pos
= data
->writableBuffer
+ nulltermsize
;
2546 data
->origFlags
= data
->flags
;
2547 data
->flags
|= UCOL_ITER_INNORMBUF
;
2548 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
2552 * Contraction character management function that returns the previous character
2553 * for the backwards iterator.
2554 * Does nothing if the previous character is in buffer and not the first
2556 * Else it checks previous character in data string to see if it is
2558 * If it is not, the character is simply copied into the buffer, else
2559 * the whole normalized substring is copied into the buffer, including the
2560 * current character.
2561 * @param data collation element iterator data
2562 * @return previous character
2565 inline UChar
getPrevNormalizedChar(collIterate
*data
, UErrorCode
*status
)
2570 UBool innormbuf
= (UBool
)(data
->flags
& UCOL_ITER_INNORMBUF
);
2571 UChar
*pNull
= NULL
;
2572 if ((data
->flags
& (UCOL_ITER_NORM
| UCOL_ITER_INNORMBUF
)) == 0 ||
2573 (innormbuf
&& *(data
->pos
- 1) != 0)) {
2575 if no normalization.
2576 if previous character is in normalized buffer, no further normalization
2579 if(data
->flags
& UCOL_USE_ITERATOR
) {
2580 data
->iterator
->move(data
->iterator
, -1, UITER_CURRENT
);
2581 return (UChar
)data
->iterator
->next(data
->iterator
);
2583 return *(data
->pos
- 1);
2588 if (data
->flags
& UCOL_ITER_HASLEN
) {
2589 /* in data string */
2590 if ((start
- 1) == data
->string
) {
2591 return *(start
- 1);
2595 prevch
= *(start
- 1);
2599 in writable buffer, at this point fcdPosition can not be NULL.
2600 see contracting tag.
2602 if (data
->fcdPosition
== data
->string
) {
2603 /* at the start of the string, just dump it into the normalizer */
2604 insertBufferFront(data
, data
->pos
- 1, *(data
->fcdPosition
));
2605 data
->fcdPosition
= NULL
;
2606 return *(data
->pos
- 1);
2608 pNull
= data
->pos
- 1;
2609 start
= data
->fcdPosition
;
2611 prevch
= *(start
- 1);
2614 * if the current character is not fcd.
2615 * Trailing combining class == 0.
2617 if (data
->fcdPosition
> start
&&
2618 (ch
>= NFC_ZERO_CC_BLOCK_LIMIT_
|| prevch
>= NFC_ZERO_CC_BLOCK_LIMIT_
))
2621 Need a more complete FCD check and possible normalization.
2622 normalize substring will be appended to buffer
2624 UChar
*backuppos
= data
->pos
;
2626 if (collPrevIterFCD(data
)) {
2627 normalizePrevContraction(data
, status
);
2628 return *(data
->pos
- 1);
2630 data
->pos
= backuppos
;
2631 data
->fcdPosition
++;
2636 no normalization is to be done hence only one character will be
2637 appended to the buffer.
2639 insertBufferFront(data
, pNull
, ch
);
2640 data
->fcdPosition
--;
2646 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2647 /* It is called by getNextCE */
2649 uint32_t ucol_prv_getSpecialCE(const UCollator
*coll
, UChar ch
, uint32_t CE
, collIterate
*source
, UErrorCode
*status
) {
2650 collIterateState entryState
;
2651 backupState(source
, &entryState
);
2655 // This loop will repeat only in the case of contractions, and only when a contraction
2656 // is found and the first CE resulting from that contraction is itself a special
2657 // (an expansion, for example.) All other special CE types are fully handled the
2658 // first time through, and the loop exits.
2660 const uint32_t *CEOffset
= NULL
;
2661 switch(getCETag(CE
)) {
2663 /* This one is not found, and we'll let somebody else bother about it... no more games */
2666 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
2667 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
2668 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
2669 /* we return 0 (completely ignorable - per UCA specification */
2672 collIterateState state
;
2673 backupState(source
, &state
);
2674 if (collIter_eos(source
) || !(U16_IS_TRAIL((trail
= getNextNormalizedChar(source
))))) {
2675 // we chould have stepped one char forward and it might have turned that it
2676 // was not a trail surrogate. In that case, we have to backup.
2677 loadState(source
, &state
, TRUE
);
2680 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
2681 CE
= UTRIE_GET32_FROM_OFFSET_TRAIL(&coll
->mapping
, CE
&0xFFFFFF, trail
);
2682 if(CE
== UCOL_NOT_FOUND
) { // there are tailored surrogates in this block, but not this one.
2683 // We need to backup
2684 loadState(source
, &state
, TRUE
);
2687 // calculate the supplementary code point value, if surrogate was not tailored
2688 cp
= ((((uint32_t)ch
)<<10UL)+(trail
)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
2694 // Special processing is getting a CE that is preceded by a certain prefix
2695 // Currently this is only needed for optimizing Japanese length and iteration marks.
2696 // When we encouter a special processing tag, we go backwards and try to see if
2698 // Contraction tables are used - so the whole process is not unlike contraction.
2699 // prefix data is stored backwards in the table.
2700 const UChar
*UCharOffset
;
2702 collIterateState prefixState
;
2703 backupState(source
, &prefixState
);
2704 loadState(source
, &entryState
, TRUE
);
2705 goBackOne(source
); // We want to look at the point where we entered - actually one
2709 // This loop will run once per source string character, for as long as we
2710 // are matching a potential contraction sequence
2712 // First we position ourselves at the begining of contraction sequence
2713 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
2714 if (collIter_bos(source
)) {
2715 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
2718 schar
= getPrevNormalizedChar(source
, status
);
2721 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2725 if (schar
== tchar
) {
2726 // Found the source string char in the table.
2727 // Pick up the corresponding CE from the table.
2728 CE
= *(coll
->contractionCEs
+
2729 (UCharOffset
- coll
->contractionIndex
));
2733 // Source string char was not in the table.
2734 // We have not found the prefix.
2735 CE
= *(coll
->contractionCEs
+
2736 (ContractionStart
- coll
->contractionIndex
));
2740 // The source string char was in the contraction table, and the corresponding
2741 // CE is not a prefix CE. We found the prefix, break
2742 // out of loop, this CE will end up being returned. This is the normal
2743 // way out of prefix handling when the source actually contained
2748 if(CE
!= UCOL_NOT_FOUND
) { // we found something and we can merilly continue
2749 loadState(source
, &prefixState
, TRUE
);
2750 if(source
->origFlags
& UCOL_USE_ITERATOR
) {
2751 source
->flags
= source
->origFlags
;
2753 } else { // prefix search was a failure, we have to backup all the way to the start
2754 loadState(source
, &entryState
, TRUE
);
2758 case CONTRACTION_TAG
:
2760 /* This should handle contractions */
2761 collIterateState state
;
2762 backupState(source
, &state
);
2763 uint32_t firstCE
= *(coll
->contractionCEs
+ ((UChar
*)coll
->image
+getContractOffset(CE
) - coll
->contractionIndex
)); //UCOL_NOT_FOUND;
2764 const UChar
*UCharOffset
;
2768 /* This loop will run once per source string character, for as long as we */
2769 /* are matching a potential contraction sequence */
2771 /* First we position ourselves at the begining of contraction sequence */
2772 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
2774 if (collIter_eos(source
)) {
2775 // Ran off the end of the source string.
2776 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
2777 // So we'll pick whatever we have at the point...
2778 if (CE
== UCOL_NOT_FOUND
) {
2779 // back up the source over all the chars we scanned going into this contraction.
2781 loadState(source
, &state
, TRUE
);
2782 if(source
->origFlags
& UCOL_USE_ITERATOR
) {
2783 source
->flags
= source
->origFlags
;
2789 uint8_t maxCC
= (uint8_t)(*(UCharOffset
)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2790 uint8_t allSame
= (uint8_t)(*(UCharOffset
++)>>8);
2792 schar
= getNextNormalizedChar(source
);
2793 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2797 if (schar
== tchar
) {
2798 // Found the source string char in the contraction table.
2799 // Pick up the corresponding CE from the table.
2800 CE
= *(coll
->contractionCEs
+
2801 (UCharOffset
- coll
->contractionIndex
));
2805 // Source string char was not in contraction table.
2806 // Unless we have a discontiguous contraction, we have finished
2807 // with this contraction.
2808 UChar32 miss
= schar
;
2809 if(U16_IS_LEAD(schar
)) { // in order to do the proper detection, we
2810 // need to see if we're dealing with a supplementary
2811 miss
= U16_GET_SUPPLEMENTARY(schar
, getNextNormalizedChar(source
));
2817 (sCC
= i_getCombiningClass(miss
, coll
)) == 0 ||
2819 (allSame
!= 0 && sCC
== maxCC
) ||
2820 collIter_eos(source
)) {
2821 // Contraction can not be discontiguous.
2822 goBackOne(source
); // back up the source string by one,
2823 // because the character we just looked at was
2824 // not part of the contraction. */
2825 if(U_IS_SUPPLEMENTARY(miss
)) {
2828 CE
= *(coll
->contractionCEs
+
2829 (ContractionStart
- coll
->contractionIndex
));
2832 // Contraction is possibly discontiguous.
2833 // Scan more of source string looking for a match
2836 /* find the next character if schar is not a base character
2837 and we are not yet at the end of the string */
2838 tempchar
= getNextNormalizedChar(source
);
2839 // probably need another supplementary thingie here
2841 if (i_getCombiningClass(tempchar
, coll
) == 0) {
2843 if(U_IS_SUPPLEMENTARY(miss
)) {
2846 /* Spit out the last char of the string, wasn't tasty enough */
2847 CE
= *(coll
->contractionCEs
+
2848 (ContractionStart
- coll
->contractionIndex
));
2850 CE
= getDiscontiguous(coll
, source
, ContractionStart
);
2853 } // else after if(schar == tchar)
2855 if(CE
== UCOL_NOT_FOUND
) {
2856 /* The Source string did not match the contraction that we were checking. */
2857 /* Back up the source position to undo the effects of having partially */
2858 /* scanned through what ultimately proved to not be a contraction. */
2859 loadState(source
, &state
, TRUE
);
2864 if(!isContraction(CE
)) {
2865 // The source string char was in the contraction table, and the corresponding
2866 // CE is not a contraction CE. We completed the contraction, break
2867 // out of loop, this CE will end up being returned. This is the normal
2868 // way out of contraction handling when the source actually contained
2874 // The source string char was in the contraction table, and the corresponding
2875 // CE is IS a contraction CE. We will continue looping to check the source
2876 // string for the remaining chars in the contraction.
2877 uint32_t tempCE
= *(coll
->contractionCEs
+ (ContractionStart
- coll
->contractionIndex
));
2878 if(tempCE
!= UCOL_NOT_FOUND
) {
2879 // We have scanned a a section of source string for which there is a
2880 // CE from the contraction table. Remember the CE and scan position, so
2881 // that we can return to this point if further scanning fails to
2882 // match a longer contraction sequence.
2886 backupState(source
, &state
);
2887 getNextNormalizedChar(source
);
2889 // Another way to do this is:
2890 //collIterateState tempState;
2891 //backupState(source, &tempState);
2892 //goBackOne(source);
2893 //backupState(source, &state);
2894 //loadState(source, &tempState, TRUE);
2896 // The problem is that for incomplete contractions we have to remember the previous
2897 // position. Before, the only thing I needed to do was state.pos--;
2898 // After iterator introduction and especially after introduction of normalizing
2899 // iterators, it became much more difficult to decrease the saved state.
2900 // I'm not yet sure which of the two methods above is faster.
2904 } // case CONTRACTION_TAG:
2905 case LONG_PRIMARY_TAG
:
2907 *(source
->CEpos
++) = ((CE
& 0xFF)<<24)|UCOL_CONTINUATION_MARKER
;
2908 CE
= ((CE
& 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON
;
2913 /* This should handle expansion. */
2914 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2915 /* I have to decide where continuations are going to be dealt with */
2917 uint32_t i
; /* general counter */
2918 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
2919 size
= getExpansionCount(CE
);
2921 if(size
!= 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2922 for(i
= 1; i
<size
; i
++) {
2923 *(source
->CEpos
++) = *CEOffset
++;
2925 } else { /* else, we do */
2926 while(*CEOffset
!= 0) {
2927 *(source
->CEpos
++) = *CEOffset
++;
2935 We do a check to see if we want to collate digits as numbers; if so we generate
2936 a custom collation key. Otherwise we pull out the value stored in the expansion table.
2939 uint32_t i
; /* general counter */
2941 if (source
->coll
->numericCollation
== UCOL_ON
){
2942 collIterateState digitState
= {0,0,0,0,0,0,0,0};
2945 uint32_t digIndx
= 0;
2946 uint32_t endIndex
= 0;
2947 uint32_t trailingZeroIndex
= 0;
2949 uint32_t primWeight
= 0;
2952 uint8_t collateVal
= 0;
2954 UBool nonZeroValReached
= FALSE
;
2956 uint8_t *numTempBuf
;
2957 uint8_t stackNumTempBuf
[UCOL_MAX_BUFFER
]; // I just need a temporary place to store my generated CEs.
2958 uint32_t numTempBufSize
= UCOL_MAX_BUFFER
;
2960 numTempBuf
= stackNumTempBuf
;
2962 We parse the source string until we hit a char that's NOT a digit.
2963 Use this u_charDigitValue. This might be slow because we have to
2964 handle surrogates...
2967 if (U16_IS_LEAD(ch)){
2968 if (!collIter_eos(source)) {
2969 backupState(source, &digitState);
2970 UChar trail = getNextNormalizedChar(source);
2971 if(U16_IS_TRAIL(trail)) {
2972 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2974 loadState(source, &digitState, TRUE);
2983 digVal = u_charDigitValue(char32);
2985 digVal
= u_charDigitValue(cp
); // if we have arrived here, we have
2986 // already processed possible supplementaries that trigered the digit tag -
2987 // all supplementaries are marked in the UCA.
2989 We pad a zero in front of the first element anyways. This takes
2990 care of the (probably) most common case where people are sorting things followed
2995 // Make sure we have enough space.
2996 if (digIndx
>= ((numTempBufSize
- 2) * 2) + 1)
2998 numTempBufSize
*= 2;
2999 if (numTempBuf
== stackNumTempBuf
){
3000 numTempBuf
= (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize
);
3001 uprv_memcpy(numTempBuf
, stackNumTempBuf
, UCOL_MAX_BUFFER
);
3003 uprv_realloc(numTempBuf
, numTempBufSize
);
3007 // Skipping over leading zeroes.
3009 nonZeroValReached
= TRUE
;
3011 if (nonZeroValReached
) {
3013 We parse the digit string into base 100 numbers (this fits into a byte).
3014 We only add to the buffer in twos, thus if we are parsing an odd character,
3015 that serves as the 'tens' digit while the if we are parsing an even one, that
3016 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3017 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3018 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3019 than all the other bytes.
3022 if (digIndx
% 2 == 1){
3023 collateVal
+= (uint8_t)digVal
;
3025 // We don't enter the low-order-digit case unless we've already seen
3026 // the high order, or for the first digit, which is always non-zero.
3027 if (collateVal
!= 0)
3028 trailingZeroIndex
= 0;
3030 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3034 // We drop the collation value into the buffer so if we need to do
3035 // a "front patch" we don't have to check to see if we're hitting the
3037 collateVal
= (uint8_t)(digVal
* 10);
3039 // Check for trailing zeroes.
3040 if (collateVal
== 0)
3042 if (!trailingZeroIndex
)
3043 trailingZeroIndex
= (digIndx
/2) + 2;
3046 trailingZeroIndex
= 0;
3048 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3053 // Get next character.
3054 if (!collIter_eos(source
)){
3055 ch
= getNextNormalizedChar(source
);
3056 if (U16_IS_LEAD(ch
)){
3057 if (!collIter_eos(source
)) {
3058 backupState(source
, &digitState
);
3059 UChar trail
= getNextNormalizedChar(source
);
3060 if(U16_IS_TRAIL(trail
)) {
3061 char32
= U16_GET_SUPPLEMENTARY(ch
, trail
);
3063 loadState(source
, &digitState
, TRUE
);
3071 if ((digVal
= u_charDigitValue(char32
)) == -1){
3072 // Resetting position to point to the next unprocessed char. We
3073 // overshot it when doing our test/set for numbers.
3074 if (char32
> 0xFFFF) { // For surrogates.
3075 loadState(source
, &digitState
, TRUE
);
3076 //goBackOne(source);
3086 if (nonZeroValReached
== FALSE
){
3091 endIndex
= trailingZeroIndex
? trailingZeroIndex
: ((digIndx
/2) + 2) ;
3092 if (digIndx
% 2 != 0){
3094 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3095 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3096 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3097 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3100 for(i
= 2; i
< endIndex
; i
++){
3101 numTempBuf
[i
] = (((((numTempBuf
[i
] - 6)/2) % 10) * 10) +
3102 (((numTempBuf
[i
+1])-6)/2) / 10) * 2 + 6;
3107 // Subtract one off of the last byte.
3108 numTempBuf
[endIndex
-1] -= 1;
3111 We want to skip over the first two slots in the buffer. The first slot
3112 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3113 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3115 numTempBuf
[0] = UCOL_CODAN_PLACEHOLDER
;
3116 numTempBuf
[1] = (uint8_t)(0x80 + ((digIndx
/2) & 0x7F));
3118 // Now transfer the collation key to our collIterate struct.
3119 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3120 size
= ((endIndex
+1) & ~1)/2;
3121 CE
= (((numTempBuf
[0] << 8) | numTempBuf
[1]) << UCOL_PRIMARYORDERSHIFT
) | //Primary weight
3122 (UCOL_BYTE_COMMON
<< UCOL_SECONDARYORDERSHIFT
) | // Secondary weight
3123 UCOL_BYTE_COMMON
; // Tertiary weight.
3124 i
= 2; // Reset the index into the buffer.
3127 primWeight
= numTempBuf
[i
++] << 8;
3129 primWeight
|= numTempBuf
[i
++];
3130 *(source
->CEpos
++) = (primWeight
<< UCOL_PRIMARYORDERSHIFT
) | UCOL_CONTINUATION_MARKER
;
3133 if (numTempBuf
!= stackNumTempBuf
)
3134 uprv_free(numTempBuf
);
3136 // no numeric mode, we'll just switch to whatever we stashed and continue
3137 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
3143 /* various implicits optimization */
3144 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3145 case CJK_IMPLICIT_TAG
: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3146 //return getImplicit(cp, source, 0x04000000);
3147 return getImplicit(cp
, source
);
3148 case IMPLICIT_TAG
: /* everything that is not defined otherwise */
3149 /* UCA is filled with these. Tailorings are NOT_FOUND */
3150 //return getImplicit(cp, source, 0);
3151 return getImplicit(cp
, source
);
3152 case TRAIL_SURROGATE_TAG
: /* DC00-DFFF*/
3153 return 0; /* broken surrogate sequence */
3154 case LEAD_SURROGATE_TAG
: /* D800-DBFF*/
3156 if( source
->flags
& UCOL_USE_ITERATOR
) {
3157 if(U_IS_TRAIL(nextChar
= (UChar
)source
->iterator
->current(source
->iterator
))) {
3158 cp
= U16_GET_SUPPLEMENTARY(ch
, nextChar
);
3159 source
->iterator
->next(source
->iterator
);
3160 return getImplicit(cp
, source
);
3164 } else if((((source
->flags
& UCOL_ITER_HASLEN
) == 0 ) || (source
->pos
<source
->endp
)) &&
3165 U_IS_TRAIL((nextChar
=*source
->pos
))) {
3166 cp
= U16_GET_SUPPLEMENTARY(ch
, nextChar
);
3168 return getImplicit(cp
, source
);
3170 return 0; /* completely ignorable */
3172 case HANGUL_SYLLABLE_TAG
: /* AC00-D7AF*/
3175 SBase
= 0xAC00, LBase
= 0x1100, VBase
= 0x1161, TBase
= 0x11A7;
3176 //const uint32_t LCount = 19;
3177 const uint32_t VCount
= 21;
3178 const uint32_t TCount
= 28;
3179 //const uint32_t NCount = VCount * TCount; // 588
3180 //const uint32_t SCount = LCount * NCount; // 11172
3181 uint32_t L
= ch
- SBase
;
3183 // divide into pieces
3185 uint32_t T
= L
% TCount
; // we do it in this order since some compilers can do % and / in one operation
3187 uint32_t V
= L
% VCount
;
3196 // return the first CE, but first put the rest into the expansion buffer
3197 if (!source
->coll
->image
->jamoSpecial
) { // FAST PATH
3199 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, V
);
3201 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, T
);
3204 return UTRIE_GET32_FROM_LEAD(&coll
->mapping
, L
);
3206 } else { // Jamo is Special
3207 // Since Hanguls pass the FCD check, it is
3208 // guaranteed that we won't be in
3209 // the normalization buffer if something like this happens
3210 // However, if we are using a uchar iterator and normalization
3211 // is ON, the Hangul that lead us here is going to be in that
3212 // normalization buffer. Here we want to restore the uchar
3213 // iterator state and pull out of the normalization buffer
3214 if(source
->iterator
!= NULL
&& source
->flags
& UCOL_ITER_INNORMBUF
) {
3215 source
->flags
= source
->origFlags
; // restore the iterator
3218 // Move Jamos into normalization buffer
3219 source
->writableBuffer
[0] = (UChar
)L
;
3220 source
->writableBuffer
[1] = (UChar
)V
;
3222 source
->writableBuffer
[2] = (UChar
)T
;
3223 source
->writableBuffer
[3] = 0;
3225 source
->writableBuffer
[2] = 0;
3228 source
->fcdPosition
= source
->pos
; // Indicate where to continue in main input string
3229 // after exhausting the writableBuffer
3230 source
->pos
= source
->writableBuffer
;
3231 source
->origFlags
= source
->flags
;
3232 source
->flags
|= UCOL_ITER_INNORMBUF
;
3233 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
3235 return(UCOL_IGNORABLE
);
3239 /* not yet implemented */
3240 /* probably after 1.8 */
3241 return UCOL_NOT_FOUND
;
3243 *status
= U_INTERNAL_PROGRAM_ERROR
;
3247 if (CE
<= UCOL_NOT_FOUND
) break;
3253 /* now uses Mark's getImplicitPrimary code */
3255 inline uint32_t getPrevImplicit(UChar32 cp
, collIterate
*collationSource
) {
3260 uint32_t r
= uprv_uca_getImplicitPrimary(cp
);
3262 *(collationSource
->CEpos
++) = (r
& UCOL_PRIMARYMASK
) | 0x00000505;
3263 collationSource
->toReturn
= collationSource
->CEpos
;
3264 return ((r
& 0x0000FFFF)<<16) | 0x000000C0;
3268 * This function handles the special CEs like contractions, expansions,
3270 * It is called by both getPrevCE
3272 uint32_t ucol_prv_getSpecialPrevCE(const UCollator
*coll
, UChar ch
, uint32_t CE
,
3273 collIterate
*source
,
3276 const uint32_t *CEOffset
= NULL
;
3277 UChar
*UCharOffset
= NULL
;
3279 const UChar
*constart
= NULL
;
3281 UChar buffer
[UCOL_MAX_BUFFER
];
3282 uint32_t *endCEBuffer
;
3284 int32_t noChars
= 0;
3288 /* the only ces that loops are thai and contractions */
3289 switch (getCETag(CE
))
3291 case NOT_FOUND_TAG
: /* this tag always returns */
3293 case SURROGATE_TAG
: /* This is a surrogate pair */
3294 /* essentially an engaged lead surrogate. */
3295 /* if you have encountered it here, it means that a */
3296 /* broken sequence was encountered and this is an error */
3300 // Special processing is getting a CE that is preceded by a certain prefix
3301 // Currently this is only needed for optimizing Japanese length and iteration marks.
3302 // When we encounter a special processing tag, we go backwards and try to see if
3304 // Contraction tables are used - so the whole process is not unlike contraction.
3305 // prefix data is stored backwards in the table.
3306 const UChar
*UCharOffset
;
3308 collIterateState prefixState
;
3309 backupState(source
, &prefixState
);
3311 // This loop will run once per source string character, for as long as we
3312 // are matching a potential contraction sequence
3314 // First we position ourselves at the beginning of the contraction sequence
3315 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
3317 if (collIter_bos(source
)) {
3318 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
3321 schar
= getPrevNormalizedChar(source
, status
);
3324 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3328 if (schar
== tchar
) {
3329 // Found the source string char in the table.
3330 // Pick up the corresponding CE from the table.
3331 CE
= *(coll
->contractionCEs
+
3332 (UCharOffset
- coll
->contractionIndex
));
3336 // if there is a completely ignorable code point in the middle of
3337 // a prefix, we need to act as if it's not there
3338 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3339 // lone surrogates cannot be set to zero as it would break other processing
3340 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, schar
);
3341 // it's easy for BMP code points
3344 } else if(U16_IS_TRAIL(schar
) || U16_IS_LEAD(schar
)) {
3345 // for supplementary code points, we have to check the next one
3346 // situations where we are going to ignore
3347 // 1. beginning of the string: schar is a lone surrogate
3348 // 2. schar is a lone surrogate
3349 // 3. schar is a trail surrogate in a valid surrogate sequence
3350 // that is explicitly set to zero.
3351 if (!collIter_bos(source
)) {
3353 if(U16_IS_LEAD(lead
= getPrevNormalizedChar(source
, status
))) {
3354 isZeroCE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, lead
);
3355 if(getCETag(isZeroCE
) == SURROGATE_TAG
) {
3356 uint32_t finalCE
= UTRIE_GET32_FROM_OFFSET_TRAIL(&coll
->mapping
, isZeroCE
&0xFFFFFF, schar
);
3358 // this is a real, assigned completely ignorable code point
3364 // lone surrogate, completely ignorable
3368 // lone surrogate at the beginning, completely ignorable
3372 // Source string char was not in the table.
3373 // We have not found the prefix.
3374 CE
= *(coll
->contractionCEs
+
3375 (ContractionStart
- coll
->contractionIndex
));
3379 // The source string char was in the contraction table, and the corresponding
3380 // CE is not a prefix CE. We found the prefix, break
3381 // out of loop, this CE will end up being returned. This is the normal
3382 // way out of prefix handling when the source actually contained
3387 loadState(source
, &prefixState
, TRUE
);
3391 case CONTRACTION_TAG
:
3392 /* to ensure that the backwards and forwards iteration matches, we
3393 take the current region of most possible match and pass it through
3394 the forward iteration. this will ensure that the obstinate problem of
3395 overlapping contractions will not occur.
3397 schar
= peekCharacter(source
, 0);
3398 constart
= (UChar
*)coll
->image
+ getContractOffset(CE
);
3399 if (isAtStartPrevIterate(source
)
3400 /* commented away contraction end checks after adding the checks
3402 /* start of string or this is not the end of any contraction */
3403 CE
= *(coll
->contractionCEs
+
3404 (constart
- coll
->contractionIndex
));
3408 UCharOffset
= strbuffer
+ (UCOL_MAX_BUFFER
- 1);
3409 *(UCharOffset
--) = 0;
3411 // have to swap thai characters
3412 while (ucol_unsafeCP(schar
, coll
)) {
3413 *(UCharOffset
) = schar
;
3416 schar
= getPrevNormalizedChar(source
, status
);
3418 // TODO: when we exhaust the contraction buffer,
3419 // it needs to get reallocated. The problem is
3420 // that the size depends on the string which is
3421 // not iterated over. However, since we're travelling
3422 // backwards, we already had to set the iterator at
3423 // the end - so we might as well know where we are?
3424 if (UCharOffset
+ 1 == buffer
) {
3425 /* we have exhausted the buffer */
3426 int32_t newsize
= 0;
3427 if(source
->pos
) { // actually dealing with a position
3428 newsize
= source
->pos
- source
->string
+ 1;
3429 } else { // iterator
3430 newsize
= 4 * UCOL_MAX_BUFFER
;
3432 strbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) *
3433 (newsize
+ UCOL_MAX_BUFFER
));
3435 if (strbuffer
== NULL
) {
3436 *status
= U_MEMORY_ALLOCATION_ERROR
;
3437 return UCOL_NO_MORE_CES
;
3439 UCharOffset
= strbuffer
+ newsize
;
3440 uprv_memcpy(UCharOffset
, buffer
,
3441 UCOL_MAX_BUFFER
* sizeof(UChar
));
3444 if ((source
->pos
&& (source
->pos
== source
->string
||
3445 ((source
->flags
& UCOL_ITER_INNORMBUF
) &&
3446 *(source
->pos
- 1) == 0 && source
->fcdPosition
== NULL
)))
3447 || (source
->iterator
&& !source
->iterator
->hasPrevious(source
->iterator
))) {
3451 /* adds the initial base character to the string */
3452 *(UCharOffset
) = schar
;
3455 /* a new collIterate is used to simplify things, since using the current
3456 collIterate will mean that the forward and backwards iteration will
3457 share and change the same buffers. we don't want to get into that. */
3459 //IInit_collIterate(coll, UCharOffset, -1, &temp);
3460 IInit_collIterate(coll
, UCharOffset
, noChars
, &temp
);
3461 temp
.flags
&= ~UCOL_ITER_NORM
;
3463 CE
= ucol_IGetNextCE(coll
, &temp
, status
);
3464 endCEBuffer
= source
->CEs
+ UCOL_EXPAND_CE_BUFFER_SIZE
;
3465 while (CE
!= UCOL_NO_MORE_CES
) {
3466 *(source
->CEpos
++) = CE
;
3467 if (source
->CEpos
== endCEBuffer
) {
3468 /* ran out of CE space, bail.
3469 there's no guarantee of the right character position after
3471 *status
= U_BUFFER_OVERFLOW_ERROR
;
3472 source
->CEpos
= source
->CEs
;
3473 freeHeapWritableBuffer(&temp
);
3474 if (strbuffer
!= buffer
) {
3475 uprv_free(strbuffer
);
3477 return (uint32_t)UCOL_NULLORDER
;
3479 CE
= ucol_IGetNextCE(coll
, &temp
, status
);
3481 freeHeapWritableBuffer(&temp
);
3482 if (strbuffer
!= buffer
) {
3483 uprv_free(strbuffer
);
3485 source
->toReturn
= source
->CEpos
- 1;
3486 if (source
->toReturn
== source
->CEs
) {
3487 source
->CEpos
= source
->CEs
;
3489 return *(source
->toReturn
);
3490 case LONG_PRIMARY_TAG
:
3492 *(source
->CEpos
++) = ((CE
& 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON
;
3493 *(source
->CEpos
++) = ((CE
& 0xFF)<<24)|UCOL_CONTINUATION_MARKER
;
3494 source
->toReturn
= source
->CEpos
- 1;
3495 return *(source
->toReturn
);
3497 case EXPANSION_TAG
: /* this tag always returns */
3499 This should handle expansion.
3500 NOTE: we can encounter both continuations and expansions in an expansion!
3501 I have to decide where continuations are going to be dealt with
3503 /* find the offset to expansion table */
3504 CEOffset
= (uint32_t *)coll
->image
+ getExpansionOffset(CE
);
3505 size
= getExpansionCount(CE
);
3508 if there are less than 16 elements in expansion, we don't terminate
3511 for (count
= 0; count
< size
; count
++) {
3512 *(source
->CEpos
++) = *CEOffset
++;
3517 while (*CEOffset
!= 0) {
3518 *(source
->CEpos
++) = *CEOffset
++;
3521 source
->toReturn
= source
->CEpos
- 1;
3522 // in case of one element expansion, we
3523 // want to immediately return CEpos
3524 if(source
->toReturn
== source
->CEs
) {
3525 source
->CEpos
= source
->CEs
;
3527 return *(source
->toReturn
);
3531 We do a check to see if we want to collate digits as numbers; if so we generate
3532 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3535 uint32_t i
; /* general counter */
3537 if (source
->coll
->numericCollation
== UCOL_ON
){
3538 collIterateState state
= {0,0,0,0,0,0,0,0};
3541 uint32_t digIndx
= 0;
3542 uint32_t endIndex
= 0;
3543 uint32_t leadingZeroIndex
= 0;
3544 uint32_t trailingZeroCount
= 0;
3546 uint32_t primWeight
= 0;
3549 uint8_t collateVal
= 0;
3551 UBool nonZeroValReached
= FALSE
;
3553 uint8_t *numTempBuf
;
3554 uint8_t stackNumTempBuf
[UCOL_MAX_BUFFER
]; // I just need a temporary place to store my generated CEs.
3555 uint32_t numTempBufSize
= UCOL_MAX_BUFFER
;
3557 numTempBuf
= stackNumTempBuf
;
3559 We parse the source string until we hit a char that's NOT a digit.
3560 Use this u_charDigitValue. This might be slow because we have to
3561 handle surrogates...
3564 if (U16_IS_TRAIL (ch
)){
3565 if (!collIter_bos(source
)){
3566 UChar lead
= getPrevNormalizedChar(source
, status
);
3567 if(U16_IS_LEAD(lead
)) {
3568 char32
= U16_GET_SUPPLEMENTARY(lead
,ch
);
3579 digVal
= u_charDigitValue(char32
);
3582 // Make sure we have enough space.
// The temporary digit buffer starts out as the stack array stackNumTempBuf
// and is doubled in size when digIndx reaches the limit tested below.
3583 if (digIndx
>= ((numTempBufSize
- 2) * 2) + 1)
3585 numTempBufSize
*= 2;
3586 if (numTempBuf
== stackNumTempBuf
){
// First growth: move from the stack array to a heap block and copy the
// UCOL_MAX_BUFFER bytes of the stack array into it.
// NOTE(review): the uprv_malloc result is passed straight to uprv_memcpy;
// no NULL check is visible between these two statements.
3587 numTempBuf
= (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize
);
3588 uprv_memcpy(numTempBuf
, stackNumTempBuf
, UCOL_MAX_BUFFER
);
// Later growths: the block is already on the heap.
// NOTE(review): the return value of uprv_realloc is discarded here. If the
// allocator moves the block, numTempBuf keeps pointing at the old storage;
// if it fails, the failure goes unnoticed. The result should be captured in
// a temporary, checked, and assigned back to numTempBuf.
3590 uprv_realloc(numTempBuf
, numTempBufSize
);
3593 // Skip over trailing zeroes, and keep a count of them.
3595 nonZeroValReached
= TRUE
;
3596 if (nonZeroValReached
){
3598 We parse the digit string into base 100 numbers (this fits into a byte).
3599 We only add to the buffer in twos, thus if we are parsing an odd character,
3600 that serves as the 'tens' digit while the if we are parsing an even one, that
3601 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3602 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3603 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3604 than all the other bytes.
3606 Since we're doing this in reverse, we want to put the first digit encountered into the
3607 ones place and the second digit encountered into the tens place.
3610 if ((digIndx
+ trailingZeroCount
) % 2 == 1){
3611 // High-order digit case (tens place)
3612 collateVal
+= (uint8_t)(digVal
* 10);
3614 // We cannot set leadingZeroIndex unless it has been set for the
3615 // low-order digit. Therefore, all we can do for the high-order
3616 // digit is turn it off, never on.
3617 // The only time we will have a high digit without a low is for
3618 // the very first non-zero digit, so no zero check is necessary.
3619 if (collateVal
!= 0)
3620 leadingZeroIndex
= 0;
3622 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3626 // Low-order digit case (ones place)
3627 collateVal
= (uint8_t)digVal
;
3629 // Check for leading zeroes.
3630 if (collateVal
== 0)
3632 if (!leadingZeroIndex
)
3633 leadingZeroIndex
= (digIndx
/2) + 2;
3636 leadingZeroIndex
= 0;
3638 // No need to write to buffer; the case of a last odd digit
3639 // is handled below.
3644 ++trailingZeroCount
;
3646 if (!collIter_bos(source
)){
3647 ch
= getPrevNormalizedChar(source
, status
);
3648 //goBackOne(source);
3649 if (U16_IS_TRAIL(ch
)){
3650 backupState(source
, &state
);
3651 if (!collIter_bos(source
))
3654 UChar lead
= getPrevNormalizedChar(source
, status
);
3655 if(U16_IS_LEAD(lead
)) {
3656 char32
= U16_GET_SUPPLEMENTARY(lead
,ch
);
3658 loadState(source
, &state
, FALSE
);
3666 if ((digVal
= u_charDigitValue(char32
)) == -1){
3667 if (char32
> 0xFFFF) {// For surrogates.
3668 loadState(source
, &state
, FALSE
);
3670 // Don't need to "reverse" the goBackOne call,
3671 // as this points to the next position to process..
3672 //if (char32 > 0xFFFF) // For surrogates.
3673 //getNextNormalizedChar(source);
3681 if (nonZeroValReached
== FALSE
){
3683 trailingZeroCount
= 0;
3687 if ((digIndx
+ trailingZeroCount
) % 2 != 0){
3688 numTempBuf
[((digIndx
)/2) + 2] = collateVal
*2 + 6;
3689 digIndx
+= 1; // The implicit leading zero
3691 if (trailingZeroCount
% 2 != 0){
3692 // We had to consume one trailing zero for the low digit
3693 // of the least significant byte
3694 digIndx
+= 1; // The trailing zero not in the exponent
3695 trailingZeroCount
-= 1;
3698 endIndex
= leadingZeroIndex
? leadingZeroIndex
: ((digIndx
/2) + 2) ;
3700 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3704 We want to skip over the first two slots in the buffer. The first slot
3705 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3706 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3707 The exponent must be adjusted by the number of leading zeroes, and the number of
3710 numTempBuf
[0] = UCOL_CODAN_PLACEHOLDER
;
3711 uint32_t exponent
= (digIndx
+trailingZeroCount
)/2;
3712 if (leadingZeroIndex
)
3713 exponent
-= ((digIndx
/2) + 2 - leadingZeroIndex
);
3714 numTempBuf
[1] = (uint8_t)(0x80 + (exponent
& 0x7F));
3716 // Now transfer the collation key to our collIterate struct.
3717 // The total size for our collation key is endIndex bumped up to the next largest even value divided by two.
3718 //size = ((endIndex+1) & ~1)/2;
3719 *(source
->CEpos
++) = (((numTempBuf
[0] << 8) | numTempBuf
[1]) << UCOL_PRIMARYORDERSHIFT
) | //Primary weight
3720 (UCOL_BYTE_COMMON
<< UCOL_SECONDARYORDERSHIFT
) | // Secondary weight
3721 UCOL_BYTE_COMMON
; // Tertiary weight.
3722 i
= endIndex
- 1; // Reset the index into the buffer.
3725 primWeight
= numTempBuf
[i
--] << 8;
3727 primWeight
|= numTempBuf
[i
--];
3728 *(source
->CEpos
++) = (primWeight
<< UCOL_PRIMARYORDERSHIFT
) | UCOL_CONTINUATION_MARKER
;
3730 if (numTempBuf
!= stackNumTempBuf
)
3731 uprv_free(numTempBuf
);
3733 source
->toReturn
= source
->CEpos
-1;
3734 return *(source
->toReturn
);
3737 CEOffset
= (uint32_t *)coll
->image
+ getExpansionOffset(CE
);
3742 case HANGUL_SYLLABLE_TAG
: /* AC00-D7AF*/
3745 SBase
= 0xAC00, LBase
= 0x1100, VBase
= 0x1161, TBase
= 0x11A7;
3746 //const uint32_t LCount = 19;
3747 const uint32_t VCount
= 21;
3748 const uint32_t TCount
= 28;
3749 //const uint32_t NCount = VCount * TCount; /* 588 */
3750 //const uint32_t SCount = LCount * NCount; /* 11172 */
3752 uint32_t L
= ch
- SBase
;
3755 we do it in this order since some compilers can do % and / in one
3758 uint32_t T
= L
% TCount
;
3760 uint32_t V
= L
% VCount
;
3769 return the first CE, but first put the rest into the expansion buffer
3771 if (!source
->coll
->image
->jamoSpecial
)
3773 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, L
);
3774 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, V
);
3776 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, T
);
3778 source
->toReturn
= source
->CEpos
- 1;
3779 return *(source
->toReturn
);
3781 // Since Hanguls pass the FCD check, it is
3782 // guaranteed that we won't be in
3783 // the normalization buffer if something like this happens
3784 // Move Jamos into normalization buffer
3786 Move the Jamos into the
3787 normalization buffer
3789 UChar
*tempbuffer
= source
->writableBuffer
+
3790 (source
->writableBufSize
- 1);
3793 *(tempbuffer
- 1) = (UChar
)T
;
3794 *(tempbuffer
- 2) = (UChar
)V
;
3795 *(tempbuffer
- 3) = (UChar
)L
;
3796 *(tempbuffer
- 4) = 0;
3798 *(tempbuffer
- 1) = (UChar
)V
;
3799 *(tempbuffer
- 2) = (UChar
)L
;
3800 *(tempbuffer
- 3) = 0;
3804 Indicate where to continue in main input string after exhausting
3807 if (source
->pos
== source
->string
) {
3808 source
->fcdPosition
= NULL
;
3810 source
->fcdPosition
= source
->pos
-1;
3813 source
->pos
= tempbuffer
;
3814 source
->origFlags
= source
->flags
;
3815 source
->flags
|= UCOL_ITER_INNORMBUF
;
3816 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
3818 return(UCOL_IGNORABLE
);
3821 case LEAD_SURROGATE_TAG
: /* D800-DBFF*/
3822 return 0; /* broken surrogate sequence */
3823 case TRAIL_SURROGATE_TAG
: /* DC00-DFFF*/
3828 if (isAtStartPrevIterate(source
)) {
3829 /* we are at the start of the string, wrong place to be at */
3832 if (source
->pos
!= source
->writableBuffer
) {
3833 prev
= source
->pos
- 1;
3835 prev
= source
->fcdPosition
;
3839 /* Handles Han and Supplementary characters here.*/
3840 if (U16_IS_LEAD(prevChar
)) {
3841 cp
= ((((uint32_t)prevChar
)<<10UL)+(ch
)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3844 return 0; /* completely ignorable */
3846 return getPrevImplicit(cp
, source
);
3848 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
3849 case CJK_IMPLICIT_TAG
: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3850 return getPrevImplicit(ch
, source
);
3851 case IMPLICIT_TAG
: /* everything that is not defined otherwise */
3852 return getPrevImplicit(ch
, source
);
3853 /* UCA is filled with these. Tailorings are NOT_FOUND */
3854 /* not yet implemented */
3855 case CHARSET_TAG
: /* this tag always returns */
3856 /* probably after 1.8 */
3857 return UCOL_NOT_FOUND
;
3858 default: /* this tag always returns */
3859 *status
= U_INTERNAL_PROGRAM_ERROR
;
3863 if (CE
<= UCOL_NOT_FOUND
) {
3870 /* This should really be a macro */
3871 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
/*
 * reallocateBuffer: grows a byte buffer to newSize bytes.
 *
 *   secondaries  in/out: current write position inside the buffer; it is
 *                rewritten to point at the same offset in the new storage
 *   secStart     start of the buffer currently in use
 *   second       address compared against secStart to decide whether the
 *                buffer is the caller's original one (malloc + copy) or an
 *                earlier heap block (realloc)
 *   secSize      NOTE(review): not referenced in the visible lines;
 *                presumably receives newSize - confirm
 *   newSize      requested size in bytes
 *   status       set to U_MEMORY_ALLOCATION_ERROR when allocation fails
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the new start pointer is returned, or NULL on failure.
 */
3874 uint8_t *reallocateBuffer(uint8_t **secondaries
, uint8_t *secStart
, uint8_t *second
, uint32_t *secSize
, uint32_t newSize
, UErrorCode
*status
) {
/* NOTE(review): debug trace; presumably guarded by a debug #ifdef that is not visible here */
3876 fprintf(stderr
, ".");
3878 uint8_t *newStart
= NULL
;
/* remember how many bytes are in use so the write position can be restored */
3879 uint32_t offset
= *secondaries
-secStart
;
/* buffer is still the original one: allocate fresh storage and copy the used bytes */
3881 if(secStart
==second
) {
3882 newStart
=(uint8_t*)uprv_malloc(newSize
);
3883 if(newStart
==NULL
) {
3884 *status
= U_MEMORY_ALLOCATION_ERROR
;
3887 uprv_memcpy(newStart
, secStart
, *secondaries
-secStart
);
/* otherwise (presumably the else branch) the buffer is resized in place or moved by realloc */
3889 newStart
=(uint8_t*)uprv_realloc(secStart
, newSize
);
3890 if(newStart
==NULL
) {
3891 *status
= U_MEMORY_ALLOCATION_ERROR
;
/* re-anchor the caller's write position inside the new storage */
3895 *secondaries
=newStart
+offset
;
3901 /* This should really be a macro */
3902 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
3903 /* secondaries in French */
3905 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
3915 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \
3917 while((start)<(end)) { \
3919 *(start)++ = *(end); \
3924 /****************************************************************************/
3925 /* Following are the sortkey generation functions */
3927 /****************************************************************************/
3930 * Merge two sort keys.
3931 * This is useful, for example, to combine sort keys from first and last names
3932 * to sort such pairs.
3933 * Merged sort keys consider on each collation level the first part first entirely,
3934 * then the second one.
3935 * It is possible to merge multiple sort keys by consecutively merging
3936 * another one with the intermediate result.
3938 * The length of the merge result is the sum of the lengths of the input sort keys
3941 * @param src1 the first sort key
3942 * @param src1Length the length of the first sort key, including the zero byte at the end;
3943 * can be -1 if the function is to find the length
3944 * @param src2 the second sort key
3945 * @param src2Length the length of the second sort key, including the zero byte at the end;
3946 * can be -1 if the function is to find the length
3947 * @param dest the buffer where the merged sort key is written,
3948 * can be NULL if destCapacity==0
3949 * @param destCapacity the number of bytes in the dest buffer
3950 * @return the length of the merged sort key, src1Length+src2Length-1;
3951 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
3952 * in which cases the contents of dest are undefined
3956 U_CAPI
int32_t U_EXPORT2
3957 ucol_mergeSortkeys(const uint8_t *src1
, int32_t src1Length
,
3958 const uint8_t *src2
, int32_t src2Length
,
3959 uint8_t *dest
, int32_t destCapacity
) {
3963 /* check arguments */
3964 if( src1
==NULL
|| src1Length
<-2 || src1Length
==0 || (src1Length
>0 && src1
[src1Length
-1]!=0) ||
3965 src2
==NULL
|| src2Length
<-2 || src2Length
==0 || (src2Length
>0 && src2
[src2Length
-1]!=0) ||
3966 destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)
3968 /* error, attempt to write a zero byte and return 0 */
3969 if(dest
!=NULL
&& destCapacity
>0) {
3975 /* check lengths and capacity */
3977 src1Length
=(int32_t)uprv_strlen((const char *)src1
)+1;
3980 src2Length
=(int32_t)uprv_strlen((const char *)src2
)+1;
3983 destLength
=src1Length
+src2Length
-1;
3984 if(destLength
>destCapacity
) {
3985 /* the merged sort key does not fit into the destination */
3989 /* merge the sort keys with the same number of levels */
3990 while(*src1
!=0 && *src2
!=0) { /* while both have another level */
3991 /* copy level from src1 not including 00 or 01 */
3992 while((b
=*src1
)>=2) {
3997 /* add a 02 merge separator */
4000 /* copy level from src2 not including 00 or 01 */
4001 while((b
=*src2
)>=2) {
4006 /* if both sort keys have another level, then add a 01 level separator and continue */
4007 if(*src1
==1 && *src2
==1) {
4015 * here, at least one sort key is finished now, but the other one
4016 * might have some contents left from containing more levels;
4017 * those contents are just appended to the result
4020 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4023 /* append src2, "the other, unfinished sort key" */
4024 uprv_strcpy((char *)dest
, (const char *)src2
);
4026 /* trust that neither sort key contained illegally embedded zero bytes */
/*
 * ucol_getSortKey: public entry point that produces a sort key for a string.
 *
 *   coll          collator; its sortKeyGen function pointer does the work
 *   source        input string; when NULL the generator is not called and
 *                 keySize keeps its initial value 0
 *   sourceLength  length of source; -1 is treated as NUL-terminated by the
 *                 tracing code below (u_strlen is used to measure it)
 *   result        NOTE(review): the parameter declaration is not visible in
 *                 this excerpt; its address is passed to sortKeyGen
 *   resultLength  capacity forwarded to sortKeyGen
 *
 * The UErrorCode is a local variable: it is reported through
 * UTRACE_EXIT_STATUS only and is not handed back to the caller.
 * NOTE(review): the return statement is not visible; presumably keySize.
 */
4031 U_CAPI
int32_t U_EXPORT2
4032 ucol_getSortKey(const UCollator
*coll
,
4033 const UChar
*source
,
4034 int32_t sourceLength
,
4036 int32_t resultLength
)
4038 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY
);
/* verbose tracing only: work out the real source length so the string can be logged */
4039 if (UTRACE_LEVEL(UTRACE_VERBOSE
)) {
4040 int32_t actualSrcLen
= sourceLength
;
4041 if (actualSrcLen
==-1 && source
!=NULL
) {
4042 actualSrcLen
= u_strlen(source
);
4044 UTRACE_DATA3(UTRACE_VERBOSE
, "coll=%p, source string = %vh ", coll
, source
, actualSrcLen
);
4047 UErrorCode status
= U_ZERO_ERROR
;
4048 int32_t keySize
= 0;
4050 if(source
!= NULL
) {
4051 // source == NULL is actually an error situation, but we would need to
4052 // have an error code to return it. Until we introduce a new
4053 // API, it stays like this
4055 /* this uses the function pointer that is set in updateinternalstate */
4056 /* currently, there are two funcs: */
4057 /*ucol_calcSortKey(...);*/
4058 /*ucol_calcSortKeySimpleTertiary(...);*/
/* FALSE: the generator is asked to fill the caller's buffer rather than allocate one */
4060 keySize
= coll
->sortKeyGen(coll
, source
, sourceLength
, &result
, resultLength
, FALSE
, &status
);
4061 //((UCollator *)coll)->errorCode = status; /*semantically const */
4063 UTRACE_DATA2(UTRACE_VERBOSE
, "Sort Key = %vb", result
, keySize
);
4064 UTRACE_EXIT_STATUS(status
);
4068 /* this function is called by the C++ API for sortkey generation */
/*
 * ucol_getSortKeyWithAllocation: thin wrapper that forwards to the collator's
 * sortKeyGen function pointer with a result capacity of 0 and the allocation
 * flag set to TRUE, so the generator is asked to allocate the key buffer.
 *
 *   coll          collator whose sortKeyGen is invoked
 *   source        input string
 *   sourceLength  length of source, forwarded unchanged
 *   pResult       NOTE(review): declaration not visible in this excerpt;
 *                 forwarded as the result-buffer argument
 *   pErrorCode    error code forwarded to, and filled by, sortKeyGen
 *
 * Returns whatever sortKeyGen returns.
 * NOTE(review): who frees the allocated buffer is not shown here - confirm
 * against the caller.
 */
4070 ucol_getSortKeyWithAllocation(const UCollator
*coll
,
4071 const UChar
*source
, int32_t sourceLength
,
4073 UErrorCode
*pErrorCode
) {
4075 return coll
->sortKeyGen(coll
, source
, sourceLength
, pResult
, 0, TRUE
, pErrorCode
);
4078 #define UCOL_FSEC_BUF_SIZE 256
4080 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
4081 /* or if we run out of space while making a sortkey and want to return ASAP */
4082 int32_t ucol_getSortKeySize(const UCollator
*coll
, collIterate
*s
, int32_t currentSize
, UColAttributeValue strength
, int32_t len
) {
4083 UErrorCode status
= U_ZERO_ERROR
;
4084 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4085 uint8_t compareSec
= (uint8_t)((strength
>= UCOL_SECONDARY
)?0:0xFF);
4086 uint8_t compareTer
= (uint8_t)((strength
>= UCOL_TERTIARY
)?0:0xFF);
4087 uint8_t compareQuad
= (uint8_t)((strength
>= UCOL_QUATERNARY
)?0:0xFF);
4088 UBool compareIdent
= (strength
== UCOL_IDENTICAL
);
4089 UBool doCase
= (coll
->caseLevel
== UCOL_ON
);
4090 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
4091 //UBool qShifted = shifted && (compareQuad == 0);
4092 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && (compareQuad
== 0);
4093 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && (compareSec
== 0);
4094 uint8_t fSecsBuff
[UCOL_FSEC_BUF_SIZE
];
4095 uint8_t *fSecs
= fSecsBuff
;
4096 uint32_t fSecsLen
= 0, fSecsMaxLen
= UCOL_FSEC_BUF_SIZE
;
4097 uint8_t *frenchStartPtr
= NULL
, *frenchEndPtr
= NULL
;
4099 uint32_t variableTopValue
= coll
->variableTopValue
;
4100 uint8_t UCOL_COMMON_BOT4
= (uint8_t)((coll
->variableTopValue
>>8)+1);
4103 /* allocate one more space for hiragana */
4105 uint8_t UCOL_BOT_COUNT4
= (uint8_t)(0xFF - UCOL_COMMON_BOT4
);
4107 uint32_t order
= UCOL_NO_MORE_CES
;
4108 uint8_t primary1
= 0;
4109 uint8_t primary2
= 0;
4110 uint8_t secondary
= 0;
4111 uint8_t tertiary
= 0;
4112 int32_t caseShift
= 0;
4113 uint32_t c2
= 0, c3
= 0, c4
= 0; /* variables for compression */
4115 uint8_t caseSwitch
= coll
->caseSwitch
;
4116 uint8_t tertiaryMask
= coll
->tertiaryMask
;
4117 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
4119 UBool wasShifted
= FALSE
;
4120 UBool notIsContinuation
= FALSE
;
4121 uint8_t leadPrimary
= 0;
4125 order
= ucol_IGetNextCE(coll
, s
, &status
);
4126 if(order
== UCOL_NO_MORE_CES
) {
4134 notIsContinuation
= !isContinuation(order
);
4137 if(notIsContinuation
) {
4138 tertiary
= (uint8_t)((order
& UCOL_BYTE_SIZE_MASK
));
4140 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
4142 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4143 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4144 primary1
= (uint8_t)(order
>> 8);
4147 if(shifted
&& ((notIsContinuation
&& order
<= variableTopValue
&& primary1
> 0)
4148 || (!notIsContinuation
&& wasShifted
))
4149 || (wasShifted
&& primary1
== 0)) { /* amendment to the UCA says that primary ignorables */
4150 /* and other ignorables should be removed if following a shifted code point */
4151 if(primary1
== 0) { /* if we were shifted and we got an ignorable code point */
4152 /* we should just completely ignore it */
4155 if(compareQuad
== 0) {
4157 currentSize
+= (c2
/UCOL_BOT_COUNT4
)+1;
4168 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4169 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4170 /* calculate sortkey size */
4171 if(primary1
!= UCOL_IGNORABLE
) {
4172 if(notIsContinuation
) {
4173 if(leadPrimary
== primary1
) {
4176 if(leadPrimary
!= 0) {
4179 if(primary2
== UCOL_IGNORABLE
) {
4180 /* one byter, not compressed */
4183 } else if(primary1
<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
||
4184 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4185 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4186 (primary1
> maxRegularPrimary
&& primary1
< minImplicitPrimary
)) {
4187 /* not compressible */
4190 } else { /* compress */
4191 leadPrimary
= primary1
;
4195 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4197 if(primary2
!= UCOL_IGNORABLE
) {
4203 if(secondary
> compareSec
) { /* I think that != 0 test should be != IGNORABLE */
4205 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
4209 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4210 currentSize
+= (c2
/(uint32_t)UCOL_TOP_COUNT2
)+1;
4212 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+1;
// Record this secondary weight in the French-secondary scratch buffer, which
// starts out as the stack array fSecsBuff.
4219 fSecs
[fSecsLen
++] = secondary
;
// Buffer is full: double its capacity.
4220 if(fSecsLen
== fSecsMaxLen
) {
4221 if(fSecs
== fSecsBuff
) {
// NOTE(review): when moving from the stack array to the heap, no copy of the
// fSecsLen bytes already stored in fSecsBuff is visible in this excerpt; if
// there really is none, the new block starts with indeterminate contents.
// Confirm against the full source and add a uprv_memcpy if needed.
4222 fSecs
= (uint8_t *)uprv_malloc(2*fSecsLen
);
// NOTE(review): the realloc result overwrites fSecs directly, so if the call
// fails the previous block can no longer be freed. Prefer a temporary.
4224 fSecs
= (uint8_t *)uprv_realloc(fSecs
, 2*fSecsLen
);
// Allocation failure is recorded in the local status variable.
4227 status
= U_MEMORY_ALLOCATION_ERROR
;
4232 if(notIsContinuation
) {
4233 if (frenchStartPtr
!= NULL
) {
4234 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4235 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4236 frenchStartPtr
= NULL
;
4239 if (frenchStartPtr
== NULL
) {
4240 frenchStartPtr
= fSecs
+fSecsLen
-2;
4242 frenchEndPtr
= fSecs
+fSecsLen
-1;
4247 if(doCase
&& (primary1
> 0 || strength
>= UCOL_SECONDARY
)) {
4248 // do the case level if we need to do it. We don't want to calculate
4249 // case level for primary ignorables if we have only primary strength and case level
4250 // otherwise we would break well formedness of CEs
4251 if (caseShift
== 0) {
4253 caseShift
= UCOL_CASE_SHIFT_START
;
4255 if((tertiary
&0x3F) > 0 && notIsContinuation
) {
4257 if((tertiary
&0xC0) != 0) {
4258 if (caseShift
== 0) {
4260 caseShift
= UCOL_CASE_SHIFT_START
;
4266 if(notIsContinuation
) {
4267 tertiary
^= caseSwitch
;
4271 tertiary
&= tertiaryMask
;
4272 if(tertiary
> compareTer
) { /* I think that != 0 test should be != IGNORABLE */
4273 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
4277 if((tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
)
4278 || (tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
)) {
4279 currentSize
+= (c3
/(uint32_t)coll
->tertiaryTopCount
)+1;
4281 currentSize
+= (c3
/(uint32_t)coll
->tertiaryBottomCount
)+1;
4289 if(/*qShifted*/(compareQuad
==0) && notIsContinuation
) {
4290 if(s
->flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
4291 if(c4
>0) { // Close this part
4292 currentSize
+= (c4
/UCOL_BOT_COUNT4
)+1;
4295 currentSize
++; // Add the Hiragana
4296 } else { // This wasn't Hiragana, so we can continue adding stuff
4306 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4310 if(frenchStartPtr
!= NULL
) {
4311 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4313 for(i
= 0; i
<fSecsLen
; i
++) {
4314 secondary
= *(fSecs
+fSecsLen
-i
-1);
4315 /* This is compression code. */
4316 if (secondary
== UCOL_COMMON2
) {
4320 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4321 currentSize
+= (c2
/(uint32_t)UCOL_TOP_COUNT2
)+((c2
%(uint32_t)UCOL_TOP_COUNT2
!= 0)?1:0);
4323 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4331 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4333 if(fSecs
!= fSecsBuff
) {
4339 currentSize
+= (c3
/(uint32_t)coll
->tertiaryBottomCount
) + ((c3
%(uint32_t)coll
->tertiaryBottomCount
!= 0)?1:0);
4342 if(c4
> 0 && compareQuad
== 0) {
4343 currentSize
+= (c4
/(uint32_t)UCOL_BOT_COUNT4
)+((c4
%(uint32_t)UCOL_BOT_COUNT4
!= 0)?1:0);
4347 currentSize
+= u_lengthOfIdenticalLevelRun(s
->string
, len
);
// Starts a new case-level byte when the current one is used up.
//   cases      in/out: write position in the case-level buffer
//   caseShift  in/out: shift state for the current byte; 0 means a new byte
//              has to be started
// When caseShift is 0, UCOL_CASE_BYTE_START is written at *cases, the write
// position is advanced by one, and caseShift is reset to
// UCOL_CASE_SHIFT_START. Otherwise nothing visible here is changed.
4354 inline void doCaseShift(uint8_t **cases
, uint32_t &caseShift
) {
4355 if (caseShift
== 0) {
4356 *(*cases
)++ = UCOL_CASE_BYTE_START
;
4357 caseShift
= UCOL_CASE_SHIFT_START
;
4361 // Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we
4362 // know how many values we wanted to add, even if we didn't add them all
//   primaries  in/out: write position; advanced only when a byte is stored
//   limit      one-past-the-end of the writable area; nothing is written at
//              or beyond it
//   size       in/out counter of bytes requested
//   value      byte to append
// NOTE(review): the statement that increments size is not visible in this
// excerpt; per the comment above it is presumably unconditional - confirm.
4364 inline void addWithIncrement(uint8_t *&primaries
, uint8_t *limit
, uint32_t &size
, const uint8_t value
) {
// store the byte only while there is still room before limit
4366 if(primaries
< limit
) {
4367 *(primaries
)++ = value
;
4371 // Packs the secondary buffer when processing French locale. Adds the terminator.
//
//   primaries       write position in the output key (passed by value here)
//   primEnd         limit of the output area, forwarded to addWithIncrement
//   secondaries     position just past the last collected secondary weight;
//                   the weights are read backwards from here
//   secsize         in: number of collected secondaries to read
//   frenchStartPtr,
//   frenchEndPtr    bounds of a still-unreversed continuation run, or a NULL
//                   start pointer when there is none
//
// Runs of UCOL_COMMON2 are compressed: the run length is kept in count2 and
// flushed as one or more bytes derived from UCOL_COMMON_TOP2 / UCOL_COMMON_BOT2
// before the next non-common weight, and once more for a trailing run.
// NOTE(review): the declarations of secondary and count2, the statement that
// counts a common weight, the else branches and the return statement are not
// visible in this excerpt; the notes below are hedged accordingly.
4373 inline uint8_t *packFrench(uint8_t *primaries
, uint8_t *primEnd
, uint8_t *secondaries
, uint32_t *secsize
, uint8_t *frenchStartPtr
, uint8_t *frenchEndPtr
) {
4376 uint32_t i
= 0, size
= 0;
4377 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4378 addWithIncrement(primaries
, primEnd
, i
, UCOL_LEVELTERMINATOR
);
4379 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4380 if(frenchStartPtr
!= NULL
) {
4381 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
// walk the collected secondaries from last to first
4383 for(i
= 0; i
<*secsize
; i
++) {
4384 secondary
= *(secondaries
-i
-1);
4385 /* This is compression code. */
// common weight: presumably just counted in count2 (statement not visible)
4386 if (secondary
== UCOL_COMMON2
) {
// non-common weight higher than common: flush the pending run from the top
// of the common range, UCOL_TOP_COUNT2 at a time
4390 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4391 while (count2
> UCOL_TOP_COUNT2
) {
4392 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
));
4393 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
4395 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1)));
// presumably the else branch: flush the pending run from the bottom of the
// common range, UCOL_BOT_COUNT2 at a time
4397 while (count2
> UCOL_BOT_COUNT2
) {
4398 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
));
4399 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4401 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1)));
// then emit the non-common secondary itself
4405 addWithIncrement(primaries
, primEnd
, size
, secondary
);
// presumably after the loop: flush a trailing run of common weights using
// the bottom-range encoding
4409 while (count2
> UCOL_BOT_COUNT2
) {
4410 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
));
4411 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4413 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1)));
4419 /* This is the sortkey work horse function */
/*
 * General sort key builder: handles every strength plus the case level, French
 * secondaries, shifted alternates and the Hiragana quaternary.
 *
 * Parameters visible in this excerpt:
 *   coll             - collator supplying strength, attributes and tertiary constants.
 *   source           - text to process.
 *   sourceLength     - length of source; -1 means the length is taken with u_strlen().
 *   resultLength     - capacity of the caller's buffer (*result).
 *   allocateSKBuffer - TRUE: buffers may be grown with reallocateBuffer() and the final key
 *                      is copied into a uprv_malloc'ed block stored in *result;
 *                      FALSE: on overflow the required size is computed with
 *                      ucol_getSortKeySize() and *status is set to U_BUFFER_OVERFLOW_ERROR.
 *   The body also uses `result` and `status`; their declarations are not visible here.
 *
 * Outline of the visible body:
 *   1. Optionally normalizes the whole input up front (unorm_quickCheck /
 *      unorm_internalNormalize), allocating a larger buffer on U_BUFFER_OVERFLOW_ERROR.
 *   2. Pulls CEs with ucol_IGetNextCE(), writing primaries straight into the result
 *      buffer and the other levels into per-level scratch buffers, with run-length
 *      compression of common secondary/tertiary/quaternary weights.
 *   3. Appends each enabled level after a UCOL_LEVELTERMINATOR, then a final '\0'.
 *
 * NOTE(review): in the secondary-level realloc path the copy is followed by
 * `primaries += secsize`, but no matching advance of `primaries` is visible after the
 * uprv_memcpy() in the case, tertiary and quaternary realloc paths, nor after
 * u_writeIdenticalLevelRun() in the identical-level realloc path. If that is not an
 * artifact of this excerpt, later levels would overwrite the copied bytes - confirm.
 */
4420 U_CFUNC
int32_t U_CALLCONV
4421 ucol_calcSortKey(const UCollator
*coll
,
4422 const UChar
*source
,
4423 int32_t sourceLength
,
4425 uint32_t resultLength
,
4426 UBool allocateSKBuffer
,
4429 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4431 uint32_t i
= 0; /* general purpose counter */
4433 /* Stack allocated buffers for buffers we use */
4434 uint8_t prim
[UCOL_PRIMARY_MAX_BUFFER
], second
[UCOL_SECONDARY_MAX_BUFFER
], tert
[UCOL_TERTIARY_MAX_BUFFER
], caseB
[UCOL_CASE_MAX_BUFFER
], quad
[UCOL_QUAD_MAX_BUFFER
];
4436 uint8_t *primaries
= *result
, *secondaries
= second
, *tertiaries
= tert
, *cases
= caseB
, *quads
= quad
;
4438 if(U_FAILURE(*status
)) {
4442 if(primaries
== NULL
&& allocateSKBuffer
== TRUE
) {
4443 primaries
= *result
= prim
;
4444 resultLength
= UCOL_PRIMARY_MAX_BUFFER
;
4447 uint32_t secSize
= UCOL_SECONDARY_MAX_BUFFER
, terSize
= UCOL_TERTIARY_MAX_BUFFER
,
4448 caseSize
= UCOL_CASE_MAX_BUFFER
, quadSize
= UCOL_QUAD_MAX_BUFFER
;
4450 uint32_t sortKeySize
= 1; /* it is always \0 terminated */
4452 UChar normBuffer
[UCOL_NORMALIZATION_MAX_BUFFER
];
4453 UChar
*normSource
= normBuffer
;
4454 int32_t normSourceLen
= UCOL_NORMALIZATION_MAX_BUFFER
;
4456 int32_t len
= (sourceLength
== -1 ? u_strlen(source
) : sourceLength
);
4458 UColAttributeValue strength
= coll
->strength
;
4460 uint8_t compareSec
= (uint8_t)((strength
>= UCOL_SECONDARY
)?0:0xFF);
4461 uint8_t compareTer
= (uint8_t)((strength
>= UCOL_TERTIARY
)?0:0xFF);
4462 uint8_t compareQuad
= (uint8_t)((strength
>= UCOL_QUATERNARY
)?0:0xFF);
4463 UBool compareIdent
= (strength
== UCOL_IDENTICAL
);
4464 UBool doCase
= (coll
->caseLevel
== UCOL_ON
);
4465 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && (compareSec
== 0);
4466 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
4467 //UBool qShifted = shifted && (compareQuad == 0);
4468 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && (compareQuad
== 0);
4469 /*const uint8_t *scriptOrder = coll->scriptOrder;*/
4471 uint32_t variableTopValue
= coll
->variableTopValue
;
4472 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4473 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4474 uint8_t UCOL_COMMON_BOT4
= (uint8_t)((coll
->variableTopValue
>>8)+1);
4475 uint8_t UCOL_HIRAGANA_QUAD
= 0;
4477 UCOL_HIRAGANA_QUAD
=UCOL_COMMON_BOT4
++;
4478 /* allocate one more space for hiragana, value for hiragana */
4480 uint8_t UCOL_BOT_COUNT4
= (uint8_t)(0xFF - UCOL_COMMON_BOT4
);
4482 /* support for special features like caselevel and funky secondaries */
4483 uint8_t *frenchStartPtr
= NULL
;
4484 uint8_t *frenchEndPtr
= NULL
;
4485 uint32_t caseShift
= 0;
4487 sortKeySize
+= ((compareSec
?0:1) + (compareTer
?0:1) + (doCase
?1:0) + /*(qShifted?1:0)*/(compareQuad
?0:1) + (compareIdent
?1:0));
4489 /* If we need to normalize, we'll do it all at once at the beginning! */
4490 UNormalizationMode normMode
;
4492 normMode
= UNORM_NFD
;
4493 } else if(coll
->normalizationMode
!= UCOL_OFF
) {
4494 normMode
= UNORM_FCD
;
4496 normMode
= UNORM_NONE
;
4499 if(normMode
!= UNORM_NONE
&& UNORM_YES
!= unorm_quickCheck(source
, len
, normMode
, status
)) {
4500 len
= unorm_internalNormalize(normSource
, normSourceLen
,
4504 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
4505 normSourceLen
= len
;
4506 normSource
= (UChar
*)uprv_malloc(len
*U_SIZEOF_UCHAR
);
4507 if(normSource
== NULL
) {
4508 *status
= U_MEMORY_ALLOCATION_ERROR
;
4511 *status
= U_ZERO_ERROR
;
4512 len
= unorm_internalNormalize(normSource
, normSourceLen
,
4518 if(U_FAILURE(*status
)) {
4521 source
= normSource
;
4525 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
4526 if(source
== normSource
) {
4527 s
.flags
&= ~UCOL_ITER_NORM
;
4530 if(resultLength
== 0 || primaries
== NULL
) {
4531 int32_t keyLen
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
4532 if(normSource
!= normBuffer
) {
4533 uprv_free(normSource
);
4537 uint8_t *primarySafeEnd
= primaries
+ resultLength
- 1;
4538 if(strength
> UCOL_PRIMARY
) {
4542 uint32_t minBufferSize
= UCOL_MAX_BUFFER
;
4544 uint8_t *primStart
= primaries
;
4545 uint8_t *secStart
= secondaries
;
4546 uint8_t *terStart
= tertiaries
;
4547 uint8_t *caseStart
= cases
;
4548 uint8_t *quadStart
= quads
;
4552 uint8_t primary1
= 0;
4553 uint8_t primary2
= 0;
4554 uint8_t secondary
= 0;
4555 uint8_t tertiary
= 0;
4556 uint8_t caseSwitch
= coll
->caseSwitch
;
4557 uint8_t tertiaryMask
= coll
->tertiaryMask
;
4558 int8_t tertiaryAddition
= (int8_t)coll
->tertiaryAddition
;
4559 uint8_t tertiaryTop
= coll
->tertiaryTop
;
4560 uint8_t tertiaryBottom
= coll
->tertiaryBottom
;
4561 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
4562 uint8_t caseBits
= 0;
4564 UBool finished
= FALSE
;
4565 UBool wasShifted
= FALSE
;
4566 UBool notIsContinuation
= FALSE
;
4568 uint32_t prevBuffSize
= 0;
4570 uint32_t count2
= 0, count3
= 0, count4
= 0;
4571 uint8_t leadPrimary
= 0;
4574 for(i
=prevBuffSize
; i
<minBufferSize
; ++i
) {
4576 order
= ucol_IGetNextCE(coll
, &s
, status
);
4577 if(order
== UCOL_NO_MORE_CES
) {
4586 notIsContinuation
= !isContinuation(order
);
4588 if(notIsContinuation
) {
4589 tertiary
= (uint8_t)(order
& UCOL_BYTE_SIZE_MASK
);
4591 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
4594 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4595 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4596 primary1
= (uint8_t)(order
>> 8);
4598 /*if(notIsContinuation && scriptOrder != NULL) {
4599 primary1 = scriptOrder[primary1];
4602 if(shifted
&& ((notIsContinuation
&& order
<= variableTopValue
&& primary1
> 0)
4603 || (!notIsContinuation
&& wasShifted
))
4604 || (wasShifted
&& primary1
== 0)) { /* amendment to the UCA says that primary ignorables */
4605 /* and other ignorables should be removed if following a shifted code point */
4606 if(primary1
== 0) { /* if we were shifted and we got an ignorable code point */
4607 /* we should just completely ignore it */
4610 if(compareQuad
== 0) {
4612 while (count4
> UCOL_BOT_COUNT4
) {
4613 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
4614 count4
-= UCOL_BOT_COUNT4
;
4616 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
4619 /* We are dealing with a variable and we're treating them as shifted */
4620 /* This is a shifted ignorable */
4621 if(primary1
!= 0) { /* we need to check this since we could be in continuation */
4622 *quads
++ = primary1
;
4625 *quads
++ = primary2
;
4631 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4632 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will be zero with non-zero primary1. */
4633 /* regular and simple sortkey calc */
4634 if(primary1
!= UCOL_IGNORABLE
) {
4635 if(notIsContinuation
) {
4636 if(leadPrimary
== primary1
) {
4637 *primaries
++ = primary2
;
4639 if(leadPrimary
!= 0) {
4640 *primaries
++ = (uint8_t)((primary1
> leadPrimary
) ? UCOL_BYTE_UNSHIFTED_MAX
: UCOL_BYTE_UNSHIFTED_MIN
);
4642 if(primary2
== UCOL_IGNORABLE
) {
4643 /* one byter, not compressed */
4644 *primaries
++ = primary1
;
4646 } else if(primary1
<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
||
4647 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4648 (primary1
> maxRegularPrimary
&& primary1
< minImplicitPrimary
)) {
4649 /* not compressible */
4651 *primaries
++ = primary1
;
4652 *primaries
++ = primary2
;
4653 } else { /* compress */
4654 *primaries
++ = leadPrimary
= primary1
;
4655 *primaries
++ = primary2
;
4658 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4659 *primaries
++ = primary1
;
4660 if(primary2
!= UCOL_IGNORABLE
) {
4661 *primaries
++ = primary2
; /* second part */
4666 if(secondary
> compareSec
) {
4668 /* This is compression code. */
4669 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
4673 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4674 while (count2
> UCOL_TOP_COUNT2
) {
4675 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
);
4676 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
4678 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1));
4680 while (count2
> UCOL_BOT_COUNT2
) {
4681 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
4682 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4684 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
4688 *secondaries
++ = secondary
;
4691 *secondaries
++ = secondary
;
4692 /* Do the special handling for French secondaries */
4693 /* We need to get continuation elements and do intermediate restore */
4694 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4695 if(notIsContinuation
) {
4696 if (frenchStartPtr
!= NULL
) {
4697 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4698 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4699 frenchStartPtr
= NULL
;
4702 if (frenchStartPtr
== NULL
) {
4703 frenchStartPtr
= secondaries
- 2;
4705 frenchEndPtr
= secondaries
-1;
4710 if(doCase
&& (primary1
> 0 || strength
>= UCOL_SECONDARY
)) {
4711 // do the case level if we need to do it. We don't want to calculate
4712 // case level for primary ignorables if we have only primary strength and case level
4713 // otherwise we would break well formedness of CEs
4714 doCaseShift(&cases
, caseShift
);
4715 if(notIsContinuation
) {
4716 caseBits
= (uint8_t)(tertiary
& 0xC0);
4719 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
4720 if((caseBits
& 0xC0) == 0) {
4721 *(cases
-1) |= 1 << (--caseShift
);
4723 *(cases
-1) |= 0 << (--caseShift
);
4725 doCaseShift(&cases
, caseShift
);
4726 *(cases
-1) |= ((caseBits
>>6)&1) << (--caseShift
);
4729 if((caseBits
& 0xC0) == 0) {
4730 *(cases
-1) |= 0 << (--caseShift
);
4732 *(cases
-1) |= 1 << (--caseShift
);
4734 doCaseShift(&cases
, caseShift
);
4735 *(cases
-1) |= ((caseBits
>>7)&1) << (--caseShift
);
4742 if(notIsContinuation
) {
4743 tertiary
^= caseSwitch
;
4747 tertiary
&= tertiaryMask
;
4748 if(tertiary
> compareTer
) {
4749 /* This is compression code. */
4750 /* sequence size check is included in the if clause */
4751 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
4754 if(tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
) {
4755 tertiary
+= tertiaryAddition
;
4756 } else if(tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
) {
4757 tertiary
-= tertiaryAddition
;
4760 if ((tertiary
> tertiaryCommon
)) {
4761 while (count3
> coll
->tertiaryTopCount
) {
4762 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
4763 count3
-= (uint32_t)coll
->tertiaryTopCount
;
4765 *tertiaries
++ = (uint8_t)(tertiaryTop
- (count3
-1));
4767 while (count3
> coll
->tertiaryBottomCount
) {
4768 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
4769 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
4771 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
4775 *tertiaries
++ = tertiary
;
4779 if(/*qShifted*/(compareQuad
==0) && notIsContinuation
) {
4780 if(s
.flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
4781 if(count4
>0) { // Close this part
4782 while (count4
> UCOL_BOT_COUNT4
) {
4783 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
4784 count4
-= UCOL_BOT_COUNT4
;
4786 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
4789 *quads
++ = UCOL_HIRAGANA_QUAD
; // Add the Hiragana
4790 } else { // This wasn't Hiragana, so we can continue adding stuff
4796 if(primaries
> primarySafeEnd
) { /* We have stepped over the primary buffer */
4797 if(allocateSKBuffer
== FALSE
) { /* need to save our butts if we cannot reallocate */
4798 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
4799 if(source
== normSource
) {
4800 s
.flags
&= ~UCOL_ITER_NORM
;
4802 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
4803 *status
= U_BUFFER_OVERFLOW_ERROR
;
4806 } else { /* It's much nicer if we can actually reallocate */
4807 int32_t sks
= sortKeySize
+(primaries
- primStart
)+(secondaries
- secStart
)+(tertiaries
- terStart
)+(cases
-caseStart
)+(quads
-quadStart
);
4808 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sks
, status
);
4809 if(U_SUCCESS(*status
)) {
4810 *result
= primStart
;
4811 primarySafeEnd
= primStart
+ resultLength
- 1;
4812 if(strength
> UCOL_PRIMARY
) {
4816 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
4817 if(source
== normSource
) {
4818 s
.flags
&= ~UCOL_ITER_NORM
;
4820 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
4830 prevBuffSize
= minBufferSize
;
4831 secStart
= reallocateBuffer(&secondaries
, secStart
, second
, &secSize
, 2*secSize
, status
);
4832 terStart
= reallocateBuffer(&tertiaries
, terStart
, tert
, &terSize
, 2*terSize
, status
);
4833 caseStart
= reallocateBuffer(&cases
, caseStart
, caseB
, &caseSize
, 2*caseSize
, status
);
4834 quadStart
= reallocateBuffer(&quads
, quadStart
, quad
, &quadSize
, 2*quadSize
, status
);
4836 if(U_FAILURE(*status
)) { // if we cannot reallocate buffers, we can at least give the sortkey size
4837 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
4838 if(source
== normSource
) {
4839 s
.flags
&= ~UCOL_ITER_NORM
;
4841 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
4847 /* Here, we are generally done with processing */
4848 /* bailing out would not be too productive */
4850 if(U_SUCCESS(*status
)) {
4851 sortKeySize
+= (primaries
- primStart
);
4852 /* we have done all the CE's, now let's put them together to form a key */
4853 if(compareSec
== 0) {
4855 while (count2
> UCOL_BOT_COUNT2
) {
4856 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
4857 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4859 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
4861 uint32_t secsize
= secondaries
-secStart
;
4862 if(!isFrenchSec
) { // Regular situation, we know the length of secondaries
4863 sortKeySize
+= secsize
;
4864 if(sortKeySize
<= resultLength
) {
4865 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4866 uprv_memcpy(primaries
, secStart
, secsize
);
4867 primaries
+= secsize
;
4869 if(allocateSKBuffer
== TRUE
) { /* need to save our butts if we cannot reallocate */
4870 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
4871 if(U_SUCCESS(*status
)) {
4872 *result
= primStart
;
4873 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4874 uprv_memcpy(primaries
, secStart
, secsize
);
4875 primaries
+= secsize
;
4878 *status
= U_BUFFER_OVERFLOW_ERROR
;
4881 } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
4882 uint8_t *newPrim
= packFrench(primaries
, primStart
+resultLength
, secondaries
, &secsize
, frenchStartPtr
, frenchEndPtr
);
4883 sortKeySize
+= secsize
;
4884 if(sortKeySize
<= resultLength
) { // if we managed to pack fine
4885 primaries
= newPrim
; // update the primary pointer
4886 } else { // overflow, need to reallocate and redo
4887 if(allocateSKBuffer
== TRUE
) { /* need to save our butts if we cannot reallocate */
4888 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
4889 if(U_SUCCESS(*status
)) {
4890 primaries
= packFrench(primaries
, primStart
+resultLength
, secondaries
, &secsize
, frenchStartPtr
, frenchEndPtr
);
4893 *status
= U_BUFFER_OVERFLOW_ERROR
;
4900 uint32_t casesize
= cases
- caseStart
;
4901 sortKeySize
+= casesize
;
4902 if(sortKeySize
<= resultLength
) {
4903 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4904 uprv_memcpy(primaries
, caseStart
, casesize
);
4905 primaries
+= casesize
;
4907 if(allocateSKBuffer
== TRUE
) {
4908 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
4909 if(U_SUCCESS(*status
)) {
4910 *result
= primStart
;
4911 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4912 uprv_memcpy(primaries
, caseStart
, casesize
);
4915 *status
= U_BUFFER_OVERFLOW_ERROR
;
4920 if(compareTer
== 0) {
4922 if (coll
->tertiaryCommon
!= UCOL_COMMON_BOT3
) {
4923 while (count3
>= coll
->tertiaryTopCount
) {
4924 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
4925 count3
-= (uint32_t)coll
->tertiaryTopCount
;
4927 *tertiaries
++ = (uint8_t)(tertiaryTop
- count3
);
4929 while (count3
> coll
->tertiaryBottomCount
) {
4930 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
4931 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
4933 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
4936 uint32_t tersize
= tertiaries
- terStart
;
4937 sortKeySize
+= tersize
;
4938 if(sortKeySize
<= resultLength
) {
4939 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4940 uprv_memcpy(primaries
, terStart
, tersize
);
4941 primaries
+= tersize
;
4943 if(allocateSKBuffer
== TRUE
) {
4944 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
4945 if(U_SUCCESS(*status
)) {
4946 *result
= primStart
;
4947 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4948 uprv_memcpy(primaries
, terStart
, tersize
);
4951 *status
= U_BUFFER_OVERFLOW_ERROR
;
4955 if(compareQuad
== 0/*qShifted == TRUE*/) {
4957 while (count4
> UCOL_BOT_COUNT4
) {
4958 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
4959 count4
-= UCOL_BOT_COUNT4
;
4961 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
4963 uint32_t quadsize
= quads
- quadStart
;
4964 sortKeySize
+= quadsize
;
4965 if(sortKeySize
<= resultLength
) {
4966 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4967 uprv_memcpy(primaries
, quadStart
, quadsize
);
4968 primaries
+= quadsize
;
4970 if(allocateSKBuffer
== TRUE
) {
4971 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
4972 if(U_SUCCESS(*status
)) {
4973 *result
= primStart
;
4974 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4975 uprv_memcpy(primaries
, quadStart
, quadsize
);
4978 *status
= U_BUFFER_OVERFLOW_ERROR
;
4984 sortKeySize
+= u_lengthOfIdenticalLevelRun(s
.string
, len
);
4985 if(sortKeySize
<= resultLength
) {
4986 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4987 primaries
+= u_writeIdenticalLevelRun(s
.string
, len
, primaries
);
4989 if(allocateSKBuffer
== TRUE
) {
4990 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, sortKeySize
, status
);
4991 if(U_SUCCESS(*status
)) {
4992 *result
= primStart
;
4993 *(primaries
++) = UCOL_LEVELTERMINATOR
;
4994 u_writeIdenticalLevelRun(s
.string
, len
, primaries
);
4997 *status
= U_BUFFER_OVERFLOW_ERROR
;
5002 *(primaries
++) = '\0';
5005 if(terStart
!= tert
) {
5006 uprv_free(terStart
);
5007 uprv_free(secStart
);
5008 uprv_free(caseStart
);
5009 uprv_free(quadStart
);
5012 if(normSource
!= normBuffer
) {
5013 uprv_free(normSource
);
5016 if(allocateSKBuffer
== TRUE
) {
5017 *result
= (uint8_t*)uprv_malloc(sortKeySize
);
5019 if (*result
== NULL
) {
5020 *status
= U_MEMORY_ALLOCATION_ERROR
;
5023 uprv_memcpy(*result
, primStart
, sortKeySize
);
5024 if(primStart
!= prim
) {
5025 uprv_free(primStart
);
/*
 * Reduced variant of the sort key builder that always emits exactly three levels
 * (primary, secondary, tertiary): sortKeySize starts at 3 to cover the two level
 * separators and the trailing '\0'. No case level, French secondary, shifted or
 * quaternary handling appears in the visible body.
 *
 * Parameters visible in this excerpt:
 *   coll             - collator supplying normalization mode and tertiary constants.
 *   source           - text to process.
 *   sourceLength     - used directly as the length (no u_strlen() fallback is visible,
 *                      unlike ucol_calcSortKey - presumably the caller resolves -1; confirm).
 *   resultLength     - capacity of the caller's buffer (*result).
 *   allocateSKBuffer - TRUE: buffers may be grown with reallocateBuffer() and the final key
 *                      is copied into a uprv_malloc'ed block stored in *result;
 *                      FALSE: overflow is reported through *status.
 *   The body also uses `result` and `status`; their declarations are not visible here.
 *
 * NOTE(review): when the tertiary level does not fit and allocateSKBuffer is not TRUE,
 * *status is set to U_MEMORY_ALLOCATION_ERROR, whereas the equivalent secondary-level
 * branch here and every such branch in ucol_calcSortKey use U_BUFFER_OVERFLOW_ERROR -
 * looks inconsistent; confirm the intended code.
 * NOTE(review): after a successful reallocateBuffer() in the secondary-level path the
 * bytes are copied with uprv_memcpy() but no `primaries += secsize` is visible, so the
 * tertiary level would be written over them - confirm against the full source.
 */
5033 U_CFUNC
int32_t U_CALLCONV
5034 ucol_calcSortKeySimpleTertiary(const UCollator
*coll
,
5035 const UChar
*source
,
5036 int32_t sourceLength
,
5038 uint32_t resultLength
,
5039 UBool allocateSKBuffer
,
5044 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5045 uint32_t i
= 0; /* general purpose counter */
5047 /* Stack allocated buffers for buffers we use */
5048 uint8_t prim
[UCOL_PRIMARY_MAX_BUFFER
], second
[UCOL_SECONDARY_MAX_BUFFER
], tert
[UCOL_TERTIARY_MAX_BUFFER
];
5050 uint8_t *primaries
= *result
, *secondaries
= second
, *tertiaries
= tert
;
5052 if(U_FAILURE(*status
)) {
5056 if(primaries
== NULL
&& allocateSKBuffer
== TRUE
) {
5057 primaries
= *result
= prim
;
5058 resultLength
= UCOL_PRIMARY_MAX_BUFFER
;
5061 uint32_t secSize
= UCOL_SECONDARY_MAX_BUFFER
, terSize
= UCOL_TERTIARY_MAX_BUFFER
;
5063 uint32_t sortKeySize
= 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5065 UChar normBuffer
[UCOL_NORMALIZATION_MAX_BUFFER
];
5066 UChar
*normSource
= normBuffer
;
5067 int32_t normSourceLen
= UCOL_NORMALIZATION_MAX_BUFFER
;
5069 int32_t len
= sourceLength
;
5071 /* If we need to normalize, we'll do it all at once at the beginning! */
5072 if(coll
->normalizationMode
!= UCOL_OFF
&& UNORM_YES
!= unorm_quickCheck(source
, len
, UNORM_FCD
, status
)) {
5073 len
= unorm_internalNormalize(normSource
, normSourceLen
,
5077 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
5078 normSourceLen
= len
;
5079 normSource
= (UChar
*)uprv_malloc(len
*U_SIZEOF_UCHAR
);
5080 if(normSource
== NULL
) {
5081 *status
= U_MEMORY_ALLOCATION_ERROR
;
5084 *status
= U_ZERO_ERROR
;
5085 len
= unorm_internalNormalize(normSource
, normSourceLen
,
5091 if(U_FAILURE(*status
)) {
5094 source
= normSource
;
5098 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5099 if(source
== normSource
) {
5100 s
.flags
&= ~UCOL_ITER_NORM
;
5103 if(resultLength
== 0 || primaries
== NULL
) {
5104 int32_t t
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5105 if(normSource
!= normBuffer
) {
5106 uprv_free(normSource
);
5111 uint8_t *primarySafeEnd
= primaries
+ resultLength
- 2;
5113 uint32_t minBufferSize
= UCOL_MAX_BUFFER
;
5115 uint8_t *primStart
= primaries
;
5116 uint8_t *secStart
= secondaries
;
5117 uint8_t *terStart
= tertiaries
;
5121 uint8_t primary1
= 0;
5122 uint8_t primary2
= 0;
5123 uint8_t secondary
= 0;
5124 uint8_t tertiary
= 0;
5125 uint8_t caseSwitch
= coll
->caseSwitch
;
5126 uint8_t tertiaryMask
= coll
->tertiaryMask
;
5127 int8_t tertiaryAddition
= (int8_t)coll
->tertiaryAddition
;
5128 uint8_t tertiaryTop
= coll
->tertiaryTop
;
5129 uint8_t tertiaryBottom
= coll
->tertiaryBottom
;
5130 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
5132 uint32_t prevBuffSize
= 0;
5134 UBool finished
= FALSE
;
5135 UBool notIsContinuation
= FALSE
;
5137 uint32_t count2
= 0, count3
= 0;
5138 uint8_t leadPrimary
= 0;
5141 for(i
=prevBuffSize
; i
<minBufferSize
; ++i
) {
5143 order
= ucol_IGetNextCE(coll
, &s
, status
);
5149 if(order
== UCOL_NO_MORE_CES
) {
5154 notIsContinuation
= !isContinuation(order
);
5156 if(notIsContinuation
) {
5157 tertiary
= (uint8_t)((order
& tertiaryMask
));
5159 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
5161 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
5162 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
5163 primary1
= (uint8_t)(order
>> 8);
5165 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5166 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5167 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5168 /* regular and simple sortkey calc */
5169 if(primary1
!= UCOL_IGNORABLE
) {
5170 if(notIsContinuation
) {
5171 if(leadPrimary
== primary1
) {
5172 *primaries
++ = primary2
;
5174 if(leadPrimary
!= 0) {
5175 *primaries
++ = (uint8_t)((primary1
> leadPrimary
) ? UCOL_BYTE_UNSHIFTED_MAX
: UCOL_BYTE_UNSHIFTED_MIN
);
5177 if(primary2
== UCOL_IGNORABLE
) {
5178 /* one byter, not compressed */
5179 *primaries
++ = primary1
;
5181 } else if(primary1
<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
||
5182 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5183 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5184 (primary1
> maxRegularPrimary
&& primary1
< minImplicitPrimary
)) {
5185 /* not compressible */
5187 *primaries
++ = primary1
;
5188 *primaries
++ = primary2
;
5189 } else { /* compress */
5190 *primaries
++ = leadPrimary
= primary1
;
5191 *primaries
++ = primary2
;
5194 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5195 *primaries
++ = primary1
;
5196 if(primary2
!= UCOL_IGNORABLE
) {
5197 *primaries
++ = primary2
; /* second part */
5202 if(secondary
> 0) { /* I think that != 0 test should be != IGNORABLE */
5203 /* This is compression code. */
5204 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
5208 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
5209 while (count2
> UCOL_TOP_COUNT2
) {
5210 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
);
5211 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
5213 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1));
5215 while (count2
> UCOL_BOT_COUNT2
) {
5216 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5217 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5219 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5223 *secondaries
++ = secondary
;
5227 if(notIsContinuation
) {
5228 tertiary
^= caseSwitch
;
5232 /* This is compression code. */
5233 /* sequence size check is included in the if clause */
5234 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
5237 if(tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
) {
5238 tertiary
+= tertiaryAddition
;
5239 } else if (tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
) {
5240 tertiary
-= tertiaryAddition
;
5243 if ((tertiary
> tertiaryCommon
)) {
5244 while (count3
> coll
->tertiaryTopCount
) {
5245 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5246 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5248 *tertiaries
++ = (uint8_t)(tertiaryTop
- (count3
-1));
5250 while (count3
> coll
->tertiaryBottomCount
) {
5251 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5252 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5254 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5258 *tertiaries
++ = tertiary
;
5262 if(primaries
> primarySafeEnd
) { /* We have stepped over the primary buffer */
5263 if(allocateSKBuffer
== FALSE
) { /* need to save our butts if we cannot reallocate */
5264 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5265 if(source
== normSource
) {
5266 s
.flags
&= ~UCOL_ITER_NORM
;
5268 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5269 *status
= U_BUFFER_OVERFLOW_ERROR
;
5272 } else { /* It's much nicer if we can actually reallocate */
5273 int32_t sks
= sortKeySize
+(primaries
- primStart
)+(secondaries
- secStart
)+(tertiaries
- terStart
);
5274 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sks
, status
);
5275 if(U_SUCCESS(*status
)) {
5276 *result
= primStart
;
5277 primarySafeEnd
= primStart
+ resultLength
- 2;
5279 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5280 if(source
== normSource
) {
5281 s
.flags
&= ~UCOL_ITER_NORM
;
5283 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5293 prevBuffSize
= minBufferSize
;
5294 secStart
= reallocateBuffer(&secondaries
, secStart
, second
, &secSize
, 2*secSize
, status
);
5295 terStart
= reallocateBuffer(&tertiaries
, terStart
, tert
, &terSize
, 2*terSize
, status
);
5297 if(U_FAILURE(*status
)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5298 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
);
5299 if(source
== normSource
) {
5300 s
.flags
&= ~UCOL_ITER_NORM
;
5302 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5308 if(U_SUCCESS(*status
)) {
5309 sortKeySize
+= (primaries
- primStart
);
5310 /* we have done all the CE's, now let's put them together to form a key */
5312 while (count2
> UCOL_BOT_COUNT2
) {
5313 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5314 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5316 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5318 uint32_t secsize
= secondaries
-secStart
;
5319 sortKeySize
+= secsize
;
5320 if(sortKeySize
<= resultLength
) {
5321 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5322 uprv_memcpy(primaries
, secStart
, secsize
);
5323 primaries
+= secsize
;
5325 if(allocateSKBuffer
== TRUE
) {
5326 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5327 if(U_SUCCESS(*status
)) {
5328 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5329 *result
= primStart
;
5330 uprv_memcpy(primaries
, secStart
, secsize
);
5333 *status
= U_BUFFER_OVERFLOW_ERROR
;
5338 if (coll
->tertiaryCommon
!= UCOL_COMMON3_NORMAL
) {
5339 while (count3
>= coll
->tertiaryTopCount
) {
5340 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5341 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5343 *tertiaries
++ = (uint8_t)(tertiaryTop
- count3
);
5345 while (count3
> coll
->tertiaryBottomCount
) {
5346 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5347 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5349 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5352 uint32_t tersize
= tertiaries
- terStart
;
5353 sortKeySize
+= tersize
;
5354 if(sortKeySize
<= resultLength
) {
5355 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5356 uprv_memcpy(primaries
, terStart
, tersize
);
5357 primaries
+= tersize
;
5359 if(allocateSKBuffer
== TRUE
) {
5360 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5361 if(U_SUCCESS(*status
)) {
5362 *result
= primStart
;
5363 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5364 uprv_memcpy(primaries
, terStart
, tersize
);
5367 *status
= U_MEMORY_ALLOCATION_ERROR
;
5371 *(primaries
++) = '\0';
5374 if(terStart
!= tert
) {
5375 uprv_free(terStart
);
5376 uprv_free(secStart
);
5379 if(normSource
!= normBuffer
) {
5380 uprv_free(normSource
);
5383 if(allocateSKBuffer
== TRUE
) {
5384 *result
= (uint8_t*)uprv_malloc(sortKeySize
);
5386 if (*result
== NULL
) {
5387 *status
= U_MEMORY_ALLOCATION_ERROR
;
5390 uprv_memcpy(*result
, primStart
, sortKeySize
);
5391 if(primStart
!= prim
) {
5392 uprv_free(primStart
);
// Tests whether a collation element falls under the "shifted" condition.
//
// CE         - the 32-bit collation element; its top byte is taken as primary1.
// LVT        - threshold compared against the upper 16 bits of CE; a zero LVT disables
//              the first two arms of the condition (presumably the variable-top value - confirm).
// wasShifted - in/out flag carrying the shifted state from the previous CE; the visible
//              code resets it to FALSE (on what appears to be the non-shifted path).
//
// The condition holds when:
//   - LVT is non-zero and either CE is not a continuation, has a non-zero primary1 and
//     its upper 16 bits are <= LVT, or CE is a continuation and *wasShifted is set; or
//   - *wasShifted is set and primary1 is 0.
//
// NOTE(review): the return statements and the else arms are not visible in this excerpt,
// so the exact return value for each path cannot be stated from here.
5400 UBool
isShiftedCE(uint32_t CE
, uint32_t LVT
, UBool
*wasShifted
) {
5401 UBool notIsContinuation
= !isContinuation(CE
);
// lead primary byte of the CE
5402 uint8_t primary1
= (uint8_t)((CE
>> 24) & 0xFF);
5403 if(LVT
&& ((notIsContinuation
&& (CE
& 0xFFFF0000)<= LVT
&& primary1
> 0)
5404 || (!notIsContinuation
&& *wasShifted
))
5405 || (*wasShifted
&& primary1
== 0)) { /* amendment to the UCA says that primary ignorables */
5406 // The stuff below should probably be in the sortkey code... maybe not...
// NOTE(review): this test is `!= 0`, while the two comments that follow describe the
// ignorable (primary1 == 0) case - confirm which branch they belong to.
5407 if(primary1
!= 0) { /* if we were shifted and we got an ignorable code point */
5408 /* we should just completely ignore it */
5412 //*wasShifted = TRUE;
5415 *wasShifted
= FALSE
;
// terminatePSKLevel: appends a level separator to the partial sort key buffer.
//   level    - level that has just been finished
//   maxLevel - highest level the caller will produce
//   i        - write index into dest, passed by reference and advanced by one
//   dest     - output buffer; the caller must guarantee that dest[i] is writable
// When level < maxLevel the byte written is UCOL_LEVELTERMINATOR.
// NOTE(review): the lines after 5422 are absent from this listing, so what is written
// when level >= maxLevel cannot be confirmed from here.
5420 void terminatePSKLevel(int32_t level
, int32_t maxLevel
, int32_t &i
, uint8_t *dest
) {
5421 if(level
< maxLevel
) {
5422 dest
[i
++] = UCOL_LEVELTERMINATOR
;
5428 /** enumeration of level identifiers for partial sort key generation */
// All values are in the range 0..7, so a level fits in a three-bit field.
5430 UCOL_PSK_PRIMARY
= 0,
5431 UCOL_PSK_SECONDARY
= 1,
// NOTE(review): value 2 is not visible in this listing (source line 5432 is skipped).
// UCOL_PSK_CASE is used elsewhere in this file and is presumably the missing
// enumerator - confirm.
5433 UCOL_PSK_TERTIARY
= 3,
5434 UCOL_PSK_QUATERNARY
= 4,
5435 UCOL_PSK_QUIN
= 5, /** This is an extra level, not used - but we have three bits to blow */
5436 UCOL_PSK_IDENTICAL
= 6,
5437 UCOL_PSK_NULL
= 7, /** level for the end of sort key. Will just produce zeros */
5441 /** collation state enum. *_SHIFT value is how much to shift right
5442 * to get the state piece to the right. *_MASK value should be
5443 * ANDed with the shifted state. This data is stored in state[1]
// Field layout implied by the shift/mask pairs below:
//   bits 0..2  level, bit 3 byte count or French done, bit 4 was shifted,
//   bits 5..6  used French bytes, bits 7..8 bocsu bytes, bits 9..27 consumed CEs.
5447 UCOL_PSK_LEVEL_SHIFT
= 0, /** level identifier. stores an enum value from above */
5448 UCOL_PSK_LEVEL_MASK
= 7, /** three bits */
5449 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
= 3, /** number of bytes of primary or quaternary already written */
5450 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
= 1,
5451 /** can be only 0 or 1, since we get up to two bytes from primary or quaternary
5452 * This field is also used to denote that the French secondary level is finished
5454 UCOL_PSK_WAS_SHIFTED_SHIFT
= 4,/** was the last value shifted */
5455 UCOL_PSK_WAS_SHIFTED_MASK
= 1, /** can be 0 or 1 (Boolean) */
5456 UCOL_PSK_USED_FRENCH_SHIFT
= 5,/** how many French bytes have we already written */
// A mask of 3 is a two-bit field, so the stored count is limited to 0..3.
5457 UCOL_PSK_USED_FRENCH_MASK
= 3, /** up to 4 bytes. See comment just below */
5458 /** When we do French we need to reverse secondary values. However, continuations
5459 * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
// Two-bit field (values 0..3).
5461 UCOL_PSK_BOCSU_BYTES_SHIFT
= 7,
5462 UCOL_PSK_BOCSU_BYTES_MASK
= 3,
// 0x7FFFF is a 19-bit mask: with a shift of 9 the counter occupies bits 9..27.
5463 UCOL_PSK_CONSUMED_CES_SHIFT
= 9,
5464 UCOL_PSK_CONSUMED_CES_MASK
= 0x7FFFF
// Macro calculating the number of expansion CEs available in the iteration state `s`:
// the distance between its CEpos and toReturn members.
// The whole expansion is parenthesized so that the macro can be combined with other
// operators (e.g. `!`, `*`, comparisons) without the surrounding expression binding
// to only one of the two operands of the subtraction.
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5471 /** main sortkey part procedure. On the first call,
5472 * you should pass in a collator, an iterator, empty state
5473 * state[0] == state[1] == 0, a buffer to hold results
5474 * number of bytes you need and an error code pointer.
5475 * Make sure your buffer is big enough to hold the wanted
5476 * number of sortkey bytes. I don't check.
5477 * The only meaningful status you can get back is
5478 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5479 * have been dealt a raw deal and that you probably won't
5480 * be able to use partial sortkey generation for this
5481 * particular combination of string and collator. This
5482 * is highly unlikely, but you should still check the error code.
5483 * Any other status means that you're not in a sane situation
5484 * anymore. After the first call, preserve state values and
5485 * use them on subsequent calls to obtain more bytes of a sortkey.
5486 * Use until the number of bytes written is smaller than the requested
5487 * number of bytes. Generated sortkey is not compatible with the
5488 * one generated by ucol_getSortKey, as we don't do any compression.
5489 * However, levels are still terminated by a 1 (one) and the sortkey
5490 * is terminated by a 0 (zero). Identical level is the same as in the
5491 * regular sortkey - internal bocu-1 implementation is used.
5492 * For the curious, although you cannot do much about this, here is
5493 * the structure of state words.
5494 * state[0] - iterator state. Depends on the iterator implementation,
5495 * but allows the iterator to continue where it stopped in
5496 * the last iteration.
5497 * state[1] - collation processing state. Here is the distribution
5499 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5500 * quaternary, quin (we don't use this one), identical and
5501 * null (producing only zeroes - first one to terminate the
5502 * sortkey and subsequent to fill the buffer).
5503 * 3 - byte count. Number of bytes written on the primary level.
5504 * 4 - was shifted. Whether the previous iteration finished in the shifted state.
5506 * 5, 6 - French continuation bytes written. See the comment in the enum
5507 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on
5508 * the identical level.
5509 * 9..31 - CEs consumed. Number of getCE or next32 operations performed
5510 * since the last successful update of the iterator state.
// ucol_nextSortKeyPart: produces the next chunk of an uncompressed sort key.
//   coll   - collator whose attributes drive the key generation
//   iter   - character iterator over the source text
//   dest   - output buffer, must be non-NULL when count > 0
//   count  - number of bytes requested, must be >= 0
//   status - ICU error code; nothing is done if it is NULL or already a failure
// The position reached is carried between calls in the two words state[0] (iterator
// state) and state[1] (packed collation state); the byte index i is what is traced
// as the exit value.
// NOTE(review): the `state` parameter declaration (source line 5515), the declarations
// of `s`, `i`, `j`, `buff`, the switch header, the saveState label and most closing
// braces / else / goto lines are missing from this listing. Comments added below
// describe only what the visible lines show.
5512 U_CAPI
int32_t U_EXPORT2
5513 ucol_nextSortKeyPart(const UCollator
*coll
,
5514 UCharIterator
*iter
,
5516 uint8_t *dest
, int32_t count
,
5517 UErrorCode
*status
) {
5518 /* error checking */
5519 if(status
==NULL
|| U_FAILURE(*status
)) {
5522 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART
);
5523 if( coll
==NULL
|| iter
==NULL
||
5525 count
<0 || (count
>0 && dest
==NULL
)
5527 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
5530 UTRACE_DATA6(UTRACE_VERBOSE
, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5531 coll
, iter
, state
[0], state
[1], dest
, count
);
5535 UTRACE_EXIT_VALUE(0);
5538 /** Setting up situation according to the state we got from the previous iteration */
5539 // The state of the iterator from the previous invocation
5540 uint32_t iterState
= state
[0];
5541 // Has the last iteration ended in the shifted state
5542 UBool wasShifted
= ((state
[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT
) & UCOL_PSK_WAS_SHIFTED_MASK
)?TRUE
:FALSE
;
5543 // What is the current level of the sortkey?
5544 int32_t level
= (state
[1] >> UCOL_PSK_LEVEL_SHIFT
) & UCOL_PSK_LEVEL_MASK
;
5545 // Have we written only one byte from a two byte primary in the previous iteration?
5546 // Also on secondary level - have we finished with the French secondary?
5547 int32_t byteCountOrFrenchDone
= (state
[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
;
5548 // number of bytes in the continuation buffer for French
5549 int32_t usedFrench
= (state
[1] >> UCOL_PSK_USED_FRENCH_SHIFT
) & UCOL_PSK_USED_FRENCH_MASK
;
5550 // Number of bytes already written from a bocsu sequence. Since
5551 // the longest bocsu sequence is 4 long, this can be up to 3.
5552 int32_t bocsuBytesUsed
= (state
[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT
) & UCOL_PSK_BOCSU_BYTES_MASK
;
5553 // Number of elements that need to be consumed in this iteration because
5554 // the iterator returned UITER_NO_STATE at the end of the last iteration,
5555 // so we had to save the last valid state.
5556 int32_t cces
= (state
[1] >> UCOL_PSK_CONSUMED_CES_SHIFT
) & UCOL_PSK_CONSUMED_CES_MASK
;
5558 /** values that depend on the collator attributes */
5559 // strength of the collator.
5560 int32_t strength
= ucol_getAttribute(coll
, UCOL_STRENGTH
, status
);
5561 // maximal level of the partial sortkey. Need to take whether case level is done
5562 int32_t maxLevel
= 0;
// Below tertiary strength: maxLevel is UCOL_PSK_CASE when the case level attribute
// is on; the other visible assignment uses the strength value itself.
5563 if(strength
< UCOL_TERTIARY
) {
5564 if(ucol_getAttribute(coll
, UCOL_CASE_LEVEL
, status
) == UCOL_ON
) {
5565 maxLevel
= UCOL_PSK_CASE
;
5567 maxLevel
= strength
;
5570 if(strength
== UCOL_TERTIARY
) {
5571 maxLevel
= UCOL_PSK_TERTIARY
;
5572 } else if(strength
== UCOL_QUATERNARY
) {
5573 maxLevel
= UCOL_PSK_QUATERNARY
;
5574 } else { // identical
// NOTE(review): this branch assigns the attribute constant UCOL_IDENTICAL, whereas
// the two sibling branches assign UCOL_PSK_* level constants - confirm this is intended.
5575 maxLevel
= UCOL_IDENTICAL
;
5578 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5579 uint8_t UCOL_HIRAGANA_QUAD
=
5580 (ucol_getAttribute(coll
, UCOL_HIRAGANA_QUATERNARY_MODE
, status
) == UCOL_ON
)?0xFE:0xFF;
5581 // Boundary value that decides whether a CE is shifted or not
// Zero unless alternate handling is UCOL_SHIFTED; otherwise variableTopValue moved
// into the upper 16 bits.
5582 uint32_t LVT
= (coll
->alternateHandling
== UCOL_SHIFTED
)?(coll
->variableTopValue
<<16):0;
5583 // Are we doing French collation?
5584 UBool doingFrench
= (ucol_getAttribute(coll
, UCOL_FRENCH_COLLATION
, status
) == UCOL_ON
);
5586 /** initializing the collation state */
5587 UBool notIsContinuation
= FALSE
;
5588 uint32_t CE
= UCOL_NO_MORE_CES
;
5591 IInit_collIterate(coll
, NULL
, -1, &s
);
5593 s
.flags
|= UCOL_USE_ITERATOR
;
5594 // This variable tells us whether we have produced some other levels in this iteration
5595 // before we moved to the identical level. In that case, we need to switch the
5596 // type of the iterator.
5597 UBool doingIdenticalFromStart
= FALSE
;
5598 // Normalizing iterator
5599 // The division for the array length may truncate the array size to
5600 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5601 // for all platforms anyway.
5602 UAlignedMemory stackNormIter
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
5603 UNormIterator
*normIter
= NULL
;
5604 // If the normalization is turned on for the collator and we are below identical level
5605 // we will use a FCD normalizing iterator
5606 if(ucol_getAttribute(coll
, UCOL_NORMALIZATION_MODE
, status
) == UCOL_ON
&& level
< UCOL_PSK_IDENTICAL
) {
5607 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
5608 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_FCD
, status
);
5609 s
.flags
&= ~UCOL_ITER_NORM
;
5610 if(U_FAILURE(*status
)) {
5611 UTRACE_EXIT_STATUS(*status
);
5614 } else if(level
== UCOL_PSK_IDENTICAL
) {
5615 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5616 // will be updating the state - and this cannot be done on an ordinary iterator.
5617 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
5618 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
5619 s
.flags
&= ~UCOL_ITER_NORM
;
5620 if(U_FAILURE(*status
)) {
5621 UTRACE_EXIT_STATUS(*status
);
5624 doingIdenticalFromStart
= TRUE
;
5627 // This is the tentative new state of the iterator. The problem
5628 // is that the iterator might return an undefined state, in
5629 // which case we should save the last valid state and increase
5630 // the iterator skip value.
5631 uint32_t newState
= 0;
5633 // First, we set the iterator to the last valid position
5634 // from the last iteration. This was saved in state[0].
// A saved state of 0 means there is no position to restore: the iterator is moved to
// the limit when an unfinished French secondary pass is pending (it runs backwards),
// and to the start in the other visible branch.
5635 if(iterState
== 0) {
5637 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
&& !byteCountOrFrenchDone
) {
5638 s
.iterator
->move(s
.iterator
, 0, UITER_LIMIT
);
5640 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5643 /* reset to previous state */
5644 s
.iterator
->setState(s
.iterator
, iterState
, status
);
5645 if(U_FAILURE(*status
)) {
5646 UTRACE_EXIT_STATUS(*status
);
5653 // This variable tells us whether we can attempt to update the state
5654 // of iterator. Situations where we don't want to update iterator state
5655 // are the existence of expansion CEs that are not yet processed, and
5656 // finishing the case level without enough space in the buffer to insert
5657 // a level terminator.
5658 UBool canUpdateState
= TRUE
;
5660 // Consume all the CEs that were consumed at the end of the previous
5661 // iteration without updating the iterator state. On identical level,
5662 // consume the code points.
5663 int32_t counter
= cces
;
5664 if(level
< UCOL_PSK_IDENTICAL
) {
5665 while(counter
-->0) {
5666 // If we're doing French and we are on the secondary level,
5668 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
) {
5669 CE
= ucol_IGetPrevCE(coll
, &s
, status
);
5671 CE
= ucol_IGetNextCE(coll
, &s
, status
);
5673 if(CE
==UCOL_NO_MORE_CES
) {
5674 /* should not happen */
5675 *status
=U_INTERNAL_PROGRAM_ERROR
;
5676 UTRACE_EXIT_STATUS(*status
);
// A non-zero count of available expansion CEs after the replay blocks state updates.
5679 if(uprv_numAvailableExpCEs(s
)) {
5680 canUpdateState
= FALSE
;
5684 while(counter
-->0) {
5685 uiter_next32(s
.iterator
);
5689 // French secondary needs to know whether the iterator state of zero came from previous level OR
5690 // from a new invocation...
5691 UBool wasDoingPrimary
= FALSE
;
5692 // destination buffer byte counter. When this guy
5693 // gets to count, we're done with the iteration
5695 // used to count the zero bytes written after we
5696 // have finished with the sort key
5700 // Hm.... I think we're ready to plunge in. Basic story is as following:
5701 // we have a fall through case based on level. This is used for initial
5702 // positioning on iteration start. Every level processor contains a
5703 // for(;;) which will be broken when we exhaust all the CEs. Other
5704 // way to exit is a goto saveState, which happens when we have filled
// ---- Primary level: walks the CEs forward and emits primary bytes of CEs that
// isShiftedCE() reports as not shifted.
5707 case UCOL_PSK_PRIMARY
:
5708 wasDoingPrimary
= TRUE
;
5713 // We should save the state only if we
5714 // are sure that we are done with the
5715 // previous iterator state
5716 if(canUpdateState
&& byteCountOrFrenchDone
== 0) {
5717 newState
= s
.iterator
->getState(s
.iterator
);
5718 if(newState
!= UITER_NO_STATE
) {
5719 iterState
= newState
;
5723 CE
= ucol_IGetNextCE(coll
, &s
, status
);
5725 if(CE
==UCOL_NO_MORE_CES
) {
5726 // Add the level separator
5727 terminatePSKLevel(level
, maxLevel
, i
, dest
);
5728 byteCountOrFrenchDone
=0;
5729 // Restart the iteration and move to the
5731 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5733 level
= UCOL_PSK_SECONDARY
;
5736 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
5737 CE
>>= UCOL_PRIMARYORDERSHIFT
; /* get primary */
// byteCountOrFrenchDone tracks progress inside one two-byte primary: 0 means the
// (CE >> 8) byte is still to be written. NOTE(review): the buffer-full test that
// presumably guards the assignment of 1 below is not visible in this listing.
5739 if(byteCountOrFrenchDone
== 0) {
5740 // get the second byte of primary
5741 dest
[i
++]=(uint8_t)(CE
>> 8);
5743 byteCountOrFrenchDone
= 0;
5745 if((CE
&=0xff)!=0) {
5748 byteCountOrFrenchDone
= 1;
5752 dest
[i
++]=(uint8_t)CE
;
5756 if(uprv_numAvailableExpCEs(s
)) {
5757 canUpdateState
= FALSE
;
5759 canUpdateState
= TRUE
;
5762 /* fall through to next level */
// ---- Secondary level: a forward pass first, then (see "French secondary processing"
// below) a backward pass that walks the CEs with ucol_IGetPrevCE.
5763 case UCOL_PSK_SECONDARY
:
5764 if(strength
>= UCOL_SECONDARY
) {
5770 // We should save the state only if we
5771 // are sure that we are done with the
5772 // previous iterator state
5773 if(canUpdateState
) {
5774 newState
= s
.iterator
->getState(s
.iterator
);
5775 if(newState
!= UITER_NO_STATE
) {
5776 iterState
= newState
;
5780 CE
= ucol_IGetNextCE(coll
, &s
, status
);
5782 if(CE
==UCOL_NO_MORE_CES
) {
5783 // Add the level separator
5784 terminatePSKLevel(level
, maxLevel
, i
, dest
);
5785 byteCountOrFrenchDone
= 0;
5786 // Restart the iteration and move to the
5788 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5790 level
= UCOL_PSK_CASE
;
5793 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
5794 CE
>>= 8; /* get secondary */
5796 dest
[i
++]=(uint8_t)CE
;
5799 if(uprv_numAvailableExpCEs(s
)) {
5800 canUpdateState
= FALSE
;
5802 canUpdateState
= TRUE
;
5805 } else { // French secondary processing
5806 uint8_t frenchBuff
[UCOL_MAX_BUFFER
];
5807 int32_t frenchIndex
= 0;
5808 // Here we are going backwards.
5809 // If the iterator is at the beginning, it should be
5811 if(wasDoingPrimary
) {
5812 s
.iterator
->move(s
.iterator
, 0, UITER_LIMIT
);
5819 if(canUpdateState
) {
5820 newState
= s
.iterator
->getState(s
.iterator
);
5821 if(newState
!= UITER_NO_STATE
) {
5822 iterState
= newState
;
5826 CE
= ucol_IGetPrevCE(coll
, &s
, status
);
5828 if(CE
==UCOL_NO_MORE_CES
) {
5829 // Add the level separator
5830 terminatePSKLevel(level
, maxLevel
, i
, dest
);
5831 byteCountOrFrenchDone
= 0;
5832 // Restart the iteration and move to the next level
5833 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5834 level
= UCOL_PSK_CASE
;
5837 if(isContinuation(CE
)) { // if it's a continuation, we want to save it and
5838 // reverse when we get a first non-continuation CE.
5840 frenchBuff
[frenchIndex
++] = (uint8_t)CE
;
5841 } else if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
5842 CE
>>= 8; /* get secondary */
5845 dest
[i
++]=(uint8_t)CE
;
5848 frenchBuff
[frenchIndex
++] = (uint8_t)CE
;
// usedFrench is subtracted from the buffered count before the buffer is drained in
// reverse order; presumably those bytes were already delivered by an earlier call.
5849 frenchIndex
-= usedFrench
;
5851 while(i
< count
&& frenchIndex
) {
5852 dest
[i
++] = frenchBuff
[--frenchIndex
];
5857 if(uprv_numAvailableExpCEs(s
)) {
5858 canUpdateState
= FALSE
;
5860 canUpdateState
= TRUE
;
5865 level
= UCOL_PSK_CASE
;
5867 /* fall through to next level */
// ---- Case level: one bit per qualifying CE is packed into caseByte, filling downward
// from bit position caseShift-1; a completed byte is written out when caseShift hits 0.
5869 if(ucol_getAttribute(coll
, UCOL_CASE_LEVEL
, status
) == UCOL_ON
) {
5870 uint32_t caseShift
= UCOL_CASE_SHIFT_START
;
5871 uint8_t caseByte
= UCOL_CASE_BYTE_START
;
5872 uint8_t caseBits
= 0;
5878 // We should save the state only if we
5879 // are sure that we are done with the
5880 // previous iterator state
5881 if(canUpdateState
) {
5882 newState
= s
.iterator
->getState(s
.iterator
);
5883 if(newState
!= UITER_NO_STATE
) {
5884 iterState
= newState
;
5888 CE
= ucol_IGetNextCE(coll
, &s
, status
);
5890 if(CE
==UCOL_NO_MORE_CES
) {
5891 // On the case level we might have an unfinished
5892 // case byte. Add one if it's started.
5893 if(caseShift
!= UCOL_CASE_SHIFT_START
) {
5894 dest
[i
++] = caseByte
;
5897 // We have finished processing CEs on this level.
5898 // However, we don't know if we have enough space
5899 // to add a case level terminator.
5901 // Add the level separator
5902 terminatePSKLevel(level
, maxLevel
, i
, dest
);
5903 // Restart the iteration and move to the
5905 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5906 level
= UCOL_PSK_TERTIARY
;
5908 canUpdateState
= FALSE
;
5913 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
5914 if(!isContinuation(CE
) && ((CE
& UCOL_PRIMARYMASK
) != 0 || strength
> UCOL_PRIMARY
)) {
5915 // do the case level if we need to do it. We don't want to calculate
5916 // case level for primary ignorables if we have only primary strength and case level
5917 // otherwise we would break well formedness of CEs
5918 CE
= (uint8_t)(CE
& UCOL_BYTE_SIZE_MASK
);
// caseBits keeps the top two bits (0xC0) of the CE's low byte.
5919 caseBits
= (uint8_t)(CE
& 0xC0);
5920 // this copies the case level logic from the
5921 // sort key generation code
5923 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
5924 if((caseBits
& 0xC0) == 0) {
5925 caseByte
|= 1 << (--caseShift
);
5927 caseByte
|= 0 << (--caseShift
);
5929 if(caseShift
== 0) {
5930 dest
[i
++] = caseByte
;
5931 caseShift
= UCOL_CASE_SHIFT_START
;
5932 caseByte
= UCOL_CASE_BYTE_START
;
5934 caseByte
|= ((caseBits
>>6)&1) << (--caseShift
);
5937 if((caseBits
& 0xC0) == 0) {
5938 caseByte
|= 0 << (--caseShift
);
5940 caseByte
|= 1 << (--caseShift
);
5942 if(caseShift
== 0) {
5943 dest
[i
++] = caseByte
;
5944 caseShift
= UCOL_CASE_SHIFT_START
;
5945 caseByte
= UCOL_CASE_BYTE_START
;
5947 caseByte
|= ((caseBits
>>7)&1) << (--caseShift
);
5954 // Not sure this is correct for the case level - revisit
5955 if(uprv_numAvailableExpCEs(s
)) {
5956 canUpdateState
= FALSE
;
5958 canUpdateState
= TRUE
;
5962 level
= UCOL_PSK_TERTIARY
;
5964 /* fall through to next level */
// ---- Tertiary level: one byte per CE that is not shifted.
5965 case UCOL_PSK_TERTIARY
:
5966 if(strength
>= UCOL_TERTIARY
) {
5971 // We should save the state only if we
5972 // are sure that we are done with the
5973 // previous iterator state
5974 if(canUpdateState
) {
5975 newState
= s
.iterator
->getState(s
.iterator
);
5976 if(newState
!= UITER_NO_STATE
) {
5977 iterState
= newState
;
5981 CE
= ucol_IGetNextCE(coll
, &s
, status
);
5983 if(CE
==UCOL_NO_MORE_CES
) {
5984 // Add the level separator
5985 terminatePSKLevel(level
, maxLevel
, i
, dest
);
5986 byteCountOrFrenchDone
= 0;
5987 // Restart the iteration and move to the
5989 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5991 level
= UCOL_PSK_QUATERNARY
;
5994 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
5995 notIsContinuation
= !isContinuation(CE
);
// Non-continuation CEs: low byte XOR-ed with coll->caseSwitch, then AND-ed with
// coll->tertiaryMask. The other visible assignment masks the CE with
// UCOL_REMOVE_CONTINUATION instead.
5997 if(notIsContinuation
) {
5998 CE
= (uint8_t)(CE
& UCOL_BYTE_SIZE_MASK
);
5999 CE
^= coll
->caseSwitch
;
6000 CE
&= coll
->tertiaryMask
;
6002 CE
= (uint8_t)((CE
& UCOL_REMOVE_CONTINUATION
));
6006 dest
[i
++]=(uint8_t)CE
;
6009 if(uprv_numAvailableExpCEs(s
)) {
6010 canUpdateState
= FALSE
;
6012 canUpdateState
= TRUE
;
6016 // if we're not doing tertiary
6018 level
= UCOL_PSK_NULL
;
6020 /* fall through to next level */
// ---- Quaternary level: note the test on isShiftedCE() is not negated here, unlike
// the levels above - CEs reported as shifted contribute their primary bytes.
6021 case UCOL_PSK_QUATERNARY
:
6022 if(strength
>= UCOL_QUATERNARY
) {
6027 // We should save the state only if we
6028 // are sure that we are done with the
6029 // previous iterator state
6030 if(canUpdateState
) {
6031 newState
= s
.iterator
->getState(s
.iterator
);
6032 if(newState
!= UITER_NO_STATE
) {
6033 iterState
= newState
;
6037 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6039 if(CE
==UCOL_NO_MORE_CES
) {
6040 // Add the level separator
6041 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6042 //dest[i++] = UCOL_LEVELTERMINATOR;
6043 byteCountOrFrenchDone
= 0;
6044 // Restart the iteration and move to the
6046 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6048 level
= UCOL_PSK_QUIN
;
6051 if(isShiftedCE(CE
, LVT
, &wasShifted
)) {
6052 CE
>>= 16; /* get primary */
6054 if(byteCountOrFrenchDone
== 0) {
6055 dest
[i
++]=(uint8_t)(CE
>> 8);
6057 byteCountOrFrenchDone
= 0;
6059 if((CE
&=0xff)!=0) {
6062 byteCountOrFrenchDone
= 1;
6065 dest
[i
++]=(uint8_t)CE
;
6069 notIsContinuation
= !isContinuation(CE
);
6070 if(notIsContinuation
) {
6071 if(s
.flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
6072 dest
[i
++] = UCOL_HIRAGANA_QUAD
;
6078 if(uprv_numAvailableExpCEs(s
)) {
6079 canUpdateState
= FALSE
;
6081 canUpdateState
= TRUE
;
6085 // if we're not doing quaternary
6087 level
= UCOL_PSK_NULL
;
6089 /* fall through to next level */
6091 level
= UCOL_PSK_IDENTICAL
;
6092 /* fall through to next level */
// ---- Identical level: works on code points from an NFD iterator rather than on CEs.
6093 case UCOL_PSK_IDENTICAL
:
6094 if(strength
>= UCOL_IDENTICAL
) {
6095 UChar32 first
, second
;
6096 int32_t bocsuBytesWritten
= 0;
6097 // We always need to do identical on
6098 // the NFD form of the string.
6099 if(normIter
== NULL
) {
6100 // we arrived from the level below and
6101 // normalization was not turned on.
6102 // therefore, we need to make a fresh NFD iterator
6103 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
6104 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
6105 } else if(!doingIdenticalFromStart
) {
6106 // there is an iterator, but we did some other levels.
6107 // therefore, we have a FCD iterator - need to make
6109 // normIter being at the beginning does not guarantee
6110 // that the underlying iterator is at the beginning
6111 iter
->move(iter
, 0, UITER_START
);
6112 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
6114 // At this point we have a NFD iterator that is positioned
6115 // in the right place
6116 if(U_FAILURE(*status
)) {
6117 UTRACE_EXIT_STATUS(*status
);
// `first` is the code point just before the current position; it is the reference
// value for encoding the next code point (`second`) further down.
6120 first
= uiter_previous32(s
.iterator
);
6121 // maybe we're at the start of the string
6122 if(first
== U_SENTINEL
) {
6125 uiter_next32(s
.iterator
);
6131 if(j
+1 < bocsuBytesWritten
) {
6132 bocsuBytesUsed
= j
+1;
6137 // On identical level, we will always save
6138 // the state if we reach this point, since
6139 // we don't depend on getNextCE for content
6140 // all the content is in our buffer and we
6141 // already either stored the full buffer OR
6142 // otherwise we won't arrive here.
6143 newState
= s
.iterator
->getState(s
.iterator
);
6144 if(newState
!= UITER_NO_STATE
) {
6145 iterState
= newState
;
6150 second
= uiter_next32(s
.iterator
);
6153 // end condition for identical level
6154 if(second
== U_SENTINEL
) {
6155 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6156 level
= UCOL_PSK_NULL
;
// The encoder fills buff and returns how many bytes it produced; the loops below
// count bocsuBytesUsed down and then copy bytes from buff into dest while room remains.
6159 bocsuBytesWritten
= u_writeIdenticalLevelRunTwoChars(first
, second
, buff
);
6163 if(bocsuBytesUsed
!= 0) {
6164 while(bocsuBytesUsed
-->0) {
6169 while(i
< count
&& j
< bocsuBytesWritten
) {
6170 dest
[i
++] = buff
[j
++];
6175 level
= UCOL_PSK_NULL
;
6177 /* fall through to next level */
6185 *status
= U_INTERNAL_PROGRAM_ERROR
;
6186 UTRACE_EXIT_STATUS(*status
);
// ---- Saving state for the next call.
6191 // Now we need to return stuff. First we want to see whether we have
6192 // done everything for the current state of iterator.
6193 if(byteCountOrFrenchDone
6194 || canUpdateState
== FALSE
6195 || (newState
= s
.iterator
->getState(s
.iterator
)) == UITER_NO_STATE
) {
6196 // Any of above mean that the previous transaction
6197 // wasn't finished and that we should store the
6198 // previous iterator state.
6199 state
[0] = iterState
;
6201 // The transaction is complete. We will continue in the next iteration.
6202 state
[0] = s
.iterator
->getState(s
.iterator
);
6205 // Store the number of bocsu bytes written.
6206 if((bocsuBytesUsed
& UCOL_PSK_BOCSU_BYTES_MASK
) != bocsuBytesUsed
) {
6207 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
// state[1] is rebuilt from scratch here (plain assignment); every other field is
// OR-ed into it below.
6209 state
[1] = (bocsuBytesUsed
& UCOL_PSK_BOCSU_BYTES_MASK
) << UCOL_PSK_BOCSU_BYTES_SHIFT
;
6211 // Next we put in the level of comparison
6212 state
[1] |= ((level
& UCOL_PSK_LEVEL_MASK
) << UCOL_PSK_LEVEL_SHIFT
);
6214 // If we are doing French, we need to store whether we have just finished the French level
6215 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
) {
6216 state
[1] |= (((state
[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
);
6218 state
[1] |= ((byteCountOrFrenchDone
& UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
);
6221 // Was the latest CE shifted
// NOTE(review): the condition guarding the next statement (source line 6222) is not
// part of this listing; presumably it tests wasShifted - confirm.
6223 state
[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT
;
6225 // Check for cces overflow
6226 if((cces
& UCOL_PSK_CONSUMED_CES_MASK
) != cces
) {
6227 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6230 state
[1] |= ((cces
& UCOL_PSK_CONSUMED_CES_MASK
) << UCOL_PSK_CONSUMED_CES_SHIFT
);
6232 // Check for French overflow
6233 if((usedFrench
& UCOL_PSK_USED_FRENCH_MASK
) != usedFrench
) {
6234 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6236 // Store number of bytes written in the French secondary continuation sequence
6237 state
[1] |= ((usedFrench
& UCOL_PSK_USED_FRENCH_MASK
) << UCOL_PSK_USED_FRENCH_SHIFT
);
6240 // If we have used normalizing iterator, get rid of it
6241 if(normIter
!= NULL
) {
6242 unorm_closeIter(normIter
);
6245 // Return number of meaningful sortkey bytes.
6246 UTRACE_DATA4(UTRACE_VERBOSE
, "dest = %vb, state=%d %d",
6247 dest
,i
, state
[0], state
[1]);
6248 UTRACE_EXIT_VALUE(i
);
6253 * Produce a bound for a given sortkey and a number of levels.
// ucol_getBound: builds a bound from the first noOfLevels levels of an existing sort key.
//   source / sourceLength - the sort key to take the prefix from (source must not be NULL)
//   boundType             - selects which extra bytes are appended after the prefix
//   noOfLevels            - number of level terminators to scan past
//   resultLength          - capacity of the output buffer `result`
//   status                - ICU error code; nothing is done if NULL or already a failure
// NOTE(review): the `result` parameter (source line 6260), the opening of the scan loop,
// the switch header and several return statements are missing from this listing.
6255 U_CAPI
int32_t U_EXPORT2
6256 ucol_getBound(const uint8_t *source
,
6257 int32_t sourceLength
,
6258 UColBoundMode boundType
,
6259 uint32_t noOfLevels
,
6261 int32_t resultLength
,
6262 UErrorCode
*status
) {
6263 // consistency checks
6264 if(status
== NULL
|| U_FAILURE(*status
)) {
6267 if(source
== NULL
) {
6268 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6272 int32_t sourceIndex
= 0;
6273 // Scan the string until we skip enough of the key OR reach the end of the key
6276 if(source
[sourceIndex
] == UCOL_LEVELTERMINATOR
) {
// NOTE(review): the loop keeps going while the current byte is non-zero OR the index
// is below sourceLength, and the byte is read before the index is compared. Scanning
// therefore stops only at a zero byte located at or beyond sourceLength - confirm
// that this is the intended end-of-key test.
6279 } while (noOfLevels
> 0
6280 && (source
[sourceIndex
] != 0 || sourceIndex
< sourceLength
));
// Running out of key while levels are still requested is reported as a warning.
6282 if((source
[sourceIndex
] == 0 || sourceIndex
== sourceLength
)
6283 && noOfLevels
> 0) {
6284 *status
= U_SORT_KEY_TOO_SHORT_WARNING
;
6288 // READ ME: this code assumes that the values for boundType
6289 // enum will not changes. They are set so that the enum value
6290 // corresponds to the number of extra bytes each bound type
6292 if(result
!= NULL
&& resultLength
>= sourceIndex
+boundType
) {
6293 uprv_memcpy(result
, source
, sourceIndex
);
6295 // Lower bound just gets terminated. No extra bytes
6296 case UCOL_BOUND_LOWER
: // = 0
6298 // Upper bound needs one extra byte
6299 case UCOL_BOUND_UPPER
: // = 1
6300 result
[sourceIndex
++] = 2;
6302 // Upper long bound needs two extra bytes
6303 case UCOL_BOUND_UPPER_LONG
: // = 2
6304 result
[sourceIndex
++] = 0xFF;
6305 result
[sourceIndex
++] = 0xFF;
6308 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Zero terminator of the produced bound.
6311 result
[sourceIndex
++] = 0;
// Prefix length + extra bound bytes + terminator. Presumably this is the size
// reported when the buffer check above fails - the surrounding else is not visible.
6315 return sourceIndex
+boundType
+1;
6319 /****************************************************************************/
6320 /* Following are the functions that deal with the properties of a collator */
6321 /* there are new APIs and some compatibility APIs */
6322 /****************************************************************************/
// ucol_addLatinOneEntry: merges the weights of one collation element CE into the
// Latin-1 fast-path table entry for ch.
// coll->latinOneCEs is indexed as three consecutive sub-tables of
// coll->latinOneTableLen words each: [ch] receives primary bytes,
// [latinOneTableLen + ch] secondary bytes and [2*latinOneTableLen + ch] tertiary bytes.
// primShift / secShift / terShift point at the caller's current bit positions for the
// next byte of each weight.
// NOTE(review): the return type line, the else lines and the statements that adjust
// the shift values are missing from this listing.
6325 ucol_addLatinOneEntry(UCollator
*coll
, UChar ch
, uint32_t CE
,
6326 int32_t *primShift
, int32_t *secShift
, int32_t *terShift
) {
6327 uint8_t primary1
= 0, primary2
= 0, secondary
= 0, tertiary
= 0;
6328 UBool reverseSecondary
= FALSE
;
// Non-continuation CE: tertiary is masked with coll->tertiaryMask, XOR-ed with
// coll->caseSwitch, and the secondary is marked as eligible for French reversal.
6329 if(!isContinuation(CE
)) {
6330 tertiary
= (uint8_t)((CE
& coll
->tertiaryMask
));
6331 tertiary
^= coll
->caseSwitch
;
6332 reverseSecondary
= TRUE
;
6334 tertiary
= (uint8_t)((CE
& UCOL_REMOVE_CONTINUATION
));
6335 tertiary
&= UCOL_REMOVE_CASE
;
6336 reverseSecondary
= FALSE
;
// CE is shifted right in place: after these three statements secondary holds bits
// 8..15, primary2 bits 16..23 and primary1 bits 24..31 of the original CE.
6339 secondary
= (uint8_t)((CE
>>= 8) & UCOL_BYTE_SIZE_MASK
);
6340 primary2
= (uint8_t)((CE
>>= 8) & UCOL_BYTE_SIZE_MASK
);
6341 primary1
= (uint8_t)(CE
>> 8);
6344 coll
->latinOneCEs
[ch
] |= (primary1
<< *primShift
);
// A negative primary shift means no room is left in the 32-bit word: all three
// sub-table entries for ch are replaced by UCOL_BAIL_OUT_CE.
6348 if(*primShift
< 0) {
6349 coll
->latinOneCEs
[ch
] = UCOL_BAIL_OUT_CE
;
6350 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6351 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6354 coll
->latinOneCEs
[ch
] |= (primary2
<< *primShift
);
6357 if(secondary
!= 0) {
6358 if(reverseSecondary
&& coll
->frenchCollation
== UCOL_ON
) { // reverse secondary
6359 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] >>= 8; // make space for secondary
6360 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] |= (secondary
<< 24);
6361 } else { // normal case
6362 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] |= (secondary
<< *secShift
);
6367 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] |= (tertiary
<< *terShift
);
// ucol_resizeLatinOneTable: replaces coll->latinOneCEs with a freshly allocated table
// of three sub-tables of `size` words each.
// The new table is zero-filled, then the first min(size, coll->latinOneTableLen) words
// of each old sub-table are copied to the start of the matching new sub-table; the old
// table is freed and coll->latinOneTableLen is updated.
// On allocation failure *status becomes U_MEMORY_ALLOCATION_ERROR and
// coll->latinOneFailed is set.
// NOTE(review): the return type line and the statements following the failure branch
// are missing from this listing. size is not checked for being positive, nor is
// size*sizeof(uint32_t)*3 checked for overflow - presumably callers pass small values.
6373 ucol_resizeLatinOneTable(UCollator
*coll
, int32_t size
, UErrorCode
*status
) {
6374 uint32_t *newTable
= (uint32_t *)uprv_malloc(size
*sizeof(uint32_t)*3);
6375 if(newTable
== NULL
) {
6376 *status
= U_MEMORY_ALLOCATION_ERROR
;
6377 coll
->latinOneFailed
= TRUE
;
// sizeToCopy is a byte count (element count times sizeof(uint32_t)).
6380 int32_t sizeToCopy
= ((size
<coll
->latinOneTableLen
)?size
:coll
->latinOneTableLen
)*sizeof(uint32_t);
6381 uprv_memset(newTable
, 0, size
*sizeof(uint32_t)*3);
6382 uprv_memcpy(newTable
, coll
->latinOneCEs
, sizeToCopy
);
6383 uprv_memcpy(newTable
+size
, coll
->latinOneCEs
+coll
->latinOneTableLen
, sizeToCopy
);
6384 uprv_memcpy(newTable
+2*size
, coll
->latinOneCEs
+2*coll
->latinOneTableLen
, sizeToCopy
);
6385 coll
->latinOneTableLen
= size
;
6386 uprv_free(coll
->latinOneCEs
);
6387 coll
->latinOneCEs
= newTable
;
6392 ucol_setUpLatinOne(UCollator
*coll
, UErrorCode
*status
) {
6393 UBool result
= TRUE
;
6394 if(coll
->latinOneCEs
== NULL
) {
6395 coll
->latinOneCEs
= (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN
*3);
6396 if(coll
->latinOneCEs
== NULL
) {
6397 *status
= U_MEMORY_ALLOCATION_ERROR
;
6400 coll
->latinOneTableLen
= UCOL_LATINONETABLELEN
;
6403 UCollationElements
*it
= ucol_openElements(coll
, &ch
, 1, status
);
6404 uprv_memset(coll
->latinOneCEs
, 0, sizeof(uint32_t)*coll
->latinOneTableLen
*3);
6406 int32_t primShift
= 24, secShift
= 24, terShift
= 24;
6408 int32_t contractionOffset
= UCOL_ENDOFLATINONERANGE
+1;
6410 // TODO: make safe if you get more than you wanted...
6411 for(ch
= 0; ch
<= UCOL_ENDOFLATINONERANGE
; ch
++) {
6412 primShift
= 24; secShift
= 24; terShift
= 24;
6414 CE
= coll
->latinOneMapping
[ch
];
6416 CE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, ch
);
6417 if(CE
== UCOL_NOT_FOUND
&& coll
->UCA
) {
6418 CE
= UTRIE_GET32_FROM_LEAD(&coll
->UCA
->mapping
, ch
);
6421 if(CE
< UCOL_NOT_FOUND
) {
6422 ucol_addLatinOneEntry(coll
, ch
, CE
, &primShift
, &secShift
, &terShift
);
6424 switch (getCETag(CE
)) {
6427 ucol_setText(it
, &ch
, 1, status
);
6428 while((int32_t)(CE
= ucol_next(it
, status
)) != UCOL_NULLORDER
) {
6429 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
6430 coll
->latinOneCEs
[ch
] = UCOL_BAIL_OUT_CE
;
6431 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6432 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6435 ucol_addLatinOneEntry(coll
, ch
, CE
, &primShift
, &secShift
, &terShift
);
6438 case CONTRACTION_TAG
:
6439 // here is the trick
6440 // F2 is contraction. We do something very similar to contractions
6441 // but have two indices, one in the real contraction table and the
6442 // other to where we stuffed things. This hopes that we don't have
6443 // many contractions (this should work for latin-1 tables).
6445 if((CE
& 0x00FFF000) != 0) {
6446 *status
= U_UNSUPPORTED_ERROR
;
6447 goto cleanup_after_failure
;
6450 const UChar
*UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
6452 CE
|= (contractionOffset
& 0xFFF) << 12; // insert the offset in latin-1 table
6454 coll
->latinOneCEs
[ch
] = CE
;
6455 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = CE
;
6456 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = CE
;
6458 // We're going to jump into contraction table, pick the elements
6461 CE
= *(coll
->contractionCEs
+
6462 (UCharOffset
- coll
->contractionIndex
));
6463 if(CE
> UCOL_NOT_FOUND
&& getCETag(CE
) == EXPANSION_TAG
) {
6465 uint32_t i
; /* general counter */
6466 uint32_t *CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
6467 size
= getExpansionCount(CE
);
6469 if(size
!= 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6470 for(i
= 0; i
<size
; i
++) {
6471 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
6472 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6473 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6474 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6477 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
, *CEOffset
++, &primShift
, &secShift
, &terShift
);
6479 } else { /* else, we do */
6480 while(*CEOffset
!= 0) {
6481 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
6482 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6483 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6484 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6487 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
, *CEOffset
++, &primShift
, &secShift
, &terShift
);
6490 contractionOffset
++;
6491 } else if(CE
< UCOL_NOT_FOUND
) {
6492 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
++, CE
, &primShift
, &secShift
, &terShift
);
6494 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6495 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6496 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6497 contractionOffset
++;
6500 primShift
= 24; secShift
= 24; terShift
= 24;
6501 if(contractionOffset
== coll
->latinOneTableLen
) { // we need to reallocate
6502 if(!ucol_resizeLatinOneTable(coll
, 2*coll
->latinOneTableLen
, status
)) {
6503 goto cleanup_after_failure
;
6506 } while(*UCharOffset
!= 0xFFFF);
6510 goto cleanup_after_failure
;
6515 if(contractionOffset
< coll
->latinOneTableLen
) {
6516 if(!ucol_resizeLatinOneTable(coll
, contractionOffset
, status
)) {
6517 goto cleanup_after_failure
;
6520 ucol_closeElements(it
);
6523 cleanup_after_failure
:
6524 // status should already be set before arriving here.
6525 coll
->latinOneFailed
= TRUE
;
6526 ucol_closeElements(it
);
/*
 * Recomputes the collator fields that are derived from its attributes:
 * caseSwitch, the tertiary mask/common/addition/top/bottom values, the
 * tertiary compression counts, the sort key generator (sortKeyGen) and
 * the Latin-1 fast path flag (latinOneUse).
 * All of this is done only when *status indicates success on entry.
 */
6530 void ucol_updateInternalState(UCollator
*coll
, UErrorCode
*status
) {
6531 if(U_SUCCESS(*status
)) {
6532 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
6533 coll
->caseSwitch
= UCOL_CASE_SWITCH
;
6535 coll
->caseSwitch
= UCOL_NO_CASE_SWITCH
;
/* This condition selects the UCOL_REMOVE_CASE tertiary settings; */
/* the alternative settings further down use UCOL_KEEP_CASE instead. */
6538 if(coll
->caseLevel
== UCOL_ON
|| coll
->caseFirst
== UCOL_OFF
) {
6539 coll
->tertiaryMask
= UCOL_REMOVE_CASE
;
6540 coll
->tertiaryCommon
= UCOL_COMMON3_NORMAL
;
6541 coll
->tertiaryAddition
= UCOL_FLAG_BIT_MASK_CASE_SW_OFF
;
6542 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_OFF
;
6543 coll
->tertiaryBottom
= UCOL_COMMON_BOT3
;
6545 coll
->tertiaryMask
= UCOL_KEEP_CASE
;
6546 coll
->tertiaryAddition
= UCOL_FLAG_BIT_MASK_CASE_SW_ON
;
/* With case kept, the common/top/bottom values depend on caseFirst. */
6547 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
6548 coll
->tertiaryCommon
= UCOL_COMMON3_UPPERFIRST
;
6549 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_UPPER
;
6550 coll
->tertiaryBottom
= UCOL_COMMON_BOTTOM3_CASE_SW_UPPER
;
6552 coll
->tertiaryCommon
= UCOL_COMMON3_NORMAL
;
6553 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_LOWER
;
6554 coll
->tertiaryBottom
= UCOL_COMMON_BOTTOM3_CASE_SW_LOWER
;
6558 /* Set the compression values */
/* tertiaryTotal is split into a top share (UCOL_PROPORTION3 of it) */
/* and a bottom share (the remainder). */
6559 uint8_t tertiaryTotal
= (uint8_t)(coll
->tertiaryTop
- UCOL_COMMON_BOT3
-1);
6560 coll
->tertiaryTopCount
= (uint8_t)(UCOL_PROPORTION3
*tertiaryTotal
); /* we multiply double with int, but need only int */
6561 coll
->tertiaryBottomCount
= (uint8_t)(tertiaryTotal
- coll
->tertiaryTopCount
);
/* Sort key generator: the simple-tertiary variant is used only for */
/* tertiary strength with case level off, French collation off and */
/* non-ignorable alternate handling; otherwise the general one. */
6563 if(coll
->caseLevel
== UCOL_OFF
&& coll
->strength
== UCOL_TERTIARY
6564 && coll
->frenchCollation
== UCOL_OFF
&& coll
->alternateHandling
== UCOL_NON_IGNORABLE
) {
6565 coll
->sortKeyGen
= ucol_calcSortKeySimpleTertiary
;
6567 coll
->sortKeyGen
= ucol_calcSortKey
;
/* Latin-1 fast path: considered only for these attribute settings */
/* and only when latinOneFailed is not set. */
6569 if(coll
->caseLevel
== UCOL_OFF
&& coll
->strength
<= UCOL_TERTIARY
&& coll
->numericCollation
== UCOL_OFF
6570 && coll
->alternateHandling
== UCOL_NON_IGNORABLE
&& !coll
->latinOneFailed
) {
/* (Re)build the table when there is none yet or a regeneration was requested. */
6571 if(coll
->latinOneCEs
== NULL
|| coll
->latinOneRegenTable
) {
6572 if(ucol_setUpLatinOne(coll
, status
)) { // if we succeed in building latin1 table, we'll use it
6573 //fprintf(stderr, "F");
6574 coll
->latinOneUse
= TRUE
;
6576 coll
->latinOneUse
= FALSE
;
/* U_UNSUPPORTED_ERROR is cleared here instead of being reported to the caller. */
6578 if(*status
== U_UNSUPPORTED_ERROR
) {
6579 *status
= U_ZERO_ERROR
;
6581 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6582 coll
->latinOneUse
= TRUE
;
6585 coll
->latinOneUse
= FALSE
;
/*
 * Sets the collator's variable top from the string varTop (len UChars).
 * The string is fed to a collation iterator and the first CE it yields
 * supplies the new value: bits (CE & UCOL_PRIMARYMASK)>>16 are stored in
 * coll->variableTopValue, and CE & UCOL_PRIMARYMASK is returned.
 * Error codes written to *status in this function:
 *   U_ILLEGAL_ARGUMENT_ERROR  - argument validation failed
 *   U_CE_NOT_FOUND_ERROR      - the first CE did not consume the whole
 *                               string, or no CE was produced
 *   U_PRIMARY_TOO_LONG_ERROR  - the next CE is a continuation that
 *                               carries a non-zero primary
 */
6590 U_CAPI
uint32_t U_EXPORT2
6591 ucol_setVariableTop(UCollator
*coll
, const UChar
*varTop
, int32_t len
, UErrorCode
*status
) {
6592 if(U_FAILURE(*status
) || coll
== NULL
) {
/* The length is computed from the string itself here. */
6596 len
= u_strlen(varTop
);
6599 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6604 IInit_collIterate(coll
, varTop
, len
, &s
);
6606 uint32_t CE
= ucol_IGetNextCE(coll
, &s
, status
);
6608 /* here we check if we have consumed all characters */
6609 /* you can put in either one character or a contraction */
6610 /* you shouldn't put more... */
6611 if(s
.pos
!= s
.endp
|| CE
== UCOL_NO_MORE_CES
) {
6612 *status
= U_CE_NOT_FOUND_ERROR
;
/* Look at the following CE: a continuation with a primary part means */
/* the primary does not fit in a single CE. */
6616 uint32_t nextCE
= ucol_IGetNextCE(coll
, &s
, status
);
6618 if(isContinuation(nextCE
) && (nextCE
& UCOL_PRIMARYMASK
) != 0) {
6619 *status
= U_PRIMARY_TOO_LONG_ERROR
;
/* A changed value also clears the "is default" marker. */
6622 if(coll
->variableTopValue
!= (CE
& UCOL_PRIMARYMASK
)>>16) {
6623 coll
->variableTopValueisDefault
= FALSE
;
6624 coll
->variableTopValue
= (CE
& UCOL_PRIMARYMASK
)>>16;
6627 return CE
& UCOL_PRIMARYMASK
;
/*
 * Returns the collator's variable top as a 32-bit value:
 * coll->variableTopValue shifted left by 16 bits.
 * A failing *status or a NULL coll is handled by the guard at the top.
 */
6630 U_CAPI
uint32_t U_EXPORT2
ucol_getVariableTop(const UCollator
*coll
, UErrorCode
*status
) {
6631 if(U_FAILURE(*status
) || coll
== NULL
) {
6634 return coll
->variableTopValue
<<16;
/*
 * Stores a variable top given as a 32-bit value (same layout as the value
 * returned by ucol_getVariableTop): (varTop & UCOL_PRIMARYMASK)>>16 becomes
 * coll->variableTopValue. When this differs from the current value the
 * variableTopValueisDefault flag is cleared.
 * A failing *status or a NULL coll is handled by the guard at the top.
 */
6637 U_CAPI
void U_EXPORT2
6638 ucol_restoreVariableTop(UCollator
*coll
, const uint32_t varTop
, UErrorCode
*status
) {
6639 if(U_FAILURE(*status
) || coll
== NULL
) {
6643 if(coll
->variableTopValue
!= (varTop
& UCOL_PRIMARYMASK
)>>16) {
6644 coll
->variableTopValueisDefault
= FALSE
;
6645 coll
->variableTopValue
= (varTop
& UCOL_PRIMARYMASK
)>>16;
6648 /* Attribute setter API */
/*
 * Sets attribute attr of coll to value.
 * For every attribute an explicit value is stored and the matching
 * "...isDefault" flag is cleared; UCOL_DEFAULT instead copies the value
 * from coll->options and sets the flag. A value that the attribute does
 * not accept, or an unknown attribute, sets *status to
 * U_ILLEGAL_ARGUMENT_ERROR.
 * Afterwards latinOneRegenTable records whether French collation or
 * caseFirst changed, and ucol_updateInternalState() is called to refresh
 * the derived state.
 */
6649 U_CAPI
void U_EXPORT2
6650 ucol_setAttribute(UCollator
*coll
, UColAttribute attr
, UColAttributeValue value
, UErrorCode
*status
) {
6651 if(U_FAILURE(*status
) || coll
== NULL
) {
/* Remember the two attributes whose change forces a Latin-1 table rebuild. */
6654 UColAttributeValue oldFrench
= coll
->frenchCollation
;
6655 UColAttributeValue oldCaseFirst
= coll
->caseFirst
;
6657 case UCOL_NUMERIC_COLLATION
: /* sort substrings of digits as numbers */
6658 if(value
== UCOL_ON
) {
6659 coll
->numericCollation
= UCOL_ON
;
6660 coll
->numericCollationisDefault
= FALSE
;
6661 } else if (value
== UCOL_OFF
) {
6662 coll
->numericCollation
= UCOL_OFF
;
6663 coll
->numericCollationisDefault
= FALSE
;
6664 } else if (value
== UCOL_DEFAULT
) {
6665 coll
->numericCollationisDefault
= TRUE
;
6666 coll
->numericCollation
= (UColAttributeValue
)coll
->options
->numericCollation
;
6668 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6671 case UCOL_HIRAGANA_QUATERNARY_MODE
: /* special quaternary values for Hiragana */
6672 if(value
== UCOL_ON
) {
6673 coll
->hiraganaQ
= UCOL_ON
;
6674 coll
->hiraganaQisDefault
= FALSE
;
6675 } else if (value
== UCOL_OFF
) {
6676 coll
->hiraganaQ
= UCOL_OFF
;
6677 coll
->hiraganaQisDefault
= FALSE
;
6678 } else if (value
== UCOL_DEFAULT
) {
6679 coll
->hiraganaQisDefault
= TRUE
;
6680 coll
->hiraganaQ
= (UColAttributeValue
)coll
->options
->hiraganaQ
;
6682 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6685 case UCOL_FRENCH_COLLATION
: /* attribute for direction of secondary weights*/
6686 if(value
== UCOL_ON
) {
6687 coll
->frenchCollation
= UCOL_ON
;
6688 coll
->frenchCollationisDefault
= FALSE
;
6689 } else if (value
== UCOL_OFF
) {
6690 coll
->frenchCollation
= UCOL_OFF
;
6691 coll
->frenchCollationisDefault
= FALSE
;
6692 } else if (value
== UCOL_DEFAULT
) {
6693 coll
->frenchCollationisDefault
= TRUE
;
6694 coll
->frenchCollation
= (UColAttributeValue
)coll
->options
->frenchCollation
;
6696 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6699 case UCOL_ALTERNATE_HANDLING
: /* attribute for handling variable elements*/
6700 if(value
== UCOL_SHIFTED
) {
6701 coll
->alternateHandling
= UCOL_SHIFTED
;
6702 coll
->alternateHandlingisDefault
= FALSE
;
6703 } else if (value
== UCOL_NON_IGNORABLE
) {
6704 coll
->alternateHandling
= UCOL_NON_IGNORABLE
;
6705 coll
->alternateHandlingisDefault
= FALSE
;
6706 } else if (value
== UCOL_DEFAULT
) {
6707 coll
->alternateHandlingisDefault
= TRUE
;
6708 coll
->alternateHandling
= (UColAttributeValue
)coll
->options
->alternateHandling
;
6710 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6713 case UCOL_CASE_FIRST
: /* who goes first, lower case or uppercase */
6714 if(value
== UCOL_LOWER_FIRST
) {
6715 coll
->caseFirst
= UCOL_LOWER_FIRST
;
6716 coll
->caseFirstisDefault
= FALSE
;
6717 } else if (value
== UCOL_UPPER_FIRST
) {
6718 coll
->caseFirst
= UCOL_UPPER_FIRST
;
6719 coll
->caseFirstisDefault
= FALSE
;
6720 } else if (value
== UCOL_OFF
) {
6721 coll
->caseFirst
= UCOL_OFF
;
6722 coll
->caseFirstisDefault
= FALSE
;
6723 } else if (value
== UCOL_DEFAULT
) {
6724 coll
->caseFirst
= (UColAttributeValue
)coll
->options
->caseFirst
;
6725 coll
->caseFirstisDefault
= TRUE
;
6727 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6730 case UCOL_CASE_LEVEL
: /* do we have an extra case level */
6731 if(value
== UCOL_ON
) {
6732 coll
->caseLevel
= UCOL_ON
;
6733 coll
->caseLevelisDefault
= FALSE
;
6734 } else if (value
== UCOL_OFF
) {
6735 coll
->caseLevel
= UCOL_OFF
;
6736 coll
->caseLevelisDefault
= FALSE
;
6737 } else if (value
== UCOL_DEFAULT
) {
6738 coll
->caseLevel
= (UColAttributeValue
)coll
->options
->caseLevel
;
6739 coll
->caseLevelisDefault
= TRUE
;
6741 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6744 case UCOL_NORMALIZATION_MODE
: /* attribute for normalization */
6745 if(value
== UCOL_ON
) {
6746 coll
->normalizationMode
= UCOL_ON
;
6747 coll
->normalizationModeisDefault
= FALSE
;
6748 } else if (value
== UCOL_OFF
) {
6749 coll
->normalizationMode
= UCOL_OFF
;
6750 coll
->normalizationModeisDefault
= FALSE
;
6751 } else if (value
== UCOL_DEFAULT
) {
6752 coll
->normalizationModeisDefault
= TRUE
;
6753 coll
->normalizationMode
= (UColAttributeValue
)coll
->options
->normalizationMode
;
6755 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6758 case UCOL_STRENGTH
: /* attribute for strength */
/* Unlike the other attributes, strength is validated only by the */
/* upper bound check against UCOL_IDENTICAL below. */
6759 if (value
== UCOL_DEFAULT
) {
6760 coll
->strengthisDefault
= TRUE
;
6761 coll
->strength
= (UColAttributeValue
)coll
->options
->strength
;
6762 } else if (value
<= UCOL_IDENTICAL
) {
6763 coll
->strengthisDefault
= FALSE
;
6764 coll
->strength
= value
;
6766 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6769 case UCOL_ATTRIBUTE_COUNT
:
6771 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
/* Request a Latin-1 table rebuild only if French collation or caseFirst changed. */
6774 if(oldFrench
!= coll
->frenchCollation
|| oldCaseFirst
!= coll
->caseFirst
) {
6775 coll
->latinOneRegenTable
= TRUE
;
6777 coll
->latinOneRegenTable
= FALSE
;
6779 ucol_updateInternalState(coll
, status
);
/*
 * Returns the current value of attribute attr, read directly from the
 * corresponding field of coll.
 * Returns UCOL_DEFAULT when *status already indicates failure or coll is
 * NULL. For an attribute that is not handled, *status is set to
 * U_ILLEGAL_ARGUMENT_ERROR and UCOL_DEFAULT is returned.
 */
6782 U_CAPI UColAttributeValue U_EXPORT2
6783 ucol_getAttribute(const UCollator
*coll
, UColAttribute attr
, UErrorCode
*status
) {
6784 if(U_FAILURE(*status
) || coll
== NULL
) {
6785 return UCOL_DEFAULT
;
6788 case UCOL_NUMERIC_COLLATION
:
6789 return coll
->numericCollation
;
6790 case UCOL_HIRAGANA_QUATERNARY_MODE
:
6791 return coll
->hiraganaQ
;
6792 case UCOL_FRENCH_COLLATION
: /* attribute for direction of secondary weights*/
6793 return coll
->frenchCollation
;
6794 case UCOL_ALTERNATE_HANDLING
: /* attribute for handling variable elements*/
6795 return coll
->alternateHandling
;
6796 case UCOL_CASE_FIRST
: /* who goes first, lower case or uppercase */
6797 return coll
->caseFirst
;
6798 case UCOL_CASE_LEVEL
: /* do we have an extra case level */
6799 return coll
->caseLevel
;
6800 case UCOL_NORMALIZATION_MODE
: /* attribute for normalization */
6801 return coll
->normalizationMode
;
6802 case UCOL_STRENGTH
: /* attribute for strength */
6803 return coll
->strength
;
6804 case UCOL_ATTRIBUTE_COUNT
:
6806 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6809 return UCOL_DEFAULT
;
/*
 * Convenience wrapper: sets the UCOL_STRENGTH attribute through
 * ucol_setAttribute(). The error code lives in a local variable, so this
 * void function has no way to report a failure to its caller.
 */
6812 U_CAPI
void U_EXPORT2
6813 ucol_setStrength( UCollator
*coll
,
6814 UCollationStrength strength
)
6816 UErrorCode status
= U_ZERO_ERROR
;
6817 ucol_setAttribute(coll
, UCOL_STRENGTH
, strength
, &status
);
/*
 * Convenience wrapper: returns the UCOL_STRENGTH attribute as obtained
 * from ucol_getAttribute(). The error code is kept in a local variable
 * and is not passed back to the caller.
 */
6820 U_CAPI UCollationStrength U_EXPORT2
6821 ucol_getStrength(const UCollator
*coll
)
6823 UErrorCode status
= U_ZERO_ERROR
;
6824 return ucol_getAttribute(coll
, UCOL_STRENGTH
, &status
);
6827 /****************************************************************************/
6828 /* Following are misc functions */
6829 /* there are new APIs and some compatibility APIs */
6830 /****************************************************************************/
/*
 * Fills versionInfo with the collator's version:
 *   [0],[1] - high and low byte of a 16-bit value combining the runtime
 *             version (shifted left 11), the builder version taken from
 *             coll->image->version[0] (shifted left 6) and the charset
 *             version (currently always 0)
 *   [2]     - coll->image->version[1]
 *   [3]     - UCAVersion[0] of the UCA collator's image
 */
6832 U_CAPI
void U_EXPORT2
6833 ucol_getVersion(const UCollator
* coll
,
6834 UVersionInfo versionInfo
)
6836 /* RunTime version */
6837 uint8_t rtVersion
= UCOL_RUNTIME_VERSION
;
6838 /* Builder version*/
6839 uint8_t bdVersion
= coll
->image
->version
[0];
6841 /* Charset Version. Need to get the version from cnv files
6842 * makeconv should populate cnv files with version and
6843 * an api has to be provided in ucnv.h to obtain this version
6845 uint8_t csVersion
= 0;
6847 /* combine the version info */
6848 uint16_t cmbVersion
= (uint16_t)((rtVersion
<<11) | (bdVersion
<<6) | (csVersion
));
6850 /* Tailoring rules */
6851 versionInfo
[0] = (uint8_t)(cmbVersion
>>8);
6852 versionInfo
[1] = (uint8_t)cmbVersion
;
6853 versionInfo
[2] = coll
->image
->version
[1];
6855 versionInfo
[3] = coll
->UCA
->image
->UCAVersion
[0];
6862 /* This internal API checks whether a character is tailored or not */
/*
 * The CE for u is looked up in this collator: code units below 0x100 are
 * read from latinOneMapping (and compared with the UCA's entry for the
 * same code unit), all others from the mapping trie. If the CE is a
 * contraction, the CE stored for the start of that contraction is used
 * instead. The final decision is based on comparing the CE with
 * UCOL_NOT_FOUND.
 */
6863 U_CAPI UBool U_EXPORT2
6864 ucol_isTailored(const UCollator
*coll
, const UChar u
, UErrorCode
*status
) {
6865 uint32_t CE
= UCOL_NOT_FOUND
;
6866 const UChar
*ContractionStart
= NULL
;
6867 if(U_SUCCESS(*status
) && coll
!= NULL
) {
/* The UCA collator itself is treated as a special case. */
6868 if(coll
== coll
->UCA
) {
6870 } else if(u
< 0x100) { /* latin-1 */
6871 CE
= coll
->latinOneMapping
[u
];
6872 if(coll
->UCA
&& CE
== coll
->UCA
->latinOneMapping
[u
]) {
6875 } else { /* regular */
6876 CE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, u
);
/* For a contraction, switch to the CE stored at the start of its table. */
6879 if(isContraction(CE
)) {
6880 ContractionStart
= (UChar
*)coll
->image
+getContractOffset(CE
);
6881 CE
= *(coll
->contractionCEs
+ (ContractionStart
- coll
->contractionIndex
));
6884 if(CE
== UCOL_NOT_FOUND
) {
6895 /****************************************************************************/
6896 /* Following are the string compare functions */
6898 /****************************************************************************/
6901 /* ucol_checkIdent internal function. Does byte level string compare. */
6902 /* Used by strcoll if strength == identical and strings */
6903 /* are otherwise equal. Moved out-of-line because this */
6904 /* is a rare case. */
6906 /* Comparison must be done on NFD normalized strings. */
6907 /* FCD is not good enough. */
6909 /* TODO: make an incremental NFD Comparison function, which could */
6910 /* be of general use */
/* */
/* Two paths are visible below: */
/*  - iterator input (UCOL_USE_ITERATOR): both iterators are rewound, */
/*    wrapped in NFD normalizing iterators and compared with */
/*    u_strCompareIter(); */
/*  - string input: each string is decomposed to NFD into the */
/*    collIterate's writableBuffer when unorm_quickCheck() does not */
/*    answer UNORM_YES, then the buffers are compared in code point */
/*    order. */
/* The sign of the comparison is mapped to a UCollationResult at the end. */
6913 UCollationResult
ucol_checkIdent(collIterate
*sColl
, collIterate
*tColl
, UBool normalize
, UErrorCode
*status
)
6916 // TODO: When we have an UChar iterator, we need to access the whole string. One
6917 // useful modification would be a UChar iterator extract API, since reset next next...
6919 // TODO: Handle long strings. Do the same in compareUsingSortKeys.
6921 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
6922 // of same type, but that doesn't really mean that it will stay that way.
6924 // The division for the array length may truncate the array size to
6925 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6926 // for all platforms anyway.
6927 UAlignedMemory stackNormIter1
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
6928 UAlignedMemory stackNormIter2
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
6929 //UChar sStackBuf[256], tStackBuf[256];
6930 //int32_t sBufSize = 256, tBufSize = 256;
6936 UBool freeSBuf
= FALSE
, freeTBuf
= FALSE
;
// Iterator input: compare through NFD normalizing iterators backed by the stack storage above.
6938 if (sColl
->flags
& UCOL_USE_ITERATOR
) {
6939 UNormIterator
*sNIt
= NULL
, *tNIt
= NULL
;
6940 sNIt
= unorm_openIter(stackNormIter1
, sizeof(stackNormIter1
), status
);
6941 tNIt
= unorm_openIter(stackNormIter2
, sizeof(stackNormIter2
), status
);
6942 sColl
->iterator
->move(sColl
->iterator
, 0, UITER_START
);
6943 tColl
->iterator
->move(tColl
->iterator
, 0, UITER_START
);
6944 UCharIterator
*sIt
= unorm_setIter(sNIt
, sColl
->iterator
, UNORM_NFD
, status
);
6945 UCharIterator
*tIt
= unorm_setIter(tNIt
, tColl
->iterator
, UNORM_NFD
, status
);
6946 comparison
= u_strCompareIter(sIt
, tIt
, TRUE
);
6947 unorm_closeIter(sNIt
);
6948 unorm_closeIter(tNIt
);
// String input: a length of -1 is used when the iterator carries no explicit length.
6950 sLen
= (sColl
->flags
& UCOL_ITER_HASLEN
) ? sColl
->endp
- sColl
->string
: -1;
6951 sBuf
= sColl
->string
;
6952 tLen
= (tColl
->flags
& UCOL_ITER_HASLEN
) ? tColl
->endp
- tColl
->string
: -1;
6953 tBuf
= tColl
->string
;
// Source string: decompose only if the quick check does not confirm NFD.
6956 *status
= U_ZERO_ERROR
;
6957 if (unorm_quickCheck(sBuf
, sLen
, UNORM_NFD
, status
) != UNORM_YES
) {
6958 sLen
= unorm_decompose(sColl
->writableBuffer
, (int32_t)sColl
->writableBufSize
,
// On overflow the writable buffer is grown and the decomposition repeated.
6962 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
6963 if(!u_growBufferFromStatic(sColl
->stackWritableBuffer
,
6964 &sColl
->writableBuffer
,
6965 (int32_t *)&sColl
->writableBufSize
, sLen
,
6968 *status
= U_MEMORY_ALLOCATION_ERROR
;
6969 return UCOL_LESS
; /* *status was set to U_MEMORY_ALLOCATION_ERROR just above */
6971 *status
= U_ZERO_ERROR
;
6972 sLen
= unorm_decompose(sColl
->writableBuffer
, (int32_t)sColl
->writableBufSize
,
// A buffer other than the stack one is flagged as allocated on the iterator.
6981 sBuf
= sColl
->writableBuffer
;
6982 if (sBuf
!= sColl
->stackWritableBuffer
) {
6983 sColl
->flags
|= UCOL_ITER_ALLOCATED
;
// Target string: same procedure as for the source string.
6987 *status
= U_ZERO_ERROR
;
6988 if (unorm_quickCheck(tBuf
, tLen
, UNORM_NFD
, status
) != UNORM_YES
) {
6989 tLen
= unorm_decompose(tColl
->writableBuffer
, (int32_t)tColl
->writableBufSize
,
6993 if(*status
== U_BUFFER_OVERFLOW_ERROR
) {
6994 if(!u_growBufferFromStatic(tColl
->stackWritableBuffer
,
6995 &tColl
->writableBuffer
,
6996 (int32_t *)&tColl
->writableBufSize
, tLen
,
6999 *status
= U_MEMORY_ALLOCATION_ERROR
;
7000 return UCOL_LESS
; /* *status was set to U_MEMORY_ALLOCATION_ERROR just above */
7002 *status
= U_ZERO_ERROR
;
7003 tLen
= unorm_decompose(tColl
->writableBuffer
, (int32_t)tColl
->writableBufSize
,
7012 tBuf
= tColl
->writableBuffer
;
7013 if (tBuf
!= tColl
->stackWritableBuffer
) {
7014 tColl
->flags
|= UCOL_ITER_ALLOCATED
;
// Both lengths unknown: compare as NUL-terminated strings. Otherwise compare the
// common prefix and let the length difference break a tie.
7019 if (sLen
== -1 && tLen
== -1) {
7020 comparison
= u_strcmpCodePointOrder(sBuf
, tBuf
);
7023 sLen
= u_strlen(sBuf
);
7026 tLen
= u_strlen(tBuf
);
7028 comparison
= u_memcmpCodePointOrder(sBuf
, tBuf
, uprv_min(sLen
, tLen
));
7029 if (comparison
== 0) {
7030 comparison
= sLen
- tLen
;
// Map the sign of the comparison to a UCollationResult.
7035 if (comparison
< 0) {
7037 } else if (comparison
== 0) {
7039 } else /* comparison > 0 */ {
7040 return UCOL_GREATER
;
7044 /* CEBuf - A struct and some inline functions to handle the saving */
7045 /* of CEs in a buffer within ucol_strcoll */
/* UCOL_CEBUF_SIZE is the number of uint32_t CEs in the built-in array (localArray). */
7047 #define UCOL_CEBUF_SIZE 512
7048 typedef struct ucol_CEBuf
{
7052 uint32_t localArray
[UCOL_CEBUF_SIZE
];
/*
 * Initializes a CE buffer: buf and pos both point at the start of the
 * built-in localArray, and endp points UCOL_CEBUF_SIZE entries past buf.
 */
7057 inline void UCOL_INIT_CEBUF(ucol_CEBuf
*b
) {
7058 (b
)->buf
= (b
)->pos
= (b
)->localArray
;
7059 (b
)->endp
= (b
)->buf
+ UCOL_CEBUF_SIZE
;
/*
 * Grows a CE buffer. The new size is twice the number of entries between
 * buf and pos; the new storage comes from uprv_malloc() and the existing
 * entries are copied into it. When the allocation succeeds, endp and pos
 * are re-based on the buffer. The iterator ci is flagged with
 * UCOL_ITER_ALLOCATED before the allocation is attempted.
 */
7063 void ucol_CEBuf_Expand(ucol_CEBuf
*b
, collIterate
*ci
) {
7068 ci
->flags
|= UCOL_ITER_ALLOCATED
;
7069 oldSize
= b
->pos
- b
->buf
;
7070 newSize
= oldSize
* 2;
7071 newBuf
= (uint32_t *)uprv_malloc(newSize
* sizeof(uint32_t));
7072 if(newBuf
!= NULL
) {
7073 uprv_memcpy(newBuf
, b
->buf
, oldSize
* sizeof(uint32_t));
/* The built-in localArray is distinguished from an earlier heap buffer here. */
7074 if (b
->buf
!= b
->localArray
) {
7078 b
->endp
= b
->buf
+ newSize
;
7079 b
->pos
= b
->buf
+ oldSize
;
/*
 * Stores one CE in the buffer. When the write position has reached the
 * end of the buffer (pos == endp), ucol_CEBuf_Expand() is called first.
 */
7084 inline void UCOL_CEBUF_PUT(ucol_CEBuf
*b
, uint32_t ce
, collIterate
*ci
) {
7085 if (b
->pos
== b
->endp
) {
7086 ucol_CEBuf_Expand(b
, ci
);
7091 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7092 /* It is used when compare gets in trouble and needs to bail out */
/* */
/* Sort keys for both inputs are first generated into stack buffers of */
/* UCOL_MAX_BUFFER bytes. If ucol_getSortKey() reports a longer key, a heap */
/* buffer of that length is allocated and the key is generated again. The */
/* two keys are compared with uprv_strcmp() and heap buffers are released */
/* at the cleanup label. */
7093 static UCollationResult
ucol_compareUsingSortKeys(collIterate
*sColl
,
7097 uint8_t sourceKey
[UCOL_MAX_BUFFER
], targetKey
[UCOL_MAX_BUFFER
];
7098 uint8_t *sourceKeyP
= sourceKey
;
7099 uint8_t *targetKeyP
= targetKey
;
7100 int32_t sourceKeyLen
= UCOL_MAX_BUFFER
, targetKeyLen
= UCOL_MAX_BUFFER
;
7101 const UCollator
*coll
= sColl
->coll
;
7102 UChar
*source
= NULL
;
7103 UChar
*target
= NULL
;
7104 int32_t result
= UCOL_EQUAL
;
7105 UChar sStackBuf
[256], tStackBuf
[256];
7106 int32_t sourceLength
= (sColl
->flags
&UCOL_ITER_HASLEN
)?(sColl
->endp
-sColl
->string
):-1;
7107 int32_t targetLength
= (tColl
->flags
&UCOL_ITER_HASLEN
)?(tColl
->endp
-tColl
->string
):-1;
7109 // TODO: Handle long strings. Do the same in ucol_checkIdent.
// Iterator input: rewind both iterators and copy their contents into UChar buffers.
7110 if(sColl
->flags
& UCOL_USE_ITERATOR
) {
7111 sColl
->iterator
->move(sColl
->iterator
, 0, UITER_START
);
7112 tColl
->iterator
->move(tColl
->iterator
, 0, UITER_START
);
7114 UChar
*sBufp
= source
;
7116 UChar
*tBufp
= target
;
// NOTE(review): no bound on the number of UChars written is visible in these copy
// loops - confirm against the buffer size (see the TODO about long strings above).
7117 while(sColl
->iterator
->hasNext(sColl
->iterator
)) {
7118 *sBufp
++ = (UChar
)sColl
->iterator
->next(sColl
->iterator
);
7120 while(tColl
->iterator
->hasNext(tColl
->iterator
)) {
7121 *tBufp
++ = (UChar
)tColl
->iterator
->next(tColl
->iterator
);
7123 sourceLength
= sBufp
- source
;
7124 targetLength
= tBufp
- target
;
7125 } else { // no iterators
7126 sourceLength
= (sColl
->flags
&UCOL_ITER_HASLEN
)?(sColl
->endp
-sColl
->string
):-1;
7127 targetLength
= (tColl
->flags
&UCOL_ITER_HASLEN
)?(tColl
->endp
-tColl
->string
):-1;
7128 source
= sColl
->string
;
7129 target
= tColl
->string
;
// Source key: try the stack buffer first, fall back to a heap buffer of the reported length.
7134 sourceKeyLen
= ucol_getSortKey(coll
, source
, sourceLength
, sourceKeyP
, sourceKeyLen
);
7135 if(sourceKeyLen
> UCOL_MAX_BUFFER
) {
7136 sourceKeyP
= (uint8_t*)uprv_malloc(sourceKeyLen
*sizeof(uint8_t));
7137 if(sourceKeyP
== NULL
) {
7138 *status
= U_MEMORY_ALLOCATION_ERROR
;
7139 goto cleanup_and_do_compare
;
7141 sourceKeyLen
= ucol_getSortKey(coll
, source
, sourceLength
, sourceKeyP
, sourceKeyLen
);
// Target key: same procedure as for the source key.
7144 targetKeyLen
= ucol_getSortKey(coll
, target
, targetLength
, targetKeyP
, targetKeyLen
);
7145 if(targetKeyLen
> UCOL_MAX_BUFFER
) {
7146 targetKeyP
= (uint8_t*)uprv_malloc(targetKeyLen
*sizeof(uint8_t));
7147 if(targetKeyP
== NULL
) {
7148 *status
= U_MEMORY_ALLOCATION_ERROR
;
7149 goto cleanup_and_do_compare
;
7151 targetKeyLen
= ucol_getSortKey(coll
, target
, targetLength
, targetKeyP
, targetKeyLen
);
7154 result
= uprv_strcmp((const char*)sourceKeyP
, (const char*)targetKeyP
);
// Only heap-allocated key buffers are freed; the stack arrays are left alone.
7156 cleanup_and_do_compare
:
7157 if(sourceKeyP
!= NULL
&& sourceKeyP
!= sourceKey
) {
7158 uprv_free(sourceKeyP
);
7161 if(targetKeyP
!= NULL
&& targetKeyP
!= targetKey
) {
7162 uprv_free(targetKeyP
);
7167 } else if(result
>0) {
7168 return UCOL_GREATER
;
7175 static inline UCollationResult
7176 ucol_strcollRegular( collIterate
*sColl
, collIterate
*tColl
,
7177 // const UCollator *coll,
7178 // const UChar *source,
7179 // int32_t sourceLength,
7180 // const UChar *target,
7181 // int32_t targetLength,
7186 const UCollator
*coll
= sColl
->coll
;
7189 // setting up the collator parameters
7190 UColAttributeValue strength
= coll
->strength
;
7191 UBool initialCheckSecTer
= (strength
>= UCOL_SECONDARY
);
7193 UBool checkSecTer
= initialCheckSecTer
;
7194 UBool checkTertiary
= (strength
>= UCOL_TERTIARY
);
7195 UBool checkQuad
= (strength
>= UCOL_QUATERNARY
);
7196 UBool checkIdent
= (strength
== UCOL_IDENTICAL
);
7197 UBool checkCase
= (coll
->caseLevel
== UCOL_ON
);
7198 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && checkSecTer
;
7199 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
7200 UBool qShifted
= shifted
&& checkQuad
;
7201 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && checkQuad
;
7203 if(doHiragana
&& shifted
) {
7204 return (ucol_compareUsingSortKeys(sColl
, tColl
, status
));
7206 uint8_t caseSwitch
= coll
->caseSwitch
;
7207 uint8_t tertiaryMask
= coll
->tertiaryMask
;
7209 // This is the lowest primary value that will not be ignored if shifted
7210 uint32_t LVT
= (shifted
)?(coll
->variableTopValue
<<16):0;
7212 UCollationResult result
= UCOL_EQUAL
;
7213 UCollationResult hirResult
= UCOL_EQUAL
;
7215 // Preparing the CE buffers. They will be filled during the primary phase
7218 UCOL_INIT_CEBUF(&sCEs
);
7219 UCOL_INIT_CEBUF(&tCEs
);
7221 uint32_t secS
= 0, secT
= 0;
7222 uint32_t sOrder
=0, tOrder
=0;
7224 // Non shifted primary processing is quite simple
7228 // We fetch CEs until we hit a non ignorable primary or end.
7230 // We get the next CE
7231 sOrder
= ucol_IGetNextCE(coll
, sColl
, status
);
7232 // Stuff it in the buffer
7233 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7234 // And keep just the primary part.
7235 sOrder
&= UCOL_PRIMARYMASK
;
7236 } while(sOrder
== 0);
7238 // see the comments on the above block
7240 tOrder
= ucol_IGetNextCE(coll
, tColl
, status
);
7241 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7242 tOrder
&= UCOL_PRIMARYMASK
;
7243 } while(tOrder
== 0);
7245 // if both primaries are the same
7246 if(sOrder
== tOrder
) {
7247 // and there are no more CEs, we advance to the next level
7248 if(sOrder
== UCOL_NO_MORE_CES_PRIMARY
) {
7251 if(doHiragana
&& hirResult
== UCOL_EQUAL
) {
7252 if((sColl
->flags
& UCOL_WAS_HIRAGANA
) != (tColl
->flags
& UCOL_WAS_HIRAGANA
)) {
7253 hirResult
= ((sColl
->flags
& UCOL_WAS_HIRAGANA
) > (tColl
->flags
& UCOL_WAS_HIRAGANA
))
7254 ? UCOL_LESS
:UCOL_GREATER
;
7258 // if two primaries are different, we are done
7259 result
= (sOrder
< tOrder
) ? UCOL_LESS
: UCOL_GREATER
;
7262 } // no primary difference... do the rest from the buffers
7263 } else { // shifted - do a slightly more complicated processing :)
7265 UBool sInShifted
= FALSE
;
7266 UBool tInShifted
= FALSE
;
7267 // This version of code can be refactored. However, it seems easier to understand this way.
7268 // Source loop. Same as the target loop.
7270 sOrder
= ucol_IGetNextCE(coll
, sColl
, status
);
7271 if(sOrder
== UCOL_NO_MORE_CES
) {
7272 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7274 } else if(sOrder
== 0
7275 || (sInShifted
&& (sOrder
& UCOL_PRIMARYMASK
) == 0)) {
7276 /* UCA amendment - ignore ignorables that follow shifted code points */
7278 } else if(isContinuation(sOrder
)) {
7279 if((sOrder
& UCOL_PRIMARYMASK
) > 0) { /* There is primary value */
7281 sOrder
= (sOrder
& UCOL_PRIMARYMASK
) | 0xC0; /* preserve interesting continuation */
7282 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7285 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7288 } else { /* Just lower level values */
7292 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7296 } else { /* regular */
7297 if((sOrder
& UCOL_PRIMARYMASK
) > LVT
) {
7298 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7301 if((sOrder
& UCOL_PRIMARYMASK
) > 0) {
7303 sOrder
&= UCOL_PRIMARYMASK
;
7304 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7307 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
);
7314 sOrder
&= UCOL_PRIMARYMASK
;
7318 tOrder
= ucol_IGetNextCE(coll
, tColl
, status
);
7319 if(tOrder
== UCOL_NO_MORE_CES
) {
7320 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7322 } else if(tOrder
== 0
7323 || (tInShifted
&& (tOrder
& UCOL_PRIMARYMASK
) == 0)) {
7324 /* UCA amendment - ignore ignorables that follow shifted code points */
7326 } else if(isContinuation(tOrder
)) {
7327 if((tOrder
& UCOL_PRIMARYMASK
) > 0) { /* There is primary value */
7329 tOrder
= (tOrder
& UCOL_PRIMARYMASK
) | 0xC0; /* preserve interesting continuation */
7330 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7333 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7336 } else { /* Just lower level values */
7340 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7344 } else { /* regular */
7345 if((tOrder
& UCOL_PRIMARYMASK
) > LVT
) {
7346 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7349 if((tOrder
& UCOL_PRIMARYMASK
) > 0) {
7351 tOrder
&= UCOL_PRIMARYMASK
;
7352 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7355 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
);
7362 tOrder
&= UCOL_PRIMARYMASK
;
7365 if(sOrder
== tOrder
) {
7367 if(doHiragana && hirResult == UCOL_EQUAL) {
7368 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7369 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7370 ? UCOL_LESS:UCOL_GREATER;
7374 if(sOrder
== UCOL_NO_MORE_CES_PRIMARY
) {
7377 sOrder
= 0; tOrder
= 0;
7381 result
= (sOrder
< tOrder
) ? UCOL_LESS
: UCOL_GREATER
;
7384 } /* no primary difference... do the rest from the buffers */
7387 /* now, we're gonna reexamine collected CEs */
7391 /* This is the secondary level of comparison */
7393 if(!isFrenchSec
) { /* normal */
7398 secS
= *(sCE
++) & UCOL_SECONDARYMASK
;
7402 secT
= *(tCE
++) & UCOL_SECONDARYMASK
;
7406 if(secS
== UCOL_NO_MORE_CES_SECONDARY
) {
7413 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7417 } else { /* do the French */
7418 uint32_t *sCESave
= NULL
;
7419 uint32_t *tCESave
= NULL
;
7420 sCE
= sCEs
.pos
-2; /* this could also be sCEs-- if needs to be optimized */
7423 while (secS
== 0 && sCE
>= sCEs
.buf
) {
7426 if(isContinuation(secS
)) {
7427 while(isContinuation(secS
= *(sCE
--)));
7428 /* after this, secS has the start of continuation, and sCEs points before that */
7429 sCESave
= sCE
; /* we save it, so that we know where to come back AND that we need to go forward */
7430 sCE
+=2; /* need to point to the first continuation CP */
7431 /* However, now you can just continue doing stuff */
7435 if(!isContinuation(secS
)) { /* This means we have finished with this cont */
7436 sCE
= sCESave
; /* reset the pointer to before continuation */
7441 secS
&= UCOL_SECONDARYMASK
; /* remove the continuation bit */
7444 while(secT
== 0 && tCE
>= tCEs
.buf
) {
7447 if(isContinuation(secT
)) {
7448 while(isContinuation(secT
= *(tCE
--)));
7449 /* after this, secT has the start of continuation, and tCE points before that */
7450 tCESave
= tCE
; /* we save it, so that we know where to come back AND that we need to go forward */
7451 tCE
+=2; /* need to point to the first continuation CP */
7452 /* However, now you can just continue doing stuff */
7456 if(!isContinuation(secT
)) { /* This means we have finished with this cont */
7457 tCE
= tCESave
; /* reset the pointer to before continuation */
7462 secT
&= UCOL_SECONDARYMASK
; /* remove the continuation bit */
7466 if(secS
== UCOL_NO_MORE_CES_SECONDARY
|| (sCE
< sCEs
.buf
&& tCE
< tCEs
.buf
)) {
7473 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7480 /* doing the case bit */
7485 while((secS
& UCOL_REMOVE_CASE
) == 0) {
7486 if(!isContinuation(*sCE
++)) {
7488 if(((secS
& UCOL_PRIMARYMASK
) != 0) || strength
> UCOL_PRIMARY
) {
7489 // primary ignorables should not be considered on the case level when the strength is primary
7490 // otherwise, the CEs stop being well-formed
7491 secS
&= UCOL_TERT_CASE_MASK
;
7501 while((secT
& UCOL_REMOVE_CASE
) == 0) {
7502 if(!isContinuation(*tCE
++)) {
7504 if(((secT
& UCOL_PRIMARYMASK
) != 0) || strength
> UCOL_PRIMARY
) {
7505 // primary ignorables should not be considered on the case level when the strength is primary
7506 // otherwise, the CEs stop being well-formed
7507 secT
&= UCOL_TERT_CASE_MASK
;
7517 if((secS
& UCOL_CASE_BIT_MASK
) < (secT
& UCOL_CASE_BIT_MASK
)) {
7520 } else if((secS
& UCOL_CASE_BIT_MASK
) > (secT
& UCOL_CASE_BIT_MASK
)) {
7521 result
= UCOL_GREATER
;
7525 if((secS
& UCOL_REMOVE_CASE
) == UCOL_NO_MORE_CES_TERTIARY
|| (secT
& UCOL_REMOVE_CASE
) == UCOL_NO_MORE_CES_TERTIARY
) {
7534 /* Tertiary level */
7541 while((secS
& UCOL_REMOVE_CASE
) == 0) {
7542 secS
= *(sCE
++) & tertiaryMask
;
7543 if(!isContinuation(secS
)) {
7546 secS
&= UCOL_REMOVE_CASE
;
7550 while((secT
& UCOL_REMOVE_CASE
) == 0) {
7551 secT
= *(tCE
++) & tertiaryMask
;
7552 if(!isContinuation(secT
)) {
7555 secT
&= UCOL_REMOVE_CASE
;
7560 if((secS
& UCOL_REMOVE_CASE
) == 1) {
7567 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7574 if(qShifted
/*checkQuad*/) {
7575 UBool sInShifted
= TRUE
;
7576 UBool tInShifted
= TRUE
;
7582 while(secS
== 0 && secS
!= UCOL_NO_MORE_CES
|| (isContinuation(secS
) && !sInShifted
)) {
7584 if(isContinuation(secS
)) {
7588 } else if(secS
> LVT
|| (secS
& UCOL_PRIMARYMASK
) == 0) { /* non continuation */
7589 secS
= UCOL_PRIMARYMASK
;
7595 secS
&= UCOL_PRIMARYMASK
;
7598 while(secT
== 0 && secT
!= UCOL_NO_MORE_CES
|| (isContinuation(secT
) && !tInShifted
)) {
7600 if(isContinuation(secT
)) {
7604 } else if(secT
> LVT
|| (secT
& UCOL_PRIMARYMASK
) == 0) {
7605 secT
= UCOL_PRIMARYMASK
;
7611 secT
&= UCOL_PRIMARYMASK
;
7614 if(secS
== UCOL_NO_MORE_CES_PRIMARY
) {
7621 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7625 } else if(doHiragana
&& hirResult
!= UCOL_EQUAL
) {
7626 // If we're fine on quaternaries, we might be different
7627 // on Hiragana. This, however, might fail us in shifted.
7632 /* For IDENTICAL comparisons, we use a bitwise character comparison */
7633 /* as a tiebreaker if all else is equal. */
7634 /* Getting here should be quite rare - strings are not identical - */
7635 /* that is checked first, but compared == through all other checks. */
7638 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7639 result
= ucol_checkIdent(sColl
, tColl
, TRUE
, status
);
7643 if ((sColl
->flags
| tColl
->flags
) & UCOL_ITER_ALLOCATED
) {
7644 freeHeapWritableBuffer(sColl
);
7645 freeHeapWritableBuffer(tColl
);
7647 if (sCEs
.buf
!= sCEs
.localArray
) {
7648 uprv_free(sCEs
.buf
);
7650 if (tCEs
.buf
!= tCEs
.localArray
) {
7651 uprv_free(tCEs
.buf
);
/*
 * Resolves a contraction "special" CE encountered on the Latin-1 fast path.
 *
 * coll      collator; its image, latinOneCEs, latinOneTableLen and mapping
 *           members are read here.
 * strength  selects the slice of coll->latinOneCEs to read from (the index is
 *           strength * latinOneTableLen + ...). Callers in this file pass
 *           UCOL_PRIMARY, UCOL_SECONDARY or UCOL_TERTIARY.
 * CE        the special CE: the low 12 bits are handed to getContractOffset(),
 *           bits 12..23 give the base index (latinOneOffset) into latinOneCEs.
 * s, index  the string being collated and the current position in it.
 * len       length of s; the lines that use it are not visible in this
 *           extract -- presumably a bounds check, confirm against the full file.
 *
 * Returns an entry of coll->latinOneCEs (the base entry when the string ends
 * or nothing matches, base + offset when the next code unit matches a
 * contraction code unit), or UCOL_BAIL_OUT_CE when the next code unit has
 * bits set in 0xFF00 (i.e. is outside Latin-1).
 */
7659 static inline uint32_t
7660 ucol_getLatinOneContraction(const UCollator
*coll
, int32_t strength
,
7661 uint32_t CE
, const UChar
*s
, int32_t *index
, int32_t len
) {
// Start of the contraction code units for this CE inside the collator image.
7662 const UChar
*UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
&0xFFF);
// Base index of this contraction's results inside one strength slice of latinOneCEs.
7663 int32_t latinOneOffset
= (CE
& 0x00FFF000) >> 12;
7665 UChar schar
= 0, tchar
= 0;
7669 if(s
[*index
] == 0) { // end of string
7670 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
// Second "no more input" exit; its guarding condition is not visible in this extract.
7676 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
7682 while(schar
> (tchar
= *(UCharOffset
+offset
))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
// Exact match: the result lives "offset" entries after the base entry.
7686 if (schar
== tchar
) {
7688 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
+offset
]);
// Anything with bits in the high byte is outside Latin-1: signal the caller to bail out.
7692 if(schar
& 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7693 return UCOL_BAIL_OUT_CE
;
7695 // skip completely ignorables
7696 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, schar
);
7697 if(isZeroCE
== 0) { // we have to ignore completely ignorables
// No contraction matched: fall back to the base entry.
7702 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
7709 * This is a fast strcoll, geared towards text in Latin-1.
7710 * It supports contractions of size two, French secondaries
7711 * and case switching. You can use it with strengths primary
7712 * to tertiary. It does not support shifted and case level.
7713 * It relies on the table built by setupLatin1Table. If it
7714 * doesn't understand something, it will go to the regular
/*
 * Latin-1 fast-path comparison of source and target.
 *
 * The visible code works level by level on the precomputed table
 * coll->latinOneCEs, which holds one slice of latinOneTableLen entries per
 * strength:
 *   - primary:   walks both strings forward, skipping zero (ignorable)
 *                entries, resolving contraction specials through
 *                ucol_getLatinOneContraction and comparing the orders one
 *                top byte at a time;
 *   - secondary: same walk over the next slice, or a backward walk when
 *                coll->frenchCollation is not UCOL_OFF (contractions are not
 *                looked up in the backward walk);
 *   - tertiary:  same forward walk over the following slice.
 * Any code unit with bits set in 0xFF00, or any special that is not a
 * resolvable contraction, leaves the fast path; the function then ends by
 * initializing two collIterate objects and calling ucol_strcollRegular.
 *
 * sLen, tLen and status are used in the body; their declarations fall on
 * lines that are not visible in this extract.
 */
7717 static inline UCollationResult
7718 ucol_strcollUseLatin1( const UCollator
*coll
,
7719 const UChar
*source
,
7721 const UChar
*target
,
7726 int32_t strength
= coll
->strength
;
7728 int32_t sIndex
= 0, tIndex
= 0;
7729 UChar sChar
= 0, tChar
= 0;
7730 uint32_t sOrder
=0, tOrder
=0;
7732 UBool endOfSource
= FALSE
;
// "elements" points at the slice of the table for the level being compared;
// it starts at the primary slice and is advanced by latinOneTableLen per level.
7734 uint32_t *elements
= coll
->latinOneCEs
;
7736 UBool haveContractions
= FALSE
; // if we have contractions in our string
7737 // we cannot do French secondary
7739 // Do the primary level
7741 while(sOrder
==0) { // this loop skips primary ignorables
7742 // sOrder=getNextlatinOneCE(source);
7743 if(sLen
==-1) { // handling zero terminated strings
7744 sChar
=source
[sIndex
++];
7749 } else { // handling strings with known length
7754 sChar
=source
[sIndex
++];
7756 if(sChar
&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7757 //fprintf(stderr, "R");
7759 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7761 sOrder
= elements
[sChar
];
7762 if(sOrder
>= UCOL_NOT_FOUND
) { // if we got a special
7763 // specials can basically be either contractions or bail-out signs. If we get anything
7764 // else, we'll bail out anyway
7765 if(getCETag(sOrder
) == CONTRACTION_TAG
) {
7766 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_PRIMARY
, sOrder
, source
, &sIndex
, sLen
);
7767 haveContractions
= TRUE
; // if there are contractions, we cannot do French secondary
7768 // However, if there are contractions in the table, but we always use just one char,
7769 // we might be able to do French. This should be checked out.
// Still a special after contraction handling: the fast path cannot deal with it.
7771 if(sOrder
>= UCOL_NOT_FOUND
/*== UCOL_BAIL_OUT_CE*/) {
7772 //fprintf(stderr, "S");
7774 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7779 while(tOrder
==0) { // this loop skips primary ignorables
7780 // tOrder=getNextlatinOneCE(target);
7781 if(tLen
==-1) { // handling zero terminated strings
7782 tChar
=target
[tIndex
++];
7784 if(endOfSource
) { // this is different than source loop,
7785 // as we already know that source loop is done here,
7786 // so we can either finish the primary loop if both
7787 // strings are done or announce the result if only
7788 // target is done. Same below.
7791 return UCOL_GREATER
;
7794 } else { // handling strings with known length
7799 return UCOL_GREATER
;
7802 tChar
=target
[tIndex
++];
7804 if(tChar
&0xFF00) { // if we encounter non-latin-1, we bail out (tChar > 0xFF, but this is faster on win32)
7805 //fprintf(stderr, "R");
7807 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7809 tOrder
= elements
[tChar
];
7810 if(tOrder
>= UCOL_NOT_FOUND
) {
7811 // Handling specials, see the comments for source
7812 if(getCETag(tOrder
) == CONTRACTION_TAG
) {
7813 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_PRIMARY
, tOrder
, target
, &tIndex
, tLen
);
7814 haveContractions
= TRUE
;
7816 if(tOrder
>= UCOL_NOT_FOUND
/*== UCOL_BAIL_OUT_CE*/) {
7817 //fprintf(stderr, "S");
7819 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7823 if(endOfSource
) { // source is finished, but target is not, say the result.
7827 if(sOrder
== tOrder
) { // if we have same CEs, we continue the loop
7828 sOrder
= 0; tOrder
= 0;
7831 // compare current top bytes
7832 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
7833 // top bytes differ, return difference
7834 if(sOrder
< tOrder
) {
7836 } else if(sOrder
> tOrder
) {
7837 return UCOL_GREATER
;
7839 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
7840 // since we must return enum value
7843 // top bytes match, continue with following bytes
7850 // after primary loop, we definitely know the sizes of strings,
7851 // so we set it and use simpler loop for secondaries and tertiaries
7852 sLen
= sIndex
; tLen
= tIndex
;
7853 if(strength
>= UCOL_SECONDARY
) {
7854 // adjust the table beginning
7855 elements
+= coll
->latinOneTableLen
;
7856 endOfSource
= FALSE
;
7858 if(coll
->frenchCollation
== UCOL_OFF
) { // non French
7859 // This loop is a simplified copy of primary loop
7860 // at this point we know that whole strings are latin-1, so we don't
7861 // check for that. We also know that we only have contractions as
7863 sIndex
= 0; tIndex
= 0;
7870 sChar
=source
[sIndex
++];
7871 sOrder
= elements
[sChar
];
// NOTE(review): this level tests "> UCOL_NOT_FOUND" while the primary level
// tests ">= UCOL_NOT_FOUND" -- confirm the difference is intended.
7872 if(sOrder
> UCOL_NOT_FOUND
) {
7873 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_SECONDARY
, sOrder
, source
, &sIndex
, sLen
);
7882 return UCOL_GREATER
;
7885 tChar
=target
[tIndex
++];
7886 tOrder
= elements
[tChar
];
7887 if(tOrder
> UCOL_NOT_FOUND
) {
7888 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_SECONDARY
, tOrder
, target
, &tIndex
, tLen
);
7895 if(sOrder
== tOrder
) {
7896 sOrder
= 0; tOrder
= 0;
7899 // see primary loop for comments on this
7900 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
7901 if(sOrder
< tOrder
) {
7903 } else if(sOrder
> tOrder
) {
7904 return UCOL_GREATER
;
7912 if(haveContractions
) { // if we have contractions, we have to bail out
7913 // since we don't really know how to handle them here
7915 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7917 // For French, we go backwards
7918 sIndex
= sLen
; tIndex
= tLen
;
7925 sChar
=source
[--sIndex
];
7926 sOrder
= elements
[sChar
];
7927 // don't even look for contractions
7935 return UCOL_GREATER
;
7938 tChar
=target
[--tIndex
];
7939 tOrder
= elements
[tChar
];
7940 // don't even look for contractions
7946 if(sOrder
== tOrder
) {
7947 sOrder
= 0; tOrder
= 0;
7950 // see the primary loop for comments
7951 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
7952 if(sOrder
< tOrder
) {
7954 } else if(sOrder
> tOrder
) {
7955 return UCOL_GREATER
;
7966 if(strength
>= UCOL_TERTIARY
) {
7967 // tertiary loop is the same as secondary (except no French)
7968 elements
+= coll
->latinOneTableLen
;
7969 sIndex
= 0; tIndex
= 0;
7970 endOfSource
= FALSE
;
7977 sChar
=source
[sIndex
++];
7978 sOrder
= elements
[sChar
];
7979 if(sOrder
> UCOL_NOT_FOUND
) {
7980 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_TERTIARY
, sOrder
, source
, &sIndex
, sLen
);
7986 return UCOL_EQUAL
; // if both strings are at the end, they are equal
7988 return UCOL_GREATER
;
7991 tChar
=target
[tIndex
++];
7992 tOrder
= elements
[tChar
];
7993 if(tOrder
> UCOL_NOT_FOUND
) {
7994 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_TERTIARY
, tOrder
, target
, &tIndex
, tLen
);
8000 if(sOrder
== tOrder
) {
8001 sOrder
= 0; tOrder
= 0;
8004 if(((sOrder
^tOrder
)&0xff000000)!=0) {
8005 if(sOrder
< tOrder
) {
8007 } else if(sOrder
> tOrder
) {
8008 return UCOL_GREATER
;
// Fallback when the fast path gave up: run the general comparison on
// freshly initialized iteration contexts.
8019 // Preparing the context objects for iterating over strings
8020 collIterate sColl
, tColl
;
8022 IInit_collIterate(coll
, source
, sLen
, &sColl
);
8023 IInit_collIterate(coll
, target
, tLen
, &tColl
);
8024 return ucol_strcollRegular(&sColl
, &tColl
, status
);
/*
 * Compares the text delivered by two UCharIterators using collator coll.
 *
 * coll    collator to use; NULL sets *status to U_ILLEGAL_ARGUMENT_ERROR.
 * sIter   iterator over the source text; NULL is an illegal argument.
 * tIter   iterator over the target text; NULL is an illegal argument.
 * status  in/out error code; nothing is compared when it is NULL or already
 *         holds a failure.
 *
 * Visible flow: identical iterator pointers are reported as UCOL_EQUAL; both
 * texts are then read in lock step while their code units are equal, the
 * iterators are moved back to a position from which collation can safely
 * restart, and ucol_strcollRegular produces the result. When the collator's
 * UCOL_NORMALIZATION_MODE attribute is UCOL_ON, both iterators are wrapped in
 * stack-based FCD normalizing iterators, which are closed before returning.
 */
8028 U_CAPI UCollationResult U_EXPORT2
8029 ucol_strcollIter( const UCollator
*coll
,
8030 UCharIterator
*sIter
,
8031 UCharIterator
*tIter
,
8032 UErrorCode
*status
) {
8033 if(!status
|| U_FAILURE(*status
)) {
8037 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER
);
8038 UTRACE_DATA3(UTRACE_VERBOSE
, "coll=%p, sIter=%p, tIter=%p", coll
, sIter
, tIter
);
// The same iterator object on both sides trivially compares equal.
8040 if (sIter
== tIter
) {
8041 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL
, *status
)
8044 if(sIter
== NULL
|| tIter
== NULL
|| coll
== NULL
) {
8045 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
8046 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL
, *status
)
8050 UCollationResult result
= UCOL_EQUAL
;
8052 // Preparing the context objects for iterating over strings
8053 collIterate sColl
, tColl
;
8054 // The division for the array length may truncate the array size to
8055 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8056 // for all platforms anyway.
8057 UAlignedMemory stackNormIter1
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
8058 UAlignedMemory stackNormIter2
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
8059 UNormIterator
*sNormIter
= NULL
, *tNormIter
= NULL
;
// The contexts are initialized without a string (NULL, -1) and then pointed
// at the caller's iterators.
8061 IInit_collIterate(coll
, NULL
, -1, &sColl
);
8062 sColl
.iterator
= sIter
;
8063 sColl
.flags
|= UCOL_USE_ITERATOR
;
8064 IInit_collIterate(coll
, NULL
, -1, &tColl
);
8065 tColl
.flags
|= UCOL_USE_ITERATOR
;
8066 tColl
.iterator
= tIter
;
// With normalization on, read through FCD normalizing iterators built in the
// stack buffers above and clear UCOL_ITER_NORM on the contexts.
8068 if(ucol_getAttribute(coll
, UCOL_NORMALIZATION_MODE
, status
) == UCOL_ON
) {
8069 sNormIter
= unorm_openIter(stackNormIter1
, sizeof(stackNormIter1
), status
);
8070 sColl
.iterator
= unorm_setIter(sNormIter
, sIter
, UNORM_FCD
, status
);
8071 sColl
.flags
&= ~UCOL_ITER_NORM
;
8073 tNormIter
= unorm_openIter(stackNormIter2
, sizeof(stackNormIter2
), status
);
8074 tColl
.iterator
= unorm_setIter(tNormIter
, tIter
, UNORM_FCD
, status
);
8075 tColl
.flags
&= ~UCOL_ITER_NORM
;
8078 UChar32 sChar
= U_SENTINEL
, tChar
= U_SENTINEL
;
// Consume the common prefix; reaching U_SENTINEL on both sides at once means
// the texts are identical.
8080 while((sChar
= sColl
.iterator
->next(sColl
.iterator
)) ==
8081 (tChar
= tColl
.iterator
->next(tColl
.iterator
))) {
8082 if(sChar
== U_SENTINEL
) {
8083 result
= UCOL_EQUAL
;
// Step back over the code unit(s) that ended the prefix scan so they are
// seen again by the collation loop.
8088 if(sChar
== U_SENTINEL
) {
8089 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8092 if(tChar
== U_SENTINEL
) {
8093 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8096 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8097 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8099 if (ucol_unsafeCP((UChar
)sChar
, coll
) || ucol_unsafeCP((UChar
)tChar
, coll
))
8101 // We are stopped in the middle of a contraction.
8102 // Scan backwards through the == part of the string looking for the start of the contraction.
8103 // It doesn't matter which string we scan, since they are the same in this region.
8106 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8107 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8109 while (sChar
!= U_SENTINEL
&& ucol_unsafeCP((UChar
)sChar
, coll
));
8113 if(U_SUCCESS(*status
)) {
8114 result
= ucol_strcollRegular(&sColl
, &tColl
, status
);
// Both normalizing iterators are closed whenever either one was opened.
8118 if(sNormIter
|| tNormIter
) {
8119 unorm_closeIter(sNormIter
);
8120 unorm_closeIter(tNormIter
);
8123 UTRACE_EXIT_VALUE_STATUS(result
, *status
)
8130 /* ucol_strcoll Main public API string comparison function */
/*
 * coll          collator to compare with.
 * source        first string; NULL makes the function return early.
 * sourceLength  length of source, or -1 when it is zero terminated.
 * target        second string; NULL makes the function return early.
 * targetLength  length of target, or -1 when it is zero terminated.
 *
 * Visible flow: the leading portion on which both strings are bitwise equal
 * is measured first (returning UCOL_EQUAL when that covers both strings
 * completely); the prefix is then shortened while it ends on a code unit
 * that ucol_unsafeCP reports as unsafe, and skipped. The remainder is compared
 * by ucol_strcollUseLatin1 when coll->latinOneUse is set and neither remainder
 * starts with a code unit above 0xFF, otherwise by ucol_strcollRegular.
 * There is no status parameter; a local UErrorCode is used internally.
 */
8132 U_CAPI UCollationResult U_EXPORT2
8133 ucol_strcoll( const UCollator
*coll
,
8134 const UChar
*source
,
8135 int32_t sourceLength
,
8136 const UChar
*target
,
8137 int32_t targetLength
) {
8140 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL
);
8141 if (UTRACE_LEVEL(UTRACE_VERBOSE
)) {
8142 UTRACE_DATA3(UTRACE_VERBOSE
, "coll=%p, source=%p, target=%p", coll
, source
, target
);
8143 UTRACE_DATA2(UTRACE_VERBOSE
, "source string = %vh ", source
, sourceLength
);
8144 UTRACE_DATA2(UTRACE_VERBOSE
, "target string = %vh ", target
, targetLength
);
8147 UErrorCode status
= U_ZERO_ERROR
;
8148 if(source
== NULL
|| target
== NULL
) {
8149 // do not crash, but return. Should have
8150 // status argument to return error.
// NOTE(review): the value traced here is the function id UTRACE_UCOL_STRCOLL,
// not a UCollationResult -- confirm against the return that follows.
8151 UTRACE_EXIT_VALUE(UTRACE_UCOL_STRCOLL
);
8154 collIterate sColl
, tColl
;
8156 /* Scan the strings. Find: */
8157 /* The length of any leading portion that is equal */
8158 /* Whether they are exactly equal. (in which case we just return) */
8159 const UChar
*pSrc
= source
;
8160 const UChar
*pTarg
= target
;
8161 int32_t equalLength
;
8163 if (sourceLength
== -1 && targetLength
== -1) {
8164 // Both strings are null terminated.
8165 // Check for them being the same string, and scan through
8166 // any leading equal portion.
8167 if (source
==target
) {
8168 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8173 if ( *pSrc
!= *pTarg
|| *pSrc
== 0) {
8179 if (*pSrc
== 0 && *pTarg
== 0) {
8180 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8183 equalLength
= pSrc
- source
;
8187 // One or both strings has an explicit length.
8188 /* check if source and target are same strings */
8190 if (source
==target
&& sourceLength
==targetLength
) {
8191 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
// For a length of -1 these end pointers lie before the string start; the
// checks below rely on that ("pSrcEnd < pSrc") to detect the zero-terminated case.
8194 const UChar
*pSrcEnd
= source
+ sourceLength
;
8195 const UChar
*pTargEnd
= target
+ targetLength
;
8198 // Scan while the strings are bitwise ==, or until one is exhausted.
8200 if (pSrc
== pSrcEnd
|| pTarg
== pTargEnd
) {
8203 if ((*pSrc
== 0 && sourceLength
== -1) || (*pTarg
== 0 && targetLength
== -1)) {
8206 if (*pSrc
!= *pTarg
) {
8212 equalLength
= pSrc
- source
;
8214 // If we made it all the way through both strings, we are done. They are ==
8215 if ((pSrc
==pSrcEnd
|| (pSrcEnd
<pSrc
&& *pSrc
==0)) && /* At end of src string, however it was specified. */
8216 (pTarg
==pTargEnd
|| (pTargEnd
<pTarg
&& *pTarg
==0))) { /* and also at end of dest string */
8217 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8221 if (equalLength
> 0) {
8222 /* There is an identical portion at the beginning of the two strings. */
8223 /* If the identical portion ends within a contraction or a combining */
8224 /* character sequence, back up to the start of that sequence. */
8225 pSrc
= source
+ equalLength
; /* point to the first differing chars */
8226 pTarg
= target
+ equalLength
;
8227 if (pSrc
!= source
+sourceLength
&& ucol_unsafeCP(*pSrc
, coll
) ||
8228 pTarg
!= target
+targetLength
&& ucol_unsafeCP(*pTarg
, coll
))
8230 // We are stopped in the middle of a contraction.
8231 // Scan backwards through the == part of the string looking for the start of the contraction.
8232 // It doesn't matter which string we scan, since they are the same in this region.
8238 while (equalLength
>0 && ucol_unsafeCP(*pSrc
, coll
));
// Drop the (possibly shortened) common prefix; explicit lengths shrink with it,
// -1 lengths are left untouched.
8241 source
+= equalLength
;
8242 target
+= equalLength
;
8243 if (sourceLength
> 0) {
8244 sourceLength
-= equalLength
;
8246 if (targetLength
> 0) {
8247 targetLength
-= equalLength
;
8251 UCollationResult returnVal
;
// General path unless the Latin-1 table is usable and neither remainder is
// known to start with a code unit above 0xFF.
8252 if(!coll
->latinOneUse
|| (sourceLength
> 0 && *source
&0xff00) || (targetLength
> 0 && *target
&0xff00)) {
8253 // Preparing the context objects for iterating over strings
8254 IInit_collIterate(coll
, source
, sourceLength
, &sColl
);
8255 IInit_collIterate(coll
, target
, targetLength
, &tColl
);
8256 returnVal
= ucol_strcollRegular(&sColl
, &tColl
, &status
);
8258 returnVal
= ucol_strcollUseLatin1(coll
, source
, sourceLength
, target
, targetLength
, &status
);
8260 UTRACE_EXIT_VALUE(returnVal
);
8264 /* convenience function for comparing strings */
/*
 * Forwards all five arguments unchanged to ucol_strcoll and derives a UBool
 * from its result. The comparison applied to that result is on a line not
 * visible in this extract -- presumably "== UCOL_GREATER"; confirm.
 */
8265 U_CAPI UBool U_EXPORT2
8266 ucol_greater( const UCollator
*coll
,
8267 const UChar
*source
,
8268 int32_t sourceLength
,
8269 const UChar
*target
,
8270 int32_t targetLength
)
8272 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
8276 /* convenience function for comparing strings */
/*
 * Forwards all five arguments unchanged to ucol_strcoll and derives a UBool
 * from its result. The comparison applied to that result is on a line not
 * visible in this extract -- presumably "!= UCOL_LESS"; confirm.
 */
8277 U_CAPI UBool U_EXPORT2
8278 ucol_greaterOrEqual( const UCollator
*coll
,
8279 const UChar
*source
,
8280 int32_t sourceLength
,
8281 const UChar
*target
,
8282 int32_t targetLength
)
8284 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
8288 /* convenience function for comparing strings */
/*
 * Forwards all five arguments unchanged to ucol_strcoll and derives a UBool
 * from its result. The comparison applied to that result is on a line not
 * visible in this extract -- presumably "== UCOL_EQUAL"; confirm.
 */
8289 U_CAPI UBool U_EXPORT2
8290 ucol_equal( const UCollator
*coll
,
8291 const UChar
*source
,
8292 int32_t sourceLength
,
8293 const UChar
*target
,
8294 int32_t targetLength
)
8296 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
/*
 * Copies the UCA version recorded in the collator's UCA image into info.
 *
 * coll  collator to query; when it is NULL or has no UCA attached, the
 *       visible code copies nothing and info is left as the caller passed it.
 * info  destination; sizeof(UVersionInfo) bytes are written.
 */
8300 U_CAPI
void U_EXPORT2
8301 ucol_getUCAVersion(const UCollator
* coll
, UVersionInfo info
) {
8302 if(coll
&& coll
->UCA
) {
8303 uprv_memcpy(info
, coll
->UCA
->image
->UCAVersion
, sizeof(UVersionInfo
));
/*
 * Writes a binary image of the collator into a caller-supplied buffer.
 *
 * coll      collator to serialize.
 * buffer    destination buffer.
 * capacity  size of buffer in bytes.
 * status    in/out error code (declared on a line not visible in this
 *           extract); set to U_BUFFER_OVERFLOW_ERROR when the required length
 *           exceeds capacity. An U_ILLEGAL_ARGUMENT_ERROR exit is also visible;
 *           its triggering condition is not.
 *
 * Two cases are visible:
 *  - coll->hasRealData is TRUE: the image (coll->image->size bytes) is copied
 *    verbatim;
 *  - otherwise: a minimal image is built, consisting of a zeroed
 *    UCATableHeader filled with identifying fields and followed by the
 *    collator's UColOptionSet.
 * "length" holds the required size in both cases; presumably it is the
 * return value -- the return statements are not visible here.
 */
8307 U_CAPI
int32_t U_EXPORT2
8308 ucol_cloneBinary(const UCollator
*coll
,
8309 uint8_t *buffer
, int32_t capacity
,
8313 if(U_FAILURE(*status
)) {
8317 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
8320 if(coll
->hasRealData
== TRUE
) {
8321 length
= coll
->image
->size
;
8322 if(length
<= capacity
) {
8323 uprv_memcpy(buffer
, coll
->image
, length
);
8325 *status
= U_BUFFER_OVERFLOW_ERROR
;
// No real data: only a header plus the option set is needed.
8328 length
= (int32_t)(paddedsize(sizeof(UCATableHeader
))+paddedsize(sizeof(UColOptionSet
)));
8329 if(length
<= capacity
) {
8330 /* build the UCATableHeader with minimal entries */
8331 /* do not copy the header from the UCA file because its values are wrong! */
8332 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
8334 /* reset everything */
8335 uprv_memset(buffer
, 0, length
);
8337 /* set the tailoring-specific values */
8338 UCATableHeader
*myData
= (UCATableHeader
*)buffer
;
8339 myData
->size
= length
;
8341 /* offset for the options, the only part of the data that is present after the header */
// NOTE(review): the offset recorded here is sizeof(UCATableHeader), while the
// options are copied below at paddedsize(sizeof(UCATableHeader)); the two only
// agree if the header size is already padded -- confirm.
8342 myData
->options
= sizeof(UCATableHeader
);
8344 /* need to always set the expansion value for an upper bound of the options */
8345 myData
->expansion
= myData
->options
+ sizeof(UColOptionSet
);
8347 myData
->magic
= UCOL_HEADER_MAGIC
;
8348 myData
->isBigEndian
= U_IS_BIG_ENDIAN
;
8349 myData
->charSetFamily
= U_CHARSET_FAMILY
;
8351 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
8352 uprv_memcpy(myData
->version
, coll
->image
->version
, sizeof(UVersionInfo
));
8354 uprv_memcpy(myData
->UCAVersion
, coll
->image
->UCAVersion
, sizeof(UVersionInfo
));
8355 uprv_memcpy(myData
->UCDVersion
, coll
->image
->UCDVersion
, sizeof(UVersionInfo
));
8356 uprv_memcpy(myData
->formatVersion
, coll
->image
->formatVersion
, sizeof(UVersionInfo
));
8357 myData
->jamoSpecial
= coll
->image
->jamoSpecial
;
8359 /* copy the collator options */
8360 uprv_memcpy(buffer
+paddedsize(sizeof(UCATableHeader
)), coll
->options
, sizeof(UColOptionSet
));
8362 *status
= U_BUFFER_OVERFLOW_ERROR
;
/*
 * Drops this file's reference to the UCA data memory by resetting the static
 * UCA_DATA_MEM pointer to NULL. Nothing is closed or freed on the visible
 * lines. Other statements of the body may be missing from this extract.
 */
8368 U_CAPI
void U_EXPORT2
8369 ucol_forgetUCA(void)
8372 UCA_DATA_MEM
= NULL
;
8375 #endif /* #if !UCONFIG_NO_COLLATION */