2 *******************************************************************************
3 * Copyright (C) 1996-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * tab size: 8 (not used)
11 * Modification history
13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_COLLATION
23 #include "unicode/coleitr.h"
24 #include "unicode/unorm.h"
25 #include "unicode/udata.h"
26 #include "unicode/ustring.h"
31 #include "normalizer2impl.h"
47 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
49 #define LAST_BYTE_MASK_ 0xFF
50 #define SECOND_LAST_BYTE_SHIFT_ 8
52 #define ZERO_CC_LIMIT_ 0xC0
54 // this is static pointer to the normalizer fcdTrieIndex
55 // it is always the same between calls to u_cleanup
56 // and therefore writing to it is not synchronized.
57 // It is cleaned in ucol_cleanup
58 static const uint16_t *fcdTrieIndex
=NULL
;
59 // Code points at fcdHighStart and above have a zero FCD value.
60 static UChar32 fcdHighStart
= 0;
62 // These are values from UCA required for
63 // implicit generation and suppressing sort key compression
64 // they should regularly be in the UCA, but if one
65 // is running without UCA, it could be a problem
66 static const int32_t maxRegularPrimary
= 0x7A;
67 static const int32_t minImplicitPrimary
= 0xE0;
68 static const int32_t maxImplicitPrimary
= 0xE4;
71 static UBool U_CALLCONV
// Returns the low 24 bits of the given data word as a signed 32-bit value.
// ucol_initCollator installs this function as mapping.getFoldingOffset.
// @param data 32-bit value to extract the offset from
// @return data masked with 0xFFFFFF
78 static int32_t U_CALLCONV
79 _getFoldingOffset(uint32_t data
) {
// The mask keeps bits 0..23 and drops the top byte.
80 return (int32_t)(data
&0xFFFFFF);
// Makes sure the file-scope fcdTrieIndex pointer is set up.
// Visible in this view: when fcdTrieIndex is already non-NULL the first
// branch is taken (its body is not shown here); otherwise the pointer is
// obtained from unorm_getFCDTrieIndex(), which is also handed fcdHighStart,
// ucol_cleanup is registered via ucln_i18n_registerCleanup(), and
// U_SUCCESS(*status) is returned.
// @param status in/out error code, passed on to unorm_getFCDTrieIndex()
// @return U_SUCCESS(*status) on the path that performs the lookup
87 UBool
initializeFCD(UErrorCode
*status
) {
88 if (fcdTrieIndex
!= NULL
) {
91 // The result is constant, until the library is reloaded.
92 fcdTrieIndex
= unorm_getFCDTrieIndex(fcdHighStart
, status
);
// Register ucol_cleanup under the UCLN_I18N_UCOL key.
93 ucln_i18n_registerCleanup(UCLN_I18N_UCOL
, ucol_cleanup
);
94 return U_SUCCESS(*status
);
// Initializes the collIterate s for iterating over sourceString with the
// given collator.
// Visible in this view:
//  - string and pos are both set to sourceString;
//  - when sourceLen >= 0, UCOL_ITER_HASLEN is set and endp points at
//    sourceString + sourceLen;
//  - the CE buffer pointers (CEpos, toReturn) are reset to the embedded CEs
//    array and the extension and offset buffers are cleared;
//  - nfd is fetched from Normalizer2Factory::getNFDInstance(*status);
//  - UCOL_ITER_NORM is set when normalizationMode is UCOL_ON, and
//    UCOL_HIRAGANA_Q when hiraganaQ is UCOL_ON at quaternary strength or above;
//  - iterator is cleared.
// NOTE(review): the declaration of the status parameter and the handling of
// sourceLen < 0 are not visible in this view.
99 inline void IInit_collIterate(const UCollator
*collator
, const UChar
*sourceString
,
100 int32_t sourceLen
, collIterate
*s
,
103 (s
)->string
= (s
)->pos
= sourceString
;
// A non-negative length means the text has an explicit end pointer.
106 if (sourceLen
>= 0) {
107 s
->flags
|= UCOL_ITER_HASLEN
;
108 (s
)->endp
= (UChar
*)sourceString
+sourceLen
;
111 /* change to enable easier checking for end of string for fcdPosition */
// Start with no heap extension for CEs; CEpos and toReturn point at s->CEs.
114 (s
)->extendCEs
= NULL
;
115 (s
)->extendCEsSize
= 0;
116 (s
)->CEpos
= (s
)->toReturn
= (s
)->CEs
;
// Offset bookkeeping starts out empty.
117 (s
)->offsetBuffer
= NULL
;
118 (s
)->offsetBufferSize
= 0;
119 (s
)->offsetReturn
= (s
)->offsetStore
= NULL
;
120 (s
)->offsetRepeatCount
= (s
)->offsetRepeatValue
= 0;
121 (s
)->coll
= (collator
);
122 (s
)->nfd
= Normalizer2Factory::getNFDInstance(*status
);
123 (s
)->fcdPosition
= 0;
// Mirror the relevant collator attributes into iterator flags.
124 if(collator
->normalizationMode
== UCOL_ON
) {
125 (s
)->flags
|= UCOL_ITER_NORM
;
127 if(collator
->hiraganaQ
== UCOL_ON
&& collator
->strength
>= UCOL_QUATERNARY
) {
128 (s
)->flags
|= UCOL_HIRAGANA_Q
;
130 (s
)->iterator
= NULL
;
131 //(s)->iteratorIndex = 0;
// Exported wrapper around IInit_collIterate(); all five arguments are
// forwarded unchanged.
// @param collator     collator passed through to IInit_collIterate
// @param sourceString text passed through to IInit_collIterate
// @param sourceLen    length passed through to IInit_collIterate
// @param s            iterator state passed through to IInit_collIterate
// @param status       error code passed through to IInit_collIterate
134 U_CAPI
void U_EXPORT2
135 uprv_init_collIterate(const UCollator
*collator
, const UChar
*sourceString
,
136 int32_t sourceLen
, collIterate
*s
,
137 UErrorCode
*status
) {
138 /* Out-of-line version for use from other files. */
139 IInit_collIterate(collator
, sourceString
, sourceLen
, s
, status
);
// Allocates a collIterate with operator new.
// Visible in this view: an early check of U_FAILURE(*status), the
// allocation itself, and an assignment of U_MEMORY_ALLOCATION_ERROR to
// *status.
// NOTE(review): the branch bodies, the condition guarding the error
// assignment and the return statements are not visible here - confirm
// against the full source.
// @param status in/out error code
142 U_CAPI collIterate
* U_EXPORT2
143 uprv_new_collIterate(UErrorCode
*status
) {
144 if(U_FAILURE(*status
)) {
147 collIterate
*s
= new collIterate
;
149 *status
= U_MEMORY_ALLOCATION_ERROR
;
// Exported function taking a collIterate pointer.
// NOTE(review): the body is not visible in this view; presumably it releases
// an object created by uprv_new_collIterate - confirm against the full source.
// @param s collIterate handed in by the caller
155 U_CAPI
void U_EXPORT2
156 uprv_delete_collIterate(collIterate
*s
) {
// Reports whether the iterator has reached its end pointer.
// @param s iterator state; may be NULL
// @return true when s is NULL or when s->pos equals s->endp
160 U_CAPI UBool U_EXPORT2
161 uprv_collIterateAtEnd(collIterate
*s
) {
// A NULL iterator is treated as being at the end.
162 return s
== NULL
|| s
->pos
== s
->endp
;
166 * Backup the state of the collIterate struct data
167 * @param data collIterate to backup
168 * @param backup storage
// Saves the current position of data into backup.
// Copied as-is: fcdPosition, flags, origFlags and pos, plus the address and
// length of writableBuffer. iteratorMove and iteratorIndex start at 0.
// When data->iterator is set, iteratorIndex is taken from getState(); while
// that yields UITER_NO_STATE the iterator is stepped back one unit at a time
// (counted in iteratorMove) and is finally moved forward again by the same
// count.
// @param data   iterator state to read from
// @param backup receives the saved state
171 inline void backupState(const collIterate
*data
, collIterateState
*backup
)
173 backup
->fcdPosition
= data
->fcdPosition
;
174 backup
->flags
= data
->flags
;
175 backup
->origFlags
= data
->origFlags
;
176 backup
->pos
= data
->pos
;
// Remember where the writable buffer lives and how long it is right now.
177 backup
->bufferaddress
= data
->writableBuffer
.getBuffer();
178 backup
->buffersize
= data
->writableBuffer
.length();
179 backup
->iteratorMove
= 0;
180 backup
->iteratorIndex
= 0;
181 if(data
->iterator
!= NULL
) {
182 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
183 backup
->iteratorIndex
= data
->iterator
->getState(data
->iterator
);
184 // now we try to fix up if we are using a normalizing iterator and we get UITER_NO_STATE
185 if(backup
->iteratorIndex
== UITER_NO_STATE
) {
// Step backwards until getState() yields a usable state, counting the steps.
186 while((backup
->iteratorIndex
= data
->iterator
->getState(data
->iterator
)) == UITER_NO_STATE
) {
187 backup
->iteratorMove
++;
188 data
->iterator
->move(data
->iterator
, -1, UITER_CURRENT
);
// Undo the backward steps so the iterator ends up where it started.
190 data
->iterator
->move(data
->iterator
, backup
->iteratorMove
, UITER_CURRENT
);
196 * Loads the state into the collIterate struct data
197 * @param data collIterate to backup
198 * @param backup storage
199 * @param forwards boolean to indicate if forwards iteration is used,
200 * false indicates backwards iteration
// Restores data from a state previously stored in backup.
// Visible in this view:
//  - flags and origFlags are copied back;
//  - when data->iterator is set, setState() is called with
//    backup->iteratorIndex, followed by a move of backup->iteratorMove when
//    that count is non-zero;
//  - pos is copied back; if UCOL_ITER_INNORMBUF is set and writableBuffer now
//    has a different address than the recorded one, pos is recomputed
//    relative to the current buffer (one computation keeps the offset from
//    the buffer start, the one marked as backwards direction keeps the
//    distance from the buffer end);
//  - fcdPosition is copied back only when UCOL_ITER_INNORMBUF is clear.
// NOTE(review): the third parameter and the condition that selects between
// the two pos computations are not visible here; the local status is not
// inspected in the visible lines.
203 inline void loadState(collIterate
*data
, const collIterateState
*backup
,
206 UErrorCode status
= U_ZERO_ERROR
;
207 data
->flags
= backup
->flags
;
208 data
->origFlags
= backup
->origFlags
;
209 if(data
->iterator
!= NULL
) {
210 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
211 data
->iterator
->setState(data
->iterator
, backup
->iteratorIndex
, &status
);
212 if(backup
->iteratorMove
!= 0) {
213 data
->iterator
->move(data
->iterator
, backup
->iteratorMove
, UITER_CURRENT
);
216 data
->pos
= backup
->pos
;
218 if ((data
->flags
& UCOL_ITER_INNORMBUF
) &&
219 data
->writableBuffer
.getBuffer() != backup
->bufferaddress
) {
221 this is when a new buffer has been reallocated and we'll have to
222 calculate the new position.
223 note the new buffer has to contain the contents of the old buffer.
226 data
->pos
= data
->writableBuffer
.getTerminatedBuffer() +
227 (data
->pos
- backup
->bufferaddress
);
230 /* backwards direction */
231 int32_t temp
= backup
->buffersize
-
232 (int32_t)(data
->pos
- backup
->bufferaddress
);
233 data
->pos
= data
->writableBuffer
.getTerminatedBuffer() + (data
->writableBuffer
.length() - temp
);
236 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
238 this is alittle tricky.
239 if we are initially not in the normalization buffer, even if we
240 normalize in the later stage, the data in the buffer will be
241 ignored, since we skip back up to the data string.
242 however if we are already in the normalization buffer, any
243 further normalization will pull data into the normalization
244 buffer and modify the fcdPosition.
245 since we are keeping the data in the buffer for use, the
246 fcdPosition can not be reverted back.
249 data
->fcdPosition
= backup
->fcdPosition
;
// Moves the CE buffer of data into a freshly allocated array.
// Visible in this view: the number of stored CEs is CEpos - oldCEs, the new
// array is obtained with uprv_malloc(newCapacity * 4), the existing CEs are
// copied over, the previous extendCEs allocation is freed, and extendCEs,
// extendCEsSize and CEpos are updated to describe the new array.
// NOTE(review): the return type, the lines between reading extendCEs and
// computing length, the handling of a failed allocation and the return
// statements are not visible here.
// @param data        iterator whose CE storage is replaced
// @param newCapacity capacity of the new array, in 4-byte entries
254 reallocCEs(collIterate
*data
, int32_t newCapacity
) {
255 uint32_t *oldCEs
= data
->extendCEs
;
259 int32_t length
= data
->CEpos
- oldCEs
;
260 uint32_t *newCEs
= (uint32_t *)uprv_malloc(newCapacity
* 4);
// Each entry is 4 bytes wide, matching the allocation above.
264 uprv_memcpy(newCEs
, oldCEs
, length
* 4);
265 uprv_free(data
->extendCEs
);
266 data
->extendCEs
= newCEs
;
267 data
->extendCEsSize
= newCapacity
;
// Keep the write position at the same logical index in the new array.
268 data
->CEpos
= newCEs
+ length
;
// Doubles the CE capacity of data by calling reallocCEs().
// The current capacity is extendCEsSize when extendCEs is set; the other
// visible assignment uses LENGTHOF(data->CEs), the size of the embedded array.
// NOTE(review): the return type and the declaration of oldCapacity are not
// visible in this view.
// @param data iterator whose CE storage should grow
// @return the result of reallocCEs(data, 2 * oldCapacity)
273 increaseCEsCapacity(collIterate
*data
) {
275 if(data
->extendCEs
!= NULL
) {
276 oldCapacity
= data
->extendCEsSize
;
278 oldCapacity
= LENGTHOF(data
->CEs
);
280 return reallocCEs(data
, 2 * oldCapacity
);
// Grows the CE storage of data so that it can hold minCapacity entries.
// The current capacity is determined exactly as in increaseCEsCapacity().
// When minCapacity <= oldCapacity the first branch is taken (its body is not
// visible here); otherwise reallocCEs() is called with the larger of
// minCapacity and oldCapacity.
// NOTE(review): lines between the capacity check and the final call are not
// visible, so oldCapacity may have been adjusted before the comparison in
// the last statement - confirm against the full source.
// @param data        iterator whose CE storage should grow
// @param minCapacity minimum number of entries required
284 ensureCEsCapacity(collIterate
*data
, int32_t minCapacity
) {
286 if(data
->extendCEs
!= NULL
) {
287 oldCapacity
= data
->extendCEsSize
;
289 oldCapacity
= LENGTHOF(data
->CEs
);
291 if(minCapacity
<= oldCapacity
) {
295 return reallocCEs(data
, minCapacity
> oldCapacity
? minCapacity
: oldCapacity
);
// Appends one offset value at offsetStore, growing offsetBuffer when full.
// The number of stored offsets is offsetStore - offsetBuffer, or 0 while
// offsetStore is NULL. When that count reaches offsetBufferSize a new array
// of 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE entries is allocated,
// the old contents are copied and the old array is freed.
// NOTE(review): the body of the initial U_FAILURE branch, the statements
// after the allocation-error assignment and any guard around the copy are
// not visible in this view.
// @param offset    value to store
// @param errorCode in/out error code; set to U_MEMORY_ALLOCATION_ERROR when
//                  uprv_malloc() returns NULL
298 void collIterate::appendOffset(int32_t offset
, UErrorCode
&errorCode
) {
299 if(U_FAILURE(errorCode
)) {
302 int32_t length
= offsetStore
== NULL
? 0 : (int32_t)(offsetStore
- offsetBuffer
);
// Buffer full (or not yet allocated): grow before storing.
303 if(length
>= offsetBufferSize
) {
304 int32_t newCapacity
= 2 * offsetBufferSize
+ UCOL_EXPAND_CE_BUFFER_SIZE
;
305 int32_t *newBuffer
= reinterpret_cast<int32_t *>(uprv_malloc(newCapacity
* 4));
306 if(newBuffer
== NULL
) {
307 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
311 uprv_memcpy(newBuffer
, offsetBuffer
, length
* 4);
313 uprv_free(offsetBuffer
);
314 offsetBuffer
= newBuffer
;
// Re-anchor the write pointer at the same index in the new array.
315 offsetStore
= offsetBuffer
+ length
;
316 offsetBufferSize
= newCapacity
;
318 *offsetStore
++ = offset
;
323 * Checks for a collIterate being positioned at the end of
// Tests whether the iterator s is positioned at the end of its input.
// Cases visible in this view, in order:
//  - UCOL_USE_ITERATOR set: the answer is !iterator->hasNext();
//  - no UCOL_ITER_HASLEN and the current code unit is not 0: first branch
//    (result not visible here; the comment says not at end);
//  - UCOL_ITER_HASLEN set: the answer is pos == endp;
//  - at a 0 code unit outside the normalization buffer: branch body not
//    visible here;
//  - at the end of the normalization buffer: decided from origFlags, using
//    hasNext(), *fcdPosition == 0, or fcdPosition == endp.
// @param s iterator state to inspect
328 inline UBool
collIter_eos(collIterate
*s
) {
329 if(s
->flags
& UCOL_USE_ITERATOR
) {
330 return !(s
->iterator
->hasNext(s
->iterator
));
332 if ((s
->flags
& UCOL_ITER_HASLEN
) == 0 && *s
->pos
!= 0) {
333 // Null terminated string, but not at null, so not at end.
334 // Whether in main or normalization buffer doesn't matter.
338 // String with length. Can't be in normalization buffer, which is always
340 if (s
->flags
& UCOL_ITER_HASLEN
) {
341 return (s
->pos
== s
->endp
);
344 // We are at a null termination, could be either normalization buffer or main string.
345 if ((s
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
346 // At null at end of main string.
350 // At null at end of normalization buffer. Need to check whether there are
351 // any characters left in the main buffer.
352 if(s
->origFlags
& UCOL_USE_ITERATOR
) {
353 return !(s
->iterator
->hasNext(s
->iterator
));
354 } else if ((s
->origFlags
& UCOL_ITER_HASLEN
) == 0) {
355 // Null terminated main string. fcdPosition is the 'return' position into main buf.
356 return (*s
->fcdPosition
== 0);
359 // Main string with an end pointer.
360 return s
->fcdPosition
== s
->endp
;
366 * Checks for a collIterate being positioned at the start of
// Tests whether the iterator source is positioned at the start of its input.
// When either flags or origFlags has UCOL_USE_ITERATOR set, the answer is
// !iterator->hasPrevious(). Otherwise a second condition is evaluated: pos is
// at or before string, or the iterator is in the normalization buffer, the
// code unit before pos is 0 and fcdPosition is NULL.
// NOTE(review): the values returned for the second condition are not visible
// in this view.
// @param source iterator state to inspect
371 inline UBool
collIter_bos(collIterate
*source
) {
372 // if we're going backwards, we need to know whether there is more in the
373 // iterator, even if we are in the side buffer
374 if(source
->flags
& UCOL_USE_ITERATOR
|| source
->origFlags
& UCOL_USE_ITERATOR
) {
375 return !source
->iterator
->hasPrevious(source
->iterator
);
377 if (source
->pos
<= source
->string
||
378 ((source
->flags
& UCOL_ITER_INNORMBUF
) &&
379 *(source
->pos
- 1) == 0 && source
->fcdPosition
== NULL
)) {
386 inline UBool collIter_SimpleBos(collIterate *source) {
387 // if we're going backwards, we need to know whether there is more in the
388 // iterator, even if we are in the side buffer
389 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
390 return !source->iterator->hasPrevious(source->iterator);
392 if (source->pos == source->string) {
397 //return (data->pos == data->string) ||
400 /****************************************************************************/
401 /* Following are the open/close functions */
403 /****************************************************************************/
// Sets up a collator from a binary image, optionally into caller storage.
// Visible in this view:
//  - result starts out as fillIn;
//  - U_ILLEGAL_ARGUMENT_ERROR is assigned under a comment saying that a null
//    base is not supported (the guarding condition itself is not shown);
//  - uprv_uca_initImplicitConstants(status) is called;
//  - bin is read as a UCATableHeader; U_COLLATOR_VERSION_MISMATCH is set when
//    the UCA or UCD version differs from base->image or when version[0] is
//    not UCOL_BUILDER_VERSION;
//  - when length exceeds the padded sizes of UCATableHeader plus
//    UColOptionSet, the collator is initialized from bin and hasRealData is
//    set to TRUE; the other visible path initializes from base->image,
//    applies the options found at bin + header->options and sets hasRealData
//    to FALSE;
//  - U_USELESS_COLLATOR_ERROR is assigned on a path whose condition is not
//    visible;
//  - finally the locale pointers, rules fields and ucaRules are cleared and
//    freeImageOnClose is set to FALSE.
// NOTE(review): the return type, the fillIn and status parameter
// declarations, branch bodies and return statements are not visible here.
406 ucol_initFromBinary(const uint8_t *bin
, int32_t length
,
407 const UCollator
*base
,
411 UCollator
*result
= fillIn
;
412 if(U_FAILURE(*status
)) {
417 // we don't support null base yet
418 *status = U_ILLEGAL_ARGUMENT_ERROR;
422 // We need these and we could be running without UCA
423 uprv_uca_initImplicitConstants(status
);
424 UCATableHeader
*colData
= (UCATableHeader
*)bin
;
425 // do we want version check here? We're trying to figure out whether collators are compatible
426 if((base
&& (uprv_memcmp(colData
->UCAVersion
, base
->image
->UCAVersion
, sizeof(UVersionInfo
)) != 0 ||
427 uprv_memcmp(colData
->UCDVersion
, base
->image
->UCDVersion
, sizeof(UVersionInfo
)) != 0)) ||
428 colData
->version
[0] != UCOL_BUILDER_VERSION
)
430 *status
= U_COLLATOR_VERSION_MISMATCH
;
// More bytes than a bare header plus option set: bin carries its own data.
434 if((uint32_t)length
> (paddedsize(sizeof(UCATableHeader
)) + paddedsize(sizeof(UColOptionSet
)))) {
435 result
= ucol_initCollator((const UCATableHeader
*)bin
, result
, base
, status
);
436 if(U_FAILURE(*status
)){
439 result
->hasRealData
= TRUE
;
// Other path: reuse the image of base and only take the options from bin.
443 result
= ucol_initCollator(base
->image
, result
, base
, status
);
444 ucol_setOptionsFromHeader(result
, (UColOptionSet
*)(bin
+((const UCATableHeader
*)bin
)->options
), status
);
445 if(U_FAILURE(*status
)){
448 result
->hasRealData
= FALSE
;
451 *status
= U_USELESS_COLLATOR_ERROR
;
// The image pointer is borrowed, so it is not freed on close.
455 result
->freeImageOnClose
= FALSE
;
457 result
->actualLocale
= NULL
;
458 result
->validLocale
= NULL
;
459 result
->requestedLocale
= NULL
;
460 result
->rules
= NULL
;
461 result
->rulesLength
= 0;
462 result
->freeRulesOnClose
= FALSE
;
463 result
->ucaRules
= NULL
;
// Exported entry point that opens a collator from a binary image.
// Delegates to ucol_initFromBinary() with NULL as the fill-in argument.
// NOTE(review): the declaration of the status parameter is not visible in
// this view.
// @param bin    binary image
// @param length length passed on with bin
// @param base   base collator passed on to ucol_initFromBinary
// @return the value returned by ucol_initFromBinary()
467 U_CAPI UCollator
* U_EXPORT2
468 ucol_openBinary(const uint8_t *bin
, int32_t length
,
469 const UCollator
*base
,
472 return ucol_initFromBinary(bin
, length
, base
, NULL
, status
);
// Writes a binary image of coll into buffer.
// Visible in this view:
//  - when coll->hasRealData is TRUE, length is image->size and the image is
//    copied when length <= capacity;
//  - on the other visible path, length is the padded size of UCATableHeader
//    plus UColOptionSet; when it fits, the buffer is zeroed and a minimal
//    header is built (size, options offset, expansion offset, magic,
//    endianness, charset family, version arrays, jamoSpecial) followed by a
//    copy of coll->options;
//  - U_BUFFER_OVERFLOW_ERROR is assigned on both paths, presumably when
//    length exceeds capacity (the else lines are not visible here);
//  - U_ILLEGAL_ARGUMENT_ERROR is assigned near the top under a condition
//    that is not visible.
// NOTE(review): the status parameter declaration, the declaration of length
// and the return statement are not visible in this view.
// @param coll     collator to serialize
// @param buffer   destination bytes
// @param capacity size of buffer in bytes
475 U_CAPI
int32_t U_EXPORT2
476 ucol_cloneBinary(const UCollator
*coll
,
477 uint8_t *buffer
, int32_t capacity
,
481 if(U_FAILURE(*status
)) {
485 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
488 if(coll
->hasRealData
== TRUE
) {
489 length
= coll
->image
->size
;
490 if(length
<= capacity
) {
491 uprv_memcpy(buffer
, coll
->image
, length
);
493 *status
= U_BUFFER_OVERFLOW_ERROR
;
496 length
= (int32_t)(paddedsize(sizeof(UCATableHeader
))+paddedsize(sizeof(UColOptionSet
)));
497 if(length
<= capacity
) {
498 /* build the UCATableHeader with minimal entries */
499 /* do not copy the header from the UCA file because its values are wrong! */
500 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
502 /* reset everything */
503 uprv_memset(buffer
, 0, length
);
505 /* set the tailoring-specific values */
506 UCATableHeader
*myData
= (UCATableHeader
*)buffer
;
507 myData
->size
= length
;
509 /* offset for the options, the only part of the data that is present after the header */
510 myData
->options
= sizeof(UCATableHeader
);
512 /* need to always set the expansion value for an upper bound of the options */
513 myData
->expansion
= myData
->options
+ sizeof(UColOptionSet
);
515 myData
->magic
= UCOL_HEADER_MAGIC
;
516 myData
->isBigEndian
= U_IS_BIG_ENDIAN
;
517 myData
->charSetFamily
= U_CHARSET_FAMILY
;
519 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
520 uprv_memcpy(myData
->version
, coll
->image
->version
, sizeof(UVersionInfo
));
522 uprv_memcpy(myData
->UCAVersion
, coll
->image
->UCAVersion
, sizeof(UVersionInfo
));
523 uprv_memcpy(myData
->UCDVersion
, coll
->image
->UCDVersion
, sizeof(UVersionInfo
));
524 uprv_memcpy(myData
->formatVersion
, coll
->image
->formatVersion
, sizeof(UVersionInfo
));
525 myData
->jamoSpecial
= coll
->image
->jamoSpecial
;
527 /* copy the collator options */
// NOTE(review): the copy lands at paddedsize(sizeof(UCATableHeader)) while
// myData->options above is set to the unpadded sizeof - confirm both agree.
528 uprv_memcpy(buffer
+paddedsize(sizeof(UCATableHeader
)), coll
->options
, sizeof(UColOptionSet
));
530 *status
= U_BUFFER_OVERFLOW_ERROR
;
// Clones coll, using caller-provided storage when it is large enough.
// Visible in this view:
//  - the space needed is sizeof(UCollator), plus room for the rules (with
//    padding) when coll owns its rules;
//  - a non-NULL stackBuffer with *pBufferSize <= 0 is a preflight request:
//    the needed size is written to *pBufferSize;
//  - a misaligned stackBuffer is advanced by U_ALIGNMENT_OFFSET_UP and
//    *pBufferSize reduced when enough room remains;
//  - when there is no usable buffer, storage is allocated with uprv_malloc()
//    and U_SAFECLONE_ALLOCATED_WARNING is reported on success;
//  - the image is duplicated through ucol_cloneBinary() when coll owns its
//    image, otherwise shared; the clone is then built with
//    ucol_initFromBinary();
//  - rules are copied with u_strcpy() when coll owns them, otherwise shared;
//    all attributes are copied via ucol_getAttribute / ucol_setAttribute;
//  - locale pointers are cleared, ucaRules is shared, and freeOnClose and
//    freeImageOnClose record what was allocated here.
// NOTE(review): several declarations (rules, image, i), branch bodies and
// early returns are not visible here. In particular check that the buffer
// and image allocated in this function are released on the failure paths.
// @param coll        collator to clone
// @param stackBuffer optional caller storage
// @param pBufferSize in/out size of stackBuffer
// @param status      in/out error code
536 U_CAPI UCollator
* U_EXPORT2
537 ucol_safeClone(const UCollator
*coll
, void *stackBuffer
, int32_t * pBufferSize
, UErrorCode
*status
)
539 UCollator
* localCollator
;
540 int32_t bufferSizeNeeded
= (int32_t)sizeof(UCollator
);
541 char *stackBufferChars
= (char *)stackBuffer
;
542 int32_t imageSize
= 0;
543 int32_t rulesSize
= 0;
544 int32_t rulesPadding
= 0;
547 UBool colAllocated
= FALSE
;
548 UBool imageAllocated
= FALSE
;
550 if (status
== NULL
|| U_FAILURE(*status
)){
553 if ((stackBuffer
&& !pBufferSize
) || !coll
){
554 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Owned rules are stored right after the UCollator struct in the same buffer.
557 if (coll
->rules
&& coll
->freeRulesOnClose
) {
558 rulesSize
= (int32_t)(coll
->rulesLength
+ 1)*sizeof(UChar
);
559 rulesPadding
= (int32_t)(bufferSizeNeeded
% sizeof(UChar
));
560 bufferSizeNeeded
+= rulesSize
+ rulesPadding
;
563 if (stackBuffer
&& *pBufferSize
<= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
564 *pBufferSize
= bufferSizeNeeded
;
568 /* Pointers on 64-bit platforms need to be aligned
569 * on a 64-bit boundary in memory.
571 if (U_ALIGNMENT_OFFSET(stackBuffer
) != 0) {
572 int32_t offsetUp
= (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars
);
573 if (*pBufferSize
> offsetUp
) {
574 *pBufferSize
-= offsetUp
;
575 stackBufferChars
+= offsetUp
;
578 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
582 stackBuffer
= (void *)stackBufferChars
;
584 if (stackBuffer
== NULL
|| *pBufferSize
< bufferSizeNeeded
) {
585 /* allocate one here...*/
586 stackBufferChars
= (char *)uprv_malloc(bufferSizeNeeded
);
587 // Null pointer check.
588 if (stackBufferChars
== NULL
) {
589 *status
= U_MEMORY_ALLOCATION_ERROR
;
593 if (U_SUCCESS(*status
)) {
594 *status
= U_SAFECLONE_ALLOCATED_WARNING
;
597 localCollator
= (UCollator
*)stackBufferChars
;
598 rules
= (UChar
*)(stackBufferChars
+ sizeof(UCollator
) + rulesPadding
);
// Preflight call: NULL buffer and zero capacity, result used as image size.
600 UErrorCode tempStatus
= U_ZERO_ERROR
;
601 imageSize
= ucol_cloneBinary(coll
, NULL
, 0, &tempStatus
);
603 if (coll
->freeImageOnClose
) {
604 image
= (uint8_t *)uprv_malloc(imageSize
);
605 // Null pointer check
607 *status
= U_MEMORY_ALLOCATION_ERROR
;
610 ucol_cloneBinary(coll
, image
, imageSize
, status
);
611 imageAllocated
= TRUE
;
614 image
= (uint8_t *)coll
->image
;
616 localCollator
= ucol_initFromBinary(image
, imageSize
, coll
->UCA
, localCollator
, status
);
617 if (U_FAILURE(*status
)) {
622 if (coll
->freeRulesOnClose
) {
623 localCollator
->rules
= u_strcpy(rules
, coll
->rules
);
624 //bufferEnd += rulesSize;
627 localCollator
->rules
= coll
->rules
;
// The rules live inside the clone buffer or are shared, so never freed separately.
629 localCollator
->freeRulesOnClose
= FALSE
;
630 localCollator
->rulesLength
= coll
->rulesLength
;
// Copy every attribute value from the source collator.
634 for(i
= 0; i
< UCOL_ATTRIBUTE_COUNT
; i
++) {
635 ucol_setAttribute(localCollator
, (UColAttribute
)i
, ucol_getAttribute(coll
, (UColAttribute
)i
, status
), status
);
637 // zero copies of pointers
638 localCollator
->actualLocale
= NULL
;
639 localCollator
->validLocale
= NULL
;
640 localCollator
->requestedLocale
= NULL
;
641 localCollator
->ucaRules
= coll
->ucaRules
; // There should only be one copy here.
642 localCollator
->freeOnClose
= colAllocated
;
643 localCollator
->freeImageOnClose
= imageAllocated
;
644 return localCollator
;
// Releases the memory owned by coll.
// Visible in this view: validLocale, actualLocale, requestedLocale,
// latinOneCEs, leadBytePermutationTable and reorderCodes are freed whenever
// they are non-NULL; options, rules and image are freed only when the
// matching free...OnClose flag is set. The final visible test is on
// coll->freeOnClose.
// NOTE(review): a NULL check on coll and the body of the freeOnClose branch
// are not visible in this view - confirm against the full source.
// @param coll collator to close
647 U_CAPI
void U_EXPORT2
648 ucol_close(UCollator
*coll
)
650 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE
);
651 UTRACE_DATA1(UTRACE_INFO
, "coll = %p", coll
);
653 // these are always owned by each UCollator struct,
654 // so we always free them
655 if(coll
->validLocale
!= NULL
) {
656 uprv_free(coll
->validLocale
);
658 if(coll
->actualLocale
!= NULL
) {
659 uprv_free(coll
->actualLocale
);
661 if(coll
->requestedLocale
!= NULL
) {
662 uprv_free(coll
->requestedLocale
);
664 if(coll
->latinOneCEs
!= NULL
) {
665 uprv_free(coll
->latinOneCEs
);
// The next three are freed only when this collator owns them.
667 if(coll
->options
!= NULL
&& coll
->freeOptionsOnClose
) {
668 uprv_free(coll
->options
);
670 if(coll
->rules
!= NULL
&& coll
->freeRulesOnClose
) {
671 uprv_free((UChar
*)coll
->rules
);
673 if(coll
->image
!= NULL
&& coll
->freeImageOnClose
) {
674 uprv_free((UCATableHeader
*)coll
->image
);
676 if(coll
->leadBytePermutationTable
!= NULL
) {
677 uprv_free(coll
->leadBytePermutationTable
);
679 if(coll
->reorderCodes
!= NULL
) {
680 uprv_free(coll
->reorderCodes
);
683 /* Here, it would be advisable to close: */
684 /* - UData for UCA (unless we stuff it in the root resb */
685 /* Again, do we need additional housekeeping... HMMM! */
686 UTRACE_DATA1(UTRACE_INFO
, "coll->freeOnClose: %d", coll
->freeOnClose
);
687 if(coll
->freeOnClose
){
688 /* for safeClone, if freeOnClose is FALSE,
689 don't free the other instance data */
696 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
697 /* you should be able to get the binary chunk to write out... Doesn't look very full now */
// Returns a newly allocated copy of the binary data of coll and stores its
// size in *length.
// Visible in this view:
//  - when coll->hasRealData is TRUE, image->size bytes are allocated and the
//    image is copied;
//  - on the other visible path a block of padded UCATableHeader plus
//    UColOptionSet size is allocated, zeroed and filled with a minimal header
//    (same fields as in ucol_cloneBinary) followed by coll->options;
//  - U_MEMORY_ALLOCATION_ERROR is set when uprv_malloc() returns NULL.
// NOTE(review): the bodies of the error branches and the return statement
// are not visible here; the result comes from uprv_malloc, so presumably the
// caller releases it with uprv_free - confirm.
// @param coll   collator whose data is copied
// @param length receives the size of the returned block in bytes
// @param status in/out error code
698 U_CFUNC
uint8_t* U_EXPORT2
699 ucol_cloneRuleData(const UCollator
*coll
, int32_t *length
, UErrorCode
*status
)
701 uint8_t *result
= NULL
;
702 if(U_FAILURE(*status
)) {
705 if(coll
->hasRealData
== TRUE
) {
706 *length
= coll
->image
->size
;
707 result
= (uint8_t *)uprv_malloc(*length
);
709 if (result
== NULL
) {
710 *status
= U_MEMORY_ALLOCATION_ERROR
;
713 uprv_memcpy(result
, coll
->image
, *length
);
715 *length
= (int32_t)(paddedsize(sizeof(UCATableHeader
))+paddedsize(sizeof(UColOptionSet
)));
716 result
= (uint8_t *)uprv_malloc(*length
);
718 if (result
== NULL
) {
719 *status
= U_MEMORY_ALLOCATION_ERROR
;
723 /* build the UCATableHeader with minimal entries */
724 /* do not copy the header from the UCA file because its values are wrong! */
725 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
727 /* reset everything */
728 uprv_memset(result
, 0, *length
);
730 /* set the tailoring-specific values */
731 UCATableHeader
*myData
= (UCATableHeader
*)result
;
732 myData
->size
= *length
;
734 /* offset for the options, the only part of the data that is present after the header */
735 myData
->options
= sizeof(UCATableHeader
);
737 /* need to always set the expansion value for an upper bound of the options */
738 myData
->expansion
= myData
->options
+ sizeof(UColOptionSet
);
740 myData
->magic
= UCOL_HEADER_MAGIC
;
741 myData
->isBigEndian
= U_IS_BIG_ENDIAN
;
742 myData
->charSetFamily
= U_CHARSET_FAMILY
;
744 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
745 uprv_memcpy(myData
->version
, coll
->image
->version
, sizeof(UVersionInfo
));
747 uprv_memcpy(myData
->UCAVersion
, coll
->image
->UCAVersion
, sizeof(UVersionInfo
));
748 uprv_memcpy(myData
->UCDVersion
, coll
->image
->UCDVersion
, sizeof(UVersionInfo
));
749 uprv_memcpy(myData
->formatVersion
, coll
->image
->formatVersion
, sizeof(UVersionInfo
));
750 myData
->jamoSpecial
= coll
->image
->jamoSpecial
;
752 /* copy the collator options */
753 uprv_memcpy(result
+paddedsize(sizeof(UCATableHeader
)), coll
->options
, sizeof(UColOptionSet
));
// Loads the attribute values stored in opts into result.
// Visible in this view: caseFirst, caseLevel, frenchCollation,
// normalizationMode, strength, variableTopValue, alternateHandling, hiraganaQ
// and numericCollation are copied from opts; when normalizationMode is
// UCOL_ON, initializeFCD(status) is called and a failure takes a branch whose
// body is not visible; every ...isDefault flag is set to TRUE;
// ucol_updateInternalState() is run; finally result->options points at opts.
// No copy of opts is made, so result->options aliases the given pointer.
// @param result collator to update
// @param opts   option set to read from
// @param status in/out error code
758 void ucol_setOptionsFromHeader(UCollator
* result
, UColOptionSet
* opts
, UErrorCode
*status
) {
759 if(U_FAILURE(*status
)) {
762 result
->caseFirst
= (UColAttributeValue
)opts
->caseFirst
;
763 result
->caseLevel
= (UColAttributeValue
)opts
->caseLevel
;
764 result
->frenchCollation
= (UColAttributeValue
)opts
->frenchCollation
;
765 result
->normalizationMode
= (UColAttributeValue
)opts
->normalizationMode
;
// Normalization needs the FCD data; initializeFCD reports failure via its result.
766 if(result
->normalizationMode
== UCOL_ON
&& !initializeFCD(status
)) {
769 result
->strength
= (UColAttributeValue
)opts
->strength
;
770 result
->variableTopValue
= opts
->variableTopValue
;
771 result
->alternateHandling
= (UColAttributeValue
)opts
->alternateHandling
;
772 result
->hiraganaQ
= (UColAttributeValue
)opts
->hiraganaQ
;
773 result
->numericCollation
= (UColAttributeValue
)opts
->numericCollation
;
// Every attribute now holds the value from the header, so mark all as default.
774 result
->caseFirstisDefault
= TRUE
;
775 result
->caseLevelisDefault
= TRUE
;
776 result
->frenchCollationisDefault
= TRUE
;
777 result
->normalizationModeisDefault
= TRUE
;
778 result
->strengthisDefault
= TRUE
;
779 result
->variableTopValueisDefault
= TRUE
;
780 result
->alternateHandlingisDefault
= TRUE
;
781 result
->hiraganaQisDefault
= TRUE
;
782 result
->numericCollationisDefault
= TRUE
;
784 ucol_updateInternalState(result
, status
);
786 result
->options
= opts
;
791 * Approximate determination if a character is at a contraction end.
792 * Guaranteed to be TRUE if a character is at the end of a contraction,
793 * otherwise it is not deterministic.
794 * @param c character to be determined
795 * @param coll collator
// Bit-table lookup in coll->contrEndCP for the code unit c.
// Visible in this view: code units below coll->minContrEndCP take the first
// branch (body not shown); a hash value at or above
// UCOL_UNSAFECP_TABLE_SIZE * 8 is tested for being a trail surrogate and
// otherwise folded to (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; the result is
// bit (hash & 7) of byte hash >> 3 in the table.
// NOTE(review): the declarations of hash and htbyte, the initial value of
// hash and the results of the early branches are not visible here.
// @param c    code unit to test
// @param coll collator providing minContrEndCP and contrEndCP
798 inline UBool
ucol_contractionEndCP(UChar c
, const UCollator
*coll
) {
799 if (c
< coll
->minContrEndCP
) {
805 if (hash
>= UCOL_UNSAFECP_TABLE_SIZE
*8) {
806 if (U16_IS_TRAIL(c
)) {
809 hash
= (hash
& UCOL_UNSAFECP_TABLE_MASK
) + 256;
// One bit per hash value: byte index is hash / 8, bit index is hash % 8.
811 htbyte
= coll
->contrEndCP
[hash
>>3];
812 return (((htbyte
>> (hash
& 7)) & 1) == 1);
818 * i_getCombiningClass()
819 * A fast, at least partly inline version of u_getCombiningClass()
820 * This is a candidate for further optimization. Used heavily
821 * in contraction processing.
// Combining-class lookup with a shortcut.
// Visible in this view: u_getCombiningClass(c) is consulted only when c is
// above 0xFFFF, or when c >= 0x300 and ucol_unsafeCP(c, coll) holds.
// NOTE(review): the declaration and initial value of sCC and the return
// statement are not visible here.
// @param c    code point to look up
// @param coll collator used for the ucol_unsafeCP test
824 inline uint8_t i_getCombiningClass(UChar32 c
, const UCollator
*coll
) {
826 if ((c
>= 0x300 && ucol_unsafeCP(c
, coll
)) || c
> 0xFFFF) {
827 sCC
= u_getCombiningClass(c
);
// Fills in a UCollator from a UCATableHeader image.
// Visible in this view:
//  - result starts as fillIn; a UCollator is allocated with uprv_malloc() on
//    a path whose condition is not shown, and freeOnClose records which of
//    the two cases applies;
//  - the trie at image + mappingPosition is unserialized into
//    result->mapping with _getFoldingOffset as folding callback;
//  - contractionCEs, contractionIndex, expansion, unsafeCP, contrEndCP,
//    endExpansionCE and expansionCESize are set to image + the offsets stored
//    in the header;
//  - dataVersion[0..1] are taken from image->version, [2..3] are zero;
//  - minUnsafeCP and minContrEndCP are found by scanning c from 0 up to
//    0x300 for the first hit of ucol_unsafeCP / ucol_contractionEndCP;
//  - rules, reorder codes, locales, Latin-1 tables and ownership flags are
//    reset, and the options are loaded with ucol_setOptionsFromHeader().
// NOTE(review): the UCA parameter is not used in the visible lines; the
// declaration of c, several branch bodies and the return statements are not
// visible here.
// @param image  binary collation data
// @param fillIn optional caller-provided UCollator storage
// @param UCA    base collator
// @param status in/out error code
832 UCollator
* ucol_initCollator(const UCATableHeader
*image
, UCollator
*fillIn
, const UCollator
*UCA
, UErrorCode
*status
) {
834 UCollator
*result
= fillIn
;
835 if(U_FAILURE(*status
) || image
== NULL
) {
840 result
= (UCollator
*)uprv_malloc(sizeof(UCollator
));
842 *status
= U_MEMORY_ALLOCATION_ERROR
;
845 result
->freeOnClose
= TRUE
;
847 result
->freeOnClose
= FALSE
;
850 result
->image
= image
;
851 result
->mapping
.getFoldingOffset
= _getFoldingOffset
;
// The trie occupies the bytes from mappingPosition up to endExpansionCE.
852 const uint8_t *mapping
= (uint8_t*)result
->image
+result
->image
->mappingPosition
;
853 utrie_unserialize(&result
->mapping
, mapping
, result
->image
->endExpansionCE
- result
->image
->mappingPosition
, status
);
854 if(U_FAILURE(*status
)) {
855 if(result
->freeOnClose
== TRUE
) {
862 result
->latinOneMapping
= UTRIE_GET32_LATIN1(&result
->mapping
);
// Header fields hold byte offsets from the start of the image.
863 result
->contractionCEs
= (uint32_t*)((uint8_t*)result
->image
+result
->image
->contractionCEs
);
864 result
->contractionIndex
= (UChar
*)((uint8_t*)result
->image
+result
->image
->contractionIndex
);
865 result
->expansion
= (uint32_t*)((uint8_t*)result
->image
+result
->image
->expansion
);
866 result
->rules
= NULL
;
867 result
->rulesLength
= 0;
868 result
->freeRulesOnClose
= FALSE
;
869 result
->reorderCodes
= NULL
;
870 result
->reorderCodesLength
= 0;
871 result
->leadBytePermutationTable
= NULL
;
873 /* get the version info from UCATableHeader and populate the Collator struct*/
874 result
->dataVersion
[0] = result
->image
->version
[0]; /* UCA Builder version*/
875 result
->dataVersion
[1] = result
->image
->version
[1]; /* UCA Tailoring rules version*/
876 result
->dataVersion
[2] = 0;
877 result
->dataVersion
[3] = 0;
879 result
->unsafeCP
= (uint8_t *)result
->image
+ result
->image
->unsafeCP
;
// minUnsafeCP is zero during the scan and is set to the first hit afterwards.
880 result
->minUnsafeCP
= 0;
881 for (c
=0; c
<0x300; c
++) { // Find the smallest unsafe char.
882 if (ucol_unsafeCP(c
, result
)) break;
884 result
->minUnsafeCP
= c
;
886 result
->contrEndCP
= (uint8_t *)result
->image
+ result
->image
->contrEndCP
;
// Same scan pattern for the smallest contraction-ending code unit.
887 result
->minContrEndCP
= 0;
888 for (c
=0; c
<0x300; c
++) { // Find the Contraction-ending char.
889 if (ucol_contractionEndCP(c
, result
)) break;
891 result
->minContrEndCP
= c
;
893 /* max expansion tables */
894 result
->endExpansionCE
= (uint32_t*)((uint8_t*)result
->image
+
895 result
->image
->endExpansionCE
);
896 result
->lastEndExpansionCE
= result
->endExpansionCE
+
897 result
->image
->endExpansionCECount
- 1;
898 result
->expansionCESize
= (uint8_t*)result
->image
+
899 result
->image
->expansionCESize
;
902 //result->errorCode = *status;
904 result
->latinOneCEs
= NULL
;
906 result
->latinOneRegenTable
= FALSE
;
907 result
->latinOneFailed
= FALSE
;
910 /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
911 result
->ucaRules
= NULL
;
912 result
->actualLocale
= NULL
;
913 result
->validLocale
= NULL
;
914 result
->requestedLocale
= NULL
;
915 result
->hasRealData
= FALSE
; // real data lives in .dat file...
916 result
->freeImageOnClose
= FALSE
;
// Load the attribute defaults from the option set embedded in the image.
919 ucol_setOptionsFromHeader(
921 (UColOptionSet
*)((uint8_t*)result
->image
+result
->image
->options
),
923 result
->freeOptionsOnClose
= FALSE
;
928 /* new Mark's code */
931 * For generation of Implicit CEs
934 * Cleaned up so that changes can be made more easily.
936 # First Implicit: E26A792D
937 # Last Implicit: E3DC70C0
938 # First CJK: E0030300
940 # First CJK_A: E0A9DF00
941 # Last CJK_A: E0DE3100
943 /* Following is a port of Mark's code for new treatment of implicits.
944 * It is positioned here, since ucol_initUCA need to initialize the
945 * variables below according to the data in the fractional UCA.
950 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
951 * b) bump any non-CJK characters by 10FFFF.
952 * The relevant blocks are:
953 * A: 4E00..9FFF; CJK Unified Ideographs
954 * F900..FAFF; CJK Compatibility Ideographs
955 * B: 3400..4DBF; CJK Unified Ideographs Extension A
956 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
958 * no new B characters are allocated between 4E00 and FAFF, and
959 * no new A characters are outside of this range,
960 * (very high probability) this simple code will work.
961 * The reordered blocks are:
963 * Block2 is CJK_COMPAT_USED
966 * Any other CJK gets its normal code point
967 * Any non-CJK gets +10FFFF
968 * When we reorder Block1, we make sure that it is at the very start,
969 * so that it will use a 3-byte form.
970 * Warning: we only pick up the compatibility characters that are
971 * NOT decomposed, so that block is smaller!
976 NON_CJK_OFFSET
= 0x110000,
977 UCOL_MAX_INPUT
= 0x220001; // 2 * Unicode range + 2
980 * Precomputed by initImplicitConstants()
983 final3Multiplier
= 0,
984 final4Multiplier
= 0,
998 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
999 // 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
1001 CJK_LIMIT
= 0x9FCB+1,
1002 // Unified CJK ideographs in the compatibility ideographs block.
1003 CJK_COMPAT_USED_BASE
= 0xFA0E,
1004 CJK_COMPAT_USED_LIMIT
= 0xFA2F+1,
1005 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
1006 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
1007 CJK_A_BASE
= 0x3400,
1008 CJK_A_LIMIT
= 0x4DB5+1,
1009 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
1010 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
1011 CJK_B_BASE
= 0x20000,
1012 CJK_B_LIMIT
= 0x2A6D6+1,
1013 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
1014 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
1015 CJK_C_BASE
= 0x2A700,
1016 CJK_C_LIMIT
= 0x2B734+1,
1017 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
1018 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
1019 CJK_D_BASE
= 0x2B740,
1020 CJK_D_LIMIT
= 0x2B81D+1;
1021 // when adding to this list, look for all occurrences (in project)
1022 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
// Maps a code point to its position in the reordered implicit-weight space.
// Returns visible in this view:
//  - [CJK_BASE, CJK_LIMIT) maps to i - CJK_BASE, so it starts at 0;
//  - [CJK_COMPAT_USED_BASE, CJK_COMPAT_USED_LIMIT) follows directly after;
//  - [CJK_A_BASE, CJK_A_LIMIT) follows after those two blocks;
//  - the B, C and D extension ranges are returned unchanged;
//  - the final return adds NON_CJK_OFFSET.
// NOTE(review): the bodies of the in-between range branches are not visible
// here; presumably they fall through to the final return - confirm.
// @param i code point
// @return reordered value
1024 static UChar32
swapCJK(UChar32 i
) {
1025 if (i
< CJK_A_BASE
) {
1027 } else if (i
< CJK_A_LIMIT
) {
1028 // Extension A has lower code points than the original Unihan+compat
1029 // but sorts higher.
1030 return i
- CJK_A_BASE
1031 + (CJK_LIMIT
- CJK_BASE
)
1032 + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
);
1033 } else if (i
< CJK_BASE
) {
1035 } else if (i
< CJK_LIMIT
) {
1036 return i
- CJK_BASE
;
1037 } else if (i
< CJK_COMPAT_USED_BASE
) {
1039 } else if (i
< CJK_COMPAT_USED_LIMIT
) {
1040 return i
- CJK_COMPAT_USED_BASE
1041 + (CJK_LIMIT
- CJK_BASE
);
1042 } else if (i
< CJK_B_BASE
) {
1044 } else if (i
< CJK_B_LIMIT
) {
1045 return i
; // non-BMP-CJK
1046 } else if (i
< CJK_C_BASE
) {
1048 } else if (i
< CJK_C_LIMIT
) {
1049 return i
; // non-BMP-CJK
1050 } else if (i
< CJK_D_BASE
) {
1052 } else if (i
< CJK_D_LIMIT
) {
1053 return i
; // non-BMP-CJK
1055 return i
+ NON_CJK_OFFSET
; // non-CJK
// Converts a code point to its raw implicit value.
// @param i code point
// @return swapCJK(i) + 1
1058 U_CAPI UChar32 U_EXPORT2
1059 uprv_uca_getRawFromCodePoint(UChar32 i
) {
1060 return swapCJK(i
)+1;
// Converts a raw implicit value back to a code point.
// Visible in this view, by range of i:
//  - i >= NON_CJK_OFFSET: result is i - NON_CJK_OFFSET;
//  - i >= CJK_B_BASE: separate branch, body not shown;
//  - i within the compacted CJK area: below CJK_LIMIT - CJK_BASE the result
//    is i + CJK_BASE; within the following compat-sized span it is mapped
//    back into the compatibility block; the remaining assignment maps back
//    into Extension A.
// NOTE(review): the declaration of result, any adjustment of i before the
// first test, the out-of-range handling and the return statement are not
// visible in this view.
// @param i raw value
1063 U_CAPI UChar32 U_EXPORT2
1064 uprv_uca_getCodePointFromRaw(UChar32 i
) {
1067 if(i
>= NON_CJK_OFFSET
) {
1068 result
= i
- NON_CJK_OFFSET
;
1069 } else if(i
>= CJK_B_BASE
) {
1071 } else if(i
< CJK_A_LIMIT
+ (CJK_LIMIT
- CJK_BASE
) + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
)) { // rest of CJKs, compacted
1072 if(i
< CJK_LIMIT
- CJK_BASE
) {
1073 result
= i
+ CJK_BASE
;
1074 } else if(i
< (CJK_LIMIT
- CJK_BASE
) + (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
)) {
1075 result
= i
+ CJK_COMPAT_USED_BASE
- (CJK_LIMIT
- CJK_BASE
);
1077 result
= i
+ CJK_A_BASE
- (CJK_LIMIT
- CJK_BASE
) - (CJK_COMPAT_USED_LIMIT
- CJK_COMPAT_USED_BASE
);
1085 // GET IMPLICIT PRIMARY WEIGHTS
1086 // Return value is left justified primary key
// Builds a primary weight from a raw implicit value.
// Two encodings are visible in this view:
//  - a three-byte form: cp is split with final3Count and medialCount into
//    last0, last1, last2, which are offset by minTrail (last0 scaled by
//    final3Multiplier) and min3Primary and packed into bits 31..8;
//  - a four-byte form: cp - min4Boundary is split with final4Count and
//    medialCount (twice) into last0..last3, offset by minTrail and
//    min4Primary and packed into all four bytes.
// NOTE(review): the condition selecting between the two forms is not visible
// here; presumably it is the sign of last0 after subtracting min4Boundary -
// confirm. The untokenized lines containing throw statements appear to be
// leftover reference code that is commented out in the full source.
// @param cp raw value
// @return packed primary weight
1087 U_CAPI
uint32_t U_EXPORT2
1088 uprv_uca_getImplicitFromRaw(UChar32 cp
) {
1090 if (cp < 0 || cp > UCOL_MAX_INPUT) {
1091 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1094 int32_t last0
= cp
- min4Boundary
;
1096 int32_t last1
= cp
/ final3Count
;
1097 last0
= cp
% final3Count
;
1099 int32_t last2
= last1
/ medialCount
;
1100 last1
%= medialCount
;
1102 last0
= minTrail
+ last0
*final3Multiplier
; // spread out, leaving gap at start
1103 last1
= minTrail
+ last1
; // offset
1104 last2
= min3Primary
+ last2
; // offset
1106 if (last2 >= min4Primary) {
1107 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
// Three-byte form: the lowest byte of the result stays zero.
1110 return (last2
<< 24) + (last1
<< 16) + (last0
<< 8);
1112 int32_t last1
= last0
/ final4Count
;
1113 last0
%= final4Count
;
1115 int32_t last2
= last1
/ medialCount
;
1116 last1
%= medialCount
;
1118 int32_t last3
= last2
/ medialCount
;
1119 last2
%= medialCount
;
1121 last0
= minTrail
+ last0
*final4Multiplier
; // spread out, leaving gap at start
1122 last1
= minTrail
+ last1
; // offset
1123 last2
= minTrail
+ last2
; // offset
1124 last3
= min4Primary
+ last3
; // offset
1126 if (last3 > max4Primary) {
1127 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
// Four-byte form: all four bytes carry data.
1130 return (last3
<< 24) + (last2
<< 16) + (last1
<< 8) + last0
;
// uprv_uca_getImplicitPrimary: returns uprv_uca_getImplicitFromRaw(cp).
// NOTE(review): the debug comments below mention a CJK swap step before the
// final call, but no statement modifying cp is visible in this view - confirm
// against the complete file.
1134 static uint32_t U_EXPORT2
1135 uprv_uca_getImplicitPrimary(UChar32 cp
) {
1136 //fprintf(stdout, "Incoming: %04x\n", cp);
1137 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1141 // we now have a range of numbers from 0 to 21FFFF.
1143 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1144 //fprintf(stdout, "CJK swapped: %04x\n", cp);
1146 return uprv_uca_getImplicitFromRaw(cp
);
// uprv_uca_getRawFromImplicit: decodes a packed implicit weight.
// The 32-bit input is split into four bytes: b0 is bits 24..31, b1 is bits
// 16..23, b2 is bits 8..15 and b3 is bits 0..7.
// Visible checks: b0 must lie in [min3Primary, max4Primary] and b1 in
// [minTrail, maxTrail]. When b0 < min4Primary the 3-byte form applies
// (b2 in [minTrail, max3Trail], b3 == 0); otherwise the 4-byte form applies
// (b2 in [minTrail, maxTrail], b3 in [minTrail, max4Trail]).
// The result is composed with medialCount and final3Count, or with
// medialCount, final4Count and min4Boundary, and is finally range-checked
// against 0 and UCOL_MAX_INPUT.
// NOTE(review): the statements executed when a check fails, the use of
// remainder, any rebasing of b0..b3 before composition, and the return are
// not visible here; the doc fragment states -1 is returned for an illegal
// format.
1150 * Converts implicit CE into raw integer ("code point")
1152 * @return -1 if illegal format
1154 U_CAPI UChar32 U_EXPORT2
1155 uprv_uca_getRawFromImplicit(uint32_t implicit
) {
1157 UChar32 b3
= implicit
& 0xFF;
1158 UChar32 b2
= (implicit
>> 8) & 0xFF;
1159 UChar32 b1
= (implicit
>> 16) & 0xFF;
1160 UChar32 b0
= (implicit
>> 24) & 0xFF;
1162 // simple parameter checks
1163 if (b0
< min3Primary
|| b0
> max4Primary
1164 || b1
< minTrail
|| b1
> maxTrail
)
1169 // take care of the final values, and compose
1170 if (b0
< min4Primary
) {
1171 if (b2
< minTrail
|| b2
> max3Trail
|| b3
!= 0)
1174 UChar32 remainder
= b2
% final3Multiplier
;
1178 b2
/= final3Multiplier
;
1179 result
= ((b0
* medialCount
) + b1
) * final3Count
+ b2
;
1181 if (b2
< minTrail
|| b2
> maxTrail
1182 || b3
< minTrail
|| b3
> max4Trail
)
1186 UChar32 remainder
= b3
% final4Multiplier
;
1189 b3
/= final4Multiplier
;
1191 result
= (((b0
* medialCount
) + b1
) * medialCount
+ b2
) * final4Count
+ b3
+ min4Boundary
;
1194 if (result
< 0 || result
> UCOL_MAX_INPUT
)
// divideAndRoundUp: file-local helper taking two ints and returning int32_t.
// NOTE(review): the body is not visible in this view; by its name it
// presumably returns a divided by b rounded up - confirm, including the
// behavior when b is zero.
1200 static inline int32_t divideAndRoundUp(int a
, int b
) {
// initImplicitConstants: computes the file-level constants used for implicit
// weight generation and writes them to minTrail, maxTrail, min3Primary,
// max4Primary, final3Multiplier, final3Count, max3Trail, medialCount,
// min4Primary, min4Boundary, final4Multiplier, final4Count and max4Trail.
// Visible steps:
//   1. Range-check the primary and trail byte limits (0..0xFF, min < max) and
//      require primaries3count >= 1; *status is set to
//      U_ILLEGAL_ARGUMENT_ERROR on the visible failure paths.
//   2. Derive the 3-byte form counts from gap3.
//   3. Use primaries3count lead bytes for the 3-byte form; min4Boundary is the
//      number of values that form covers.
//   4. Size the 4-byte form so that values up to UCOL_MAX_INPUT fit, and
//      derive gap4 from the space left over.
// NOTE(review): the parameter list in the doc fragment below names gap4, but
// the visible signature takes primaries3count instead; gap4 is a local
// computed inside the function. The condition guarding the second status
// assignment and the early returns are not visible in this view.
1204 /* this function is either called from initUCA or from genUCA before
1205 * doing canonical closure for the UCA.
1209 * Set up to generate implicits.
1210 * Maintenance Note: this function may end up being called more than once, due
1211 * to threading races during initialization. Make sure that
1212 * none of the Constants is ever transiently assigned an
1216 * @param minTrail final byte
1217 * @param maxTrail final byte
1218 * @param gap3 the gap we leave for tailoring for 3-byte forms
1219 * @param gap4 the gap we leave for tailoring for 4-byte forms
1221 static void initImplicitConstants(int minPrimary
, int maxPrimary
,
1222 int minTrailIn
, int maxTrailIn
,
1223 int gap3
, int primaries3count
,
1224 UErrorCode
*status
) {
1225 // some simple parameter checks
1226 if ((minPrimary
< 0 || minPrimary
>= maxPrimary
|| maxPrimary
> 0xFF)
1227 || (minTrailIn
< 0 || minTrailIn
>= maxTrailIn
|| maxTrailIn
> 0xFF)
1228 || (primaries3count
< 1))
1230 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1234 minTrail
= minTrailIn
;
1235 maxTrail
= maxTrailIn
;
1237 min3Primary
= minPrimary
;
1238 max4Primary
= maxPrimary
;
1239 // compute constants for use later.
1240 // number of values we can use in trailing bytes
1241 // leave room for empty values between AND above, e.g. if gap = 2
1242 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1243 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1244 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1245 final3Multiplier
= gap3
+ 1;
1246 final3Count
= (maxTrail
- minTrail
+ 1) / final3Multiplier
;
1247 max3Trail
= minTrail
+ (final3Count
- 1) * final3Multiplier
;
1249 // medials can use full range
1250 medialCount
= (maxTrail
- minTrail
+ 1);
1251 // find out how many values fit in each form
1252 int32_t threeByteCount
= medialCount
* final3Count
;
1253 // now determine where the 3/4 boundary is.
1254 // we use 3 bytes below the boundary, and 4 above
1255 int32_t primariesAvailable
= maxPrimary
- minPrimary
+ 1;
1256 int32_t primaries4count
= primariesAvailable
- primaries3count
;
1259 int32_t min3ByteCoverage
= primaries3count
* threeByteCount
;
1260 min4Primary
= minPrimary
+ primaries3count
;
1261 min4Boundary
= min3ByteCoverage
;
1262 // Now expand out the multiplier for the 4 bytes, and redo.
1264 int32_t totalNeeded
= UCOL_MAX_INPUT
- min4Boundary
;
1265 int32_t neededPerPrimaryByte
= divideAndRoundUp(totalNeeded
, primaries4count
);
1266 int32_t neededPerFinalByte
= divideAndRoundUp(neededPerPrimaryByte
, medialCount
* medialCount
);
1267 int32_t gap4
= (maxTrail
- minTrail
- 1) / neededPerFinalByte
;
1269 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1272 final4Multiplier
= gap4
+ 1;
1273 final4Count
= neededPerFinalByte
;
1274 max4Trail
= minTrail
+ (final4Count
- 1) * final4Multiplier
;
// uprv_uca_initImplicitConstants: exported entry point that calls
// initImplicitConstants with fixed arguments: lead bytes from
// minImplicitPrimary to maxImplicitPrimary, trail bytes 0x04 to 0xFE,
// gap3 = 1 and primaries3count = 1. Errors are reported through status.
1278 * Supply parameters for generating implicit CEs
1280 U_CAPI
void U_EXPORT2
1281 uprv_uca_initImplicitConstants(UErrorCode
*status
) {
1282 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1283 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1284 initImplicitConstants(minImplicitPrimary
, maxImplicitPrimary
, 0x04, 0xFE, 1, 1, status
);
// collIterNormalize: normalizes the range [pos - 1, fcdPosition) of the
// source through collationSource->nfd into collationSource->writableBuffer.
// Afterwards pos points at the terminated writable buffer, the previous
// flags are saved in origFlags, UCOL_ITER_INNORMBUF is set, and
// UCOL_ITER_NORM, UCOL_ITER_HASLEN and UCOL_USE_ITERATOR are cleared.
// On failure a diagnostic is written to stderr.
// NOTE(review): the last argument of normalize and the rest of the failure
// branch are not visible in this view.
1288 /* collIterNormalize Incremental Normalization happens here. */
1289 /* pick up the range of chars identified by FCD, */
1290 /* normalize it into the collIterate's writable buffer, */
1291 /* switch the collIterate's state to use the writable buffer. */
1294 void collIterNormalize(collIterate
*collationSource
)
1296 UErrorCode status
= U_ZERO_ERROR
;
1297 const UChar
*srcP
= collationSource
->pos
- 1; /* Start of chars to normalize */
1298 const UChar
*endP
= collationSource
->fcdPosition
; /* End of region to normalize+1 */
1300 collationSource
->nfd
->normalize(UnicodeString(FALSE
, srcP
, (int32_t)(endP
- srcP
)),
1301 collationSource
->writableBuffer
,
1303 if (U_FAILURE(status
)) {
1305 fprintf(stderr
, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status
));
1310 collationSource
->pos
= collationSource
->writableBuffer
.getTerminatedBuffer();
1311 collationSource
->origFlags
= collationSource
->flags
;
1312 collationSource
->flags
|= UCOL_ITER_INNORMBUF
;
1313 collationSource
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
1317 // This function takes the iterator and extracts normalized stuff up to the next boundary
1318 // It is similar in the end results to the collIterNormalize, but for the cases when we
1321 inline void normalizeIterator(collIterate *collationSource) {
1322 UErrorCode status = U_ZERO_ERROR;
1323 UBool wasNormalized = FALSE;
1324 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1325 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1326 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1327 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1328 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1329 // reallocate and terminate
1330 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1331 &collationSource->writableBuffer,
1332 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1336 fprintf(stderr, "normalizeIterator(), out of memory\n");
1340 status = U_ZERO_ERROR;
1341 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1342 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1343 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1344 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1346 // Terminate the buffer - we already checked that it is big enough
1347 collationSource->writableBuffer[normLen] = 0;
1348 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1349 collationSource->flags |= UCOL_ITER_ALLOCATED;
1351 collationSource->pos = collationSource->writableBuffer;
1352 collationSource->origFlags = collationSource->flags;
1353 collationSource->flags |= UCOL_ITER_INNORMBUF;
1354 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
// collIterFCD: forward FCD check starting at the character before pos.
// Each FCD value read with unorm_nextFCD16 carries the leading combining
// class in its upper byte (shifted by SECOND_LAST_BYTE_SHIFT_) and the
// trailing combining class in its low byte (LAST_BYTE_MASK_).
// If the trailing class of the current character is non-zero, the loop scans
// forward until a character with leading class zero is found, backing srcP
// up over that character. needNormalize becomes TRUE when a leading class is
// lower than the preceding trailing class. The scan end is stored in
// collationSource->fcdPosition and needNormalize is returned.
// NOTE(review): the declarations of fcd and leadingCC, the branch that sets
// endP when no length is known, and the loop exits are not visible here.
1358 /* Incremental FCD check and normalize */
1359 /* Called from getNextCE when normalization state is suspect. */
1360 /* When entering, the state is known to be this: */
1361 /* o We are working in the main buffer of the collIterate, not the side */
1362 /* writable buffer. When in the side buffer, normalization mode is always off, */
1363 /* so we won't get here. */
1364 /* o The leading combining class from the current character is 0 or */
1365 /* the trailing combining class of the previous char was zero. */
1366 /* True because the previous call to this function will have always exited */
1367 /* that way, and we get called for every char where cc might be non-zero. */
1369 inline UBool
collIterFCD(collIterate
*collationSource
) {
1370 const UChar
*srcP
, *endP
;
1372 uint8_t prevTrailingCC
= 0;
1374 UBool needNormalize
= FALSE
;
1376 srcP
= collationSource
->pos
-1;
1378 if (collationSource
->flags
& UCOL_ITER_HASLEN
) {
1379 endP
= collationSource
->endp
;
1384 // Get the trailing combining class of the current character. If it's zero,
1387 fcd
= unorm_nextFCD16(fcdTrieIndex
, fcdHighStart
, srcP
, endP
);
1389 prevTrailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1391 if (prevTrailingCC
!= 0) {
1392 // The current char has a non-zero trailing CC. Scan forward until we find
1393 // a char with a leading cc of zero.
1394 while (endP
== NULL
|| srcP
!= endP
)
1396 const UChar
*savedSrcP
= srcP
;
1399 fcd
= unorm_nextFCD16(fcdTrieIndex
, fcdHighStart
, srcP
, endP
);
1400 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1401 if (leadingCC
== 0) {
1402 srcP
= savedSrcP
; // Hit char that is not part of combining sequence.
1403 // back up over it. (Could be surrogate pair!)
1407 if (leadingCC
< prevTrailingCC
) {
1408 needNormalize
= TRUE
;
1411 prevTrailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1416 collationSource
->fcdPosition
= (UChar
*)srcP
;
1418 return needNormalize
;
1421 /****************************************************************************/
1422 /* Following are the CE retrieval functions */
1424 /****************************************************************************/
// Forward declarations of the file-local implicit CE helpers. They are called
// by the CE retrieval functions below when a lookup yields UCOL_NOT_FOUND:
// getImplicit for forward iteration and getPrevImplicit for backward
// iteration.
1426 static uint32_t getImplicit(UChar32 cp
, collIterate
*collationSource
);
1427 static uint32_t getPrevImplicit(UChar32 cp
, collIterate
*collationSource
);
// ucol_IGetNextCE: fetches the next collation element from collationSource.
// Visible flow:
//   1. If CEpos > toReturn, a buffered CE from an earlier expansion is taken
//      from toReturn; when the buffer is drained both pointers are reset to
//      extendCEs (if set) or CEs.
//   2. Otherwise a code unit ch is read: directly from pos on the fast path,
//      bounded by endp when UCOL_ITER_HASLEN is set, or from the
//      UCharIterator when UCOL_USE_ITERATOR is set. UCOL_NO_MORE_CES is
//      returned at the end of input. A null read inside the normalization
//      buffer switches back to the main string (pos = fcdPosition, flags =
//      origFlags) and the loop retries.
//   3. When UCOL_HIRAGANA_Q is set, UCOL_WAS_HIRAGANA is updated from ch.
//   4. FCD tests based on ZERO_CC_LIMIT_ and NFC_ZERO_CC_BLOCK_LIMIT_ decide
//      whether collIterFCD and collIterNormalize must run.
//   5. The CE is looked up in latinOneMapping or in the mapping trie, then in
//      coll->UCA; values above UCOL_NOT_FOUND are resolved through
//      ucol_prv_getSpecialCE; a final UCOL_NOT_FOUND falls back to
//      getImplicit.
// NOTE(review): many branch bodies, loop controls and closing braces are not
// visible in this view, so the exact nesting of the steps above must be
// confirmed against the complete file.
1429 /* there should be a macro version of this function in the header file */
1430 /* This is the first function that tries to fetch a collation element */
1431 /* If it's not successful or it encounters a more difficult situation */
1432 /* some more sophisticated and slower functions are invoked */
1434 inline uint32_t ucol_IGetNextCE(const UCollator
*coll
, collIterate
*collationSource
, UErrorCode
*status
) {
1436 if (collationSource
->CEpos
> collationSource
->toReturn
) { /* Are there any CEs from previous expansions? */
1437 order
= *(collationSource
->toReturn
++); /* if so, return them */
1438 if(collationSource
->CEpos
== collationSource
->toReturn
) {
1439 collationSource
->CEpos
= collationSource
->toReturn
= collationSource
->extendCEs
? collationSource
->extendCEs
: collationSource
->CEs
;
1445 collationSource
->offsetReturn
= NULL
;
1448 for (;;) /* Loop handles case when incremental normalize switches */
1449 { /* to or from the side buffer / original string, and we */
1450 /* need to start again to get the next character. */
1452 if ((collationSource
->flags
& (UCOL_ITER_HASLEN
| UCOL_ITER_INNORMBUF
| UCOL_ITER_NORM
| UCOL_HIRAGANA_Q
| UCOL_USE_ITERATOR
)) == 0)
1454 // The source string is null terminated and we're not working from the side buffer,
1455 // and we're not normalizing. This is the fast path.
1456 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1457 ch
= *collationSource
->pos
++;
1462 return UCOL_NO_MORE_CES
;
1466 if (collationSource
->flags
& UCOL_ITER_HASLEN
) {
1467 // Normal path for strings when length is specified.
1468 // (We can't be in side buffer because it is always null terminated.)
1469 if (collationSource
->pos
>= collationSource
->endp
) {
1470 // Ran off of the end of the main source string. We're done.
1471 return UCOL_NO_MORE_CES
;
1473 ch
= *collationSource
->pos
++;
1475 else if(collationSource
->flags
& UCOL_USE_ITERATOR
) {
1476 UChar32 iterCh
= collationSource
->iterator
->next(collationSource
->iterator
);
1477 if(iterCh
== U_SENTINEL
) {
1478 return UCOL_NO_MORE_CES
;
1484 // Null terminated string.
1485 ch
= *collationSource
->pos
++;
1487 // Ran off end of buffer.
1488 if ((collationSource
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1489 // Ran off end of main string. backing up one character.
1490 collationSource
->pos
--;
1491 return UCOL_NO_MORE_CES
;
1495 // Hit null in the normalize side buffer.
1496 // Usually this means the end of the normalized data,
1497 // except for one odd case: a null followed by combining chars,
1498 // which is the case if we are at the start of the buffer.
1499 if (collationSource
->pos
== collationSource
->writableBuffer
.getBuffer()+1) {
1503 // Null marked end of side buffer.
1504 // Revert to the main string and
1505 // loop back to top to try again to get a character.
1506 collationSource
->pos
= collationSource
->fcdPosition
;
1507 collationSource
->flags
= collationSource
->origFlags
;
1513 if(collationSource
->flags
&UCOL_HIRAGANA_Q
) {
1514 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1515 * based on whether the previous codepoint was Hiragana or Katakana.
1517 if(((ch
>=0x3040 && ch
<=0x3096) || (ch
>= 0x309d && ch
<= 0x309f)) ||
1518 ((collationSource
->flags
& UCOL_WAS_HIRAGANA
) && (ch
>= 0x3099 && ch
<= 0x309C))) {
1519 collationSource
->flags
|= UCOL_WAS_HIRAGANA
;
1521 collationSource
->flags
&= ~UCOL_WAS_HIRAGANA
;
1525 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1526 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1527 if ((collationSource
->flags
& UCOL_ITER_NORM
) == 0) {
1531 if (collationSource
->fcdPosition
>= collationSource
->pos
) {
1532 // An earlier FCD check has already covered the current character.
1533 // We can go ahead and process this char.
1537 if (ch
< ZERO_CC_LIMIT_
) {
1538 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1542 if (ch
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1543 // We need to peek at the next character in order to tell if we are FCD
1544 if ((collationSource
->flags
& UCOL_ITER_HASLEN
) && collationSource
->pos
>= collationSource
->endp
) {
1545 // We are at the last char of source string.
1546 // It is always OK for FCD check.
1550 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1551 if (*collationSource
->pos
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
1557 // Need a more complete FCD check and possible normalization.
1558 if (collIterFCD(collationSource
)) {
1559 collIterNormalize(collationSource
);
1561 if ((collationSource
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
1562 // No normalization was needed. Go ahead and process the char we already had.
1566 // Some normalization happened. Next loop iteration will pick up a char
1567 // from the normalization buffer.
1573 /* For latin-1 characters we never need to fall back to the UCA table */
1574 /* because all of the UCA data is replicated in the latinOneMapping array */
1575 order
= coll
->latinOneMapping
[ch
];
1576 if (order
> UCOL_NOT_FOUND
) {
1577 order
= ucol_prv_getSpecialCE(coll
, ch
, order
, collationSource
, status
);
1582 // Always use UCA for Han, Hangul
1583 // (Han extension A is before main Han block)
1584 // **** Han compatibility chars ?? ****
1585 if ((collationSource
->flags
& UCOL_FORCE_HAN_IMPLICIT
) != 0 &&
1586 (ch
>= UCOL_FIRST_HAN_A
&& ch
<= UCOL_LAST_HANGUL
)) {
1587 if (ch
> UCOL_LAST_HAN
&& ch
< UCOL_FIRST_HANGUL
) {
1588 // between the two target ranges; do normal lookup
1589 // **** this range is YI, Modifier tone letters, ****
1590 // **** Latin-D, Syloti Nagari, Phagas-pa. ****
1591 // **** Latin-D might be tailored, so we need to ****
1592 // **** do the normal lookup for these guys. ****
1593 order
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, ch
);
1595 // in one of the target ranges; use UCA
1596 order
= UCOL_NOT_FOUND
;
1599 order
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, ch
);
1602 if(order
> UCOL_NOT_FOUND
) { /* if a CE is special */
1603 order
= ucol_prv_getSpecialCE(coll
, ch
, order
, collationSource
, status
); /* and try to get the special CE */
1606 if(order
== UCOL_NOT_FOUND
&& coll
->UCA
) { /* We couldn't find a good CE in the tailoring */
1607 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1608 order
= UTRIE_GET32_FROM_LEAD(&coll
->UCA
->mapping
, ch
);
1610 if(order
> UCOL_NOT_FOUND
) { /* UCA also gives us a special CE */
1611 order
= ucol_prv_getSpecialCE(coll
->UCA
, ch
, order
, collationSource
, status
);
1615 } while ( order
== UCOL_IGNORABLE
&& ch
>= UCOL_FIRST_HANGUL
&& ch
<= UCOL_LAST_HANGUL
);
1617 if(order
== UCOL_NOT_FOUND
) {
1618 order
= getImplicit(ch
, collationSource
);
1620 return order
; /* return the CE */
// ucol_getNextCE: exported out-of-line wrapper; forwards its three arguments
// unchanged to the inline ucol_IGetNextCE and returns its result.
1623 /* ucol_getNextCE, out-of-line version for use from other files. */
1624 U_CAPI
uint32_t U_EXPORT2
1625 ucol_getNextCE(const UCollator
*coll
, collIterate
*collationSource
, UErrorCode
*status
) {
1626 return ucol_IGetNextCE(coll
, collationSource
, status
);
// collPrevIterNormalize: backward-iteration counterpart of collIterNormalize.
// Visible steps:
//   1. The range to normalize ends at data->pos (inclusive, hence the + 1 in
//      the length) and starts at data->string when fcdPosition is NULL, or
//      at fcdPosition + 1 otherwise.
//   2. The range is normalized through data->nfd into data->writableBuffer
//      and a zero code unit is inserted at index 0 of that buffer.
//   3. Offsets are recorded with data->appendOffset: the base character
//      offset (when fcdPosition is not NULL), the first mark offset, then
//      trailOffset repeated trailCount times. The first mark offset is
//      replaced by trailOffset when the base character maps to a
//      CONTRACTION_TAG CE.
//   4. pos is set to the end of the normalized text in the writable buffer,
//      flags are saved in origFlags, UCOL_ITER_INNORMBUF is set, and
//      UCOL_ITER_NORM and UCOL_ITER_HASLEN are cleared.
// NOTE(review): the definition of normLen, the last normalize argument and
// the failure branch are not visible in this view.
1631 * Incremental previous normalization happens here. Pick up the range of chars
1632 * identifed by FCD, normalize it into the collIterate's writable buffer,
1633 * switch the collIterate's state to use the writable buffer.
1634 * @param data collation iterator data
1637 void collPrevIterNormalize(collIterate
*data
)
1639 UErrorCode status
= U_ZERO_ERROR
;
1640 const UChar
*pEnd
= data
->pos
; /* End normalize + 1 */
1641 const UChar
*pStart
;
1643 /* Start normalize */
1644 if (data
->fcdPosition
== NULL
) {
1645 pStart
= data
->string
;
1648 pStart
= data
->fcdPosition
+ 1;
1652 data
->nfd
->normalize(UnicodeString(FALSE
, pStart
, (int32_t)((pEnd
- pStart
) + 1)),
1653 data
->writableBuffer
,
1656 if(U_FAILURE(status
)) {
1660 this puts the null termination infront of the normalized string instead
1663 data
->writableBuffer
.insert(0, (UChar
)0);
1666 * The usual case at this point is that we've got a base
1667 * character followed by marks that were normalized. If
1668 * fcdPosition is NULL, that means that we backed up to
1669 * the beginning of the string and there's no base character.
1671 * Forward processing will usually normalize when it sees
1672 * the first mark, so that mark will get it's natural offset
1673 * and the rest will get the offset of the character following
1674 * the marks. The base character will also get its natural offset.
1676 * We write the offset of the base character, if there is one,
1677 * followed by the offset of the first mark and then the offsets
1678 * of the rest of the marks.
1680 int32_t firstMarkOffset
= 0;
1681 int32_t trailOffset
= (int32_t)(data
->pos
- data
->string
+ 1);
1682 int32_t trailCount
= normLen
- 1;
1684 if (data
->fcdPosition
!= NULL
) {
1685 int32_t baseOffset
= (int32_t)(data
->fcdPosition
- data
->string
);
1686 UChar baseChar
= *data
->fcdPosition
;
1688 firstMarkOffset
= baseOffset
+ 1;
1691 * If the base character is the start of a contraction, forward processing
1692 * will normalize the marks while checking for the contraction, which means
1693 * that the offset of the first mark will the same as the other marks.
1695 * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1697 if (baseChar
>= 0x100) {
1698 uint32_t baseOrder
= UTRIE_GET32_FROM_LEAD(&data
->coll
->mapping
, baseChar
);
1700 if (baseOrder
== UCOL_NOT_FOUND
&& data
->coll
->UCA
) {
1701 baseOrder
= UTRIE_GET32_FROM_LEAD(&data
->coll
->UCA
->mapping
, baseChar
);
1704 if (baseOrder
> UCOL_NOT_FOUND
&& getCETag(baseOrder
) == CONTRACTION_TAG
) {
1705 firstMarkOffset
= trailOffset
;
1709 data
->appendOffset(baseOffset
, status
);
1712 data
->appendOffset(firstMarkOffset
, status
);
1714 for (int32_t i
= 0; i
< trailCount
; i
+= 1) {
1715 data
->appendOffset(trailOffset
, status
);
1718 data
->offsetRepeatValue
= trailOffset
;
1720 data
->offsetReturn
= data
->offsetStore
- 1;
1721 if (data
->offsetReturn
== data
->offsetBuffer
) {
1722 data
->offsetStore
= data
->offsetBuffer
;
1725 data
->pos
= data
->writableBuffer
.getTerminatedBuffer() + 1 + normLen
;
1726 data
->origFlags
= data
->flags
;
1727 data
->flags
|= UCOL_ITER_INNORMBUF
;
1728 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
// collPrevIterFCD: backward FCD check starting at data->pos.
// FCD values are read with unorm_prevFCD16 between start (data->string) and
// src (initially data->pos + 1). The upper byte of each value is the leading
// combining class, the low byte the trailing combining class.
// When the leading class of the current character is non-zero the function
// scans backward, comparing the running leadingCC with the trailingCC of
// each preceding character. The final scan position is stored in
// data->fcdPosition; the visible code also sets fcdPosition to NULL on one
// path inside the scan.
// NOTE(review): the declarations of fcd and leadingCC, the loop construct,
// the statements inside the two inner if bodies and the return are not
// visible in this view - confirm against the complete file.
1733 * Incremental FCD check for previous iteration and normalize. Called from
1734 * getPrevCE when normalization state is suspect.
1735 * When entering, the state is known to be this:
1736 * o We are working in the main buffer of the collIterate, not the side
1737 * writable buffer. When in the side buffer, normalization mode is always
1738 * off, so we won't get here.
1739 * o The leading combining class from the current character is 0 or the
1740 * trailing combining class of the previous char was zero.
1741 * True because the previous call to this function will have always exited
1742 * that way, and we get called for every char where cc might be non-zero.
1743 * @param data collation iterate struct
1744 * @return normalization status, TRUE for normalization to be done, FALSE
1748 inline UBool
collPrevIterFCD(collIterate
*data
)
1750 const UChar
*src
, *start
;
1752 uint8_t trailingCC
= 0;
1754 UBool result
= FALSE
;
1756 start
= data
->string
;
1757 src
= data
->pos
+ 1;
1759 /* Get the trailing combining class of the current character. */
1760 fcd
= unorm_prevFCD16(fcdTrieIndex
, fcdHighStart
, start
, src
);
1762 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1764 if (leadingCC
!= 0) {
1766 The current char has a non-zero leading combining class.
1767 Scan backward until we find a char with a trailing cc of zero.
1772 data
->fcdPosition
= NULL
;
1776 fcd
= unorm_prevFCD16(fcdTrieIndex
, fcdHighStart
, start
, src
);
1778 trailingCC
= (uint8_t)(fcd
& LAST_BYTE_MASK_
);
1780 if (trailingCC
== 0) {
1784 if (leadingCC
< trailingCC
) {
1788 leadingCC
= (uint8_t)(fcd
>> SECOND_LAST_BYTE_SHIFT_
);
1792 data
->fcdPosition
= (UChar
*)src
;
// peekCodeUnit: reads one UTF-16 code unit at a relative offset without
// leaving the source position changed.
//   - When source->pos is not NULL the unit is read directly at pos + offset.
//   - Otherwise, when an iterator is present, the iterator is moved by offset
//     from its current position, one unit is read with next, and the iterator
//     is moved back by -offset - 1. A second visible path reads the unit with
//     current instead.
// A negative iterator result is mapped to 0xfffd.
// NOTE(review): the declaration of c and the condition choosing between the
// two iterator paths are not visible in this view.
1797 /** gets a code unit from the string at a given offset
1798 * Handles both normal and iterative cases.
1799 * No error checking - caller beware!
1802 UChar
peekCodeUnit(collIterate
*source
, int32_t offset
) {
1803 if(source
->pos
!= NULL
) {
1804 return *(source
->pos
+ offset
);
1805 } else if(source
->iterator
!= NULL
) {
1808 source
->iterator
->move(source
->iterator
, offset
, UITER_CURRENT
);
1809 c
= source
->iterator
->next(source
->iterator
);
1810 source
->iterator
->move(source
->iterator
, -offset
-1, UITER_CURRENT
);
1812 c
= source
->iterator
->current(source
->iterator
);
1814 return c
>= 0 ? (UChar
)c
: 0xfffd; // If the caller works properly, we should never see c<0.
// peekCodePoint: code point counterpart of peekCodeUnit; offset counts code
// points rather than code units.
//   - Pointer case (source->pos not NULL): a local pointer p walks forward or
//     backward, stepping over a lead unit followed by a trail unit as one
//     code point, and a pair found at the target is combined with
//     U16_GET_SUPPLEMENTARY.
//   - Iterator case: uiter_next32 and uiter_previous32 step over code points
//     and the same number of opposite steps restores the iterator afterwards.
// NOTE(review): the declarations of c, trail and lead, the statements inside
// the skip loops, the reads of the target unit and the return are not visible
// in this view - confirm against the complete file.
1820 // Code point version. Treats the offset as a _code point_ delta.
1821 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
1822 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
1824 UChar32
peekCodePoint(collIterate
*source
, int32_t offset
) {
1826 if(source
->pos
!= NULL
) {
1827 const UChar
*p
= source
->pos
;
1829 // Skip forward over (offset-1) code points.
1830 while(--offset
>= 0) {
1831 if(U16_IS_LEAD(*p
++) && U16_IS_TRAIL(*p
)) {
1835 // Read the code point there.
1838 if(U16_IS_LEAD(c
) && U16_IS_TRAIL(trail
= *p
)) {
1839 c
= U16_GET_SUPPLEMENTARY(c
, trail
);
1841 } else /* offset<0 */ {
1842 // Skip backward over (offset-1) code points.
1843 while(++offset
< 0) {
1844 if(U16_IS_TRAIL(*--p
) && U16_IS_LEAD(*(p
- 1))) {
1848 // Read the code point before that.
1851 if(U16_IS_TRAIL(c
) && U16_IS_LEAD(lead
= *(p
- 1))) {
1852 c
= U16_GET_SUPPLEMENTARY(lead
, c
);
1855 } else if(source
->iterator
!= NULL
) {
1857 // Skip forward over (offset-1) code points.
1858 int32_t fwd
= offset
;
1860 uiter_next32(source
->iterator
);
1862 // Read the code point there.
1863 c
= uiter_current32(source
->iterator
);
1864 // Return to the starting point, skipping backward over (offset-1) code points.
1865 while(offset
-- > 0) {
1866 uiter_previous32(source
->iterator
);
1868 } else /* offset<0 */ {
1869 // Read backward, reading offset code points, remember only the last-read one.
1870 int32_t back
= offset
;
1872 c
= uiter_previous32(source
->iterator
);
1873 } while(++back
< 0);
1874 // Return to the starting position, skipping forward over offset code points.
1876 uiter_next32(source
->iterator
);
1877 } while(++offset
< 0);
// isAtStartPrevIterate: tells whether backward iteration has reached the
// start of the text.
//   - Iterator-only source (pos is NULL, iterator is not NULL): the result is
//     the negation of iterator->hasPrevious.
//   - Otherwise TRUE when pos equals string, or when iterating inside the
//     normalization buffer (UCOL_ITER_INNORMBUF), the unit before pos is zero
//     and fcdPosition is NULL.
1886 * Determines if we are at the start of the data string in the backwards
1887 * collation iterator
1888 * @param data collation iterator
1889 * @return TRUE if we are at the start
1892 inline UBool
isAtStartPrevIterate(collIterate
*data
) {
1893 if(data
->pos
== NULL
&& data
->iterator
!= NULL
) {
1894 return !data
->iterator
->hasPrevious(data
->iterator
);
1896 //return (collIter_bos(data)) ||
1897 return (data
->pos
== data
->string
) ||
1898 ((data
->flags
& UCOL_ITER_INNORMBUF
) &&
1899 *(data
->pos
- 1) == 0 && data
->fcdPosition
== NULL
);
// goBackOne: steps the source back by one unit. The visible statements call
// iterator->previous, once guarded only by the iterator being set and once
// additionally guarded by the UCOL_USE_ITERATOR flag.
// NOTE(review): whether both call sites are compiled (one may belong to a
// disabled section) and whether data->pos is also decremented cannot be
// determined from this view - confirm against the complete file.
1903 inline void goBackOne(collIterate
*data
) {
1905 // somehow, it looks like we need to keep iterator synced up
1906 // at all times, as above.
1910 if(data
->iterator
) {
1911 data
->iterator
->previous(data
->iterator
);
1914 if(data
->iterator
&& (data
->flags
& UCOL_USE_ITERATOR
)) {
1915 data
->iterator
->previous(data
->iterator
);
// ucol_IGetPrevCE: backward counterpart of ucol_IGetNextCE.
// Visible flow:
//   1. Offset bookkeeping: when offsetReturn is set, offsetRepeatCount is
//      counted down or offsetReturn is moved back; at offsetBuffer it is
//      cleared and offsetStore is reset.
//   2. If buffered CEs remain (toReturn is above extendCEs or CEs), toReturn
//      is decremented and that CE becomes the result; when the start of the
//      buffer is reached CEpos is reset to toReturn.
//   3. Otherwise a code unit ch is obtained from the string (bounded by
//      data->string when UCOL_ITER_HASLEN is set), from the iterator via
//      previous, or from the normalization buffer. Reaching the start of the
//      normalization buffer restores origFlags and moves pos back to the main
//      string; UCOL_NO_MORE_CES is returned at the start of input.
//   4. When UCOL_HIRAGANA_Q is set, UCOL_WAS_HIRAGANA is set for ch in
//      0x3040..0x309f and cleared otherwise.
//   5. FCD tests decide whether collPrevIterFCD and collPrevIterNormalize
//      must run.
//   6. Lookup: contraction-ending characters go through
//      ucol_prv_getSpecialPrevCE with UCOL_CONTRACTION; otherwise
//      latinOneMapping or the mapping trie is used, then coll->UCA, with
//      special values resolved by ucol_prv_getSpecialPrevCE and a final
//      UCOL_NOT_FOUND falling back to getPrevImplicit.
// NOTE(review): the status parameter line, the declaration of ch, many branch
// bodies and the return are not visible in this view, so the exact nesting
// must be confirmed against the complete file.
1923 * Inline function that gets a simple CE.
1924 * So what it does is that it will first check the expansion buffer. If the
1925 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1926 * is different from the string pointer, we return the collation element at the
1927 * return pointer and decrement it.
1928 * For more complicated CEs it resorts to getComplicatedCE.
1929 * @param coll collator data
1930 * @param data collation iterator struct
1931 * @param status error status
1934 inline uint32_t ucol_IGetPrevCE(const UCollator
*coll
, collIterate
*data
,
1937 uint32_t result
= (uint32_t)UCOL_NULLORDER
;
1939 if (data
->offsetReturn
!= NULL
) {
1940 if (data
->offsetRepeatCount
> 0) {
1941 data
->offsetRepeatCount
-= 1;
1943 if (data
->offsetReturn
== data
->offsetBuffer
) {
1944 data
->offsetReturn
= NULL
;
1945 data
->offsetStore
= data
->offsetBuffer
;
1947 data
->offsetReturn
-= 1;
1952 if ((data
->extendCEs
&& data
->toReturn
> data
->extendCEs
) ||
1953 (!data
->extendCEs
&& data
->toReturn
> data
->CEs
))
1955 data
->toReturn
-= 1;
1956 result
= *(data
->toReturn
);
1957 if (data
->CEs
== data
->toReturn
|| data
->extendCEs
== data
->toReturn
) {
1958 data
->CEpos
= data
->toReturn
;
1966 Loop handles case when incremental normalize switches to or from the
1967 side buffer / original string, and we need to start again to get the
1971 if (data
->flags
& UCOL_ITER_HASLEN
) {
1973 Normal path for strings when length is specified.
1974 Not in side buffer because it is always null terminated.
1976 if (data
->pos
<= data
->string
) {
1977 /* End of the main source string */
1978 return UCOL_NO_MORE_CES
;
1983 // we are using an iterator to go back. Pray for us!
1984 else if (data
->flags
& UCOL_USE_ITERATOR
) {
1985 UChar32 iterCh
= data
->iterator
->previous(data
->iterator
);
1986 if(iterCh
== U_SENTINEL
) {
1987 return UCOL_NO_MORE_CES
;
1995 /* we are in the side buffer. */
1998 At the start of the normalize side buffer.
2000 Because pointer points to the last accessed character,
2001 hence we have to increment it by one here.
2003 data
->flags
= data
->origFlags
;
2004 data
->offsetRepeatValue
= 0;
2006 if (data
->fcdPosition
== NULL
) {
2007 data
->pos
= data
->string
;
2008 return UCOL_NO_MORE_CES
;
2011 data
->pos
= data
->fcdPosition
+ 1;
2018 if(data
->flags
&UCOL_HIRAGANA_Q
) {
2019 if(ch
>=0x3040 && ch
<=0x309f) {
2020 data
->flags
|= UCOL_WAS_HIRAGANA
;
2022 data
->flags
&= ~UCOL_WAS_HIRAGANA
;
2027 * got a character to determine if there's fcd and/or normalization
2029 * if the current character is not fcd.
2030 * if current character is at the start of the string
2031 * Trailing combining class == 0.
2032 * Note if pos is in the writablebuffer, norm is always 0
2034 if (ch
< ZERO_CC_LIMIT_
||
2035 // this should propel us out of the loop in the iterator case
2036 (data
->flags
& UCOL_ITER_NORM
) == 0 ||
2037 (data
->fcdPosition
!= NULL
&& data
->fcdPosition
<= data
->pos
)
2038 || data
->string
== data
->pos
) {
2042 if (ch
< NFC_ZERO_CC_BLOCK_LIMIT_
) {
2043 /* if next character is FCD */
2044 if (data
->pos
== data
->string
) {
2045 /* First char of string is always OK for FCD check */
2049 /* Not first char of string, do the FCD fast test */
2050 if (*(data
->pos
- 1) < NFC_ZERO_CC_BLOCK_LIMIT_
) {
2055 /* Need a more complete FCD check and possible normalization. */
2056 if (collPrevIterFCD(data
)) {
2057 collPrevIterNormalize(data
);
2060 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
2061 /* No normalization. Go ahead and process the char. */
2066 Some normalization happened.
2067 Next loop picks up a char from the normalization buffer.
2071 /* attempt to handle contractions, after removal of the backwards
2074 if (ucol_contractionEndCP(ch
, coll
) && !isAtStartPrevIterate(data
)) {
2075 result
= ucol_prv_getSpecialPrevCE(coll
, ch
, UCOL_CONTRACTION
, data
, status
);
2078 result
= coll
->latinOneMapping
[ch
];
2081 // Always use UCA for [3400..9FFF], [AC00..D7AF]
2082 // **** [FA0E..FA2F] ?? ****
2083 if ((data
->flags
& UCOL_FORCE_HAN_IMPLICIT
) != 0 &&
2084 (ch
>= 0x3400 && ch
<= 0xD7AF)) {
2085 if (ch
> 0x9FFF && ch
< 0xAC00) {
2086 // between the two target ranges; do normal lookup
2087 // **** this range is YI, Modifier tone letters, ****
2088 // **** Latin-D, Syloti Nagari, Phagas-pa. ****
2089 // **** Latin-D might be tailored, so we need to ****
2090 // **** do the normal lookup for these guys. ****
2091 result
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, ch
);
2093 result
= UCOL_NOT_FOUND
;
2096 result
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, ch
);
2099 if (result
> UCOL_NOT_FOUND
) {
2100 result
= ucol_prv_getSpecialPrevCE(coll
, ch
, result
, data
, status
);
2102 if (result
== UCOL_NOT_FOUND
) { // Not found in master list
2103 if (!isAtStartPrevIterate(data
) &&
2104 ucol_contractionEndCP(ch
, data
->coll
))
2106 result
= UCOL_CONTRACTION
;
2109 result
= UTRIE_GET32_FROM_LEAD(&coll
->UCA
->mapping
, ch
);
2113 if (result
> UCOL_NOT_FOUND
) {
2115 result
= ucol_prv_getSpecialPrevCE(coll
->UCA
, ch
, result
, data
, status
);
2120 } while ( result
== UCOL_IGNORABLE
&& ch
>= UCOL_FIRST_HANGUL
&& ch
<= UCOL_LAST_HANGUL
);
2122 if(result
== UCOL_NOT_FOUND
) {
2123 result
= getPrevImplicit(ch
, data
);
// ucol_getPrevCE: exported out-of-line wrapper; forwards its three arguments
// unchanged to the inline ucol_IGetPrevCE and returns its result.
2131 /* ucol_getPrevCE, out-of-line version for use from other files. */
2132 U_CFUNC
uint32_t U_EXPORT2
2133 ucol_getPrevCE(const UCollator
*coll
, collIterate
*data
,
2134 UErrorCode
*status
) {
2135 return ucol_IGetPrevCE(coll
, data
, status
);
2139 /* this should be connected to special Jamo handling */
2140 U_CFUNC
uint32_t U_EXPORT2
2141 ucol_getFirstCE(const UCollator
*coll
, UChar u
, UErrorCode
*status
) {
2143 IInit_collIterate(coll
, &u
, 1, &colIt
, status
);
2144 if(U_FAILURE(*status
)) {
2147 return ucol_IGetNextCE(coll
, &colIt
, status
);
2151 * Inserts the argument character into the end of the buffer pushing back the
2153 * @param data collIterate struct data
2154 * @param ch character to be appended
2155 * @return the position of the new addition
2158 inline const UChar
* insertBufferEnd(collIterate
*data
, UChar ch
)
2160 int32_t oldLength
= data
->writableBuffer
.length();
2161 return data
->writableBuffer
.append(ch
).getTerminatedBuffer() + oldLength
;
2165 * Inserts the argument string into the end of the buffer pushing back the
2167 * @param data collIterate struct data
2168 * @param string to be appended
2169 * @param length of the string to be appended
2170 * @return the position of the new addition
2173 inline const UChar
* insertBufferEnd(collIterate
*data
, const UChar
*str
, int32_t length
)
2175 int32_t oldLength
= data
->writableBuffer
.length();
2176 return data
->writableBuffer
.append(str
, length
).getTerminatedBuffer() + oldLength
;
2180 * Special normalization function for contraction in the forwards iterator.
2181 * This normalization sequence will place the current character at source->pos
2182 * and its following normalized sequence into the buffer.
2183 * The fcd position, pos will be changed.
2184 * pos will now point to positions in the buffer.
2185 * Flags will be changed accordingly.
2186 * @param data collation iterator data
2189 inline void normalizeNextContraction(collIterate
*data
)
2192 UErrorCode status
= U_ZERO_ERROR
;
2193 /* because the pointer points to the next character */
2194 const UChar
*pStart
= data
->pos
- 1;
2197 if ((data
->flags
& UCOL_ITER_INNORMBUF
) == 0) {
2198 data
->writableBuffer
.setTo(*(pStart
- 1));
2202 strsize
= data
->writableBuffer
.length();
2205 pEnd
= data
->fcdPosition
;
2207 data
->writableBuffer
.append(
2208 data
->nfd
->normalize(UnicodeString(FALSE
, pStart
, (int32_t)(pEnd
- pStart
)), status
));
2209 if(U_FAILURE(status
)) {
2213 data
->pos
= data
->writableBuffer
.getTerminatedBuffer() + strsize
;
2214 data
->origFlags
= data
->flags
;
2215 data
->flags
|= UCOL_ITER_INNORMBUF
;
2216 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
2220 * Contraction character management function that returns the next character
2221 * for the forwards iterator.
2222 * Does nothing if the next character is in buffer and not the first character
2224 * Else it checks next character in data string to see if it is normalizable.
2225 * If it is not, the character is simply copied into the buffer, else
2226 * the whole normalized substring is copied into the buffer, including the
2227 * current character.
2228 * @param data collation element iterator data
2229 * @return next character
2232 inline UChar
getNextNormalizedChar(collIterate
*data
)
2236 // Here we need to add the iterator code. One problem is the way
2237 // end of string is handled. If we just return next char, it could
2238 // be the sentinel. Most of the cases already check for this, but we
2240 if ((data
->flags
& (UCOL_ITER_NORM
| UCOL_ITER_INNORMBUF
)) == 0 ) {
2241 /* if no normalization and not in buffer. */
2242 if(data
->flags
& UCOL_USE_ITERATOR
) {
2243 return (UChar
)data
->iterator
->next(data
->iterator
);
2245 return *(data
->pos
++);
2249 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2250 //normalizeIterator(data);
2253 UBool innormbuf
= (UBool
)(data
->flags
& UCOL_ITER_INNORMBUF
);
2254 if ((innormbuf
&& *data
->pos
!= 0) ||
2255 (data
->fcdPosition
!= NULL
&& !innormbuf
&&
2256 data
->pos
< data
->fcdPosition
)) {
2258 if next character is in normalized buffer, no further normalization
2261 return *(data
->pos
++);
2264 if (data
->flags
& UCOL_ITER_HASLEN
) {
2265 /* in data string */
2266 if (data
->pos
+ 1 == data
->endp
) {
2267 return *(data
->pos
++);
2272 // inside the normalization buffer, but at the end
2273 // (since we encountered zero). This means, in the
2274 // case we're using char iterator, that we need to
2275 // do another round of normalization.
2276 //if(data->origFlags & UCOL_USE_ITERATOR) {
2277 // we need to restore original flags,
2278 // otherwise, we'll lose them
2279 //data->flags = data->origFlags;
2280 //normalizeIterator(data);
2281 //return *(data->pos++);
2284 in writable buffer, at this point fcdPosition can not be
2285 pointing to the end of the data string. see contracting tag.
2287 if(data
->fcdPosition
) {
2288 if (*(data
->fcdPosition
+ 1) == 0 ||
2289 data
->fcdPosition
+ 1 == data
->endp
) {
2290 /* at the end of the string, dump it into the normalizer */
2291 data
->pos
= insertBufferEnd(data
, *(data
->fcdPosition
)) + 1;
2292 // Check if data->pos received a null pointer
2293 if (data
->pos
== NULL
) {
2294 return (UChar
)-1; // Return to indicate error.
2296 return *(data
->fcdPosition
++);
2298 data
->pos
= data
->fcdPosition
;
2299 } else if(data
->origFlags
& UCOL_USE_ITERATOR
) {
2300 // if we are here, we're using a normalizing iterator.
2301 // we should just continue further.
2302 data
->flags
= data
->origFlags
;
2304 return (UChar
)data
->iterator
->next(data
->iterator
);
2309 if (*(data
->pos
+ 1) == 0) {
2310 return *(data
->pos
++);
2316 nextch
= *data
->pos
;
2319 * if the current character is not fcd.
2320 * Trailing combining class == 0.
2322 if ((data
->fcdPosition
== NULL
|| data
->fcdPosition
< data
->pos
) &&
2323 (nextch
>= NFC_ZERO_CC_BLOCK_LIMIT_
||
2324 ch
>= NFC_ZERO_CC_BLOCK_LIMIT_
)) {
2326 Need a more complete FCD check and possible normalization.
2327 normalize substring will be appended to buffer
2329 if (collIterFCD(data
)) {
2330 normalizeNextContraction(data
);
2331 return *(data
->pos
++);
2333 else if (innormbuf
) {
2334 /* fcdposition shifted even when there's no normalization, if we
2335 don't input the rest into this, we'll get the wrong position when
2336 we reach the end of the writableBuffer */
2337 int32_t length
= (int32_t)(data
->fcdPosition
- data
->pos
+ 1);
2338 data
->pos
= insertBufferEnd(data
, data
->pos
- 1, length
);
2339 // Check if data->pos received a null pointer
2340 if (data
->pos
== NULL
) {
2341 return (UChar
)-1; // Return to indicate error.
2343 return *(data
->pos
++);
2349 no normalization is to be done hence only one character will be
2350 appended to the buffer.
2352 data
->pos
= insertBufferEnd(data
, ch
) + 1;
2353 // Check if data->pos received a null pointer
2354 if (data
->pos
== NULL
) {
2355 return (UChar
)-1; // Return to indicate error.
2359 /* points back to the pos in string */
2366 * Function to copy the buffer into writableBuffer and sets the fcd position to
2367 * the correct position
2368 * @param source data string source
2369 * @param buffer character buffer
2372 inline void setDiscontiguosAttribute(collIterate
*source
, const UnicodeString
&buffer
)
2374 /* okay confusing part here. to ensure that the skipped characters are
2375 considered later, we need to place it in the appropriate position in the
2376 normalization buffer and reassign the pos pointer. simple case if pos
2377 reside in string, simply copy to normalization buffer and
2378 fcdposition = pos, pos = start of normalization buffer. if pos in
2379 normalization buffer, we'll insert the copy infront of pos and point pos
2380 to the start of the normalization buffer. why am i doing these copies?
2381 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2382 not require any changes, which be really painful. */
2383 if (source
->flags
& UCOL_ITER_INNORMBUF
) {
2384 int32_t replaceLength
= source
->pos
- source
->writableBuffer
.getBuffer();
2385 source
->writableBuffer
.replace(0, replaceLength
, buffer
);
2388 source
->fcdPosition
= source
->pos
;
2389 source
->origFlags
= source
->flags
;
2390 source
->flags
|= UCOL_ITER_INNORMBUF
;
2391 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
| UCOL_USE_ITERATOR
);
2392 source
->writableBuffer
= buffer
;
2395 source
->pos
= source
->writableBuffer
.getTerminatedBuffer();
2399 * Function to get the discontiguos collation element within the source.
2400 * Note this function will set the position to the appropriate places.
2401 * @param coll current collator used
2402 * @param source data string source
2403 * @param constart index to the start character in the contraction table
2404 * @return discontiguos collation element offset
2407 uint32_t getDiscontiguous(const UCollator
*coll
, collIterate
*source
,
2408 const UChar
*constart
)
2410 /* source->pos currently points to the second combining character after
2411 the start character */
2412 const UChar
*temppos
= source
->pos
;
2413 UnicodeString buffer
;
2414 const UChar
*tempconstart
= constart
;
2415 uint8_t tempflags
= source
->flags
;
2416 UBool multicontraction
= FALSE
;
2417 collIterateState discState
;
2419 backupState(source
, &discState
);
2421 buffer
.setTo(peekCodePoint(source
, -1));
2428 if (((source
->flags
& UCOL_ITER_HASLEN
) && source
->pos
>= source
->endp
)
2429 || (peekCodeUnit(source
, 0) == 0 &&
2430 //|| (*source->pos == 0 &&
2431 ((source
->flags
& UCOL_ITER_INNORMBUF
) == 0 ||
2432 source
->fcdPosition
== NULL
||
2433 source
->fcdPosition
== source
->endp
||
2434 *(source
->fcdPosition
) == 0 ||
2435 u_getCombiningClass(*(source
->fcdPosition
)) == 0)) ||
2436 /* end of string in null terminated string or stopped by a
2437 null character, note fcd does not always point to a base
2438 character after the discontiguos change */
2439 u_getCombiningClass(peekCodePoint(source
, 0)) == 0) {
2440 //u_getCombiningClass(*(source->pos)) == 0) {
2441 //constart = (UChar *)coll->image + getContractOffset(CE);
2442 if (multicontraction
) {
2443 source
->pos
= temppos
- 1;
2444 setDiscontiguosAttribute(source
, buffer
);
2445 return *(coll
->contractionCEs
+
2446 (tempconstart
- coll
->contractionIndex
));
2448 constart
= tempconstart
;
2452 UCharOffset
= (UChar
*)(tempconstart
+ 1); /* skip the backward offset*/
2453 schar
= getNextNormalizedChar(source
);
2455 while (schar
> (tchar
= *UCharOffset
)) {
2459 if (schar
!= tchar
) {
2460 /* not the correct codepoint. we stuff the current codepoint into
2461 the discontiguos buffer and try the next character */
2462 buffer
.append(schar
);
2466 if (u_getCombiningClass(schar
) ==
2467 u_getCombiningClass(peekCodePoint(source
, -2))) {
2468 buffer
.append(schar
);
2471 result
= *(coll
->contractionCEs
+
2472 (UCharOffset
- coll
->contractionIndex
));
2475 if (result
== UCOL_NOT_FOUND
) {
2477 } else if (isContraction(result
)) {
2478 /* this is a multi-contraction*/
2479 tempconstart
= (UChar
*)coll
->image
+ getContractOffset(result
);
2480 if (*(coll
->contractionCEs
+ (constart
- coll
->contractionIndex
))
2481 != UCOL_NOT_FOUND
) {
2482 multicontraction
= TRUE
;
2483 temppos
= source
->pos
+ 1;
2486 setDiscontiguosAttribute(source
, buffer
);
2491 /* no problems simply reverting just like that,
2492 if we are in string before getting into this function, points back to
2493 string hence no problem.
2494 if we are in normalization buffer before getting into this function,
2495 since we'll never use another normalization within this function, we
2496 know that fcdposition points to a base character. the normalization buffer
2497 never change, hence this revert works. */
2498 loadState(source
, &discState
, TRUE
);
2501 //source->pos = temppos - 1;
2502 source
->flags
= tempflags
;
2503 return *(coll
->contractionCEs
+ (constart
- coll
->contractionIndex
));
2506 /* now uses Mark's getImplicitPrimary code */
2508 inline uint32_t getImplicit(UChar32 cp
, collIterate
*collationSource
) {
2509 uint32_t r
= uprv_uca_getImplicitPrimary(cp
);
2510 *(collationSource
->CEpos
++) = ((r
& 0x0000FFFF)<<16) | 0x000000C0;
2511 collationSource
->offsetRepeatCount
+= 1;
2512 return (r
& UCOL_PRIMARYMASK
) | 0x00000505; // This was 'order'
2516 * Inserts the argument character into the front of the buffer replacing the
2517 * front null terminator.
2518 * @param data collation element iterator data
2519 * @param ch character to be appended
2522 inline void insertBufferFront(collIterate
*data
, UChar ch
)
2524 data
->pos
= data
->writableBuffer
.setCharAt(0, ch
).insert(0, (UChar
)0).getTerminatedBuffer() + 2;
2528 * Special normalization function for contraction in the previous iterator.
2529 * This normalization sequence will place the current character at source->pos
2530 * and its following normalized sequence into the buffer.
2531 * The fcd position, pos will be changed.
2532 * pos will now point to positions in the buffer.
2533 * Flags will be changed accordingly.
2534 * @param data collation iterator data
2537 inline void normalizePrevContraction(collIterate
*data
, UErrorCode
*status
)
2539 const UChar
*pEnd
= data
->pos
+ 1; /* End normalize + 1 */
2540 const UChar
*pStart
;
2542 UnicodeString endOfBuffer
;
2543 if (data
->flags
& UCOL_ITER_HASLEN
) {
2545 normalization buffer not used yet, we'll pull down the next
2546 character into the end of the buffer
2548 endOfBuffer
.setTo(*pEnd
);
2551 endOfBuffer
.setTo(data
->writableBuffer
, 1); // after the leading NUL
2554 if (data
->fcdPosition
== NULL
) {
2555 pStart
= data
->string
;
2558 pStart
= data
->fcdPosition
+ 1;
2561 data
->nfd
->normalize(UnicodeString(FALSE
, pStart
, (int32_t)(pEnd
- pStart
)),
2562 data
->writableBuffer
,
2565 if(U_FAILURE(*status
)) {
2569 this puts the null termination infront of the normalized string instead
2573 data
->writableBuffer
.insert(0, (UChar
)0).append(endOfBuffer
).getTerminatedBuffer() +
2575 data
->origFlags
= data
->flags
;
2576 data
->flags
|= UCOL_ITER_INNORMBUF
;
2577 data
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
2581 * Contraction character management function that returns the previous character
2582 * for the backwards iterator.
2583 * Does nothing if the previous character is in buffer and not the first
2585 * Else it checks previous character in data string to see if it is
2587 * If it is not, the character is simply copied into the buffer, else
2588 * the whole normalized substring is copied into the buffer, including the
2589 * current character.
2590 * @param data collation element iterator data
2591 * @return previous character
2594 inline UChar
getPrevNormalizedChar(collIterate
*data
, UErrorCode
*status
)
2599 UBool innormbuf
= (UBool
)(data
->flags
& UCOL_ITER_INNORMBUF
);
2600 if ((data
->flags
& (UCOL_ITER_NORM
| UCOL_ITER_INNORMBUF
)) == 0 ||
2601 (innormbuf
&& *(data
->pos
- 1) != 0)) {
2603 if no normalization.
2604 if previous character is in normalized buffer, no further normalization
2607 if(data
->flags
& UCOL_USE_ITERATOR
) {
2608 data
->iterator
->move(data
->iterator
, -1, UITER_CURRENT
);
2609 return (UChar
)data
->iterator
->next(data
->iterator
);
2611 return *(data
->pos
- 1);
2616 if ((data
->fcdPosition
==NULL
)||(data
->flags
& UCOL_ITER_HASLEN
)) {
2617 /* in data string */
2618 if ((start
- 1) == data
->string
) {
2619 return *(start
- 1);
2623 prevch
= *(start
- 1);
2627 in writable buffer, at this point fcdPosition can not be NULL.
2628 see contracting tag.
2630 if (data
->fcdPosition
== data
->string
) {
2631 /* at the start of the string, just dump it into the normalizer */
2632 insertBufferFront(data
, *(data
->fcdPosition
));
2633 data
->fcdPosition
= NULL
;
2634 return *(data
->pos
- 1);
2636 start
= data
->fcdPosition
;
2638 prevch
= *(start
- 1);
2641 * if the current character is not fcd.
2642 * Trailing combining class == 0.
2644 if (data
->fcdPosition
> start
&&
2645 (ch
>= NFC_ZERO_CC_BLOCK_LIMIT_
|| prevch
>= NFC_ZERO_CC_BLOCK_LIMIT_
))
2648 Need a more complete FCD check and possible normalization.
2649 normalize substring will be appended to buffer
2651 const UChar
*backuppos
= data
->pos
;
2653 if (collPrevIterFCD(data
)) {
2654 normalizePrevContraction(data
, status
);
2655 return *(data
->pos
- 1);
2657 data
->pos
= backuppos
;
2658 data
->fcdPosition
++;
2663 no normalization is to be done hence only one character will be
2664 appended to the buffer.
2666 insertBufferFront(data
, ch
);
2667 data
->fcdPosition
--;
2673 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2674 /* It is called by getNextCE */
2676 /* The following should be even */
2677 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
2679 uint32_t ucol_prv_getSpecialCE(const UCollator
*coll
, UChar ch
, uint32_t CE
, collIterate
*source
, UErrorCode
*status
) {
2680 collIterateState entryState
;
2681 backupState(source
, &entryState
);
2685 // This loop will repeat only in the case of contractions, and only when a contraction
2686 // is found and the first CE resulting from that contraction is itself a special
2687 // (an expansion, for example.) All other special CE types are fully handled the
2688 // first time through, and the loop exits.
2690 const uint32_t *CEOffset
= NULL
;
2691 switch(getCETag(CE
)) {
2693 /* This one is not found, and we'll let somebody else bother about it... no more games */
2697 // Special processing is getting a CE that is preceded by a certain prefix
2698 // Currently this is only needed for optimizing Japanese length and iteration marks.
2699 // When we encounter a special processing tag, we go backwards and try to see if
2701 // Contraction tables are used - so the whole process is not unlike contraction.
2702 // prefix data is stored backwards in the table.
2703 const UChar
*UCharOffset
;
2705 collIterateState prefixState
;
2706 backupState(source
, &prefixState
);
2707 loadState(source
, &entryState
, TRUE
);
2708 goBackOne(source
); // We want to look at the point where we entered - actually one
2712 // This loop will run once per source string character, for as long as we
2713 // are matching a potential contraction sequence
2715 // First we position ourselves at the beginning of contraction sequence
2716 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
2717 if (collIter_bos(source
)) {
2718 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
2721 schar
= getPrevNormalizedChar(source
, status
);
2724 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2728 if (schar
== tchar
) {
2729 // Found the source string char in the table.
2730 // Pick up the corresponding CE from the table.
2731 CE
= *(coll
->contractionCEs
+
2732 (UCharOffset
- coll
->contractionIndex
));
2736 // Source string char was not in the table.
2737 // We have not found the prefix.
2738 CE
= *(coll
->contractionCEs
+
2739 (ContractionStart
- coll
->contractionIndex
));
2743 // The source string char was in the contraction table, and the corresponding
2744 // CE is not a prefix CE. We found the prefix, break
2745 // out of loop, this CE will end up being returned. This is the normal
2746 // way out of prefix handling when the source actually contained
2751 if(CE
!= UCOL_NOT_FOUND
) { // we found something and we can merilly continue
2752 loadState(source
, &prefixState
, TRUE
);
2753 if(source
->origFlags
& UCOL_USE_ITERATOR
) {
2754 source
->flags
= source
->origFlags
;
2756 } else { // prefix search was a failure, we have to backup all the way to the start
2757 loadState(source
, &entryState
, TRUE
);
2761 case CONTRACTION_TAG
:
2763 /* This should handle contractions */
2764 collIterateState state
;
2765 backupState(source
, &state
);
2766 uint32_t firstCE
= *(coll
->contractionCEs
+ ((UChar
*)coll
->image
+getContractOffset(CE
) - coll
->contractionIndex
)); //UCOL_NOT_FOUND;
2767 const UChar
*UCharOffset
;
2771 /* This loop will run once per source string character, for as long as we */
2772 /* are matching a potential contraction sequence */
2774 /* First we position ourselves at the beginning of contraction sequence */
2775 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
2777 if (collIter_eos(source
)) {
2778 // Ran off the end of the source string.
2779 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
2780 // So we'll pick whatever we have at the point...
2781 if (CE
== UCOL_NOT_FOUND
) {
2782 // back up the source over all the chars we scanned going into this contraction.
2784 loadState(source
, &state
, TRUE
);
2785 if(source
->origFlags
& UCOL_USE_ITERATOR
) {
2786 source
->flags
= source
->origFlags
;
2792 uint8_t maxCC
= (uint8_t)(*(UCharOffset
)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2793 uint8_t allSame
= (uint8_t)(*(UCharOffset
++)>>8);
2795 schar
= getNextNormalizedChar(source
);
2796 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2800 if (schar
== tchar
) {
2801 // Found the source string char in the contraction table.
2802 // Pick up the corresponding CE from the table.
2803 CE
= *(coll
->contractionCEs
+
2804 (UCharOffset
- coll
->contractionIndex
));
2808 // Source string char was not in contraction table.
2809 // Unless we have a discontiguous contraction, we have finished
2810 // with this contraction.
2811 // in order to do the proper detection, we
2812 // need to see if we're dealing with a supplementary
2813 /* We test whether the next two char are surrogate pairs.
2814 * This test is done if the iterator is not NULL.
2815 * If there is no surrogate pair, the iterator
2816 * goes back one if needed. */
2817 UChar32 miss
= schar
;
2818 if (source
->iterator
) {
2819 UChar32 surrNextChar
; /* the next char in the iteration to test */
2820 int32_t prevPos
; /* holds the previous position before move forward of the source iterator */
2821 if(U16_IS_LEAD(schar
) && source
->iterator
->hasNext(source
->iterator
)) {
2822 prevPos
= source
->iterator
->index
;
2823 surrNextChar
= getNextNormalizedChar(source
);
2824 if (U16_IS_TRAIL(surrNextChar
)) {
2825 miss
= U16_GET_SUPPLEMENTARY(schar
, surrNextChar
);
2826 } else if (prevPos
< source
->iterator
->index
){
2830 } else if (U16_IS_LEAD(schar
)) {
2831 miss
= U16_GET_SUPPLEMENTARY(schar
, getNextNormalizedChar(source
));
2837 (sCC
= i_getCombiningClass(miss
, coll
)) == 0 ||
2839 (allSame
!= 0 && sCC
== maxCC
) ||
2840 collIter_eos(source
))
2842 // Contraction can not be discontiguous.
2843 goBackOne(source
); // back up the source string by one,
2844 // because the character we just looked at was
2845 // not part of the contraction. */
2846 if(U_IS_SUPPLEMENTARY(miss
)) {
2849 CE
= *(coll
->contractionCEs
+
2850 (ContractionStart
- coll
->contractionIndex
));
2853 // Contraction is possibly discontiguous.
2854 // Scan more of source string looking for a match
2857 /* find the next character if schar is not a base character
2858 and we are not yet at the end of the string */
2859 tempchar
= getNextNormalizedChar(source
);
2860 // probably need another supplementary thingie here
2862 if (i_getCombiningClass(tempchar
, coll
) == 0) {
2864 if(U_IS_SUPPLEMENTARY(miss
)) {
2867 /* Spit out the last char of the string, wasn't tasty enough */
2868 CE
= *(coll
->contractionCEs
+
2869 (ContractionStart
- coll
->contractionIndex
));
2871 CE
= getDiscontiguous(coll
, source
, ContractionStart
);
2874 } // else after if(schar == tchar)
2876 if(CE
== UCOL_NOT_FOUND
) {
2877 /* The Source string did not match the contraction that we were checking. */
2878 /* Back up the source position to undo the effects of having partially */
2879 /* scanned through what ultimately proved to not be a contraction. */
2880 loadState(source
, &state
, TRUE
);
2885 if(!isContraction(CE
)) {
2886 // The source string char was in the contraction table, and the corresponding
2887 // CE is not a contraction CE. We completed the contraction, break
2888 // out of loop, this CE will end up being returned. This is the normal
2889 // way out of contraction handling when the source actually contained
2895 // The source string char was in the contraction table, and the corresponding
2896 // CE IS a contraction CE. We will continue looping to check the source
2897 // string for the remaining chars in the contraction.
2898 uint32_t tempCE
= *(coll
->contractionCEs
+ (ContractionStart
- coll
->contractionIndex
));
2899 if(tempCE
!= UCOL_NOT_FOUND
) {
2900 // We have scanned a section of source string for which there is a
2901 // CE from the contraction table. Remember the CE and scan position, so
2902 // that we can return to this point if further scanning fails to
2903 // match a longer contraction sequence.
2907 backupState(source
, &state
);
2908 getNextNormalizedChar(source
);
2910 // Another way to do this is:
2911 //collIterateState tempState;
2912 //backupState(source, &tempState);
2913 //goBackOne(source);
2914 //backupState(source, &state);
2915 //loadState(source, &tempState, TRUE);
2917 // The problem is that for incomplete contractions we have to remember the previous
2918 // position. Before, the only thing I needed to do was state.pos--;
2919 // After iterator introduction and especially after introduction of normalizing
2920 // iterators, it became much more difficult to decrease the saved state.
2921 // I'm not yet sure which of the two methods above is faster.
2925 } // case CONTRACTION_TAG:
2926 case LONG_PRIMARY_TAG
:
2928 *(source
->CEpos
++) = ((CE
& 0xFF)<<24)|UCOL_CONTINUATION_MARKER
;
2929 CE
= ((CE
& 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON
;
2930 source
->offsetRepeatCount
+= 1;
2935 /* This should handle expansion. */
2936 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2937 /* I have to decide where continuations are going to be dealt with */
2939 uint32_t i
; /* general counter */
2941 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
2942 size
= getExpansionCount(CE
);
2944 //source->offsetRepeatCount = -1;
2946 if(size
!= 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2947 for(i
= 1; i
<size
; i
++) {
2948 *(source
->CEpos
++) = *CEOffset
++;
2949 source
->offsetRepeatCount
+= 1;
2951 } else { /* else, we do */
2952 while(*CEOffset
!= 0) {
2953 *(source
->CEpos
++) = *CEOffset
++;
2954 source
->offsetRepeatCount
+= 1;
2963 We do a check to see if we want to collate digits as numbers; if so we generate
2964 a custom collation key. Otherwise we pull out the value stored in the expansion table.
2967 uint32_t i
; /* general counter */
2969 if (source
->coll
->numericCollation
== UCOL_ON
){
2970 collIterateState digitState
= {0,0,0,0,0,0,0,0,0};
2974 uint32_t digIndx
= 0;
2975 uint32_t endIndex
= 0;
2976 uint32_t trailingZeroIndex
= 0;
2978 uint8_t collateVal
= 0;
2980 UBool nonZeroValReached
= FALSE
;
2982 uint8_t numTempBuf
[UCOL_MAX_DIGITS_FOR_NUMBER
/2 + 3]; // I just need a temporary place to store my generated CEs.
2984 We parse the source string until we hit a char that's NOT a digit.
2985 Use this u_charDigitValue. This might be slow because we have to
2986 handle surrogates...
2989 if (U16_IS_LEAD(ch)){
2990 if (!collIter_eos(source)) {
2991 backupState(source, &digitState);
2992 UChar trail = getNextNormalizedChar(source);
2993 if(U16_IS_TRAIL(trail)) {
2994 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2996 loadState(source, &digitState, TRUE);
3005 digVal = u_charDigitValue(char32);
3007 digVal
= u_charDigitValue(cp
); // if we have arrived here, we have
3008 // already processed possible supplementaries that triggered the digit tag -
3009 // all supplementaries are marked in the UCA.
3011 We pad a zero in front of the first element anyways. This takes
3012 care of the (probably) most common case where people are sorting things followed
3017 // Make sure we have enough space. No longer needed;
3018 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3019 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3020 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3022 // Skipping over leading zeroes.
3024 nonZeroValReached
= TRUE
;
3026 if (nonZeroValReached
) {
3028 We parse the digit string into base 100 numbers (this fits into a byte).
3029 We only add to the buffer in twos, thus if we are parsing an odd character,
3030 that serves as the 'tens' digit while the if we are parsing an even one, that
3031 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3032 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3033 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3034 than all the other bytes.
3037 if (digIndx
% 2 == 1){
3038 collateVal
+= (uint8_t)digVal
;
3040 // We don't enter the low-order-digit case unless we've already seen
3041 // the high order, or for the first digit, which is always non-zero.
3042 if (collateVal
!= 0)
3043 trailingZeroIndex
= 0;
3045 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3049 // We drop the collation value into the buffer so if we need to do
3050 // a "front patch" we don't have to check to see if we're hitting the
3052 collateVal
= (uint8_t)(digVal
* 10);
3054 // Check for trailing zeroes.
3055 if (collateVal
== 0)
3057 if (!trailingZeroIndex
)
3058 trailingZeroIndex
= (digIndx
/2) + 2;
3061 trailingZeroIndex
= 0;
3063 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3068 // Get next character.
3069 if (!collIter_eos(source
)){
3070 ch
= getNextNormalizedChar(source
);
3071 if (U16_IS_LEAD(ch
)){
3072 if (!collIter_eos(source
)) {
3073 backupState(source
, &digitState
);
3074 UChar trail
= getNextNormalizedChar(source
);
3075 if(U16_IS_TRAIL(trail
)) {
3076 char32
= U16_GET_SUPPLEMENTARY(ch
, trail
);
3078 loadState(source
, &digitState
, TRUE
);
3086 if ((digVal
= u_charDigitValue(char32
)) == -1 || digIndx
> UCOL_MAX_DIGITS_FOR_NUMBER
){
3087 // Resetting position to point to the next unprocessed char. We
3088 // overshot it when doing our test/set for numbers.
3089 if (char32
> 0xFFFF) { // For surrogates.
3090 loadState(source
, &digitState
, TRUE
);
3091 //goBackOne(source);
3101 if (nonZeroValReached
== FALSE
){
3106 endIndex
= trailingZeroIndex
? trailingZeroIndex
: ((digIndx
/2) + 2) ;
3107 if (digIndx
% 2 != 0){
3109 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3110 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3111 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3112 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3115 for(i
= 2; i
< endIndex
; i
++){
3116 numTempBuf
[i
] = (((((numTempBuf
[i
] - 6)/2) % 10) * 10) +
3117 (((numTempBuf
[i
+1])-6)/2) / 10) * 2 + 6;
3122 // Subtract one off of the last byte.
3123 numTempBuf
[endIndex
-1] -= 1;
3126 We want to skip over the first two slots in the buffer. The first slot
3127 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3128 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3130 numTempBuf
[0] = UCOL_CODAN_PLACEHOLDER
;
3131 numTempBuf
[1] = (uint8_t)(0x80 + ((digIndx
/2) & 0x7F));
3133 // Now transfer the collation key to our collIterate struct.
3134 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3135 //size = ((endIndex+1) & ~1)/2;
3136 CE
= (((numTempBuf
[0] << 8) | numTempBuf
[1]) << UCOL_PRIMARYORDERSHIFT
) | //Primary weight
3137 (UCOL_BYTE_COMMON
<< UCOL_SECONDARYORDERSHIFT
) | // Secondary weight
3138 UCOL_BYTE_COMMON
; // Tertiary weight.
3139 i
= 2; // Reset the index into the buffer.
3142 uint32_t primWeight
= numTempBuf
[i
++] << 8;
3144 primWeight
|= numTempBuf
[i
++];
3145 *(source
->CEpos
++) = (primWeight
<< UCOL_PRIMARYORDERSHIFT
) | UCOL_CONTINUATION_MARKER
;
3149 // no numeric mode, we'll just switch to whatever we stashed and continue
3150 CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
3156 /* various implicits optimization */
3157 case IMPLICIT_TAG
: /* everything that is not defined otherwise */
3158 /* UCA is filled with these. Tailorings are NOT_FOUND */
3159 return getImplicit(cp
, source
);
3160 case CJK_IMPLICIT_TAG
: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3161 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3162 return getImplicit(cp
, source
);
3163 case HANGUL_SYLLABLE_TAG
: /* AC00-D7AF*/
3165 static const uint32_t
3166 SBase
= 0xAC00, LBase
= 0x1100, VBase
= 0x1161, TBase
= 0x11A7;
3167 //const uint32_t LCount = 19;
3168 static const uint32_t VCount
= 21;
3169 static const uint32_t TCount
= 28;
3170 //const uint32_t NCount = VCount * TCount; // 588
3171 //const uint32_t SCount = LCount * NCount; // 11172
3172 uint32_t L
= ch
- SBase
;
3174 // divide into pieces
3176 uint32_t T
= L
% TCount
; // we do it in this order since some compilers can do % and / in one operation
3178 uint32_t V
= L
% VCount
;
3187 // return the first CE, but first put the rest into the expansion buffer
3188 if (!source
->coll
->image
->jamoSpecial
) { // FAST PATH
3190 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, V
);
3192 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, T
);
3195 return UTRIE_GET32_FROM_LEAD(&coll
->mapping
, L
);
3197 } else { // Jamo is Special
3198 // Since Hanguls pass the FCD check, it is
3199 // guaranteed that we won't be in
3200 // the normalization buffer if something like this happens
3202 // However, if we are using a uchar iterator and normalization
3203 // is ON, the Hangul that lead us here is going to be in that
3204 // normalization buffer. Here we want to restore the uchar
3205 // iterator state and pull out of the normalization buffer
3206 if(source
->iterator
!= NULL
&& source
->flags
& UCOL_ITER_INNORMBUF
) {
3207 source
->flags
= source
->origFlags
; // restore the iterator
3211 // Move Jamos into normalization buffer
3212 UChar
*buffer
= source
->writableBuffer
.getBuffer(4);
3213 int32_t bufferLength
;
3214 buffer
[0] = (UChar
)L
;
3215 buffer
[1] = (UChar
)V
;
3217 buffer
[2] = (UChar
)T
;
3222 source
->writableBuffer
.releaseBuffer(bufferLength
);
3224 // Indicate where to continue in main input string after exhausting the writableBuffer
3225 source
->fcdPosition
= source
->pos
;
3227 source
->pos
= source
->writableBuffer
.getTerminatedBuffer();
3228 source
->origFlags
= source
->flags
;
3229 source
->flags
|= UCOL_ITER_INNORMBUF
;
3230 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
3232 return(UCOL_IGNORABLE
);
3236 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3237 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3238 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3239 /* we treat it like an unassigned code point. */
3242 collIterateState state
;
3243 backupState(source
, &state
);
3244 if (collIter_eos(source
) || !(U16_IS_TRAIL((trail
= getNextNormalizedChar(source
))))) {
3245 // we chould have stepped one char forward and it might have turned that it
3246 // was not a trail surrogate. In that case, we have to backup.
3247 loadState(source
, &state
, TRUE
);
3248 return UCOL_NOT_FOUND
;
3250 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3251 CE
= UTRIE_GET32_FROM_OFFSET_TRAIL(&coll
->mapping
, CE
&0xFFFFFF, trail
);
3252 if(CE
== UCOL_NOT_FOUND
) { // there are tailored surrogates in this block, but not this one.
3253 // We need to backup
3254 loadState(source
, &state
, TRUE
);
3257 // calculate the supplementary code point value, if surrogate was not tailored
3258 cp
= ((((uint32_t)ch
)<<10UL)+(trail
)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3262 case LEAD_SURROGATE_TAG
: /* D800-DBFF*/
3264 if( source
->flags
& UCOL_USE_ITERATOR
) {
3265 if(U_IS_TRAIL(nextChar
= (UChar
)source
->iterator
->current(source
->iterator
))) {
3266 cp
= U16_GET_SUPPLEMENTARY(ch
, nextChar
);
3267 source
->iterator
->next(source
->iterator
);
3268 return getImplicit(cp
, source
);
3270 } else if((((source
->flags
& UCOL_ITER_HASLEN
) == 0 ) || (source
->pos
<source
->endp
)) &&
3271 U_IS_TRAIL((nextChar
=*source
->pos
))) {
3272 cp
= U16_GET_SUPPLEMENTARY(ch
, nextChar
);
3274 return getImplicit(cp
, source
);
3276 return UCOL_NOT_FOUND
;
3277 case TRAIL_SURROGATE_TAG
: /* DC00-DFFF*/
3278 return UCOL_NOT_FOUND
; /* broken surrogate sequence */
3280 /* not yet implemented */
3281 /* probably after 1.8 */
3282 return UCOL_NOT_FOUND
;
3284 *status
= U_INTERNAL_PROGRAM_ERROR
;
3288 if (CE
<= UCOL_NOT_FOUND
) break;
3294 /* now uses Mark's getImplicitPrimary code */
// getPrevImplicit: produces the implicit CEs for code point cp.
// It obtains a 32-bit value r from uprv_uca_getImplicitPrimary(cp), stores one CE
// built from r through collationSource->CEpos, updates the offset bookkeeping,
// and returns a second CE built from the low 16 bits of r.
3296 inline uint32_t getPrevImplicit(UChar32 cp
, collIterate
*collationSource
) {
3297 uint32_t r
= uprv_uca_getImplicitPrimary(cp
);
// Stored CE: the UCOL_PRIMARYMASK bits of r combined with 0x0505.
3299 *(collationSource
->CEpos
++) = (r
& UCOL_PRIMARYMASK
) | 0x00000505;
// toReturn is set to CEpos after the post-increment above.
3300 collationSource
->toReturn
= collationSource
->CEpos
;
3302 // **** doesn't work if using iterator ****
3303 if (collationSource
->flags
& UCOL_ITER_INNORMBUF
) {
3304 collationSource
->offsetRepeatCount
= 1;
// Offsets are computed relative to collationSource->string.
3306 int32_t firstOffset
= (int32_t)(collationSource
->pos
- collationSource
->string
);
// NOTE(review): errorCode is local and no check of it is visible in this block,
// so a failure reported by appendOffset would go unnoticed here - confirm.
3308 UErrorCode errorCode
= U_ZERO_ERROR
;
3309 collationSource
->appendOffset(firstOffset
, errorCode
);
3310 collationSource
->appendOffset(firstOffset
+ 1, errorCode
);
3312 collationSource
->offsetReturn
= collationSource
->offsetStore
- 1;
3313 *(collationSource
->offsetBuffer
) = firstOffset
;
3314 if (collationSource
->offsetReturn
== collationSource
->offsetBuffer
) {
3315 collationSource
->offsetStore
= collationSource
->offsetBuffer
;
// Returned CE: low 16 bits of r moved to the top half, combined with 0xC0.
3319 return ((r
& 0x0000FFFF)<<16) | 0x000000C0;
3323 * This function handles the special CEs like contractions, expansions,
3325 * It is called by both getPrevCE
// ucol_prv_getSpecialPrevCE: resolves a "special" CE while iterating backwards.
// The work is dispatched on getCETag(CE). Branches that yield more than one CE
// write them through source->CEpos, point source->toReturn at the last stored CE
// and return *(source->toReturn); several branches also record source offsets
// through source->appendOffset().
3327 uint32_t ucol_prv_getSpecialPrevCE(const UCollator
*coll
, UChar ch
, uint32_t CE
,
3328 collIterate
*source
,
3331 const uint32_t *CEOffset
= NULL
;
3332 UChar
*UCharOffset
= NULL
;
3334 const UChar
*constart
= NULL
;
3336 UChar buffer
[UCOL_MAX_BUFFER
];
3337 uint32_t *endCEBuffer
;
3339 int32_t noChars
= 0;
3340 int32_t CECount
= 0;
3344 /* the only ces that loops are thai and contractions */
3345 switch (getCETag(CE
))
3347 case NOT_FOUND_TAG
: /* this tag always returns */
3352 // Special processing is getting a CE that is preceded by a certain prefix
3353 // Currently this is only needed for optimizing Japanese length and iteration marks.
3354 // When we encounter a special processing tag, we go backwards and try to see if
3356 // Contraction tables are used - so the whole process is not unlike contraction.
3357 // prefix data is stored backwards in the table.
3358 const UChar
*UCharOffset
;
3360 collIterateState prefixState
;
3361 backupState(source
, &prefixState
);
3363 // This loop will run once per source string character, for as long as we
3364 // are matching a potential contraction sequence
3366 // First we position ourselves at the beginning of contraction sequence
3367 const UChar
*ContractionStart
= UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
3369 if (collIter_bos(source
)) {
3370 CE
= *(coll
->contractionCEs
+ (UCharOffset
- coll
->contractionIndex
));
3373 schar
= getPrevNormalizedChar(source
, status
);
3376 while(schar
> (tchar
= *UCharOffset
)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3380 if (schar
== tchar
) {
3381 // Found the source string char in the table.
3382 // Pick up the corresponding CE from the table.
3383 CE
= *(coll
->contractionCEs
+
3384 (UCharOffset
- coll
->contractionIndex
));
3388 // if there is a completely ignorable code point in the middle of
3389 // a prefix, we need to act as if it's not there
3390 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3391 // lone surrogates cannot be set to zero as it would break other processing
3392 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, schar
);
3393 // it's easy for BMP code points
3396 } else if(U16_IS_SURROGATE(schar
)) {
3397 // for supplementary code points, we have to check the next one
3398 // situations where we are going to ignore
3399 // 1. beginning of the string: schar is a lone surrogate
3400 // 2. schar is a lone surrogate
3401 // 3. schar is a trail surrogate in a valid surrogate sequence
3402 // that is explicitly set to zero.
3403 if (!collIter_bos(source
)) {
3405 if(!U16_IS_SURROGATE_LEAD(schar
) && U16_IS_LEAD(lead
= getPrevNormalizedChar(source
, status
))) {
3406 isZeroCE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, lead
);
3407 if(isSpecial(isZeroCE
) && getCETag(isZeroCE
) == SURROGATE_TAG
) {
3408 uint32_t finalCE
= UTRIE_GET32_FROM_OFFSET_TRAIL(&coll
->mapping
, isZeroCE
&0xFFFFFF, schar
);
3410 // this is a real, assigned completely ignorable code point
3416 // lone surrogate, treat like unassigned
3417 return UCOL_NOT_FOUND
;
3420 // lone surrogate at the beginning, treat like unassigned
3421 return UCOL_NOT_FOUND
;
3424 // Source string char was not in the table.
3425 // We have not found the prefix.
3426 CE
= *(coll
->contractionCEs
+
3427 (ContractionStart
- coll
->contractionIndex
));
3431 // The source string char was in the contraction table, and the corresponding
3432 // CE is not a prefix CE. We found the prefix, break
3433 // out of loop, this CE will end up being returned. This is the normal
3434 // way out of prefix handling when the source actually contained
3439 loadState(source
, &prefixState
, TRUE
);
// Contraction branch: preceding code units are gathered into a scratch string while
// ucol_unsafeCP() holds, then forward iteration (ucol_IGetNextCE) is run over that
// string with a temporary collIterate and the resulting CEs are stored via CEpos.
3443 case CONTRACTION_TAG
: {
3444 /* to ensure that the backwards and forwards iteration matches, we
3445 take the current region of most possible match and pass it through
3446 the forward iteration. this will ensure that the obstinate problem of
3447 overlapping contractions will not occur.
3449 schar
= peekCodeUnit(source
, 0);
3450 constart
= (UChar
*)coll
->image
+ getContractOffset(CE
);
3451 if (isAtStartPrevIterate(source
)
3452 /* commented away contraction end checks after adding the checks
3454 /* start of string or this is not the end of any contraction */
3455 CE
= *(coll
->contractionCEs
+
3456 (constart
- coll
->contractionIndex
));
3460 UCharOffset
= strbuffer
+ (UCOL_MAX_BUFFER
- 1);
3461 *(UCharOffset
--) = 0;
3463 // have to swap thai characters
3464 while (ucol_unsafeCP(schar
, coll
)) {
3465 *(UCharOffset
) = schar
;
3468 schar
= getPrevNormalizedChar(source
, status
);
3470 // TODO: when we exhaust the contraction buffer,
3471 // it needs to get reallocated. The problem is
3472 // that the size depends on the string which is
3473 // not iterated over. However, since we're travelling
3474 // backwards, we already had to set the iterator at
3475 // the end - so we might as well know where we are?
3476 if (UCharOffset
+ 1 == buffer
) {
3477 /* we have exhausted the buffer */
3478 int32_t newsize
= 0;
3479 if(source
->pos
) { // actually dealing with a position
3480 newsize
= (int32_t)(source
->pos
- source
->string
+ 1);
3481 } else { // iterator
3482 newsize
= 4 * UCOL_MAX_BUFFER
;
3484 strbuffer
= (UChar
*)uprv_malloc(sizeof(UChar
) *
3485 (newsize
+ UCOL_MAX_BUFFER
));
3487 if (strbuffer
== NULL
) {
3488 *status
= U_MEMORY_ALLOCATION_ERROR
;
3489 return UCOL_NO_MORE_CES
;
3491 UCharOffset
= strbuffer
+ newsize
;
3492 uprv_memcpy(UCharOffset
, buffer
,
3493 UCOL_MAX_BUFFER
* sizeof(UChar
));
3496 if ((source
->pos
&& (source
->pos
== source
->string
||
3497 ((source
->flags
& UCOL_ITER_INNORMBUF
) &&
3498 *(source
->pos
- 1) == 0 && source
->fcdPosition
== NULL
)))
3499 || (source
->iterator
&& !source
->iterator
->hasPrevious(source
->iterator
))) {
3503 /* adds the initial base character to the string */
3504 *(UCharOffset
) = schar
;
3509 // **** doesn't work if using iterator ****
3510 if (source
->flags
& UCOL_ITER_INNORMBUF
) {
3513 offsetBias
= (int32_t)(source
->pos
- source
->string
);
3516 /* a new collIterate is used to simplify things, since using the current
3517 collIterate will mean that the forward and backwards iteration will
3518 share and change the same buffers. we don't want to get into that. */
3522 IInit_collIterate(coll
, UCharOffset
, noChars
, &temp
, status
);
3523 if(U_FAILURE(*status
)) {
3524 return UCOL_NULLORDER
;
3526 temp
.flags
&= ~UCOL_ITER_NORM
;
3527 temp
.flags
|= source
->flags
& UCOL_FORCE_HAN_IMPLICIT
;
3529 rawOffset
= (int32_t)(temp
.pos
- temp
.string
); // should always be zero?
3530 CE
= ucol_IGetNextCE(coll
, &temp
, status
);
3532 if (source
->extendCEs
) {
3533 endCEBuffer
= source
->extendCEs
+ source
->extendCEsSize
;
3534 CECount
= (int32_t)((source
->CEpos
- source
->extendCEs
)/sizeof(uint32_t));
3536 endCEBuffer
= source
->CEs
+ UCOL_EXPAND_CE_BUFFER_SIZE
;
3537 CECount
= (int32_t)((source
->CEpos
- source
->CEs
)/sizeof(uint32_t));
3540 while (CE
!= UCOL_NO_MORE_CES
) {
3541 *(source
->CEpos
++) = CE
;
3543 if (offsetBias
>= 0) {
3544 source
->appendOffset(rawOffset
+ offsetBias
, *status
);
3548 if (source
->CEpos
== endCEBuffer
) {
3549 /* ran out of CE space, reallocate to new buffer.
3550 If reallocation fails, reset pointers and bail out,
3551 there's no guarantee of the right character position after
3553 if (!increaseCEsCapacity(source
)) {
3554 *status
= U_MEMORY_ALLOCATION_ERROR
;
3558 endCEBuffer
= source
->extendCEs
+ source
->extendCEsSize
;
3561 if ((temp
.flags
& UCOL_ITER_INNORMBUF
) != 0) {
3562 rawOffset
= (int32_t)(temp
.fcdPosition
- temp
.string
);
3564 rawOffset
= (int32_t)(temp
.pos
- temp
.string
);
3567 CE
= ucol_IGetNextCE(coll
, &temp
, status
);
3570 if (strbuffer
!= buffer
) {
3571 uprv_free(strbuffer
);
3573 if (U_FAILURE(*status
)) {
3574 return (uint32_t)UCOL_NULLORDER
;
3577 if (source
->offsetRepeatValue
!= 0) {
3578 if (CECount
> noChars
) {
3579 source
->offsetRepeatCount
+= temp
.offsetRepeatCount
;
3581 // **** does this really skip the right offsets? ****
3582 source
->offsetReturn
-= (noChars
- CECount
);
3586 if (offsetBias
>= 0) {
3587 source
->offsetReturn
= source
->offsetStore
- 1;
3588 if (source
->offsetReturn
== source
->offsetBuffer
) {
3589 source
->offsetStore
= source
->offsetBuffer
;
3593 source
->toReturn
= source
->CEpos
- 1;
3594 if (source
->toReturn
== source
->CEs
) {
3595 source
->CEpos
= source
->CEs
;
3598 return *(source
->toReturn
);
// Long-primary branch: two CEs are stored, the second one carrying
// UCOL_CONTINUATION_MARKER; the last stored CE is returned.
3600 case LONG_PRIMARY_TAG
:
3602 *(source
->CEpos
++) = ((CE
& 0xFFFF00) << 8) | (UCOL_BYTE_COMMON
<< 8) | UCOL_BYTE_COMMON
;
3603 *(source
->CEpos
++) = ((CE
& 0xFF)<<24)|UCOL_CONTINUATION_MARKER
;
3604 source
->toReturn
= source
->CEpos
- 1;
3606 if (source
->flags
& UCOL_ITER_INNORMBUF
) {
3607 source
->offsetRepeatCount
= 1;
3609 int32_t firstOffset
= (int32_t)(source
->pos
- source
->string
);
3611 source
->appendOffset(firstOffset
, *status
);
3612 source
->appendOffset(firstOffset
+ 1, *status
);
3614 source
->offsetReturn
= source
->offsetStore
- 1;
3615 *(source
->offsetBuffer
) = firstOffset
;
3616 if (source
->offsetReturn
== source
->offsetBuffer
) {
3617 source
->offsetStore
= source
->offsetBuffer
;
3622 return *(source
->toReturn
);
// Expansion branch: CEs are copied from the expansion table located at
// getExpansionOffset(CE) into the CE buffer; the last stored CE is returned.
3625 case EXPANSION_TAG
: /* this tag always returns */
3628 This should handle expansion.
3629 NOTE: we can encounter both continuations and expansions in an expansion!
3630 I have to decide where continuations are going to be dealt with
3632 int32_t firstOffset
= (int32_t)(source
->pos
- source
->string
);
3634 // **** doesn't work if using iterator ****
3635 if (source
->offsetReturn
!= NULL
) {
3636 if (! (source
->flags
& UCOL_ITER_INNORMBUF
) && source
->offsetReturn
== source
->offsetBuffer
) {
3637 source
->offsetStore
= source
->offsetBuffer
;
3643 /* find the offset to expansion table */
3644 CEOffset
= (uint32_t *)coll
->image
+ getExpansionOffset(CE
);
3645 size
= getExpansionCount(CE
);
3648 if there are less than 16 elements in expansion, we don't terminate
3652 for (count
= 0; count
< size
; count
++) {
3653 *(source
->CEpos
++) = *CEOffset
++;
3655 if (firstOffset
>= 0) {
3656 source
->appendOffset(firstOffset
+ 1, *status
);
3661 while (*CEOffset
!= 0) {
3662 *(source
->CEpos
++) = *CEOffset
++;
3664 if (firstOffset
>= 0) {
3665 source
->appendOffset(firstOffset
+ 1, *status
);
3670 if (firstOffset
>= 0) {
3671 source
->offsetReturn
= source
->offsetStore
- 1;
3672 *(source
->offsetBuffer
) = firstOffset
;
3673 if (source
->offsetReturn
== source
->offsetBuffer
) {
3674 source
->offsetStore
= source
->offsetBuffer
;
3677 source
->offsetRepeatCount
+= size
- 1;
3680 source
->toReturn
= source
->CEpos
- 1;
3681 // in case of one element expansion, we
3682 // want to immediately return CEpos
3683 if(source
->toReturn
== source
->CEs
) {
3684 source
->CEpos
= source
->CEs
;
3687 return *(source
->toReturn
);
3693 We do a check to see if we want to collate digits as numbers; if so we generate
3694 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3696 uint32_t i
; /* general counter */
// With numericCollation == UCOL_ON, digit values are gathered backwards into
// numTempBuf and emitted as one primary CE followed by continuation CEs.
3698 if (source
->coll
->numericCollation
== UCOL_ON
){
3699 uint32_t digIndx
= 0;
3700 uint32_t endIndex
= 0;
3701 uint32_t leadingZeroIndex
= 0;
3702 uint32_t trailingZeroCount
= 0;
3704 uint8_t collateVal
= 0;
3706 UBool nonZeroValReached
= FALSE
;
3708 uint8_t numTempBuf
[UCOL_MAX_DIGITS_FOR_NUMBER
/2 + 2]; // I just need a temporary place to store my generated CEs.
3710 We parse the source string until we hit a char that's NOT a digit.
3711 Use this u_charDigitValue. This might be slow because we have to
3712 handle surrogates...
3715 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3716 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3717 element we process when going backward. To determine how long that chunk might be, we may need to make
3718 two passes through the loop that collects digits - one to see how long the string is (and how much is
3719 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3720 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3721 element chunk after resetting the state to the initialState at the right side of the digit string.
3723 uint32_t ceLimit
= 0;
3724 UChar initial_ch
= ch
;
3725 collIterateState initialState
= {0,0,0,0,0,0,0,0,0};
3726 backupState(source
, &initialState
);
3729 collIterateState state
= {0,0,0,0,0,0,0,0,0};
3733 if (U16_IS_TRAIL (ch
)) {
3734 if (!collIter_bos(source
)){
3735 UChar lead
= getPrevNormalizedChar(source
, status
);
3736 if(U16_IS_LEAD(lead
)) {
3737 char32
= U16_GET_SUPPLEMENTARY(lead
,ch
);
3748 digVal
= u_charDigitValue(char32
);
3751 // Make sure we have enough space. No longer needed;
3752 // at this point the largest value of digIndx when we need to save data in numTempBuf
3753 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3754 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3756 // Skip over trailing zeroes, and keep a count of them.
3758 nonZeroValReached
= TRUE
;
3760 if (nonZeroValReached
) {
3762 We parse the digit string into base 100 numbers (this fits into a byte).
3763 We only add to the buffer in twos, thus if we are parsing an odd character,
3764 that serves as the 'tens' digit while the if we are parsing an even one, that
3765 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3766 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3767 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3768 than all the other bytes.
3770 Since we're doing in this reverse we want to put the first digit encountered into the
3771 ones place and the second digit encountered into the tens place.
3774 if ((digIndx
+ trailingZeroCount
) % 2 == 1) {
3775 // High-order digit case (tens place)
3776 collateVal
+= (uint8_t)(digVal
* 10);
3778 // We cannot set leadingZeroIndex unless it has been set for the
3779 // low-order digit. Therefore, all we can do for the high-order
3780 // digit is turn it off, never on.
3781 // The only time we will have a high digit without a low is for
3782 // the very first non-zero digit, so no zero check is necessary.
3783 if (collateVal
!= 0)
3784 leadingZeroIndex
= 0;
3786 // The first pass through, digIndx may exceed the limit, but in that case
3787 // we no longer care about numTempBuf contents since they will be discarded
3788 if ( digIndx
< UCOL_MAX_DIGITS_FOR_NUMBER
) {
3789 numTempBuf
[(digIndx
/2) + 2] = collateVal
*2 + 6;
3793 // Low-order digit case (ones place)
3794 collateVal
= (uint8_t)digVal
;
3796 // Check for leading zeroes.
3797 if (collateVal
== 0) {
3798 if (!leadingZeroIndex
)
3799 leadingZeroIndex
= (digIndx
/2) + 2;
3801 leadingZeroIndex
= 0;
3803 // No need to write to buffer; the case of a last odd digit
3804 // is handled below.
3808 ++trailingZeroCount
;
3810 if (!collIter_bos(source
)) {
3811 ch
= getPrevNormalizedChar(source
, status
);
3812 //goBackOne(source);
3813 if (U16_IS_TRAIL(ch
)) {
3814 backupState(source
, &state
);
3815 if (!collIter_bos(source
)) {
3817 UChar lead
= getPrevNormalizedChar(source
, status
);
3819 if(U16_IS_LEAD(lead
)) {
3820 char32
= U16_GET_SUPPLEMENTARY(lead
,ch
);
3822 loadState(source
, &state
, FALSE
);
3829 if ((digVal
= u_charDigitValue(char32
)) == -1 || (ceLimit
> 0 && (digIndx
+ trailingZeroCount
) >= ceLimit
)) {
3830 if (char32
> 0xFFFF) {// For surrogates.
3831 loadState(source
, &state
, FALSE
);
3833 // Don't need to "reverse" the goBackOne call,
3834 // as this points to the next position to process..
3835 //if (char32 > 0xFFFF) // For surrogates.
3836 //getNextNormalizedChar(source);
3845 if (digIndx
+ trailingZeroCount
<= UCOL_MAX_DIGITS_FOR_NUMBER
) {
3846 // our collation element is not too big, go ahead and finish with it
3849 // our digit string is too long for a collation element;
3850 // set the limit for it, reset the state and begin again
3851 ceLimit
= (digIndx
+ trailingZeroCount
) % UCOL_MAX_DIGITS_FOR_NUMBER
;
3852 if ( ceLimit
== 0 ) {
3853 ceLimit
= UCOL_MAX_DIGITS_FOR_NUMBER
;
3856 loadState(source
, &initialState
, FALSE
);
3857 digIndx
= endIndex
= leadingZeroIndex
= trailingZeroCount
= 0;
3859 nonZeroValReached
= FALSE
;
3862 if (! nonZeroValReached
) {
3864 trailingZeroCount
= 0;
3868 if ((digIndx
+ trailingZeroCount
) % 2 != 0) {
3869 numTempBuf
[((digIndx
)/2) + 2] = collateVal
*2 + 6;
3870 digIndx
+= 1; // The implicit leading zero
3872 if (trailingZeroCount
% 2 != 0) {
3873 // We had to consume one trailing zero for the low digit
3874 // of the least significant byte
3875 digIndx
+= 1; // The trailing zero not in the exponent
3876 trailingZeroCount
-= 1;
3879 endIndex
= leadingZeroIndex
? leadingZeroIndex
: ((digIndx
/2) + 2) ;
3881 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3885 We want to skip over the first two slots in the buffer. The first slot
3886 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3887 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3888 The exponent must be adjusted by the number of leading zeroes, and the number of
3891 numTempBuf
[0] = UCOL_CODAN_PLACEHOLDER
;
3892 uint32_t exponent
= (digIndx
+trailingZeroCount
)/2;
3893 if (leadingZeroIndex
)
3894 exponent
-= ((digIndx
/2) + 2 - leadingZeroIndex
);
3895 numTempBuf
[1] = (uint8_t)(0x80 + (exponent
& 0x7F));
3897 // Now transfer the collation key to our collIterate struct.
3898 // The total size for our collation key is half of endIndex, rounded up.
3899 int32_t size
= (endIndex
+1)/2;
3900 if(!ensureCEsCapacity(source
, size
)) {
3901 return UCOL_NULLORDER
;
3903 *(source
->CEpos
++) = (((numTempBuf
[0] << 8) | numTempBuf
[1]) << UCOL_PRIMARYORDERSHIFT
) | //Primary weight
3904 (UCOL_BYTE_COMMON
<< UCOL_SECONDARYORDERSHIFT
) | // Secondary weight
3905 UCOL_BYTE_COMMON
; // Tertiary weight.
3906 i
= endIndex
- 1; // Reset the index into the buffer.
3908 uint32_t primWeight
= numTempBuf
[i
--] << 8;
3910 primWeight
|= numTempBuf
[i
--];
3911 *(source
->CEpos
++) = (primWeight
<< UCOL_PRIMARYORDERSHIFT
) | UCOL_CONTINUATION_MARKER
;
3914 source
->toReturn
= source
->CEpos
-1;
3915 return *(source
->toReturn
);
3917 CEOffset
= (uint32_t *)coll
->image
+ getExpansionOffset(CE
);
// Hangul branch: the L, V and T values are derived from ch - SBase. Without
// jamoSpecial their CEs are looked up directly; otherwise the jamos are placed
// into source->writableBuffer and iteration continues from that buffer.
3923 case HANGUL_SYLLABLE_TAG
: /* AC00-D7AF*/
3925 static const uint32_t
3926 SBase
= 0xAC00, LBase
= 0x1100, VBase
= 0x1161, TBase
= 0x11A7;
3927 //const uint32_t LCount = 19;
3928 static const uint32_t VCount
= 21;
3929 static const uint32_t TCount
= 28;
3930 //const uint32_t NCount = VCount * TCount; /* 588 */
3931 //const uint32_t SCount = LCount * NCount; /* 11172 */
3933 uint32_t L
= ch
- SBase
;
3936 we do it in this order since some compilers can do % and / in one
3939 uint32_t T
= L
% TCount
;
3941 uint32_t V
= L
% VCount
;
3949 int32_t firstOffset
= (int32_t)(source
->pos
- source
->string
);
3950 source
->appendOffset(firstOffset
, *status
);
3953 * return the first CE, but first put the rest into the expansion buffer
3955 if (!source
->coll
->image
->jamoSpecial
) {
3956 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, L
);
3957 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, V
);
3958 source
->appendOffset(firstOffset
+ 1, *status
);
3961 *(source
->CEpos
++) = UTRIE_GET32_FROM_LEAD(&coll
->mapping
, T
);
3962 source
->appendOffset(firstOffset
+ 1, *status
);
3965 source
->toReturn
= source
->CEpos
- 1;
3967 source
->offsetReturn
= source
->offsetStore
- 1;
3968 if (source
->offsetReturn
== source
->offsetBuffer
) {
3969 source
->offsetStore
= source
->offsetBuffer
;
3972 return *(source
->toReturn
);
3974 // Since Hanguls pass the FCD check, it is
3975 // guaranteed that we won't be in
3976 // the normalization buffer if something like this happens
3978 // Move Jamos into normalization buffer
3979 UChar
*tempbuffer
= source
->writableBuffer
.getBuffer(5);
3980 int32_t tempbufferLength
, jamoOffset
;
3982 tempbuffer
[1] = (UChar
)L
;
3983 tempbuffer
[2] = (UChar
)V
;
3985 tempbuffer
[3] = (UChar
)T
;
3986 tempbufferLength
= 4;
3988 tempbufferLength
= 3;
3990 source
->writableBuffer
.releaseBuffer(tempbufferLength
);
3992 // Indicate where to continue in main input string after exhausting the writableBuffer
3993 if (source
->pos
== source
->string
) {
3995 source
->fcdPosition
= NULL
;
3997 jamoOffset
= source
->pos
- source
->string
;
3998 source
->fcdPosition
= source
->pos
-1;
4001 // Append offsets for the additional chars
4002 // (not the 0, and not the L whose offsets match the original Hangul)
4003 int32_t jamoRemaining
= tempbufferLength
- 2;
4004 jamoOffset
++; // appended offsets should match end of original Hangul
4005 while (jamoRemaining
-- > 0) {
4006 source
->appendOffset(jamoOffset
, *status
);
4009 source
->offsetRepeatValue
= jamoOffset
;
4011 source
->offsetReturn
= source
->offsetStore
- 1;
4012 if (source
->offsetReturn
== source
->offsetBuffer
) {
4013 source
->offsetStore
= source
->offsetBuffer
;
4016 source
->pos
= source
->writableBuffer
.getTerminatedBuffer() + tempbufferLength
;
4017 source
->origFlags
= source
->flags
;
4018 source
->flags
|= UCOL_ITER_INNORMBUF
;
4019 source
->flags
&= ~(UCOL_ITER_NORM
| UCOL_ITER_HASLEN
);
4021 return(UCOL_IGNORABLE
);
// Both implicit tags are resolved by getPrevImplicit(ch, source).
4025 case IMPLICIT_TAG
: /* everything that is not defined otherwise */
4026 return getPrevImplicit(ch
, source
);
4028 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4029 case CJK_IMPLICIT_TAG
: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4030 return getPrevImplicit(ch
, source
);
4032 case SURROGATE_TAG
: /* This is a surrogate pair */
4033 /* essentially an engaged lead surrogate. */
4034 /* if you have encountered it here, it means that a */
4035 /* broken sequence was encountered and this is an error */
4036 return UCOL_NOT_FOUND
;
4038 case LEAD_SURROGATE_TAG
: /* D800-DBFF*/
4039 return UCOL_NOT_FOUND
; /* broken surrogate sequence */
// Trail surrogate: when the preceding unit is a lead surrogate the pair is combined
// into cp and resolved by getPrevImplicit; otherwise UCOL_NOT_FOUND is returned.
4041 case TRAIL_SURROGATE_TAG
: /* DC00-DFFF*/
4046 if (isAtStartPrevIterate(source
)) {
4047 /* we are at the start of the string, wrong place to be at */
4048 return UCOL_NOT_FOUND
;
4050 if (source
->pos
!= source
->writableBuffer
.getBuffer()) {
4051 prev
= source
->pos
- 1;
4053 prev
= source
->fcdPosition
;
4057 /* Handles Han and Supplementary characters here.*/
4058 if (U16_IS_LEAD(prevChar
)) {
4059 cp
= ((((uint32_t)prevChar
)<<10UL)+(ch
)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4062 return UCOL_NOT_FOUND
; /* like unassigned */
4065 return getPrevImplicit(cp
, source
);
4068 /* UCA is filled with these. Tailorings are NOT_FOUND */
4069 /* not yet implemented */
4070 case CHARSET_TAG
: /* this tag always returns */
4071 /* probably after 1.8 */
4072 return UCOL_NOT_FOUND
;
4074 default: /* this tag always returns */
4075 *status
= U_INTERNAL_PROGRAM_ERROR
;
4080 if (CE
<= UCOL_NOT_FOUND
) {
4088 /* This should really be a macro */
4089 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
// reallocateBuffer: obtains a buffer of newSize bytes for sort-key bytes.
//   secondaries - in/out: current write position; re-pointed into the new buffer
//                 at the same distance (offset) from its start.
//   secStart    - start of the current buffer.
//   second      - compared against secStart to choose between malloc and realloc
//                 (presumably the caller's initial stack buffer - confirm).
//   secSize     - NOTE(review): not touched in the lines visible here - confirm.
//   newSize     - requested size in bytes.
//   status      - set to U_MEMORY_ALLOCATION_ERROR when allocation fails.
4092 uint8_t *reallocateBuffer(uint8_t **secondaries
, uint8_t *secStart
, uint8_t *second
, uint32_t *secSize
, uint32_t newSize
, UErrorCode
*status
) {
4094 fprintf(stderr
, ".");
4096 uint8_t *newStart
= NULL
;
// Distance of the write position from the buffer start, kept across the move.
4097 uint32_t offset
= (uint32_t)(*secondaries
-secStart
);
// secStart == second: allocate a fresh block and copy the bytes written so far.
4099 if(secStart
==second
) {
4100 newStart
=(uint8_t*)uprv_malloc(newSize
);
4101 if(newStart
==NULL
) {
4102 *status
= U_MEMORY_ALLOCATION_ERROR
;
4105 uprv_memcpy(newStart
, secStart
, *secondaries
-secStart
);
// Otherwise the existing block is resized with uprv_realloc.
4107 newStart
=(uint8_t*)uprv_realloc(secStart
, newSize
);
4108 if(newStart
==NULL
) {
4109 *status
= U_MEMORY_ALLOCATION_ERROR
;
4110 /* Since we're reallocating, return original reference so we don't lose it. */
4114 *secondaries
=newStart
+offset
;
4120 /* This should really be a macro */
4121 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
4122 /* secondaries in French */
// NOTE(review): only the signature of the function form is visible here, and a macro
// of the same name follows; presumably the function form is disabled - confirm.
4124 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
// Macro form: takes an element TYPE plus start and end pointers; its loop runs while
// (start) < (end) and assigns *(end) to *(start) while advancing start.
4134 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \
4136 while((start)<(end)) { \
4138 *(start)++ = *(end); \
4143 /****************************************************************************/
4144 /* Following are the sortkey generation functions */
4146 /****************************************************************************/
/**
 * Merge the zero-terminated sort keys src1 and src2 level by level.
 *
 * When dest is NULL nothing is stored and only the number of bytes that the
 * merge would produce (including the terminating zero byte) is computed.
 * When dest is not NULL it must have room for that many bytes.
 *
 * @param src1 the first sort key, zero-terminated
 * @param src2 the second sort key, zero-terminated
 * @param dest output buffer, or NULL to count only
 * @return the number of bytes in the merged sort key, including the zero byte
 */
static int32_t
mergeSortkeyLevels(const uint8_t *src1, const uint8_t *src2, uint8_t *dest) {
    int32_t length=0;
    uint8_t b;

    /* merge the sort keys with the same number of levels */
    while(*src1!=0 && *src2!=0) { /* while both have another level */
        /* copy level from src1 not including 00 or 01 */
        while((b=*src1)>=2) {
            ++src1;
            if(dest!=NULL) {
                dest[length]=b;
            }
            ++length;
        }

        /* add a 02 merge separator */
        if(dest!=NULL) {
            dest[length]=2;
        }
        ++length;

        /* copy level from src2 not including 00 or 01 */
        while((b=*src2)>=2) {
            ++src2;
            if(dest!=NULL) {
                dest[length]=b;
            }
            ++length;
        }

        /* if both sort keys have another level, then add a 01 level separator and continue */
        if(*src1==1 && *src2==1) {
            ++src1;
            ++src2;
            if(dest!=NULL) {
                dest[length]=1;
            }
            ++length;
        }
    }

    /*
     * here, at least one sort key is finished now, but the other one
     * might have some contents left from containing more levels;
     * that contents is just appended to the result
     */
    if(*src1!=0) {
        /* src1 is not finished, therefore *src2==0, and src1 is appended */
        src2=src1;
    }

    /* append src2, "the other, unfinished sort key", including its zero byte */
    do {
        b=*src2++;
        if(dest!=NULL) {
            dest[length]=b;
        }
        ++length;
    } while(b!=0);

    return length;
}

/**
 * Merge two sort keys.
 * This is useful, for example, to combine sort keys from first and last names
 * to sort such pairs.
 * Merged sort keys consider on each collation level the first part first entirely,
 * then the second one.
 * It is possible to merge multiple sort keys by consecutively merging
 * another one with the intermediate result.
 *
 * The length of the merge result is at most the sum of the lengths of the input
 * sort keys: every level contributes one 02 merge separator, so two non-empty keys
 * with the same number of levels need exactly src1Length+src2Length bytes.
 * (The capacity check used to be made against src1Length+src2Length-1, which let
 * the merge write one byte past a buffer of exactly that size.)
 *
 * A length below zero means "find the length"; the sort keys are always read up to
 * their first zero byte. For backward compatibility -2 is accepted like -1.
 *
 * @param src1 the first sort key
 * @param src1Length the length of the first sort key, including the zero byte at the end;
 *        can be -1 if the function is to find the length
 * @param src2 the second sort key
 * @param src2Length the length of the second sort key, including the zero byte at the end;
 *        can be -1 if the function is to find the length
 * @param dest the buffer where the merged sort key is written,
 *        can be NULL if destCapacity==0
 * @param destCapacity the number of bytes in the dest buffer
 * @return the length of the merged sort key, including the zero byte at the end;
 *         can be larger than destCapacity (then nothing is written),
 *         or 0 if an error occurs (only for illegal arguments),
 *         in which cases the contents of dest is undefined
 */
U_CAPI int32_t U_EXPORT2
ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
                   const uint8_t *src2, int32_t src2Length,
                   uint8_t *dest, int32_t destCapacity) {
    int32_t destLength;

    /* check arguments */
    if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
        src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
        destCapacity<0 || (destCapacity>0 && dest==NULL)
    ) {
        /* error, attempt to write a zero byte and return 0 */
        if(dest!=NULL && destCapacity>0) {
            *dest=0;
        }
        return 0;
    }

    /* check lengths and capacity: count the exact number of bytes the merge produces */
    destLength=mergeSortkeyLevels(src1, src2, NULL);
    if(destLength>destCapacity) {
        /* the merged sort key does not fit into the destination */
        return destLength;
    }

    /* trust that neither sort key contained illegally embedded zero bytes */
    return mergeSortkeyLevels(src1, src2, dest);
}
/* ucol_getSortKey: public entry point that produces the sort key for source.
 *
 * Emits trace records on entry and exit. When source is non-NULL the work is delegated
 * to the generator stored in coll->sortKeyGen, called with the caller's buffer
 * (&result, resultLength) and the allocate flag FALSE, so the generator is not asked to
 * allocate. The generator's return value is kept in keySize (initially 0) and its
 * error code in the local status, which is only reported through the exit trace.
 * NOTE(review): the result parameter line and the return statement are not shown in
 * this listing; presumably keySize is returned -- confirm.
 */
4250 U_CAPI
int32_t U_EXPORT2
4251 ucol_getSortKey(const UCollator
*coll
,
4252 const UChar
*source
,
4253 int32_t sourceLength
,
4255 int32_t resultLength
)
4257 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY
);
4258 if (UTRACE_LEVEL(UTRACE_VERBOSE
)) {
4259 UTRACE_DATA3(UTRACE_VERBOSE
, "coll=%p, source string = %vh ", coll
, source
,
4260 ((sourceLength
==-1 && source
!=NULL
) ? u_strlen(source
) : sourceLength
));
4263 UErrorCode status
= U_ZERO_ERROR
;
4264 int32_t keySize
= 0;
4266 if(source
!= NULL
) {
4267 // source == NULL is actually an error situation, but we would need to
4268 // have an error code to return it. Until we introduce a new
4269 // API, it stays like this
4271 /* this uses the function pointer that is set in updateinternalstate */
4272 /* currently, there are two funcs: */
4273 /*ucol_calcSortKey(...);*/
4274 /*ucol_calcSortKeySimpleTertiary(...);*/
4276 keySize
= coll
->sortKeyGen(coll
, source
, sourceLength
, &result
, resultLength
, FALSE
, &status
);
4277 //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) {
4278 // That's not good. Something unusual happened.
4279 // We don't know how much we initialized before we failed.
4280 // NULL terminate for safety.
4281 // We have no way to say that we have generated a partial sort key.
4286 UTRACE_DATA2(UTRACE_VERBOSE
, "Sort Key = %vb", result
, keySize
);
4287 UTRACE_EXIT_STATUS(status
);
4291 /* this function is called by the C++ API for sortkey generation */
/* Thin forwarder to the generator in coll->sortKeyGen: passes pResult through with a
 * result length of 0 and the allocate flag TRUE, so the generator is asked to provide
 * the buffer itself. Errors are reported through pErrorCode; the generator's return
 * value is returned unchanged.
 * NOTE(review): the pResult parameter line is not shown in this listing. */
4293 ucol_getSortKeyWithAllocation(const UCollator
*coll
,
4294 const UChar
*source
, int32_t sourceLength
,
4296 UErrorCode
*pErrorCode
) {
4298 return coll
->sortKeyGen(coll
, source
, sourceLength
, pResult
, 0, TRUE
, pErrorCode
);
4301 #define UCOL_FSEC_BUF_SIZE 256
4303 // Is this primary weight compressible?
4304 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
4305 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
/* Range test on the primary lead byte: true exactly when
 * UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 <= maxRegularPrimary.
 * The collator argument is deliberately unnamed and unused. */
4307 isCompressible(const UCollator
* /*coll*/, uint8_t primary1
) {
4308 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY
<= primary1
&& primary1
<= maxRegularPrimary
;
4311 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
4312 /* or if we run out of space while making a sortkey and want to return ASAP */
/* ucol_getSortKeySize: computes how many bytes a sort key needs, without writing it.
 *
 * coll        collator whose attributes (case level, alternate handling, hiragana
 *             quaternary, French secondaries, tertiary mask/common) drive the count
 * s           collation iterator the elements are pulled from with ucol_IGetNextCE
 * currentSize size already accounted for by the caller; contributions are added to it
 * strength    selects the levels counted: compareSec/compareTer/compareQuad are 0 when
 *             the level is active and 0xFF otherwise, so "weight > compareXxx" is
 *             false for every byte of an inactive level
 * len         length passed to u_lengthOfIdenticalLevelRun for the identical level
 *
 * Each collation element is split into tertiary, secondary, primary2 and primary1
 * bytes. Runs of common weights are only counted in c2/c3/c4 and converted to bytes by
 * dividing by the matching *_COUNT constant. With French secondaries the secondary
 * bytes are buffered in fSecs and counted after the loop, walking the buffer backwards.
 * Errors are recorded in a local status only.
 * NOTE(review): the loop header and the return statements are not shown in this
 * listing.
 */
4313 int32_t ucol_getSortKeySize(const UCollator
*coll
, collIterate
*s
, int32_t currentSize
, UColAttributeValue strength
, int32_t len
) {
4314 UErrorCode status
= U_ZERO_ERROR
;
4315 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4316 uint8_t compareSec
= (uint8_t)((strength
>= UCOL_SECONDARY
)?0:0xFF);
4317 uint8_t compareTer
= (uint8_t)((strength
>= UCOL_TERTIARY
)?0:0xFF);
4318 uint8_t compareQuad
= (uint8_t)((strength
>= UCOL_QUATERNARY
)?0:0xFF);
4319 UBool compareIdent
= (strength
== UCOL_IDENTICAL
);
4320 UBool doCase
= (coll
->caseLevel
== UCOL_ON
);
4321 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
4322 //UBool qShifted = shifted && (compareQuad == 0);
4323 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && (compareQuad
== 0);
4324 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && (compareSec
== 0);
4325 uint8_t fSecsBuff
[UCOL_FSEC_BUF_SIZE
];
4326 uint8_t *fSecs
= fSecsBuff
;
4327 uint32_t fSecsLen
= 0, fSecsMaxLen
= UCOL_FSEC_BUF_SIZE
;
4328 uint8_t *frenchStartPtr
= NULL
, *frenchEndPtr
= NULL
;
4330 uint32_t variableTopValue
= coll
->variableTopValue
;
4331 uint8_t UCOL_COMMON_BOT4
= (uint8_t)((coll
->variableTopValue
>>8)+1);
4334 /* allocate one more space for hiragana */
4336 uint8_t UCOL_BOT_COUNT4
= (uint8_t)(0xFF - UCOL_COMMON_BOT4
);
4338 uint32_t order
= UCOL_NO_MORE_CES
;
4339 uint8_t primary1
= 0;
4340 uint8_t primary2
= 0;
4341 uint8_t secondary
= 0;
4342 uint8_t tertiary
= 0;
4343 int32_t caseShift
= 0;
4344 uint32_t c2
= 0, c3
= 0, c4
= 0; /* variables for compression */
4346 uint8_t caseSwitch
= coll
->caseSwitch
;
4347 uint8_t tertiaryMask
= coll
->tertiaryMask
;
4348 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
4350 UBool wasShifted
= FALSE
;
4351 UBool notIsContinuation
= FALSE
;
4352 uint8_t leadPrimary
= 0;
4356 order
= ucol_IGetNextCE(coll
, s
, &status
);
4357 if(order
== UCOL_NO_MORE_CES
) {
4365 notIsContinuation
= !isContinuation(order
);
4368 if(notIsContinuation
) {
4369 tertiary
= (uint8_t)((order
& UCOL_BYTE_SIZE_MASK
));
4371 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
4373 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4374 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4375 primary1
= (uint8_t)(order
>> 8);
4377 /* no need to permute since the actual code values don't matter
4378 if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
4379 primary1 = coll->leadBytePermutationTable[primary1];
4383 if((shifted
&& ((notIsContinuation
&& order
<= variableTopValue
&& primary1
> 0)
4384 || (!notIsContinuation
&& wasShifted
)))
4385 || (wasShifted
&& primary1
== 0)) { /* amendment to the UCA says that primary ignorables */
4386 /* and other ignorables should be removed if following a shifted code point */
4387 if(primary1
== 0) { /* if we were shifted and we got an ignorable code point */
4388 /* we should just completely ignore it */
4391 if(compareQuad
== 0) {
4393 currentSize
+= (c2
/UCOL_BOT_COUNT4
)+1;
4404 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4405 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
4406 /* calculate sortkey size */
4407 if(primary1
!= UCOL_IGNORABLE
) {
4408 if(notIsContinuation
) {
4409 if(leadPrimary
== primary1
) {
4412 if(leadPrimary
!= 0) {
4415 if(primary2
== UCOL_IGNORABLE
) {
4416 /* one byter, not compressed */
4419 } else if(isCompressible(coll
, primary1
)) {
4421 leadPrimary
= primary1
;
4428 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4430 if(primary2
!= UCOL_IGNORABLE
) {
4436 if(secondary
> compareSec
) { /* I think that != 0 test should be != IGNORABLE */
4438 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
4442 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4443 currentSize
+= (c2
/(uint32_t)UCOL_TOP_COUNT2
)+1;
4445 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+1;
4452 fSecs
[fSecsLen
++] = secondary
;
/* NOTE(review): the buffer is grown to 2*fSecsLen bytes once it is full. On the first
 * growth a fresh block comes from uprv_malloc; no copy of the bytes already stored in
 * fSecsBuff is visible in this listing, and frenchStartPtr/frenchEndPtr (which point
 * into fSecs) are not visibly rebased after uprv_realloc -- confirm against the full
 * source. */
4453 if(fSecsLen
== fSecsMaxLen
) {
4455 if(fSecs
== fSecsBuff
) {
4456 fSecsTemp
= (uint8_t *)uprv_malloc(2*fSecsLen
);
4458 fSecsTemp
= (uint8_t *)uprv_realloc(fSecs
, 2*fSecsLen
);
4460 if(fSecsTemp
== NULL
) {
4461 status
= U_MEMORY_ALLOCATION_ERROR
;
4467 if(notIsContinuation
) {
4468 if (frenchStartPtr
!= NULL
) {
4469 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4470 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4471 frenchStartPtr
= NULL
;
4474 if (frenchStartPtr
== NULL
) {
4475 frenchStartPtr
= fSecs
+fSecsLen
-2;
4477 frenchEndPtr
= fSecs
+fSecsLen
-1;
4482 if(doCase
&& (primary1
> 0 || strength
>= UCOL_SECONDARY
)) {
4483 // do the case level if we need to do it. We don't want to calculate
4484 // case level for primary ignorables if we have only primary strength and case level
4485 // otherwise we would break well formedness of CEs
4486 if (caseShift
== 0) {
4488 caseShift
= UCOL_CASE_SHIFT_START
;
4490 if((tertiary
&0x3F) > 0 && notIsContinuation
) {
4492 if((tertiary
&0xC0) != 0) {
4493 if (caseShift
== 0) {
4495 caseShift
= UCOL_CASE_SHIFT_START
;
4501 if(notIsContinuation
) {
4502 tertiary
^= caseSwitch
;
4506 tertiary
&= tertiaryMask
;
4507 if(tertiary
> compareTer
) { /* I think that != 0 test should be != IGNORABLE */
4508 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
4512 if((tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
)
4513 || (tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
)) {
4514 currentSize
+= (c3
/(uint32_t)coll
->tertiaryTopCount
)+1;
4516 currentSize
+= (c3
/(uint32_t)coll
->tertiaryBottomCount
)+1;
4524 if(/*qShifted*/(compareQuad
==0) && notIsContinuation
) {
4525 if(s
->flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
4526 if(c4
>0) { // Close this part
4527 currentSize
+= (c4
/UCOL_BOT_COUNT4
)+1;
4530 currentSize
++; // Add the Hiragana
4531 } else { // This wasn't Hiragana, so we can continue adding stuff
4540 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4544 if(frenchStartPtr
!= NULL
) {
4545 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4547 for(i
= 0; i
<fSecsLen
; i
++) {
4548 secondary
= *(fSecs
+fSecsLen
-i
-1);
4549 /* This is compression code. */
4550 if (secondary
== UCOL_COMMON2
) {
4554 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4555 currentSize
+= (c2
/(uint32_t)UCOL_TOP_COUNT2
)+((c2
%(uint32_t)UCOL_TOP_COUNT2
!= 0)?1:0);
4557 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4565 currentSize
+= (c2
/(uint32_t)UCOL_BOT_COUNT2
)+((c2
%(uint32_t)UCOL_BOT_COUNT2
!= 0)?1:0);
4567 if(fSecs
!= fSecsBuff
) {
4573 currentSize
+= (c3
/(uint32_t)coll
->tertiaryBottomCount
) + ((c3
%(uint32_t)coll
->tertiaryBottomCount
!= 0)?1:0);
4576 if(c4
> 0 && compareQuad
== 0) {
4577 currentSize
+= (c4
/(uint32_t)UCOL_BOT_COUNT4
)+((c4
%(uint32_t)UCOL_BOT_COUNT4
!= 0)?1:0);
4581 currentSize
+= u_lengthOfIdenticalLevelRun(s
->string
, len
);
/* Opens a new case-level byte when the current one has no bits left: if caseShift is 0,
 * writes UCOL_CASE_BYTE_START through *cases, advances *cases past it and resets
 * caseShift to UCOL_CASE_SHIFT_START. Does nothing while caseShift is non-zero.
 * Both arguments are in/out (pointer-to-pointer and reference). */
4587 inline void doCaseShift(uint8_t **cases
, uint32_t &caseShift
) {
4588 if (caseShift
== 0) {
4589 *(*cases
)++ = UCOL_CASE_BYTE_START
;
4590 caseShift
= UCOL_CASE_SHIFT_START
;
4594 // Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we
4595 // know how many values we wanted to add, even if we didn't add them all
/* Bounds-checked append: stores value at primaries and advances the pointer only while
 * primaries < limit, so limit is an exclusive end of the writable range. primaries and
 * size are taken by reference.
 * NOTE(review): the statement that updates size is not shown in this listing; per the
 * comment above the function it counts every requested value, stored or not. */
4597 inline void addWithIncrement(uint8_t *&primaries
, uint8_t *limit
, uint32_t &size
, const uint8_t value
) {
4599 if(primaries
< limit
) {
4600 *(primaries
)++ = value
;
4604 // Packs the secondary buffer when processing French locale. Adds the terminator.
/* packFrench: appends the French (backwards) secondary level to the key being built.
 *
 * primaries      write position in the key buffer (taken by value)
 * primEnd        exclusive write limit handed to addWithIncrement
 * secondaries    one past the last collected secondary byte; bytes are read backwards
 *                as *(secondaries-i-1)
 * secsize        in: number of collected secondary bytes (loop bound)
 * frenchStartPtr / frenchEndPtr
 *                pending continuation run that still has to be reversed, or NULL
 *
 * First writes UCOL_LEVELTERMINATOR (counted into the throw-away i, because the key size
 * already includes terminators), reverses any pending continuation run, then walks the
 * secondaries from last to first. Runs of UCOL_COMMON2 are compressed: the run length
 * count2 is emitted downward from UCOL_COMMON_TOP2 when the following weight is above
 * UCOL_COMMON2, otherwise upward from UCOL_COMMON_BOT2; a trailing run is flushed with
 * the BOT encoding. Every write goes through addWithIncrement with the local size.
 * NOTE(review): the declarations of secondary/count2, the write-back of size into
 * *secsize and the return statement are not shown in this listing.
 */
4606 inline uint8_t *packFrench(uint8_t *primaries
, uint8_t *primEnd
, uint8_t *secondaries
, uint32_t *secsize
, uint8_t *frenchStartPtr
, uint8_t *frenchEndPtr
) {
4609 uint32_t i
= 0, size
= 0;
4610 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4611 addWithIncrement(primaries
, primEnd
, i
, UCOL_LEVELTERMINATOR
);
4612 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4613 if(frenchStartPtr
!= NULL
) {
4614 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4616 for(i
= 0; i
<*secsize
; i
++) {
4617 secondary
= *(secondaries
-i
-1);
4618 /* This is compression code. */
4619 if (secondary
== UCOL_COMMON2
) {
4623 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4624 while (count2
> UCOL_TOP_COUNT2
) {
4625 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
));
4626 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
4628 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1)));
4630 while (count2
> UCOL_BOT_COUNT2
) {
4631 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
));
4632 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4634 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1)));
4638 addWithIncrement(primaries
, primEnd
, size
, secondary
);
4642 while (count2
> UCOL_BOT_COUNT2
) {
4643 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
));
4644 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4646 addWithIncrement(primaries
, primEnd
, size
, (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1)));
4652 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4654 /* This is the sortkey work horse function */
/* ucol_calcSortKey: general sort-key generator (one of the functions installed as
 * coll->sortKeyGen).
 *
 * Outline, as far as this listing shows:
 *  - Primary bytes are written straight into *result; secondary, tertiary, case and
 *    quaternary bytes are collected in the stack buffers second/tert/caseB/quad.
 *  - If *result is NULL and allocateSKBuffer is TRUE, the stack buffer prim
 *    (UCOL_PRIMARY_MAX_BUFFER bytes) is used as the initial output buffer.
 *  - sortKeySize starts at 1 for the final zero byte plus one byte per active level.
 *  - When normalization is required the source is normalized once up front with a
 *    Normalizer2 (NFD or FCD instance), so UCOL_ITER_NORM is cleared on the iterator.
 *  - If resultLength is 0 or there is no buffer, only the size is computed through
 *    ucol_getSortKeySize.
 *  - Collation elements are processed in batches of minBufferSize. When the primary
 *    buffer is overrun: without allocation the iterator is re-initialised, the full
 *    size is computed and U_BUFFER_OVERFLOW_ERROR is set; with allocation the buffer
 *    is grown through reallocateBuffer. The per-level buffers are doubled after each
 *    batch, and the French start/end pointers are rebased through offsets.
 *  - Afterwards each active level is appended after a UCOL_LEVELTERMINATOR, the key is
 *    closed with a zero byte, and with allocateSKBuffer the key is copied into an
 *    exactly sized uprv_malloc block stored in *result.
 *  - Heap level buffers are released when terStart != tert, and the iterator's offset
 *    buffer is freed.
 * NOTE(review): the result/status parameter lines, several declarations, loop headers,
 * else arms, gotos/returns and closing braces are not shown in this listing.
 */
4655 U_CFUNC
int32_t U_CALLCONV
4656 ucol_calcSortKey(const UCollator
*coll
,
4657 const UChar
*source
,
4658 int32_t sourceLength
,
4660 uint32_t resultLength
,
4661 UBool allocateSKBuffer
,
4664 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4666 uint32_t i
= 0; /* general purpose counter */
4668 /* Stack allocated buffers for buffers we use */
4669 uint8_t prim
[UCOL_PRIMARY_MAX_BUFFER
], second
[UCOL_SECONDARY_MAX_BUFFER
], tert
[UCOL_TERTIARY_MAX_BUFFER
], caseB
[UCOL_CASE_MAX_BUFFER
], quad
[UCOL_QUAD_MAX_BUFFER
];
4671 uint8_t *primaries
= *result
, *secondaries
= second
, *tertiaries
= tert
, *cases
= caseB
, *quads
= quad
;
4673 if(U_FAILURE(*status
)) {
4677 if(primaries
== NULL
&& allocateSKBuffer
== TRUE
) {
4678 primaries
= *result
= prim
;
4679 resultLength
= UCOL_PRIMARY_MAX_BUFFER
;
4682 uint32_t secSize
= UCOL_SECONDARY_MAX_BUFFER
, terSize
= UCOL_TERTIARY_MAX_BUFFER
,
4683 caseSize
= UCOL_CASE_MAX_BUFFER
, quadSize
= UCOL_QUAD_MAX_BUFFER
;
4685 uint32_t sortKeySize
= 1; /* it is always \0 terminated */
4687 UnicodeString normSource
;
4689 int32_t len
= (sourceLength
== -1 ? u_strlen(source
) : sourceLength
);
4691 UColAttributeValue strength
= coll
->strength
;
4693 uint8_t compareSec
= (uint8_t)((strength
>= UCOL_SECONDARY
)?0:0xFF);
4694 uint8_t compareTer
= (uint8_t)((strength
>= UCOL_TERTIARY
)?0:0xFF);
4695 uint8_t compareQuad
= (uint8_t)((strength
>= UCOL_QUATERNARY
)?0:0xFF);
4696 UBool compareIdent
= (strength
== UCOL_IDENTICAL
);
4697 UBool doCase
= (coll
->caseLevel
== UCOL_ON
);
4698 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && (compareSec
== 0);
4699 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
4700 //UBool qShifted = shifted && (compareQuad == 0);
4701 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && (compareQuad
== 0);
4703 uint32_t variableTopValue
= coll
->variableTopValue
;
4704 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4705 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4706 uint8_t UCOL_COMMON_BOT4
= (uint8_t)((coll
->variableTopValue
>>8)+1);
4707 uint8_t UCOL_HIRAGANA_QUAD
= 0;
4709 UCOL_HIRAGANA_QUAD
=UCOL_COMMON_BOT4
++;
4710 /* allocate one more space for hiragana, value for hiragana */
4712 uint8_t UCOL_BOT_COUNT4
= (uint8_t)(0xFF - UCOL_COMMON_BOT4
);
4714 /* support for special features like caselevel and funky secondaries */
4715 uint8_t *frenchStartPtr
= NULL
;
4716 uint8_t *frenchEndPtr
= NULL
;
4717 uint32_t caseShift
= 0;
4719 sortKeySize
+= ((compareSec
?0:1) + (compareTer
?0:1) + (doCase
?1:0) + /*(qShifted?1:0)*/(compareQuad
?0:1) + (compareIdent
?1:0));
4721 /* If we need to normalize, we'll do it all at once at the beginning! */
4722 const Normalizer2
*norm2
;
4724 norm2
= Normalizer2Factory::getNFDInstance(*status
);
4725 } else if(coll
->normalizationMode
!= UCOL_OFF
) {
4726 norm2
= Normalizer2Factory::getFCDInstance(*status
);
4731 normSource
.setTo(FALSE
, source
, len
);
4732 int32_t qcYesLength
= norm2
->spanQuickCheckYes(normSource
, *status
);
4733 if(qcYesLength
!= len
) {
4734 UnicodeString unnormalized
= normSource
.tempSubString(qcYesLength
);
4735 normSource
.truncate(qcYesLength
);
4736 norm2
->normalizeSecondAndAppend(normSource
, unnormalized
, *status
);
4737 source
= normSource
.getBuffer();
4738 len
= normSource
.length();
4742 IInit_collIterate(coll
, source
, len
, &s
, status
);
4743 if(U_FAILURE(*status
)) {
4746 s
.flags
&= ~UCOL_ITER_NORM
; // source passed the FCD test or else was normalized.
4748 if(resultLength
== 0 || primaries
== NULL
) {
4749 return ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
4751 uint8_t *primarySafeEnd
= primaries
+ resultLength
- 1;
4752 if(strength
> UCOL_PRIMARY
) {
4756 uint32_t minBufferSize
= UCOL_MAX_BUFFER
;
4758 uint8_t *primStart
= primaries
;
4759 uint8_t *secStart
= secondaries
;
4760 uint8_t *terStart
= tertiaries
;
4761 uint8_t *caseStart
= cases
;
4762 uint8_t *quadStart
= quads
;
4766 uint8_t primary1
= 0;
4767 uint8_t primary2
= 0;
4768 uint8_t secondary
= 0;
4769 uint8_t tertiary
= 0;
4770 uint8_t caseSwitch
= coll
->caseSwitch
;
4771 uint8_t tertiaryMask
= coll
->tertiaryMask
;
4772 int8_t tertiaryAddition
= coll
->tertiaryAddition
;
4773 uint8_t tertiaryTop
= coll
->tertiaryTop
;
4774 uint8_t tertiaryBottom
= coll
->tertiaryBottom
;
4775 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
4776 uint8_t caseBits
= 0;
4778 UBool finished
= FALSE
;
4779 UBool wasShifted
= FALSE
;
4780 UBool notIsContinuation
= FALSE
;
4782 uint32_t prevBuffSize
= 0;
4784 uint32_t count2
= 0, count3
= 0, count4
= 0;
4785 uint8_t leadPrimary
= 0;
4788 for(i
=prevBuffSize
; i
<minBufferSize
; ++i
) {
4790 order
= ucol_IGetNextCE(coll
, &s
, status
);
4791 if(order
== UCOL_NO_MORE_CES
) {
4800 notIsContinuation
= !isContinuation(order
);
4802 if(notIsContinuation
) {
4803 tertiary
= (uint8_t)(order
& UCOL_BYTE_SIZE_MASK
);
4805 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
4808 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4809 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
4810 primary1
= (uint8_t)(order
>> 8);
4812 uint8_t originalPrimary1
= primary1
;
4813 if(notIsContinuation
&& coll
->leadBytePermutationTable
!= NULL
) {
4814 primary1
= coll
->leadBytePermutationTable
[primary1
];
4817 if((shifted
&& ((notIsContinuation
&& order
<= variableTopValue
&& primary1
> 0)
4818 || (!notIsContinuation
&& wasShifted
)))
4819 || (wasShifted
&& primary1
== 0)) /* amendment to the UCA says that primary ignorables */
4821 /* and other ignorables should be removed if following a shifted code point */
4822 if(primary1
== 0) { /* if we were shifted and we got an ignorable code point */
4823 /* we should just completely ignore it */
4826 if(compareQuad
== 0) {
4828 while (count4
> UCOL_BOT_COUNT4
) {
4829 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
4830 count4
-= UCOL_BOT_COUNT4
;
4832 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
4835 /* We are dealing with a variable and we're treating them as shifted */
4836 /* This is a shifted ignorable */
4837 if(primary1
!= 0) { /* we need to check this since we could be in continuation */
4838 *quads
++ = primary1
;
4841 *quads
++ = primary2
;
4847 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4848 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
4849 /* regular and simple sortkey calc */
4850 if(primary1
!= UCOL_IGNORABLE
) {
4851 if(notIsContinuation
) {
4852 if(leadPrimary
== primary1
) {
4853 *primaries
++ = primary2
;
4855 if(leadPrimary
!= 0) {
4856 *primaries
++ = (uint8_t)((primary1
> leadPrimary
) ? UCOL_BYTE_UNSHIFTED_MAX
: UCOL_BYTE_UNSHIFTED_MIN
);
4858 if(primary2
== UCOL_IGNORABLE
) {
4859 /* one byter, not compressed */
4860 *primaries
++ = primary1
;
4862 } else if(isCompressible(coll
, originalPrimary1
)) {
4864 *primaries
++ = leadPrimary
= primary1
;
4865 if(primaries
<= primarySafeEnd
) {
4866 *primaries
++ = primary2
;
4870 *primaries
++ = primary1
;
4871 if(primaries
<= primarySafeEnd
) {
4872 *primaries
++ = primary2
;
4876 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4877 *primaries
++ = primary1
;
4878 if((primary2
!= UCOL_IGNORABLE
) && (primaries
<= primarySafeEnd
)) {
4879 *primaries
++ = primary2
; /* second part */
4884 if(secondary
> compareSec
) {
4886 /* This is compression code. */
4887 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
4891 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
4892 while (count2
> UCOL_TOP_COUNT2
) {
4893 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
);
4894 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
4896 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1));
4898 while (count2
> UCOL_BOT_COUNT2
) {
4899 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
4900 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
4902 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
4906 *secondaries
++ = secondary
;
4909 *secondaries
++ = secondary
;
4910 /* Do the special handling for French secondaries */
4911 /* We need to get continuation elements and do intermediate restore */
4912 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4913 if(notIsContinuation
) {
4914 if (frenchStartPtr
!= NULL
) {
4915 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4916 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr
, frenchEndPtr
);
4917 frenchStartPtr
= NULL
;
4920 if (frenchStartPtr
== NULL
) {
4921 frenchStartPtr
= secondaries
- 2;
4923 frenchEndPtr
= secondaries
-1;
4928 if(doCase
&& (primary1
> 0 || strength
>= UCOL_SECONDARY
)) {
4929 // do the case level if we need to do it. We don't want to calculate
4930 // case level for primary ignorables if we have only primary strength and case level
4931 // otherwise we would break well formedness of CEs
4932 doCaseShift(&cases
, caseShift
);
4933 if(notIsContinuation
) {
4934 caseBits
= (uint8_t)(tertiary
& 0xC0);
4937 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
4938 if((caseBits
& 0xC0) == 0) {
4939 *(cases
-1) |= 1 << (--caseShift
);
4941 *(cases
-1) |= 0 << (--caseShift
);
4943 doCaseShift(&cases
, caseShift
);
4944 *(cases
-1) |= ((caseBits
>>6)&1) << (--caseShift
);
4947 if((caseBits
& 0xC0) == 0) {
4948 *(cases
-1) |= 0 << (--caseShift
);
4950 *(cases
-1) |= 1 << (--caseShift
);
4952 doCaseShift(&cases
, caseShift
);
4953 *(cases
-1) |= ((caseBits
>>7)&1) << (--caseShift
);
4960 if(notIsContinuation
) {
4961 tertiary
^= caseSwitch
;
4965 tertiary
&= tertiaryMask
;
4966 if(tertiary
> compareTer
) {
4967 /* This is compression code. */
4968 /* sequence size check is included in the if clause */
4969 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
4972 if(tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
) {
4973 tertiary
+= tertiaryAddition
;
4974 } else if(tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
) {
4975 tertiary
-= tertiaryAddition
;
4978 if ((tertiary
> tertiaryCommon
)) {
4979 while (count3
> coll
->tertiaryTopCount
) {
4980 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
4981 count3
-= (uint32_t)coll
->tertiaryTopCount
;
4983 *tertiaries
++ = (uint8_t)(tertiaryTop
- (count3
-1));
4985 while (count3
> coll
->tertiaryBottomCount
) {
4986 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
4987 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
4989 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
4993 *tertiaries
++ = tertiary
;
4997 if(/*qShifted*/(compareQuad
==0) && notIsContinuation
) {
4998 if(s
.flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
4999 if(count4
>0) { // Close this part
5000 while (count4
> UCOL_BOT_COUNT4
) {
5001 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
5002 count4
-= UCOL_BOT_COUNT4
;
5004 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
5007 *quads
++ = UCOL_HIRAGANA_QUAD
; // Add the Hiragana
5008 } else { // This wasn't Hiragana, so we can continue adding stuff
5014 if(primaries
> primarySafeEnd
) { /* We have stepped over the primary buffer */
5015 if(allocateSKBuffer
== FALSE
) { /* need to save our butts if we cannot reallocate */
5016 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
, status
);
5017 if(U_FAILURE(*status
)) {
5018 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5022 s
.flags
&= ~UCOL_ITER_NORM
;
5023 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, strength
, len
);
5024 *status
= U_BUFFER_OVERFLOW_ERROR
;
5027 } else { /* It's much nicer if we can actually reallocate */
5028 int32_t sks
= sortKeySize
+(int32_t)((primaries
- primStart
)+(secondaries
- secStart
)+(tertiaries
- terStart
)+(cases
-caseStart
)+(quads
-quadStart
));
5029 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sks
, status
);
5030 if(U_SUCCESS(*status
)) {
5031 *result
= primStart
;
5032 primarySafeEnd
= primStart
+ resultLength
- 1;
5033 if(strength
> UCOL_PRIMARY
) {
5037 /* We ran out of memory!? We can't recover. */
5038 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5048 prevBuffSize
= minBufferSize
;
5050 uint32_t frenchStartOffset
= 0, frenchEndOffset
= 0;
5051 if (frenchStartPtr
!= NULL
) {
5052 frenchStartOffset
= (uint32_t)(frenchStartPtr
- secStart
);
5053 frenchEndOffset
= (uint32_t)(frenchEndPtr
- secStart
);
5055 secStart
= reallocateBuffer(&secondaries
, secStart
, second
, &secSize
, 2*secSize
, status
);
5056 terStart
= reallocateBuffer(&tertiaries
, terStart
, tert
, &terSize
, 2*terSize
, status
);
5057 caseStart
= reallocateBuffer(&cases
, caseStart
, caseB
, &caseSize
, 2*caseSize
, status
);
5058 quadStart
= reallocateBuffer(&quads
, quadStart
, quad
, &quadSize
, 2*quadSize
, status
);
5059 if(U_FAILURE(*status
)) {
5060 /* We ran out of memory!? We can't recover. */
5061 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5064 if (frenchStartPtr
!= NULL
) {
5065 frenchStartPtr
= secStart
+ frenchStartOffset
;
5066 frenchEndPtr
= secStart
+ frenchEndOffset
;
5072 /* Here, we are generally done with processing */
5073 /* bailing out would not be too productive */
5075 if(U_SUCCESS(*status
)) {
5076 sortKeySize
+= (uint32_t)(primaries
- primStart
);
5077 /* we have done all the CE's, now let's put them together to form a key */
5078 if(compareSec
== 0) {
5080 while (count2
> UCOL_BOT_COUNT2
) {
5081 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5082 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5084 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5086 uint32_t secsize
= (uint32_t)(secondaries
-secStart
);
5087 if(!isFrenchSec
) { // Regular situation, we know the length of secondaries
5088 sortKeySize
+= secsize
;
5089 if(sortKeySize
<= resultLength
) {
5090 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5091 uprv_memcpy(primaries
, secStart
, secsize
);
5092 primaries
+= secsize
;
5094 if(allocateSKBuffer
== TRUE
) { /* we own the buffer, so grow it and retry the append */
5095 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5096 if(U_SUCCESS(*status
)) {
5097 *result
= primStart
;
5098 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5099 uprv_memcpy(primaries
, secStart
, secsize
);
5100 primaries
+= secsize
;
5103 /* We ran out of memory!? We can't recover. */
5104 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5108 *status
= U_BUFFER_OVERFLOW_ERROR
;
5111 } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
5112 uint8_t *newPrim
= packFrench(primaries
, primStart
+resultLength
, secondaries
, &secsize
, frenchStartPtr
, frenchEndPtr
);
5113 sortKeySize
+= secsize
;
5114 if(sortKeySize
<= resultLength
) { // if we managed to pack fine
5115 primaries
= newPrim
; // update the primary pointer
5116 } else { // overflow, need to reallocate and redo
5117 if(allocateSKBuffer
== TRUE
) { /* we own the buffer, so grow it and pack again */
5118 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5119 if(U_SUCCESS(*status
)) {
5120 primaries
= packFrench(primaries
, primStart
+resultLength
, secondaries
, &secsize
, frenchStartPtr
, frenchEndPtr
);
5123 /* We ran out of memory!? We can't recover. */
5124 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5128 *status
= U_BUFFER_OVERFLOW_ERROR
;
5135 uint32_t casesize
= (uint32_t)(cases
- caseStart
);
5136 sortKeySize
+= casesize
;
5137 if(sortKeySize
<= resultLength
) {
5138 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5139 uprv_memcpy(primaries
, caseStart
, casesize
);
5140 primaries
+= casesize
;
/* NOTE(review): in the reallocation branch below (and in the matching tertiary and
 * quaternary branches further down) this listing shows the uprv_memcpy but no
 * "primaries += <size>" afterwards, unlike the secondary branch above -- confirm
 * against the full source whether the write pointer is advanced there. */
5142 if(allocateSKBuffer
== TRUE
) {
5143 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5144 if(U_SUCCESS(*status
)) {
5145 *result
= primStart
;
5146 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5147 uprv_memcpy(primaries
, caseStart
, casesize
);
5150 /* We ran out of memory!? We can't recover. */
5151 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5155 *status
= U_BUFFER_OVERFLOW_ERROR
;
5160 if(compareTer
== 0) {
5162 if (coll
->tertiaryCommon
!= UCOL_COMMON_BOT3
) {
5163 while (count3
>= coll
->tertiaryTopCount
) {
5164 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5165 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5167 *tertiaries
++ = (uint8_t)(tertiaryTop
- count3
);
5169 while (count3
> coll
->tertiaryBottomCount
) {
5170 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5171 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5173 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5176 uint32_t tersize
= (uint32_t)(tertiaries
- terStart
);
5177 sortKeySize
+= tersize
;
5178 if(sortKeySize
<= resultLength
) {
5179 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5180 uprv_memcpy(primaries
, terStart
, tersize
);
5181 primaries
+= tersize
;
5183 if(allocateSKBuffer
== TRUE
) {
5184 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5185 if(U_SUCCESS(*status
)) {
5186 *result
= primStart
;
5187 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5188 uprv_memcpy(primaries
, terStart
, tersize
);
5191 /* We ran out of memory!? We can't recover. */
5192 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5196 *status
= U_BUFFER_OVERFLOW_ERROR
;
5200 if(compareQuad
== 0/*qShifted == TRUE*/) {
5202 while (count4
> UCOL_BOT_COUNT4
) {
5203 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ UCOL_BOT_COUNT4
);
5204 count4
-= UCOL_BOT_COUNT4
;
5206 *quads
++ = (uint8_t)(UCOL_COMMON_BOT4
+ (count4
-1));
5208 uint32_t quadsize
= (uint32_t)(quads
- quadStart
);
5209 sortKeySize
+= quadsize
;
5210 if(sortKeySize
<= resultLength
) {
5211 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5212 uprv_memcpy(primaries
, quadStart
, quadsize
);
5213 primaries
+= quadsize
;
5215 if(allocateSKBuffer
== TRUE
) {
5216 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5217 if(U_SUCCESS(*status
)) {
5218 *result
= primStart
;
5219 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5220 uprv_memcpy(primaries
, quadStart
, quadsize
);
5223 /* We ran out of memory!? We can't recover. */
5224 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5228 *status
= U_BUFFER_OVERFLOW_ERROR
;
5234 sortKeySize
+= u_lengthOfIdenticalLevelRun(s
.string
, len
);
5235 if(sortKeySize
<= resultLength
) {
5236 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5237 primaries
+= u_writeIdenticalLevelRun(s
.string
, len
, primaries
);
5239 if(allocateSKBuffer
== TRUE
) {
5240 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, sortKeySize
, status
);
5241 if(U_SUCCESS(*status
)) {
5242 *result
= primStart
;
5243 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5244 u_writeIdenticalLevelRun(s
.string
, len
, primaries
);
5247 /* We ran out of memory!? We can't recover. */
5248 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5252 *status
= U_BUFFER_OVERFLOW_ERROR
;
5257 *(primaries
++) = '\0';
5260 if(allocateSKBuffer
== TRUE
) {
5261 *result
= (uint8_t*)uprv_malloc(sortKeySize
);
5263 if (*result
== NULL
) {
5264 *status
= U_MEMORY_ALLOCATION_ERROR
;
5267 uprv_memcpy(*result
, primStart
, sortKeySize
);
5268 if(primStart
!= prim
) {
5269 uprv_free(primStart
);
5274 if (allocateSKBuffer
== FALSE
&& resultLength
> 0 && U_FAILURE(*status
) && *status
!= U_BUFFER_OVERFLOW_ERROR
) {
5275 /* NULL terminate for safety */
5278 if(terStart
!= tert
) {
5279 uprv_free(terStart
);
5280 uprv_free(secStart
);
5281 uprv_free(caseStart
);
5282 uprv_free(quadStart
);
5285 /* To avoid memory leak, free the offset buffer if necessary. */
5286 ucol_freeOffsetBuffer(&s
);
// Builds a sort key for `source` under collator `coll` using three levels only:
// primary bytes are written straight into the result buffer, while secondary and
// tertiary bytes are collected in scratch buffers and appended afterwards, each
// preceded by UCOL_LEVELTERMINATOR. The key ends with a 0 byte.
// Runs of "common" secondary/tertiary weights are run-length compressed via count2/count3.
// NOTE(review): part of the parameter list and a number of structural lines (braces,
// else branches, returns) are not visible in this excerpt; the comments below describe
// only what the visible statements do.
5292 U_CFUNC
int32_t U_CALLCONV
5293 ucol_calcSortKeySimpleTertiary(const UCollator
*coll
,
5294 const UChar
*source
,
5295 int32_t sourceLength
,
5297 uint32_t resultLength
,
5298 UBool allocateSKBuffer
,
5303 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5304 uint32_t i
= 0; /* general purpose counter */
5306 /* Stack allocated buffers for buffers we use */
5307 uint8_t prim
[UCOL_PRIMARY_MAX_BUFFER
], second
[UCOL_SECONDARY_MAX_BUFFER
], tert
[UCOL_TERTIARY_MAX_BUFFER
];
5309 uint8_t *primaries
= *result
, *secondaries
= second
, *tertiaries
= tert
;
5311 if(U_FAILURE(*status
)) {
// No caller buffer but allocation allowed: start out in the stack array `prim`.
5315 if(primaries
== NULL
&& allocateSKBuffer
== TRUE
) {
5316 primaries
= *result
= prim
;
5317 resultLength
= UCOL_PRIMARY_MAX_BUFFER
;
5320 uint32_t secSize
= UCOL_SECONDARY_MAX_BUFFER
, terSize
= UCOL_TERTIARY_MAX_BUFFER
;
5322 uint32_t sortKeySize
= 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5324 UnicodeString normSource
;
5326 int32_t len
= sourceLength
;
5328 /* If we need to normalize, we'll do it all at once at the beginning! */
5329 if(coll
->normalizationMode
!= UCOL_OFF
) {
5330 normSource
.setTo(len
< 0, source
, len
);
5331 const Normalizer2
*norm2
= Normalizer2Factory::getFCDInstance(*status
);
// qcYesLength is the length of the leading span that already passes the quick check;
// only when it is shorter than the whole string is the remainder normalized and re-appended.
5332 int32_t qcYesLength
= norm2
->spanQuickCheckYes(normSource
, *status
);
5333 if(qcYesLength
!= normSource
.length()) {
5334 UnicodeString unnormalized
= normSource
.tempSubString(qcYesLength
);
5335 normSource
.truncate(qcYesLength
);
5336 norm2
->normalizeSecondAndAppend(normSource
, unnormalized
, *status
);
// From here on the iteration reads the normalized copy instead of the caller's text.
5337 source
= normSource
.getBuffer();
5338 len
= normSource
.length();
5342 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
, status
);
5343 if(U_FAILURE(*status
)) {
5346 s
.flags
&= ~UCOL_ITER_NORM
; // source passed the FCD test or else was normalized.
// With no usable output buffer (zero length or NULL) only the required key size is computed.
5348 if(resultLength
== 0 || primaries
== NULL
) {
5349 return ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
// primarySafeEnd sits two bytes before the end of the primary buffer; crossing it
// triggers the overflow handling further down.
5352 uint8_t *primarySafeEnd
= primaries
+ resultLength
- 2;
5354 uint32_t minBufferSize
= UCOL_MAX_BUFFER
;
5356 uint8_t *primStart
= primaries
;
5357 uint8_t *secStart
= secondaries
;
5358 uint8_t *terStart
= tertiaries
;
5362 uint8_t primary1
= 0;
5363 uint8_t primary2
= 0;
5364 uint8_t secondary
= 0;
5365 uint8_t tertiary
= 0;
// Collator settings are copied into locals once, before the CE loop.
5366 uint8_t caseSwitch
= coll
->caseSwitch
;
5367 uint8_t tertiaryMask
= coll
->tertiaryMask
;
5368 int8_t tertiaryAddition
= coll
->tertiaryAddition
;
5369 uint8_t tertiaryTop
= coll
->tertiaryTop
;
5370 uint8_t tertiaryBottom
= coll
->tertiaryBottom
;
5371 uint8_t tertiaryCommon
= coll
->tertiaryCommon
;
5373 uint32_t prevBuffSize
= 0;
5375 UBool finished
= FALSE
;
5376 UBool notIsContinuation
= FALSE
;
// count2 / count3: lengths of the pending runs of common secondary / tertiary weights.
// leadPrimary: lead byte of the current compressible primary run (0 when none).
5378 uint32_t count2
= 0, count3
= 0;
5379 uint8_t leadPrimary
= 0;
5382 for(i
=prevBuffSize
; i
<minBufferSize
; ++i
) {
5384 order
= ucol_IGetNextCE(coll
, &s
, status
);
5390 if(order
== UCOL_NO_MORE_CES
) {
5395 notIsContinuation
= !isContinuation(order
);
5397 if(notIsContinuation
) {
5398 tertiary
= (uint8_t)((order
& tertiaryMask
));
5400 tertiary
= (uint8_t)((order
& UCOL_REMOVE_CONTINUATION
));
// The CE is peeled apart one byte at a time; `order` is shifted right in place by the
// first two extractions.
5403 secondary
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
5404 primary2
= (uint8_t)((order
>>= 8) & UCOL_BYTE_SIZE_MASK
);
5405 primary1
= (uint8_t)(order
>> 8);
// The lead-byte permutation is applied to non-continuation CEs only; the unpermuted
// byte is kept because the compressibility test below uses it.
5407 uint8_t originalPrimary1
= primary1
;
5408 if (coll
->leadBytePermutationTable
!= NULL
&& notIsContinuation
) {
5409 primary1
= coll
->leadBytePermutationTable
[primary1
];
5412 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5413 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
5414 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5415 /* regular and simple sortkey calc */
5416 if(primary1
!= UCOL_IGNORABLE
) {
5417 if(notIsContinuation
) {
// Same lead byte as the current run: only the second primary byte is emitted.
5418 if(leadPrimary
== primary1
) {
5419 *primaries
++ = primary2
;
// Leaving a run: a boundary byte is written, chosen by whether the new lead byte
// is above or below the old one.
5421 if(leadPrimary
!= 0) {
5422 *primaries
++ = (uint8_t)((primary1
> leadPrimary
) ? UCOL_BYTE_UNSHIFTED_MAX
: UCOL_BYTE_UNSHIFTED_MIN
);
5424 if(primary2
== UCOL_IGNORABLE
) {
5425 /* one byter, not compressed */
5426 *primaries
++ = primary1
;
5428 } else if(isCompressible(coll
, originalPrimary1
)) {
5430 *primaries
++ = leadPrimary
= primary1
;
5431 *primaries
++ = primary2
;
5434 *primaries
++ = primary1
;
5435 *primaries
++ = primary2
;
5438 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5439 *primaries
++ = primary1
;
5440 if(primary2
!= UCOL_IGNORABLE
) {
5441 *primaries
++ = primary2
; /* second part */
5446 if(secondary
> 0) { /* I think that != 0 test should be != IGNORABLE */
5447 /* This is compression code. */
5448 if (secondary
== UCOL_COMMON2
&& notIsContinuation
) {
// A non-common secondary ends the pending run of common weights: the run is flushed
// from the top of the range when the new weight is above common, otherwise from the bottom.
5452 if (secondary
> UCOL_COMMON2
) { // not necessary for 4th level.
5453 while (count2
> UCOL_TOP_COUNT2
) {
5454 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- UCOL_TOP_COUNT2
);
5455 count2
-= (uint32_t)UCOL_TOP_COUNT2
;
5457 *secondaries
++ = (uint8_t)(UCOL_COMMON_TOP2
- (count2
-1));
5459 while (count2
> UCOL_BOT_COUNT2
) {
5460 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5461 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5463 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5467 *secondaries
++ = secondary
;
5471 if(notIsContinuation
) {
5472 tertiary
^= caseSwitch
;
5476 /* This is compression code. */
5477 /* sequence size check is included in the if clause */
5478 if (tertiary
== tertiaryCommon
&& notIsContinuation
) {
// Non-common tertiaries are offset by tertiaryAddition, in a direction that depends on
// which common value the collator is configured with.
5481 if(tertiary
> tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_NORMAL
) {
5482 tertiary
+= tertiaryAddition
;
5483 } else if (tertiary
<= tertiaryCommon
&& tertiaryCommon
== UCOL_COMMON3_UPPERFIRST
) {
5484 tertiary
-= tertiaryAddition
;
5487 if ((tertiary
> tertiaryCommon
)) {
5488 while (count3
> coll
->tertiaryTopCount
) {
5489 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5490 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5492 *tertiaries
++ = (uint8_t)(tertiaryTop
- (count3
-1));
5494 while (count3
> coll
->tertiaryBottomCount
) {
5495 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5496 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5498 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5502 *tertiaries
++ = tertiary
;
5506 if(primaries
> primarySafeEnd
) { /* We have stepped over the primary buffer */
// Fixed caller buffer: the iterator is restarted and the full key size is measured so
// it can be reported together with U_BUFFER_OVERFLOW_ERROR.
5507 if(allocateSKBuffer
== FALSE
) { /* need to save our butts if we cannot reallocate */
5508 IInit_collIterate(coll
, (UChar
*)source
, len
, &s
, status
);
5509 if(U_FAILURE(*status
)) {
5510 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5514 s
.flags
&= ~UCOL_ITER_NORM
;
5515 sortKeySize
= ucol_getSortKeySize(coll
, &s
, sortKeySize
, coll
->strength
, len
);
5516 *status
= U_BUFFER_OVERFLOW_ERROR
;
5519 } else { /* It's much nicer if we can actually reallocate */
// sks: bytes produced so far across all three levels plus the fixed overhead; the
// primary buffer is regrown to twice that.
5520 int32_t sks
= sortKeySize
+(int32_t)((primaries
- primStart
)+(secondaries
- secStart
)+(tertiaries
- terStart
));
5521 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sks
, status
);
5522 if(U_SUCCESS(*status
)) {
5523 *result
= primStart
;
5524 primarySafeEnd
= primStart
+ resultLength
- 2;
5526 /* We ran out of memory!? We can't recover. */
5527 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
// The secondary and tertiary scratch buffers are doubled together.
5537 prevBuffSize
= minBufferSize
;
5538 secStart
= reallocateBuffer(&secondaries
, secStart
, second
, &secSize
, 2*secSize
, status
);
5539 terStart
= reallocateBuffer(&tertiaries
, terStart
, tert
, &terSize
, 2*terSize
, status
);
5541 if(U_FAILURE(*status
)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5542 /* We ran out of memory!? We can't recover. */
5543 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5549 if(U_SUCCESS(*status
)) {
5550 sortKeySize
+= (uint32_t)(primaries
- primStart
);
5551 /* we have done all the CE's, now let's put them together to form a key */
// Flush any trailing run of common secondaries, then append the secondary level.
5553 while (count2
> UCOL_BOT_COUNT2
) {
5554 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ UCOL_BOT_COUNT2
);
5555 count2
-= (uint32_t)UCOL_BOT_COUNT2
;
5557 *secondaries
++ = (uint8_t)(UCOL_COMMON_BOT2
+ (count2
-1));
5559 uint32_t secsize
= (uint32_t)(secondaries
-secStart
);
5560 sortKeySize
+= secsize
;
5561 if(sortKeySize
<= resultLength
) {
5562 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5563 uprv_memcpy(primaries
, secStart
, secsize
);
5564 primaries
+= secsize
;
5566 if(allocateSKBuffer
== TRUE
) {
5567 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5568 if(U_SUCCESS(*status
)) {
5569 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5570 *result
= primStart
;
5571 uprv_memcpy(primaries
, secStart
, secsize
);
5574 /* We ran out of memory!? We can't recover. */
5575 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5579 *status
= U_BUFFER_OVERFLOW_ERROR
;
// Flush any trailing run of common tertiaries (the direction depends on the collator's
// tertiaryCommon setting), then append the tertiary level.
5584 if (coll
->tertiaryCommon
!= UCOL_COMMON3_NORMAL
) {
5585 while (count3
>= coll
->tertiaryTopCount
) {
5586 *tertiaries
++ = (uint8_t)(tertiaryTop
- coll
->tertiaryTopCount
);
5587 count3
-= (uint32_t)coll
->tertiaryTopCount
;
5589 *tertiaries
++ = (uint8_t)(tertiaryTop
- count3
);
5591 while (count3
> coll
->tertiaryBottomCount
) {
5592 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ coll
->tertiaryBottomCount
);
5593 count3
-= (uint32_t)coll
->tertiaryBottomCount
;
5595 *tertiaries
++ = (uint8_t)(tertiaryBottom
+ (count3
-1));
5598 uint32_t tersize
= (uint32_t)(tertiaries
- terStart
);
5599 sortKeySize
+= tersize
;
5600 if(sortKeySize
<= resultLength
) {
5601 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5602 uprv_memcpy(primaries
, terStart
, tersize
);
5603 primaries
+= tersize
;
5605 if(allocateSKBuffer
== TRUE
) {
5606 primStart
= reallocateBuffer(&primaries
, *result
, prim
, &resultLength
, 2*sortKeySize
, status
);
5607 if(U_SUCCESS(*status
)) {
5608 *result
= primStart
;
5609 *(primaries
++) = UCOL_LEVELTERMINATOR
;
5610 uprv_memcpy(primaries
, terStart
, tersize
);
5613 /* We ran out of memory!? We can't recover. */
5614 sortKeySize
= DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY
;
5618 *status
= U_BUFFER_OVERFLOW_ERROR
;
5622 *(primaries
++) = '\0';
// When this function owns the allocation, the finished key is copied into a heap block of
// exactly sortKeySize bytes and any intermediate heap buffer (primStart != prim) is released.
5625 if(allocateSKBuffer
== TRUE
) {
5626 *result
= (uint8_t*)uprv_malloc(sortKeySize
);
5628 if (*result
== NULL
) {
5629 *status
= U_MEMORY_ALLOCATION_ERROR
;
5632 uprv_memcpy(*result
, primStart
, sortKeySize
);
5633 if(primStart
!= prim
) {
5634 uprv_free(primStart
);
5639 if (allocateSKBuffer
== FALSE
&& resultLength
> 0 && U_FAILURE(*status
) && *status
!= U_BUFFER_OVERFLOW_ERROR
) {
5640 /* NULL terminate for safety */
// Both scratch buffers are freed under the single test on the tertiary pointer.
// NOTE(review): presumably safe because the two are always reallocated as a pair above - confirm.
5643 if(terStart
!= tert
) {
5644 uprv_free(terStart
);
5645 uprv_free(secStart
);
5648 /* To avoid memory leak, free the offset buffer if necessary. */
5649 ucol_freeOffsetBuffer(&s
);
// Tests whether the collation element CE is to be treated as "shifted" with respect to
// the boundary value LVT, carrying state between consecutive CEs through *wasShifted.
// The visible condition holds when either
//   - LVT is non-zero and CE is a non-continuation with (CE & 0xFFFF0000) <= LVT and a
//     non-zero lead primary byte, or CE is a continuation and *wasShifted is already set; or
//   - *wasShifted is set and the lead primary byte of CE is zero.
// CE         the 32-bit collation element under test
// LVT        the boundary to compare against; 0 disables the first alternative
// wasShifted in/out flag; it is reset to FALSE on a path visible below
// NOTE(review): the return statements and the if/else structure are not visible in this
// excerpt - confirm the returned values against the full file.
5655 UBool
isShiftedCE(uint32_t CE
, uint32_t LVT
, UBool
*wasShifted
) {
5656 UBool notIsContinuation
= !isContinuation(CE
);
// primary1 is the most significant byte of the CE.
5657 uint8_t primary1
= (uint8_t)((CE
>> 24) & 0xFF);
5658 if((LVT
&& ((notIsContinuation
&& (CE
& 0xFFFF0000)<= LVT
&& primary1
> 0)
5659 || (!notIsContinuation
&& *wasShifted
)))
5660 || (*wasShifted
&& primary1
== 0)) /* amendment to the UCA says that primary ignorables */
5662 // The stuff below should probably be in the sortkey code... maybe not...
5663 if(primary1
!= 0) { /* if we were shifted and we got an ignorable code point */
5664 /* we should just completely ignore it */
5668 //*wasShifted = TRUE;
5671 *wasShifted
= FALSE
;
// Writes a level terminator into a partial sort key when more levels follow.
// level     the level that has just been completed
// maxLevel  the last level the key will contain
// i         write index into dest, taken by reference; advanced by one when a byte is written
// dest      output buffer; this function does no bounds checking on it
// Nothing is written when level is not below maxLevel.
5676 void terminatePSKLevel(int32_t level
, int32_t maxLevel
, int32_t &i
, uint8_t *dest
) {
5677 if(level
< maxLevel
) {
5678 dest
[i
++] = UCOL_LEVELTERMINATOR
;
5684 /** enumeration of level identifiers for partial sort key generation */
// Every value fits in three bits, matching UCOL_PSK_LEVEL_MASK (7), the field of state[1]
// that stores the current level between calls.
// NOTE(review): the entry for value 2 is not visible in this excerpt; UCOL_PSK_CASE is used
// as a level elsewhere in this file - confirm that it is the missing value.
5686 UCOL_PSK_PRIMARY
= 0,
5687 UCOL_PSK_SECONDARY
= 1,
5689 UCOL_PSK_TERTIARY
= 3,
5690 UCOL_PSK_QUATERNARY
= 4,
5691 UCOL_PSK_QUIN
= 5, /** This is an extra level, not used - but we have three bits to blow */
5692 UCOL_PSK_IDENTICAL
= 6,
5693 UCOL_PSK_NULL
= 7, /** level for the end of sort key. Will just produce zeros */
// Bit layout of state[1], the collation-processing word of partial sort key generation.
// Each field is decoded as (state[1] >> X_SHIFT) & X_MASK.
5697 /** collation state enum. *_SHIFT value is how much to shift right
5698 * to get the state piece to the right. *_MASK value should be
5699 * ANDed with the shifted state. This data is stored in state[1]
5703 UCOL_PSK_LEVEL_SHIFT
= 0, /** level identifier. stores an enum value from above */
5704 UCOL_PSK_LEVEL_MASK
= 7, /** three bits */
5705 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
= 3, /** number of bytes of primary or quaternary already written */
5706 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
= 1,
5707 /** can be only 0 or 1, since we get up to two bytes from primary or quaternary
5708 * This field is also used to denote that the French secondary level is finished
5710 UCOL_PSK_WAS_SHIFTED_SHIFT
= 4,/** was the last value shifted */
5711 UCOL_PSK_WAS_SHIFTED_MASK
= 1, /** can be 0 or 1 (Boolean) */
5712 UCOL_PSK_USED_FRENCH_SHIFT
= 5,/** how many French bytes have we already written */
5713 UCOL_PSK_USED_FRENCH_MASK
= 3, /** up to 4 bytes. See comment just below */
5714 /** When we do French we need to reverse secondary values. However, continuations
5715 * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
// Bits 7-8: number of bytes of a bocsu sequence already written on the identical level.
5717 UCOL_PSK_BOCSU_BYTES_SHIFT
= 7,
5718 UCOL_PSK_BOCSU_BYTES_MASK
= 3,
// Bits 9 and up: count of CEs (or code points) consumed since the iterator state was last
// saved. The mask 0x7FFFF is 19 bits wide, so the field occupies bits 9..27.
5719 UCOL_PSK_CONSUMED_CES_SHIFT
= 9,
5720 UCOL_PSK_CONSUMED_CES_MASK
= 0x7FFFF
// Number of expansion CEs available in the iterator state `s`: the distance between
// its CEpos and toReturn members (zero when nothing is pending).
// The expansion is fully parenthesized so the macro can be combined with other
// operators (!, *, comparisons) without precedence surprises.
// `s` is evaluated twice - pass an expression without side effects.
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5727 /** main sortkey part procedure. On the first call,
5728 * you should pass in a collator, an iterator, empty state
5729 * state[0] == state[1] == 0, a buffer to hold results
5730 * number of bytes you need and an error code pointer.
5731 * Make sure your buffer is big enough to hold the wanted
5732 * number of sortkey bytes. I don't check.
5733 * The only meaningful status you can get back is
5734 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5735 * have been dealt a raw deal and that you probably won't
5736 * be able to use partial sortkey generation for this
5737 * particular combination of string and collator. This
5738 * is highly unlikely, but you should still check the error code.
5739 * Any other status means that you're not in a sane situation
5740 * anymore. After the first call, preserve state values and
5741 * use them on subsequent calls to obtain more bytes of a sortkey.
5742 * Use until the number of bytes written is smaller than the requested
5743 * number of bytes. Generated sortkey is not compatible with the
5744 * one generated by ucol_getSortKey, as we don't do any compression.
5745 * However, levels are still terminated by a 1 (one) and the sortkey
5746 * is terminated by a 0 (zero). Identical level is the same as in the
5747 * regular sortkey - internal bocu-1 implementation is used.
5748 * For curious, although you cannot do much about this, here is
5749 * the structure of state words.
5750 * state[0] - iterator state. Depends on the iterator implementation,
5751 * but allows the iterator to continue where it stopped in
5752 * the last iteration.
5753 * state[1] - collation processing state. Here is the distribution
5755 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5756 * quaternary, quin (we don't use this one), identical and
5757 * null (producing only zeroes - first one to terminate the
5758 * sortkey and subsequent to fill the buffer).
5759 * 3 - byte count. Number of bytes written on the primary level.
5760 * 4 - was shifted. Whether the previous iteration finished in the
5762 * 5, 6 - French continuation bytes written. See the comment in the enum
5763 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on
5764 * the identical level.
5765 * 9..31 - CEs consumed. Number of getCE or next32 operations performed
5766 * since the last successful update of the iterator state.
5768 U_CAPI
int32_t U_EXPORT2
5769 ucol_nextSortKeyPart(const UCollator
*coll
,
5770 UCharIterator
*iter
,
5772 uint8_t *dest
, int32_t count
,
5775 /* error checking */
5776 if(status
==NULL
|| U_FAILURE(*status
)) {
5779 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART
);
5780 if( coll
==NULL
|| iter
==NULL
||
5782 count
<0 || (count
>0 && dest
==NULL
)
5784 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
5785 UTRACE_EXIT_STATUS(status
);
5789 UTRACE_DATA6(UTRACE_VERBOSE
, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5790 coll
, iter
, state
[0], state
[1], dest
, count
);
5794 UTRACE_EXIT_VALUE(0);
5797 /** Setting up situation according to the state we got from the previous iteration */
5798 // The state of the iterator from the previous invocation
5799 uint32_t iterState
= state
[0];
5800 // Has the last iteration ended in the shifted state
5801 UBool wasShifted
= ((state
[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT
) & UCOL_PSK_WAS_SHIFTED_MASK
)?TRUE
:FALSE
;
5802 // What is the current level of the sortkey?
5803 int32_t level
= (state
[1] >> UCOL_PSK_LEVEL_SHIFT
) & UCOL_PSK_LEVEL_MASK
;
5804 // Have we written only one byte from a two byte primary in the previous iteration?
5805 // Also on secondary level - have we finished with the French secondary?
5806 int32_t byteCountOrFrenchDone
= (state
[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
;
5807 // number of bytes in the continuation buffer for French
5808 int32_t usedFrench
= (state
[1] >> UCOL_PSK_USED_FRENCH_SHIFT
) & UCOL_PSK_USED_FRENCH_MASK
;
5809 // Number of bytes already written from a bocsu sequence. Since
5810 // the longest bocsu sequence is 4 long, this can be up to 3.
5811 int32_t bocsuBytesUsed
= (state
[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT
) & UCOL_PSK_BOCSU_BYTES_MASK
;
5812 // Number of elements that need to be consumed in this iteration because
5813 // the iterator returned UITER_NO_STATE at the end of the last iteration,
5814 // so we had to save the last valid state.
5815 int32_t cces
= (state
[1] >> UCOL_PSK_CONSUMED_CES_SHIFT
) & UCOL_PSK_CONSUMED_CES_MASK
;
5817 /** values that depend on the collator attributes */
5818 // strength of the collator.
5819 int32_t strength
= ucol_getAttribute(coll
, UCOL_STRENGTH
, status
);
5820 // maximal level of the partial sortkey. Need to take whether case level is done
5821 int32_t maxLevel
= 0;
5822 if(strength
< UCOL_TERTIARY
) {
5823 if(ucol_getAttribute(coll
, UCOL_CASE_LEVEL
, status
) == UCOL_ON
) {
5824 maxLevel
= UCOL_PSK_CASE
;
5826 maxLevel
= strength
;
5829 if(strength
== UCOL_TERTIARY
) {
5830 maxLevel
= UCOL_PSK_TERTIARY
;
5831 } else if(strength
== UCOL_QUATERNARY
) {
5832 maxLevel
= UCOL_PSK_QUATERNARY
;
5833 } else { // identical
5834 maxLevel
= UCOL_IDENTICAL
;
5837 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5838 uint8_t UCOL_HIRAGANA_QUAD
=
5839 (ucol_getAttribute(coll
, UCOL_HIRAGANA_QUATERNARY_MODE
, status
) == UCOL_ON
)?0xFE:0xFF;
5840 // Boundary value that decides whether a CE is shifted or not
5841 uint32_t LVT
= (coll
->alternateHandling
== UCOL_SHIFTED
)?(coll
->variableTopValue
<<16):0;
5842 // Are we doing French collation?
5843 UBool doingFrench
= (ucol_getAttribute(coll
, UCOL_FRENCH_COLLATION
, status
) == UCOL_ON
);
5845 /** initializing the collation state */
5846 UBool notIsContinuation
= FALSE
;
5847 uint32_t CE
= UCOL_NO_MORE_CES
;
5850 IInit_collIterate(coll
, NULL
, -1, &s
, status
);
5851 if(U_FAILURE(*status
)) {
5852 UTRACE_EXIT_STATUS(*status
);
5856 s
.flags
|= UCOL_USE_ITERATOR
;
5857 // This variable tells us whether we have produced some other levels in this iteration
5858 // before we moved to the identical level. In that case, we need to switch the
5859 // type of the iterator.
5860 UBool doingIdenticalFromStart
= FALSE
;
5861 // Normalizing iterator
5862 // The division for the array length may truncate the array size to
5863 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5864 // for all platforms anyway.
5865 UAlignedMemory stackNormIter
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
5866 UNormIterator
*normIter
= NULL
;
5867 // If the normalization is turned on for the collator and we are below identical level
5868 // we will use a FCD normalizing iterator
5869 if(ucol_getAttribute(coll
, UCOL_NORMALIZATION_MODE
, status
) == UCOL_ON
&& level
< UCOL_PSK_IDENTICAL
) {
5870 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
5871 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_FCD
, status
);
5872 s
.flags
&= ~UCOL_ITER_NORM
;
5873 if(U_FAILURE(*status
)) {
5874 UTRACE_EXIT_STATUS(*status
);
5877 } else if(level
== UCOL_PSK_IDENTICAL
) {
5878 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5879 // will be updating the state - and this cannot be done on an ordinary iterator.
5880 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
5881 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
5882 s
.flags
&= ~UCOL_ITER_NORM
;
5883 if(U_FAILURE(*status
)) {
5884 UTRACE_EXIT_STATUS(*status
);
5887 doingIdenticalFromStart
= TRUE
;
5890 // This is the tentative new state of the iterator. The problem
5891 // is that the iterator might return an undefined state, in
5892 // which case we should save the last valid state and increase
5893 // the iterator skip value.
5894 uint32_t newState
= 0;
5896 // First, we set the iterator to the last valid position
5897 // from the last iteration. This was saved in state[0].
5898 if(iterState
== 0) {
5900 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
&& !byteCountOrFrenchDone
) {
5901 s
.iterator
->move(s
.iterator
, 0, UITER_LIMIT
);
5903 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5906 /* reset to previous state */
5907 s
.iterator
->setState(s
.iterator
, iterState
, status
);
5908 if(U_FAILURE(*status
)) {
5909 UTRACE_EXIT_STATUS(*status
);
5916 // This variable tells us whether we can attempt to update the state
5917 // of iterator. Situations where we don't want to update iterator state
5918 // are the existence of expansion CEs that are not yet processed, and
5919 // finishing the case level without enough space in the buffer to insert
5920 // a level terminator.
5921 UBool canUpdateState
= TRUE
;
5923 // Consume all the CEs that were consumed at the end of the previous
5924 // iteration without updating the iterator state. On identical level,
5925 // consume the code points.
5926 int32_t counter
= cces
;
5927 if(level
< UCOL_PSK_IDENTICAL
) {
5928 while(counter
-->0) {
5929 // If we're doing French and we are on the secondary level,
5931 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
) {
5932 CE
= ucol_IGetPrevCE(coll
, &s
, status
);
5934 CE
= ucol_IGetNextCE(coll
, &s
, status
);
5936 if(CE
==UCOL_NO_MORE_CES
) {
5937 /* should not happen */
5938 *status
=U_INTERNAL_PROGRAM_ERROR
;
5939 UTRACE_EXIT_STATUS(*status
);
5942 if(uprv_numAvailableExpCEs(s
)) {
5943 canUpdateState
= FALSE
;
5947 while(counter
-->0) {
5948 uiter_next32(s
.iterator
);
5952 // French secondary needs to know whether the iterator state of zero came from previous level OR
5953 // from a new invocation...
5954 UBool wasDoingPrimary
= FALSE
;
5955 // destination buffer byte counter. When this guy
5956 // gets to count, we're done with the iteration
5958 // used to count the zero bytes written after we
5959 // have finished with the sort key
5963 // Hm.... I think we're ready to plunge in. Basic story is as following:
5964 // we have a fall through case based on level. This is used for initial
5965 // positioning on iteration start. Every level processor contains a
5966 // for(;;) which will be broken when we exhaust all the CEs. Other
5967 // way to exit is a goto saveState, which happens when we have filled
5970 case UCOL_PSK_PRIMARY
:
5971 wasDoingPrimary
= TRUE
;
5976 // We should save the state only if we
5977 // are sure that we are done with the
5978 // previous iterator state
5979 if(canUpdateState
&& byteCountOrFrenchDone
== 0) {
5980 newState
= s
.iterator
->getState(s
.iterator
);
5981 if(newState
!= UITER_NO_STATE
) {
5982 iterState
= newState
;
5986 CE
= ucol_IGetNextCE(coll
, &s
, status
);
5988 if(CE
==UCOL_NO_MORE_CES
) {
5989 // Add the level separator
5990 terminatePSKLevel(level
, maxLevel
, i
, dest
);
5991 byteCountOrFrenchDone
=0;
5992 // Restart the iteration and move to the
5994 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
5996 level
= UCOL_PSK_SECONDARY
;
5999 if(!isContinuation(CE
)){
6000 if(coll
->leadBytePermutationTable
!= NULL
){
6001 CE
= (coll
->leadBytePermutationTable
[CE
>>24] << 24) | (CE
& 0x00FFFFFF);
6004 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6005 CE
>>= UCOL_PRIMARYORDERSHIFT
; /* get primary */
6007 if(byteCountOrFrenchDone
== 0) {
6008 // get the second byte of primary
6009 dest
[i
++]=(uint8_t)(CE
>> 8);
6011 byteCountOrFrenchDone
= 0;
6013 if((CE
&=0xff)!=0) {
6016 byteCountOrFrenchDone
= 1;
6020 dest
[i
++]=(uint8_t)CE
;
6024 if(uprv_numAvailableExpCEs(s
)) {
6025 canUpdateState
= FALSE
;
6027 canUpdateState
= TRUE
;
6030 /* fall through to next level */
6031 case UCOL_PSK_SECONDARY
:
6032 if(strength
>= UCOL_SECONDARY
) {
6038 // We should save the state only if we
6039 // are sure that we are done with the
6040 // previous iterator state
6041 if(canUpdateState
) {
6042 newState
= s
.iterator
->getState(s
.iterator
);
6043 if(newState
!= UITER_NO_STATE
) {
6044 iterState
= newState
;
6048 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6050 if(CE
==UCOL_NO_MORE_CES
) {
6051 // Add the level separator
6052 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6053 byteCountOrFrenchDone
= 0;
6054 // Restart the iteration and move to the
6056 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6058 level
= UCOL_PSK_CASE
;
6061 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6062 CE
>>= 8; /* get secondary */
6064 dest
[i
++]=(uint8_t)CE
;
6067 if(uprv_numAvailableExpCEs(s
)) {
6068 canUpdateState
= FALSE
;
6070 canUpdateState
= TRUE
;
6073 } else { // French secondary processing
6074 uint8_t frenchBuff
[UCOL_MAX_BUFFER
];
6075 int32_t frenchIndex
= 0;
6076 // Here we are going backwards.
6077 // If the iterator is at the beginning, it should be
6079 if(wasDoingPrimary
) {
6080 s
.iterator
->move(s
.iterator
, 0, UITER_LIMIT
);
6087 if(canUpdateState
) {
6088 newState
= s
.iterator
->getState(s
.iterator
);
6089 if(newState
!= UITER_NO_STATE
) {
6090 iterState
= newState
;
6094 CE
= ucol_IGetPrevCE(coll
, &s
, status
);
6096 if(CE
==UCOL_NO_MORE_CES
) {
6097 // Add the level separator
6098 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6099 byteCountOrFrenchDone
= 0;
6100 // Restart the iteration and move to the next level
6101 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6102 level
= UCOL_PSK_CASE
;
6105 if(isContinuation(CE
)) { // if it's a continuation, we want to save it and
6106 // reverse when we get a first non-continuation CE.
6108 frenchBuff
[frenchIndex
++] = (uint8_t)CE
;
6109 } else if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6110 CE
>>= 8; /* get secondary */
6113 dest
[i
++]=(uint8_t)CE
;
6116 frenchBuff
[frenchIndex
++] = (uint8_t)CE
;
6117 frenchIndex
-= usedFrench
;
6119 while(i
< count
&& frenchIndex
) {
6120 dest
[i
++] = frenchBuff
[--frenchIndex
];
6125 if(uprv_numAvailableExpCEs(s
)) {
6126 canUpdateState
= FALSE
;
6128 canUpdateState
= TRUE
;
6133 level
= UCOL_PSK_CASE
;
6135 /* fall through to next level */
6137 if(ucol_getAttribute(coll
, UCOL_CASE_LEVEL
, status
) == UCOL_ON
) {
6138 uint32_t caseShift
= UCOL_CASE_SHIFT_START
;
6139 uint8_t caseByte
= UCOL_CASE_BYTE_START
;
6140 uint8_t caseBits
= 0;
6143 U_ASSERT(caseShift
<= UCOL_CASE_SHIFT_START
);
6147 // We should save the state only if we
6148 // are sure that we are done with the
6149 // previous iterator state
6150 if(canUpdateState
) {
6151 newState
= s
.iterator
->getState(s
.iterator
);
6152 if(newState
!= UITER_NO_STATE
) {
6153 iterState
= newState
;
6157 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6159 if(CE
==UCOL_NO_MORE_CES
) {
6160 // On the case level we might have an unfinished
6161 // case byte. Add one if it's started.
6162 if(caseShift
!= UCOL_CASE_SHIFT_START
) {
6163 dest
[i
++] = caseByte
;
6166 // We have finished processing CEs on this level.
6167 // However, we don't know if we have enough space
6168 // to add a case level terminator.
6170 // Add the level separator
6171 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6172 // Restart the iteration and move to the
6174 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6175 level
= UCOL_PSK_TERTIARY
;
6177 canUpdateState
= FALSE
;
6182 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6183 if(!isContinuation(CE
) && ((CE
& UCOL_PRIMARYMASK
) != 0 || strength
> UCOL_PRIMARY
)) {
6184 // do the case level if we need to do it. We don't want to calculate
6185 // case level for primary ignorables if we have only primary strength and case level
6186 // otherwise we would break well formedness of CEs
6187 CE
= (uint8_t)(CE
& UCOL_BYTE_SIZE_MASK
);
6188 caseBits
= (uint8_t)(CE
& 0xC0);
6189 // this copies the case level logic from the
6190 // sort key generation code
6192 if (caseShift
== 0) {
6193 dest
[i
++] = caseByte
;
6194 caseShift
= UCOL_CASE_SHIFT_START
;
6195 caseByte
= UCOL_CASE_BYTE_START
;
6197 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
6198 if((caseBits
& 0xC0) == 0) {
6199 caseByte
|= 1 << (--caseShift
);
6201 caseByte
|= 0 << (--caseShift
);
6203 if(caseShift
== 0) {
6204 dest
[i
++] = caseByte
;
6205 caseShift
= UCOL_CASE_SHIFT_START
;
6206 caseByte
= UCOL_CASE_BYTE_START
;
6208 caseByte
|= ((caseBits
>>6)&1) << (--caseShift
);
6211 if((caseBits
& 0xC0) == 0) {
6212 caseByte
|= 0 << (--caseShift
);
6214 caseByte
|= 1 << (--caseShift
);
6216 if(caseShift
== 0) {
6217 dest
[i
++] = caseByte
;
6218 caseShift
= UCOL_CASE_SHIFT_START
;
6219 caseByte
= UCOL_CASE_BYTE_START
;
6221 caseByte
|= ((caseBits
>>7)&1) << (--caseShift
);
6228 // Not sure this is correct for the case level - revisit
6229 if(uprv_numAvailableExpCEs(s
)) {
6230 canUpdateState
= FALSE
;
6232 canUpdateState
= TRUE
;
6236 level
= UCOL_PSK_TERTIARY
;
6238 /* fall through to next level */
6239 case UCOL_PSK_TERTIARY
:
6240 if(strength
>= UCOL_TERTIARY
) {
6245 // We should save the state only if we
6246 // are sure that we are done with the
6247 // previous iterator state
6248 if(canUpdateState
) {
6249 newState
= s
.iterator
->getState(s
.iterator
);
6250 if(newState
!= UITER_NO_STATE
) {
6251 iterState
= newState
;
6255 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6257 if(CE
==UCOL_NO_MORE_CES
) {
6258 // Add the level separator
6259 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6260 byteCountOrFrenchDone
= 0;
6261 // Restart the iteration an move to the
6263 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6265 level
= UCOL_PSK_QUATERNARY
;
6268 if(!isShiftedCE(CE
, LVT
, &wasShifted
)) {
6269 notIsContinuation
= !isContinuation(CE
);
6271 if(notIsContinuation
) {
6272 CE
= (uint8_t)(CE
& UCOL_BYTE_SIZE_MASK
);
6273 CE
^= coll
->caseSwitch
;
6274 CE
&= coll
->tertiaryMask
;
6276 CE
= (uint8_t)((CE
& UCOL_REMOVE_CONTINUATION
));
6280 dest
[i
++]=(uint8_t)CE
;
6283 if(uprv_numAvailableExpCEs(s
)) {
6284 canUpdateState
= FALSE
;
6286 canUpdateState
= TRUE
;
6290 // if we're not doing tertiary
6292 level
= UCOL_PSK_NULL
;
6294 /* fall through to next level */
6295 case UCOL_PSK_QUATERNARY
:
6296 if(strength
>= UCOL_QUATERNARY
) {
6301 // We should save the state only if we
6302 // are sure that we are done with the
6303 // previous iterator state
6304 if(canUpdateState
) {
6305 newState
= s
.iterator
->getState(s
.iterator
);
6306 if(newState
!= UITER_NO_STATE
) {
6307 iterState
= newState
;
6311 CE
= ucol_IGetNextCE(coll
, &s
, status
);
6313 if(CE
==UCOL_NO_MORE_CES
) {
6314 // Add the level separator
6315 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6316 //dest[i++] = UCOL_LEVELTERMINATOR;
6317 byteCountOrFrenchDone
= 0;
6318 // Restart the iteration an move to the
6320 s
.iterator
->move(s
.iterator
, 0, UITER_START
);
6322 level
= UCOL_PSK_QUIN
;
6327 if(isShiftedCE(CE
, LVT
, &wasShifted
)) {
6328 CE
>>= 16; /* get primary */
6330 if(byteCountOrFrenchDone
== 0) {
6331 dest
[i
++]=(uint8_t)(CE
>> 8);
6333 byteCountOrFrenchDone
= 0;
6335 if((CE
&=0xff)!=0) {
6338 byteCountOrFrenchDone
= 1;
6341 dest
[i
++]=(uint8_t)CE
;
6345 notIsContinuation
= !isContinuation(CE
);
6346 if(notIsContinuation
) {
6347 if(s
.flags
& UCOL_WAS_HIRAGANA
) { // This was Hiragana and we need to note it
6348 dest
[i
++] = UCOL_HIRAGANA_QUAD
;
6354 if(uprv_numAvailableExpCEs(s
)) {
6355 canUpdateState
= FALSE
;
6357 canUpdateState
= TRUE
;
6361 // if we're not doing quaternary
6363 level
= UCOL_PSK_NULL
;
6365 /* fall through to next level */
6367 level
= UCOL_PSK_IDENTICAL
;
6368 /* fall through to next level */
6369 case UCOL_PSK_IDENTICAL
:
6370 if(strength
>= UCOL_IDENTICAL
) {
6371 UChar32 first
, second
;
6372 int32_t bocsuBytesWritten
= 0;
6373 // We always need to do identical on
6374 // the NFD form of the string.
6375 if(normIter
== NULL
) {
6376 // we arrived from the level below and
6377 // normalization was not turned on.
6378 // therefore, we need to make a fresh NFD iterator
6379 normIter
= unorm_openIter(stackNormIter
, sizeof(stackNormIter
), status
);
6380 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
6381 } else if(!doingIdenticalFromStart
) {
6382 // there is an iterator, but we did some other levels.
6383 // therefore, we have a FCD iterator - need to make
6385 // normIter being at the beginning does not guarantee
6386 // that the underlying iterator is at the beginning
6387 iter
->move(iter
, 0, UITER_START
);
6388 s
.iterator
= unorm_setIter(normIter
, iter
, UNORM_NFD
, status
);
6390 // At this point we have a NFD iterator that is positioned
6391 // in the right place
6392 if(U_FAILURE(*status
)) {
6393 UTRACE_EXIT_STATUS(*status
);
6396 first
= uiter_previous32(s
.iterator
);
6397 // maybe we're at the start of the string
6398 if(first
== U_SENTINEL
) {
6401 uiter_next32(s
.iterator
);
6407 if(j
+1 < bocsuBytesWritten
) {
6408 bocsuBytesUsed
= j
+1;
6413 // On identical level, we will always save
6414 // the state if we reach this point, since
6415 // we don't depend on getNextCE for content
6416 // all the content is in our buffer and we
6417 // already either stored the full buffer OR
6418 // otherwise we won't arrive here.
6419 newState
= s
.iterator
->getState(s
.iterator
);
6420 if(newState
!= UITER_NO_STATE
) {
6421 iterState
= newState
;
6426 second
= uiter_next32(s
.iterator
);
6429 // end condition for identical level
6430 if(second
== U_SENTINEL
) {
6431 terminatePSKLevel(level
, maxLevel
, i
, dest
);
6432 level
= UCOL_PSK_NULL
;
6435 bocsuBytesWritten
= u_writeIdenticalLevelRunTwoChars(first
, second
, buff
);
6439 if(bocsuBytesUsed
!= 0) {
6440 while(bocsuBytesUsed
-->0) {
6445 while(i
< count
&& j
< bocsuBytesWritten
) {
6446 dest
[i
++] = buff
[j
++];
6451 level
= UCOL_PSK_NULL
;
6453 /* fall through to next level */
6461 *status
= U_INTERNAL_PROGRAM_ERROR
;
6462 UTRACE_EXIT_STATUS(*status
);
6467 // Now we need to return stuff. First we want to see whether we have
6468 // done everything for the current state of iterator.
6469 if(byteCountOrFrenchDone
6470 || canUpdateState
== FALSE
6471 || (newState
= s
.iterator
->getState(s
.iterator
)) == UITER_NO_STATE
)
6473 // Any of above mean that the previous transaction
6474 // wasn't finished and that we should store the
6475 // previous iterator state.
6476 state
[0] = iterState
;
6478 // The transaction is complete. We will continue in the next iteration.
6479 state
[0] = s
.iterator
->getState(s
.iterator
);
6482 // Store the number of bocsu bytes written.
6483 if((bocsuBytesUsed
& UCOL_PSK_BOCSU_BYTES_MASK
) != bocsuBytesUsed
) {
6484 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6486 state
[1] = (bocsuBytesUsed
& UCOL_PSK_BOCSU_BYTES_MASK
) << UCOL_PSK_BOCSU_BYTES_SHIFT
;
6488 // Next we put in the level of comparison
6489 state
[1] |= ((level
& UCOL_PSK_LEVEL_MASK
) << UCOL_PSK_LEVEL_SHIFT
);
6491 // If we are doing French, we need to store whether we have just finished the French level
6492 if(level
== UCOL_PSK_SECONDARY
&& doingFrench
) {
6493 state
[1] |= (((state
[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
);
6495 state
[1] |= ((byteCountOrFrenchDone
& UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK
) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT
);
6498 // Was the latest CE shifted
6500 state
[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT
;
6502 // Check for cces overflow
6503 if((cces
& UCOL_PSK_CONSUMED_CES_MASK
) != cces
) {
6504 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6507 state
[1] |= ((cces
& UCOL_PSK_CONSUMED_CES_MASK
) << UCOL_PSK_CONSUMED_CES_SHIFT
);
6509 // Check for French overflow
6510 if((usedFrench
& UCOL_PSK_USED_FRENCH_MASK
) != usedFrench
) {
6511 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
6513 // Store number of bytes written in the French secondary continuation sequence
6514 state
[1] |= ((usedFrench
& UCOL_PSK_USED_FRENCH_MASK
) << UCOL_PSK_USED_FRENCH_SHIFT
);
6517 // If we have used normalizing iterator, get rid of it
6518 if(normIter
!= NULL
) {
6519 unorm_closeIter(normIter
);
6522 /* To avoid memory leak, free the offset buffer if necessary. */
6523 ucol_freeOffsetBuffer(&s
);
6525 // Return number of meaningful sortkey bytes.
6526 UTRACE_DATA4(UTRACE_VERBOSE
, "dest = %vb, state=%d %d",
6527 dest
,i
, state
[0], state
[1]);
6528 UTRACE_EXIT_VALUE(i
);
6533 * Produce a bound for a given sortkey and a number of levels.
6535 U_CAPI
int32_t U_EXPORT2
6536 ucol_getBound(const uint8_t *source
,
6537 int32_t sourceLength
,
6538 UColBoundMode boundType
,
6539 uint32_t noOfLevels
,
6541 int32_t resultLength
,
6544 // consistency checks
6545 if(status
== NULL
|| U_FAILURE(*status
)) {
6548 if(source
== NULL
) {
6549 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6553 int32_t sourceIndex
= 0;
6554 // Scan the string until we skip enough of the key OR reach the end of the key
6557 if(source
[sourceIndex
] == UCOL_LEVELTERMINATOR
) {
6560 } while (noOfLevels
> 0
6561 && (source
[sourceIndex
] != 0 || sourceIndex
< sourceLength
));
6563 if((source
[sourceIndex
] == 0 || sourceIndex
== sourceLength
)
6564 && noOfLevels
> 0) {
6565 *status
= U_SORT_KEY_TOO_SHORT_WARNING
;
6569 // READ ME: this code assumes that the values for boundType
6570 // enum will not changes. They are set so that the enum value
6571 // corresponds to the number of extra bytes each bound type
6573 if(result
!= NULL
&& resultLength
>= sourceIndex
+boundType
) {
6574 uprv_memcpy(result
, source
, sourceIndex
);
6576 // Lower bound just gets terminated. No extra bytes
6577 case UCOL_BOUND_LOWER
: // = 0
6579 // Upper bound needs one extra byte
6580 case UCOL_BOUND_UPPER
: // = 1
6581 result
[sourceIndex
++] = 2;
6583 // Upper long bound needs two extra bytes
6584 case UCOL_BOUND_UPPER_LONG
: // = 2
6585 result
[sourceIndex
++] = 0xFF;
6586 result
[sourceIndex
++] = 0xFF;
6589 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6592 result
[sourceIndex
++] = 0;
6596 return sourceIndex
+boundType
+1;
6600 /****************************************************************************/
6601 /* Following are the functions that deal with the properties of a collator */
6602 /* there are new APIs and some compatibility APIs */
6603 /****************************************************************************/
6606 ucol_addLatinOneEntry(UCollator
*coll
, UChar ch
, uint32_t CE
,
6607 int32_t *primShift
, int32_t *secShift
, int32_t *terShift
)
6609 uint8_t primary1
= 0, primary2
= 0, secondary
= 0, tertiary
= 0;
6610 UBool reverseSecondary
= FALSE
;
6611 UBool continuation
= isContinuation(CE
);
6613 tertiary
= (uint8_t)((CE
& coll
->tertiaryMask
));
6614 tertiary
^= coll
->caseSwitch
;
6615 reverseSecondary
= TRUE
;
6617 tertiary
= (uint8_t)((CE
& UCOL_REMOVE_CONTINUATION
));
6618 tertiary
&= UCOL_REMOVE_CASE
;
6619 reverseSecondary
= FALSE
;
6622 secondary
= (uint8_t)((CE
>>= 8) & UCOL_BYTE_SIZE_MASK
);
6623 primary2
= (uint8_t)((CE
>>= 8) & UCOL_BYTE_SIZE_MASK
);
6624 primary1
= (uint8_t)(CE
>> 8);
6627 if (coll
->leadBytePermutationTable
!= NULL
&& !continuation
) {
6628 primary1
= coll
->leadBytePermutationTable
[primary1
];
6631 coll
->latinOneCEs
[ch
] |= (primary1
<< *primShift
);
6635 if(*primShift
< 0) {
6636 coll
->latinOneCEs
[ch
] = UCOL_BAIL_OUT_CE
;
6637 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6638 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6641 coll
->latinOneCEs
[ch
] |= (primary2
<< *primShift
);
6644 if(secondary
!= 0) {
6645 if(reverseSecondary
&& coll
->frenchCollation
== UCOL_ON
) { // reverse secondary
6646 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] >>= 8; // make space for secondary
6647 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] |= (secondary
<< 24);
6648 } else { // normal case
6649 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] |= (secondary
<< *secShift
);
6654 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] |= (tertiary
<< *terShift
);
6660 ucol_resizeLatinOneTable(UCollator
*coll
, int32_t size
, UErrorCode
*status
) {
6661 uint32_t *newTable
= (uint32_t *)uprv_malloc(size
*sizeof(uint32_t)*3);
6662 if(newTable
== NULL
) {
6663 *status
= U_MEMORY_ALLOCATION_ERROR
;
6664 coll
->latinOneFailed
= TRUE
;
6667 int32_t sizeToCopy
= ((size
<coll
->latinOneTableLen
)?size
:coll
->latinOneTableLen
)*sizeof(uint32_t);
6668 uprv_memset(newTable
, 0, size
*sizeof(uint32_t)*3);
6669 uprv_memcpy(newTable
, coll
->latinOneCEs
, sizeToCopy
);
6670 uprv_memcpy(newTable
+size
, coll
->latinOneCEs
+coll
->latinOneTableLen
, sizeToCopy
);
6671 uprv_memcpy(newTable
+2*size
, coll
->latinOneCEs
+2*coll
->latinOneTableLen
, sizeToCopy
);
6672 coll
->latinOneTableLen
= size
;
6673 uprv_free(coll
->latinOneCEs
);
6674 coll
->latinOneCEs
= newTable
;
6679 ucol_setUpLatinOne(UCollator
*coll
, UErrorCode
*status
) {
6680 UBool result
= TRUE
;
6681 if(coll
->latinOneCEs
== NULL
) {
6682 coll
->latinOneCEs
= (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN
*3);
6683 if(coll
->latinOneCEs
== NULL
) {
6684 *status
= U_MEMORY_ALLOCATION_ERROR
;
6687 coll
->latinOneTableLen
= UCOL_LATINONETABLELEN
;
6690 UCollationElements
*it
= ucol_openElements(coll
, &ch
, 1, status
);
6691 // Check for null pointer
6692 if (U_FAILURE(*status
)) {
6695 uprv_memset(coll
->latinOneCEs
, 0, sizeof(uint32_t)*coll
->latinOneTableLen
*3);
6697 int32_t primShift
= 24, secShift
= 24, terShift
= 24;
6699 int32_t contractionOffset
= UCOL_ENDOFLATINONERANGE
+1;
6701 // TODO: make safe if you get more than you wanted...
6702 for(ch
= 0; ch
<= UCOL_ENDOFLATINONERANGE
; ch
++) {
6703 primShift
= 24; secShift
= 24; terShift
= 24;
6705 CE
= coll
->latinOneMapping
[ch
];
6707 CE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, ch
);
6708 if(CE
== UCOL_NOT_FOUND
&& coll
->UCA
) {
6709 CE
= UTRIE_GET32_FROM_LEAD(&coll
->UCA
->mapping
, ch
);
6712 if(CE
< UCOL_NOT_FOUND
) {
6713 ucol_addLatinOneEntry(coll
, ch
, CE
, &primShift
, &secShift
, &terShift
);
6715 switch (getCETag(CE
)) {
6718 ucol_setText(it
, &ch
, 1, status
);
6719 while((int32_t)(CE
= ucol_next(it
, status
)) != UCOL_NULLORDER
) {
6720 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
6721 coll
->latinOneCEs
[ch
] = UCOL_BAIL_OUT_CE
;
6722 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6723 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = UCOL_BAIL_OUT_CE
;
6726 ucol_addLatinOneEntry(coll
, ch
, CE
, &primShift
, &secShift
, &terShift
);
6729 case CONTRACTION_TAG
:
6730 // here is the trick
6731 // F2 is contraction. We do something very similar to contractions
6732 // but have two indices, one in the real contraction table and the
6733 // other to where we stuffed things. This hopes that we don't have
6734 // many contractions (this should work for latin-1 tables).
6736 if((CE
& 0x00FFF000) != 0) {
6737 *status
= U_UNSUPPORTED_ERROR
;
6738 goto cleanup_after_failure
;
6741 const UChar
*UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
);
6743 CE
|= (contractionOffset
& 0xFFF) << 12; // insert the offset in latin-1 table
6745 coll
->latinOneCEs
[ch
] = CE
;
6746 coll
->latinOneCEs
[coll
->latinOneTableLen
+ch
] = CE
;
6747 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+ch
] = CE
;
6749 // We're going to jump into contraction table, pick the elements
6752 CE
= *(coll
->contractionCEs
+
6753 (UCharOffset
- coll
->contractionIndex
));
6754 if(CE
> UCOL_NOT_FOUND
&& getCETag(CE
) == EXPANSION_TAG
) {
6756 uint32_t i
; /* general counter */
6757 uint32_t *CEOffset
= (uint32_t *)coll
->image
+getExpansionOffset(CE
); /* find the offset to expansion table */
6758 size
= getExpansionCount(CE
);
6760 if(size
!= 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6761 for(i
= 0; i
<size
; i
++) {
6762 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
6763 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6764 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6765 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6768 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
, *CEOffset
++, &primShift
, &secShift
, &terShift
);
6770 } else { /* else, we do */
6771 while(*CEOffset
!= 0) {
6772 if(primShift
< 0 || secShift
< 0 || terShift
< 0) {
6773 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6774 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6775 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6778 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
, *CEOffset
++, &primShift
, &secShift
, &terShift
);
6781 contractionOffset
++;
6782 } else if(CE
< UCOL_NOT_FOUND
) {
6783 ucol_addLatinOneEntry(coll
, (UChar
)contractionOffset
++, CE
, &primShift
, &secShift
, &terShift
);
6785 coll
->latinOneCEs
[(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6786 coll
->latinOneCEs
[coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6787 coll
->latinOneCEs
[2*coll
->latinOneTableLen
+(UChar
)contractionOffset
] = UCOL_BAIL_OUT_CE
;
6788 contractionOffset
++;
6791 primShift
= 24; secShift
= 24; terShift
= 24;
6792 if(contractionOffset
== coll
->latinOneTableLen
) { // we need to reallocate
6793 if(!ucol_resizeLatinOneTable(coll
, 2*coll
->latinOneTableLen
, status
)) {
6794 goto cleanup_after_failure
;
6797 } while(*UCharOffset
!= 0xFFFF);
6802 // 0xB7 is a precontext character defined in UCA5.1, a special
6803 // handle is implemeted in order to save LatinOne table for
6806 ucol_addLatinOneEntry(coll
, ch
, CE
, &primShift
, &secShift
, &terShift
);
6809 goto cleanup_after_failure
;
6814 goto cleanup_after_failure
;
6819 if(contractionOffset
< coll
->latinOneTableLen
) {
6820 if(!ucol_resizeLatinOneTable(coll
, contractionOffset
, status
)) {
6821 goto cleanup_after_failure
;
6824 ucol_closeElements(it
);
6827 cleanup_after_failure
:
6828 // status should already be set before arriving here.
6829 coll
->latinOneFailed
= TRUE
;
6830 ucol_closeElements(it
);
6834 void ucol_updateInternalState(UCollator
*coll
, UErrorCode
*status
) {
6835 if(U_SUCCESS(*status
)) {
6836 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
6837 coll
->caseSwitch
= UCOL_CASE_SWITCH
;
6839 coll
->caseSwitch
= UCOL_NO_CASE_SWITCH
;
6842 if(coll
->caseLevel
== UCOL_ON
|| coll
->caseFirst
== UCOL_OFF
) {
6843 coll
->tertiaryMask
= UCOL_REMOVE_CASE
;
6844 coll
->tertiaryCommon
= UCOL_COMMON3_NORMAL
;
6845 coll
->tertiaryAddition
= (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF
; /* Should be 0x80 */
6846 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_OFF
;
6847 coll
->tertiaryBottom
= UCOL_COMMON_BOT3
;
6849 coll
->tertiaryMask
= UCOL_KEEP_CASE
;
6850 coll
->tertiaryAddition
= UCOL_FLAG_BIT_MASK_CASE_SW_ON
;
6851 if(coll
->caseFirst
== UCOL_UPPER_FIRST
) {
6852 coll
->tertiaryCommon
= UCOL_COMMON3_UPPERFIRST
;
6853 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_UPPER
;
6854 coll
->tertiaryBottom
= UCOL_COMMON_BOTTOM3_CASE_SW_UPPER
;
6856 coll
->tertiaryCommon
= UCOL_COMMON3_NORMAL
;
6857 coll
->tertiaryTop
= UCOL_COMMON_TOP3_CASE_SW_LOWER
;
6858 coll
->tertiaryBottom
= UCOL_COMMON_BOTTOM3_CASE_SW_LOWER
;
6862 /* Set the compression values */
6863 uint8_t tertiaryTotal
= (uint8_t)(coll
->tertiaryTop
- UCOL_COMMON_BOT3
-1);
6864 coll
->tertiaryTopCount
= (uint8_t)(UCOL_PROPORTION3
*tertiaryTotal
); /* we multilply double with int, but need only int */
6865 coll
->tertiaryBottomCount
= (uint8_t)(tertiaryTotal
- coll
->tertiaryTopCount
);
6867 if(coll
->caseLevel
== UCOL_OFF
&& coll
->strength
== UCOL_TERTIARY
6868 && coll
->frenchCollation
== UCOL_OFF
&& coll
->alternateHandling
== UCOL_NON_IGNORABLE
)
6870 coll
->sortKeyGen
= ucol_calcSortKeySimpleTertiary
;
6872 coll
->sortKeyGen
= ucol_calcSortKey
;
6874 if(coll
->caseLevel
== UCOL_OFF
&& coll
->strength
<= UCOL_TERTIARY
&& coll
->numericCollation
== UCOL_OFF
6875 && coll
->alternateHandling
== UCOL_NON_IGNORABLE
&& !coll
->latinOneFailed
)
6877 if(coll
->latinOneCEs
== NULL
|| coll
->latinOneRegenTable
) {
6878 if(ucol_setUpLatinOne(coll
, status
)) { // if we succeed in building latin1 table, we'll use it
6879 //fprintf(stderr, "F");
6880 coll
->latinOneUse
= TRUE
;
6882 coll
->latinOneUse
= FALSE
;
6884 if(*status
== U_UNSUPPORTED_ERROR
) {
6885 *status
= U_ZERO_ERROR
;
6887 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6888 coll
->latinOneUse
= TRUE
;
6891 coll
->latinOneUse
= FALSE
;
6896 U_CAPI
uint32_t U_EXPORT2
6897 ucol_setVariableTop(UCollator
*coll
, const UChar
*varTop
, int32_t len
, UErrorCode
*status
) {
6898 if(U_FAILURE(*status
) || coll
== NULL
) {
6902 len
= u_strlen(varTop
);
6905 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6910 IInit_collIterate(coll
, varTop
, len
, &s
, status
);
6911 if(U_FAILURE(*status
)) {
6915 uint32_t CE
= ucol_IGetNextCE(coll
, &s
, status
);
6917 /* here we check if we have consumed all characters */
6918 /* you can put in either one character or a contraction */
6919 /* you shouldn't put more... */
6920 if(s
.pos
!= s
.endp
|| CE
== UCOL_NO_MORE_CES
) {
6921 *status
= U_CE_NOT_FOUND_ERROR
;
6925 uint32_t nextCE
= ucol_IGetNextCE(coll
, &s
, status
);
6927 if(isContinuation(nextCE
) && (nextCE
& UCOL_PRIMARYMASK
) != 0) {
6928 *status
= U_PRIMARY_TOO_LONG_ERROR
;
6931 if(coll
->variableTopValue
!= (CE
& UCOL_PRIMARYMASK
)>>16) {
6932 coll
->variableTopValueisDefault
= FALSE
;
6933 coll
->variableTopValue
= (CE
& UCOL_PRIMARYMASK
)>>16;
6936 /* To avoid memory leak, free the offset buffer if necessary. */
6937 ucol_freeOffsetBuffer(&s
);
6939 return CE
& UCOL_PRIMARYMASK
;
6942 U_CAPI
uint32_t U_EXPORT2
ucol_getVariableTop(const UCollator
*coll
, UErrorCode
*status
) {
6943 if(U_FAILURE(*status
) || coll
== NULL
) {
6946 return coll
->variableTopValue
<<16;
6949 U_CAPI
void U_EXPORT2
6950 ucol_restoreVariableTop(UCollator
*coll
, const uint32_t varTop
, UErrorCode
*status
) {
6951 if(U_FAILURE(*status
) || coll
== NULL
) {
6955 if(coll
->variableTopValue
!= (varTop
& UCOL_PRIMARYMASK
)>>16) {
6956 coll
->variableTopValueisDefault
= FALSE
;
6957 coll
->variableTopValue
= (varTop
& UCOL_PRIMARYMASK
)>>16;
6960 /* Attribute setter API */
6961 U_CAPI
void U_EXPORT2
6962 ucol_setAttribute(UCollator
*coll
, UColAttribute attr
, UColAttributeValue value
, UErrorCode
*status
) {
6963 if(U_FAILURE(*status
) || coll
== NULL
) {
6966 UColAttributeValue oldFrench
= coll
->frenchCollation
;
6967 UColAttributeValue oldCaseFirst
= coll
->caseFirst
;
6969 case UCOL_NUMERIC_COLLATION
: /* sort substrings of digits as numbers */
6970 if(value
== UCOL_ON
) {
6971 coll
->numericCollation
= UCOL_ON
;
6972 coll
->numericCollationisDefault
= FALSE
;
6973 } else if (value
== UCOL_OFF
) {
6974 coll
->numericCollation
= UCOL_OFF
;
6975 coll
->numericCollationisDefault
= FALSE
;
6976 } else if (value
== UCOL_DEFAULT
) {
6977 coll
->numericCollationisDefault
= TRUE
;
6978 coll
->numericCollation
= (UColAttributeValue
)coll
->options
->numericCollation
;
6980 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6983 case UCOL_HIRAGANA_QUATERNARY_MODE
: /* special quaternary values for Hiragana */
6984 if(value
== UCOL_ON
) {
6985 coll
->hiraganaQ
= UCOL_ON
;
6986 coll
->hiraganaQisDefault
= FALSE
;
6987 } else if (value
== UCOL_OFF
) {
6988 coll
->hiraganaQ
= UCOL_OFF
;
6989 coll
->hiraganaQisDefault
= FALSE
;
6990 } else if (value
== UCOL_DEFAULT
) {
6991 coll
->hiraganaQisDefault
= TRUE
;
6992 coll
->hiraganaQ
= (UColAttributeValue
)coll
->options
->hiraganaQ
;
6994 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
6997 case UCOL_FRENCH_COLLATION
: /* attribute for direction of secondary weights*/
6998 if(value
== UCOL_ON
) {
6999 coll
->frenchCollation
= UCOL_ON
;
7000 coll
->frenchCollationisDefault
= FALSE
;
7001 } else if (value
== UCOL_OFF
) {
7002 coll
->frenchCollation
= UCOL_OFF
;
7003 coll
->frenchCollationisDefault
= FALSE
;
7004 } else if (value
== UCOL_DEFAULT
) {
7005 coll
->frenchCollationisDefault
= TRUE
;
7006 coll
->frenchCollation
= (UColAttributeValue
)coll
->options
->frenchCollation
;
7008 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7011 case UCOL_ALTERNATE_HANDLING
: /* attribute for handling variable elements*/
7012 if(value
== UCOL_SHIFTED
) {
7013 coll
->alternateHandling
= UCOL_SHIFTED
;
7014 coll
->alternateHandlingisDefault
= FALSE
;
7015 } else if (value
== UCOL_NON_IGNORABLE
) {
7016 coll
->alternateHandling
= UCOL_NON_IGNORABLE
;
7017 coll
->alternateHandlingisDefault
= FALSE
;
7018 } else if (value
== UCOL_DEFAULT
) {
7019 coll
->alternateHandlingisDefault
= TRUE
;
7020 coll
->alternateHandling
= (UColAttributeValue
)coll
->options
->alternateHandling
;
7022 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7025 case UCOL_CASE_FIRST
: /* who goes first, lower case or uppercase */
7026 if(value
== UCOL_LOWER_FIRST
) {
7027 coll
->caseFirst
= UCOL_LOWER_FIRST
;
7028 coll
->caseFirstisDefault
= FALSE
;
7029 } else if (value
== UCOL_UPPER_FIRST
) {
7030 coll
->caseFirst
= UCOL_UPPER_FIRST
;
7031 coll
->caseFirstisDefault
= FALSE
;
7032 } else if (value
== UCOL_OFF
) {
7033 coll
->caseFirst
= UCOL_OFF
;
7034 coll
->caseFirstisDefault
= FALSE
;
7035 } else if (value
== UCOL_DEFAULT
) {
7036 coll
->caseFirst
= (UColAttributeValue
)coll
->options
->caseFirst
;
7037 coll
->caseFirstisDefault
= TRUE
;
7039 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7042 case UCOL_CASE_LEVEL
: /* do we have an extra case level */
7043 if(value
== UCOL_ON
) {
7044 coll
->caseLevel
= UCOL_ON
;
7045 coll
->caseLevelisDefault
= FALSE
;
7046 } else if (value
== UCOL_OFF
) {
7047 coll
->caseLevel
= UCOL_OFF
;
7048 coll
->caseLevelisDefault
= FALSE
;
7049 } else if (value
== UCOL_DEFAULT
) {
7050 coll
->caseLevel
= (UColAttributeValue
)coll
->options
->caseLevel
;
7051 coll
->caseLevelisDefault
= TRUE
;
7053 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7056 case UCOL_NORMALIZATION_MODE
: /* attribute for normalization */
7057 if(value
== UCOL_ON
) {
7058 coll
->normalizationMode
= UCOL_ON
;
7059 coll
->normalizationModeisDefault
= FALSE
;
7060 initializeFCD(status
);
7061 } else if (value
== UCOL_OFF
) {
7062 coll
->normalizationMode
= UCOL_OFF
;
7063 coll
->normalizationModeisDefault
= FALSE
;
7064 } else if (value
== UCOL_DEFAULT
) {
7065 coll
->normalizationModeisDefault
= TRUE
;
7066 coll
->normalizationMode
= (UColAttributeValue
)coll
->options
->normalizationMode
;
7067 if(coll
->normalizationMode
== UCOL_ON
) {
7068 initializeFCD(status
);
7071 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7074 case UCOL_STRENGTH
: /* attribute for strength */
7075 if (value
== UCOL_DEFAULT
) {
7076 coll
->strengthisDefault
= TRUE
;
7077 coll
->strength
= (UColAttributeValue
)coll
->options
->strength
;
7078 } else if (value
<= UCOL_IDENTICAL
) {
7079 coll
->strengthisDefault
= FALSE
;
7080 coll
->strength
= value
;
7082 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7085 case UCOL_ATTRIBUTE_COUNT
:
7087 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7090 if(oldFrench
!= coll
->frenchCollation
|| oldCaseFirst
!= coll
->caseFirst
) {
7091 coll
->latinOneRegenTable
= TRUE
;
7093 coll
->latinOneRegenTable
= FALSE
;
7095 ucol_updateInternalState(coll
, status
);
7098 U_CAPI UColAttributeValue U_EXPORT2
7099 ucol_getAttribute(const UCollator
*coll
, UColAttribute attr
, UErrorCode
*status
) {
7100 if(U_FAILURE(*status
) || coll
== NULL
) {
7101 return UCOL_DEFAULT
;
7104 case UCOL_NUMERIC_COLLATION
:
7105 return coll
->numericCollation
;
7106 case UCOL_HIRAGANA_QUATERNARY_MODE
:
7107 return coll
->hiraganaQ
;
7108 case UCOL_FRENCH_COLLATION
: /* attribute for direction of secondary weights*/
7109 return coll
->frenchCollation
;
7110 case UCOL_ALTERNATE_HANDLING
: /* attribute for handling variable elements*/
7111 return coll
->alternateHandling
;
7112 case UCOL_CASE_FIRST
: /* who goes first, lower case or uppercase */
7113 return coll
->caseFirst
;
7114 case UCOL_CASE_LEVEL
: /* do we have an extra case level */
7115 return coll
->caseLevel
;
7116 case UCOL_NORMALIZATION_MODE
: /* attribute for normalization */
7117 return coll
->normalizationMode
;
7118 case UCOL_STRENGTH
: /* attribute for strength */
7119 return coll
->strength
;
7120 case UCOL_ATTRIBUTE_COUNT
:
7122 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
7125 return UCOL_DEFAULT
;
7128 U_CAPI
void U_EXPORT2
7129 ucol_setStrength( UCollator
*coll
,
7130 UCollationStrength strength
)
7132 UErrorCode status
= U_ZERO_ERROR
;
7133 ucol_setAttribute(coll
, UCOL_STRENGTH
, strength
, &status
);
7136 U_CAPI UCollationStrength U_EXPORT2
7137 ucol_getStrength(const UCollator
*coll
)
7139 UErrorCode status
= U_ZERO_ERROR
;
7140 return ucol_getAttribute(coll
, UCOL_STRENGTH
, &status
);
7143 U_INTERNAL
int32_t U_EXPORT2
7144 ucol_getReorderCodes(const UCollator
*coll
,
7146 int32_t destCapacity
,
7147 UErrorCode
*pErrorCode
) {
7148 if (U_FAILURE(*pErrorCode
)) {
7152 if (destCapacity
< 0 || (destCapacity
> 0 && dest
== NULL
)) {
7153 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
7157 if (coll
->reorderCodesLength
> destCapacity
) {
7158 *pErrorCode
= U_BUFFER_OVERFLOW_ERROR
;
7159 return coll
->reorderCodesLength
;
7161 for (int32_t i
= 0; i
< coll
->reorderCodesLength
; i
++) {
7162 dest
[i
] = coll
->reorderCodes
[i
];
7164 return coll
->reorderCodesLength
;
7167 U_INTERNAL
void U_EXPORT2
7168 ucol_setReorderCodes(UCollator
*coll
,
7169 const int32_t *reorderCodes
,
7170 int32_t reorderCodesLength
,
7171 UErrorCode
*pErrorCode
) {
7172 if (U_FAILURE(*pErrorCode
)) {
7176 if (reorderCodesLength
< 0 || (reorderCodesLength
> 0 && reorderCodes
== NULL
)) {
7177 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
7181 uprv_free(coll
->reorderCodes
);
7182 coll
->reorderCodes
= NULL
;
7183 coll
->reorderCodesLength
= 0;
7184 if (reorderCodesLength
== 0) {
7185 uprv_free(coll
->leadBytePermutationTable
);
7186 coll
->leadBytePermutationTable
= NULL
;
7189 coll
->reorderCodes
= (int32_t*) uprv_malloc(reorderCodesLength
* sizeof(int32_t));
7190 if (coll
->reorderCodes
== NULL
) {
7191 *pErrorCode
= U_MEMORY_ALLOCATION_ERROR
;
7194 for (int32_t i
= 0; i
< reorderCodesLength
; i
++) {
7195 coll
->reorderCodes
[i
] = reorderCodes
[i
];
7197 coll
->reorderCodesLength
= reorderCodesLength
;
7198 ucol_buildPermutationTable(coll
, pErrorCode
);
7199 if (U_FAILURE(*pErrorCode
)) {
7200 uprv_free(coll
->reorderCodes
);
7201 coll
->reorderCodes
= NULL
;
7202 coll
->reorderCodesLength
= 0;
7207 /****************************************************************************/
7208 /* Following are misc functions */
7209 /* there are new APIs and some compatibility APIs */
7210 /****************************************************************************/
// ucol_getVersion: fills the four bytes of versionInfo from the collator.
//
//  versionInfo[0], [1]: high and low byte of a 16-bit value combining
//                       UCOL_RUNTIME_VERSION (shifted left 11), the builder
//                       version coll->image->version[0] (shifted left 6)
//                       and the charset version, which is fixed at 0 here.
//  versionInfo[2]:      coll->image->version[1].
//  versionInfo[3]:      UCA version, major in the upper five bits and
//                       minor in the lower three.
// coll, coll->image and coll->UCA are dereferenced without NULL checks.
7212 U_CAPI
void U_EXPORT2
7213 ucol_getVersion(const UCollator
* coll
,
7214 UVersionInfo versionInfo
)
7216 /* RunTime version */
7217 uint8_t rtVersion
= UCOL_RUNTIME_VERSION
;
7218 /* Builder version*/
7219 uint8_t bdVersion
= coll
->image
->version
[0];
7221 /* Charset Version. Need to get the version from cnv files
7222 * makeconv should populate cnv files with version and
7223 * an api has to be provided in ucnv.h to obtain this version
7225 uint8_t csVersion
= 0;
7227 /* combine the version info */
7228 uint16_t cmbVersion
= (uint16_t)((rtVersion
<<11) | (bdVersion
<<6) | (csVersion
));
7230 /* Tailoring rules */
7231 versionInfo
[0] = (uint8_t)(cmbVersion
>>8);
7232 versionInfo
[1] = (uint8_t)cmbVersion
;
7233 versionInfo
[2] = coll
->image
->version
[1];
7235 /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
7236 versionInfo
[3] = (coll
->UCA
->image
->UCAVersion
[0] & 0x1f) << 3 | (coll
->UCA
->image
->UCAVersion
[1] & 0x07);
7243 /* This internal API checks whether a character is tailored or not */
// The answer is derived from the collation element (CE) stored for u in
// this collator:
//  - An incoming failure status, a NULL collator, or a collator that is
//    its own UCA all take the first guard (body not visible in this
//    listing - presumably returns FALSE; confirm).
//  - Code units below 0x100 are looked up in coll->latinOneMapping and
//    compared with the UCA's entry for the same unit.
//  - All other code units are looked up in the coll->mapping trie.
//  - If the CE is a contraction, the CE stored for the contraction start in
//    coll->contractionCEs is used instead.
// Returns TRUE when the resulting CE differs from UCOL_NOT_FOUND.
7244 U_CAPI UBool U_EXPORT2
7245 ucol_isTailored(const UCollator
*coll
, const UChar u
, UErrorCode
*status
) {
7246 if(U_FAILURE(*status
) || coll
== NULL
|| coll
== coll
->UCA
) {
7250 uint32_t CE
= UCOL_NOT_FOUND
;
7251 const UChar
*ContractionStart
= NULL
;
7252 if(u
< 0x100) { /* latin-1 */
7253 CE
= coll
->latinOneMapping
[u
];
7254 if(coll
->UCA
&& CE
== coll
->UCA
->latinOneMapping
[u
]) {
7257 } else { /* regular */
7258 CE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, u
);
// For a contraction, consult the CE recorded for the contraction start.
7261 if(isContraction(CE
)) {
7262 ContractionStart
= (UChar
*)coll
->image
+getContractOffset(CE
);
7263 CE
= *(coll
->contractionCEs
+ (ContractionStart
- coll
->contractionIndex
));
7266 return (UBool
)(CE
!= UCOL_NOT_FOUND
);
7270 /****************************************************************************/
7271 /* Following are the string compare functions */
7273 /****************************************************************************/
7276 /* ucol_checkIdent internal function. Does byte level string compare. */
7277 /* Used by strcoll if strength == identical and strings */
7278 /* are otherwise equal. */
7280 /* Comparison must be done on NFD normalized strings. */
7281 /* FCD is not good enough. */
// Two input shapes are handled:
//  - Iterator input (UCOL_USE_ITERATOR set in sColl->flags): both
//    UCharIterators are rewound to the start, wrapped in stack-allocated
//    UNormIterators set to UNORM_NFD, and compared with u_strCompareIter in
//    code point order.
//  - String input: lengths come from endp - string when UCOL_ITER_HASLEN is
//    set, otherwise -1 (NUL-terminated). *status is reset to U_ZERO_ERROR,
//    both strings are NFD-normalized into each iterator's writableBuffer,
//    and the buffers are compared with compareCodePointOrder.
// A plain u_strCompare of the raw strings also appears below.
// NOTE(review): the test that selects between the normalized and raw
// comparisons is not visible in this listing - presumably it is the
// normalize parameter; confirm.
// The signed comparison result is finally mapped to a UCollationResult.
7284 UCollationResult
ucol_checkIdent(collIterate
*sColl
, collIterate
*tColl
, UBool normalize
, UErrorCode
*status
)
7286 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
7287 // of same type, but that doesn't really mean that it will stay that way.
7290 if (sColl
->flags
& UCOL_USE_ITERATOR
) {
7291 // The division for the array length may truncate the array size to
7292 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7293 // for all platforms anyway.
7294 UAlignedMemory stackNormIter1
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
7295 UAlignedMemory stackNormIter2
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
7296 UNormIterator
*sNIt
= NULL
, *tNIt
= NULL
;
7297 sNIt
= unorm_openIter(stackNormIter1
, sizeof(stackNormIter1
), status
);
7298 tNIt
= unorm_openIter(stackNormIter2
, sizeof(stackNormIter2
), status
);
// Rewind both inputs so the whole text is compared.
7299 sColl
->iterator
->move(sColl
->iterator
, 0, UITER_START
);
7300 tColl
->iterator
->move(tColl
->iterator
, 0, UITER_START
);
7301 UCharIterator
*sIt
= unorm_setIter(sNIt
, sColl
->iterator
, UNORM_NFD
, status
);
7302 UCharIterator
*tIt
= unorm_setIter(tNIt
, tColl
->iterator
, UNORM_NFD
, status
);
7303 comparison
= u_strCompareIter(sIt
, tIt
, TRUE
);
7304 unorm_closeIter(sNIt
);
7305 unorm_closeIter(tNIt
);
// String input: a length of -1 marks a NUL-terminated string.
7307 int32_t sLen
= (sColl
->flags
& UCOL_ITER_HASLEN
) ? (int32_t)(sColl
->endp
- sColl
->string
) : -1;
7308 const UChar
*sBuf
= sColl
->string
;
7309 int32_t tLen
= (tColl
->flags
& UCOL_ITER_HASLEN
) ? (int32_t)(tColl
->endp
- tColl
->string
) : -1;
7310 const UChar
*tBuf
= tColl
->string
;
7313 *status
= U_ZERO_ERROR
;
7314 // Note: We could use Normalizer::compare() or similar, but for short strings
7315 // which may not be in FCD it might be faster to just NFD them.
7316 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
7317 // NFD'ing immediately might be faster for long strings,
7318 // but string comparison is usually done on relatively short strings.
7319 sColl
->nfd
->normalize(UnicodeString((sColl
->flags
& UCOL_ITER_HASLEN
) == 0, sBuf
, sLen
),
7320 sColl
->writableBuffer
,
7322 tColl
->nfd
->normalize(UnicodeString((tColl
->flags
& UCOL_ITER_HASLEN
) == 0, tBuf
, tLen
),
7323 tColl
->writableBuffer
,
7325 if(U_FAILURE(*status
)) {
7328 comparison
= sColl
->writableBuffer
.compareCodePointOrder(tColl
->writableBuffer
);
7330 comparison
= u_strCompare(sBuf
, sLen
, tBuf
, tLen
, TRUE
);
// Translate the signed comparison into the collation result enum.
7334 if (comparison
< 0) {
7336 } else if (comparison
== 0) {
7338 } else /* comparison > 0 */ {
7339 return UCOL_GREATER
;
7343 /* CEBuf - A struct and some inline functions to handle the saving */
7344 /* of CEs in a buffer within ucol_strcoll */
// UCOL_CEBUF_SIZE is the number of 32-bit CEs held in the struct's
// built-in localArray.
// NOTE(review): the buf, endp and pos members used by the helper functions
// are not visible in this listing; confirm their declarations.
7346 #define UCOL_CEBUF_SIZE 512
7347 typedef struct ucol_CEBuf
{
7351 uint32_t localArray
[UCOL_CEBUF_SIZE
];
// UCOL_INIT_CEBUF: resets a CE buffer to use its built-in storage.
// buf and pos both point at the start of localArray; endp is set
// UCOL_CEBUF_SIZE entries past buf.
7356 inline void UCOL_INIT_CEBUF(ucol_CEBuf
*b
) {
7357 (b
)->buf
= (b
)->pos
= (b
)->localArray
;
7358 (b
)->endp
= (b
)->buf
+ UCOL_CEBUF_SIZE
;
// ucol_CEBuf_Expand: moves a CE buffer into newly allocated storage of
// twice the number of entries currently stored.
//
//  - Sets UCOL_ITER_ALLOCATED in ci->flags before the allocation attempt.
//  - A failed allocation sets *status to U_MEMORY_ALLOCATION_ERROR.
//  - Otherwise the stored CEs are copied across, and endp and pos are
//    recomputed from b->buf using the new and old sizes.
// NOTE(review): the declarations of oldSize/newSize/newBuf, the statement
// under the buf != localArray test (presumably freeing the old heap buffer)
// and the assignment of newBuf to b->buf are not visible in this listing;
// confirm.
7362 void ucol_CEBuf_Expand(ucol_CEBuf
*b
, collIterate
*ci
, UErrorCode
*status
) {
7367 ci
->flags
|= UCOL_ITER_ALLOCATED
;
// Size is measured in CEs currently stored, then doubled.
7368 oldSize
= (uint32_t)(b
->pos
- b
->buf
);
7369 newSize
= oldSize
* 2;
7370 newBuf
= (uint32_t *)uprv_malloc(newSize
* sizeof(uint32_t));
7371 if(newBuf
== NULL
) {
7372 *status
= U_MEMORY_ALLOCATION_ERROR
;
7375 uprv_memcpy(newBuf
, b
->buf
, oldSize
* sizeof(uint32_t));
7376 if (b
->buf
!= b
->localArray
) {
7380 b
->endp
= b
->buf
+ newSize
;
7381 b
->pos
= b
->buf
+ oldSize
;
// UCOL_CEBUF_PUT: appends one CE to the buffer.
// When pos has reached endp the buffer is first grown with
// ucol_CEBuf_Expand; the work guarded by U_SUCCESS(*status) is skipped if
// that growth failed.
// NOTE(review): the store of ce itself is not visible in this listing -
// presumably it writes ce at pos and advances pos; confirm.
7386 inline void UCOL_CEBUF_PUT(ucol_CEBuf
*b
, uint32_t ce
, collIterate
*ci
, UErrorCode
*status
) {
7387 if (b
->pos
== b
->endp
) {
7388 ucol_CEBuf_Expand(b
, ci
, status
);
7390 if (U_SUCCESS(*status
)) {
7395 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7396 /* It is used when compare gets in trouble and needs to bail out */
// Steps visible below:
//  1. Obtain the full source and target text. For iterator input both
//     iterators are rewound and drained into UnicodeStrings; otherwise the
//     collIterate string pointers are used, with length -1 when
//     UCOL_ITER_HASLEN is not set.
//  2. Build each sort key in a stack buffer of UCOL_MAX_BUFFER bytes. If
//     ucol_getSortKey reports a larger size, a heap buffer of that size is
//     allocated and the key is generated again. A failed allocation sets
//     U_MEMORY_ALLOCATION_ERROR and jumps to the cleanup label.
//  3. Compare the two keys with uprv_strcmp.
//  4. Free any heap key buffers and map the result to a UCollationResult.
// NOTE(review): the remaining parameters (tColl and status are used below)
// and the declaration of c are not visible in this listing; confirm.
7397 static UCollationResult
ucol_compareUsingSortKeys(collIterate
*sColl
,
7401 uint8_t sourceKey
[UCOL_MAX_BUFFER
], targetKey
[UCOL_MAX_BUFFER
];
7402 uint8_t *sourceKeyP
= sourceKey
;
7403 uint8_t *targetKeyP
= targetKey
;
7404 int32_t sourceKeyLen
= UCOL_MAX_BUFFER
, targetKeyLen
= UCOL_MAX_BUFFER
;
7405 const UCollator
*coll
= sColl
->coll
;
7406 const UChar
*source
= NULL
;
7407 const UChar
*target
= NULL
;
7408 int32_t result
= UCOL_EQUAL
;
7409 UnicodeString sourceString
, targetString
;
7410 int32_t sourceLength
;
7411 int32_t targetLength
;
7413 if(sColl
->flags
& UCOL_USE_ITERATOR
) {
7414 sColl
->iterator
->move(sColl
->iterator
, 0, UITER_START
);
7415 tColl
->iterator
->move(tColl
->iterator
, 0, UITER_START
);
// Drain each iterator into a UnicodeString; next() goes negative at the end.
7417 while((c
=sColl
->iterator
->next(sColl
->iterator
))>=0) {
7418 sourceString
.append((UChar
)c
);
7420 while((c
=tColl
->iterator
->next(tColl
->iterator
))>=0) {
7421 targetString
.append((UChar
)c
);
7423 source
= sourceString
.getBuffer();
7424 sourceLength
= sourceString
.length();
7425 target
= targetString
.getBuffer();
7426 targetLength
= targetString
.length();
7427 } else { // no iterators
7428 sourceLength
= (sColl
->flags
&UCOL_ITER_HASLEN
)?(int32_t)(sColl
->endp
-sColl
->string
):-1;
7429 targetLength
= (tColl
->flags
&UCOL_ITER_HASLEN
)?(int32_t)(tColl
->endp
-tColl
->string
):-1;
7430 source
= sColl
->string
;
7431 target
= tColl
->string
;
// First attempt uses the stack buffer; retry on the heap if it was too small.
7436 sourceKeyLen
= ucol_getSortKey(coll
, source
, sourceLength
, sourceKeyP
, sourceKeyLen
);
7437 if(sourceKeyLen
> UCOL_MAX_BUFFER
) {
7438 sourceKeyP
= (uint8_t*)uprv_malloc(sourceKeyLen
*sizeof(uint8_t));
7439 if(sourceKeyP
== NULL
) {
7440 *status
= U_MEMORY_ALLOCATION_ERROR
;
7441 goto cleanup_and_do_compare
;
7443 sourceKeyLen
= ucol_getSortKey(coll
, source
, sourceLength
, sourceKeyP
, sourceKeyLen
);
7446 targetKeyLen
= ucol_getSortKey(coll
, target
, targetLength
, targetKeyP
, targetKeyLen
);
7447 if(targetKeyLen
> UCOL_MAX_BUFFER
) {
7448 targetKeyP
= (uint8_t*)uprv_malloc(targetKeyLen
*sizeof(uint8_t));
7449 if(targetKeyP
== NULL
) {
7450 *status
= U_MEMORY_ALLOCATION_ERROR
;
7451 goto cleanup_and_do_compare
;
7453 targetKeyLen
= ucol_getSortKey(coll
, target
, targetLength
, targetKeyP
, targetKeyLen
);
7456 result
= uprv_strcmp((const char*)sourceKeyP
, (const char*)targetKeyP
);
7458 cleanup_and_do_compare
:
// Only heap buffers are freed; the stack arrays are left alone.
7459 if(sourceKeyP
!= NULL
&& sourceKeyP
!= sourceKey
) {
7460 uprv_free(sourceKeyP
);
7463 if(targetKeyP
!= NULL
&& targetKeyP
!= targetKey
) {
7464 uprv_free(targetKeyP
);
7469 } else if(result
>0) {
7470 return UCOL_GREATER
;
// ucol_strcollRegular (collIterate overload): level-by-level comparison of
// two collation-element streams.
//
// Setup: strength and the attribute switches (case level, French
// secondary, shifted alternate handling, hiragana quaternary) are read from
// sColl->coll. When hiragana quaternary and shifted handling are both
// active, the comparison is handed to ucol_compareUsingSortKeys.
//
// Primary phase: CEs are pulled with ucol_IGetNextCE and stored through
// UCOL_CEBUF_PUT into per-side buffers while their primary parts are
// compared. When coll->leadBytePermutationTable is set, lead bytes are
// remapped through it. The shifted branch additionally tracks
// sInShifted/tInShifted and compares primaries against LVT, which is
// variableTopValue shifted left 16 when shifted handling is on and 0
// otherwise.
//
// Later phases re-read the buffered CEs: secondary (forward, or backward
// for French), case bits, tertiary, quaternary (only when qShifted), and
// finally ucol_checkIdent as the tiebreaker.
//
// Exit: heap CE buffers are freed when UCOL_ITER_ALLOCATED is set in either
// side's flags.
// NOTE(review): many statements of this function (loop headers, returns,
// declarations of sCEs/tCEs/sCE/tCE) are not visible in this listing; the
// summary above covers only what is shown.
7477 static UCollationResult
7478 ucol_strcollRegular(collIterate
*sColl
, collIterate
*tColl
, UErrorCode
*status
)
7482 const UCollator
*coll
= sColl
->coll
;
7485 // setting up the collator parameters
7486 UColAttributeValue strength
= coll
->strength
;
7487 UBool initialCheckSecTer
= (strength
>= UCOL_SECONDARY
);
7489 UBool checkSecTer
= initialCheckSecTer
;
7490 UBool checkTertiary
= (strength
>= UCOL_TERTIARY
);
7491 UBool checkQuad
= (strength
>= UCOL_QUATERNARY
);
7492 UBool checkIdent
= (strength
== UCOL_IDENTICAL
);
7493 UBool checkCase
= (coll
->caseLevel
== UCOL_ON
);
7494 UBool isFrenchSec
= (coll
->frenchCollation
== UCOL_ON
) && checkSecTer
;
7495 UBool shifted
= (coll
->alternateHandling
== UCOL_SHIFTED
);
7496 UBool qShifted
= shifted
&& checkQuad
;
7497 UBool doHiragana
= (coll
->hiraganaQ
== UCOL_ON
) && checkQuad
;
// Hiragana quaternary combined with shifted handling goes via sort keys.
7499 if(doHiragana
&& shifted
) {
7500 return (ucol_compareUsingSortKeys(sColl
, tColl
, status
));
7502 uint8_t caseSwitch
= coll
->caseSwitch
;
7503 uint8_t tertiaryMask
= coll
->tertiaryMask
;
7505 // This is the lowest primary value that will not be ignored if shifted
7506 uint32_t LVT
= (shifted
)?(coll
->variableTopValue
<<16):0;
7508 UCollationResult result
= UCOL_EQUAL
;
7509 UCollationResult hirResult
= UCOL_EQUAL
;
7511 // Preparing the CE buffers. They will be filled during the primary phase
7514 UCOL_INIT_CEBUF(&sCEs
);
7515 UCOL_INIT_CEBUF(&tCEs
);
7517 uint32_t secS
= 0, secT
= 0;
7518 uint32_t sOrder
=0, tOrder
=0;
7520 // Non shifted primary processing is quite simple
7524 // We fetch CEs until we hit a non ignorable primary or end.
7526 // We get the next CE
7527 sOrder
= ucol_IGetNextCE(coll
, sColl
, status
);
7528 // Stuff it in the buffer
7529 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7530 // And keep just the primary part.
7531 sOrder
&= UCOL_PRIMARYMASK
;
7532 } while(sOrder
== 0);
7534 // see the comments on the above block
7536 tOrder
= ucol_IGetNextCE(coll
, tColl
, status
);
7537 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7538 tOrder
&= UCOL_PRIMARYMASK
;
7539 } while(tOrder
== 0);
7541 // if both primaries are the same
7542 if(sOrder
== tOrder
) {
7543 // and there are no more CEs, we advance to the next level
7544 if(sOrder
== UCOL_NO_MORE_CES_PRIMARY
) {
7547 if(doHiragana
&& hirResult
== UCOL_EQUAL
) {
7548 if((sColl
->flags
& UCOL_WAS_HIRAGANA
) != (tColl
->flags
& UCOL_WAS_HIRAGANA
)) {
7549 hirResult
= ((sColl
->flags
& UCOL_WAS_HIRAGANA
) > (tColl
->flags
& UCOL_WAS_HIRAGANA
))
7550 ? UCOL_LESS
:UCOL_GREATER
;
7554 // only need to check one for continuation
7555 // if one is then the other must be or the preceding CE would be a prefix of the other
7556 if (coll
->leadBytePermutationTable
!= NULL
&& !isContinuation(sOrder
)) {
7557 sOrder
= (coll
->leadBytePermutationTable
[sOrder
>>24] << 24) | (sOrder
& 0x00FFFFFF);
7558 tOrder
= (coll
->leadBytePermutationTable
[tOrder
>>24] << 24) | (tOrder
& 0x00FFFFFF);
7560 // if two primaries are different, we are done
7561 result
= (sOrder
< tOrder
) ? UCOL_LESS
: UCOL_GREATER
;
7564 } // no primary difference... do the rest from the buffers
7565 } else { // shifted - do a slightly more complicated processing :)
7567 UBool sInShifted
= FALSE
;
7568 UBool tInShifted
= FALSE
;
7569 // This version of code can be refactored. However, it seems easier to understand this way.
7570 // Source loop. Same as the target loop.
7572 sOrder
= ucol_IGetNextCE(coll
, sColl
, status
);
7573 if(sOrder
== UCOL_NO_MORE_CES
) {
7574 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7576 } else if(sOrder
== 0 || (sInShifted
&& (sOrder
& UCOL_PRIMARYMASK
) == 0)) {
7577 /* UCA amendment - ignore ignorables that follow shifted code points */
7579 } else if(isContinuation(sOrder
)) {
7580 if((sOrder
& UCOL_PRIMARYMASK
) > 0) { /* There is primary value */
7582 sOrder
= (sOrder
& UCOL_PRIMARYMASK
) | 0xC0; /* preserve interesting continuation */
7583 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7586 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7589 } else { /* Just lower level values */
7593 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7597 } else { /* regular */
7598 if(coll
->leadBytePermutationTable
!= NULL
){
7599 sOrder
= (coll
->leadBytePermutationTable
[sOrder
>>24] << 24) | (sOrder
& 0x00FFFFFF);
7601 if((sOrder
& UCOL_PRIMARYMASK
) > LVT
) {
7602 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7605 if((sOrder
& UCOL_PRIMARYMASK
) > 0) {
7607 sOrder
&= UCOL_PRIMARYMASK
;
7608 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7611 UCOL_CEBUF_PUT(&sCEs
, sOrder
, sColl
, status
);
7618 sOrder
&= UCOL_PRIMARYMASK
;
7622 tOrder
= ucol_IGetNextCE(coll
, tColl
, status
);
7623 if(tOrder
== UCOL_NO_MORE_CES
) {
7624 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7626 } else if(tOrder
== 0 || (tInShifted
&& (tOrder
& UCOL_PRIMARYMASK
) == 0)) {
7627 /* UCA amendment - ignore ignorables that follow shifted code points */
7629 } else if(isContinuation(tOrder
)) {
7630 if((tOrder
& UCOL_PRIMARYMASK
) > 0) { /* There is primary value */
7632 tOrder
= (tOrder
& UCOL_PRIMARYMASK
) | 0xC0; /* preserve interesting continuation */
7633 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7636 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7639 } else { /* Just lower level values */
7643 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7647 } else { /* regular */
7648 if(coll
->leadBytePermutationTable
!= NULL
){
7649 tOrder
= (coll
->leadBytePermutationTable
[tOrder
>>24] << 24) | (tOrder
& 0x00FFFFFF);
7651 if((tOrder
& UCOL_PRIMARYMASK
) > LVT
) {
7652 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7655 if((tOrder
& UCOL_PRIMARYMASK
) > 0) {
7657 tOrder
&= UCOL_PRIMARYMASK
;
7658 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7661 UCOL_CEBUF_PUT(&tCEs
, tOrder
, tColl
, status
);
7668 tOrder
&= UCOL_PRIMARYMASK
;
7671 if(sOrder
== tOrder
) {
7673 if(doHiragana && hirResult == UCOL_EQUAL) {
7674 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7675 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7676 ? UCOL_LESS:UCOL_GREATER;
7680 if(sOrder
== UCOL_NO_MORE_CES_PRIMARY
) {
7688 result
= (sOrder
< tOrder
) ? UCOL_LESS
: UCOL_GREATER
;
7691 } /* no primary difference... do the rest from the buffers */
7694 /* now, we're gonna reexamine collected CEs */
7698 /* This is the secondary level of comparison */
7700 if(!isFrenchSec
) { /* normal */
7705 secS
= *(sCE
++) & UCOL_SECONDARYMASK
;
7709 secT
= *(tCE
++) & UCOL_SECONDARYMASK
;
7713 if(secS
== UCOL_NO_MORE_CES_SECONDARY
) {
7720 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7724 } else { /* do the French */
7725 uint32_t *sCESave
= NULL
;
7726 uint32_t *tCESave
= NULL
;
7727 sCE
= sCEs
.pos
-2; /* this could also be sCEs-- if needs to be optimized */
7730 while (secS
== 0 && sCE
>= sCEs
.buf
) {
7731 if(sCESave
== NULL
) {
7733 if(isContinuation(secS
)) {
7734 while(isContinuation(secS
= *(sCE
--)))
7736 /* after this, secS has the start of continuation, and sCEs points before that */
7737 sCESave
= sCE
; /* we save it, so that we know where to come back AND that we need to go forward */
7738 sCE
+=2; /* need to point to the first continuation CP */
7739 /* However, now you can just continue doing stuff */
7743 if(!isContinuation(secS
)) { /* This means we have finished with this cont */
7744 sCE
= sCESave
; /* reset the pointer to before continuation */
7746 secS
= 0; /* Fetch a fresh CE before the continuation sequence. */
7750 secS
&= UCOL_SECONDARYMASK
; /* remove the continuation bit */
7753 while(secT
== 0 && tCE
>= tCEs
.buf
) {
7754 if(tCESave
== NULL
) {
7756 if(isContinuation(secT
)) {
7757 while(isContinuation(secT
= *(tCE
--)))
7759 /* after this, secT has the start of continuation, and tCE points before that */
7760 tCESave
= tCE
; /* we save it, so that we know where to come back AND that we need to go forward */
7761 tCE
+=2; /* need to point to the first continuation CP */
7762 /* However, now you can just continue doing stuff */
7766 if(!isContinuation(secT
)) { /* This means we have finished with this cont */
7767 tCE
= tCESave
; /* reset the pointer to before continuation */
7769 secT
= 0; /* Fetch a fresh CE before the continuation sequence. */
7773 secT
&= UCOL_SECONDARYMASK
; /* remove the continuation bit */
7777 if(secS
== UCOL_NO_MORE_CES_SECONDARY
|| (sCE
< sCEs
.buf
&& tCE
< tCEs
.buf
)) {
7784 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7791 /* doing the case bit */
7796 while((secS
& UCOL_REMOVE_CASE
) == 0) {
7797 if(!isContinuation(*sCE
++)) {
7799 if(((secS
& UCOL_PRIMARYMASK
) != 0) || strength
> UCOL_PRIMARY
) {
7800 // primary ignorables should not be considered on the case level when the strength is primary
7801 // otherwise, the CEs stop being well-formed
7802 secS
&= UCOL_TERT_CASE_MASK
;
7812 while((secT
& UCOL_REMOVE_CASE
) == 0) {
7813 if(!isContinuation(*tCE
++)) {
7815 if(((secT
& UCOL_PRIMARYMASK
) != 0) || strength
> UCOL_PRIMARY
) {
7816 // primary ignorables should not be considered on the case level when the strength is primary
7817 // otherwise, the CEs stop being well-formed
7818 secT
&= UCOL_TERT_CASE_MASK
;
7828 if((secS
& UCOL_CASE_BIT_MASK
) < (secT
& UCOL_CASE_BIT_MASK
)) {
7831 } else if((secS
& UCOL_CASE_BIT_MASK
) > (secT
& UCOL_CASE_BIT_MASK
)) {
7832 result
= UCOL_GREATER
;
7836 if((secS
& UCOL_REMOVE_CASE
) == UCOL_NO_MORE_CES_TERTIARY
|| (secT
& UCOL_REMOVE_CASE
) == UCOL_NO_MORE_CES_TERTIARY
) {
7845 /* Tertiary level */
7852 while((secS
& UCOL_REMOVE_CASE
) == 0) {
7853 secS
= *(sCE
++) & tertiaryMask
;
7854 if(!isContinuation(secS
)) {
7857 secS
&= UCOL_REMOVE_CASE
;
7861 while((secT
& UCOL_REMOVE_CASE
) == 0) {
7862 secT
= *(tCE
++) & tertiaryMask
;
7863 if(!isContinuation(secT
)) {
7866 secT
&= UCOL_REMOVE_CASE
;
7871 if((secS
& UCOL_REMOVE_CASE
) == 1) {
7878 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7885 if(qShifted
/*checkQuad*/) {
7886 UBool sInShifted
= TRUE
;
7887 UBool tInShifted
= TRUE
;
7893 while((secS
== 0 && secS
!= UCOL_NO_MORE_CES
) || (isContinuation(secS
) && !sInShifted
)) {
7895 if(isContinuation(secS
)) {
7899 } else if(secS
> LVT
|| (secS
& UCOL_PRIMARYMASK
) == 0) { /* non continuation */
7900 secS
= UCOL_PRIMARYMASK
;
7906 secS
&= UCOL_PRIMARYMASK
;
7909 while((secT
== 0 && secT
!= UCOL_NO_MORE_CES
) || (isContinuation(secT
) && !tInShifted
)) {
7911 if(isContinuation(secT
)) {
7915 } else if(secT
> LVT
|| (secT
& UCOL_PRIMARYMASK
) == 0) {
7916 secT
= UCOL_PRIMARYMASK
;
7922 secT
&= UCOL_PRIMARYMASK
;
7925 if(secS
== UCOL_NO_MORE_CES_PRIMARY
) {
7932 result
= (secS
< secT
) ? UCOL_LESS
: UCOL_GREATER
;
7936 } else if(doHiragana
&& hirResult
!= UCOL_EQUAL
) {
7937 // If we're fine on quaternaries, we might be different
7938 // on Hiragana. This, however, might fail us in shifted.
7943 /* For IDENTICAL comparisons, we use a bitwise character comparison */
7944 /* as a tiebreaker if all else is equal. */
7945 /* Getting here should be quite rare - strings are not identical - */
7946 /* that is checked first, but compared == through all other checks. */
7949 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7950 result
= ucol_checkIdent(sColl
, tColl
, TRUE
, status
);
// Release any CE buffer that was moved off its built-in localArray.
7954 if ((sColl
->flags
| tColl
->flags
) & UCOL_ITER_ALLOCATED
) {
7955 if (sCEs
.buf
!= sCEs
.localArray
) {
7956 uprv_free(sCEs
.buf
);
7958 if (tCEs
.buf
!= tCEs
.localArray
) {
7959 uprv_free(tCEs
.buf
);
// ucol_strcollRegular (string overload): wraps the two UChar strings in
// collIterate objects via IInit_collIterate and forwards them to the
// collIterate overload.
// A failure status after initialisation takes the guard below.
// NOTE(review): the guard body is not visible in this listing - presumably
// it returns UCOL_EQUAL; confirm.
7966 static UCollationResult
7967 ucol_strcollRegular(const UCollator
*coll
,
7968 const UChar
*source
, int32_t sourceLength
,
7969 const UChar
*target
, int32_t targetLength
,
7970 UErrorCode
*status
) {
7971 collIterate sColl
, tColl
;
7972 // Preparing the context objects for iterating over strings
7973 IInit_collIterate(coll
, source
, sourceLength
, &sColl
, status
);
7974 IInit_collIterate(coll
, target
, targetLength
, &tColl
, status
);
7975 if(U_FAILURE(*status
)) {
7978 return ucol_strcollRegular(&sColl
, &tColl
, status
);
// ucol_getLatinOneContraction: resolves a contraction CE from the Latin-1
// fast-path table.
//
// The CE is decoded into two parts: its low 12 bits give the contraction
// offset used to locate the contraction code units in coll->image, and bits
// 12..23 give latinOneOffset, the base index of the contraction's results.
// Results are read from
//   coll->latinOneCEs[strength * coll->latinOneTableLen + latinOneOffset (+ offset)]
// where offset is the position of the matching code unit in the ordered
// contraction table.
// Returns UCOL_BAIL_OUT_CE when the next code unit is outside Latin-1
// (schar & 0xFF00). Code units whose trie value is 0 (completely
// ignorable) get special handling.
// NOTE(review): the loop structure, the handling of len, updates to *index
// and the declaration of offset are not visible in this listing; confirm.
7981 static inline uint32_t
7982 ucol_getLatinOneContraction(const UCollator
*coll
, int32_t strength
,
7983 uint32_t CE
, const UChar
*s
, int32_t *index
, int32_t len
)
7985 const UChar
*UCharOffset
= (UChar
*)coll
->image
+getContractOffset(CE
&0xFFF);
7986 int32_t latinOneOffset
= (CE
& 0x00FFF000) >> 12;
7988 UChar schar
= 0, tchar
= 0;
7992 if(s
[*index
] == 0) { // end of string
7993 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
7999 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
8005 while(schar
> (tchar
= *(UCharOffset
+offset
))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
8009 if (schar
== tchar
) {
8011 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
+offset
]);
8015 if(schar
& 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8016 return UCOL_BAIL_OUT_CE
;
8018 // skip completely ignorables
8019 uint32_t isZeroCE
= UTRIE_GET32_FROM_LEAD(&coll
->mapping
, schar
);
8020 if(isZeroCE
== 0) { // we have to ignore completely ignorables
8025 return(coll
->latinOneCEs
[strength
*coll
->latinOneTableLen
+latinOneOffset
]);
8032 * This is a fast strcoll, geared towards text in Latin-1.
8033 * It supports contractions of size two, French secondaries
8034 * and case switching. You can use it with strengths primary
8035 * to tertiary. It does not support shifted and case level.
8036 * It relies on the table build by setupLatin1Table. If it
8037 * doesn't understand something, it will go to the regular
// ucol_strcollUseLatin1: table-driven comparison for Latin-1 text.
//
// Each level reads CEs from coll->latinOneCEs, advancing the table pointer
// by coll->latinOneTableLen for the secondary and again for the tertiary
// level.
//  - Primary: any code unit with bits in 0xFF00 set, or any CE at or above
//    UCOL_NOT_FOUND that a contraction lookup does not resolve, falls back
//    to ucol_strcollRegular on the original arguments.
//  - Secondary: forward when French collation is off; backward otherwise,
//    after bailing out to ucol_strcollRegular if contractions were seen.
//  - Tertiary: forward, same shape as the non-French secondary loop.
// Within a level, CEs are compared one byte at a time from the top byte.
// NOTE(review): the sLen/tLen/status parameters, the loop headers and the
// byte-shifting statements are not visible in this listing; confirm.
8040 static UCollationResult
8041 ucol_strcollUseLatin1( const UCollator
*coll
,
8042 const UChar
*source
,
8044 const UChar
*target
,
8049 int32_t strength
= coll
->strength
;
8051 int32_t sIndex
= 0, tIndex
= 0;
8052 UChar sChar
= 0, tChar
= 0;
8053 uint32_t sOrder
=0, tOrder
=0;
8055 UBool endOfSource
= FALSE
;
8057 uint32_t *elements
= coll
->latinOneCEs
;
8059 UBool haveContractions
= FALSE
; // if we have contractions in our string
8060 // we cannot do French secondary
8062 // Do the primary level
8064 while(sOrder
==0) { // this loop skips primary ignorables
8065 // sOrder=getNextlatinOneCE(source);
8066 if(sLen
==-1) { // handling zero terminated strings
8067 sChar
=source
[sIndex
++];
8072 } else { // handling strings with known length
8077 sChar
=source
[sIndex
++];
8079 if(sChar
&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8080 //fprintf(stderr, "R");
8081 return ucol_strcollRegular(coll
, source
, sLen
, target
, tLen
, status
);
8083 sOrder
= elements
[sChar
];
8084 if(sOrder
>= UCOL_NOT_FOUND
) { // if we got a special
8085 // specials can basically be either contractions or bail-out signs. If we get anything
8086 // else, we'll bail out anyway
8087 if(getCETag(sOrder
) == CONTRACTION_TAG
) {
8088 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_PRIMARY
, sOrder
, source
, &sIndex
, sLen
);
8089 haveContractions
= TRUE
; // if there are contractions, we cannot do French secondary
8090 // However, if there are contractions in the table, but we always use just one char,
8091 // we might be able to do French. This should be checked out.
8093 if(sOrder
>= UCOL_NOT_FOUND
/*== UCOL_BAIL_OUT_CE*/) {
8094 //fprintf(stderr, "S");
8095 return ucol_strcollRegular(coll
, source
, sLen
, target
, tLen
, status
);
8100 while(tOrder
==0) { // this loop skips primary ignorables
8101 // tOrder=getNextlatinOneCE(target);
8102 if(tLen
==-1) { // handling zero terminated strings
8103 tChar
=target
[tIndex
++];
8105 if(endOfSource
) { // this is different than source loop,
8106 // as we already know that source loop is done here,
8107 // so we can either finish the primary loop if both
8108 // strings are done or announce the result if only
8109 // target is done. Same below.
8112 return UCOL_GREATER
;
8115 } else { // handling strings with known length
8120 return UCOL_GREATER
;
8123 tChar
=target
[tIndex
++];
8125 if(tChar
&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8126 //fprintf(stderr, "R");
8127 return ucol_strcollRegular(coll
, source
, sLen
, target
, tLen
, status
);
8129 tOrder
= elements
[tChar
];
8130 if(tOrder
>= UCOL_NOT_FOUND
) {
8131 // Handling specials, see the comments for source
8132 if(getCETag(tOrder
) == CONTRACTION_TAG
) {
8133 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_PRIMARY
, tOrder
, target
, &tIndex
, tLen
);
8134 haveContractions
= TRUE
;
8136 if(tOrder
>= UCOL_NOT_FOUND
/*== UCOL_BAIL_OUT_CE*/) {
8137 //fprintf(stderr, "S");
8138 return ucol_strcollRegular(coll
, source
, sLen
, target
, tLen
, status
);
8142 if(endOfSource
) { // source is finished, but target is not, say the result.
8146 if(sOrder
== tOrder
) { // if we have same CEs, we continue the loop
8147 sOrder
= 0; tOrder
= 0;
8150 // compare current top bytes
8151 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
8152 // top bytes differ, return difference
8153 if(sOrder
< tOrder
) {
8155 } else if(sOrder
> tOrder
) {
8156 return UCOL_GREATER
;
8158 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8159 // since we must return enum value
8162 // top bytes match, continue with following bytes
8169 // after primary loop, we definitely know the sizes of strings,
8170 // so we set it and use simpler loop for secondaries and tertiaries
8171 sLen
= sIndex
; tLen
= tIndex
;
8172 if(strength
>= UCOL_SECONDARY
) {
8173 // adjust the table beginning
8174 elements
+= coll
->latinOneTableLen
;
8175 endOfSource
= FALSE
;
8177 if(coll
->frenchCollation
== UCOL_OFF
) { // non French
8178 // This loop is a simplified copy of primary loop
8179 // at this point we know that whole strings are latin-1, so we don't
8180 // check for that. We also know that we only have contractions as
8182 sIndex
= 0; tIndex
= 0;
8189 sChar
=source
[sIndex
++];
8190 sOrder
= elements
[sChar
];
8191 if(sOrder
> UCOL_NOT_FOUND
) {
8192 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_SECONDARY
, sOrder
, source
, &sIndex
, sLen
);
8201 return UCOL_GREATER
;
8204 tChar
=target
[tIndex
++];
8205 tOrder
= elements
[tChar
];
8206 if(tOrder
> UCOL_NOT_FOUND
) {
8207 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_SECONDARY
, tOrder
, target
, &tIndex
, tLen
);
8214 if(sOrder
== tOrder
) {
8215 sOrder
= 0; tOrder
= 0;
8218 // see primary loop for comments on this
8219 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
8220 if(sOrder
< tOrder
) {
8222 } else if(sOrder
> tOrder
) {
8223 return UCOL_GREATER
;
8231 if(haveContractions
) { // if we have contractions, we have to bail out
8232 // since we don't really know how to handle them here
8233 return ucol_strcollRegular(coll
, source
, sLen
, target
, tLen
, status
);
8235 // For French, we go backwards
8236 sIndex
= sLen
; tIndex
= tLen
;
8243 sChar
=source
[--sIndex
];
8244 sOrder
= elements
[sChar
];
8245 // don't even look for contractions
8253 return UCOL_GREATER
;
8256 tChar
=target
[--tIndex
];
8257 tOrder
= elements
[tChar
];
8258 // don't even look for contractions
8264 if(sOrder
== tOrder
) {
8265 sOrder
= 0; tOrder
= 0;
8268 // see the primary loop for comments
8269 if(((sOrder
^tOrder
)&0xFF000000)!=0) {
8270 if(sOrder
< tOrder
) {
8272 } else if(sOrder
> tOrder
) {
8273 return UCOL_GREATER
;
8284 if(strength
>= UCOL_TERTIARY
) {
8285 // tertiary loop is the same as secondary (except no French)
8286 elements
+= coll
->latinOneTableLen
;
8287 sIndex
= 0; tIndex
= 0;
8288 endOfSource
= FALSE
;
8295 sChar
=source
[sIndex
++];
8296 sOrder
= elements
[sChar
];
8297 if(sOrder
> UCOL_NOT_FOUND
) {
8298 sOrder
= ucol_getLatinOneContraction(coll
, UCOL_TERTIARY
, sOrder
, source
, &sIndex
, sLen
);
8304 return UCOL_EQUAL
; // if both strings are at the end, they are equal
8306 return UCOL_GREATER
;
8309 tChar
=target
[tIndex
++];
8310 tOrder
= elements
[tChar
];
8311 if(tOrder
> UCOL_NOT_FOUND
) {
8312 tOrder
= ucol_getLatinOneContraction(coll
, UCOL_TERTIARY
, tOrder
, target
, &tIndex
, tLen
);
8318 if(sOrder
== tOrder
) {
8319 sOrder
= 0; tOrder
= 0;
8322 if(((sOrder
^tOrder
)&0xff000000)!=0) {
8323 if(sOrder
< tOrder
) {
8325 } else if(sOrder
> tOrder
) {
8326 return UCOL_GREATER
;
// ucol_strcollIter: compares the text delivered by two UCharIterators.
//
// Visible flow:
//  - A NULL or failing status takes the first guard.
//  - Identical iterator pointers are reported as UCOL_EQUAL.
//  - A NULL iterator or collator sets U_ILLEGAL_ARGUMENT_ERROR.
//  - Two collIterate objects are initialised without strings, given the
//    iterators, and flagged with UCOL_USE_ITERATOR.
//  - With UCOL_NORMALIZATION_MODE on, each iterator is wrapped in a
//    stack-allocated UNormIterator set to UNORM_FCD, and UCOL_ITER_NORM is
//    cleared from the collIterate flags.
//  - The common prefix is skipped by stepping both iterators while they
//    return the same value. Both are then stepped back; if the code unit
//    there is unsafe (ucol_unsafeCP), they are stepped back further until a
//    safe one or the start is reached.
//  - The remainder is compared by ucol_strcollRegular, and any
//    normalization iterators are closed.
// NOTE(review): the status parameter declaration and the return statements
// are not visible in this listing; confirm.
8338 U_CAPI UCollationResult U_EXPORT2
8339 ucol_strcollIter( const UCollator
*coll
,
8340 UCharIterator
*sIter
,
8341 UCharIterator
*tIter
,
8344 if(!status
|| U_FAILURE(*status
)) {
8348 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER
);
8349 UTRACE_DATA3(UTRACE_VERBOSE
, "coll=%p, sIter=%p, tIter=%p", coll
, sIter
, tIter
);
8351 if (sIter
== tIter
) {
8352 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL
, *status
)
8355 if(sIter
== NULL
|| tIter
== NULL
|| coll
== NULL
) {
8356 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
8357 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL
, *status
)
8361 UCollationResult result
= UCOL_EQUAL
;
8363 // Preparing the context objects for iterating over strings
8364 collIterate sColl
, tColl
;
8365 IInit_collIterate(coll
, NULL
, -1, &sColl
, status
);
8366 IInit_collIterate(coll
, NULL
, -1, &tColl
, status
);
8367 if(U_FAILURE(*status
)) {
8368 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL
, *status
)
8371 // The division for the array length may truncate the array size to
8372 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8373 // for all platforms anyway.
8374 UAlignedMemory stackNormIter1
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
8375 UAlignedMemory stackNormIter2
[UNORM_ITER_SIZE
/sizeof(UAlignedMemory
)];
8376 UNormIterator
*sNormIter
= NULL
, *tNormIter
= NULL
;
8378 sColl
.iterator
= sIter
;
8379 sColl
.flags
|= UCOL_USE_ITERATOR
;
8380 tColl
.flags
|= UCOL_USE_ITERATOR
;
8381 tColl
.iterator
= tIter
;
// With normalization on, read through FCD-normalizing wrapper iterators.
8383 if(ucol_getAttribute(coll
, UCOL_NORMALIZATION_MODE
, status
) == UCOL_ON
) {
8384 sNormIter
= unorm_openIter(stackNormIter1
, sizeof(stackNormIter1
), status
);
8385 sColl
.iterator
= unorm_setIter(sNormIter
, sIter
, UNORM_FCD
, status
);
8386 sColl
.flags
&= ~UCOL_ITER_NORM
;
8388 tNormIter
= unorm_openIter(stackNormIter2
, sizeof(stackNormIter2
), status
);
8389 tColl
.iterator
= unorm_setIter(tNormIter
, tIter
, UNORM_FCD
, status
);
8390 tColl
.flags
&= ~UCOL_ITER_NORM
;
8393 UChar32 sChar
= U_SENTINEL
, tChar
= U_SENTINEL
;
// Skip the leading run where both iterators return the same value.
8395 while((sChar
= sColl
.iterator
->next(sColl
.iterator
)) ==
8396 (tChar
= tColl
.iterator
->next(tColl
.iterator
))) {
8397 if(sChar
== U_SENTINEL
) {
8398 result
= UCOL_EQUAL
;
8403 if(sChar
== U_SENTINEL
) {
8404 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8407 if(tChar
== U_SENTINEL
) {
8408 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8411 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8412 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8414 if (ucol_unsafeCP((UChar
)sChar
, coll
) || ucol_unsafeCP((UChar
)tChar
, coll
))
8416 // We are stopped in the middle of a contraction.
8417 // Scan backwards through the == part of the string looking for the start of the contraction.
8418 // It doesn't matter which string we scan, since they are the same in this region.
8421 sChar
= sColl
.iterator
->previous(sColl
.iterator
);
8422 tChar
= tColl
.iterator
->previous(tColl
.iterator
);
8424 while (sChar
!= U_SENTINEL
&& ucol_unsafeCP((UChar
)sChar
, coll
));
8428 if(U_SUCCESS(*status
)) {
8429 result
= ucol_strcollRegular(&sColl
, &tColl
, status
);
8433 if(sNormIter
|| tNormIter
) {
8434 unorm_closeIter(sNormIter
);
8435 unorm_closeIter(tNormIter
);
8438 UTRACE_EXIT_VALUE_STATUS(result
, *status
)
8444 /* ucol_strcoll Main public API string comparison function */
8446 U_CAPI UCollationResult U_EXPORT2
8447 ucol_strcoll( const UCollator
*coll
,
8448 const UChar
*source
,
8449 int32_t sourceLength
,
8450 const UChar
*target
,
8451 int32_t targetLength
)
8455 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL
);
8456 if (UTRACE_LEVEL(UTRACE_VERBOSE
)) {
8457 UTRACE_DATA3(UTRACE_VERBOSE
, "coll=%p, source=%p, target=%p", coll
, source
, target
);
8458 UTRACE_DATA2(UTRACE_VERBOSE
, "source string = %vh ", source
, sourceLength
);
8459 UTRACE_DATA2(UTRACE_VERBOSE
, "target string = %vh ", target
, targetLength
);
8462 if(source
== NULL
|| target
== NULL
) {
8463 // do not crash, but return. Should have
8464 // status argument to return error.
8465 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8469 /* Quick check if source and target are same strings. */
8470 /* They should either both be NULL terminated or the explicit length should be set on both. */
8471 if (source
==target
&& sourceLength
==targetLength
) {
8472 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8476 /* Scan the strings. Find: */
8477 /* The length of any leading portion that is equal */
8478 /* Whether they are exactly equal. (in which case we just return) */
8479 const UChar
*pSrc
= source
;
8480 const UChar
*pTarg
= target
;
8481 int32_t equalLength
;
8483 if (sourceLength
== -1 && targetLength
== -1) {
8484 // Both strings are null terminated.
8485 // Scan through any leading equal portion.
8486 while (*pSrc
== *pTarg
&& *pSrc
!= 0) {
8490 if (*pSrc
== 0 && *pTarg
== 0) {
8491 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8494 equalLength
= (int32_t)(pSrc
- source
);
8498 // One or both strings has an explicit length.
8499 const UChar
*pSrcEnd
= source
+ sourceLength
;
8500 const UChar
*pTargEnd
= target
+ targetLength
;
8502 // Scan while the strings are bitwise ==, or until one is exhausted.
8504 if (pSrc
== pSrcEnd
|| pTarg
== pTargEnd
) {
8507 if ((*pSrc
== 0 && sourceLength
== -1) || (*pTarg
== 0 && targetLength
== -1)) {
8510 if (*pSrc
!= *pTarg
) {
8516 equalLength
= (int32_t)(pSrc
- source
);
8518 // If we made it all the way through both strings, we are done. They are ==
8519 if ((pSrc
==pSrcEnd
|| (pSrcEnd
<pSrc
&& *pSrc
==0)) && /* At end of src string, however it was specified. */
8520 (pTarg
==pTargEnd
|| (pTargEnd
<pTarg
&& *pTarg
==0))) /* and also at end of dest string */
8522 UTRACE_EXIT_VALUE(UCOL_EQUAL
);
8526 if (equalLength
> 0) {
8527 /* There is an identical portion at the beginning of the two strings. */
8528 /* If the identical portion ends within a contraction or a comibining */
8529 /* character sequence, back up to the start of that sequence. */
8531 // These values should already be set by the code above.
8532 //pSrc = source + equalLength; /* point to the first differing chars */
8533 //pTarg = target + equalLength;
8534 if ((pSrc
!= source
+sourceLength
&& ucol_unsafeCP(*pSrc
, coll
)) ||
8535 (pTarg
!= target
+targetLength
&& ucol_unsafeCP(*pTarg
, coll
)))
8537 // We are stopped in the middle of a contraction.
8538 // Scan backwards through the == part of the string looking for the start of the contraction.
8539 // It doesn't matter which string we scan, since they are the same in this region.
8545 while (equalLength
>0 && ucol_unsafeCP(*pSrc
, coll
));
8548 source
+= equalLength
;
8549 target
+= equalLength
;
8550 if (sourceLength
> 0) {
8551 sourceLength
-= equalLength
;
8553 if (targetLength
> 0) {
8554 targetLength
-= equalLength
;
8558 UErrorCode status
= U_ZERO_ERROR
;
8559 UCollationResult returnVal
;
8560 if(!coll
->latinOneUse
|| (sourceLength
> 0 && *source
&0xff00) || (targetLength
> 0 && *target
&0xff00)) {
8561 returnVal
= ucol_strcollRegular(coll
, source
, sourceLength
, target
, targetLength
, &status
);
8563 returnVal
= ucol_strcollUseLatin1(coll
, source
, sourceLength
, target
, targetLength
, &status
);
8565 UTRACE_EXIT_VALUE(returnVal
);
8569 /* convenience function for comparing strings */
8570 U_CAPI UBool U_EXPORT2
8571 ucol_greater( const UCollator
*coll
,
8572 const UChar
*source
,
8573 int32_t sourceLength
,
8574 const UChar
*target
,
8575 int32_t targetLength
)
8577 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
8581 /* convenience function for comparing strings */
8582 U_CAPI UBool U_EXPORT2
8583 ucol_greaterOrEqual( const UCollator
*coll
,
8584 const UChar
*source
,
8585 int32_t sourceLength
,
8586 const UChar
*target
,
8587 int32_t targetLength
)
8589 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
8593 /* convenience function for comparing strings */
8594 U_CAPI UBool U_EXPORT2
8595 ucol_equal( const UCollator
*coll
,
8596 const UChar
*source
,
8597 int32_t sourceLength
,
8598 const UChar
*target
,
8599 int32_t targetLength
)
8601 return (ucol_strcoll(coll
, source
, sourceLength
, target
, targetLength
)
8605 U_CAPI
void U_EXPORT2
8606 ucol_getUCAVersion(const UCollator
* coll
, UVersionInfo info
) {
8607 if(coll
&& coll
->UCA
) {
8608 uprv_memcpy(info
, coll
->UCA
->image
->UCAVersion
, sizeof(UVersionInfo
));
8612 #endif /* #if !UCONFIG_NO_COLLATION */