2 ******************************************************************************
3 * Copyright (C) 2001-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
9 * Modification History:
11 * Date Name Description
12 * 02/15/2001 synwee Modified all methods to process its own function
13 * instead of calling the equivalent c++ api (coleitr.h)
14 ******************************************************************************/
16 #include "unicode/utypes.h"
18 #if !UCONFIG_NO_COLLATION
20 #include "unicode/ucoleitr.h"
21 #include "unicode/ustring.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/uobject.h"
// Sizing constants and raw-memory helper macros used by the buffers below.
// NOTE(review): BUFFER_LENGTH is not referenced in the visible part of this file.
29 #define BUFFER_LENGTH 100
// Number of entries in the inline defaultBuffer array of each CE buffer.
31 #define DEFAULT_BUFFER_SIZE 16
// Element count of `array`; only meaningful for a true array, not a pointer.
34 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
// Copies `count` elements from src to dst with uprv_memcpy; the element size
// is taken from src.
36 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
// Allocates room for `count` objects of `type` with uprv_malloc and casts the
// result to `type *`.
38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
// Resizes `array` to hold `newSize` elements with uprv_realloc.
40 #define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])
// Releases storage with uprv_free.
42 #define DELETE_ARRAY(array) uprv_free((void *) (array))
// Alias for icu::collIterate.
// NOTE(review): the functions visible below use `collIterate` directly, not this alias.
44 typedef struct icu::collIterate collIterator
;
// NOTE(review): these two declarations are presumably members of RCEBuffer;
// the enclosing declaration is not visible in this excerpt.
// Inline storage for DEFAULT_BUFFER_SIZE entries; the RCEBuffer constructor
// points `buffer` at this array.
57 RCEI defaultBuffer
[DEFAULT_BUFFER_SIZE
];
// Stores one 32-bit collation element together with its low/high offsets.
66 void put(uint32_t ce
, int32_t ixLow
, int32_t ixHigh
);
// Constructs a buffer that starts out using the inline defaultBuffer, with a
// capacity of DEFAULT_BUFFER_SIZE entries.
70 RCEBuffer::RCEBuffer()
72 buffer
= defaultBuffer
;
74 bufferSize
= DEFAULT_BUFFER_SIZE
;
// Destructor. Only a buffer that is no longer the inline defaultBuffer is
// handled specially; the body of that branch is not visible in this excerpt.
77 RCEBuffer::~RCEBuffer()
79 if (buffer
!= defaultBuffer
) {
// Returns true when the buffer holds no entries (bufferIndex <= 0).
84 UBool
RCEBuffer::empty() const
86 return bufferIndex
<= 0;
// Appends one entry (ce plus its low/high offsets) in the slot at bufferIndex.
// When bufferIndex has reached bufferSize, a new array that is BUFFER_GROW
// entries larger is allocated and the existing bufferSize entries are copied
// into it before bufferSize is increased.
// NOTE(review): no NULL check of the NEW_ARRAY result is visible before
// ARRAY_COPY uses it -- confirm how allocation failure is handled.
89 void RCEBuffer::put(uint32_t ce
, int32_t ixLow
, int32_t ixHigh
)
91 if (bufferIndex
>= bufferSize
) {
92 RCEI
*newBuffer
= NEW_ARRAY(RCEI
, bufferSize
+ BUFFER_GROW
);
94 ARRAY_COPY(newBuffer
, buffer
, bufferSize
);
// Only a buffer other than the inline defaultBuffer is handled in this
// branch; the branch body is not visible in this excerpt.
96 if (buffer
!= defaultBuffer
) {
101 bufferSize
+= BUFFER_GROW
;
// Record the new entry in the slot addressed by bufferIndex.
104 buffer
[bufferIndex
].ce
= ce
;
105 buffer
[bufferIndex
].low
= ixLow
;
106 buffer
[bufferIndex
].high
= ixHigh
;
// Removes and returns the most recently stored entry: when bufferIndex > 0 it
// is decremented and a pointer to that slot is returned. The result for an
// empty buffer is not visible in this excerpt.
111 const RCEI
*RCEBuffer::get()
113 if (bufferIndex
> 0) {
114 return &buffer
[--bufferIndex
];
// NOTE(review): these two declarations are presumably members of PCEBuffer;
// the enclosing declaration is not visible in this excerpt.
// Inline storage for DEFAULT_BUFFER_SIZE entries; the PCEBuffer constructor
// points `buffer` at this array.
129 PCEI defaultBuffer
[DEFAULT_BUFFER_SIZE
];
// Stores one 64-bit processed collation element together with its low/high
// offsets.
139 void put(uint64_t ce
, int32_t ixLow
, int32_t ixHigh
);
// Constructs a buffer that starts out using the inline defaultBuffer, with a
// capacity of DEFAULT_BUFFER_SIZE entries.
143 PCEBuffer::PCEBuffer()
145 buffer
= defaultBuffer
;
147 bufferSize
= DEFAULT_BUFFER_SIZE
;
// Destructor. Frees `buffer` with DELETE_ARRAY, but only when it is no longer
// the inline defaultBuffer (which must not be passed to uprv_free).
150 PCEBuffer::~PCEBuffer()
152 if (buffer
!= defaultBuffer
) {
153 DELETE_ARRAY(buffer
);
// The body is not visible in this excerpt. ucol_nextProcessed() calls this on
// elems->pce->pceBuffer before it reads collation elements.
157 void PCEBuffer::reset()
// Returns true when the buffer holds no entries (bufferIndex <= 0).
162 UBool
PCEBuffer::empty() const
164 return bufferIndex
<= 0;
// Appends one entry (processed ce plus its low/high offsets) in the slot at
// bufferIndex. When bufferIndex has reached bufferSize, a new array that is
// BUFFER_GROW entries larger is allocated, the existing bufferSize entries are
// copied into it, and a previously heap-allocated buffer is freed.
// NOTE(review): no NULL check of the NEW_ARRAY result is visible before
// ARRAY_COPY uses it -- confirm how allocation failure is handled.
167 void PCEBuffer::put(uint64_t ce
, int32_t ixLow
, int32_t ixHigh
)
169 if (bufferIndex
>= bufferSize
) {
170 PCEI
*newBuffer
= NEW_ARRAY(PCEI
, bufferSize
+ BUFFER_GROW
);
172 ARRAY_COPY(newBuffer
, buffer
, bufferSize
);
// The inline defaultBuffer is an array member, so only a different buffer
// is released here.
174 if (buffer
!= defaultBuffer
) {
175 DELETE_ARRAY(buffer
);
179 bufferSize
+= BUFFER_GROW
;
// Record the new entry in the slot addressed by bufferIndex.
182 buffer
[bufferIndex
].ce
= ce
;
183 buffer
[bufferIndex
].low
= ixLow
;
184 buffer
[bufferIndex
].high
= ixHigh
;
// Removes and returns the most recently stored entry: when bufferIndex > 0 it
// is decremented and a pointer to that slot is returned. The result for an
// empty buffer is not visible in this excerpt.
189 const PCEI
*PCEBuffer::get()
191 if (bufferIndex
> 0) {
192 return &buffer
[--bufferIndex
];
199 * This inherits from UObject so that
200 * it can be allocated by new and the
201 * constructor for PCEBuffer is called.
// Per-iterator state used when producing processed (64-bit) collation
// elements. Only part of the declaration is visible in this excerpt; the code
// below also uses members named pceBuffer, toShift and isShifted.
203 struct UCollationPCE
: public UObject
// Collation strength, taken from the collator by init().
206 UCollationStrength strength
;
// The collator's variableTopValue shifted left by 16 bits (see init()).
209 uint32_t variableTop
;
// Initialises this object from the collator attached to `elems`.
211 UCollationPCE(UCollationElements
*elems
);
// (Re)loads strength, toShift and variableTop from `coll`.
214 void init(const UCollator
*coll
);
// ICU class-ID hooks; UOBJECT_DEFINE_RTTI_IMPLEMENTATION is invoked for
// this class further down.
216 virtual UClassID
getDynamicClassID() const;
217 static UClassID
getStaticClassID();
// ICU helper macro (unicode/uobject.h) invoked for UCollationPCE; it is the
// counterpart of the getStaticClassID()/getDynamicClassID() declarations in
// the struct.
220 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE
)
// Constructs the processed-CE state for `elems` by running init() with the
// collator stored in the iterator's iteratordata_.
222 UCollationPCE::UCollationPCE(UCollationElements
*elems
)
224 init(elems
->iteratordata_
.coll
);
// Loads the collator settings that processCE() depends on:
//   strength    - ucol_getStrength(coll)
//   toShift     - true when UCOL_ALTERNATE_HANDLING is UCOL_SHIFTED
//   variableTop - coll->variableTopValue << 16
// The attribute lookup uses a local UErrorCode that is not inspected in the
// visible code.
227 void UCollationPCE::init(const UCollator
*coll
)
229 UErrorCode status
= U_ZERO_ERROR
;
231 strength
= ucol_getStrength(coll
);
232 toShift
= ucol_getAttribute(coll
, UCOL_ALTERNATE_HANDLING
, &status
) == UCOL_SHIFTED
;
// Shifted left by 16; processCE() compares this value directly against a
// raw 32-bit CE.
234 variableTop
= coll
->variableTopValue
<< 16;
// Destructor; its body is not visible in this excerpt.
237 UCollationPCE::~UCollationPCE()
// Converts a raw 32-bit collation element into the 64-bit "processed" form
// returned at the end: primary << 48 | secondary << 32 | tertiary << 16 |
// quaternary. Reads strength, toShift, variableTop and isShifted from
// elems->pce and updates isShifted as a side effect, so the result for one CE
// can depend on the CEs processed before it.
246 inline uint64_t processCE(UCollationElements
*elems
, uint32_t ce
)
248 uint64_t primary
= 0, secondary
= 0, tertiary
= 0, quaternary
= 0;
250 // This is clean, but somewhat slow...
251 // We could apply the mask to ce and then
252 // just get all three orders...
// The case labels are not visible in this excerpt; per the fall-through
// comments, each strength level also extracts the orders listed after it.
253 switch(elems
->pce
->strength
) {
255 tertiary
= ucol_tertiaryOrder(ce
);
256 /* note fall-through */
259 secondary
= ucol_secondaryOrder(ce
);
260 /* note fall-through */
263 primary
= ucol_primaryOrder(ce
);
266 // **** This should probably handle continuations too. ****
267 // **** That means that we need 24 bits for the primary ****
268 // **** instead of the 16 that we're currently using. ****
269 // **** So we can lay out the 64 bits as: 24.12.12.16. ****
270 // **** Another complication with continuations is that ****
271 // **** the *second* CE is marked as a continuation, so ****
272 // **** we always have to peek ahead to know how long ****
273 // **** the primary is... ****
// Shifted case: either alternate handling is shifted and this CE has a
// non-zero primary below variableTop, or the previous CE was shifted and
// this one has no primary.
274 if ((elems
->pce
->toShift
&& elems
->pce
->variableTop
> ce
&& primary
!= 0)
275 || (elems
->pce
->isShifted
&& primary
== 0)) {
// An early return of UCOL_IGNORABLE is visible here; the condition that
// guards it is not visible in this excerpt.
278 return UCOL_IGNORABLE
;
// At quaternary strength or above, the primary weight is moved into the
// quaternary field.
281 if (elems
->pce
->strength
>= UCOL_QUATERNARY
) {
282 quaternary
= primary
;
// A shifted CE contributes nothing at the first three levels.
285 primary
= secondary
= tertiary
= 0;
286 elems
->pce
->isShifted
= TRUE
;
// Non-shifted path (the connecting `else` is not visible); the body of
// this strength check is likewise not visible.
288 if (elems
->pce
->strength
>= UCOL_QUATERNARY
) {
292 elems
->pce
->isShifted
= FALSE
;
// Pack the four weights, 16 bits apart, into one 64-bit value.
295 return primary
<< 48 | secondary
<< 32 | tertiary
<< 16 | quaternary
;
// Re-initialises the processed-CE state of `elems` from its current collator.
// Does nothing when the iterator has no pce object yet.
298 U_CAPI
void U_EXPORT2
299 uprv_init_pce(const UCollationElements
*elems
)
301 if (elems
->pce
!= NULL
) {
302 elems
->pce
->init(elems
->iteratordata_
.coll
);
308 /* public methods ---------------------------------------------------- */
// Creates a collation element iterator for `coll`.
// Visible behaviour: an early exit when *status already indicates failure;
// allocation of a UCollationElements with `new`, setting *status to
// U_MEMORY_ALLOCATION_ERROR when the result is NULL; marking the iterator as
// reset and as not owning a writable copy of the text; and initialising the
// embedded collIterate from (coll, text, textLength) through
// uprv_init_collIterate.
// The rest of the parameter list, the early-exit bodies and the return
// statement are not visible in this excerpt.
310 U_CAPI UCollationElements
* U_EXPORT2
311 ucol_openElements(const UCollator
*coll
,
316 if (U_FAILURE(*status
)) {
320 UCollationElements
*result
= new UCollationElements
;
321 if (result
== NULL
) {
322 *status
= U_MEMORY_ALLOCATION_ERROR
;
326 result
->reset_
= TRUE
;
327 result
->isWritable
= FALSE
;
333 uprv_init_collIterate(coll
, text
, textLength
, &result
->iteratordata_
, status
);
// Releases the memory owned by a collation element iterator.
// Visible steps: free ci->extendCEs; free ci->offsetBuffer when it is set;
// free the text when the iterator owns a writable copy (isWritable set and
// string non-NULL); and handle a non-NULL pce (that branch body is not
// visible in this excerpt).
// NOTE(review): no check of `elems` itself is visible before it is
// dereferenced -- confirm against the missing lines.
339 U_CAPI
void U_EXPORT2
340 ucol_closeElements(UCollationElements
*elems
)
343 collIterate
*ci
= &elems
->iteratordata_
;
346 uprv_free(ci
->extendCEs
);
349 if (ci
->offsetBuffer
) {
350 uprv_free(ci
->offsetBuffer
);
// The string is only owned by the iterator when isWritable is set.
353 if (elems
->isWritable
&& elems
->iteratordata_
.string
!= NULL
)
355 uprv_free((UChar
*)elems
->iteratordata_
.string
);
358 if (elems
->pce
!= NULL
) {
// Rewinds the iterator to the beginning of its text.
// Marks the iterator as reset, moves pos back to the start of the string,
// makes sure endp is known, empties the CE buffer (CEpos/toReturn point back
// at CEs), rebuilds the flags, clears the writable buffer and fcdPosition,
// and zeroes the offset-repeat bookkeeping.
366 U_CAPI
void U_EXPORT2
367 ucol_reset(UCollationElements
*elems
)
369 collIterate
*ci
= &(elems
->iteratordata_
);
370 elems
->reset_
= TRUE
;
371 ci
->pos
= ci
->string
;
// Compute the end pointer with u_strlen when the length is not already
// known or endp is unset.
372 if ((ci
->flags
& UCOL_ITER_HASLEN
) == 0 || ci
->endp
== NULL
) {
373 ci
->endp
= ci
->string
+ u_strlen(ci
->string
);
375 ci
->CEpos
= ci
->toReturn
= ci
->CEs
;
// Keep only UCOL_FORCE_HAN_IMPLICIT from the old flags and record that the
// length is now known.
376 ci
->flags
= (ci
->flags
& UCOL_FORCE_HAN_IMPLICIT
) | UCOL_ITER_HASLEN
;
377 if (ci
->coll
->normalizationMode
== UCOL_ON
) {
378 ci
->flags
|= UCOL_ITER_NORM
;
381 ci
->writableBuffer
.remove();
382 ci
->fcdPosition
= NULL
;
384 //ci->offsetReturn = ci->offsetStore = NULL;
385 ci
->offsetRepeatCount
= ci
->offsetRepeatValue
= 0;
// Sets the UCOL_FORCE_HAN_IMPLICIT flag on the iterator's collIterate.
// An early exit on a failing *status is visible, as is a path that sets
// *status to U_ILLEGAL_ARGUMENT_ERROR; the condition guarding the latter is
// not visible in this excerpt.
388 U_CAPI
void U_EXPORT2
389 ucol_forceHanImplicit(UCollationElements
*elems
, UErrorCode
*status
)
391 if (U_FAILURE(*status
)) {
396 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
400 elems
->iteratordata_
.flags
|= UCOL_FORCE_HAN_IMPLICIT
;
// Returns the next collation element as an int32_t.
// Returns UCOL_NULLORDER when *status already indicates failure. Otherwise
// clears the reset_ flag, fetches the next CE with ucol_getNextCE, and maps
// UCOL_NO_MORE_CES to UCOL_NULLORDER.
// The rest of the parameter list, the declaration of `result` and the final
// return are not visible in this excerpt.
403 U_CAPI
int32_t U_EXPORT2
404 ucol_next(UCollationElements
*elems
,
408 if (U_FAILURE(*status
)) {
409 return UCOL_NULLORDER
;
412 elems
->reset_
= FALSE
;
414 result
= (int32_t)ucol_getNextCE(elems
->iteratordata_
.coll
,
415 &elems
->iteratordata_
,
418 if (result
== UCOL_NO_MORE_CES
) {
419 result
= UCOL_NULLORDER
;
// Returns the next non-ignorable processed (64-bit) collation element.
// Returns UCOL_PROCESSED_NULLORDER when *status already indicates failure or
// when ucol_getNextCE reports UCOL_NO_MORE_CES. The text offsets before and
// after each raw CE are captured in low/high; an ixHigh out-parameter is
// tested near the end, but the code that writes the out-parameters is not
// visible in this excerpt.
// NOTE(review): low/high are uint32_t here although ucol_getOffset() returns
// int32_t and ucol_previousProcessed() declares them int32_t -- confirm
// whether the signedness difference is intended.
424 U_CAPI
int64_t U_EXPORT2
425 ucol_nextProcessed(UCollationElements
*elems
,
430 const UCollator
*coll
= elems
->iteratordata_
.coll
;
431 int64_t result
= UCOL_IGNORABLE
;
432 uint32_t low
= 0, high
= 0;
434 if (U_FAILURE(*status
)) {
435 return UCOL_PROCESSED_NULLORDER
;
// The processed-CE state is created lazily on first use.
// NOTE(review): no check of the `new` result is visible -- confirm.
438 if (elems
->pce
== NULL
) {
439 elems
->pce
= new UCollationPCE(elems
);
// The line connecting this call to the check above is not visible
// (possibly an `else`).
441 elems
->pce
->pceBuffer
.reset();
444 elems
->reset_
= FALSE
;
// Loop body: read one raw CE, bracketed by the offsets before and after.
447 low
= ucol_getOffset(elems
);
448 uint32_t ce
= (uint32_t) ucol_getNextCE(coll
, &elems
->iteratordata_
, status
);
449 high
= ucol_getOffset(elems
);
451 if (ce
== UCOL_NO_MORE_CES
) {
452 result
= UCOL_PROCESSED_NULLORDER
;
456 result
= processCE(elems
, ce
);
// Keep reading while processCE() reports the CE as ignorable.
457 } while (result
== UCOL_IGNORABLE
);
463 if (ixHigh
!= NULL
) {
// Returns the previous collation element as an int32_t.
// Returns UCOL_NULLORDER when *status already indicates failure. If the
// iterator has just been reset and still sits at the start of the text, it is
// first moved to the end of the text so that backward iteration starts there.
// UCOL_NO_MORE_CES from ucol_getPrevCE is mapped to UCOL_NULLORDER.
// The rest of the parameter list, the declaration of `result` and the final
// return are not visible in this excerpt.
470 U_CAPI
int32_t U_EXPORT2
471 ucol_previous(UCollationElements
*elems
,
474 if(U_FAILURE(*status
)) {
475 return UCOL_NULLORDER
;
481 if (elems
->reset_
&& (elems
->iteratordata_
.pos
== elems
->iteratordata_
.string
)) {
// The end pointer may not have been computed yet; derive it with
// u_strlen and record that the length is now known.
482 if (elems
->iteratordata_
.endp
== NULL
) {
483 elems
->iteratordata_
.endp
= elems
->iteratordata_
.string
+
484 u_strlen(elems
->iteratordata_
.string
);
485 elems
->iteratordata_
.flags
|= UCOL_ITER_HASLEN
;
// Start iterating backwards from the end of the text.
487 elems
->iteratordata_
.pos
= elems
->iteratordata_
.endp
;
488 elems
->iteratordata_
.fcdPosition
= elems
->iteratordata_
.endp
;
491 elems
->reset_
= FALSE
;
493 result
= (int32_t)ucol_getPrevCE(elems
->iteratordata_
.coll
,
494 &(elems
->iteratordata_
),
497 if (result
== UCOL_NO_MORE_CES
) {
498 result
= UCOL_NULLORDER
;
// Returns the previous non-ignorable processed (64-bit) collation element.
// Because processCE() carries state from one CE to the next, raw CEs are
// first collected backwards into a local buffer (rceb) until one with a
// non-zero primary is seen, then taken back out of that buffer, processed,
// and the non-ignorable results stored in elems->pce->pceBuffer. The value
// returned to the caller is taken from pceBuffer.
// Returns UCOL_PROCESSED_NULLORDER when *status already indicates failure or
// when pceBuffer is still empty after the text is exhausted.
// The rest of the parameter list, the declarations of `rceb` and `ce`, and
// several statements are not visible in this excerpt.
505 U_CAPI
int64_t U_EXPORT2
506 ucol_previousProcessed(UCollationElements
*elems
,
511 const UCollator
*coll
= elems
->iteratordata_
.coll
;
512 int64_t result
= UCOL_IGNORABLE
;
513 // int64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
514 // UCollationStrength strength = ucol_getStrength(coll);
515 // UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
516 // uint32_t variableTop = coll->variableTopValue;
517 int32_t low
= 0, high
= 0;
519 if (U_FAILURE(*status
)) {
520 return UCOL_PROCESSED_NULLORDER
;
// Continuation of a condition whose first line is not visible: the
// iterator still sits at the start of the text, so move it to the end
// before iterating backwards (same handling as in ucol_previous()).
524 (elems
->iteratordata_
.pos
== elems
->iteratordata_
.string
)) {
525 if (elems
->iteratordata_
.endp
== NULL
) {
526 elems
->iteratordata_
.endp
= elems
->iteratordata_
.string
+
527 u_strlen(elems
->iteratordata_
.string
);
528 elems
->iteratordata_
.flags
|= UCOL_ITER_HASLEN
;
531 elems
->iteratordata_
.pos
= elems
->iteratordata_
.endp
;
532 elems
->iteratordata_
.fcdPosition
= elems
->iteratordata_
.endp
;
// The processed-CE state is created lazily on first use.
// NOTE(review): no check of the `new` result is visible -- confirm.
535 if (elems
->pce
== NULL
) {
536 elems
->pce
= new UCollationPCE(elems
);
538 //elems->pce->pceBuffer.reset();
541 elems
->reset_
= FALSE
;
// Refill pceBuffer whenever it has been drained.
543 while (elems
->pce
->pceBuffer
.empty()) {
544 // buffer raw CEs up to non-ignorable primary
548 // **** do we need to reset rceb, or will it always be empty at this point ****
// Iterating backwards: the offset before the read is the high end of
// the CE's range and the offset after it is the low end.
550 high
= ucol_getOffset(elems
);
551 ce
= ucol_getPrevCE(coll
, &elems
->iteratordata_
, status
);
552 low
= ucol_getOffset(elems
);
554 if (ce
== UCOL_NO_MORE_CES
) {
555 if (! rceb
.empty()) {
562 rceb
.put(ce
, low
, high
);
// Stop collecting once a CE with a non-zero primary has been buffered.
563 } while ((ce
& UCOL_PRIMARYMASK
) == 0);
565 // process the raw CEs
566 while (! rceb
.empty()) {
567 const RCEI
*rcei
= rceb
.get();
569 result
= processCE(elems
, rcei
->ce
);
// Only non-ignorable results are kept, together with their offsets.
571 if (result
!= UCOL_IGNORABLE
) {
572 elems
->pce
->pceBuffer
.put(result
, rcei
->low
, rcei
->high
);
// Nothing could be produced: report the end of iteration.
578 if (elems
->pce
->pceBuffer
.empty()) {
579 // **** Is -1 the right value for ixLow, ixHigh? ****
584 if (ixHigh
!= NULL
) {
588 return UCOL_PROCESSED_NULLORDER
;
// Take the next processed CE out of pceBuffer and report its offsets
// through the optional out-parameters.
591 const PCEI
*pcei
= elems
->pce
->pceBuffer
.get();
597 if (ixHigh
!= NULL
) {
598 *ixHigh
= pcei
->high
;
// Looks up the maximum expansion length recorded for collation element
// `order`.
// Both a UCOL_GETMAXEXPANSION macro call and a hand-written search appear
// below; the preprocessor lines that presumably select between them are not
// visible in this excerpt.
// The hand-written search masks `order` according to the collator strength,
// binary-searches the range [endExpansionCE, lastEndExpansionCE] with the
// same mask applied to each table entry, and on a match reads the size at the
// same index in expansionCESize. A special case for CEs whose low 16 bits are
// 0x00C0 follows; its body, the declarations of `result` and `mid`, and the
// return statement are not visible.
604 U_CAPI
int32_t U_EXPORT2
605 ucol_getMaxExpansion(const UCollationElements
*elems
,
611 UCOL_GETMAXEXPANSION(elems
->iteratordata_
.coll
, (uint32_t)order
, result
);
613 const UCollator
*coll
= elems
->iteratordata_
.coll
;
614 const uint32_t *start
;
615 const uint32_t *limit
;
617 uint32_t strengthMask
= 0;
618 uint32_t mOrder
= (uint32_t) order
;
// Build the mask from the strength; the case labels are not visible in
// this excerpt.
620 switch (coll
->strength
)
623 strengthMask
|= UCOL_TERTIARYORDERMASK
;
627 strengthMask
|= UCOL_SECONDARYORDERMASK
;
631 strengthMask
|= UCOL_PRIMARYORDERMASK
;
634 mOrder
&= strengthMask
;
635 start
= (coll
)->endExpansionCE
;
636 limit
= (coll
)->lastEndExpansionCE
;
// Binary search that narrows [start, limit] until the two are adjacent.
638 while (start
< limit
- 1) {
639 mid
= start
+ ((limit
- start
) >> 1);
640 if (mOrder
<= (*mid
& strengthMask
)) {
647 // FIXME: with a masked search, there might be more than one hit,
648 // so we need to look forward and backward from the match to find all
// Check both remaining candidates; the size table is indexed by the
// candidate's position relative to endExpansionCE.
650 if ((*start
& strengthMask
) == mOrder
) {
651 result
= *((coll
)->expansionCESize
+ (start
- (coll
)->endExpansionCE
));
652 } else if ((*limit
& strengthMask
) == mOrder
) {
653 result
= *(coll
->expansionCESize
+ (limit
- coll
->endExpansionCE
));
654 } else if ((mOrder
& 0xFFFF) == 0x00C0) {
// Points the iterator at a new text.
// Visible steps: early exit when *status already indicates failure; free the
// previous text if the iterator owned a writable copy; clear isWritable;
// release the offset buffer and any extendCEs storage left over from the
// previous text; re-initialise the collIterate for (text, textLength); and
// mark the iterator as reset.
// The rest of the parameter list and several statements are not visible in
// this excerpt.
664 U_CAPI
void U_EXPORT2
665 ucol_setText( UCollationElements
*elems
,
670 if (U_FAILURE(*status
)) {
674 if (elems
->isWritable
&& elems
->iteratordata_
.string
!= NULL
)
676 uprv_free((UChar
*)elems
->iteratordata_
.string
);
683 elems
->isWritable
= FALSE
;
685 /* free offset buffer to avoid memory leak before initializing. */
686 ucol_freeOffsetBuffer(&(elems
->iteratordata_
));
687 /* Ensure that previously allocated extendCEs is freed before setting to NULL. */
688 if (elems
->iteratordata_
.extendCEs
!= NULL
) {
689 uprv_free(elems
->iteratordata_
.extendCEs
);
691 uprv_init_collIterate(elems
->iteratordata_
.coll
, text
, textLength
,
692 &elems
->iteratordata_
, status
);
694 elems
->reset_
= TRUE
;
// Returns the iterator's current offset into the original text.
// Sources are checked in this order:
//   1. offsetRepeatValue, while a repeat is pending (offsetRepeatCount > 0
//      and offsetRepeatValue != 0);
//   2. the value at offsetReturn, when that pointer is set;
//   3. while reading from the normalization buffer (UCOL_ITER_INNORMBUF),
//      fcdPosition relative to the start of the string (the result for a
//      NULL fcdPosition is not visible in this excerpt);
//   4. otherwise pos relative to the start of the string.
697 U_CAPI
int32_t U_EXPORT2
698 ucol_getOffset(const UCollationElements
*elems
)
700 const collIterate
*ci
= &(elems
->iteratordata_
);
702 if (ci
->offsetRepeatCount
> 0 && ci
->offsetRepeatValue
!= 0) {
703 return ci
->offsetRepeatValue
;
706 if (ci
->offsetReturn
!= NULL
) {
707 return *ci
->offsetReturn
;
710 // While processing characters in the normalization buffer, getOffset will
711 // return the next non-normalized character.
712 // This should be in line with the old implementation, since the old code uses
713 // nextDecomp in normalizer which also decomposes the string till the
714 // first base character is found.
715 if (ci
->flags
& UCOL_ITER_INNORMBUF
) {
716 if (ci
->fcdPosition
== NULL
) {
719 return (int32_t)(ci
->fcdPosition
- ci
->string
);
722 return (int32_t)(ci
->pos
- ci
->string
);
// Moves the iterator to `offset` within the original text.
// Visible steps: early exit when *status already indicates failure; set pos
// to string + offset; empty the CE buffer; restore origFlags if the iterator
// was reading from the normalization buffer; compute endp and set
// UCOL_ITER_HASLEN if the length was not yet known; clear fcdPosition and
// reset_; and reset the offset bookkeeping (offsetReturn, offsetStore,
// repeat count/value).
// NOTE(review): no range check of `offset` against the text length is
// visible -- confirm whether callers are expected to guarantee it.
// The rest of the parameter list is not visible in this excerpt.
726 U_CAPI
void U_EXPORT2
727 ucol_setOffset(UCollationElements
*elems
,
731 if (U_FAILURE(*status
)) {
735 // This method cleans up any use of the writable buffer and points back to
736 // the original string.
737 collIterate
*ci
= &(elems
->iteratordata_
);
738 ci
->pos
= ci
->string
+ offset
;
739 ci
->CEpos
= ci
->toReturn
= ci
->CEs
;
740 if (ci
->flags
& UCOL_ITER_INNORMBUF
) {
741 ci
->flags
= ci
->origFlags
;
743 if ((ci
->flags
& UCOL_ITER_HASLEN
) == 0) {
744 ci
->endp
= ci
->string
+ u_strlen(ci
->string
);
745 ci
->flags
|= UCOL_ITER_HASLEN
;
747 ci
->fcdPosition
= NULL
;
// An explicit offset means the iterator is no longer in its freshly
// reset state.
748 elems
->reset_
= FALSE
;
750 ci
->offsetReturn
= NULL
;
751 ci
->offsetStore
= ci
->offsetBuffer
;
752 ci
->offsetRepeatCount
= ci
->offsetRepeatValue
= 0;
// Extracts the primary order of a collation element: masks `order` with
// UCOL_PRIMARYMASK and shifts the result right by UCOL_PRIMARYORDERSHIFT.
// NOTE(review): `order` is a signed int32_t. If the mask keeps the sign bit,
// the right shift operates on a negative value -- confirm that callers expect
// the resulting sign-extended value.
755 U_CAPI
int32_t U_EXPORT2
756 ucol_primaryOrder (int32_t order
)
758 order
&= UCOL_PRIMARYMASK
;
759 return (order
>> UCOL_PRIMARYORDERSHIFT
);
// Extracts the secondary order of a collation element: masks `order` with
// UCOL_SECONDARYMASK and shifts the result right by UCOL_SECONDARYORDERSHIFT.
762 U_CAPI
int32_t U_EXPORT2
763 ucol_secondaryOrder (int32_t order
)
765 order
&= UCOL_SECONDARYMASK
;
766 return (order
>> UCOL_SECONDARYORDERSHIFT
);
// Extracts the tertiary order of a collation element: `order` masked with
// UCOL_TERTIARYMASK (no shift is applied).
769 U_CAPI
int32_t U_EXPORT2
770 ucol_tertiaryOrder (int32_t order
)
772 return (order
& UCOL_TERTIARYMASK
);
// Frees the offset buffer of `s`, if any.
// Does nothing when `s` or s->offsetBuffer is NULL. After freeing, the
// pointer is set to NULL and offsetBufferSize to 0, so a second call finds
// nothing left to free.
776 void ucol_freeOffsetBuffer(collIterate
*s
) {
777 if (s
!= NULL
&& s
->offsetBuffer
!= NULL
) {
778 uprv_free(s
->offsetBuffer
);
779 s
->offsetBuffer
= NULL
;
780 s
->offsetBufferSize
= 0;
784 #endif /* #if !UCONFIG_NO_COLLATION */