2 ******************************************************************************
3 * Copyright (C) 2001-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
9 * Modification History:
11 * Date Name Description
12 * 02/15/2001 synwee Modified all methods to process its own function
13 * instead of calling the equivalent c++ api (coleitr.h)
14 ******************************************************************************/
16 #include "unicode/utypes.h"
18 #if !UCONFIG_NO_COLLATION
20 #include "unicode/ucoleitr.h"
21 #include "unicode/ustring.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/uobject.h"
/* NOTE(review): no use of BUFFER_LENGTH is visible in this part of the file - confirm it is still needed. */
29 #define BUFFER_LENGTH 100
/* Number of entries in the defaultBuffer arrays that RCEBuffer and PCEBuffer start out with. */
31 #define DEFAULT_BUFFER_SIZE 16
/* Element count of a true array; the argument must be an array, not a pointer. */
34 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
/* Copies `count` elements from src to dst with uprv_memcpy; the element size is taken from src. */
36 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
/* Requests storage for `count` objects of `type` from uprv_malloc and casts the result to `type *`. */
38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
/* Resizes `array` to `newSize` elements through uprv_realloc; the result is not cast back. */
40 #define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])
/* Releases storage through uprv_free. */
42 #define DELETE_ARRAY(array) uprv_free((void *) (array))
44 typedef struct collIterate collIterator
;
57 RCEI defaultBuffer
[DEFAULT_BUFFER_SIZE
];
66 void put(uint32_t ce
, int32_t ixLow
, int32_t ixHigh
);
70 RCEBuffer::RCEBuffer()
72 buffer
= defaultBuffer
;
74 bufferSize
= DEFAULT_BUFFER_SIZE
;
77 RCEBuffer::~RCEBuffer()
79 if (buffer
!= defaultBuffer
) {
84 UBool
RCEBuffer::empty() const
86 return bufferIndex
<= 0;
89 void RCEBuffer::put(uint32_t ce
, int32_t ixLow
, int32_t ixHigh
)
91 if (bufferIndex
>= bufferSize
) {
92 RCEI
*newBuffer
= NEW_ARRAY(RCEI
, bufferSize
+ BUFFER_GROW
);
94 ARRAY_COPY(newBuffer
, buffer
, bufferSize
);
96 if (buffer
!= defaultBuffer
) {
101 bufferSize
+= BUFFER_GROW
;
104 buffer
[bufferIndex
].ce
= ce
;
105 buffer
[bufferIndex
].low
= ixLow
;
106 buffer
[bufferIndex
].high
= ixHigh
;
111 const RCEI
*RCEBuffer::get()
113 if (bufferIndex
> 0) {
114 return &buffer
[--bufferIndex
];
129 PCEI defaultBuffer
[DEFAULT_BUFFER_SIZE
];
139 void put(uint64_t ce
, int32_t ixLow
, int32_t ixHigh
);
143 PCEBuffer::PCEBuffer()
145 buffer
= defaultBuffer
;
147 bufferSize
= DEFAULT_BUFFER_SIZE
;
150 PCEBuffer::~PCEBuffer()
152 if (buffer
!= defaultBuffer
) {
153 DELETE_ARRAY(buffer
);
157 void PCEBuffer::reset()
162 UBool
PCEBuffer::empty() const
164 return bufferIndex
<= 0;
167 void PCEBuffer::put(uint64_t ce
, int32_t ixLow
, int32_t ixHigh
)
169 if (bufferIndex
>= bufferSize
) {
170 PCEI
*newBuffer
= NEW_ARRAY(PCEI
, bufferSize
+ BUFFER_GROW
);
172 ARRAY_COPY(newBuffer
, buffer
, bufferSize
);
174 if (buffer
!= defaultBuffer
) {
175 DELETE_ARRAY(buffer
);
179 bufferSize
+= BUFFER_GROW
;
182 buffer
[bufferIndex
].ce
= ce
;
183 buffer
[bufferIndex
].low
= ixLow
;
184 buffer
[bufferIndex
].high
= ixHigh
;
189 const PCEI
*PCEBuffer::get()
191 if (bufferIndex
> 0) {
192 return &buffer
[--bufferIndex
];
199 * This inherits from UObject so that
200 * it can be allocated by new and the
201 * constructor for PCEBuffer is called.
203 struct UCollationPCE
: public UObject
206 UCollationStrength strength
;
209 uint32_t variableTop
;
211 UCollationPCE(UCollationElements
*elems
);
214 void init(const UCollator
*coll
);
216 virtual UClassID
getDynamicClassID() const;
217 static UClassID
getStaticClassID();
220 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE
)
222 UCollationPCE::UCollationPCE(UCollationElements
*elems
)
224 init(elems
->iteratordata_
.coll
);
227 void UCollationPCE::init(const UCollator
*coll
)
229 UErrorCode status
= U_ZERO_ERROR
;
231 strength
= ucol_getStrength(coll
);
232 toShift
= ucol_getAttribute(coll
, UCOL_ALTERNATE_HANDLING
, &status
) == UCOL_SHIFTED
;
234 variableTop
= coll
->variableTopValue
<< 16;
237 UCollationPCE::~UCollationPCE()
246 inline uint64_t processCE(UCollationElements
*elems
, uint32_t ce
)
248 uint64_t primary
= 0, secondary
= 0, tertiary
= 0, quaternary
= 0;
250 // This is clean, but somewhat slow...
251 // We could apply the mask to ce and then
252 // just get all three orders...
253 switch(elems
->pce
->strength
) {
255 tertiary
= ucol_tertiaryOrder(ce
);
256 /* note fall-through */
259 secondary
= ucol_secondaryOrder(ce
);
260 /* note fall-through */
263 primary
= ucol_primaryOrder(ce
);
267 if (elems
->pce
->toShift
&& (elems
->pce
->variableTop
> ce
&& primary
!= 0)
268 || (elems
->pce
->isShifted
&& primary
== 0)) {
271 return UCOL_IGNORABLE
;
274 if (elems
->pce
->strength
>= UCOL_QUATERNARY
) {
275 quaternary
= primary
;
278 primary
= secondary
= tertiary
= 0;
279 elems
->pce
->isShifted
= TRUE
;
281 if (elems
->pce
->strength
>= UCOL_QUATERNARY
) {
285 elems
->pce
->isShifted
= FALSE
;
289 return primary
<< 48 | secondary
<< 32 | tertiary
<< 16 | quaternary
;
292 U_CAPI
void U_EXPORT2
293 uprv_init_pce(const UCollationElements
*elems
)
295 if (elems
->pce
!= NULL
) {
296 elems
->pce
->init(elems
->iteratordata_
.coll
);
302 /* public methods ---------------------------------------------------- */
304 U_CAPI UCollationElements
* U_EXPORT2
305 ucol_openElements(const UCollator
*coll
,
310 UCollationElements
*result
;
312 if (U_FAILURE(*status
)) {
316 result
= (UCollationElements
*)uprv_malloc(sizeof(UCollationElements
));
318 if (result
== NULL
) {
319 *status
= U_MEMORY_ALLOCATION_ERROR
;
323 result
->reset_
= TRUE
;
324 result
->isWritable
= FALSE
;
330 uprv_init_collIterate(coll
, text
, textLength
, &result
->iteratordata_
);
335 U_CAPI
void U_EXPORT2
336 ucol_closeElements(UCollationElements
*elems
)
339 collIterate
*ci
= &elems
->iteratordata_
;
342 if (ci
->writableBuffer
!= ci
->stackWritableBuffer
) {
343 uprv_free(ci
->writableBuffer
);
347 uprv_free(ci
->extendCEs
);
350 if (ci
->offsetBuffer
) {
351 uprv_free(ci
->offsetBuffer
);
355 if (elems
->isWritable
&& elems
->iteratordata_
.string
!= NULL
)
357 uprv_free(elems
->iteratordata_
.string
);
360 if (elems
->pce
!= NULL
) {
368 U_CAPI
void U_EXPORT2
369 ucol_reset(UCollationElements
*elems
)
371 collIterate
*ci
= &(elems
->iteratordata_
);
372 elems
->reset_
= TRUE
;
373 ci
->pos
= ci
->string
;
374 if ((ci
->flags
& UCOL_ITER_HASLEN
) == 0 || ci
->endp
== NULL
) {
375 ci
->endp
= ci
->string
+ u_strlen(ci
->string
);
377 ci
->CEpos
= ci
->toReturn
= ci
->CEs
;
378 ci
->flags
= UCOL_ITER_HASLEN
;
379 if (ci
->coll
->normalizationMode
== UCOL_ON
) {
380 ci
->flags
|= UCOL_ITER_NORM
;
383 if (ci
->stackWritableBuffer
!= ci
->writableBuffer
) {
384 uprv_free(ci
->writableBuffer
);
385 ci
->writableBuffer
= ci
->stackWritableBuffer
;
386 ci
->writableBufSize
= UCOL_WRITABLE_BUFFER_SIZE
;
388 ci
->fcdPosition
= NULL
;
390 //ci->offsetReturn = ci->offsetStore = NULL;
391 ci
->offsetRepeatCount
= ci
->offsetRepeatValue
= 0;
394 U_CAPI
int32_t U_EXPORT2
395 ucol_next(UCollationElements
*elems
,
399 if (U_FAILURE(*status
)) {
400 return UCOL_NULLORDER
;
403 elems
->reset_
= FALSE
;
405 result
= (int32_t)ucol_getNextCE(elems
->iteratordata_
.coll
,
406 &elems
->iteratordata_
,
409 if (result
== UCOL_NO_MORE_CES
) {
410 result
= UCOL_NULLORDER
;
415 U_CAPI
int64_t U_EXPORT2
416 ucol_nextProcessed(UCollationElements
*elems
,
421 const UCollator
*coll
= elems
->iteratordata_
.coll
;
422 int64_t result
= UCOL_IGNORABLE
;
423 uint32_t low
= 0, high
= 0;
425 if (U_FAILURE(*status
)) {
426 return UCOL_PROCESSED_NULLORDER
;
429 if (elems
->pce
== NULL
) {
430 elems
->pce
= new UCollationPCE(elems
);
432 elems
->pce
->pceBuffer
.reset();
435 elems
->reset_
= FALSE
;
438 low
= ucol_getOffset(elems
);
439 uint32_t ce
= (uint32_t) ucol_getNextCE(coll
, &elems
->iteratordata_
, status
);
440 high
= ucol_getOffset(elems
);
442 if (ce
== UCOL_NO_MORE_CES
) {
443 result
= UCOL_PROCESSED_NULLORDER
;
447 result
= processCE(elems
, ce
);
448 } while (result
== UCOL_IGNORABLE
);
454 if (ixHigh
!= NULL
) {
461 U_CAPI
int32_t U_EXPORT2
462 ucol_previous(UCollationElements
*elems
,
465 if(U_FAILURE(*status
)) {
466 return UCOL_NULLORDER
;
472 if (elems
->reset_
&& (elems
->iteratordata_
.pos
== elems
->iteratordata_
.string
)) {
473 if (elems
->iteratordata_
.endp
== NULL
) {
474 elems
->iteratordata_
.endp
= elems
->iteratordata_
.string
+
475 u_strlen(elems
->iteratordata_
.string
);
476 elems
->iteratordata_
.flags
|= UCOL_ITER_HASLEN
;
478 elems
->iteratordata_
.pos
= elems
->iteratordata_
.endp
;
479 elems
->iteratordata_
.fcdPosition
= elems
->iteratordata_
.endp
;
482 elems
->reset_
= FALSE
;
484 result
= (int32_t)ucol_getPrevCE(elems
->iteratordata_
.coll
,
485 &(elems
->iteratordata_
),
488 if (result
== UCOL_NO_MORE_CES
) {
489 result
= UCOL_NULLORDER
;
496 U_CAPI
int64_t U_EXPORT2
497 ucol_previousProcessed(UCollationElements
*elems
,
502 const UCollator
*coll
= elems
->iteratordata_
.coll
;
503 int64_t result
= UCOL_IGNORABLE
;
504 // int64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
505 // UCollationStrength strength = ucol_getStrength(coll);
506 // UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
507 // uint32_t variableTop = coll->variableTopValue;
508 int32_t low
= 0, high
= 0;
510 if (U_FAILURE(*status
)) {
511 return UCOL_PROCESSED_NULLORDER
;
515 (elems
->iteratordata_
.pos
== elems
->iteratordata_
.string
)) {
516 if (elems
->iteratordata_
.endp
== NULL
) {
517 elems
->iteratordata_
.endp
= elems
->iteratordata_
.string
+
518 u_strlen(elems
->iteratordata_
.string
);
519 elems
->iteratordata_
.flags
|= UCOL_ITER_HASLEN
;
522 elems
->iteratordata_
.pos
= elems
->iteratordata_
.endp
;
523 elems
->iteratordata_
.fcdPosition
= elems
->iteratordata_
.endp
;
526 if (elems
->pce
== NULL
) {
527 elems
->pce
= new UCollationPCE(elems
);
529 //elems->pce->pceBuffer.reset();
532 elems
->reset_
= FALSE
;
534 while (elems
->pce
->pceBuffer
.empty()) {
535 // buffer raw CEs up to non-ignorable primary
539 // **** do we need to reset rceb, or will it always be empty at this point ****
541 high
= ucol_getOffset(elems
);
542 ce
= ucol_getPrevCE(coll
, &elems
->iteratordata_
, status
);
543 low
= ucol_getOffset(elems
);
545 if (ce
== UCOL_NO_MORE_CES
) {
546 if (! rceb
.empty()) {
553 rceb
.put(ce
, low
, high
);
554 } while ((ce
& UCOL_PRIMARYMASK
) == 0);
556 // process the raw CEs
557 while (! rceb
.empty()) {
558 const RCEI
*rcei
= rceb
.get();
560 result
= processCE(elems
, rcei
->ce
);
562 if (result
!= UCOL_IGNORABLE
) {
563 elems
->pce
->pceBuffer
.put(result
, rcei
->low
, rcei
->high
);
569 if (elems
->pce
->pceBuffer
.empty()) {
570 // **** Is -1 the right value for ixLow, ixHigh? ****
575 if (ixHigh
!= NULL
) {
579 return UCOL_PROCESSED_NULLORDER
;
582 const PCEI
*pcei
= elems
->pce
->pceBuffer
.get();
588 if (ixHigh
!= NULL
) {
589 *ixHigh
= pcei
->high
;
595 U_CAPI
int32_t U_EXPORT2
596 ucol_getMaxExpansion(const UCollationElements
*elems
,
602 UCOL_GETMAXEXPANSION(elems
->iteratordata_
.coll
, (uint32_t)order
, result
);
604 const UCollator
*coll
= elems
->iteratordata_
.coll
;
605 const uint32_t *start
;
606 const uint32_t *limit
;
608 uint32_t strengthMask
= 0;
609 uint32_t mOrder
= (uint32_t) order
;
611 switch (coll
->strength
)
614 strengthMask
|= UCOL_TERTIARYORDERMASK
;
618 strengthMask
|= UCOL_SECONDARYORDERMASK
;
622 strengthMask
|= UCOL_PRIMARYORDERMASK
;
625 mOrder
&= strengthMask
;
626 start
= (coll
)->endExpansionCE
;
627 limit
= (coll
)->lastEndExpansionCE
;
629 while (start
< limit
- 1) {
630 mid
= start
+ ((limit
- start
) >> 1);
631 if (mOrder
<= (*mid
& strengthMask
)) {
638 // FIXME: with a masked search, there might be more than one hit,
639 // so we need to look forward and backward from the match to find all
641 if ((*start
& strengthMask
) == mOrder
) {
642 result
= *((coll
)->expansionCESize
+ (start
- (coll
)->endExpansionCE
));
643 } else if ((*limit
& strengthMask
) == mOrder
) {
644 result
= *(coll
->expansionCESize
+ (limit
- coll
->endExpansionCE
));
645 } else if ((mOrder
& 0xFFFF) == 0x00C0) {
655 U_CAPI
void U_EXPORT2
656 ucol_setText( UCollationElements
*elems
,
661 if (U_FAILURE(*status
)) {
665 if (elems
->isWritable
&& elems
->iteratordata_
.string
!= NULL
)
667 uprv_free(elems
->iteratordata_
.string
);
674 elems
->isWritable
= FALSE
;
676 /* free offset buffer to avoid memory leak before initializing. */
677 freeOffsetBuffer(&(elems
->iteratordata_
));
678 uprv_init_collIterate(elems
->iteratordata_
.coll
, text
, textLength
,
679 &elems
->iteratordata_
);
681 elems
->reset_
= TRUE
;
684 U_CAPI
int32_t U_EXPORT2
685 ucol_getOffset(const UCollationElements
*elems
)
687 const collIterate
*ci
= &(elems
->iteratordata_
);
689 if (ci
->offsetRepeatCount
> 0 && ci
->offsetRepeatValue
!= 0) {
690 return ci
->offsetRepeatValue
;
693 if (ci
->offsetReturn
!= NULL
) {
694 return *ci
->offsetReturn
;
697 // while processing characters in normalization buffer getOffset will
698 // return the next non-normalized character.
699 // should be inline with the old implementation since the old codes uses
700 // nextDecomp in normalizer which also decomposes the string till the
701 // first base character is found.
702 if (ci
->flags
& UCOL_ITER_INNORMBUF
) {
703 if (ci
->fcdPosition
== NULL
) {
706 return (int32_t)(ci
->fcdPosition
- ci
->string
);
709 return (int32_t)(ci
->pos
- ci
->string
);
713 U_CAPI
void U_EXPORT2
714 ucol_setOffset(UCollationElements
*elems
,
718 if (U_FAILURE(*status
)) {
722 // this methods will clean up any use of the writable buffer and points to
723 // the original string
724 collIterate
*ci
= &(elems
->iteratordata_
);
725 ci
->pos
= ci
->string
+ offset
;
726 ci
->CEpos
= ci
->toReturn
= ci
->CEs
;
727 if (ci
->flags
& UCOL_ITER_INNORMBUF
) {
728 ci
->flags
= ci
->origFlags
;
730 if ((ci
->flags
& UCOL_ITER_HASLEN
) == 0) {
731 ci
->endp
= ci
->string
+ u_strlen(ci
->string
);
732 ci
->flags
|= UCOL_ITER_HASLEN
;
734 ci
->fcdPosition
= NULL
;
735 elems
->reset_
= FALSE
;
737 ci
->offsetReturn
= NULL
;
738 ci
->offsetStore
= ci
->offsetBuffer
;
739 ci
->offsetRepeatCount
= ci
->offsetRepeatValue
= 0;
742 U_CAPI
int32_t U_EXPORT2
743 ucol_primaryOrder (int32_t order
)
745 order
&= UCOL_PRIMARYMASK
;
746 return (order
>> UCOL_PRIMARYORDERSHIFT
);
749 U_CAPI
int32_t U_EXPORT2
750 ucol_secondaryOrder (int32_t order
)
752 order
&= UCOL_SECONDARYMASK
;
753 return (order
>> UCOL_SECONDARYORDERSHIFT
);
756 U_CAPI
int32_t U_EXPORT2
757 ucol_tertiaryOrder (int32_t order
)
759 return (order
& UCOL_TERTIARYMASK
);
762 #endif /* #if !UCONFIG_NO_COLLATION */