2 *******************************************************************************
3 * Copyright (C) 1996-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
11 * Created by: Helena Shih
13 * Modification History:
15 * Date Name Description
17 * 6/23/97 helena Adding comments to make code more readable.
18 * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
19 * 12/10/99 aliu Ported Thai collation support from Java.
20 * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
21 * 02/19/01 swquek Removed CollationElementIterator() since it is
22 * private constructor and no calls are made to it
23 * 2012-2014 markus Rewritten in C++ again.
26 #include "unicode/utypes.h"
28 #if !UCONFIG_NO_COLLATION
30 #include "unicode/coleitr.h"
31 #include "unicode/tblcoll.h"
32 #include "unicode/ustring.h"
34 #include "collation.h"
35 #include "collationdata.h"
36 #include "collationiterator.h"
37 #include "collationsets.h"
38 #include "collationtailoring.h"
41 #include "utf16collationiterator.h"
44 /* Constants --------------------------------------------------------------- */
// Invokes the RTTI-implementation macro for CollationElementIterator.
// NOTE(review): the macro is defined outside this file; presumably it emits the
// class-ID boilerplate for the class -- confirm against the ICU headers.
48 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator
)
50 /* CollationElementIterator public constructor/destructor ------------------ */
// Copy constructor.
// The initializer list copy-constructs the UObject base from `other` and puts
// every member into an empty state: iter_, rbc_ and offsets_ are NULL,
// otherHalf_ and dir_ are 0.
// NOTE(review): the constructor body is not visible in this excerpt; presumably
// it copies the state of `other` -- confirm.
52 CollationElementIterator::CollationElementIterator(
53 const CollationElementIterator
& other
)
54 : UObject(other
), iter_(NULL
), rbc_(NULL
), otherHalf_(0), dir_(0), offsets_(NULL
) {
// Destructor.
// NOTE(review): the body is not visible in this excerpt, so which members it
// releases cannot be confirmed from here.
58 CollationElementIterator::~CollationElementIterator()
64 /* CollationElementIterator public methods --------------------------------- */
/**
 * Builds the first of the two old-style 32-bit CEs from a 64-bit CE that the
 * caller has already split into two 32-bit words.
 *
 * Bit layout of the result:
 *   bits 31..16 = bits 31..16 of p
 *   bits 15..8  = bits 31..24 of lower32
 *   bits  7..0  = bits 15..8  of lower32
 *
 * @param p       upper 32 bits of the 64-bit CE
 * @param lower32 lower 32 bits of the 64-bit CE
 * @return the first 32-bit half
 */
uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
    return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
}
/**
 * Builds the second of the two old-style 32-bit CEs from a 64-bit CE that the
 * caller has already split into two 32-bit words.
 *
 * Bit layout of the result:
 *   bits 31..16 = bits 15..0  of p
 *   bits 15..8  = bits 23..16 of lower32
 *   bits  5..0  = bits  5..0  of lower32
 * Bits 7..6 of lower32 are deliberately dropped (mask 0x3f); the call sites
 * describe the resulting CEs as being "without quaternary bits".
 *
 * @param p       upper 32 bits of the 64-bit CE
 * @param lower32 lower 32 bits of the 64-bit CE
 * @return the second 32-bit half; 0 when none of the selected bits are set
 */
uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
    return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
}
74 UBool
ceNeedsTwoParts(int64_t ce
) {
75 return (ce
& INT64_C(0xffff00ff003f)) != 0;
// Returns the current text offset of this iterator.
// When iterating backwards (dir_ < 0) with a non-empty offsets_ vector, the
// offset is looked up in offsets_ at an index derived from the underlying
// iterator's remaining CE count; the last visible statement returns the
// underlying iterator's own offset.
// NOTE(review): several original lines (braces and the body of the
// `otherHalf_ != 0` branch) are not visible in this excerpt.
80 int32_t CollationElementIterator::getOffset() const
82 if (dir_
< 0 && offsets_
!= NULL
&& !offsets_
->isEmpty()) {
83 // CollationIterator::previousCE() decrements the CEs length
84 // while it pops CEs from its internal buffer.
85 int32_t i
= iter_
->getCEsLength();
86 if (otherHalf_
!= 0) {
87 // Return the trailing CE offset while we are in the middle of a 64-bit CE.
// NOTE(review): the adjustment made in this branch is not visible here.
90 U_ASSERT(i
< offsets_
->size());
91 return offsets_
->elementAti(i
);
// Last visible statement: defer to the underlying iterator's offset.
93 return iter_
->getOffset();
97 * Get the ordering priority of the next character in the string.
98 * @return the next character's ordering. Returns NULLORDER if an error has
99 * occurred or if the end of string has been reached
// Forward iteration step.
// Visible behavior: returns NULLORDER immediately if `status` already holds a
// failure or when the underlying iterator yields Collation::NO_CE; sets
// `status` to U_INVALID_STATE_ERROR in the branch reached when dir_ is
// negative; otherwise fetches the next 64-bit CE and splits it into two
// 32-bit halves, stashing a pending second half in otherHalf_.
// NOTE(review): several original lines (branch bodies, the final return) are
// not visible in this excerpt.
101 int32_t CollationElementIterator::next(UErrorCode
& status
)
103 if (U_FAILURE(status
)) { return NULLORDER
; }
105 // Continue forward iteration. Test this first.
106 if (otherHalf_
!= 0) {
// A second half is pending from the previous call; it is read into `oh`.
107 uint32_t oh
= otherHalf_
;
111 } else if (dir_
== 1) {
112 // next() after setOffset()
114 } else if (dir_
== 0) {
115 // The iter_ is already reset to the start of the text.
117 } else /* dir_ < 0 */ {
118 // illegal change of direction
119 status
= U_INVALID_STATE_ERROR
;
122 // No need to keep all CEs in the buffer when we iterate.
123 iter_
->clearCEsIfNoneRemaining();
124 int64_t ce
= iter_
->nextCE(status
);
125 if (ce
== Collation::NO_CE
) { return NULLORDER
; }
126 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
127 uint32_t p
= (uint32_t)(ce
>> 32);
128 uint32_t lower32
= (uint32_t)ce
;
129 uint32_t firstHalf
= getFirstHalf(p
, lower32
);
130 uint32_t secondHalf
= getSecondHalf(p
, lower32
);
131 if (secondHalf
!= 0) {
// Remember the second half, tagged with 0xc0, for a later call.
132 otherHalf_
= secondHalf
| 0xc0; // continuation CE
137 UBool
CollationElementIterator::operator!=(
138 const CollationElementIterator
& other
) const
140 return !(*this == other
);
// Equality comparison.
// The visible conjuncts require: the same collator (pointer-equal or equal by
// value), equal otherHalf_, equal normalizeDir() results, equal string_, and
// underlying iterators that compare equal by value.
// NOTE(review): the beginning of the return expression is not visible in this
// excerpt, so there may be additional leading conditions.
143 UBool
CollationElementIterator::operator==(
144 const CollationElementIterator
& that
) const
151 (rbc_
== that
.rbc_
|| *rbc_
== *that
.rbc_
) &&
152 otherHalf_
== that
.otherHalf_
&&
153 normalizeDir() == that
.normalizeDir() &&
154 string_
== that
.string_
&&
155 *iter_
== *that
.iter_
;
159 * Get the ordering priority of the previous collation element in the string.
160 * @param status the error code status.
161 * @return the previous element's ordering. Returns NULLORDER if an error has
162 * occurred or if the start of string has been reached.
// Backward iteration step.
// Visible behavior: returns NULLORDER immediately if `status` already holds a
// failure or when the underlying iterator yields Collation::NO_CE; when
// dir_ == 0 the underlying iterator is first moved to the end of string_;
// sets `status` to U_INVALID_STATE_ERROR in the final else branch; lazily
// allocates offsets_; then fetches the previous 64-bit CE and splits it into
// two 32-bit halves. When a second half exists it is returned first (tagged
// with 0xc0) and the first half is stashed in otherHalf_.
// NOTE(review): several original lines (branch bodies, the final return) are
// not visible in this excerpt.
164 int32_t CollationElementIterator::previous(UErrorCode
& status
)
166 if (U_FAILURE(status
)) { return NULLORDER
; }
168 // Continue backwards iteration. Test this first.
169 if (otherHalf_
!= 0) {
// A half is pending from the previous call; it is read into `oh`.
170 uint32_t oh
= otherHalf_
;
174 } else if (dir_
== 0) {
// Fresh iterator: start from the end of the text.
175 iter_
->resetToOffset(string_
.length());
177 } else if (dir_
== 1) {
178 // previous() after setOffset()
180 } else /* dir_ > 1 */ {
181 // illegal change of direction
182 status
= U_INVALID_STATE_ERROR
;
// Allocate the offsets vector on first backward use.
185 if (offsets_
== NULL
) {
186 offsets_
= new UVector32(status
);
187 if (offsets_
== NULL
) {
188 status
= U_MEMORY_ALLOCATION_ERROR
;
192 // If we already have expansion CEs, then we also have offsets.
193 // Otherwise remember the trailing offset in case we need to
194 // write offsets for an artificial expansion.
195 int32_t limitOffset
= iter_
->getCEsLength() == 0 ? iter_
->getOffset() : 0;
196 int64_t ce
= iter_
->previousCE(*offsets_
, status
);
197 if (ce
== Collation::NO_CE
) { return NULLORDER
; }
198 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
199 uint32_t p
= (uint32_t)(ce
>> 32);
200 uint32_t lower32
= (uint32_t)ce
;
201 uint32_t firstHalf
= getFirstHalf(p
, lower32
);
202 uint32_t secondHalf
= getSecondHalf(p
, lower32
);
203 if (secondHalf
!= 0) {
204 if (offsets_
->isEmpty()) {
205 // When we convert a single 64-bit CE into two 32-bit CEs,
206 // we need to make this artificial expansion behave like a normal expansion.
207 // See CollationIterator::previousCE().
// Record the offset before the CE and the trailing offset remembered above.
208 offsets_
->addElement(iter_
->getOffset(), status
);
209 offsets_
->addElement(limitOffset
, status
);
// Going backwards, the second half is delivered first; keep the first half
// for the following call.
211 otherHalf_
= firstHalf
;
212 return secondHalf
| 0xc0; // continuation CE
218 * Resets the cursor to the beginning of the string.
// Rewinds the underlying collation iterator to offset 0.
// NOTE(review): the remaining statements of this function are not visible in
// this excerpt; presumably otherHalf_ and dir_ are cleared as well -- confirm.
220 void CollationElementIterator::reset()
222 iter_
->resetToOffset(0);
// Positions the iterator at `newOffset`, adjusted to a safe boundary.
// Visible behavior: does nothing if `status` already holds a failure. For an
// offset strictly inside the text, it scans backwards (do/while ending at
// `offset > 0`) testing characters with rbc_->isUnsafe(), then -- if it backed
// up -- iterates forward with nextCE() to find the last safe offset that is
// not greater than `newOffset`. Finally the underlying iterator is reset to
// the resulting offset.
// NOTE(review): the rest of the signature, the loop heads and several
// statements are not visible in this excerpt.
227 void CollationElementIterator::setOffset(int32_t newOffset
,
230 if (U_FAILURE(status
)) { return; }
// Offsets 0 and string_.length() (and anything outside) skip the adjustment.
231 if (0 < newOffset
&& newOffset
< string_
.length()) {
232 int32_t offset
= newOffset
;
234 UChar c
= string_
.charAt(offset
);
// Condition holds when the code unit is not unsafe, or when it is a lead
// surrogate whose full code point is not unsafe.
235 if (!rbc_
->isUnsafe(c
) ||
236 (U16_IS_LEAD(c
) && !rbc_
->isUnsafe(string_
.char32At(offset
)))) {
239 // Back up to before this unsafe character.
241 } while (offset
> 0);
242 if (offset
< newOffset
) {
243 // We might have backed up more than necessary.
244 // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
245 // but for text "chu" setOffset(2) should remain at 2
246 // although we initially back up to offset 0.
247 // Find the last safe offset no greater than newOffset by iterating forward.
248 int32_t lastSafeOffset
= offset
;
250 iter_
->resetToOffset(lastSafeOffset
);
// Consume CEs until the iterator's offset actually moves.
252 iter_
->nextCE(status
);
253 if (U_FAILURE(status
)) { return; }
254 } while ((offset
= iter_
->getOffset()) == lastSafeOffset
);
255 if (offset
<= newOffset
) {
256 lastSafeOffset
= offset
;
258 } while (offset
< newOffset
);
259 newOffset
= lastSafeOffset
;
262 iter_
->resetToOffset(newOffset
);
268 * Sets the source to the new source string.
// Sets the text to iterate over from a UnicodeString.
// Visible behavior: checks `status` for a prior failure, then builds a new
// collation iterator over the buffer of string_: a UTF16CollationIterator when
// the collator settings say dontCheckFCD(), otherwise an
// FCDUTF16CollationIterator. A NULL result from `new` sets `status` to
// U_MEMORY_ALLOCATION_ERROR.
// NOTE(review): the rest of the signature, the assignment into string_ and
// the installation of newIter are not visible in this excerpt.
270 void CollationElementIterator::setText(const UnicodeString
& source
,
273 if (U_FAILURE(status
)) {
278 const UChar
*s
= string_
.getBuffer();
279 CollationIterator
*newIter
;
280 UBool numeric
= rbc_
->settings
->isNumeric();
281 if (rbc_
->settings
->dontCheckFCD()) {
// Both iterator types are given the start pointer twice and the end pointer
// s + string_.length().
282 newIter
= new UTF16CollationIterator(rbc_
->data
, numeric
, s
, s
, s
+ string_
.length());
284 newIter
= new FCDUTF16CollationIterator(rbc_
->data
, numeric
, s
, s
, s
+ string_
.length());
286 if (newIter
== NULL
) {
287 status
= U_MEMORY_ALLOCATION_ERROR
;
296 // Sets the source to the new character iterator.
// Sets the text to iterate over from a CharacterIterator.
// Visible behavior: checks `status` for a prior failure, copies the iterator's
// text into string_ via source.getText(), then delegates to the
// UnicodeString overload of setText().
// NOTE(review): the rest of the signature and the body of the failure check
// are not visible in this excerpt.
297 void CollationElementIterator::setText(CharacterIterator
& source
,
300 if (U_FAILURE(status
))
303 source
.getText(string_
);
304 setText(string_
, status
);
// Reduces a collation `order` value according to the collator's strength.
// Visible behavior: reads the strength from rbc_->settings and branches on
// UCOL_PRIMARY and UCOL_SECONDARY.
// NOTE(review): the masks applied in each branch and the final return are not
// visible in this excerpt.
307 int32_t CollationElementIterator::strengthOrder(int32_t order
) const
309 UColAttributeValue s
= (UColAttributeValue
)rbc_
->settings
->getStrength();
310 // Mask off the unwanted differences.
311 if (s
== UCOL_PRIMARY
) {
314 else if (s
== UCOL_SECONDARY
) {
321 /* CollationElementIterator private constructors/destructors --------------- */
324 * This is the "real" constructor for this class; it constructs an iterator
325 * over the source text using the specified collator
// Constructs an iterator over `source` using collator `coll`.
// Members start out empty (iter_ and offsets_ NULL, otherHalf_ and dir_ 0)
// with rbc_ pointing at `coll`; the text is then installed via setText().
// NOTE(review): the `status` parameter declaration is not visible in this
// excerpt.
327 CollationElementIterator::CollationElementIterator(
328 const UnicodeString
&source
,
329 const RuleBasedCollator
*coll
,
331 : iter_(NULL
), rbc_(coll
), otherHalf_(0), dir_(0), offsets_(NULL
) {
332 setText(source
, status
);
336 * This is the "real" constructor for this class; it constructs an iterator over
337 * the source text using the specified collator
// Constructs an iterator over the text of a CharacterIterator using collator
// `coll`. Members start out empty (iter_ and offsets_ NULL, otherHalf_ and
// dir_ 0) with rbc_ pointing at `coll`; the text is then installed via the
// CharacterIterator overload of setText(), which takes a non-const reference,
// hence the const_cast.
// NOTE(review): the `status` parameter declaration is not visible in this
// excerpt.
339 CollationElementIterator::CollationElementIterator(
340 const CharacterIterator
&source
,
341 const RuleBasedCollator
*coll
,
343 : iter_(NULL
), rbc_(coll
), otherHalf_(0), dir_(0), offsets_(NULL
) {
344 // We only call source.getText() which should be const anyway.
345 setText(const_cast<CharacterIterator
&>(source
), status
);
348 /* CollationElementIterator private methods -------------------------------- */
// Copy assignment.
// Visible behavior: detects self-assignment; clones other's underlying
// iterator by probing its dynamic type (FCDUTF16CollationIterator first, then
// UTF16CollationIterator); when a clone was produced, copies otherHalf_ and
// string_; and, if `other` is iterating backwards with recorded offsets,
// copies the offsets vector (allocating it on demand).
// NOTE(review): several original lines (the self-assignment return, the
// installation of newIter, the remaining member copies, the final return)
// are not visible in this excerpt.
350 const CollationElementIterator
& CollationElementIterator::operator=(
351 const CollationElementIterator
& other
)
353 if (this == &other
) {
357 CollationIterator
*newIter
;
358 const FCDUTF16CollationIterator
*otherFCDIter
=
359 dynamic_cast<const FCDUTF16CollationIterator
*>(other
.iter_
);
360 if(otherFCDIter
!= NULL
) {
// NOTE(review): the clone is given this object's current string_ buffer, while
// string_ is only assigned from other.string_ further down in the visible
// order -- confirm the new iterator cannot end up referring to a stale buffer.
361 newIter
= new FCDUTF16CollationIterator(*otherFCDIter
, string_
.getBuffer());
363 const UTF16CollationIterator
*otherIter
=
364 dynamic_cast<const UTF16CollationIterator
*>(other
.iter_
);
365 if(otherIter
!= NULL
) {
366 newIter
= new UTF16CollationIterator(*otherIter
, string_
.getBuffer());
// State is only taken over when a clone of the iterator was obtained.
371 if(newIter
!= NULL
) {
375 otherHalf_
= other
.otherHalf_
;
378 string_
= other
.string_
;
// Offsets only matter while iterating backwards (see getOffset()).
380 if(other
.dir_
< 0 && other
.offsets_
!= NULL
&& !other
.offsets_
->isEmpty()) {
// Errors from the vector operations are collected in a local error code.
381 UErrorCode errorCode
= U_ZERO_ERROR
;
382 if(offsets_
== NULL
) {
383 offsets_
= new UVector32(other
.offsets_
->size(), errorCode
);
385 if(offsets_
!= NULL
) {
386 offsets_
->assign(*other
.offsets_
, errorCode
);
// CE sink that records, per final 32-bit CE "half", the largest expansion
// length seen, measured in old-style 32-bit CE halves.
// Single CEs are ignored (handleCE is a no-op). For each expansion the sink
// counts 2 halves for CEs where ceNeedsTwoParts() is true and 1 otherwise,
// derives the last half of the last CE, and stores the count in the
// maxExpansions hash table when it exceeds the value already stored there.
// NOTE(review): several original lines (access specifiers, early returns, the
// conditions around the lastHalf computation) are not visible in this excerpt.
394 class MaxExpSink
: public ContractionsAndExpansions::CESink
{
// Stores the target hash table pointer and a reference to the caller's error code.
396 MaxExpSink(UHashtable
*h
, UErrorCode
&ec
) : maxExpansions(h
), errorCode(ec
) {}
397 virtual ~MaxExpSink();
398 virtual void handleCE(int64_t /*ce*/) {}
399 virtual void handleExpansion(const int64_t ces
[], int32_t length
) {
401 // We do not need to add single CEs into the map.
404 int32_t count
= 0; // number of CE "halves"
405 for (int32_t i
= 0; i
< length
; ++i
) {
406 count
+= ceNeedsTwoParts(ces
[i
]) ? 2 : 1;
408 // last "half" of the last CE
409 int64_t ce
= ces
[length
- 1];
410 uint32_t p
= (uint32_t)(ce
>> 32);
411 uint32_t lower32
= (uint32_t)ce
;
412 uint32_t lastHalf
= getSecondHalf(p
, lower32
);
// NOTE(review): the condition selecting between the two assignments below is
// not visible; presumably the first half is used when the second half is 0,
// and the 0xc0 tag is applied otherwise -- confirm.
414 lastHalf
= getFirstHalf(p
, lower32
);
415 U_ASSERT(lastHalf
!= 0);
417 lastHalf
|= 0xc0; // old-style continuation CE
// Keep only the maximum count per last half.
419 if (count
> uhash_igeti(maxExpansions
, (int32_t)lastHalf
)) {
420 uhash_iputi(maxExpansions
, (int32_t)lastHalf
, count
, &errorCode
);
// Hash table being filled (a pointer supplied by the creator of the sink).
425 UHashtable
*maxExpansions
;
// Reference to the caller's error code, written by uhash_iputi().
426 UErrorCode
&errorCode
;
429 MaxExpSink::~MaxExpSink() {}
// Builds a hash table of maximum expansion sizes for the given collation data.
// Visible behavior: returns NULL if `errorCode` already holds a failure or if
// opening the hash table fails; feeds every expansion found by
// ContractionsAndExpansions::forData() through a MaxExpSink that fills the
// table; closes the table again if that traversal fails; otherwise returns
// the table.
// NOTE(review): the return-type line and the statement after uhash_close()
// are not visible in this excerpt.
434 CollationElementIterator::computeMaxExpansions(const CollationData
*data
, UErrorCode
&errorCode
) {
435 if (U_FAILURE(errorCode
)) { return NULL
; }
436 UHashtable
*maxExpansions
= uhash_open(uhash_hashLong
, uhash_compareLong
,
437 uhash_compareLong
, &errorCode
);
438 if (U_FAILURE(errorCode
)) { return NULL
; }
439 MaxExpSink
sink(maxExpansions
, errorCode
);
440 ContractionsAndExpansions(NULL
, NULL
, &sink
, TRUE
).forData(data
, errorCode
);
441 if (U_FAILURE(errorCode
)) {
// Do not hand a partially filled table back to the caller.
442 uhash_close(maxExpansions
);
445 return maxExpansions
;
// Instance convenience overload: looks `order` up in the maxExpansions table
// of this iterator's collator tailoring by delegating to the two-argument
// getMaxExpansion().
// NOTE(review): the return-type line is not visible in this excerpt.
449 CollationElementIterator::getMaxExpansion(int32_t order
) const {
450 return getMaxExpansion(rbc_
->tailoring
->maxExpansions
, order
);
// Looks up the maximum expansion size for `order` in `maxExpansions`.
// Visible behavior: an order of 0 yields 1; otherwise a non-NULL table is
// consulted via uhash_igeti() and a non-zero entry is used; orders whose 0xc0
// bits are both set are treated as old-style continuation CEs.
// NOTE(review): the return-type line, the declaration of `max`, and the
// values returned from the last branches are not visible in this excerpt.
454 CollationElementIterator::getMaxExpansion(const UHashtable
*maxExpansions
, int32_t order
) {
455 if (order
== 0) { return 1; }
457 if(maxExpansions
!= NULL
&& (max
= uhash_igeti(maxExpansions
, order
)) != 0) {
460 if ((order
& 0xc0) == 0xc0) {
461 // old-style continuation CE
470 #endif /* #if !UCONFIG_NO_COLLATION */