1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 1996-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
13 * Created by: Helena Shih
15 * Modification History:
17 * Date Name Description
19 * 6/23/97 helena Adding comments to make code more readable.
20 * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
21 * 12/10/99 aliu Ported Thai collation support from Java.
22 * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
23 * 02/19/01 swquek Removed CollationElementIterator() since it is
24 * private constructor and no calls are made to it
25 * 2012-2014 markus Rewritten in C++ again.
28 #include "unicode/utypes.h"
30 #if !UCONFIG_NO_COLLATION
32 #include "unicode/chariter.h"
33 #include "unicode/coleitr.h"
34 #include "unicode/tblcoll.h"
35 #include "unicode/ustring.h"
37 #include "collation.h"
38 #include "collationdata.h"
39 #include "collationiterator.h"
40 #include "collationsets.h"
41 #include "collationtailoring.h"
44 #include "utf16collationiterator.h"
47 /* Constants --------------------------------------------------------------- */
51 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator
)
53 /* CollationElementIterator public constructor/destructor ------------------ */
55 CollationElementIterator::CollationElementIterator(
56 const CollationElementIterator
& other
)
57 : UObject(other
), iter_(NULL
), rbc_(NULL
), otherHalf_(0), dir_(0), offsets_(NULL
) {
61 CollationElementIterator::~CollationElementIterator()
67 /* CollationElementIterator public methods --------------------------------- */
/**
 * Builds the first of the two old-style 32-bit CEs from a 64-bit CE.
 *
 * @param p        upper 32 bits of the 64-bit CE
 * @param lower32  lower 32 bits of the 64-bit CE
 * @return bits 31..16 of p, then bits 31..24 of lower32 in bits 15..8,
 *         then bits 15..8 of lower32 in bits 7..0
 */
uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
    return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
}
/**
 * Builds the second of the two old-style 32-bit CEs from a 64-bit CE.
 * A result of 0 means the CE fits into the first half alone.
 *
 * @param p        upper 32 bits of the 64-bit CE
 * @param lower32  lower 32 bits of the 64-bit CE
 * @return bits 15..0 of p in bits 31..16, then bits 23..16 of lower32 in
 *         bits 15..8, then bits 5..0 of lower32 in bits 5..0
 */
uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
    return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
}
77 UBool
ceNeedsTwoParts(int64_t ce
) {
78 return (ce
& INT64_C(0xffff00ff003f)) != 0;
83 int32_t CollationElementIterator::getOffset() const
85 if (dir_
< 0 && offsets_
!= NULL
&& !offsets_
->isEmpty()) {
86 // CollationIterator::previousCE() decrements the CEs length
87 // while it pops CEs from its internal buffer.
88 int32_t i
= iter_
->getCEsLength();
89 if (otherHalf_
!= 0) {
90 // Return the trailing CE offset while we are in the middle of a 64-bit CE.
93 U_ASSERT(i
< offsets_
->size());
94 return offsets_
->elementAti(i
);
96 return iter_
->getOffset();
100 * Get the ordering priority of the next character in the string.
101 * @return the next character's ordering. Returns NULLORDER if an error has
102 * occured or if the end of string has been reached
104 int32_t CollationElementIterator::next(UErrorCode
& status
)
106 if (U_FAILURE(status
)) { return NULLORDER
; }
108 // Continue forward iteration. Test this first.
109 if (otherHalf_
!= 0) {
110 uint32_t oh
= otherHalf_
;
114 } else if (dir_
== 1) {
115 // next() after setOffset()
117 } else if (dir_
== 0) {
118 // The iter_ is already reset to the start of the text.
120 } else /* dir_ < 0 */ {
121 // illegal change of direction
122 status
= U_INVALID_STATE_ERROR
;
125 // No need to keep all CEs in the buffer when we iterate.
126 iter_
->clearCEsIfNoneRemaining();
127 int64_t ce
= iter_
->nextCE(status
);
128 if (ce
== Collation::NO_CE
) { return NULLORDER
; }
129 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
130 uint32_t p
= (uint32_t)(ce
>> 32);
131 uint32_t lower32
= (uint32_t)ce
;
132 uint32_t firstHalf
= getFirstHalf(p
, lower32
);
133 uint32_t secondHalf
= getSecondHalf(p
, lower32
);
134 if (secondHalf
!= 0) {
135 otherHalf_
= secondHalf
| 0xc0; // continuation CE
140 UBool
CollationElementIterator::operator!=(
141 const CollationElementIterator
& other
) const
143 return !(*this == other
);
146 UBool
CollationElementIterator::operator==(
147 const CollationElementIterator
& that
) const
154 (rbc_
== that
.rbc_
|| *rbc_
== *that
.rbc_
) &&
155 otherHalf_
== that
.otherHalf_
&&
156 normalizeDir() == that
.normalizeDir() &&
157 string_
== that
.string_
&&
158 *iter_
== *that
.iter_
;
162 * Get the ordering priority of the previous collation element in the string.
163 * @param status the error code status.
164 * @return the previous element's ordering. Returns NULLORDER if an error has
165 * occured or if the start of string has been reached.
167 int32_t CollationElementIterator::previous(UErrorCode
& status
)
169 if (U_FAILURE(status
)) { return NULLORDER
; }
171 // Continue backwards iteration. Test this first.
172 if (otherHalf_
!= 0) {
173 uint32_t oh
= otherHalf_
;
177 } else if (dir_
== 0) {
178 iter_
->resetToOffset(string_
.length());
180 } else if (dir_
== 1) {
181 // previous() after setOffset()
183 } else /* dir_ > 1 */ {
184 // illegal change of direction
185 status
= U_INVALID_STATE_ERROR
;
188 if (offsets_
== NULL
) {
189 offsets_
= new UVector32(status
);
190 if (offsets_
== NULL
) {
191 status
= U_MEMORY_ALLOCATION_ERROR
;
195 // If we already have expansion CEs, then we also have offsets.
196 // Otherwise remember the trailing offset in case we need to
197 // write offsets for an artificial expansion.
198 int32_t limitOffset
= iter_
->getCEsLength() == 0 ? iter_
->getOffset() : 0;
199 int64_t ce
= iter_
->previousCE(*offsets_
, status
);
200 if (ce
== Collation::NO_CE
) { return NULLORDER
; }
201 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
202 uint32_t p
= (uint32_t)(ce
>> 32);
203 uint32_t lower32
= (uint32_t)ce
;
204 uint32_t firstHalf
= getFirstHalf(p
, lower32
);
205 uint32_t secondHalf
= getSecondHalf(p
, lower32
);
206 if (secondHalf
!= 0) {
207 if (offsets_
->isEmpty()) {
208 // When we convert a single 64-bit CE into two 32-bit CEs,
209 // we need to make this artificial expansion behave like a normal expansion.
210 // See CollationIterator::previousCE().
211 offsets_
->addElement(iter_
->getOffset(), status
);
212 offsets_
->addElement(limitOffset
, status
);
214 otherHalf_
= firstHalf
;
215 return secondHalf
| 0xc0; // continuation CE
221 * Resets the cursor to the beginning of the string.
223 void CollationElementIterator::reset()
225 iter_
->resetToOffset(0);
230 void CollationElementIterator::setOffset(int32_t newOffset
,
233 if (U_FAILURE(status
)) { return; }
234 if (0 < newOffset
&& newOffset
< string_
.length()) {
235 int32_t offset
= newOffset
;
237 UChar c
= string_
.charAt(offset
);
238 if (!rbc_
->isUnsafe(c
) ||
239 (U16_IS_LEAD(c
) && !rbc_
->isUnsafe(string_
.char32At(offset
)))) {
242 // Back up to before this unsafe character.
244 } while (offset
> 0);
245 if (offset
< newOffset
) {
246 // We might have backed up more than necessary.
247 // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
248 // but for text "chu" setOffset(2) should remain at 2
249 // although we initially back up to offset 0.
250 // Find the last safe offset no greater than newOffset by iterating forward.
251 int32_t lastSafeOffset
= offset
;
253 iter_
->resetToOffset(lastSafeOffset
);
255 iter_
->nextCE(status
);
256 if (U_FAILURE(status
)) { return; }
257 } while ((offset
= iter_
->getOffset()) == lastSafeOffset
);
258 if (offset
<= newOffset
) {
259 lastSafeOffset
= offset
;
261 } while (offset
< newOffset
);
262 newOffset
= lastSafeOffset
;
265 iter_
->resetToOffset(newOffset
);
271 * Sets the source to the new source string.
273 void CollationElementIterator::setText(const UnicodeString
& source
,
276 if (U_FAILURE(status
)) {
281 const UChar
*s
= string_
.getBuffer();
282 CollationIterator
*newIter
;
283 UBool numeric
= rbc_
->settings
->isNumeric();
284 if (rbc_
->settings
->dontCheckFCD()) {
285 newIter
= new UTF16CollationIterator(rbc_
->data
, numeric
, s
, s
, s
+ string_
.length());
287 newIter
= new FCDUTF16CollationIterator(rbc_
->data
, numeric
, s
, s
, s
+ string_
.length());
289 if (newIter
== NULL
) {
290 status
= U_MEMORY_ALLOCATION_ERROR
;
299 // Sets the source to the new character iterator.
300 void CollationElementIterator::setText(CharacterIterator
& source
,
303 if (U_FAILURE(status
))
306 source
.getText(string_
);
307 setText(string_
, status
);
310 int32_t CollationElementIterator::strengthOrder(int32_t order
) const
312 UColAttributeValue s
= (UColAttributeValue
)rbc_
->settings
->getStrength();
313 // Mask off the unwanted differences.
314 if (s
== UCOL_PRIMARY
) {
317 else if (s
== UCOL_SECONDARY
) {
324 /* CollationElementIterator private constructors/destructors --------------- */
327 * This is the "real" constructor for this class; it constructs an iterator
328 * over the source text using the specified collator
330 CollationElementIterator::CollationElementIterator(
331 const UnicodeString
&source
,
332 const RuleBasedCollator
*coll
,
334 : iter_(NULL
), rbc_(coll
), otherHalf_(0), dir_(0), offsets_(NULL
) {
335 setText(source
, status
);
339 * This is the "real" constructor for this class; it constructs an iterator over
340 * the source text using the specified collator
342 CollationElementIterator::CollationElementIterator(
343 const CharacterIterator
&source
,
344 const RuleBasedCollator
*coll
,
346 : iter_(NULL
), rbc_(coll
), otherHalf_(0), dir_(0), offsets_(NULL
) {
347 // We only call source.getText() which should be const anyway.
348 setText(const_cast<CharacterIterator
&>(source
), status
);
351 /* CollationElementIterator private methods -------------------------------- */
353 const CollationElementIterator
& CollationElementIterator::operator=(
354 const CollationElementIterator
& other
)
356 if (this == &other
) {
360 CollationIterator
*newIter
;
361 const FCDUTF16CollationIterator
*otherFCDIter
=
362 dynamic_cast<const FCDUTF16CollationIterator
*>(other
.iter_
);
363 if(otherFCDIter
!= NULL
) {
364 newIter
= new FCDUTF16CollationIterator(*otherFCDIter
, string_
.getBuffer());
366 const UTF16CollationIterator
*otherIter
=
367 dynamic_cast<const UTF16CollationIterator
*>(other
.iter_
);
368 if(otherIter
!= NULL
) {
369 newIter
= new UTF16CollationIterator(*otherIter
, string_
.getBuffer());
374 if(newIter
!= NULL
) {
378 otherHalf_
= other
.otherHalf_
;
381 string_
= other
.string_
;
383 if(other
.dir_
< 0 && other
.offsets_
!= NULL
&& !other
.offsets_
->isEmpty()) {
384 UErrorCode errorCode
= U_ZERO_ERROR
;
385 if(offsets_
== NULL
) {
386 offsets_
= new UVector32(other
.offsets_
->size(), errorCode
);
388 if(offsets_
!= NULL
) {
389 offsets_
->assign(*other
.offsets_
, errorCode
);
397 class MaxExpSink
: public ContractionsAndExpansions::CESink
{
399 MaxExpSink(UHashtable
*h
, UErrorCode
&ec
) : maxExpansions(h
), errorCode(ec
) {}
400 virtual ~MaxExpSink();
401 virtual void handleCE(int64_t /*ce*/) {}
402 virtual void handleExpansion(const int64_t ces
[], int32_t length
) {
404 // We do not need to add single CEs into the map.
407 int32_t count
= 0; // number of CE "halves"
408 for (int32_t i
= 0; i
< length
; ++i
) {
409 count
+= ceNeedsTwoParts(ces
[i
]) ? 2 : 1;
411 // last "half" of the last CE
412 int64_t ce
= ces
[length
- 1];
413 uint32_t p
= (uint32_t)(ce
>> 32);
414 uint32_t lower32
= (uint32_t)ce
;
415 uint32_t lastHalf
= getSecondHalf(p
, lower32
);
417 lastHalf
= getFirstHalf(p
, lower32
);
418 U_ASSERT(lastHalf
!= 0);
420 lastHalf
|= 0xc0; // old-style continuation CE
422 if (count
> uhash_igeti(maxExpansions
, (int32_t)lastHalf
)) {
423 uhash_iputi(maxExpansions
, (int32_t)lastHalf
, count
, &errorCode
);
428 UHashtable
*maxExpansions
;
429 UErrorCode
&errorCode
;
432 MaxExpSink::~MaxExpSink() {}
437 CollationElementIterator::computeMaxExpansions(const CollationData
*data
, UErrorCode
&errorCode
) {
438 if (U_FAILURE(errorCode
)) { return NULL
; }
439 UHashtable
*maxExpansions
= uhash_open(uhash_hashLong
, uhash_compareLong
,
440 uhash_compareLong
, &errorCode
);
441 if (U_FAILURE(errorCode
)) { return NULL
; }
442 MaxExpSink
sink(maxExpansions
, errorCode
);
443 ContractionsAndExpansions(NULL
, NULL
, &sink
, TRUE
).forData(data
, errorCode
);
444 if (U_FAILURE(errorCode
)) {
445 uhash_close(maxExpansions
);
448 return maxExpansions
;
452 CollationElementIterator::getMaxExpansion(int32_t order
) const {
453 return getMaxExpansion(rbc_
->tailoring
->maxExpansions
, order
);
457 CollationElementIterator::getMaxExpansion(const UHashtable
*maxExpansions
, int32_t order
) {
458 if (order
== 0) { return 1; }
460 if(maxExpansions
!= NULL
&& (max
= uhash_igeti(maxExpansions
, order
)) != 0) {
463 if ((order
& 0xc0) == 0xc0) {
464 // old-style continuation CE
473 #endif /* #if !UCONFIG_NO_COLLATION */