]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/coleitr.cpp
ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / i18n / coleitr.cpp
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
57a6839d
A
3* Copyright (C) 1996-2014, International Business Machines Corporation and
4* others. All Rights Reserved.
b75a7d8f
A
5*******************************************************************************
6*/
7
8/*
9* File coleitr.cpp
10*
b75a7d8f
A
11* Created by: Helena Shih
12*
13* Modification History:
14*
15* Date Name Description
16*
17* 6/23/97 helena Adding comments to make code more readable.
18* 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
19* 12/10/99 aliu Ported Thai collation support from Java.
20* 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
57a6839d 21* 02/19/01 swquek Removed CollationElementIterator() since it is
b75a7d8f 22* private constructor and no calls are made to it
57a6839d 23* 2012-2014 markus Rewritten in C++ again.
b75a7d8f
A
24*/
25
26#include "unicode/utypes.h"
27
28#if !UCONFIG_NO_COLLATION
29
30#include "unicode/coleitr.h"
57a6839d 31#include "unicode/tblcoll.h"
b75a7d8f 32#include "unicode/ustring.h"
b75a7d8f 33#include "cmemory.h"
57a6839d
A
34#include "collation.h"
35#include "collationdata.h"
36#include "collationiterator.h"
37#include "collationsets.h"
38#include "collationtailoring.h"
39#include "uassert.h"
40#include "uhash.h"
41#include "utf16collationiterator.h"
42#include "uvectr32.h"
b75a7d8f
A
43
44/* Constants --------------------------------------------------------------- */
45
46U_NAMESPACE_BEGIN
47
374ca955 48UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
b75a7d8f 49
b75a7d8f
A
50/* CollationElementIterator public constructor/destructor ------------------ */
51
52CollationElementIterator::CollationElementIterator(
53 const CollationElementIterator& other)
57a6839d 54 : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) {
73c04bcf 55 *this = other;
b75a7d8f
A
56}
57
58CollationElementIterator::~CollationElementIterator()
59{
57a6839d
A
60 delete iter_;
61 delete offsets_;
b75a7d8f
A
62}
63
64/* CollationElementIterator public methods --------------------------------- */
65
57a6839d
A
66namespace {
67
68uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
69 return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
70}
71uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
72 return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
73}
74UBool ceNeedsTwoParts(int64_t ce) {
75 return (ce & INT64_C(0xffff00ff003f)) != 0;
76}
77
78} // namespace
79
b75a7d8f
A
80int32_t CollationElementIterator::getOffset() const
81{
57a6839d
A
82 if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
83 // CollationIterator::previousCE() decrements the CEs length
84 // while it pops CEs from its internal buffer.
85 int32_t i = iter_->getCEsLength();
86 if (otherHalf_ != 0) {
87 // Return the trailing CE offset while we are in the middle of a 64-bit CE.
88 ++i;
89 }
90 U_ASSERT(i < offsets_->size());
91 return offsets_->elementAti(i);
92 }
93 return iter_->getOffset();
b75a7d8f
A
94}
95
96/**
97* Get the ordering priority of the next character in the string.
98* @return the next character's ordering. Returns NULLORDER if an error has
99* occured or if the end of string has been reached
100*/
101int32_t CollationElementIterator::next(UErrorCode& status)
102{
57a6839d
A
103 if (U_FAILURE(status)) { return NULLORDER; }
104 if (dir_ > 1) {
105 // Continue forward iteration. Test this first.
106 if (otherHalf_ != 0) {
107 uint32_t oh = otherHalf_;
108 otherHalf_ = 0;
109 return oh;
110 }
111 } else if (dir_ == 1) {
112 // next() after setOffset()
113 dir_ = 2;
114 } else if (dir_ == 0) {
115 // The iter_ is already reset to the start of the text.
116 dir_ = 2;
117 } else /* dir_ < 0 */ {
118 // illegal change of direction
119 status = U_INVALID_STATE_ERROR;
120 return NULLORDER;
121 }
122 // No need to keep all CEs in the buffer when we iterate.
123 iter_->clearCEsIfNoneRemaining();
124 int64_t ce = iter_->nextCE(status);
125 if (ce == Collation::NO_CE) { return NULLORDER; }
126 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
127 uint32_t p = (uint32_t)(ce >> 32);
128 uint32_t lower32 = (uint32_t)ce;
129 uint32_t firstHalf = getFirstHalf(p, lower32);
130 uint32_t secondHalf = getSecondHalf(p, lower32);
131 if (secondHalf != 0) {
132 otherHalf_ = secondHalf | 0xc0; // continuation CE
133 }
134 return firstHalf;
b75a7d8f
A
135}
136
137UBool CollationElementIterator::operator!=(
138 const CollationElementIterator& other) const
139{
73c04bcf 140 return !(*this == other);
b75a7d8f
A
141}
142
143UBool CollationElementIterator::operator==(
144 const CollationElementIterator& that) const
145{
57a6839d 146 if (this == &that) {
b75a7d8f
A
147 return TRUE;
148 }
149
57a6839d
A
150 return
151 (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
152 otherHalf_ == that.otherHalf_ &&
153 normalizeDir() == that.normalizeDir() &&
154 string_ == that.string_ &&
155 *iter_ == *that.iter_;
b75a7d8f
A
156}
157
158/**
159* Get the ordering priority of the previous collation element in the string.
160* @param status the error code status.
161* @return the previous element's ordering. Returns NULLORDER if an error has
162* occured or if the start of string has been reached.
163*/
164int32_t CollationElementIterator::previous(UErrorCode& status)
165{
57a6839d
A
166 if (U_FAILURE(status)) { return NULLORDER; }
167 if (dir_ < 0) {
168 // Continue backwards iteration. Test this first.
169 if (otherHalf_ != 0) {
170 uint32_t oh = otherHalf_;
171 otherHalf_ = 0;
172 return oh;
173 }
174 } else if (dir_ == 0) {
175 iter_->resetToOffset(string_.length());
176 dir_ = -1;
177 } else if (dir_ == 1) {
178 // previous() after setOffset()
179 dir_ = -1;
180 } else /* dir_ > 1 */ {
181 // illegal change of direction
182 status = U_INVALID_STATE_ERROR;
183 return NULLORDER;
184 }
185 if (offsets_ == NULL) {
186 offsets_ = new UVector32(status);
187 if (offsets_ == NULL) {
188 status = U_MEMORY_ALLOCATION_ERROR;
189 return NULLORDER;
190 }
191 }
192 // If we already have expansion CEs, then we also have offsets.
193 // Otherwise remember the trailing offset in case we need to
194 // write offsets for an artificial expansion.
195 int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
196 int64_t ce = iter_->previousCE(*offsets_, status);
197 if (ce == Collation::NO_CE) { return NULLORDER; }
198 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
199 uint32_t p = (uint32_t)(ce >> 32);
200 uint32_t lower32 = (uint32_t)ce;
201 uint32_t firstHalf = getFirstHalf(p, lower32);
202 uint32_t secondHalf = getSecondHalf(p, lower32);
203 if (secondHalf != 0) {
204 if (offsets_->isEmpty()) {
205 // When we convert a single 64-bit CE into two 32-bit CEs,
206 // we need to make this artificial expansion behave like a normal expansion.
207 // See CollationIterator::previousCE().
208 offsets_->addElement(iter_->getOffset(), status);
209 offsets_->addElement(limitOffset, status);
210 }
211 otherHalf_ = firstHalf;
212 return secondHalf | 0xc0; // continuation CE
213 }
214 return firstHalf;
b75a7d8f
A
215}
216
217/**
218* Resets the cursor to the beginning of the string.
219*/
220void CollationElementIterator::reset()
221{
57a6839d
A
222 iter_ ->resetToOffset(0);
223 otherHalf_ = 0;
224 dir_ = 0;
b75a7d8f
A
225}
226
227void CollationElementIterator::setOffset(int32_t newOffset,
228 UErrorCode& status)
229{
57a6839d
A
230 if (U_FAILURE(status)) { return; }
231 if (0 < newOffset && newOffset < string_.length()) {
232 int32_t offset = newOffset;
233 do {
234 UChar c = string_.charAt(offset);
235 if (!rbc_->isUnsafe(c) ||
236 (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
237 break;
238 }
239 // Back up to before this unsafe character.
240 --offset;
241 } while (offset > 0);
242 if (offset < newOffset) {
243 // We might have backed up more than necessary.
244 // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
245 // but for text "chu" setOffset(2) should remain at 2
246 // although we initially back up to offset 0.
247 // Find the last safe offset no greater than newOffset by iterating forward.
248 int32_t lastSafeOffset = offset;
249 do {
250 iter_->resetToOffset(lastSafeOffset);
251 do {
252 iter_->nextCE(status);
253 if (U_FAILURE(status)) { return; }
254 } while ((offset = iter_->getOffset()) == lastSafeOffset);
255 if (offset <= newOffset) {
256 lastSafeOffset = offset;
257 }
258 } while (offset < newOffset);
259 newOffset = lastSafeOffset;
260 }
261 }
262 iter_->resetToOffset(newOffset);
263 otherHalf_ = 0;
264 dir_ = 1;
b75a7d8f
A
265}
266
267/**
268* Sets the source to the new source string.
269*/
270void CollationElementIterator::setText(const UnicodeString& source,
271 UErrorCode& status)
272{
73c04bcf 273 if (U_FAILURE(status)) {
b75a7d8f
A
274 return;
275 }
73c04bcf 276
57a6839d
A
277 string_ = source;
278 const UChar *s = string_.getBuffer();
279 CollationIterator *newIter;
280 UBool numeric = rbc_->settings->isNumeric();
281 if (rbc_->settings->dontCheckFCD()) {
282 newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
283 } else {
284 newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
73c04bcf 285 }
57a6839d
A
286 if (newIter == NULL) {
287 status = U_MEMORY_ALLOCATION_ERROR;
288 return;
b75a7d8f 289 }
57a6839d
A
290 delete iter_;
291 iter_ = newIter;
292 otherHalf_ = 0;
293 dir_ = 0;
b75a7d8f
A
294}
295
296// Sets the source to the new character iterator.
297void CollationElementIterator::setText(CharacterIterator& source,
298 UErrorCode& status)
299{
73c04bcf 300 if (U_FAILURE(status))
b75a7d8f 301 return;
73c04bcf 302
57a6839d
A
303 source.getText(string_);
304 setText(string_, status);
b75a7d8f
A
305}
306
307int32_t CollationElementIterator::strengthOrder(int32_t order) const
308{
57a6839d 309 UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
73c04bcf
A
310 // Mask off the unwanted differences.
311 if (s == UCOL_PRIMARY) {
57a6839d 312 order &= 0xffff0000;
73c04bcf
A
313 }
314 else if (s == UCOL_SECONDARY) {
57a6839d 315 order &= 0xffffff00;
73c04bcf
A
316 }
317
318 return order;
b75a7d8f
A
319}
320
321/* CollationElementIterator private constructors/destructors --------------- */
322
323/**
324* This is the "real" constructor for this class; it constructs an iterator
325* over the source text using the specified collator
326*/
327CollationElementIterator::CollationElementIterator(
57a6839d
A
328 const UnicodeString &source,
329 const RuleBasedCollator *coll,
330 UErrorCode &status)
331 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
332 setText(source, status);
b75a7d8f
A
333}
334
335/**
336* This is the "real" constructor for this class; it constructs an iterator over
337* the source text using the specified collator
338*/
339CollationElementIterator::CollationElementIterator(
57a6839d
A
340 const CharacterIterator &source,
341 const RuleBasedCollator *coll,
342 UErrorCode &status)
343 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
344 // We only call source.getText() which should be const anyway.
345 setText(const_cast<CharacterIterator &>(source), status);
b75a7d8f
A
346}
347
57a6839d 348/* CollationElementIterator private methods -------------------------------- */
b75a7d8f
A
349
350const CollationElementIterator& CollationElementIterator::operator=(
351 const CollationElementIterator& other)
352{
57a6839d
A
353 if (this == &other) {
354 return *this;
355 }
356
357 CollationIterator *newIter;
358 const FCDUTF16CollationIterator *otherFCDIter =
359 dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
360 if(otherFCDIter != NULL) {
361 newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
362 } else {
363 const UTF16CollationIterator *otherIter =
364 dynamic_cast<const UTF16CollationIterator *>(other.iter_);
365 if(otherIter != NULL) {
366 newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
367 } else {
368 newIter = NULL;
73c04bcf 369 }
57a6839d
A
370 }
371 if(newIter != NULL) {
372 delete iter_;
373 iter_ = newIter;
374 rbc_ = other.rbc_;
375 otherHalf_ = other.otherHalf_;
376 dir_ = other.dir_;
377
378 string_ = other.string_;
379 }
380 if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
381 UErrorCode errorCode = U_ZERO_ERROR;
382 if(offsets_ == NULL) {
383 offsets_ = new UVector32(other.offsets_->size(), errorCode);
73c04bcf 384 }
57a6839d
A
385 if(offsets_ != NULL) {
386 offsets_->assign(*other.offsets_, errorCode);
73c04bcf 387 }
57a6839d
A
388 }
389 return *this;
390}
391
392namespace {
73c04bcf 393
57a6839d
A
394class MaxExpSink : public ContractionsAndExpansions::CESink {
395public:
396 MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
397 virtual ~MaxExpSink();
398 virtual void handleCE(int64_t /*ce*/) {}
399 virtual void handleExpansion(const int64_t ces[], int32_t length) {
400 if (length <= 1) {
401 // We do not need to add single CEs into the map.
402 return;
73c04bcf 403 }
57a6839d
A
404 int32_t count = 0; // number of CE "halves"
405 for (int32_t i = 0; i < length; ++i) {
406 count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
46f4442e 407 }
57a6839d
A
408 // last "half" of the last CE
409 int64_t ce = ces[length - 1];
410 uint32_t p = (uint32_t)(ce >> 32);
411 uint32_t lower32 = (uint32_t)ce;
412 uint32_t lastHalf = getSecondHalf(p, lower32);
413 if (lastHalf == 0) {
414 lastHalf = getFirstHalf(p, lower32);
415 U_ASSERT(lastHalf != 0);
46f4442e 416 } else {
57a6839d 417 lastHalf |= 0xc0; // old-style continuation CE
73c04bcf 418 }
57a6839d
A
419 if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
420 uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
73c04bcf 421 }
73c04bcf
A
422 }
423
57a6839d
A
424private:
425 UHashtable *maxExpansions;
426 UErrorCode &errorCode;
427};
428
429MaxExpSink::~MaxExpSink() {}
430
431} // namespace
432
433UHashtable *
434CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
435 if (U_FAILURE(errorCode)) { return NULL; }
436 UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
437 uhash_compareLong, &errorCode);
438 if (U_FAILURE(errorCode)) { return NULL; }
439 MaxExpSink sink(maxExpansions, errorCode);
440 ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
441 if (U_FAILURE(errorCode)) {
442 uhash_close(maxExpansions);
443 return NULL;
444 }
445 return maxExpansions;
446}
447
448int32_t
449CollationElementIterator::getMaxExpansion(int32_t order) const {
450 return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
451}
452
453int32_t
454CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
455 if (order == 0) { return 1; }
456 int32_t max;
457 if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
458 return max;
459 }
460 if ((order & 0xc0) == 0xc0) {
461 // old-style continuation CE
462 return 2;
463 } else {
464 return 1;
465 }
b75a7d8f
A
466}
467
468U_NAMESPACE_END
469
470#endif /* #if !UCONFIG_NO_COLLATION */