]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
57a6839d A |
3 | * Copyright (C) 1996-2014, International Business Machines Corporation and |
4 | * others. All Rights Reserved. | |
b75a7d8f A |
5 | ******************************************************************************* |
6 | */ | |
7 | ||
8 | /* | |
9 | * File coleitr.cpp | |
10 | * | |
b75a7d8f A |
11 | * Created by: Helena Shih |
12 | * | |
13 | * Modification History: | |
14 | * | |
15 | * Date Name Description | |
16 | * | |
17 | * 6/23/97 helena Adding comments to make code more readable. | |
18 | * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java | |
19 | * 12/10/99 aliu Ported Thai collation support from Java. | |
20 | * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) | |
57a6839d | 21 | * 02/19/01 swquek Removed CollationElementIterator() since it is |
b75a7d8f | 22 | * private constructor and no calls are made to it |
57a6839d | 23 | * 2012-2014 markus Rewritten in C++ again. |
b75a7d8f A |
24 | */ |
25 | ||
26 | #include "unicode/utypes.h" | |
27 | ||
28 | #if !UCONFIG_NO_COLLATION | |
29 | ||
30 | #include "unicode/coleitr.h" | |
57a6839d | 31 | #include "unicode/tblcoll.h" |
b75a7d8f | 32 | #include "unicode/ustring.h" |
b75a7d8f | 33 | #include "cmemory.h" |
57a6839d A |
34 | #include "collation.h" |
35 | #include "collationdata.h" | |
36 | #include "collationiterator.h" | |
37 | #include "collationsets.h" | |
38 | #include "collationtailoring.h" | |
39 | #include "uassert.h" | |
40 | #include "uhash.h" | |
41 | #include "utf16collationiterator.h" | |
42 | #include "uvectr32.h" | |
b75a7d8f A |
43 | |
44 | /* Constants --------------------------------------------------------------- */ | |
45 | ||
46 | U_NAMESPACE_BEGIN | |
47 | ||
374ca955 | 48 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) |
b75a7d8f | 49 | |
b75a7d8f A |
50 | /* CollationElementIterator public constructor/destructor ------------------ */ |
51 | ||
52 | CollationElementIterator::CollationElementIterator( | |
53 | const CollationElementIterator& other) | |
57a6839d | 54 | : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) { |
73c04bcf | 55 | *this = other; |
b75a7d8f A |
56 | } |
57 | ||
58 | CollationElementIterator::~CollationElementIterator() | |
59 | { | |
57a6839d A |
60 | delete iter_; |
61 | delete offsets_; | |
b75a7d8f A |
62 | } |
63 | ||
64 | /* CollationElementIterator public methods --------------------------------- */ | |
65 | ||
57a6839d A |
66 | namespace { |
67 | ||
/**
 * Builds the first old-style 32-bit CE from the two words of a 64-bit CE.
 * The result combines the upper 16 bits of p, the top byte of lower32
 * (moved to bits 15..8) and bits 15..8 of lower32 (moved to bits 7..0).
 *
 * @param p       upper 32 bits of the 64-bit CE
 * @param lower32 lower 32 bits of the 64-bit CE
 * @return the packed first half
 */
uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
    const uint32_t upperOfP = p & 0xffff0000;
    const uint32_t topByteOfLower = (lower32 >> 16) & 0xff00;
    const uint32_t secondLowestByteOfLower = (lower32 >> 8) & 0xff;
    return upperOfP | topByteOfLower | secondLowestByteOfLower;
}
/**
 * Builds the second old-style 32-bit CE from the two words of a 64-bit CE.
 * The result combines the lower 16 bits of p (moved to bits 31..16),
 * bits 23..16 of lower32 (moved to bits 15..8) and the lowest 6 bits of lower32.
 * A result of 0 means that no second half is needed.
 *
 * @param p       upper 32 bits of the 64-bit CE
 * @param lower32 lower 32 bits of the 64-bit CE
 * @return the packed second half, without the continuation marker
 */
uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
    const uint32_t lowerOfP = p << 16;
    const uint32_t thirdByteOfLower = (lower32 >> 8) & 0xff00;
    const uint32_t lowSixBits = lower32 & 0x3f;
    return lowerOfP | thirdByteOfLower | lowSixBits;
}
74 | UBool ceNeedsTwoParts(int64_t ce) { | |
75 | return (ce & INT64_C(0xffff00ff003f)) != 0; | |
76 | } | |
77 | ||
78 | } // namespace | |
79 | ||
b75a7d8f A |
80 | int32_t CollationElementIterator::getOffset() const |
81 | { | |
57a6839d A |
82 | if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) { |
83 | // CollationIterator::previousCE() decrements the CEs length | |
84 | // while it pops CEs from its internal buffer. | |
85 | int32_t i = iter_->getCEsLength(); | |
86 | if (otherHalf_ != 0) { | |
87 | // Return the trailing CE offset while we are in the middle of a 64-bit CE. | |
88 | ++i; | |
89 | } | |
90 | U_ASSERT(i < offsets_->size()); | |
91 | return offsets_->elementAti(i); | |
92 | } | |
93 | return iter_->getOffset(); | |
b75a7d8f A |
94 | } |
95 | ||
96 | /** | |
97 | * Get the ordering priority of the next character in the string. | |
98 | * @return the next character's ordering. Returns NULLORDER if an error has | |
99 | * occured or if the end of string has been reached | |
100 | */ | |
101 | int32_t CollationElementIterator::next(UErrorCode& status) | |
102 | { | |
57a6839d A |
103 | if (U_FAILURE(status)) { return NULLORDER; } |
104 | if (dir_ > 1) { | |
105 | // Continue forward iteration. Test this first. | |
106 | if (otherHalf_ != 0) { | |
107 | uint32_t oh = otherHalf_; | |
108 | otherHalf_ = 0; | |
109 | return oh; | |
110 | } | |
111 | } else if (dir_ == 1) { | |
112 | // next() after setOffset() | |
113 | dir_ = 2; | |
114 | } else if (dir_ == 0) { | |
115 | // The iter_ is already reset to the start of the text. | |
116 | dir_ = 2; | |
117 | } else /* dir_ < 0 */ { | |
118 | // illegal change of direction | |
119 | status = U_INVALID_STATE_ERROR; | |
120 | return NULLORDER; | |
121 | } | |
122 | // No need to keep all CEs in the buffer when we iterate. | |
123 | iter_->clearCEsIfNoneRemaining(); | |
124 | int64_t ce = iter_->nextCE(status); | |
125 | if (ce == Collation::NO_CE) { return NULLORDER; } | |
126 | // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. | |
127 | uint32_t p = (uint32_t)(ce >> 32); | |
128 | uint32_t lower32 = (uint32_t)ce; | |
129 | uint32_t firstHalf = getFirstHalf(p, lower32); | |
130 | uint32_t secondHalf = getSecondHalf(p, lower32); | |
131 | if (secondHalf != 0) { | |
132 | otherHalf_ = secondHalf | 0xc0; // continuation CE | |
133 | } | |
134 | return firstHalf; | |
b75a7d8f A |
135 | } |
136 | ||
137 | UBool CollationElementIterator::operator!=( | |
138 | const CollationElementIterator& other) const | |
139 | { | |
73c04bcf | 140 | return !(*this == other); |
b75a7d8f A |
141 | } |
142 | ||
143 | UBool CollationElementIterator::operator==( | |
144 | const CollationElementIterator& that) const | |
145 | { | |
57a6839d | 146 | if (this == &that) { |
b75a7d8f A |
147 | return TRUE; |
148 | } | |
149 | ||
57a6839d A |
150 | return |
151 | (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && | |
152 | otherHalf_ == that.otherHalf_ && | |
153 | normalizeDir() == that.normalizeDir() && | |
154 | string_ == that.string_ && | |
155 | *iter_ == *that.iter_; | |
b75a7d8f A |
156 | } |
157 | ||
158 | /** | |
159 | * Get the ordering priority of the previous collation element in the string. | |
160 | * @param status the error code status. | |
161 | * @return the previous element's ordering. Returns NULLORDER if an error has | |
162 | * occured or if the start of string has been reached. | |
163 | */ | |
164 | int32_t CollationElementIterator::previous(UErrorCode& status) | |
165 | { | |
57a6839d A |
166 | if (U_FAILURE(status)) { return NULLORDER; } |
167 | if (dir_ < 0) { | |
168 | // Continue backwards iteration. Test this first. | |
169 | if (otherHalf_ != 0) { | |
170 | uint32_t oh = otherHalf_; | |
171 | otherHalf_ = 0; | |
172 | return oh; | |
173 | } | |
174 | } else if (dir_ == 0) { | |
175 | iter_->resetToOffset(string_.length()); | |
176 | dir_ = -1; | |
177 | } else if (dir_ == 1) { | |
178 | // previous() after setOffset() | |
179 | dir_ = -1; | |
180 | } else /* dir_ > 1 */ { | |
181 | // illegal change of direction | |
182 | status = U_INVALID_STATE_ERROR; | |
183 | return NULLORDER; | |
184 | } | |
185 | if (offsets_ == NULL) { | |
186 | offsets_ = new UVector32(status); | |
187 | if (offsets_ == NULL) { | |
188 | status = U_MEMORY_ALLOCATION_ERROR; | |
189 | return NULLORDER; | |
190 | } | |
191 | } | |
192 | // If we already have expansion CEs, then we also have offsets. | |
193 | // Otherwise remember the trailing offset in case we need to | |
194 | // write offsets for an artificial expansion. | |
195 | int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; | |
196 | int64_t ce = iter_->previousCE(*offsets_, status); | |
197 | if (ce == Collation::NO_CE) { return NULLORDER; } | |
198 | // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. | |
199 | uint32_t p = (uint32_t)(ce >> 32); | |
200 | uint32_t lower32 = (uint32_t)ce; | |
201 | uint32_t firstHalf = getFirstHalf(p, lower32); | |
202 | uint32_t secondHalf = getSecondHalf(p, lower32); | |
203 | if (secondHalf != 0) { | |
204 | if (offsets_->isEmpty()) { | |
205 | // When we convert a single 64-bit CE into two 32-bit CEs, | |
206 | // we need to make this artificial expansion behave like a normal expansion. | |
207 | // See CollationIterator::previousCE(). | |
208 | offsets_->addElement(iter_->getOffset(), status); | |
209 | offsets_->addElement(limitOffset, status); | |
210 | } | |
211 | otherHalf_ = firstHalf; | |
212 | return secondHalf | 0xc0; // continuation CE | |
213 | } | |
214 | return firstHalf; | |
b75a7d8f A |
215 | } |
216 | ||
217 | /** | |
218 | * Resets the cursor to the beginning of the string. | |
219 | */ | |
220 | void CollationElementIterator::reset() | |
221 | { | |
57a6839d A |
222 | iter_ ->resetToOffset(0); |
223 | otherHalf_ = 0; | |
224 | dir_ = 0; | |
b75a7d8f A |
225 | } |
226 | ||
227 | void CollationElementIterator::setOffset(int32_t newOffset, | |
228 | UErrorCode& status) | |
229 | { | |
57a6839d A |
230 | if (U_FAILURE(status)) { return; } |
231 | if (0 < newOffset && newOffset < string_.length()) { | |
232 | int32_t offset = newOffset; | |
233 | do { | |
234 | UChar c = string_.charAt(offset); | |
235 | if (!rbc_->isUnsafe(c) || | |
236 | (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) { | |
237 | break; | |
238 | } | |
239 | // Back up to before this unsafe character. | |
240 | --offset; | |
241 | } while (offset > 0); | |
242 | if (offset < newOffset) { | |
243 | // We might have backed up more than necessary. | |
244 | // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe, | |
245 | // but for text "chu" setOffset(2) should remain at 2 | |
246 | // although we initially back up to offset 0. | |
247 | // Find the last safe offset no greater than newOffset by iterating forward. | |
248 | int32_t lastSafeOffset = offset; | |
249 | do { | |
250 | iter_->resetToOffset(lastSafeOffset); | |
251 | do { | |
252 | iter_->nextCE(status); | |
253 | if (U_FAILURE(status)) { return; } | |
254 | } while ((offset = iter_->getOffset()) == lastSafeOffset); | |
255 | if (offset <= newOffset) { | |
256 | lastSafeOffset = offset; | |
257 | } | |
258 | } while (offset < newOffset); | |
259 | newOffset = lastSafeOffset; | |
260 | } | |
261 | } | |
262 | iter_->resetToOffset(newOffset); | |
263 | otherHalf_ = 0; | |
264 | dir_ = 1; | |
b75a7d8f A |
265 | } |
266 | ||
267 | /** | |
268 | * Sets the source to the new source string. | |
269 | */ | |
270 | void CollationElementIterator::setText(const UnicodeString& source, | |
271 | UErrorCode& status) | |
272 | { | |
73c04bcf | 273 | if (U_FAILURE(status)) { |
b75a7d8f A |
274 | return; |
275 | } | |
73c04bcf | 276 | |
57a6839d A |
277 | string_ = source; |
278 | const UChar *s = string_.getBuffer(); | |
279 | CollationIterator *newIter; | |
280 | UBool numeric = rbc_->settings->isNumeric(); | |
281 | if (rbc_->settings->dontCheckFCD()) { | |
282 | newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); | |
283 | } else { | |
284 | newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); | |
73c04bcf | 285 | } |
57a6839d A |
286 | if (newIter == NULL) { |
287 | status = U_MEMORY_ALLOCATION_ERROR; | |
288 | return; | |
b75a7d8f | 289 | } |
57a6839d A |
290 | delete iter_; |
291 | iter_ = newIter; | |
292 | otherHalf_ = 0; | |
293 | dir_ = 0; | |
b75a7d8f A |
294 | } |
295 | ||
296 | // Sets the source to the new character iterator. | |
297 | void CollationElementIterator::setText(CharacterIterator& source, | |
298 | UErrorCode& status) | |
299 | { | |
73c04bcf | 300 | if (U_FAILURE(status)) |
b75a7d8f | 301 | return; |
73c04bcf | 302 | |
57a6839d A |
303 | source.getText(string_); |
304 | setText(string_, status); | |
b75a7d8f A |
305 | } |
306 | ||
307 | int32_t CollationElementIterator::strengthOrder(int32_t order) const | |
308 | { | |
57a6839d | 309 | UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength(); |
73c04bcf A |
310 | // Mask off the unwanted differences. |
311 | if (s == UCOL_PRIMARY) { | |
57a6839d | 312 | order &= 0xffff0000; |
73c04bcf A |
313 | } |
314 | else if (s == UCOL_SECONDARY) { | |
57a6839d | 315 | order &= 0xffffff00; |
73c04bcf A |
316 | } |
317 | ||
318 | return order; | |
b75a7d8f A |
319 | } |
320 | ||
321 | /* CollationElementIterator private constructors/destructors --------------- */ | |
322 | ||
323 | /** | |
324 | * This is the "real" constructor for this class; it constructs an iterator | |
325 | * over the source text using the specified collator | |
326 | */ | |
327 | CollationElementIterator::CollationElementIterator( | |
57a6839d A |
328 | const UnicodeString &source, |
329 | const RuleBasedCollator *coll, | |
330 | UErrorCode &status) | |
331 | : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { | |
332 | setText(source, status); | |
b75a7d8f A |
333 | } |
334 | ||
335 | /** | |
336 | * This is the "real" constructor for this class; it constructs an iterator over | |
337 | * the source text using the specified collator | |
338 | */ | |
339 | CollationElementIterator::CollationElementIterator( | |
57a6839d A |
340 | const CharacterIterator &source, |
341 | const RuleBasedCollator *coll, | |
342 | UErrorCode &status) | |
343 | : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { | |
344 | // We only call source.getText() which should be const anyway. | |
345 | setText(const_cast<CharacterIterator &>(source), status); | |
b75a7d8f A |
346 | } |
347 | ||
57a6839d | 348 | /* CollationElementIterator private methods -------------------------------- */ |
b75a7d8f A |
349 | |
350 | const CollationElementIterator& CollationElementIterator::operator=( | |
351 | const CollationElementIterator& other) | |
352 | { | |
57a6839d A |
353 | if (this == &other) { |
354 | return *this; | |
355 | } | |
356 | ||
357 | CollationIterator *newIter; | |
358 | const FCDUTF16CollationIterator *otherFCDIter = | |
359 | dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); | |
360 | if(otherFCDIter != NULL) { | |
361 | newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer()); | |
362 | } else { | |
363 | const UTF16CollationIterator *otherIter = | |
364 | dynamic_cast<const UTF16CollationIterator *>(other.iter_); | |
365 | if(otherIter != NULL) { | |
366 | newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()); | |
367 | } else { | |
368 | newIter = NULL; | |
73c04bcf | 369 | } |
57a6839d A |
370 | } |
371 | if(newIter != NULL) { | |
372 | delete iter_; | |
373 | iter_ = newIter; | |
374 | rbc_ = other.rbc_; | |
375 | otherHalf_ = other.otherHalf_; | |
376 | dir_ = other.dir_; | |
377 | ||
378 | string_ = other.string_; | |
379 | } | |
380 | if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) { | |
381 | UErrorCode errorCode = U_ZERO_ERROR; | |
382 | if(offsets_ == NULL) { | |
383 | offsets_ = new UVector32(other.offsets_->size(), errorCode); | |
73c04bcf | 384 | } |
57a6839d A |
385 | if(offsets_ != NULL) { |
386 | offsets_->assign(*other.offsets_, errorCode); | |
73c04bcf | 387 | } |
57a6839d A |
388 | } |
389 | return *this; | |
390 | } | |
391 | ||
392 | namespace { | |
73c04bcf | 393 | |
57a6839d A |
394 | class MaxExpSink : public ContractionsAndExpansions::CESink { |
395 | public: | |
396 | MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {} | |
397 | virtual ~MaxExpSink(); | |
398 | virtual void handleCE(int64_t /*ce*/) {} | |
399 | virtual void handleExpansion(const int64_t ces[], int32_t length) { | |
400 | if (length <= 1) { | |
401 | // We do not need to add single CEs into the map. | |
402 | return; | |
73c04bcf | 403 | } |
57a6839d A |
404 | int32_t count = 0; // number of CE "halves" |
405 | for (int32_t i = 0; i < length; ++i) { | |
406 | count += ceNeedsTwoParts(ces[i]) ? 2 : 1; | |
46f4442e | 407 | } |
57a6839d A |
408 | // last "half" of the last CE |
409 | int64_t ce = ces[length - 1]; | |
410 | uint32_t p = (uint32_t)(ce >> 32); | |
411 | uint32_t lower32 = (uint32_t)ce; | |
412 | uint32_t lastHalf = getSecondHalf(p, lower32); | |
413 | if (lastHalf == 0) { | |
414 | lastHalf = getFirstHalf(p, lower32); | |
415 | U_ASSERT(lastHalf != 0); | |
46f4442e | 416 | } else { |
57a6839d | 417 | lastHalf |= 0xc0; // old-style continuation CE |
73c04bcf | 418 | } |
57a6839d A |
419 | if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) { |
420 | uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode); | |
73c04bcf | 421 | } |
73c04bcf A |
422 | } |
423 | ||
57a6839d A |
424 | private: |
425 | UHashtable *maxExpansions; | |
426 | UErrorCode &errorCode; | |
427 | }; | |
428 | ||
429 | MaxExpSink::~MaxExpSink() {} | |
430 | ||
431 | } // namespace | |
432 | ||
433 | UHashtable * | |
434 | CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) { | |
435 | if (U_FAILURE(errorCode)) { return NULL; } | |
436 | UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, | |
437 | uhash_compareLong, &errorCode); | |
438 | if (U_FAILURE(errorCode)) { return NULL; } | |
439 | MaxExpSink sink(maxExpansions, errorCode); | |
440 | ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode); | |
441 | if (U_FAILURE(errorCode)) { | |
442 | uhash_close(maxExpansions); | |
443 | return NULL; | |
444 | } | |
445 | return maxExpansions; | |
446 | } | |
447 | ||
448 | int32_t | |
449 | CollationElementIterator::getMaxExpansion(int32_t order) const { | |
450 | return getMaxExpansion(rbc_->tailoring->maxExpansions, order); | |
451 | } | |
452 | ||
453 | int32_t | |
454 | CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) { | |
455 | if (order == 0) { return 1; } | |
456 | int32_t max; | |
457 | if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) { | |
458 | return max; | |
459 | } | |
460 | if ((order & 0xc0) == 0xc0) { | |
461 | // old-style continuation CE | |
462 | return 2; | |
463 | } else { | |
464 | return 1; | |
465 | } | |
b75a7d8f A |
466 | } |
467 | ||
468 | U_NAMESPACE_END | |
469 | ||
470 | #endif /* #if !UCONFIG_NO_COLLATION */ |