]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | /* |
4 | ******************************************************************************* | |
57a6839d A |
5 | * Copyright (C) 1996-2014, International Business Machines Corporation and |
6 | * others. All Rights Reserved. | |
b75a7d8f A |
7 | ******************************************************************************* |
8 | */ | |
9 | ||
10 | /* | |
11 | * File coleitr.cpp | |
12 | * | |
b75a7d8f A |
13 | * Created by: Helena Shih |
14 | * | |
15 | * Modification History: | |
16 | * | |
17 | * Date Name Description | |
18 | * | |
19 | * 6/23/97 helena Adding comments to make code more readable. | |
20 | * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java | |
21 | * 12/10/99 aliu Ported Thai collation support from Java. | |
22 | * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) | |
57a6839d | 23 | * 02/19/01 swquek Removed CollationElementIterator() since it is |
b75a7d8f | 24 | * private constructor and no calls are made to it |
57a6839d | 25 | * 2012-2014 markus Rewritten in C++ again. |
b75a7d8f A |
26 | */ |
27 | ||
28 | #include "unicode/utypes.h" | |
29 | ||
30 | #if !UCONFIG_NO_COLLATION | |
31 | ||
f3c0d7a5 | 32 | #include "unicode/chariter.h" |
b75a7d8f | 33 | #include "unicode/coleitr.h" |
57a6839d | 34 | #include "unicode/tblcoll.h" |
b75a7d8f | 35 | #include "unicode/ustring.h" |
b75a7d8f | 36 | #include "cmemory.h" |
57a6839d A |
37 | #include "collation.h" |
38 | #include "collationdata.h" | |
39 | #include "collationiterator.h" | |
40 | #include "collationsets.h" | |
41 | #include "collationtailoring.h" | |
42 | #include "uassert.h" | |
43 | #include "uhash.h" | |
44 | #include "utf16collationiterator.h" | |
45 | #include "uvectr32.h" | |
b75a7d8f A |
46 | |
47 | /* Constants --------------------------------------------------------------- */ | |
48 | ||
49 | U_NAMESPACE_BEGIN | |
50 | ||
374ca955 | 51 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) |
b75a7d8f | 52 | |
b75a7d8f A |
53 | /* CollationElementIterator public constructor/destructor ------------------ */ |
54 | ||
55 | CollationElementIterator::CollationElementIterator( | |
56 | const CollationElementIterator& other) | |
57a6839d | 57 | : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) { |
73c04bcf | 58 | *this = other; |
b75a7d8f A |
59 | } |
60 | ||
61 | CollationElementIterator::~CollationElementIterator() | |
62 | { | |
57a6839d A |
63 | delete iter_; |
64 | delete offsets_; | |
b75a7d8f A |
65 | } |
66 | ||
67 | /* CollationElementIterator public methods --------------------------------- */ | |
68 | ||
57a6839d A |
69 | namespace { |
70 | ||
/**
 * Builds the first old-style 32-bit CE from the two 32-bit words of a 64-bit CE.
 * Result bits 31..16 come from p bits 31..16, result bits 15..8 from
 * lower32 bits 31..24, and result bits 7..0 from lower32 bits 15..8.
 */
uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
    const uint32_t highBits = p & 0xffff0000;
    const uint32_t midByte = (lower32 >> 16) & 0xff00;
    const uint32_t lowByte = (lower32 >> 8) & 0xff;
    return highBits | midByte | lowByte;
}
/**
 * Builds the second old-style 32-bit CE from the two 32-bit words of a 64-bit CE.
 * Result bits 31..16 come from p bits 15..0, result bits 15..8 from
 * lower32 bits 23..16, and result bits 5..0 from lower32 bits 5..0.
 * Result bits 7..6 are always zero here.
 */
uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
    const uint32_t highBits = p << 16;
    const uint32_t midByte = (lower32 >> 8) & 0xff00;
    const uint32_t lowBits = lower32 & 0x3f;
    return highBits | midByte | lowBits;
}
77 | UBool ceNeedsTwoParts(int64_t ce) { | |
78 | return (ce & INT64_C(0xffff00ff003f)) != 0; | |
79 | } | |
80 | ||
81 | } // namespace | |
82 | ||
b75a7d8f A |
83 | int32_t CollationElementIterator::getOffset() const |
84 | { | |
57a6839d A |
85 | if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) { |
86 | // CollationIterator::previousCE() decrements the CEs length | |
87 | // while it pops CEs from its internal buffer. | |
88 | int32_t i = iter_->getCEsLength(); | |
89 | if (otherHalf_ != 0) { | |
90 | // Return the trailing CE offset while we are in the middle of a 64-bit CE. | |
91 | ++i; | |
92 | } | |
93 | U_ASSERT(i < offsets_->size()); | |
94 | return offsets_->elementAti(i); | |
95 | } | |
96 | return iter_->getOffset(); | |
b75a7d8f A |
97 | } |
98 | ||
99 | /** | |
100 | * Get the ordering priority of the next character in the string. | |
101 | * @return the next character's ordering. Returns NULLORDER if an error has | |
102 | * occured or if the end of string has been reached | |
103 | */ | |
104 | int32_t CollationElementIterator::next(UErrorCode& status) | |
105 | { | |
57a6839d A |
106 | if (U_FAILURE(status)) { return NULLORDER; } |
107 | if (dir_ > 1) { | |
108 | // Continue forward iteration. Test this first. | |
109 | if (otherHalf_ != 0) { | |
110 | uint32_t oh = otherHalf_; | |
111 | otherHalf_ = 0; | |
112 | return oh; | |
113 | } | |
114 | } else if (dir_ == 1) { | |
115 | // next() after setOffset() | |
116 | dir_ = 2; | |
117 | } else if (dir_ == 0) { | |
118 | // The iter_ is already reset to the start of the text. | |
119 | dir_ = 2; | |
120 | } else /* dir_ < 0 */ { | |
121 | // illegal change of direction | |
122 | status = U_INVALID_STATE_ERROR; | |
123 | return NULLORDER; | |
124 | } | |
125 | // No need to keep all CEs in the buffer when we iterate. | |
126 | iter_->clearCEsIfNoneRemaining(); | |
127 | int64_t ce = iter_->nextCE(status); | |
128 | if (ce == Collation::NO_CE) { return NULLORDER; } | |
129 | // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. | |
130 | uint32_t p = (uint32_t)(ce >> 32); | |
131 | uint32_t lower32 = (uint32_t)ce; | |
132 | uint32_t firstHalf = getFirstHalf(p, lower32); | |
133 | uint32_t secondHalf = getSecondHalf(p, lower32); | |
134 | if (secondHalf != 0) { | |
135 | otherHalf_ = secondHalf | 0xc0; // continuation CE | |
136 | } | |
137 | return firstHalf; | |
b75a7d8f A |
138 | } |
139 | ||
140 | UBool CollationElementIterator::operator!=( | |
141 | const CollationElementIterator& other) const | |
142 | { | |
73c04bcf | 143 | return !(*this == other); |
b75a7d8f A |
144 | } |
145 | ||
146 | UBool CollationElementIterator::operator==( | |
147 | const CollationElementIterator& that) const | |
148 | { | |
57a6839d | 149 | if (this == &that) { |
b75a7d8f A |
150 | return TRUE; |
151 | } | |
152 | ||
57a6839d A |
153 | return |
154 | (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && | |
155 | otherHalf_ == that.otherHalf_ && | |
156 | normalizeDir() == that.normalizeDir() && | |
157 | string_ == that.string_ && | |
158 | *iter_ == *that.iter_; | |
b75a7d8f A |
159 | } |
160 | ||
161 | /** | |
162 | * Get the ordering priority of the previous collation element in the string. | |
163 | * @param status the error code status. | |
164 | * @return the previous element's ordering. Returns NULLORDER if an error has | |
165 | * occured or if the start of string has been reached. | |
166 | */ | |
167 | int32_t CollationElementIterator::previous(UErrorCode& status) | |
168 | { | |
57a6839d A |
169 | if (U_FAILURE(status)) { return NULLORDER; } |
170 | if (dir_ < 0) { | |
171 | // Continue backwards iteration. Test this first. | |
172 | if (otherHalf_ != 0) { | |
173 | uint32_t oh = otherHalf_; | |
174 | otherHalf_ = 0; | |
175 | return oh; | |
176 | } | |
177 | } else if (dir_ == 0) { | |
178 | iter_->resetToOffset(string_.length()); | |
179 | dir_ = -1; | |
180 | } else if (dir_ == 1) { | |
181 | // previous() after setOffset() | |
182 | dir_ = -1; | |
183 | } else /* dir_ > 1 */ { | |
184 | // illegal change of direction | |
185 | status = U_INVALID_STATE_ERROR; | |
186 | return NULLORDER; | |
187 | } | |
188 | if (offsets_ == NULL) { | |
189 | offsets_ = new UVector32(status); | |
190 | if (offsets_ == NULL) { | |
191 | status = U_MEMORY_ALLOCATION_ERROR; | |
192 | return NULLORDER; | |
193 | } | |
194 | } | |
195 | // If we already have expansion CEs, then we also have offsets. | |
196 | // Otherwise remember the trailing offset in case we need to | |
197 | // write offsets for an artificial expansion. | |
198 | int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; | |
199 | int64_t ce = iter_->previousCE(*offsets_, status); | |
200 | if (ce == Collation::NO_CE) { return NULLORDER; } | |
201 | // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. | |
202 | uint32_t p = (uint32_t)(ce >> 32); | |
203 | uint32_t lower32 = (uint32_t)ce; | |
204 | uint32_t firstHalf = getFirstHalf(p, lower32); | |
205 | uint32_t secondHalf = getSecondHalf(p, lower32); | |
206 | if (secondHalf != 0) { | |
207 | if (offsets_->isEmpty()) { | |
208 | // When we convert a single 64-bit CE into two 32-bit CEs, | |
209 | // we need to make this artificial expansion behave like a normal expansion. | |
210 | // See CollationIterator::previousCE(). | |
211 | offsets_->addElement(iter_->getOffset(), status); | |
212 | offsets_->addElement(limitOffset, status); | |
213 | } | |
214 | otherHalf_ = firstHalf; | |
215 | return secondHalf | 0xc0; // continuation CE | |
216 | } | |
217 | return firstHalf; | |
b75a7d8f A |
218 | } |
219 | ||
220 | /** | |
221 | * Resets the cursor to the beginning of the string. | |
222 | */ | |
223 | void CollationElementIterator::reset() | |
224 | { | |
57a6839d A |
225 | iter_ ->resetToOffset(0); |
226 | otherHalf_ = 0; | |
227 | dir_ = 0; | |
b75a7d8f A |
228 | } |
229 | ||
230 | void CollationElementIterator::setOffset(int32_t newOffset, | |
231 | UErrorCode& status) | |
232 | { | |
57a6839d A |
233 | if (U_FAILURE(status)) { return; } |
234 | if (0 < newOffset && newOffset < string_.length()) { | |
235 | int32_t offset = newOffset; | |
236 | do { | |
237 | UChar c = string_.charAt(offset); | |
238 | if (!rbc_->isUnsafe(c) || | |
239 | (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) { | |
240 | break; | |
241 | } | |
242 | // Back up to before this unsafe character. | |
243 | --offset; | |
244 | } while (offset > 0); | |
245 | if (offset < newOffset) { | |
246 | // We might have backed up more than necessary. | |
247 | // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe, | |
248 | // but for text "chu" setOffset(2) should remain at 2 | |
249 | // although we initially back up to offset 0. | |
250 | // Find the last safe offset no greater than newOffset by iterating forward. | |
251 | int32_t lastSafeOffset = offset; | |
252 | do { | |
253 | iter_->resetToOffset(lastSafeOffset); | |
254 | do { | |
255 | iter_->nextCE(status); | |
256 | if (U_FAILURE(status)) { return; } | |
257 | } while ((offset = iter_->getOffset()) == lastSafeOffset); | |
258 | if (offset <= newOffset) { | |
259 | lastSafeOffset = offset; | |
260 | } | |
261 | } while (offset < newOffset); | |
262 | newOffset = lastSafeOffset; | |
263 | } | |
264 | } | |
265 | iter_->resetToOffset(newOffset); | |
266 | otherHalf_ = 0; | |
267 | dir_ = 1; | |
b75a7d8f A |
268 | } |
269 | ||
270 | /** | |
271 | * Sets the source to the new source string. | |
272 | */ | |
273 | void CollationElementIterator::setText(const UnicodeString& source, | |
274 | UErrorCode& status) | |
275 | { | |
73c04bcf | 276 | if (U_FAILURE(status)) { |
b75a7d8f A |
277 | return; |
278 | } | |
73c04bcf | 279 | |
57a6839d A |
280 | string_ = source; |
281 | const UChar *s = string_.getBuffer(); | |
282 | CollationIterator *newIter; | |
283 | UBool numeric = rbc_->settings->isNumeric(); | |
284 | if (rbc_->settings->dontCheckFCD()) { | |
285 | newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); | |
286 | } else { | |
287 | newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); | |
73c04bcf | 288 | } |
57a6839d A |
289 | if (newIter == NULL) { |
290 | status = U_MEMORY_ALLOCATION_ERROR; | |
291 | return; | |
b75a7d8f | 292 | } |
57a6839d A |
293 | delete iter_; |
294 | iter_ = newIter; | |
295 | otherHalf_ = 0; | |
296 | dir_ = 0; | |
b75a7d8f A |
297 | } |
298 | ||
299 | // Sets the source to the new character iterator. | |
300 | void CollationElementIterator::setText(CharacterIterator& source, | |
301 | UErrorCode& status) | |
302 | { | |
73c04bcf | 303 | if (U_FAILURE(status)) |
b75a7d8f | 304 | return; |
73c04bcf | 305 | |
57a6839d A |
306 | source.getText(string_); |
307 | setText(string_, status); | |
b75a7d8f A |
308 | } |
309 | ||
310 | int32_t CollationElementIterator::strengthOrder(int32_t order) const | |
311 | { | |
57a6839d | 312 | UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength(); |
73c04bcf A |
313 | // Mask off the unwanted differences. |
314 | if (s == UCOL_PRIMARY) { | |
57a6839d | 315 | order &= 0xffff0000; |
73c04bcf A |
316 | } |
317 | else if (s == UCOL_SECONDARY) { | |
57a6839d | 318 | order &= 0xffffff00; |
73c04bcf A |
319 | } |
320 | ||
321 | return order; | |
b75a7d8f A |
322 | } |
323 | ||
324 | /* CollationElementIterator private constructors/destructors --------------- */ | |
325 | ||
326 | /** | |
327 | * This is the "real" constructor for this class; it constructs an iterator | |
328 | * over the source text using the specified collator | |
329 | */ | |
330 | CollationElementIterator::CollationElementIterator( | |
57a6839d A |
331 | const UnicodeString &source, |
332 | const RuleBasedCollator *coll, | |
333 | UErrorCode &status) | |
334 | : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { | |
335 | setText(source, status); | |
b75a7d8f A |
336 | } |
337 | ||
338 | /** | |
339 | * This is the "real" constructor for this class; it constructs an iterator over | |
340 | * the source text using the specified collator | |
341 | */ | |
342 | CollationElementIterator::CollationElementIterator( | |
57a6839d A |
343 | const CharacterIterator &source, |
344 | const RuleBasedCollator *coll, | |
345 | UErrorCode &status) | |
346 | : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { | |
347 | // We only call source.getText() which should be const anyway. | |
348 | setText(const_cast<CharacterIterator &>(source), status); | |
b75a7d8f A |
349 | } |
350 | ||
57a6839d | 351 | /* CollationElementIterator private methods -------------------------------- */ |
b75a7d8f A |
352 | |
353 | const CollationElementIterator& CollationElementIterator::operator=( | |
354 | const CollationElementIterator& other) | |
355 | { | |
57a6839d A |
356 | if (this == &other) { |
357 | return *this; | |
358 | } | |
359 | ||
360 | CollationIterator *newIter; | |
361 | const FCDUTF16CollationIterator *otherFCDIter = | |
362 | dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); | |
363 | if(otherFCDIter != NULL) { | |
364 | newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer()); | |
365 | } else { | |
366 | const UTF16CollationIterator *otherIter = | |
367 | dynamic_cast<const UTF16CollationIterator *>(other.iter_); | |
368 | if(otherIter != NULL) { | |
369 | newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()); | |
370 | } else { | |
371 | newIter = NULL; | |
73c04bcf | 372 | } |
57a6839d A |
373 | } |
374 | if(newIter != NULL) { | |
375 | delete iter_; | |
376 | iter_ = newIter; | |
377 | rbc_ = other.rbc_; | |
378 | otherHalf_ = other.otherHalf_; | |
379 | dir_ = other.dir_; | |
380 | ||
381 | string_ = other.string_; | |
382 | } | |
383 | if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) { | |
384 | UErrorCode errorCode = U_ZERO_ERROR; | |
385 | if(offsets_ == NULL) { | |
386 | offsets_ = new UVector32(other.offsets_->size(), errorCode); | |
73c04bcf | 387 | } |
57a6839d A |
388 | if(offsets_ != NULL) { |
389 | offsets_->assign(*other.offsets_, errorCode); | |
73c04bcf | 390 | } |
57a6839d A |
391 | } |
392 | return *this; | |
393 | } | |
394 | ||
395 | namespace { | |
73c04bcf | 396 | |
57a6839d A |
397 | class MaxExpSink : public ContractionsAndExpansions::CESink { |
398 | public: | |
399 | MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {} | |
400 | virtual ~MaxExpSink(); | |
401 | virtual void handleCE(int64_t /*ce*/) {} | |
402 | virtual void handleExpansion(const int64_t ces[], int32_t length) { | |
403 | if (length <= 1) { | |
404 | // We do not need to add single CEs into the map. | |
405 | return; | |
73c04bcf | 406 | } |
57a6839d A |
407 | int32_t count = 0; // number of CE "halves" |
408 | for (int32_t i = 0; i < length; ++i) { | |
409 | count += ceNeedsTwoParts(ces[i]) ? 2 : 1; | |
46f4442e | 410 | } |
57a6839d A |
411 | // last "half" of the last CE |
412 | int64_t ce = ces[length - 1]; | |
413 | uint32_t p = (uint32_t)(ce >> 32); | |
414 | uint32_t lower32 = (uint32_t)ce; | |
415 | uint32_t lastHalf = getSecondHalf(p, lower32); | |
416 | if (lastHalf == 0) { | |
417 | lastHalf = getFirstHalf(p, lower32); | |
418 | U_ASSERT(lastHalf != 0); | |
46f4442e | 419 | } else { |
57a6839d | 420 | lastHalf |= 0xc0; // old-style continuation CE |
73c04bcf | 421 | } |
57a6839d A |
422 | if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) { |
423 | uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode); | |
73c04bcf | 424 | } |
73c04bcf A |
425 | } |
426 | ||
57a6839d A |
427 | private: |
428 | UHashtable *maxExpansions; | |
429 | UErrorCode &errorCode; | |
430 | }; | |
431 | ||
432 | MaxExpSink::~MaxExpSink() {} | |
433 | ||
434 | } // namespace | |
435 | ||
436 | UHashtable * | |
437 | CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) { | |
438 | if (U_FAILURE(errorCode)) { return NULL; } | |
439 | UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, | |
440 | uhash_compareLong, &errorCode); | |
441 | if (U_FAILURE(errorCode)) { return NULL; } | |
442 | MaxExpSink sink(maxExpansions, errorCode); | |
443 | ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode); | |
444 | if (U_FAILURE(errorCode)) { | |
445 | uhash_close(maxExpansions); | |
446 | return NULL; | |
447 | } | |
448 | return maxExpansions; | |
449 | } | |
450 | ||
451 | int32_t | |
452 | CollationElementIterator::getMaxExpansion(int32_t order) const { | |
453 | return getMaxExpansion(rbc_->tailoring->maxExpansions, order); | |
454 | } | |
455 | ||
456 | int32_t | |
457 | CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) { | |
458 | if (order == 0) { return 1; } | |
459 | int32_t max; | |
460 | if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) { | |
461 | return max; | |
462 | } | |
463 | if ((order & 0xc0) == 0xc0) { | |
464 | // old-style continuation CE | |
465 | return 2; | |
466 | } else { | |
467 | return 1; | |
468 | } | |
b75a7d8f A |
469 | } |
470 | ||
471 | U_NAMESPACE_END | |
472 | ||
473 | #endif /* #if !UCONFIG_NO_COLLATION */ |